kernel: backport netfilter NAT offload support to 4.14

This only works with nftables for now, iptables support will be added
later. Includes a number of related upstream nftables improvements to
simplify backporting follow-up changes

Signed-off-by: John Crispin <john@phrozen.org>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
This commit is contained in:
Felix Fietkau 2018-02-05 13:35:24 +01:00
parent b7265c59ab
commit 1033356442
43 changed files with 11343 additions and 0 deletions

View file

@ -140,6 +140,23 @@ endef
$(eval $(call KernelPackage,nf-nat6))
define KernelPackage/nf-flow
SUBMENU:=$(NF_MENU)
TITLE:=Netfilter flowtable support
KCONFIG:= \
CONFIG_NETFILTER_INGRESS=y \
CONFIG_NF_FLOW_TABLE \
CONFIG_NF_FLOW_TABLE_HW
DEPENDS:=+kmod-nf-conntrack +kmod-nft-core @!LINUX_3_18 @!LINUX_4_4 @!LINUX_4_9
FILES:= \
$(LINUX_DIR)/net/netfilter/nf_flow_table.ko \
$(LINUX_DIR)/net/netfilter/nf_flow_table_hw.ko
AUTOLOAD:=$(call AutoProbe,nf_flow_table nf_flow_table_hw)
endef
$(eval $(call KernelPackage,nf-flow))
define AddDepends/ipt
SUBMENU:=$(NF_MENU)
DEPENDS+= +kmod-ipt-core $(1)
@ -970,6 +987,26 @@ endef
$(eval $(call KernelPackage,nft-nat))
define KernelPackage/nft-offload
SUBMENU:=$(NF_MENU)
TITLE:=Netfilter nf_tables routing/NAT offload support
DEPENDS:=+kmod-nf-flow +kmod-nft-nat
KCONFIG:= \
CONFIG_NF_FLOW_TABLE_INET \
CONFIG_NF_FLOW_TABLE_IPV4 \
CONFIG_NF_FLOW_TABLE_IPV6 \
CONFIG_NFT_FLOW_OFFLOAD
FILES:= \
$(LINUX_DIR)/net/netfilter/nf_flow_table_inet.ko \
$(LINUX_DIR)/net/ipv4/netfilter/nf_flow_table_ipv4.ko \
$(LINUX_DIR)/net/ipv6/netfilter/nf_flow_table_ipv6.ko \
$(LINUX_DIR)/net/netfilter/nft_flow_offload.ko
AUTOLOAD:=$(call AutoProbe,nf_flow_table_inet nf_flow_table_ipv4 nf_flow_table_ipv6 nft_flow_offload)
endef
$(eval $(call KernelPackage,nft-offload))
define KernelPackage/nft-nat6
SUBMENU:=$(NF_MENU)
TITLE:=Netfilter nf_tables IPv6-NAT support

View file

@ -0,0 +1,169 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 7 Jan 2018 01:03:56 +0100
Subject: [PATCH] netfilter: nf_conntrack: add IPS_OFFLOAD status bit
This new bit tells us that the conntrack entry is owned by the flow
table offload infrastructure.
# cat /proc/net/nf_conntrack
ipv4 2 tcp 6 src=10.141.10.2 dst=147.75.205.195 sport=36392 dport=443 src=147.75.205.195 dst=192.168.2.195 sport=443 dport=36392 [OFFLOAD] mark=0 zone=0 use=2
Note the [OFFLOAD] tag in the listing.
The timer of such conntrack entries look like stopped from userspace.
In practise, to make sure the conntrack entry does not go away, the
conntrack timer is periodically set to an arbitrary large value that
gets refreshed on every iteration from the garbage collector, so it
never expires- and they display no internal state in the case of TCP
flows. This allows us to save a bitcheck from the packet path via
nf_ct_is_expired().
Conntrack entries that have been offloaded to the flow table
infrastructure cannot be deleted/flushed via ctnetlink. The flow table
infrastructure is also responsible for releasing this conntrack entry.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -101,12 +101,16 @@ enum ip_conntrack_status {
IPS_HELPER_BIT = 13,
IPS_HELPER = (1 << IPS_HELPER_BIT),
+ /* Conntrack has been offloaded to flow table. */
+ IPS_OFFLOAD_BIT = 14,
+ IPS_OFFLOAD = (1 << IPS_OFFLOAD_BIT),
+
/* Be careful here, modifying these bits can make things messy,
* so don't let users modify them directly.
*/
IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK |
IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING |
- IPS_SEQ_ADJUST | IPS_TEMPLATE),
+ IPS_SEQ_ADJUST | IPS_TEMPLATE | IPS_OFFLOAD),
__IPS_MAX_BIT = 14,
};
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -901,6 +901,9 @@ static unsigned int early_drop_list(stru
hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
tmp = nf_ct_tuplehash_to_ctrack(h);
+ if (test_bit(IPS_OFFLOAD_BIT, &tmp->status))
+ continue;
+
if (nf_ct_is_expired(tmp)) {
nf_ct_gc_expired(tmp);
continue;
@@ -975,6 +978,18 @@ static bool gc_worker_can_early_drop(con
return false;
}
+#define DAY (86400 * HZ)
+
+/* Set an arbitrary timeout large enough not to ever expire, this save
+ * us a check for the IPS_OFFLOAD_BIT from the packet path via
+ * nf_ct_is_expired().
+ */
+static void nf_ct_offload_timeout(struct nf_conn *ct)
+{
+ if (nf_ct_expires(ct) < DAY / 2)
+ ct->timeout = nfct_time_stamp + DAY;
+}
+
static void gc_worker(struct work_struct *work)
{
unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
@@ -1011,6 +1026,11 @@ static void gc_worker(struct work_struct
tmp = nf_ct_tuplehash_to_ctrack(h);
scanned++;
+ if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
+ nf_ct_offload_timeout(tmp);
+ continue;
+ }
+
if (nf_ct_is_expired(tmp)) {
nf_ct_gc_expired(tmp);
expired_count++;
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1105,6 +1105,14 @@ static const struct nla_policy ct_nla_po
.len = NF_CT_LABELS_MAX_SIZE },
};
+static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data)
+{
+ if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
+ return 0;
+
+ return ctnetlink_filter_match(ct, data);
+}
+
static int ctnetlink_flush_conntrack(struct net *net,
const struct nlattr * const cda[],
u32 portid, int report)
@@ -1117,7 +1125,7 @@ static int ctnetlink_flush_conntrack(str
return PTR_ERR(filter);
}
- nf_ct_iterate_cleanup_net(net, ctnetlink_filter_match, filter,
+ nf_ct_iterate_cleanup_net(net, ctnetlink_flush_iterate, filter,
portid, report);
kfree(filter);
@@ -1163,6 +1171,11 @@ static int ctnetlink_del_conntrack(struc
ct = nf_ct_tuplehash_to_ctrack(h);
+ if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
+ nf_ct_put(ct);
+ return -EBUSY;
+ }
+
if (cda[CTA_ID]) {
u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID]));
if (id != (u32)(unsigned long)ct) {
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -305,6 +305,9 @@ static bool tcp_invert_tuple(struct nf_c
/* Print out the private part of the conntrack. */
static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
{
+ if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
+ return;
+
seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]);
}
#endif
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -309,10 +309,12 @@ static int ct_seq_show(struct seq_file *
WARN_ON(!l4proto);
ret = -ENOSPC;
- seq_printf(s, "%-8s %u %-8s %u %ld ",
+ seq_printf(s, "%-8s %u %-8s %u ",
l3proto_name(l3proto->l3proto), nf_ct_l3num(ct),
- l4proto_name(l4proto->l4proto), nf_ct_protonum(ct),
- nf_ct_expires(ct) / HZ);
+ l4proto_name(l4proto->l4proto), nf_ct_protonum(ct));
+
+ if (!test_bit(IPS_OFFLOAD_BIT, &ct->status))
+ seq_printf(s, "%ld ", nf_ct_expires(ct) / HZ);
if (l4proto->print_conntrack)
l4proto->print_conntrack(s, ct);
@@ -339,7 +341,9 @@ static int ct_seq_show(struct seq_file *
if (seq_print_acct(s, ct, IP_CT_DIR_REPLY))
goto release;
- if (test_bit(IPS_ASSURED_BIT, &ct->status))
+ if (test_bit(IPS_OFFLOAD_BIT, &ct->status))
+ seq_puts(s, "[OFFLOAD] ");
+ else if (test_bit(IPS_ASSURED_BIT, &ct->status))
seq_puts(s, "[ASSURED] ");
if (seq_has_overflowed(s))

View file

@ -0,0 +1,586 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 7 Jan 2018 01:04:11 +0100
Subject: [PATCH] netfilter: add generic flow table infrastructure
This patch defines the API to interact with flow tables, this allows to
add, delete and lookup for entries in the flow table. This also adds the
generic garbage code that removes entries that have expired, ie. no
traffic has been seen for a while.
Users of the flow table infrastructure can delete entries via
flow_offload_dead(), which sets the dying bit, this signals the garbage
collector to release an entry from user context.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
create mode 100644 net/netfilter/nf_flow_table.c
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -1,7 +1,12 @@
#ifndef _NF_FLOW_TABLE_H
#define _NF_FLOW_TABLE_H
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
#include <linux/rhashtable.h>
+#include <linux/rcupdate.h>
+#include <net/dst.h>
struct nf_flowtable;
@@ -20,4 +25,93 @@ struct nf_flowtable {
struct delayed_work gc_work;
};
+enum flow_offload_tuple_dir {
+ FLOW_OFFLOAD_DIR_ORIGINAL,
+ FLOW_OFFLOAD_DIR_REPLY,
+ __FLOW_OFFLOAD_DIR_MAX = FLOW_OFFLOAD_DIR_REPLY,
+};
+#define FLOW_OFFLOAD_DIR_MAX (__FLOW_OFFLOAD_DIR_MAX + 1)
+
+struct flow_offload_tuple {
+ union {
+ struct in_addr src_v4;
+ struct in6_addr src_v6;
+ };
+ union {
+ struct in_addr dst_v4;
+ struct in6_addr dst_v6;
+ };
+ struct {
+ __be16 src_port;
+ __be16 dst_port;
+ };
+
+ int iifidx;
+
+ u8 l3proto;
+ u8 l4proto;
+ u8 dir;
+
+ int oifidx;
+
+ struct dst_entry *dst_cache;
+};
+
+struct flow_offload_tuple_rhash {
+ struct rhash_head node;
+ struct flow_offload_tuple tuple;
+};
+
+#define FLOW_OFFLOAD_SNAT 0x1
+#define FLOW_OFFLOAD_DNAT 0x2
+#define FLOW_OFFLOAD_DYING 0x4
+
+struct flow_offload {
+ struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX];
+ u32 flags;
+ union {
+ /* Your private driver data here. */
+ u32 timeout;
+ };
+};
+
+#define NF_FLOW_TIMEOUT (30 * HZ)
+
+struct nf_flow_route {
+ struct {
+ struct dst_entry *dst;
+ int ifindex;
+ } tuple[FLOW_OFFLOAD_DIR_MAX];
+};
+
+struct flow_offload *flow_offload_alloc(struct nf_conn *ct,
+ struct nf_flow_route *route);
+void flow_offload_free(struct flow_offload *flow);
+
+int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow);
+void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow);
+struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table,
+ struct flow_offload_tuple *tuple);
+int nf_flow_table_iterate(struct nf_flowtable *flow_table,
+ void (*iter)(struct flow_offload *flow, void *data),
+ void *data);
+void nf_flow_offload_work_gc(struct work_struct *work);
+extern const struct rhashtable_params nf_flow_offload_rhash_params;
+
+void flow_offload_dead(struct flow_offload *flow);
+
+int nf_flow_snat_port(const struct flow_offload *flow,
+ struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, enum flow_offload_tuple_dir dir);
+int nf_flow_dnat_port(const struct flow_offload *flow,
+ struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, enum flow_offload_tuple_dir dir);
+
+struct flow_ports {
+ __be16 source, dest;
+};
+
+#define MODULE_ALIAS_NF_FLOWTABLE(family) \
+ MODULE_ALIAS("nf-flowtable-" __stringify(family))
+
#endif /* _FLOW_OFFLOAD_H */
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -661,6 +661,13 @@ endif # NF_TABLES_NETDEV
endif # NF_TABLES
+config NF_FLOW_TABLE
+ tristate "Netfilter flow table module"
+ help
+ This option adds the flow table core infrastructure.
+
+ To compile it as a module, choose M here.
+
config NETFILTER_XTABLES
tristate "Netfilter Xtables support (required for ip_tables)"
default m if NETFILTER_ADVANCED=n
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -110,6 +110,9 @@ obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_
obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o
obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o
+# flow table infrastructure
+obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o
+
# generic X tables
obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
--- /dev/null
+++ b/net/netfilter/nf_flow_table.c
@@ -0,0 +1,429 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/netdevice.h>
+#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+
+struct flow_offload_entry {
+ struct flow_offload flow;
+ struct nf_conn *ct;
+ struct rcu_head rcu_head;
+};
+
+struct flow_offload *
+flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
+{
+ struct flow_offload_entry *entry;
+ struct flow_offload *flow;
+
+ if (unlikely(nf_ct_is_dying(ct) ||
+ !atomic_inc_not_zero(&ct->ct_general.use)))
+ return NULL;
+
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!entry)
+ goto err_ct_refcnt;
+
+ flow = &entry->flow;
+
+ if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst))
+ goto err_dst_cache_original;
+
+ if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst))
+ goto err_dst_cache_reply;
+
+ entry->ct = ct;
+
+ switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num) {
+ case NFPROTO_IPV4:
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4 =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4 =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4 =
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4 =
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in;
+ break;
+ case NFPROTO_IPV6:
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6 =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6 =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6 =
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in6;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6 =
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in6;
+ break;
+ }
+
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l3proto =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l3proto =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l4proto =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
+
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache =
+ route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache =
+ route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst;
+
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port =
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port =
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.tcp.port;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port =
+ ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
+
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dir =
+ FLOW_OFFLOAD_DIR_ORIGINAL;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dir =
+ FLOW_OFFLOAD_DIR_REPLY;
+
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx =
+ route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.oifidx =
+ route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx =
+ route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex;
+ flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.oifidx =
+ route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex;
+
+ if (ct->status & IPS_SRC_NAT)
+ flow->flags |= FLOW_OFFLOAD_SNAT;
+ else if (ct->status & IPS_DST_NAT)
+ flow->flags |= FLOW_OFFLOAD_DNAT;
+
+ return flow;
+
+err_dst_cache_reply:
+ dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
+err_dst_cache_original:
+ kfree(entry);
+err_ct_refcnt:
+ nf_ct_put(ct);
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(flow_offload_alloc);
+
+void flow_offload_free(struct flow_offload *flow)
+{
+ struct flow_offload_entry *e;
+
+ dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
+ dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
+ e = container_of(flow, struct flow_offload_entry, flow);
+ kfree(e);
+}
+EXPORT_SYMBOL_GPL(flow_offload_free);
+
+void flow_offload_dead(struct flow_offload *flow)
+{
+ flow->flags |= FLOW_OFFLOAD_DYING;
+}
+EXPORT_SYMBOL_GPL(flow_offload_dead);
+
+int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
+{
+ flow->timeout = (u32)jiffies;
+
+ rhashtable_insert_fast(&flow_table->rhashtable,
+ &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
+ *flow_table->type->params);
+ rhashtable_insert_fast(&flow_table->rhashtable,
+ &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
+ *flow_table->type->params);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(flow_offload_add);
+
+void flow_offload_del(struct nf_flowtable *flow_table,
+ struct flow_offload *flow)
+{
+ struct flow_offload_entry *e;
+
+ rhashtable_remove_fast(&flow_table->rhashtable,
+ &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
+ *flow_table->type->params);
+ rhashtable_remove_fast(&flow_table->rhashtable,
+ &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
+ *flow_table->type->params);
+
+ e = container_of(flow, struct flow_offload_entry, flow);
+ kfree_rcu(e, rcu_head);
+}
+EXPORT_SYMBOL_GPL(flow_offload_del);
+
+struct flow_offload_tuple_rhash *
+flow_offload_lookup(struct nf_flowtable *flow_table,
+ struct flow_offload_tuple *tuple)
+{
+ return rhashtable_lookup_fast(&flow_table->rhashtable, tuple,
+ *flow_table->type->params);
+}
+EXPORT_SYMBOL_GPL(flow_offload_lookup);
+
+static void nf_flow_release_ct(const struct flow_offload *flow)
+{
+ struct flow_offload_entry *e;
+
+ e = container_of(flow, struct flow_offload_entry, flow);
+ nf_ct_delete(e->ct, 0, 0);
+ nf_ct_put(e->ct);
+}
+
+int nf_flow_table_iterate(struct nf_flowtable *flow_table,
+ void (*iter)(struct flow_offload *flow, void *data),
+ void *data)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct rhashtable_iter hti;
+ struct flow_offload *flow;
+ int err;
+
+ err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
+ if (err)
+ return err;
+
+ rhashtable_walk_start(&hti);
+
+ while ((tuplehash = rhashtable_walk_next(&hti))) {
+ if (IS_ERR(tuplehash)) {
+ err = PTR_ERR(tuplehash);
+ if (err != -EAGAIN)
+ goto out;
+
+ continue;
+ }
+ if (tuplehash->tuple.dir)
+ continue;
+
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
+
+ iter(flow, data);
+ }
+out:
+ rhashtable_walk_stop(&hti);
+ rhashtable_walk_exit(&hti);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_iterate);
+
+static inline bool nf_flow_has_expired(const struct flow_offload *flow)
+{
+ return (__s32)(flow->timeout - (u32)jiffies) <= 0;
+}
+
+static inline bool nf_flow_is_dying(const struct flow_offload *flow)
+{
+ return flow->flags & FLOW_OFFLOAD_DYING;
+}
+
+void nf_flow_offload_work_gc(struct work_struct *work)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct nf_flowtable *flow_table;
+ struct rhashtable_iter hti;
+ struct flow_offload *flow;
+ int err;
+
+ flow_table = container_of(work, struct nf_flowtable, gc_work.work);
+
+ err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
+ if (err)
+ goto schedule;
+
+ rhashtable_walk_start(&hti);
+
+ while ((tuplehash = rhashtable_walk_next(&hti))) {
+ if (IS_ERR(tuplehash)) {
+ err = PTR_ERR(tuplehash);
+ if (err != -EAGAIN)
+ goto out;
+
+ continue;
+ }
+ if (tuplehash->tuple.dir)
+ continue;
+
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
+
+ if (nf_flow_has_expired(flow) ||
+ nf_flow_is_dying(flow)) {
+ flow_offload_del(flow_table, flow);
+ nf_flow_release_ct(flow);
+ }
+ }
+out:
+ rhashtable_walk_stop(&hti);
+ rhashtable_walk_exit(&hti);
+schedule:
+ queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
+}
+EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc);
+
+static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
+{
+ const struct flow_offload_tuple *tuple = data;
+
+ return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
+}
+
+static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
+{
+ const struct flow_offload_tuple_rhash *tuplehash = data;
+
+ return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
+}
+
+static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct flow_offload_tuple *tuple = arg->key;
+ const struct flow_offload_tuple_rhash *x = ptr;
+
+ if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
+ return 1;
+
+ return 0;
+}
+
+const struct rhashtable_params nf_flow_offload_rhash_params = {
+ .head_offset = offsetof(struct flow_offload_tuple_rhash, node),
+ .hashfn = flow_offload_hash,
+ .obj_hashfn = flow_offload_hash_obj,
+ .obj_cmpfn = flow_offload_hash_cmp,
+ .automatic_shrinking = true,
+};
+EXPORT_SYMBOL_GPL(nf_flow_offload_rhash_params);
+
+static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
+ __be16 port, __be16 new_port)
+{
+ struct tcphdr *tcph;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*tcph)))
+ return -1;
+
+ tcph = (void *)(skb_network_header(skb) + thoff);
+ inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true);
+
+ return 0;
+}
+
+static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
+ __be16 port, __be16 new_port)
+{
+ struct udphdr *udph;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*udph)))
+ return -1;
+
+ udph = (void *)(skb_network_header(skb) + thoff);
+ if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+ inet_proto_csum_replace2(&udph->check, skb, port,
+ new_port, true);
+ if (!udph->check)
+ udph->check = CSUM_MANGLED_0;
+ }
+
+ return 0;
+}
+
+static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, __be16 port, __be16 new_port)
+{
+ switch (protocol) {
+ case IPPROTO_TCP:
+ if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0)
+ return NF_DROP;
+ break;
+ case IPPROTO_UDP:
+ if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0)
+ return NF_DROP;
+ break;
+ }
+
+ return 0;
+}
+
+int nf_flow_snat_port(const struct flow_offload *flow,
+ struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, enum flow_offload_tuple_dir dir)
+{
+ struct flow_ports *hdr;
+ __be16 port, new_port;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*hdr)))
+ return -1;
+
+ hdr = (void *)(skb_network_header(skb) + thoff);
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ port = hdr->source;
+ new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
+ hdr->source = new_port;
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ port = hdr->dest;
+ new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
+ hdr->dest = new_port;
+ break;
+ default:
+ return -1;
+ }
+
+ return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
+}
+EXPORT_SYMBOL_GPL(nf_flow_snat_port);
+
+int nf_flow_dnat_port(const struct flow_offload *flow,
+ struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, enum flow_offload_tuple_dir dir)
+{
+ struct flow_ports *hdr;
+ __be16 port, new_port;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*hdr)))
+ return -1;
+
+ hdr = (void *)(skb_network_header(skb) + thoff);
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ port = hdr->dest;
+ new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port;
+ hdr->dest = new_port;
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ port = hdr->source;
+ new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
+ hdr->source = new_port;
+ break;
+ default:
+ return -1;
+ }
+
+ return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
+}
+EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");

View file

@ -0,0 +1,334 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 7 Jan 2018 01:04:15 +0100
Subject: [PATCH] netfilter: flow table support for IPv4
This patch adds the IPv4 flow table type, that implements the datapath
flow table to forward IPv4 traffic. Rationale is:
1) Look up for the packet in the flow table, from the ingress hook.
2) If there's a hit, decrement ttl and pass it on to the neighbour layer
for transmission.
3) If there's a miss, packet is passed up to the classic forwarding
path.
This patch also supports layer 3 source and destination NAT.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
create mode 100644 net/ipv4/netfilter/nf_flow_table_ipv4.c
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -77,6 +77,14 @@ config NF_TABLES_ARP
endif # NF_TABLES
+config NF_FLOW_TABLE_IPV4
+ select NF_FLOW_TABLE
+ tristate "Netfilter flow table IPv4 module"
+ help
+ This option adds the flow table IPv4 support.
+
+ To compile it as a module, choose M here.
+
config NF_DUP_IPV4
tristate "Netfilter IPv4 packet duplication to alternate destination"
depends on !NF_CONNTRACK || NF_CONNTRACK
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -43,6 +43,9 @@ obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redi
obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o
+# flow table support
+obj-$(CONFIG_NF_FLOW_TABLE_IPV4) += nf_flow_table_ipv4.o
+
# generic IP tables
obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
--- /dev/null
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -0,0 +1,283 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/neighbour.h>
+#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_tables.h>
+/* For layer 4 checksum field offset. */
+#include <linux/tcp.h>
+#include <linux/udp.h>
+
+static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
+ __be32 addr, __be32 new_addr)
+{
+ struct tcphdr *tcph;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*tcph)))
+ return -1;
+
+ tcph = (void *)(skb_network_header(skb) + thoff);
+ inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
+
+ return 0;
+}
+
+static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
+ __be32 addr, __be32 new_addr)
+{
+ struct udphdr *udph;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*udph)))
+ return -1;
+
+ udph = (void *)(skb_network_header(skb) + thoff);
+ if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+ inet_proto_csum_replace4(&udph->check, skb, addr,
+ new_addr, true);
+ if (!udph->check)
+ udph->check = CSUM_MANGLED_0;
+ }
+
+ return 0;
+}
+
+static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
+ unsigned int thoff, __be32 addr,
+ __be32 new_addr)
+{
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0)
+ return NF_DROP;
+ break;
+ case IPPROTO_UDP:
+ if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0)
+ return NF_DROP;
+ break;
+ }
+
+ return 0;
+}
+
+static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+ struct iphdr *iph, unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
+{
+ __be32 addr, new_addr;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = iph->saddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
+ iph->saddr = new_addr;
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = iph->daddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
+ iph->daddr = new_addr;
+ break;
+ default:
+ return -1;
+ }
+ csum_replace4(&iph->check, addr, new_addr);
+
+ return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+}
+
+static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+ struct iphdr *iph, unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
+{
+ __be32 addr, new_addr;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = iph->daddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr;
+ iph->daddr = new_addr;
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = iph->saddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
+ iph->saddr = new_addr;
+ break;
+ default:
+ return -1;
+ }
+
+ return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+}
+
+static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+ enum flow_offload_tuple_dir dir)
+{
+ struct iphdr *iph = ip_hdr(skb);
+ unsigned int thoff = iph->ihl * 4;
+
+ if (flow->flags & FLOW_OFFLOAD_SNAT &&
+ (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
+ nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0))
+ return -1;
+ if (flow->flags & FLOW_OFFLOAD_DNAT &&
+ (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
+ nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0))
+ return -1;
+
+ return 0;
+}
+
+static bool ip_has_options(unsigned int thoff)
+{
+ return thoff != sizeof(struct iphdr);
+}
+
+static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
+ struct flow_offload_tuple *tuple)
+{
+ struct flow_ports *ports;
+ unsigned int thoff;
+ struct iphdr *iph;
+
+ if (!pskb_may_pull(skb, sizeof(*iph)))
+ return -1;
+
+ iph = ip_hdr(skb);
+ thoff = iph->ihl * 4;
+
+ if (ip_is_fragment(iph) ||
+ unlikely(ip_has_options(thoff)))
+ return -1;
+
+ if (iph->protocol != IPPROTO_TCP &&
+ iph->protocol != IPPROTO_UDP)
+ return -1;
+
+ thoff = iph->ihl * 4;
+ if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+ return -1;
+
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+
+ tuple->src_v4.s_addr = iph->saddr;
+ tuple->dst_v4.s_addr = iph->daddr;
+ tuple->src_port = ports->source;
+ tuple->dst_port = ports->dest;
+ tuple->l3proto = AF_INET;
+ tuple->l4proto = iph->protocol;
+ tuple->iifidx = dev->ifindex;
+
+ return 0;
+}
+
+/* Based on ip_exceeds_mtu(). */
+static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+ if (skb->len <= mtu)
+ return false;
+
+ if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)
+ return false;
+
+ if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
+ return false;
+
+ return true;
+}
+
+static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rtable *rt)
+{
+ u32 mtu;
+
+ mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
+ if (__nf_flow_exceeds_mtu(skb, mtu))
+ return true;
+
+ return false;
+}
+
+static unsigned int
+nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct nf_flowtable *flow_table = priv;
+ struct flow_offload_tuple tuple = {};
+ enum flow_offload_tuple_dir dir;
+ struct flow_offload *flow;
+ struct net_device *outdev;
+ const struct rtable *rt;
+ struct iphdr *iph;
+ __be32 nexthop;
+
+ if (skb->protocol != htons(ETH_P_IP))
+ return NF_ACCEPT;
+
+ if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
+ return NF_ACCEPT;
+
+ tuplehash = flow_offload_lookup(flow_table, &tuple);
+ if (tuplehash == NULL)
+ return NF_ACCEPT;
+
+ outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
+ if (!outdev)
+ return NF_ACCEPT;
+
+ dir = tuplehash->tuple.dir;
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+
+ rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
+ if (unlikely(nf_flow_exceeds_mtu(skb, rt)))
+ return NF_ACCEPT;
+
+ if (skb_try_make_writable(skb, sizeof(*iph)))
+ return NF_DROP;
+
+ if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
+ nf_flow_nat_ip(flow, skb, dir) < 0)
+ return NF_DROP;
+
+ flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
+ iph = ip_hdr(skb);
+ ip_decrease_ttl(iph);
+
+ skb->dev = outdev;
+ nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
+ neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
+
+ return NF_STOLEN;
+}
+
+static struct nf_flowtable_type flowtable_ipv4 = {
+ .family = NFPROTO_IPV4,
+ .params = &nf_flow_offload_rhash_params,
+ .gc = nf_flow_offload_work_gc,
+ .hook = nf_flow_offload_ip_hook,
+ .owner = THIS_MODULE,
+};
+
+static int __init nf_flow_ipv4_module_init(void)
+{
+ nft_register_flowtable_type(&flowtable_ipv4);
+
+ return 0;
+}
+
+static void __exit nf_flow_ipv4_module_exit(void)
+{
+ nft_unregister_flowtable_type(&flowtable_ipv4);
+}
+
+module_init(nf_flow_ipv4_module_init);
+module_exit(nf_flow_ipv4_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NF_FLOWTABLE(AF_INET);

View file

@ -0,0 +1,354 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 7 Jan 2018 01:04:19 +0100
Subject: [PATCH] netfilter: flow table support for IPv6
This patch adds the IPv6 flow table type, that implements the datapath
flow table to forward IPv6 traffic.
This patch exports ip6_dst_mtu_forward() that is required to check for
mtu to pass up packets that need PMTUD handling to the classic
forwarding path.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
create mode 100644 net/ipv6/netfilter/nf_flow_table_ipv6.c
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -913,6 +913,8 @@ static inline struct sk_buff *ip6_finish
&inet6_sk(sk)->cork);
}
+unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst);
+
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
struct flowi6 *fl6);
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -370,7 +370,7 @@ static inline int ip6_forward_finish(str
return dst_output(net, sk, skb);
}
-static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
+unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
unsigned int mtu;
struct inet6_dev *idev;
@@ -390,6 +390,7 @@ static unsigned int ip6_dst_mtu_forward(
return mtu;
}
+EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward);
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -71,6 +71,14 @@ config NFT_FIB_IPV6
endif # NF_TABLES_IPV6
endif # NF_TABLES
+config NF_FLOW_TABLE_IPV6
+ select NF_FLOW_TABLE
+ tristate "Netfilter flow table IPv6 module"
+ help
+ This option adds the flow table IPv6 support.
+
+ To compile it as a module, choose M here.
+
config NF_DUP_IPV6
tristate "Netfilter IPv6 packet duplication to alternate destination"
depends on !NF_CONNTRACK || NF_CONNTRACK
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -45,6 +45,9 @@ obj-$(CONFIG_NFT_REDIR_IPV6) += nft_redi
obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o
obj-$(CONFIG_NFT_FIB_IPV6) += nft_fib_ipv6.o
+# flow table support
+obj-$(CONFIG_NF_FLOW_TABLE_IPV6) += nf_flow_table_ipv6.o
+
# matches
obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o
--- /dev/null
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -0,0 +1,277 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/ipv6.h>
+#include <linux/netdevice.h>
+#include <linux/ipv6.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/neighbour.h>
+#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_tables.h>
+/* For layer 4 checksum field offset. */
+#include <linux/tcp.h>
+#include <linux/udp.h>
+
+static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
+ struct in6_addr *addr,
+ struct in6_addr *new_addr)
+{
+ struct tcphdr *tcph;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*tcph)))
+ return -1;
+
+ tcph = (void *)(skb_network_header(skb) + thoff);
+ inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32,
+ new_addr->s6_addr32, true);
+
+ return 0;
+}
+
+static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
+ struct in6_addr *addr,
+ struct in6_addr *new_addr)
+{
+ struct udphdr *udph;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*udph)))
+ return -1;
+
+ udph = (void *)(skb_network_header(skb) + thoff);
+ if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+ inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32,
+ new_addr->s6_addr32, true);
+ if (!udph->check)
+ udph->check = CSUM_MANGLED_0;
+ }
+
+ return 0;
+}
+
+static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff, struct in6_addr *addr,
+ struct in6_addr *new_addr)
+{
+ switch (ip6h->nexthdr) {
+ case IPPROTO_TCP:
+ if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0)
+ return NF_DROP;
+ break;
+ case IPPROTO_UDP:
+ if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0)
+ return NF_DROP;
+ break;
+ }
+
+ return 0;
+}
+
+static int nf_flow_snat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
+{
+ struct in6_addr addr, new_addr;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = ip6h->saddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6;
+ ip6h->saddr = new_addr;
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = ip6h->daddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6;
+ ip6h->daddr = new_addr;
+ break;
+ default:
+ return -1;
+ }
+
+ return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+}
+
+static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
+{
+ struct in6_addr addr, new_addr;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = ip6h->daddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6;
+ ip6h->daddr = new_addr;
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = ip6h->saddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6;
+ ip6h->saddr = new_addr;
+ break;
+ default:
+ return -1;
+ }
+
+ return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+}
+
+static int nf_flow_nat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb,
+ enum flow_offload_tuple_dir dir)
+{
+ struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ unsigned int thoff = sizeof(*ip6h);
+
+ if (flow->flags & FLOW_OFFLOAD_SNAT &&
+ (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
+ nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
+ return -1;
+ if (flow->flags & FLOW_OFFLOAD_DNAT &&
+ (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
+ nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
+ return -1;
+
+ return 0;
+}
+
+static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
+ struct flow_offload_tuple *tuple)
+{
+ struct flow_ports *ports;
+ struct ipv6hdr *ip6h;
+ unsigned int thoff;
+
+ if (!pskb_may_pull(skb, sizeof(*ip6h)))
+ return -1;
+
+ ip6h = ipv6_hdr(skb);
+
+ if (ip6h->nexthdr != IPPROTO_TCP &&
+ ip6h->nexthdr != IPPROTO_UDP)
+ return -1;
+
+ thoff = sizeof(*ip6h);
+ if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+ return -1;
+
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+
+ tuple->src_v6 = ip6h->saddr;
+ tuple->dst_v6 = ip6h->daddr;
+ tuple->src_port = ports->source;
+ tuple->dst_port = ports->dest;
+ tuple->l3proto = AF_INET6;
+ tuple->l4proto = ip6h->nexthdr;
+ tuple->iifidx = dev->ifindex;
+
+ return 0;
+}
+
+/* Based on ip_exceeds_mtu(). */
+static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+ if (skb->len <= mtu)
+ return false;
+
+ if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
+ return false;
+
+ return true;
+}
+
+static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rt6_info *rt)
+{
+ u32 mtu;
+
+ mtu = ip6_dst_mtu_forward(&rt->dst);
+ if (__nf_flow_exceeds_mtu(skb, mtu))
+ return true;
+
+ return false;
+}
+
+static unsigned int
+nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct nf_flowtable *flow_table = priv;
+ struct flow_offload_tuple tuple = {};
+ enum flow_offload_tuple_dir dir;
+ struct flow_offload *flow;
+ struct net_device *outdev;
+ struct in6_addr *nexthop;
+ struct ipv6hdr *ip6h;
+ struct rt6_info *rt;
+
+ if (skb->protocol != htons(ETH_P_IPV6))
+ return NF_ACCEPT;
+
+ if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0)
+ return NF_ACCEPT;
+
+ tuplehash = flow_offload_lookup(flow_table, &tuple);
+ if (tuplehash == NULL)
+ return NF_ACCEPT;
+
+ outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
+ if (!outdev)
+ return NF_ACCEPT;
+
+ dir = tuplehash->tuple.dir;
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+
+ rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
+ if (unlikely(nf_flow_exceeds_mtu(skb, rt)))
+ return NF_ACCEPT;
+
+ if (skb_try_make_writable(skb, sizeof(*ip6h)))
+ return NF_DROP;
+
+ if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
+ nf_flow_nat_ipv6(flow, skb, dir) < 0)
+ return NF_DROP;
+
+ flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
+ ip6h = ipv6_hdr(skb);
+ ip6h->hop_limit--;
+
+ skb->dev = outdev;
+ nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
+ neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
+
+ return NF_STOLEN;
+}
+
+static struct nf_flowtable_type flowtable_ipv6 = {
+ .family = NFPROTO_IPV6,
+ .params = &nf_flow_offload_rhash_params,
+ .gc = nf_flow_offload_work_gc,
+ .hook = nf_flow_offload_ipv6_hook,
+ .owner = THIS_MODULE,
+};
+
+static int __init nf_flow_ipv6_module_init(void)
+{
+ nft_register_flowtable_type(&flowtable_ipv6);
+
+ return 0;
+}
+
+static void __exit nf_flow_ipv6_module_exit(void)
+{
+ nft_unregister_flowtable_type(&flowtable_ipv6);
+}
+
+module_init(nf_flow_ipv6_module_init);
+module_exit(nf_flow_ipv6_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NF_FLOWTABLE(AF_INET6);

View file

@ -0,0 +1,141 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 7 Jan 2018 01:04:22 +0100
Subject: [PATCH] netfilter: flow table support for the mixed IPv4/IPv6 family
This patch adds the IPv6 flow table type, that implements the datapath
flow table to forward IPv6 traffic.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
create mode 100644 net/netfilter/nf_flow_table_inet.c
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -111,6 +111,11 @@ struct flow_ports {
__be16 source, dest;
};
+unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state);
+unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state);
+
#define MODULE_ALIAS_NF_FLOWTABLE(family) \
MODULE_ALIAS("nf-flowtable-" __stringify(family))
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -202,7 +202,7 @@ static bool nf_flow_exceeds_mtu(struct s
return false;
}
-static unsigned int
+unsigned int
nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -254,6 +254,7 @@ nf_flow_offload_ip_hook(void *priv, stru
return NF_STOLEN;
}
+EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
static struct nf_flowtable_type flowtable_ipv4 = {
.family = NFPROTO_IPV4,
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -196,7 +196,7 @@ static bool nf_flow_exceeds_mtu(struct s
return false;
}
-static unsigned int
+unsigned int
nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
@@ -248,6 +248,7 @@ nf_flow_offload_ipv6_hook(void *priv, st
return NF_STOLEN;
}
+EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
static struct nf_flowtable_type flowtable_ipv6 = {
.family = NFPROTO_IPV6,
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -661,6 +661,14 @@ endif # NF_TABLES_NETDEV
endif # NF_TABLES
+config NF_FLOW_TABLE_INET
+ select NF_FLOW_TABLE
+ tristate "Netfilter flow table mixed IPv4/IPv6 module"
+ help
+ This option adds the flow table mixed IPv4/IPv6 support.
+
+ To compile it as a module, choose M here.
+
config NF_FLOW_TABLE
tristate "Netfilter flow table module"
help
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -112,6 +112,7 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_
# flow table infrastructure
obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o
+obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
# generic X tables
obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
--- /dev/null
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -0,0 +1,48 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_tables.h>
+
+static unsigned int
+nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ return nf_flow_offload_ip_hook(priv, skb, state);
+ case htons(ETH_P_IPV6):
+ return nf_flow_offload_ipv6_hook(priv, skb, state);
+ }
+
+ return NF_ACCEPT;
+}
+
+static struct nf_flowtable_type flowtable_inet = {
+ .family = NFPROTO_INET,
+ .params = &nf_flow_offload_rhash_params,
+ .gc = nf_flow_offload_work_gc,
+ .hook = nf_flow_offload_inet_hook,
+ .owner = THIS_MODULE,
+};
+
+static int __init nf_flow_inet_module_init(void)
+{
+ nft_register_flowtable_type(&flowtable_inet);
+
+ return 0;
+}
+
+static void __exit nf_flow_inet_module_exit(void)
+{
+ nft_unregister_flowtable_type(&flowtable_inet);
+}
+
+module_init(nf_flow_inet_module_init);
+module_exit(nf_flow_inet_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NF_FLOWTABLE(1); /* NFPROTO_INET */

View file

@ -0,0 +1,332 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 7 Jan 2018 01:04:26 +0100
Subject: [PATCH] netfilter: nf_tables: flow offload expression
Add new instruction for the nf_tables VM that allows us to specify what
flows are offloaded into a given flow table via name. This new
instruction creates the flow entry and adds it to the flow table.
Only established flows, ie. we have seen traffic in both directions, are
added to the flow table. You can still decide to offload entries at a
later stage via packet counting or checking the ct status in case you
want to offload assured conntracks.
This new extension depends on the conntrack subsystem.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
create mode 100644 net/netfilter/nft_flow_offload.c
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -957,6 +957,17 @@ enum nft_ct_attributes {
};
#define NFTA_CT_MAX (__NFTA_CT_MAX - 1)
+/**
+ * enum nft_flow_attributes - ct offload expression attributes
+ * @NFTA_FLOW_TABLE_NAME: flow table name (NLA_STRING)
+ */
+enum nft_offload_attributes {
+ NFTA_FLOW_UNSPEC,
+ NFTA_FLOW_TABLE_NAME,
+ __NFTA_FLOW_MAX,
+};
+#define NFTA_FLOW_MAX (__NFTA_FLOW_MAX - 1)
+
enum nft_limit_type {
NFT_LIMIT_PKTS,
NFT_LIMIT_PKT_BYTES
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -509,6 +509,13 @@ config NFT_CT
This option adds the "ct" expression that you can use to match
connection tracking information such as the flow state.
+config NFT_FLOW_OFFLOAD
+ depends on NF_CONNTRACK
+ tristate "Netfilter nf_tables hardware flow offload module"
+ help
+ This option adds the "flow_offload" expression that you can use to
+ choose what flows are placed into the hardware.
+
config NFT_SET_RBTREE
tristate "Netfilter nf_tables rbtree set module"
help
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -87,6 +87,7 @@ obj-$(CONFIG_NFT_META) += nft_meta.o
obj-$(CONFIG_NFT_RT) += nft_rt.o
obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o
obj-$(CONFIG_NFT_CT) += nft_ct.o
+obj-$(CONFIG_NFT_FLOW_OFFLOAD) += nft_flow_offload.o
obj-$(CONFIG_NFT_LIMIT) += nft_limit.o
obj-$(CONFIG_NFT_NAT) += nft_nat.o
obj-$(CONFIG_NFT_OBJREF) += nft_objref.o
--- /dev/null
+++ b/net/netfilter/nft_flow_offload.c
@@ -0,0 +1,264 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/ip.h> /* for ipv4 options. */
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <linux/netfilter/nf_conntrack_common.h>
+#include <net/netfilter/nf_flow_table.h>
+
+struct nft_flow_offload {
+ struct nft_flowtable *flowtable;
+};
+
+static int nft_flow_route(const struct nft_pktinfo *pkt,
+ const struct nf_conn *ct,
+ struct nf_flow_route *route,
+ enum ip_conntrack_dir dir)
+{
+ struct dst_entry *this_dst = skb_dst(pkt->skb);
+ struct dst_entry *other_dst = NULL;
+ struct flowi fl;
+
+ memset(&fl, 0, sizeof(fl));
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ fl.u.ip4.daddr = ct->tuplehash[!dir].tuple.dst.u3.ip;
+ break;
+ case NFPROTO_IPV6:
+ fl.u.ip6.daddr = ct->tuplehash[!dir].tuple.dst.u3.in6;
+ break;
+ }
+
+ nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt));
+ if (!other_dst)
+ return -ENOENT;
+
+ route->tuple[dir].dst = this_dst;
+ route->tuple[dir].ifindex = nft_in(pkt)->ifindex;
+ route->tuple[!dir].dst = other_dst;
+ route->tuple[!dir].ifindex = nft_out(pkt)->ifindex;
+
+ return 0;
+}
+
+static bool nft_flow_offload_skip(struct sk_buff *skb)
+{
+ struct ip_options *opt = &(IPCB(skb)->opt);
+
+ if (unlikely(opt->optlen))
+ return true;
+ if (skb_sec_path(skb))
+ return true;
+
+ return false;
+}
+
+static void nft_flow_offload_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_flow_offload *priv = nft_expr_priv(expr);
+ struct nf_flowtable *flowtable = &priv->flowtable->data;
+ enum ip_conntrack_info ctinfo;
+ struct nf_flow_route route;
+ struct flow_offload *flow;
+ enum ip_conntrack_dir dir;
+ struct nf_conn *ct;
+ int ret;
+
+ if (nft_flow_offload_skip(pkt->skb))
+ goto out;
+
+ ct = nf_ct_get(pkt->skb, &ctinfo);
+ if (!ct)
+ goto out;
+
+ switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ break;
+ default:
+ goto out;
+ }
+
+ if (test_bit(IPS_HELPER_BIT, &ct->status))
+ goto out;
+
+ if (ctinfo == IP_CT_NEW ||
+ ctinfo == IP_CT_RELATED)
+ goto out;
+
+ if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
+ goto out;
+
+ dir = CTINFO2DIR(ctinfo);
+ if (nft_flow_route(pkt, ct, &route, dir) < 0)
+ goto err_flow_route;
+
+ flow = flow_offload_alloc(ct, &route);
+ if (!flow)
+ goto err_flow_alloc;
+
+ ret = flow_offload_add(flowtable, flow);
+ if (ret < 0)
+ goto err_flow_add;
+
+ return;
+
+err_flow_add:
+ flow_offload_free(flow);
+err_flow_alloc:
+ dst_release(route.tuple[!dir].dst);
+err_flow_route:
+ clear_bit(IPS_OFFLOAD_BIT, &ct->status);
+out:
+ regs->verdict.code = NFT_BREAK;
+}
+
+static int nft_flow_offload_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ unsigned int hook_mask = (1 << NF_INET_FORWARD);
+
+ return nft_chain_validate_hooks(ctx->chain, hook_mask);
+}
+
+static int nft_flow_offload_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_flow_offload *priv = nft_expr_priv(expr);
+ u8 genmask = nft_genmask_next(ctx->net);
+ struct nft_flowtable *flowtable;
+
+ if (!tb[NFTA_FLOW_TABLE_NAME])
+ return -EINVAL;
+
+ flowtable = nf_tables_flowtable_lookup(ctx->table,
+ tb[NFTA_FLOW_TABLE_NAME],
+ genmask);
+ if (IS_ERR(flowtable))
+ return PTR_ERR(flowtable);
+
+ priv->flowtable = flowtable;
+ flowtable->use++;
+
+ return nf_ct_netns_get(ctx->net, ctx->afi->family);
+}
+
+static void nft_flow_offload_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_flow_offload *priv = nft_expr_priv(expr);
+
+ priv->flowtable->use--;
+ nf_ct_netns_put(ctx->net, ctx->afi->family);
+}
+
+static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ struct nft_flow_offload *priv = nft_expr_priv(expr);
+
+ if (nla_put_string(skb, NFTA_FLOW_TABLE_NAME, priv->flowtable->name))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_flow_offload_type;
+static const struct nft_expr_ops nft_flow_offload_ops = {
+ .type = &nft_flow_offload_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)),
+ .eval = nft_flow_offload_eval,
+ .init = nft_flow_offload_init,
+ .destroy = nft_flow_offload_destroy,
+ .validate = nft_flow_offload_validate,
+ .dump = nft_flow_offload_dump,
+};
+
+static struct nft_expr_type nft_flow_offload_type __read_mostly = {
+ .name = "flow_offload",
+ .ops = &nft_flow_offload_ops,
+ .maxattr = NFTA_FLOW_MAX,
+ .owner = THIS_MODULE,
+};
+
+static void flow_offload_iterate_cleanup(struct flow_offload *flow, void *data)
+{
+ struct net_device *dev = data;
+
+ if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex)
+ return;
+
+ flow_offload_dead(flow);
+}
+
+static void nft_flow_offload_iterate_cleanup(struct nf_flowtable *flowtable,
+ void *data)
+{
+ nf_flow_table_iterate(flowtable, flow_offload_iterate_cleanup, data);
+}
+
+static int flow_offload_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (event != NETDEV_DOWN)
+ return NOTIFY_DONE;
+
+ nft_flow_table_iterate(dev_net(dev), nft_flow_offload_iterate_cleanup, dev);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block flow_offload_netdev_notifier = {
+ .notifier_call = flow_offload_netdev_event,
+};
+
+static int __init nft_flow_offload_module_init(void)
+{
+ int err;
+
+ register_netdevice_notifier(&flow_offload_netdev_notifier);
+
+ err = nft_register_expr(&nft_flow_offload_type);
+ if (err < 0)
+ goto register_expr;
+
+ return 0;
+
+register_expr:
+ unregister_netdevice_notifier(&flow_offload_netdev_notifier);
+ return err;
+}
+
+static void __exit nft_flow_offload_module_exit(void)
+{
+ struct net *net;
+
+ nft_unregister_expr(&nft_flow_offload_type);
+ unregister_netdevice_notifier(&flow_offload_netdev_notifier);
+ rtnl_lock();
+ for_each_net(net)
+ nft_flow_table_iterate(net, nft_flow_offload_iterate_cleanup, NULL);
+ rtnl_unlock();
+}
+
+module_init(nft_flow_offload_module_init);
+module_exit(nft_flow_offload_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_EXPR("flow_offload");

View file

@ -0,0 +1,113 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 19 Dec 2017 13:53:45 +0100
Subject: [PATCH] netfilter: nf_tables: remove nhooks field from struct
nft_af_info
We already validate the hook through bitmask, so this check is
superfluous. When removing this, this patch is also fixing a bug in the
new flowtable codebase, since ctx->afi points to the table family
instead of the netdev family which is where the flowtable is really
hooked in.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -963,7 +963,6 @@ enum nft_af_flags {
*
* @list: used internally
* @family: address family
- * @nhooks: number of hooks in this family
* @owner: module owner
* @tables: used internally
* @flags: family flags
@@ -971,7 +970,6 @@ enum nft_af_flags {
struct nft_af_info {
struct list_head list;
int family;
- unsigned int nhooks;
struct module *owner;
struct list_head tables;
u32 flags;
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -44,7 +44,6 @@ nft_do_chain_bridge(void *priv,
static struct nft_af_info nft_af_bridge __read_mostly = {
.family = NFPROTO_BRIDGE,
- .nhooks = NF_BR_NUMHOOKS,
.owner = THIS_MODULE,
};
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -29,7 +29,6 @@ nft_do_chain_arp(void *priv,
static struct nft_af_info nft_af_arp __read_mostly = {
.family = NFPROTO_ARP,
- .nhooks = NF_ARP_NUMHOOKS,
.owner = THIS_MODULE,
};
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -32,7 +32,6 @@ static unsigned int nft_do_chain_ipv4(vo
static struct nft_af_info nft_af_ipv4 __read_mostly = {
.family = NFPROTO_IPV4,
- .nhooks = NF_INET_NUMHOOKS,
.owner = THIS_MODULE,
};
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -30,7 +30,6 @@ static unsigned int nft_do_chain_ipv6(vo
static struct nft_af_info nft_af_ipv6 __read_mostly = {
.family = NFPROTO_IPV6,
- .nhooks = NF_INET_NUMHOOKS,
.owner = THIS_MODULE,
};
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1328,9 +1328,6 @@ static int nft_chain_parse_hook(struct n
return -EINVAL;
hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
- if (hook->num >= afi->nhooks)
- return -EINVAL;
-
hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
type = chain_type[afi->family][NFT_CHAIN_T_DEFAULT];
@@ -4917,7 +4914,7 @@ static int nf_tables_flowtable_parse_hoo
return -EINVAL;
hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM]));
- if (hooknum >= ctx->afi->nhooks)
+ if (hooknum != NF_NETDEV_INGRESS)
return -EINVAL;
priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY]));
--- a/net/netfilter/nf_tables_inet.c
+++ b/net/netfilter/nf_tables_inet.c
@@ -40,7 +40,6 @@ static unsigned int nft_do_chain_inet(vo
static struct nft_af_info nft_af_inet __read_mostly = {
.family = NFPROTO_INET,
- .nhooks = NF_INET_NUMHOOKS,
.owner = THIS_MODULE,
};
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -40,7 +40,6 @@ nft_do_chain_netdev(void *priv, struct s
static struct nft_af_info nft_af_netdev __read_mostly = {
.family = NFPROTO_NETDEV,
- .nhooks = NF_NETDEV_NUMHOOKS,
.owner = THIS_MODULE,
.flags = NFT_AF_NEEDS_DEV,
};

View file

@ -0,0 +1,22 @@
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Wed, 10 Jan 2018 07:04:54 +0000
Subject: [PATCH] netfilter: nf_tables: fix a typo in nf_tables_getflowtable()
Fix a typo, we should check 'flowtable' instead of 'table'.
Fixes: 3b49e2e94e6e ("netfilter: nf_tables: add flow table netlink frontend")
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5341,7 +5341,7 @@ static int nf_tables_getflowtable(struct
flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
genmask);
- if (IS_ERR(table))
+ if (IS_ERR(flowtable))
return PTR_ERR(flowtable);
skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);

View file

@ -0,0 +1,106 @@
From: Arnd Bergmann <arnd@arndb.de>
Date: Wed, 10 Jan 2018 18:10:59 +0100
Subject: [PATCH] netfilter: improve flow table Kconfig dependencies
The newly added NF_FLOW_TABLE options cause some build failures in
randconfig kernels:
- when CONFIG_NF_CONNTRACK is disabled, or is a loadable module but
NF_FLOW_TABLE is built-in:
In file included from net/netfilter/nf_flow_table.c:8:0:
include/net/netfilter/nf_conntrack.h:59:22: error: field 'ct_general' has incomplete type
struct nf_conntrack ct_general;
include/net/netfilter/nf_conntrack.h: In function 'nf_ct_get':
include/net/netfilter/nf_conntrack.h:148:15: error: 'const struct sk_buff' has no member named '_nfct'
include/net/netfilter/nf_conntrack.h: In function 'nf_ct_put':
include/net/netfilter/nf_conntrack.h:157:2: error: implicit declaration of function 'nf_conntrack_put'; did you mean 'nf_ct_put'? [-Werror=implicit-function-declaration]
net/netfilter/nf_flow_table.o: In function `nf_flow_offload_work_gc':
(.text+0x1540): undefined reference to `nf_ct_delete'
- when CONFIG_NF_TABLES is disabled:
In file included from net/ipv6/netfilter/nf_flow_table_ipv6.c:13:0:
include/net/netfilter/nf_tables.h: In function 'nft_gencursor_next':
include/net/netfilter/nf_tables.h:1189:14: error: 'const struct net' has no member named 'nft'; did you mean 'nf'?
- when CONFIG_NF_FLOW_TABLE_INET is enabled, but NF_FLOW_TABLE_IPV4
or NF_FLOW_TABLE_IPV6 are not, or are loadable modules
net/netfilter/nf_flow_table_inet.o: In function `nf_flow_offload_inet_hook':
nf_flow_table_inet.c:(.text+0x94): undefined reference to `nf_flow_offload_ipv6_hook'
nf_flow_table_inet.c:(.text+0x40): undefined reference to `nf_flow_offload_ip_hook'
- when CONFIG_NF_FLOW_TABLES is disabled, but the other options are
enabled:
net/netfilter/nf_flow_table_inet.o: In function `nf_flow_offload_inet_hook':
nf_flow_table_inet.c:(.text+0x6c): undefined reference to `nf_flow_offload_ipv6_hook'
net/netfilter/nf_flow_table_inet.o: In function `nf_flow_inet_module_exit':
nf_flow_table_inet.c:(.exit.text+0x8): undefined reference to `nft_unregister_flowtable_type'
net/netfilter/nf_flow_table_inet.o: In function `nf_flow_inet_module_init':
nf_flow_table_inet.c:(.init.text+0x8): undefined reference to `nft_register_flowtable_type'
net/ipv4/netfilter/nf_flow_table_ipv4.o: In function `nf_flow_ipv4_module_exit':
nf_flow_table_ipv4.c:(.exit.text+0x8): undefined reference to `nft_unregister_flowtable_type'
net/ipv4/netfilter/nf_flow_table_ipv4.o: In function `nf_flow_ipv4_module_init':
nf_flow_table_ipv4.c:(.init.text+0x8): undefined reference to `nft_register_flowtable_type'
This adds additional Kconfig dependencies to ensure that NF_CONNTRACK and NF_TABLES
are always visible from NF_FLOW_TABLE, and that the internal dependencies between
the four new modules are met.
Fixes: 7c23b629a808 ("netfilter: flow table support for the mixed IPv4/IPv6 family")
Fixes: 0995210753a2 ("netfilter: flow table support for IPv6")
Fixes: 97add9f0d66d ("netfilter: flow table support for IPv4")
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -78,8 +78,9 @@ config NF_TABLES_ARP
endif # NF_TABLES
config NF_FLOW_TABLE_IPV4
- select NF_FLOW_TABLE
tristate "Netfilter flow table IPv4 module"
+ depends on NF_CONNTRACK && NF_TABLES
+ select NF_FLOW_TABLE
help
This option adds the flow table IPv4 support.
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -72,8 +72,9 @@ endif # NF_TABLES_IPV6
endif # NF_TABLES
config NF_FLOW_TABLE_IPV6
- select NF_FLOW_TABLE
tristate "Netfilter flow table IPv6 module"
+ depends on NF_CONNTRACK && NF_TABLES
+ select NF_FLOW_TABLE
help
This option adds the flow table IPv6 support.
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -669,8 +669,9 @@ endif # NF_TABLES_NETDEV
endif # NF_TABLES
config NF_FLOW_TABLE_INET
- select NF_FLOW_TABLE
tristate "Netfilter flow table mixed IPv4/IPv6 module"
+ depends on NF_FLOW_TABLE_IPV4 && NF_FLOW_TABLE_IPV6
+ select NF_FLOW_TABLE
help
This option adds the flow table mixed IPv4/IPv6 support.
@@ -678,6 +679,7 @@ config NF_FLOW_TABLE_INET
config NF_FLOW_TABLE
tristate "Netfilter flow table module"
+ depends on NF_CONNTRACK && NF_TABLES
help
This option adds the flow table core infrastructure.

View file

@ -0,0 +1,59 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 19 Dec 2017 14:07:52 +0100
Subject: [PATCH] netfilter: nf_tables: remove flag field from struct
nft_af_info
Replace it by a direct check for the netdev protocol family.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -954,10 +954,6 @@ struct nft_table {
char *name;
};
-enum nft_af_flags {
- NFT_AF_NEEDS_DEV = (1 << 0),
-};
-
/**
* struct nft_af_info - nf_tables address family info
*
@@ -965,14 +961,12 @@ enum nft_af_flags {
* @family: address family
* @owner: module owner
* @tables: used internally
- * @flags: family flags
*/
struct nft_af_info {
struct list_head list;
int family;
struct module *owner;
struct list_head tables;
- u32 flags;
};
int nft_register_afinfo(struct net *, struct nft_af_info *);
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1345,7 +1345,7 @@ static int nft_chain_parse_hook(struct n
hook->type = type;
hook->dev = NULL;
- if (afi->flags & NFT_AF_NEEDS_DEV) {
+ if (afi->family == NFPROTO_NETDEV) {
char ifname[IFNAMSIZ];
if (!ha[NFTA_HOOK_DEV]) {
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -41,7 +41,6 @@ nft_do_chain_netdev(void *priv, struct s
static struct nft_af_info nft_af_netdev __read_mostly = {
.family = NFPROTO_NETDEV,
.owner = THIS_MODULE,
- .flags = NFT_AF_NEEDS_DEV,
};
static int nf_tables_netdev_init_net(struct net *net)

View file

@ -0,0 +1,80 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 19 Dec 2017 12:17:52 +0100
Subject: [PATCH] netfilter: nf_tables: no need for struct nft_af_info to
enable/disable table
nf_tables_table_enable() and nf_tables_table_disable() take a pointer to
struct nft_af_info that is never used, remove it.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -611,10 +611,7 @@ err:
return err;
}
-static void _nf_tables_table_disable(struct net *net,
- const struct nft_af_info *afi,
- struct nft_table *table,
- u32 cnt)
+static void nft_table_disable(struct net *net, struct nft_table *table, u32 cnt)
{
struct nft_chain *chain;
u32 i = 0;
@@ -632,9 +629,7 @@ static void _nf_tables_table_disable(str
}
}
-static int nf_tables_table_enable(struct net *net,
- const struct nft_af_info *afi,
- struct nft_table *table)
+static int nf_tables_table_enable(struct net *net, struct nft_table *table)
{
struct nft_chain *chain;
int err, i = 0;
@@ -654,15 +649,13 @@ static int nf_tables_table_enable(struct
return 0;
err:
if (i)
- _nf_tables_table_disable(net, afi, table, i);
+ nft_table_disable(net, table, i);
return err;
}
-static void nf_tables_table_disable(struct net *net,
- const struct nft_af_info *afi,
- struct nft_table *table)
+static void nf_tables_table_disable(struct net *net, struct nft_table *table)
{
- _nf_tables_table_disable(net, afi, table, 0);
+ nft_table_disable(net, table, 0);
}
static int nf_tables_updtable(struct nft_ctx *ctx)
@@ -691,7 +684,7 @@ static int nf_tables_updtable(struct nft
nft_trans_table_enable(trans) = false;
} else if (!(flags & NFT_TABLE_F_DORMANT) &&
ctx->table->flags & NFT_TABLE_F_DORMANT) {
- ret = nf_tables_table_enable(ctx->net, ctx->afi, ctx->table);
+ ret = nf_tables_table_enable(ctx->net, ctx->table);
if (ret >= 0) {
ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
nft_trans_table_enable(trans) = true;
@@ -5719,7 +5712,6 @@ static int nf_tables_commit(struct net *
if (nft_trans_table_update(trans)) {
if (!nft_trans_table_enable(trans)) {
nf_tables_table_disable(net,
- trans->ctx.afi,
trans->ctx.table);
trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
}
@@ -5881,7 +5873,6 @@ static int nf_tables_abort(struct net *n
if (nft_trans_table_update(trans)) {
if (nft_trans_table_enable(trans)) {
nf_tables_table_disable(net,
- trans->ctx.afi,
trans->ctx.table);
trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
}

View file

@ -0,0 +1,60 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 19 Dec 2017 13:40:22 +0100
Subject: [PATCH] netfilter: nf_tables: remove struct nft_af_info parameter in
nf_tables_chain_type_lookup()
Pass family number instead, this comes in preparation for the removal of
struct nft_af_info.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -423,7 +423,7 @@ static inline u64 nf_tables_alloc_handle
static const struct nf_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX];
static const struct nf_chain_type *
-__nf_tables_chain_type_lookup(int family, const struct nlattr *nla)
+__nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family)
{
int i;
@@ -436,22 +436,20 @@ __nf_tables_chain_type_lookup(int family
}
static const struct nf_chain_type *
-nf_tables_chain_type_lookup(const struct nft_af_info *afi,
- const struct nlattr *nla,
- bool autoload)
+nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family, bool autoload)
{
const struct nf_chain_type *type;
- type = __nf_tables_chain_type_lookup(afi->family, nla);
+ type = __nf_tables_chain_type_lookup(nla, family);
if (type != NULL)
return type;
#ifdef CONFIG_MODULES
if (autoload) {
nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- request_module("nft-chain-%u-%.*s", afi->family,
+ request_module("nft-chain-%u-%.*s", family,
nla_len(nla), (const char *)nla_data(nla));
nfnl_lock(NFNL_SUBSYS_NFTABLES);
- type = __nf_tables_chain_type_lookup(afi->family, nla);
+ type = __nf_tables_chain_type_lookup(nla, family);
if (type != NULL)
return ERR_PTR(-EAGAIN);
}
@@ -1325,8 +1323,8 @@ static int nft_chain_parse_hook(struct n
type = chain_type[afi->family][NFT_CHAIN_T_DEFAULT];
if (nla[NFTA_CHAIN_TYPE]) {
- type = nf_tables_chain_type_lookup(afi, nla[NFTA_CHAIN_TYPE],
- create);
+ type = nf_tables_chain_type_lookup(nla[NFTA_CHAIN_TYPE],
+ afi->family, create);
if (IS_ERR(type))
return PTR_ERR(type);
}

View file

@ -0,0 +1,24 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 19 Dec 2017 12:01:21 +0100
Subject: [PATCH] netfilter: nf_tables: fix chain filter in
nf_tables_dump_rules()
ctx->chain may be null now that we have very large object names,
so we cannot check for ctx->chain[0] here.
Fixes: b7263e071aba7 ("netfilter: nf_tables: Allow table names of up to 255 chars")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Phil Sutter <phil@nwl.cc>
---
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -2090,7 +2090,7 @@ static int nf_tables_dump_rules(struct s
continue;
list_for_each_entry_rcu(chain, &table->chains, list) {
- if (ctx && ctx->chain[0] &&
+ if (ctx && ctx->chain &&
strcmp(ctx->chain, chain->name) != 0)
continue;

View file

@ -0,0 +1,30 @@
From: Hangbin Liu <liuhangbin@gmail.com>
Date: Mon, 25 Dec 2017 11:34:54 +0800
Subject: [PATCH] netfilter: nf_tables: fix potential NULL-ptr deref in
nf_tables_dump_obj_done()
If there is no NFTA_OBJ_TABLE and NFTA_OBJ_TYPE, the c.data will be NULL in
nf_tables_getobj(). So before free filter->table in nf_tables_dump_obj_done(),
we need to check if filter is NULL first.
Fixes: e46abbcc05aa ("netfilter: nf_tables: Allow table names of up to 255 chars")
Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
Acked-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4614,8 +4614,10 @@ static int nf_tables_dump_obj_done(struc
{
struct nft_obj_filter *filter = cb->data;
- kfree(filter->table);
- kfree(filter);
+ if (filter) {
+ kfree(filter->table);
+ kfree(filter);
+ }
return 0;
}

View file

@ -0,0 +1,100 @@
From: Vasily Averin <vvs@virtuozzo.com>
Date: Sun, 12 Nov 2017 14:32:37 +0300
Subject: [PATCH] netfilter: exit_net cleanup check added
Be sure that lists initialized in net_init hook was return to initial
state.
Signed-off-by: Vasily Averin <vvs@virtuozzo.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -819,6 +819,7 @@ static void clusterip_net_exit(struct ne
cn->procdir = NULL;
#endif
nf_unregister_net_hook(net, &cip_arp_ops);
+ WARN_ON_ONCE(!list_empty(&cn->configs));
}
static struct pernet_operations clusterip_net_ops = {
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -6468,6 +6468,12 @@ static int __net_init nf_tables_init_net
return 0;
}
+static void __net_exit nf_tables_exit_net(struct net *net)
+{
+ WARN_ON_ONCE(!list_empty(&net->nft.af_info));
+ WARN_ON_ONCE(!list_empty(&net->nft.commit_list));
+}
+
int __nft_release_basechain(struct nft_ctx *ctx)
{
struct nft_rule *rule, *nr;
@@ -6545,6 +6551,7 @@ static void __nft_release_afinfo(struct
static struct pernet_operations nf_tables_net_ops = {
.init = nf_tables_init_net,
+ .exit = nf_tables_exit_net,
};
static int __init nf_tables_module_init(void)
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -1093,10 +1093,15 @@ static int __net_init nfnl_log_net_init(
static void __net_exit nfnl_log_net_exit(struct net *net)
{
+ struct nfnl_log_net *log = nfnl_log_pernet(net);
+ unsigned int i;
+
#ifdef CONFIG_PROC_FS
remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter);
#endif
nf_log_unset(net, &nfulnl_logger);
+ for (i = 0; i < INSTANCE_BUCKETS; i++)
+ WARN_ON_ONCE(!hlist_empty(&log->instance_table[i]));
}
static struct pernet_operations nfnl_log_net_ops = {
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -1512,10 +1512,15 @@ static int __net_init nfnl_queue_net_ini
static void __net_exit nfnl_queue_net_exit(struct net *net)
{
+ struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+ unsigned int i;
+
nf_unregister_queue_handler(net);
#ifdef CONFIG_PROC_FS
remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter);
#endif
+ for (i = 0; i < INSTANCE_BUCKETS; i++)
+ WARN_ON_ONCE(!hlist_empty(&q->instance_table[i]));
}
static void nfnl_queue_net_exit_batch(struct list_head *net_exit_list)
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1714,8 +1714,17 @@ static int __net_init xt_net_init(struct
return 0;
}
+static void __net_exit xt_net_exit(struct net *net)
+{
+ int i;
+
+ for (i = 0; i < NFPROTO_NUMPROTO; i++)
+ WARN_ON_ONCE(!list_empty(&net->xt.tables[i]));
+}
+
static struct pernet_operations xt_net_ops = {
.init = xt_net_init,
+ .exit = xt_net_exit,
};
static int __init xt_init(void)

View file

@ -0,0 +1,598 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 9 Jan 2018 02:42:11 +0100
Subject: [PATCH] netfilter: nf_tables: get rid of pernet families
Now that we have a single table list for each netns, we can get rid of
one pointer per family and the global afinfo list, thus, shrinking
struct netns for nftables that now becomes 64 bytes smaller.
And call __nft_release_afinfo() from __net_exit path accordingly to
release netnamespace objects on removal.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -969,8 +969,8 @@ struct nft_af_info {
struct module *owner;
};
-int nft_register_afinfo(struct net *, struct nft_af_info *);
-void nft_unregister_afinfo(struct net *, struct nft_af_info *);
+int nft_register_afinfo(struct nft_af_info *);
+void nft_unregister_afinfo(struct nft_af_info *);
int nft_register_chain_type(const struct nf_chain_type *);
void nft_unregister_chain_type(const struct nf_chain_type *);
--- a/include/net/netns/nftables.h
+++ b/include/net/netns/nftables.h
@@ -7,15 +7,8 @@
struct nft_af_info;
struct netns_nftables {
- struct list_head af_info;
struct list_head tables;
struct list_head commit_list;
- struct nft_af_info *ipv4;
- struct nft_af_info *ipv6;
- struct nft_af_info *inet;
- struct nft_af_info *arp;
- struct nft_af_info *bridge;
- struct nft_af_info *netdev;
unsigned int base_seq;
u8 gencursor;
};
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -47,34 +47,6 @@ static struct nft_af_info nft_af_bridge
.owner = THIS_MODULE,
};
-static int nf_tables_bridge_init_net(struct net *net)
-{
- net->nft.bridge = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
- if (net->nft.bridge == NULL)
- return -ENOMEM;
-
- memcpy(net->nft.bridge, &nft_af_bridge, sizeof(nft_af_bridge));
-
- if (nft_register_afinfo(net, net->nft.bridge) < 0)
- goto err;
-
- return 0;
-err:
- kfree(net->nft.bridge);
- return -ENOMEM;
-}
-
-static void nf_tables_bridge_exit_net(struct net *net)
-{
- nft_unregister_afinfo(net, net->nft.bridge);
- kfree(net->nft.bridge);
-}
-
-static struct pernet_operations nf_tables_bridge_net_ops = {
- .init = nf_tables_bridge_init_net,
- .exit = nf_tables_bridge_exit_net,
-};
-
static const struct nf_chain_type filter_bridge = {
.name = "filter",
.type = NFT_CHAIN_T_DEFAULT,
@@ -98,17 +70,17 @@ static int __init nf_tables_bridge_init(
{
int ret;
- ret = nft_register_chain_type(&filter_bridge);
+ ret = nft_register_afinfo(&nft_af_bridge);
if (ret < 0)
return ret;
- ret = register_pernet_subsys(&nf_tables_bridge_net_ops);
+ ret = nft_register_chain_type(&filter_bridge);
if (ret < 0)
- goto err_register_subsys;
+ goto err_register_chain;
return ret;
-err_register_subsys:
+err_register_chain:
nft_unregister_chain_type(&filter_bridge);
return ret;
@@ -116,8 +88,8 @@ err_register_subsys:
static void __exit nf_tables_bridge_exit(void)
{
- unregister_pernet_subsys(&nf_tables_bridge_net_ops);
nft_unregister_chain_type(&filter_bridge);
+ nft_unregister_afinfo(&nft_af_bridge);
}
module_init(nf_tables_bridge_init);
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -32,34 +32,6 @@ static struct nft_af_info nft_af_arp __r
.owner = THIS_MODULE,
};
-static int nf_tables_arp_init_net(struct net *net)
-{
- net->nft.arp = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
- if (net->nft.arp== NULL)
- return -ENOMEM;
-
- memcpy(net->nft.arp, &nft_af_arp, sizeof(nft_af_arp));
-
- if (nft_register_afinfo(net, net->nft.arp) < 0)
- goto err;
-
- return 0;
-err:
- kfree(net->nft.arp);
- return -ENOMEM;
-}
-
-static void nf_tables_arp_exit_net(struct net *net)
-{
- nft_unregister_afinfo(net, net->nft.arp);
- kfree(net->nft.arp);
-}
-
-static struct pernet_operations nf_tables_arp_net_ops = {
- .init = nf_tables_arp_init_net,
- .exit = nf_tables_arp_exit_net,
-};
-
static const struct nf_chain_type filter_arp = {
.name = "filter",
.type = NFT_CHAIN_T_DEFAULT,
@@ -77,21 +49,26 @@ static int __init nf_tables_arp_init(voi
{
int ret;
- ret = nft_register_chain_type(&filter_arp);
+ ret = nft_register_afinfo(&nft_af_arp);
if (ret < 0)
return ret;
- ret = register_pernet_subsys(&nf_tables_arp_net_ops);
+ ret = nft_register_chain_type(&filter_arp);
if (ret < 0)
- nft_unregister_chain_type(&filter_arp);
+ goto err_register_chain;
+
+ return 0;
+
+err_register_chain:
+ nft_unregister_chain_type(&filter_arp);
return ret;
}
static void __exit nf_tables_arp_exit(void)
{
- unregister_pernet_subsys(&nf_tables_arp_net_ops);
nft_unregister_chain_type(&filter_arp);
+ nft_unregister_afinfo(&nft_af_arp);
}
module_init(nf_tables_arp_init);
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -35,34 +35,6 @@ static struct nft_af_info nft_af_ipv4 __
.owner = THIS_MODULE,
};
-static int nf_tables_ipv4_init_net(struct net *net)
-{
- net->nft.ipv4 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
- if (net->nft.ipv4 == NULL)
- return -ENOMEM;
-
- memcpy(net->nft.ipv4, &nft_af_ipv4, sizeof(nft_af_ipv4));
-
- if (nft_register_afinfo(net, net->nft.ipv4) < 0)
- goto err;
-
- return 0;
-err:
- kfree(net->nft.ipv4);
- return -ENOMEM;
-}
-
-static void nf_tables_ipv4_exit_net(struct net *net)
-{
- nft_unregister_afinfo(net, net->nft.ipv4);
- kfree(net->nft.ipv4);
-}
-
-static struct pernet_operations nf_tables_ipv4_net_ops = {
- .init = nf_tables_ipv4_init_net,
- .exit = nf_tables_ipv4_exit_net,
-};
-
static const struct nf_chain_type filter_ipv4 = {
.name = "filter",
.type = NFT_CHAIN_T_DEFAULT,
@@ -86,21 +58,25 @@ static int __init nf_tables_ipv4_init(vo
{
int ret;
- ret = nft_register_chain_type(&filter_ipv4);
+ ret = nft_register_afinfo(&nft_af_ipv4);
if (ret < 0)
return ret;
- ret = register_pernet_subsys(&nf_tables_ipv4_net_ops);
+ ret = nft_register_chain_type(&filter_ipv4);
if (ret < 0)
- nft_unregister_chain_type(&filter_ipv4);
+ goto err_register_chain;
+
+ return 0;
+err_register_chain:
+ nft_unregister_afinfo(&nft_af_ipv4);
return ret;
}
static void __exit nf_tables_ipv4_exit(void)
{
- unregister_pernet_subsys(&nf_tables_ipv4_net_ops);
nft_unregister_chain_type(&filter_ipv4);
+ nft_unregister_afinfo(&nft_af_ipv4);
}
module_init(nf_tables_ipv4_init);
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -33,34 +33,6 @@ static struct nft_af_info nft_af_ipv6 __
.owner = THIS_MODULE,
};
-static int nf_tables_ipv6_init_net(struct net *net)
-{
- net->nft.ipv6 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
- if (net->nft.ipv6 == NULL)
- return -ENOMEM;
-
- memcpy(net->nft.ipv6, &nft_af_ipv6, sizeof(nft_af_ipv6));
-
- if (nft_register_afinfo(net, net->nft.ipv6) < 0)
- goto err;
-
- return 0;
-err:
- kfree(net->nft.ipv6);
- return -ENOMEM;
-}
-
-static void nf_tables_ipv6_exit_net(struct net *net)
-{
- nft_unregister_afinfo(net, net->nft.ipv6);
- kfree(net->nft.ipv6);
-}
-
-static struct pernet_operations nf_tables_ipv6_net_ops = {
- .init = nf_tables_ipv6_init_net,
- .exit = nf_tables_ipv6_exit_net,
-};
-
static const struct nf_chain_type filter_ipv6 = {
.name = "filter",
.type = NFT_CHAIN_T_DEFAULT,
@@ -84,20 +56,24 @@ static int __init nf_tables_ipv6_init(vo
{
int ret;
- ret = nft_register_chain_type(&filter_ipv6);
+ ret = nft_register_afinfo(&nft_af_ipv6);
if (ret < 0)
return ret;
- ret = register_pernet_subsys(&nf_tables_ipv6_net_ops);
+ ret = nft_register_chain_type(&filter_ipv6);
if (ret < 0)
- nft_unregister_chain_type(&filter_ipv6);
+ goto err_register_chain;
+
+ return 0;
+err_register_chain:
+ nft_unregister_afinfo(&nft_af_ipv6);
return ret;
}
static void __exit nf_tables_ipv6_exit(void)
{
- unregister_pernet_subsys(&nf_tables_ipv6_net_ops);
+ nft_unregister_afinfo(&nft_af_ipv6);
nft_unregister_chain_type(&filter_ipv6);
}
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -26,6 +26,7 @@
static LIST_HEAD(nf_tables_expressions);
static LIST_HEAD(nf_tables_objects);
static LIST_HEAD(nf_tables_flowtables);
+static LIST_HEAD(nf_tables_af_info);
/**
* nft_register_afinfo - register nf_tables address family info
@@ -35,17 +36,15 @@ static LIST_HEAD(nf_tables_flowtables);
* Register the address family for use with nf_tables. Returns zero on
* success or a negative errno code otherwise.
*/
-int nft_register_afinfo(struct net *net, struct nft_af_info *afi)
+int nft_register_afinfo(struct nft_af_info *afi)
{
nfnl_lock(NFNL_SUBSYS_NFTABLES);
- list_add_tail_rcu(&afi->list, &net->nft.af_info);
+ list_add_tail_rcu(&afi->list, &nf_tables_af_info);
nfnl_unlock(NFNL_SUBSYS_NFTABLES);
return 0;
}
EXPORT_SYMBOL_GPL(nft_register_afinfo);
-static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi);
-
/**
* nft_unregister_afinfo - unregister nf_tables address family info
*
@@ -53,10 +52,9 @@ static void __nft_release_afinfo(struct
*
* Unregister the address family for use with nf_tables.
*/
-void nft_unregister_afinfo(struct net *net, struct nft_af_info *afi)
+void nft_unregister_afinfo(struct nft_af_info *afi)
{
nfnl_lock(NFNL_SUBSYS_NFTABLES);
- __nft_release_afinfo(net, afi);
list_del_rcu(&afi->list);
nfnl_unlock(NFNL_SUBSYS_NFTABLES);
}
@@ -66,7 +64,7 @@ static struct nft_af_info *nft_afinfo_lo
{
struct nft_af_info *afi;
- list_for_each_entry(afi, &net->nft.af_info, list) {
+ list_for_each_entry(afi, &nf_tables_af_info, list) {
if (afi->family == family)
return afi;
}
@@ -4968,15 +4966,12 @@ void nft_flow_table_iterate(struct net *
void *data)
{
struct nft_flowtable *flowtable;
- const struct nft_af_info *afi;
const struct nft_table *table;
rcu_read_lock();
- list_for_each_entry_rcu(afi, &net->nft.af_info, list) {
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
- list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
- iter(&flowtable->data, data);
- }
+ list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
+ iter(&flowtable->data, data);
}
}
rcu_read_unlock();
@@ -6459,21 +6454,6 @@ int nft_data_dump(struct sk_buff *skb, i
}
EXPORT_SYMBOL_GPL(nft_data_dump);
-static int __net_init nf_tables_init_net(struct net *net)
-{
- INIT_LIST_HEAD(&net->nft.af_info);
- INIT_LIST_HEAD(&net->nft.tables);
- INIT_LIST_HEAD(&net->nft.commit_list);
- net->nft.base_seq = 1;
- return 0;
-}
-
-static void __net_exit nf_tables_exit_net(struct net *net)
-{
- WARN_ON_ONCE(!list_empty(&net->nft.af_info));
- WARN_ON_ONCE(!list_empty(&net->nft.commit_list));
-}
-
int __nft_release_basechain(struct nft_ctx *ctx)
{
struct nft_rule *rule, *nr;
@@ -6494,8 +6474,7 @@ int __nft_release_basechain(struct nft_c
}
EXPORT_SYMBOL_GPL(__nft_release_basechain);
-/* Called by nft_unregister_afinfo() from __net_exit path, nfnl_lock is held. */
-static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
+static void __nft_release_afinfo(struct net *net)
{
struct nft_flowtable *flowtable, *nf;
struct nft_table *table, *nt;
@@ -6505,10 +6484,11 @@ static void __nft_release_afinfo(struct
struct nft_set *set, *ns;
struct nft_ctx ctx = {
.net = net,
- .family = afi->family,
};
list_for_each_entry_safe(table, nt, &net->nft.tables, list) {
+ ctx.family = table->afi->family;
+
list_for_each_entry(chain, &table->chains, list)
nf_tables_unregister_hook(net, table, chain);
list_for_each_entry(flowtable, &table->flowtables, list)
@@ -6549,6 +6529,21 @@ static void __nft_release_afinfo(struct
}
}
+static int __net_init nf_tables_init_net(struct net *net)
+{
+ INIT_LIST_HEAD(&net->nft.tables);
+ INIT_LIST_HEAD(&net->nft.commit_list);
+ net->nft.base_seq = 1;
+ return 0;
+}
+
+static void __net_exit nf_tables_exit_net(struct net *net)
+{
+ __nft_release_afinfo(net);
+ WARN_ON_ONCE(!list_empty(&net->nft.tables));
+ WARN_ON_ONCE(!list_empty(&net->nft.commit_list));
+}
+
static struct pernet_operations nf_tables_net_ops = {
.init = nf_tables_init_net,
.exit = nf_tables_exit_net,
--- a/net/netfilter/nf_tables_inet.c
+++ b/net/netfilter/nf_tables_inet.c
@@ -43,34 +43,6 @@ static struct nft_af_info nft_af_inet __
.owner = THIS_MODULE,
};
-static int __net_init nf_tables_inet_init_net(struct net *net)
-{
- net->nft.inet = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
- if (net->nft.inet == NULL)
- return -ENOMEM;
- memcpy(net->nft.inet, &nft_af_inet, sizeof(nft_af_inet));
-
- if (nft_register_afinfo(net, net->nft.inet) < 0)
- goto err;
-
- return 0;
-
-err:
- kfree(net->nft.inet);
- return -ENOMEM;
-}
-
-static void __net_exit nf_tables_inet_exit_net(struct net *net)
-{
- nft_unregister_afinfo(net, net->nft.inet);
- kfree(net->nft.inet);
-}
-
-static struct pernet_operations nf_tables_inet_net_ops = {
- .init = nf_tables_inet_init_net,
- .exit = nf_tables_inet_exit_net,
-};
-
static const struct nf_chain_type filter_inet = {
.name = "filter",
.type = NFT_CHAIN_T_DEFAULT,
@@ -94,21 +66,24 @@ static int __init nf_tables_inet_init(vo
{
int ret;
- ret = nft_register_chain_type(&filter_inet);
- if (ret < 0)
+ if (nft_register_afinfo(&nft_af_inet) < 0)
return ret;
- ret = register_pernet_subsys(&nf_tables_inet_net_ops);
+ ret = nft_register_chain_type(&filter_inet);
if (ret < 0)
- nft_unregister_chain_type(&filter_inet);
+ goto err_register_chain;
+
+ return ret;
+err_register_chain:
+ nft_unregister_afinfo(&nft_af_inet);
return ret;
}
static void __exit nf_tables_inet_exit(void)
{
- unregister_pernet_subsys(&nf_tables_inet_net_ops);
nft_unregister_chain_type(&filter_inet);
+ nft_unregister_afinfo(&nft_af_inet);
}
module_init(nf_tables_inet_init);
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -43,34 +43,6 @@ static struct nft_af_info nft_af_netdev
.owner = THIS_MODULE,
};
-static int nf_tables_netdev_init_net(struct net *net)
-{
- net->nft.netdev = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL);
- if (net->nft.netdev == NULL)
- return -ENOMEM;
-
- memcpy(net->nft.netdev, &nft_af_netdev, sizeof(nft_af_netdev));
-
- if (nft_register_afinfo(net, net->nft.netdev) < 0)
- goto err;
-
- return 0;
-err:
- kfree(net->nft.netdev);
- return -ENOMEM;
-}
-
-static void nf_tables_netdev_exit_net(struct net *net)
-{
- nft_unregister_afinfo(net, net->nft.netdev);
- kfree(net->nft.netdev);
-}
-
-static struct pernet_operations nf_tables_netdev_net_ops = {
- .init = nf_tables_netdev_init_net,
- .exit = nf_tables_netdev_exit_net,
-};
-
static const struct nf_chain_type nft_filter_chain_netdev = {
.name = "filter",
.type = NFT_CHAIN_T_DEFAULT,
@@ -145,32 +117,32 @@ static int __init nf_tables_netdev_init(
{
int ret;
- ret = nft_register_chain_type(&nft_filter_chain_netdev);
- if (ret)
+ if (nft_register_afinfo(&nft_af_netdev) < 0)
return ret;
- ret = register_pernet_subsys(&nf_tables_netdev_net_ops);
+ ret = nft_register_chain_type(&nft_filter_chain_netdev);
if (ret)
- goto err1;
+ goto err_register_chain_type;
ret = register_netdevice_notifier(&nf_tables_netdev_notifier);
if (ret)
- goto err2;
+ goto err_register_netdevice_notifier;
return 0;
-err2:
- unregister_pernet_subsys(&nf_tables_netdev_net_ops);
-err1:
+err_register_netdevice_notifier:
nft_unregister_chain_type(&nft_filter_chain_netdev);
+err_register_chain_type:
+ nft_unregister_afinfo(&nft_af_netdev);
+
return ret;
}
static void __exit nf_tables_netdev_exit(void)
{
unregister_netdevice_notifier(&nf_tables_netdev_notifier);
- unregister_pernet_subsys(&nf_tables_netdev_net_ops);
nft_unregister_chain_type(&nft_filter_chain_netdev);
+ nft_unregister_afinfo(&nft_af_netdev);
}
module_init(nf_tables_netdev_init);

View file

@ -0,0 +1,47 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 1 Feb 2018 18:49:00 +0100
Subject: [PATCH] netfilter: nft_flow_offload: wait for garbage collector
to run after cleanup
If netdevice goes down, then flowtable entries are scheduled to be
removed. Wait for garbage collector to have a chance to run so it can
delete them from the hashtable.
The flush call might sleep, so hold the nfnl mutex from
nft_flow_table_iterate() instead of rcu read side lock. The use of the
nfnl mutex is also implicitly fixing races between updates via nfnetlink
and netdevice event.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4818,13 +4818,13 @@ void nft_flow_table_iterate(struct net *
struct nft_flowtable *flowtable;
const struct nft_table *table;
- rcu_read_lock();
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
- list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
+ nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ list_for_each_entry(table, &net->nft.tables, list) {
+ list_for_each_entry(flowtable, &table->flowtables, list) {
iter(&flowtable->data, data);
}
}
- rcu_read_unlock();
+ nfnl_unlock(NFNL_SUBSYS_NFTABLES);
}
EXPORT_SYMBOL_GPL(nft_flow_table_iterate);
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -208,6 +208,7 @@ static void nft_flow_offload_iterate_cle
void *data)
{
nf_flow_table_iterate(flowtable, flow_offload_iterate_cleanup, data);
+ flush_delayed_work(&flowtable->gc_work);
}
static int flow_offload_netdev_event(struct notifier_block *this,

View file

@ -0,0 +1,29 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 1 Feb 2018 18:49:01 +0100
Subject: [PATCH] netfilter: nft_flow_offload: no need to flush entries on
module removal
nft_flow_offload module removal does not require to flush existing
flowtables, it is valid to remove this module while keeping flowtables
around.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -247,14 +247,8 @@ register_expr:
static void __exit nft_flow_offload_module_exit(void)
{
- struct net *net;
-
nft_unregister_expr(&nft_flow_offload_type);
unregister_netdevice_notifier(&flow_offload_netdev_notifier);
- rtnl_lock();
- for_each_net(net)
- nft_flow_table_iterate(net, nft_flow_offload_iterate_cleanup, NULL);
- rtnl_unlock();
}
module_init(nft_flow_offload_module_init);

View file

@ -0,0 +1,97 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 23 Jan 2018 17:46:09 +0100
Subject: [PATCH] netfilter: nft_flow_offload: move flowtable cleanup
routines to nf_flow_table
Move the flowtable cleanup routines to nf_flow_table and expose the
nf_flow_table_cleanup() helper function.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -95,6 +95,9 @@ struct flow_offload_tuple_rhash *flow_of
int nf_flow_table_iterate(struct nf_flowtable *flow_table,
void (*iter)(struct flow_offload *flow, void *data),
void *data);
+
+void nf_flow_table_cleanup(struct net *net, struct net_device *dev);
+
void nf_flow_offload_work_gc(struct work_struct *work);
extern const struct rhashtable_params nf_flow_offload_rhash_params;
--- a/net/netfilter/nf_flow_table.c
+++ b/net/netfilter/nf_flow_table.c
@@ -4,6 +4,7 @@
#include <linux/netfilter.h>
#include <linux/rhashtable.h>
#include <linux/netdevice.h>
+#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -425,5 +426,28 @@ int nf_flow_dnat_port(const struct flow_
}
EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
+static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
+{
+ struct net_device *dev = data;
+
+ if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex)
+ return;
+
+ flow_offload_dead(flow);
+}
+
+static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
+ void *data)
+{
+ nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, data);
+ flush_delayed_work(&flowtable->gc_work);
+}
+
+void nf_flow_table_cleanup(struct net *net, struct net_device *dev)
+{
+ nft_flow_table_iterate(net, nf_flow_table_iterate_cleanup, dev);
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -194,23 +194,6 @@ static struct nft_expr_type nft_flow_off
.owner = THIS_MODULE,
};
-static void flow_offload_iterate_cleanup(struct flow_offload *flow, void *data)
-{
- struct net_device *dev = data;
-
- if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex)
- return;
-
- flow_offload_dead(flow);
-}
-
-static void nft_flow_offload_iterate_cleanup(struct nf_flowtable *flowtable,
- void *data)
-{
- nf_flow_table_iterate(flowtable, flow_offload_iterate_cleanup, data);
- flush_delayed_work(&flowtable->gc_work);
-}
-
static int flow_offload_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
@@ -219,7 +202,7 @@ static int flow_offload_netdev_event(str
if (event != NETDEV_DOWN)
return NOTIFY_DONE;
- nft_flow_table_iterate(dev_net(dev), nft_flow_offload_iterate_cleanup, dev);
+ nf_flow_table_cleanup(dev_net(dev), dev);
return NOTIFY_DONE;
}

View file

@ -0,0 +1,140 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 5 Feb 2018 21:44:50 +0100
Subject: [PATCH] netfilter: nf_tables: fix flowtable free
Every flow_offload entry is added into the table twice. Because of this,
rhashtable_free_and_destroy can't be used, since it would call kfree for
each flow_offload object twice.
This patch adds a call to nf_flow_table_iterate_cleanup() to schedule
removal of entries, then there is an explicitly invocation of the
garbage collector to clean up resources.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -14,6 +14,7 @@ struct nf_flowtable_type {
struct list_head list;
int family;
void (*gc)(struct work_struct *work);
+ void (*free)(struct nf_flowtable *ft);
const struct rhashtable_params *params;
nf_hookfn *hook;
struct module *owner;
@@ -98,6 +99,7 @@ int nf_flow_table_iterate(struct nf_flow
void nf_flow_table_cleanup(struct net *net, struct net_device *dev);
+void nf_flow_table_free(struct nf_flowtable *flow_table);
void nf_flow_offload_work_gc(struct work_struct *work);
extern const struct rhashtable_params nf_flow_offload_rhash_params;
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -260,6 +260,7 @@ static struct nf_flowtable_type flowtabl
.family = NFPROTO_IPV4,
.params = &nf_flow_offload_rhash_params,
.gc = nf_flow_offload_work_gc,
+ .free = nf_flow_table_free,
.hook = nf_flow_offload_ip_hook,
.owner = THIS_MODULE,
};
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -254,6 +254,7 @@ static struct nf_flowtable_type flowtabl
.family = NFPROTO_IPV6,
.params = &nf_flow_offload_rhash_params,
.gc = nf_flow_offload_work_gc,
+ .free = nf_flow_table_free,
.hook = nf_flow_offload_ipv6_hook,
.owner = THIS_MODULE,
};
--- a/net/netfilter/nf_flow_table.c
+++ b/net/netfilter/nf_flow_table.c
@@ -232,19 +232,16 @@ static inline bool nf_flow_is_dying(cons
return flow->flags & FLOW_OFFLOAD_DYING;
}
-void nf_flow_offload_work_gc(struct work_struct *work)
+static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
{
struct flow_offload_tuple_rhash *tuplehash;
- struct nf_flowtable *flow_table;
struct rhashtable_iter hti;
struct flow_offload *flow;
int err;
- flow_table = container_of(work, struct nf_flowtable, gc_work.work);
-
err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
if (err)
- goto schedule;
+ return 0;
rhashtable_walk_start(&hti);
@@ -270,7 +267,16 @@ void nf_flow_offload_work_gc(struct work
out:
rhashtable_walk_stop(&hti);
rhashtable_walk_exit(&hti);
-schedule:
+
+ return 1;
+}
+
+void nf_flow_offload_work_gc(struct work_struct *work)
+{
+ struct nf_flowtable *flow_table;
+
+ flow_table = container_of(work, struct nf_flowtable, gc_work.work);
+ nf_flow_offload_gc_step(flow_table);
queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}
EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc);
@@ -449,5 +455,12 @@ void nf_flow_table_cleanup(struct net *n
}
EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
+void nf_flow_table_free(struct nf_flowtable *flow_table)
+{
+ nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
+ WARN_ON(!nf_flow_offload_gc_step(flow_table));
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_free);
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
--- a/net/netfilter/nf_flow_table_inet.c
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -24,6 +24,7 @@ static struct nf_flowtable_type flowtabl
.family = NFPROTO_INET,
.params = &nf_flow_offload_rhash_params,
.gc = nf_flow_offload_work_gc,
+ .free = nf_flow_table_free,
.hook = nf_flow_offload_inet_hook,
.owner = THIS_MODULE,
};
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5201,17 +5201,12 @@ err:
nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
}
-static void nft_flowtable_destroy(void *ptr, void *arg)
-{
- kfree(ptr);
-}
-
static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
{
cancel_delayed_work_sync(&flowtable->data.gc_work);
kfree(flowtable->name);
- rhashtable_free_and_destroy(&flowtable->data.rhashtable,
- nft_flowtable_destroy, NULL);
+ flowtable->data.type->free(&flowtable->data);
+ rhashtable_destroy(&flowtable->data.rhashtable);
module_put(flowtable->data.type->owner);
}

View file

@ -0,0 +1,96 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 25 Jan 2018 12:58:55 +0100
Subject: [PATCH] netfilter: nft_flow_offload: handle netdevice events from
nf_flow_table
Move the code that deals with device events to the core.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/net/netfilter/nf_flow_table.c
+++ b/net/netfilter/nf_flow_table.c
@@ -462,5 +462,35 @@ void nf_flow_table_free(struct nf_flowta
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);
+static int nf_flow_table_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (event != NETDEV_DOWN)
+ return NOTIFY_DONE;
+
+ nf_flow_table_cleanup(dev_net(dev), dev);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block flow_offload_netdev_notifier = {
+ .notifier_call = nf_flow_table_netdev_event,
+};
+
+static int __init nf_flow_table_module_init(void)
+{
+ return register_netdevice_notifier(&flow_offload_netdev_notifier);
+}
+
+static void __exit nf_flow_table_module_exit(void)
+{
+ unregister_netdevice_notifier(&flow_offload_netdev_notifier);
+}
+
+module_init(nf_flow_table_module_init);
+module_exit(nf_flow_table_module_exit);
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -194,44 +194,14 @@ static struct nft_expr_type nft_flow_off
.owner = THIS_MODULE,
};
-static int flow_offload_netdev_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-
- if (event != NETDEV_DOWN)
- return NOTIFY_DONE;
-
- nf_flow_table_cleanup(dev_net(dev), dev);
-
- return NOTIFY_DONE;
-}
-
-static struct notifier_block flow_offload_netdev_notifier = {
- .notifier_call = flow_offload_netdev_event,
-};
-
static int __init nft_flow_offload_module_init(void)
{
- int err;
-
- register_netdevice_notifier(&flow_offload_netdev_notifier);
-
- err = nft_register_expr(&nft_flow_offload_type);
- if (err < 0)
- goto register_expr;
-
- return 0;
-
-register_expr:
- unregister_netdevice_notifier(&flow_offload_netdev_notifier);
- return err;
+ return nft_register_expr(&nft_flow_offload_type);
}
static void __exit nft_flow_offload_module_exit(void)
{
nft_unregister_expr(&nft_flow_offload_type);
- unregister_netdevice_notifier(&flow_offload_netdev_notifier);
}
module_init(nft_flow_offload_module_init);

View file

@ -0,0 +1,468 @@
From: Harsha Sharma <harshasharmaiitr@gmail.com>
Date: Wed, 27 Dec 2017 00:59:00 +0530
Subject: [PATCH] netfilter: nf_tables: allocate handle and delete objects via
handle
This patch allows deletion of objects via unique handle which can be
listed via '-a' option.
Signed-off-by: Harsha Sharma <harshasharmaiitr@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -369,6 +369,7 @@ void nft_unregister_set(struct nft_set_t
* @list: table set list node
* @bindings: list of set bindings
* @name: name of the set
+ * @handle: unique handle of the set
* @ktype: key type (numeric type defined by userspace, not used in the kernel)
* @dtype: data type (verdict or numeric type defined by userspace)
* @objtype: object type (see NFT_OBJECT_* definitions)
@@ -391,6 +392,7 @@ struct nft_set {
struct list_head list;
struct list_head bindings;
char *name;
+ u64 handle;
u32 ktype;
u32 dtype;
u32 objtype;
@@ -936,6 +938,7 @@ unsigned int nft_do_chain(struct nft_pkt
* @objects: stateful objects in the table
* @flowtables: flow tables in the table
* @hgenerator: handle generator state
+ * @handle: table handle
* @use: number of chain references to this table
* @flags: table flag (see enum nft_table_flags)
* @genmask: generation mask
@@ -949,6 +952,7 @@ struct nft_table {
struct list_head objects;
struct list_head flowtables;
u64 hgenerator;
+ u64 handle;
u32 use;
u16 family:6,
flags:8,
@@ -973,9 +977,9 @@ int nft_verdict_dump(struct sk_buff *skb
* @name: name of this stateful object
* @genmask: generation mask
* @use: number of references to this stateful object
- * @data: object data, layout depends on type
+ * @handle: unique object handle
* @ops: object operations
- * @data: pointer to object data
+ * @data: object data, layout depends on type
*/
struct nft_object {
struct list_head list;
@@ -983,6 +987,7 @@ struct nft_object {
struct nft_table *table;
u32 genmask:2,
use:30;
+ u64 handle;
/* runtime data below here */
const struct nft_object_ops *ops ____cacheline_aligned;
unsigned char data[]
@@ -1064,6 +1069,7 @@ void nft_unregister_obj(struct nft_objec
* @ops_len: number of hooks in array
* @genmask: generation mask
* @use: number of references to this flow table
+ * @handle: unique object handle
* @data: rhashtable and garbage collector
* @ops: array of hooks
*/
@@ -1076,6 +1082,7 @@ struct nft_flowtable {
int ops_len;
u32 genmask:2,
use:30;
+ u64 handle;
/* runtime data below here */
struct nf_hook_ops *ops ____cacheline_aligned;
struct nf_flowtable data;
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -174,6 +174,8 @@ enum nft_table_attributes {
NFTA_TABLE_NAME,
NFTA_TABLE_FLAGS,
NFTA_TABLE_USE,
+ NFTA_TABLE_HANDLE,
+ NFTA_TABLE_PAD,
__NFTA_TABLE_MAX
};
#define NFTA_TABLE_MAX (__NFTA_TABLE_MAX - 1)
@@ -317,6 +319,7 @@ enum nft_set_desc_attributes {
* @NFTA_SET_GC_INTERVAL: garbage collection interval (NLA_U32)
* @NFTA_SET_USERDATA: user data (NLA_BINARY)
* @NFTA_SET_OBJ_TYPE: stateful object type (NLA_U32: NFT_OBJECT_*)
+ * @NFTA_SET_HANDLE: set handle (NLA_U64)
*/
enum nft_set_attributes {
NFTA_SET_UNSPEC,
@@ -335,6 +338,7 @@ enum nft_set_attributes {
NFTA_SET_USERDATA,
NFTA_SET_PAD,
NFTA_SET_OBJ_TYPE,
+ NFTA_SET_HANDLE,
__NFTA_SET_MAX
};
#define NFTA_SET_MAX (__NFTA_SET_MAX - 1)
@@ -1314,6 +1318,7 @@ enum nft_ct_helper_attributes {
* @NFTA_OBJ_TYPE: stateful object type (NLA_U32)
* @NFTA_OBJ_DATA: stateful object data (NLA_NESTED)
* @NFTA_OBJ_USE: number of references to this expression (NLA_U32)
+ * @NFTA_OBJ_HANDLE: object handle (NLA_U64)
*/
enum nft_object_attributes {
NFTA_OBJ_UNSPEC,
@@ -1322,6 +1327,8 @@ enum nft_object_attributes {
NFTA_OBJ_TYPE,
NFTA_OBJ_DATA,
NFTA_OBJ_USE,
+ NFTA_OBJ_HANDLE,
+ NFTA_OBJ_PAD,
__NFTA_OBJ_MAX
};
#define NFTA_OBJ_MAX (__NFTA_OBJ_MAX - 1)
@@ -1333,6 +1340,7 @@ enum nft_object_attributes {
* @NFTA_FLOWTABLE_NAME: name of this flow table (NLA_STRING)
* @NFTA_FLOWTABLE_HOOK: netfilter hook configuration(NLA_U32)
* @NFTA_FLOWTABLE_USE: number of references to this flow table (NLA_U32)
+ * @NFTA_FLOWTABLE_HANDLE: object handle (NLA_U64)
*/
enum nft_flowtable_attributes {
NFTA_FLOWTABLE_UNSPEC,
@@ -1340,6 +1348,8 @@ enum nft_flowtable_attributes {
NFTA_FLOWTABLE_NAME,
NFTA_FLOWTABLE_HOOK,
NFTA_FLOWTABLE_USE,
+ NFTA_FLOWTABLE_HANDLE,
+ NFTA_FLOWTABLE_PAD,
__NFTA_FLOWTABLE_MAX
};
#define NFTA_FLOWTABLE_MAX (__NFTA_FLOWTABLE_MAX - 1)
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -26,6 +26,7 @@
static LIST_HEAD(nf_tables_expressions);
static LIST_HEAD(nf_tables_objects);
static LIST_HEAD(nf_tables_flowtables);
+static u64 table_handle;
static void nft_ctx_init(struct nft_ctx *ctx,
struct net *net,
@@ -332,6 +333,20 @@ static struct nft_table *nft_table_looku
return NULL;
}
+static struct nft_table *nft_table_lookup_byhandle(const struct net *net,
+ const struct nlattr *nla,
+ u8 genmask)
+{
+ struct nft_table *table;
+
+ list_for_each_entry(table, &net->nft.tables, list) {
+ if (be64_to_cpu(nla_get_be64(nla)) == table->handle &&
+ nft_active_genmask(table, genmask))
+ return table;
+ }
+ return NULL;
+}
+
static struct nft_table *nf_tables_table_lookup(const struct net *net,
const struct nlattr *nla,
u8 family, u8 genmask)
@@ -348,6 +363,22 @@ static struct nft_table *nf_tables_table
return ERR_PTR(-ENOENT);
}
+static struct nft_table *nf_tables_table_lookup_byhandle(const struct net *net,
+ const struct nlattr *nla,
+ u8 genmask)
+{
+ struct nft_table *table;
+
+ if (nla == NULL)
+ return ERR_PTR(-EINVAL);
+
+ table = nft_table_lookup_byhandle(net, nla, genmask);
+ if (table != NULL)
+ return table;
+
+ return ERR_PTR(-ENOENT);
+}
+
static inline u64 nf_tables_alloc_handle(struct nft_table *table)
{
return ++table->hgenerator;
@@ -394,6 +425,7 @@ static const struct nla_policy nft_table
[NFTA_TABLE_NAME] = { .type = NLA_STRING,
.len = NFT_TABLE_MAXNAMELEN - 1 },
[NFTA_TABLE_FLAGS] = { .type = NLA_U32 },
+ [NFTA_TABLE_HANDLE] = { .type = NLA_U64 },
};
static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
@@ -415,7 +447,9 @@ static int nf_tables_fill_table_info(str
if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) ||
nla_put_be32(skb, NFTA_TABLE_FLAGS, htonl(table->flags)) ||
- nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)))
+ nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)) ||
+ nla_put_be64(skb, NFTA_TABLE_HANDLE, cpu_to_be64(table->handle),
+ NFTA_TABLE_PAD))
goto nla_put_failure;
nlmsg_end(skb, nlh);
@@ -674,6 +708,7 @@ static int nf_tables_newtable(struct net
INIT_LIST_HEAD(&table->flowtables);
table->family = family;
table->flags = flags;
+ table->handle = ++table_handle;
nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
@@ -791,11 +826,18 @@ static int nf_tables_deltable(struct net
struct nft_ctx ctx;
nft_ctx_init(&ctx, net, skb, nlh, 0, NULL, NULL, nla);
- if (family == AF_UNSPEC || nla[NFTA_TABLE_NAME] == NULL)
+ if (family == AF_UNSPEC ||
+ (!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE]))
return nft_flush(&ctx, family);
- table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME], family,
- genmask);
+ if (nla[NFTA_TABLE_HANDLE])
+ table = nf_tables_table_lookup_byhandle(net,
+ nla[NFTA_TABLE_HANDLE],
+ genmask);
+ else
+ table = nf_tables_table_lookup(net, nla[NFTA_TABLE_NAME],
+ family, genmask);
+
if (IS_ERR(table))
return PTR_ERR(table);
@@ -1534,6 +1576,7 @@ static int nf_tables_delchain(struct net
struct nft_rule *rule;
int family = nfmsg->nfgen_family;
struct nft_ctx ctx;
+ u64 handle;
u32 use;
int err;
@@ -1542,7 +1585,12 @@ static int nf_tables_delchain(struct net
if (IS_ERR(table))
return PTR_ERR(table);
- chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
+ if (nla[NFTA_CHAIN_HANDLE]) {
+ handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
+ chain = nf_tables_chain_lookup_byhandle(table, handle, genmask);
+ } else {
+ chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
+ }
if (IS_ERR(chain))
return PTR_ERR(chain);
@@ -2503,6 +2551,7 @@ static const struct nla_policy nft_set_p
[NFTA_SET_USERDATA] = { .type = NLA_BINARY,
.len = NFT_USERDATA_MAXLEN },
[NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 },
+ [NFTA_SET_HANDLE] = { .type = NLA_U64 },
};
static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
@@ -2546,6 +2595,22 @@ static struct nft_set *nf_tables_set_loo
return ERR_PTR(-ENOENT);
}
+static struct nft_set *nf_tables_set_lookup_byhandle(const struct nft_table *table,
+ const struct nlattr *nla, u8 genmask)
+{
+ struct nft_set *set;
+
+ if (nla == NULL)
+ return ERR_PTR(-EINVAL);
+
+ list_for_each_entry(set, &table->sets, list) {
+ if (be64_to_cpu(nla_get_be64(nla)) == set->handle &&
+ nft_active_genmask(set, genmask))
+ return set;
+ }
+ return ERR_PTR(-ENOENT);
+}
+
static struct nft_set *nf_tables_set_lookup_byid(const struct net *net,
const struct nlattr *nla,
u8 genmask)
@@ -2661,6 +2726,9 @@ static int nf_tables_fill_set(struct sk_
goto nla_put_failure;
if (nla_put_string(skb, NFTA_SET_NAME, set->name))
goto nla_put_failure;
+ if (nla_put_be64(skb, NFTA_SET_HANDLE, cpu_to_be64(set->handle),
+ NFTA_SET_PAD))
+ goto nla_put_failure;
if (set->flags != 0)
if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags)))
goto nla_put_failure;
@@ -3069,6 +3137,7 @@ static int nf_tables_newset(struct net *
set->udata = udata;
set->timeout = timeout;
set->gc_int = gc_int;
+ set->handle = nf_tables_alloc_handle(table);
err = ops->init(set, &desc, nla);
if (err < 0)
@@ -3126,7 +3195,10 @@ static int nf_tables_delset(struct net *
if (err < 0)
return err;
- set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
+ if (nla[NFTA_SET_HANDLE])
+ set = nf_tables_set_lookup_byhandle(ctx.table, nla[NFTA_SET_HANDLE], genmask);
+ else
+ set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME], genmask);
if (IS_ERR(set))
return PTR_ERR(set);
@@ -4182,6 +4254,21 @@ struct nft_object *nf_tables_obj_lookup(
}
EXPORT_SYMBOL_GPL(nf_tables_obj_lookup);
+struct nft_object *nf_tables_obj_lookup_byhandle(const struct nft_table *table,
+ const struct nlattr *nla,
+ u32 objtype, u8 genmask)
+{
+ struct nft_object *obj;
+
+ list_for_each_entry(obj, &table->objects, list) {
+ if (be64_to_cpu(nla_get_be64(nla)) == obj->handle &&
+ objtype == obj->ops->type->type &&
+ nft_active_genmask(obj, genmask))
+ return obj;
+ }
+ return ERR_PTR(-ENOENT);
+}
+
static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = {
[NFTA_OBJ_TABLE] = { .type = NLA_STRING,
.len = NFT_TABLE_MAXNAMELEN - 1 },
@@ -4189,6 +4276,7 @@ static const struct nla_policy nft_obj_p
.len = NFT_OBJ_MAXNAMELEN - 1 },
[NFTA_OBJ_TYPE] = { .type = NLA_U32 },
[NFTA_OBJ_DATA] = { .type = NLA_NESTED },
+ [NFTA_OBJ_HANDLE] = { .type = NLA_U64},
};
static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
@@ -4336,6 +4424,8 @@ static int nf_tables_newobj(struct net *
goto err1;
}
obj->table = table;
+ obj->handle = nf_tables_alloc_handle(table);
+
obj->name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL);
if (!obj->name) {
err = -ENOMEM;
@@ -4382,7 +4472,9 @@ static int nf_tables_fill_obj_info(struc
nla_put_string(skb, NFTA_OBJ_NAME, obj->name) ||
nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) ||
nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) ||
- nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset))
+ nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset) ||
+ nla_put_be64(skb, NFTA_OBJ_HANDLE, cpu_to_be64(obj->handle),
+ NFTA_OBJ_PAD))
goto nla_put_failure;
nlmsg_end(skb, nlh);
@@ -4580,7 +4672,7 @@ static int nf_tables_delobj(struct net *
u32 objtype;
if (!nla[NFTA_OBJ_TYPE] ||
- !nla[NFTA_OBJ_NAME])
+ (!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE]))
return -EINVAL;
table = nf_tables_table_lookup(net, nla[NFTA_OBJ_TABLE], family,
@@ -4589,7 +4681,12 @@ static int nf_tables_delobj(struct net *
return PTR_ERR(table);
objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
- obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME], objtype, genmask);
+ if (nla[NFTA_OBJ_HANDLE])
+ obj = nf_tables_obj_lookup_byhandle(table, nla[NFTA_OBJ_HANDLE],
+ objtype, genmask);
+ else
+ obj = nf_tables_obj_lookup(table, nla[NFTA_OBJ_NAME],
+ objtype, genmask);
if (IS_ERR(obj))
return PTR_ERR(obj);
if (obj->use > 0)
@@ -4661,6 +4758,7 @@ static const struct nla_policy nft_flowt
[NFTA_FLOWTABLE_NAME] = { .type = NLA_STRING,
.len = NFT_NAME_MAXLEN - 1 },
[NFTA_FLOWTABLE_HOOK] = { .type = NLA_NESTED },
+ [NFTA_FLOWTABLE_HANDLE] = { .type = NLA_U64 },
};
struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table,
@@ -4678,6 +4776,20 @@ struct nft_flowtable *nf_tables_flowtabl
}
EXPORT_SYMBOL_GPL(nf_tables_flowtable_lookup);
+struct nft_flowtable *
+nf_tables_flowtable_lookup_byhandle(const struct nft_table *table,
+ const struct nlattr *nla, u8 genmask)
+{
+ struct nft_flowtable *flowtable;
+
+ list_for_each_entry(flowtable, &table->flowtables, list) {
+ if (be64_to_cpu(nla_get_be64(nla)) == flowtable->handle &&
+ nft_active_genmask(flowtable, genmask))
+ return flowtable;
+ }
+ return ERR_PTR(-ENOENT);
+}
+
#define NFT_FLOWTABLE_DEVICE_MAX 8
static int nf_tables_parse_devices(const struct nft_ctx *ctx,
@@ -4886,6 +4998,8 @@ static int nf_tables_newflowtable(struct
return -ENOMEM;
flowtable->table = table;
+ flowtable->handle = nf_tables_alloc_handle(table);
+
flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL);
if (!flowtable->name) {
err = -ENOMEM;
@@ -4960,8 +5074,14 @@ static int nf_tables_delflowtable(struct
if (IS_ERR(table))
return PTR_ERR(table);
- flowtable = nf_tables_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME],
- genmask);
+ if (nla[NFTA_FLOWTABLE_HANDLE])
+ flowtable = nf_tables_flowtable_lookup_byhandle(table,
+ nla[NFTA_FLOWTABLE_HANDLE],
+ genmask);
+ else
+ flowtable = nf_tables_flowtable_lookup(table,
+ nla[NFTA_FLOWTABLE_NAME],
+ genmask);
if (IS_ERR(flowtable))
return PTR_ERR(flowtable);
if (flowtable->use > 0)
@@ -4994,7 +5114,9 @@ static int nf_tables_fill_flowtable_info
if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) ||
nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) ||
- nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)))
+ nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) ||
+ nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle),
+ NFTA_FLOWTABLE_PAD))
goto nla_put_failure;
nest = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK);

View file

@ -0,0 +1,95 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 7 Feb 2018 09:23:25 +0100
Subject: [PATCH] netfilter: nf_flow_offload: fix use-after-free and a resource
leak
flow_offload_del frees the flow, so all associated resource must be
freed before.
Since the ct entry in struct flow_offload_entry was allocated by
flow_offload_alloc, it should be freed by flow_offload_free to take care
of the error handling path when flow_offload_add fails.
While at it, make flow_offload_del static, since it should never be
called directly, only from the gc step
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -90,7 +90,6 @@ struct flow_offload *flow_offload_alloc(
void flow_offload_free(struct flow_offload *flow);
int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow);
-void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow);
struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table,
struct flow_offload_tuple *tuple);
int nf_flow_table_iterate(struct nf_flowtable *flow_table,
--- a/net/netfilter/nf_flow_table.c
+++ b/net/netfilter/nf_flow_table.c
@@ -125,7 +125,9 @@ void flow_offload_free(struct flow_offlo
dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
e = container_of(flow, struct flow_offload_entry, flow);
- kfree(e);
+ nf_ct_delete(e->ct, 0, 0);
+ nf_ct_put(e->ct);
+ kfree_rcu(e, rcu_head);
}
EXPORT_SYMBOL_GPL(flow_offload_free);
@@ -149,11 +151,9 @@ int flow_offload_add(struct nf_flowtable
}
EXPORT_SYMBOL_GPL(flow_offload_add);
-void flow_offload_del(struct nf_flowtable *flow_table,
- struct flow_offload *flow)
+static void flow_offload_del(struct nf_flowtable *flow_table,
+ struct flow_offload *flow)
{
- struct flow_offload_entry *e;
-
rhashtable_remove_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
*flow_table->type->params);
@@ -161,10 +161,8 @@ void flow_offload_del(struct nf_flowtabl
&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
*flow_table->type->params);
- e = container_of(flow, struct flow_offload_entry, flow);
- kfree_rcu(e, rcu_head);
+ flow_offload_free(flow);
}
-EXPORT_SYMBOL_GPL(flow_offload_del);
struct flow_offload_tuple_rhash *
flow_offload_lookup(struct nf_flowtable *flow_table,
@@ -175,15 +173,6 @@ flow_offload_lookup(struct nf_flowtable
}
EXPORT_SYMBOL_GPL(flow_offload_lookup);
-static void nf_flow_release_ct(const struct flow_offload *flow)
-{
- struct flow_offload_entry *e;
-
- e = container_of(flow, struct flow_offload_entry, flow);
- nf_ct_delete(e->ct, 0, 0);
- nf_ct_put(e->ct);
-}
-
int nf_flow_table_iterate(struct nf_flowtable *flow_table,
void (*iter)(struct flow_offload *flow, void *data),
void *data)
@@ -259,10 +248,8 @@ static int nf_flow_offload_gc_step(struc
flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
if (nf_flow_has_expired(flow) ||
- nf_flow_is_dying(flow)) {
+ nf_flow_is_dying(flow))
flow_offload_del(flow_table, flow);
- nf_flow_release_ct(flow);
- }
}
out:
rhashtable_walk_stop(&hti);

View file

@ -0,0 +1,73 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 31 Jan 2018 18:13:39 +0100
Subject: [PATCH] netfilter: flowtable infrastructure depends on
NETFILTER_INGRESS
config NF_FLOW_TABLE depends on NETFILTER_INGRESS. If users forget to
enable this toggle, flowtable registration fails with EOPNOTSUPP.
Moreover, turn 'select NF_FLOW_TABLE' in every flowtable family flavour
into dependency instead, otherwise this new dependency on
NETFILTER_INGRESS causes a warning. This also allows us to remove the
explicit dependency between family flowtables <-> NF_TABLES and
NF_CONNTRACK, given they depend on the NF_FLOW_TABLE core that already
expresses the general dependencies for this new infrastructure.
Moreover, NF_FLOW_TABLE_INET depends on NF_FLOW_TABLE_IPV4 and
NF_FLOWTABLE_IPV6, which already depends on NF_FLOW_TABLE. So we can get
rid of direct dependency with NF_FLOW_TABLE.
In general, let's avoid 'select', it just makes things more complicated.
Reported-by: John Crispin <john@phrozen.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -79,8 +79,7 @@ endif # NF_TABLES
config NF_FLOW_TABLE_IPV4
tristate "Netfilter flow table IPv4 module"
- depends on NF_CONNTRACK && NF_TABLES
- select NF_FLOW_TABLE
+ depends on NF_FLOW_TABLE
help
This option adds the flow table IPv4 support.
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -73,8 +73,7 @@ endif # NF_TABLES
config NF_FLOW_TABLE_IPV6
tristate "Netfilter flow table IPv6 module"
- depends on NF_CONNTRACK && NF_TABLES
- select NF_FLOW_TABLE
+ depends on NF_FLOW_TABLE
help
This option adds the flow table IPv6 support.
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -670,8 +670,8 @@ endif # NF_TABLES
config NF_FLOW_TABLE_INET
tristate "Netfilter flow table mixed IPv4/IPv6 module"
- depends on NF_FLOW_TABLE_IPV4 && NF_FLOW_TABLE_IPV6
- select NF_FLOW_TABLE
+ depends on NF_FLOW_TABLE_IPV4
+ depends on NF_FLOW_TABLE_IPV6
help
This option adds the flow table mixed IPv4/IPv6 support.
@@ -679,7 +679,9 @@ config NF_FLOW_TABLE_INET
config NF_FLOW_TABLE
tristate "Netfilter flow table module"
- depends on NF_CONNTRACK && NF_TABLES
+ depends on NETFILTER_INGRESS
+ depends on NF_CONNTRACK
+ depends on NF_TABLES
help
This option adds the flow table core infrastructure.

View file

@ -0,0 +1,29 @@
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Wed, 10 Jan 2018 13:06:46 +0000
Subject: [PATCH] netfilter: remove duplicated include
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -5,7 +5,6 @@
#include <linux/rhashtable.h>
#include <linux/ipv6.h>
#include <linux/netdevice.h>
-#include <linux/ipv6.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/neighbour.h>
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -15,8 +15,6 @@
#include <linux/netfilter_bridge.h>
#include <linux/seq_file.h>
#include <linux/rcupdate.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter_ipv6.h>
#include <net/protocol.h>
#include <net/netfilter/nf_queue.h>
#include <net/dst.h>

View file

@ -0,0 +1,35 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 16 Feb 2018 09:41:18 +0100
Subject: [PATCH] netfilter: nf_flow_table: use IP_CT_DIR_* values for
FLOW_OFFLOAD_DIR_*
Simplifies further code cleanups
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -6,6 +6,7 @@
#include <linux/netdevice.h>
#include <linux/rhashtable.h>
#include <linux/rcupdate.h>
+#include <linux/netfilter/nf_conntrack_tuple_common.h>
#include <net/dst.h>
struct nf_flowtable;
@@ -27,11 +28,10 @@ struct nf_flowtable {
};
enum flow_offload_tuple_dir {
- FLOW_OFFLOAD_DIR_ORIGINAL,
- FLOW_OFFLOAD_DIR_REPLY,
- __FLOW_OFFLOAD_DIR_MAX = FLOW_OFFLOAD_DIR_REPLY,
+ FLOW_OFFLOAD_DIR_ORIGINAL = IP_CT_DIR_ORIGINAL,
+ FLOW_OFFLOAD_DIR_REPLY = IP_CT_DIR_REPLY,
+ FLOW_OFFLOAD_DIR_MAX = IP_CT_DIR_MAX
};
-#define FLOW_OFFLOAD_DIR_MAX (__FLOW_OFFLOAD_DIR_MAX + 1)
struct flow_offload_tuple {
union {

View file

@ -0,0 +1,118 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 16 Feb 2018 09:42:32 +0100
Subject: [PATCH] netfilter: nf_flow_table: clean up flow_offload_alloc
Reduce code duplication and make it much easier to read
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/net/netfilter/nf_flow_table.c
+++ b/net/netfilter/nf_flow_table.c
@@ -16,6 +16,38 @@ struct flow_offload_entry {
struct rcu_head rcu_head;
};
+static void
+flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
+ struct nf_flow_route *route,
+ enum flow_offload_tuple_dir dir)
+{
+ struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
+ struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple;
+
+ ft->dir = dir;
+
+ switch (ctt->src.l3num) {
+ case NFPROTO_IPV4:
+ ft->src_v4 = ctt->src.u3.in;
+ ft->dst_v4 = ctt->dst.u3.in;
+ break;
+ case NFPROTO_IPV6:
+ ft->src_v6 = ctt->src.u3.in6;
+ ft->dst_v6 = ctt->dst.u3.in6;
+ break;
+ }
+
+ ft->l3proto = ctt->src.l3num;
+ ft->l4proto = ctt->dst.protonum;
+ ft->src_port = ctt->src.u.tcp.port;
+ ft->dst_port = ctt->dst.u.tcp.port;
+
+ ft->iifidx = route->tuple[dir].ifindex;
+ ft->oifidx = route->tuple[!dir].ifindex;
+
+ ft->dst_cache = route->tuple[dir].dst;
+}
+
struct flow_offload *
flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
{
@@ -40,65 +72,8 @@ flow_offload_alloc(struct nf_conn *ct, s
entry->ct = ct;
- switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num) {
- case NFPROTO_IPV4:
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4 =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in;
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4 =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4 =
- ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4 =
- ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in;
- break;
- case NFPROTO_IPV6:
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6 =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6;
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6 =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6 =
- ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.in6;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6 =
- ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.in6;
- break;
- }
-
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l3proto =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.l4proto =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l3proto =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.l4proto =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
-
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache =
- route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache =
- route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst;
-
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port;
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port =
- ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port =
- ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u.tcp.port;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port =
- ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u.tcp.port;
-
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dir =
- FLOW_OFFLOAD_DIR_ORIGINAL;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dir =
- FLOW_OFFLOAD_DIR_REPLY;
-
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx =
- route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex;
- flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.oifidx =
- route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.iifidx =
- route->tuple[FLOW_OFFLOAD_DIR_REPLY].ifindex;
- flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.oifidx =
- route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].ifindex;
+ flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL);
+ flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY);
if (ct->status & IPS_SRC_NAT)
flow->flags |= FLOW_OFFLOAD_SNAT;

View file

@ -0,0 +1,80 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 16 Feb 2018 10:54:24 +0100
Subject: [PATCH] ipv6: make ip6_dst_mtu_forward inline
Removes a direct dependency on ipv6.ko
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -252,4 +252,26 @@ static inline bool rt6_duplicate_nexthop
ipv6_addr_equal(&a->rt6i_gateway, &b->rt6i_gateway) &&
!lwtunnel_cmp_encap(a->dst.lwtstate, b->dst.lwtstate);
}
+
+static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
+{
+ unsigned int mtu;
+ struct inet6_dev *idev;
+
+ if (dst_metric_locked(dst, RTAX_MTU)) {
+ mtu = dst_metric_raw(dst, RTAX_MTU);
+ if (mtu)
+ return mtu;
+ }
+
+ mtu = IPV6_MIN_MTU;
+ rcu_read_lock();
+ idev = __in6_dev_get(dst->dev);
+ if (idev)
+ mtu = idev->cnf.mtu6;
+ rcu_read_unlock();
+
+ return mtu;
+}
+
#endif
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -913,8 +913,6 @@ static inline struct sk_buff *ip6_finish
&inet6_sk(sk)->cork);
}
-unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst);
-
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
struct flowi6 *fl6);
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -370,28 +370,6 @@ static inline int ip6_forward_finish(str
return dst_output(net, sk, skb);
}
-unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
-{
- unsigned int mtu;
- struct inet6_dev *idev;
-
- if (dst_metric_locked(dst, RTAX_MTU)) {
- mtu = dst_metric_raw(dst, RTAX_MTU);
- if (mtu)
- return mtu;
- }
-
- mtu = IPV6_MIN_MTU;
- rcu_read_lock();
- idev = __in6_dev_get(dst->dev);
- if (idev)
- mtu = idev->cnf.mtu6;
- rcu_read_unlock();
-
- return mtu;
-}
-EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward);
-
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
if (skb->len <= mtu)

View file

@ -0,0 +1,145 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 16 Feb 2018 10:57:23 +0100
Subject: [PATCH] netfilter: nf_flow_table: cache mtu in struct
flow_offload_tuple
Reduces the number of cache lines touched in the offload forwarding path
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -55,6 +55,8 @@ struct flow_offload_tuple {
int oifidx;
+ u16 mtu;
+
struct dst_entry *dst_cache;
};
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -177,7 +177,7 @@ static int nf_flow_tuple_ip(struct sk_bu
}
/* Based on ip_exceeds_mtu(). */
-static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
{
if (skb->len <= mtu)
return false;
@@ -191,17 +191,6 @@ static bool __nf_flow_exceeds_mtu(const
return true;
}
-static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rtable *rt)
-{
- u32 mtu;
-
- mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
- if (__nf_flow_exceeds_mtu(skb, mtu))
- return true;
-
- return false;
-}
-
unsigned int
nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -232,9 +221,9 @@ nf_flow_offload_ip_hook(void *priv, stru
dir = tuplehash->tuple.dir;
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-
rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
- if (unlikely(nf_flow_exceeds_mtu(skb, rt)))
+
+ if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
return NF_ACCEPT;
if (skb_try_make_writable(skb, sizeof(*iph)))
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -173,7 +173,7 @@ static int nf_flow_tuple_ipv6(struct sk_
}
/* Based on ip_exceeds_mtu(). */
-static bool __nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
{
if (skb->len <= mtu)
return false;
@@ -184,17 +184,6 @@ static bool __nf_flow_exceeds_mtu(const
return true;
}
-static bool nf_flow_exceeds_mtu(struct sk_buff *skb, const struct rt6_info *rt)
-{
- u32 mtu;
-
- mtu = ip6_dst_mtu_forward(&rt->dst);
- if (__nf_flow_exceeds_mtu(skb, mtu))
- return true;
-
- return false;
-}
-
unsigned int
nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -225,9 +214,9 @@ nf_flow_offload_ipv6_hook(void *priv, st
dir = tuplehash->tuple.dir;
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-
rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
- if (unlikely(nf_flow_exceeds_mtu(skb, rt)))
+
+ if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
return NF_ACCEPT;
if (skb_try_make_writable(skb, sizeof(*ip6h)))
--- a/net/netfilter/nf_flow_table.c
+++ b/net/netfilter/nf_flow_table.c
@@ -4,6 +4,8 @@
#include <linux/netfilter.h>
#include <linux/rhashtable.h>
#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/ip6_route.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
@@ -23,6 +25,7 @@ flow_offload_fill_dir(struct flow_offloa
{
struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple;
+ struct dst_entry *dst = route->tuple[dir].dst;
ft->dir = dir;
@@ -30,10 +33,12 @@ flow_offload_fill_dir(struct flow_offloa
case NFPROTO_IPV4:
ft->src_v4 = ctt->src.u3.in;
ft->dst_v4 = ctt->dst.u3.in;
+ ft->mtu = ip_dst_mtu_maybe_forward(dst, true);
break;
case NFPROTO_IPV6:
ft->src_v6 = ctt->src.u3.in6;
ft->dst_v6 = ctt->dst.u3.in6;
+ ft->mtu = ip6_dst_mtu_forward(dst);
break;
}
@@ -44,8 +49,7 @@ flow_offload_fill_dir(struct flow_offloa
ft->iifidx = route->tuple[dir].ifindex;
ft->oifidx = route->tuple[!dir].ifindex;
-
- ft->dst_cache = route->tuple[dir].dst;
+ ft->dst_cache = dst;
}
struct flow_offload *

View file

@ -0,0 +1,952 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Fri, 16 Feb 2018 11:08:47 +0100
Subject: [PATCH] netfilter: nf_flow_table: rename nf_flow_table.c to
nf_flow_table_core.c
Preparation for adding more code to the same module
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
rename net/netfilter/{nf_flow_table.c => nf_flow_table_core.c} (100%)
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -113,6 +113,8 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_
# flow table infrastructure
obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o
+nf_flow_table-objs := nf_flow_table_core.o
+
obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
# generic X tables
--- a/net/netfilter/nf_flow_table.c
+++ /dev/null
@@ -1,462 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <linux/rhashtable.h>
-#include <linux/netdevice.h>
-#include <net/ip.h>
-#include <net/ip6_route.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_flow_table.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_tuple.h>
-
-struct flow_offload_entry {
- struct flow_offload flow;
- struct nf_conn *ct;
- struct rcu_head rcu_head;
-};
-
-static void
-flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
- struct nf_flow_route *route,
- enum flow_offload_tuple_dir dir)
-{
- struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
- struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple;
- struct dst_entry *dst = route->tuple[dir].dst;
-
- ft->dir = dir;
-
- switch (ctt->src.l3num) {
- case NFPROTO_IPV4:
- ft->src_v4 = ctt->src.u3.in;
- ft->dst_v4 = ctt->dst.u3.in;
- ft->mtu = ip_dst_mtu_maybe_forward(dst, true);
- break;
- case NFPROTO_IPV6:
- ft->src_v6 = ctt->src.u3.in6;
- ft->dst_v6 = ctt->dst.u3.in6;
- ft->mtu = ip6_dst_mtu_forward(dst);
- break;
- }
-
- ft->l3proto = ctt->src.l3num;
- ft->l4proto = ctt->dst.protonum;
- ft->src_port = ctt->src.u.tcp.port;
- ft->dst_port = ctt->dst.u.tcp.port;
-
- ft->iifidx = route->tuple[dir].ifindex;
- ft->oifidx = route->tuple[!dir].ifindex;
- ft->dst_cache = dst;
-}
-
-struct flow_offload *
-flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
-{
- struct flow_offload_entry *entry;
- struct flow_offload *flow;
-
- if (unlikely(nf_ct_is_dying(ct) ||
- !atomic_inc_not_zero(&ct->ct_general.use)))
- return NULL;
-
- entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
- if (!entry)
- goto err_ct_refcnt;
-
- flow = &entry->flow;
-
- if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst))
- goto err_dst_cache_original;
-
- if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst))
- goto err_dst_cache_reply;
-
- entry->ct = ct;
-
- flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL);
- flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY);
-
- if (ct->status & IPS_SRC_NAT)
- flow->flags |= FLOW_OFFLOAD_SNAT;
- else if (ct->status & IPS_DST_NAT)
- flow->flags |= FLOW_OFFLOAD_DNAT;
-
- return flow;
-
-err_dst_cache_reply:
- dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
-err_dst_cache_original:
- kfree(entry);
-err_ct_refcnt:
- nf_ct_put(ct);
-
- return NULL;
-}
-EXPORT_SYMBOL_GPL(flow_offload_alloc);
-
-void flow_offload_free(struct flow_offload *flow)
-{
- struct flow_offload_entry *e;
-
- dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
- dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
- e = container_of(flow, struct flow_offload_entry, flow);
- nf_ct_delete(e->ct, 0, 0);
- nf_ct_put(e->ct);
- kfree_rcu(e, rcu_head);
-}
-EXPORT_SYMBOL_GPL(flow_offload_free);
-
-void flow_offload_dead(struct flow_offload *flow)
-{
- flow->flags |= FLOW_OFFLOAD_DYING;
-}
-EXPORT_SYMBOL_GPL(flow_offload_dead);
-
-int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
-{
- flow->timeout = (u32)jiffies;
-
- rhashtable_insert_fast(&flow_table->rhashtable,
- &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
- *flow_table->type->params);
- rhashtable_insert_fast(&flow_table->rhashtable,
- &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
- *flow_table->type->params);
- return 0;
-}
-EXPORT_SYMBOL_GPL(flow_offload_add);
-
-static void flow_offload_del(struct nf_flowtable *flow_table,
- struct flow_offload *flow)
-{
- rhashtable_remove_fast(&flow_table->rhashtable,
- &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
- *flow_table->type->params);
- rhashtable_remove_fast(&flow_table->rhashtable,
- &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
- *flow_table->type->params);
-
- flow_offload_free(flow);
-}
-
-struct flow_offload_tuple_rhash *
-flow_offload_lookup(struct nf_flowtable *flow_table,
- struct flow_offload_tuple *tuple)
-{
- return rhashtable_lookup_fast(&flow_table->rhashtable, tuple,
- *flow_table->type->params);
-}
-EXPORT_SYMBOL_GPL(flow_offload_lookup);
-
-int nf_flow_table_iterate(struct nf_flowtable *flow_table,
- void (*iter)(struct flow_offload *flow, void *data),
- void *data)
-{
- struct flow_offload_tuple_rhash *tuplehash;
- struct rhashtable_iter hti;
- struct flow_offload *flow;
- int err;
-
- err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
- if (err)
- return err;
-
- rhashtable_walk_start(&hti);
-
- while ((tuplehash = rhashtable_walk_next(&hti))) {
- if (IS_ERR(tuplehash)) {
- err = PTR_ERR(tuplehash);
- if (err != -EAGAIN)
- goto out;
-
- continue;
- }
- if (tuplehash->tuple.dir)
- continue;
-
- flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
-
- iter(flow, data);
- }
-out:
- rhashtable_walk_stop(&hti);
- rhashtable_walk_exit(&hti);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(nf_flow_table_iterate);
-
-static inline bool nf_flow_has_expired(const struct flow_offload *flow)
-{
- return (__s32)(flow->timeout - (u32)jiffies) <= 0;
-}
-
-static inline bool nf_flow_is_dying(const struct flow_offload *flow)
-{
- return flow->flags & FLOW_OFFLOAD_DYING;
-}
-
-static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
-{
- struct flow_offload_tuple_rhash *tuplehash;
- struct rhashtable_iter hti;
- struct flow_offload *flow;
- int err;
-
- err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
- if (err)
- return 0;
-
- rhashtable_walk_start(&hti);
-
- while ((tuplehash = rhashtable_walk_next(&hti))) {
- if (IS_ERR(tuplehash)) {
- err = PTR_ERR(tuplehash);
- if (err != -EAGAIN)
- goto out;
-
- continue;
- }
- if (tuplehash->tuple.dir)
- continue;
-
- flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
-
- if (nf_flow_has_expired(flow) ||
- nf_flow_is_dying(flow))
- flow_offload_del(flow_table, flow);
- }
-out:
- rhashtable_walk_stop(&hti);
- rhashtable_walk_exit(&hti);
-
- return 1;
-}
-
-void nf_flow_offload_work_gc(struct work_struct *work)
-{
- struct nf_flowtable *flow_table;
-
- flow_table = container_of(work, struct nf_flowtable, gc_work.work);
- nf_flow_offload_gc_step(flow_table);
- queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
-}
-EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc);
-
-static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
-{
- const struct flow_offload_tuple *tuple = data;
-
- return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
-}
-
-static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
-{
- const struct flow_offload_tuple_rhash *tuplehash = data;
-
- return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
-}
-
-static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
- const void *ptr)
-{
- const struct flow_offload_tuple *tuple = arg->key;
- const struct flow_offload_tuple_rhash *x = ptr;
-
- if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
- return 1;
-
- return 0;
-}
-
-const struct rhashtable_params nf_flow_offload_rhash_params = {
- .head_offset = offsetof(struct flow_offload_tuple_rhash, node),
- .hashfn = flow_offload_hash,
- .obj_hashfn = flow_offload_hash_obj,
- .obj_cmpfn = flow_offload_hash_cmp,
- .automatic_shrinking = true,
-};
-EXPORT_SYMBOL_GPL(nf_flow_offload_rhash_params);
-
-static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
- __be16 port, __be16 new_port)
-{
- struct tcphdr *tcph;
-
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
- tcph = (void *)(skb_network_header(skb) + thoff);
- inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true);
-
- return 0;
-}
-
-static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
- __be16 port, __be16 new_port)
-{
- struct udphdr *udph;
-
- if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
- udph = (void *)(skb_network_header(skb) + thoff);
- if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
- inet_proto_csum_replace2(&udph->check, skb, port,
- new_port, true);
- if (!udph->check)
- udph->check = CSUM_MANGLED_0;
- }
-
- return 0;
-}
-
-static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
- u8 protocol, __be16 port, __be16 new_port)
-{
- switch (protocol) {
- case IPPROTO_TCP:
- if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0)
- return NF_DROP;
- break;
- case IPPROTO_UDP:
- if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0)
- return NF_DROP;
- break;
- }
-
- return 0;
-}
-
-int nf_flow_snat_port(const struct flow_offload *flow,
- struct sk_buff *skb, unsigned int thoff,
- u8 protocol, enum flow_offload_tuple_dir dir)
-{
- struct flow_ports *hdr;
- __be16 port, new_port;
-
- if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
- skb_try_make_writable(skb, thoff + sizeof(*hdr)))
- return -1;
-
- hdr = (void *)(skb_network_header(skb) + thoff);
-
- switch (dir) {
- case FLOW_OFFLOAD_DIR_ORIGINAL:
- port = hdr->source;
- new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
- hdr->source = new_port;
- break;
- case FLOW_OFFLOAD_DIR_REPLY:
- port = hdr->dest;
- new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
- hdr->dest = new_port;
- break;
- default:
- return -1;
- }
-
- return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
-}
-EXPORT_SYMBOL_GPL(nf_flow_snat_port);
-
-int nf_flow_dnat_port(const struct flow_offload *flow,
- struct sk_buff *skb, unsigned int thoff,
- u8 protocol, enum flow_offload_tuple_dir dir)
-{
- struct flow_ports *hdr;
- __be16 port, new_port;
-
- if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
- skb_try_make_writable(skb, thoff + sizeof(*hdr)))
- return -1;
-
- hdr = (void *)(skb_network_header(skb) + thoff);
-
- switch (dir) {
- case FLOW_OFFLOAD_DIR_ORIGINAL:
- port = hdr->dest;
- new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port;
- hdr->dest = new_port;
- break;
- case FLOW_OFFLOAD_DIR_REPLY:
- port = hdr->source;
- new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
- hdr->source = new_port;
- break;
- default:
- return -1;
- }
-
- return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
-}
-EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
-
-static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
-{
- struct net_device *dev = data;
-
- if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex)
- return;
-
- flow_offload_dead(flow);
-}
-
-static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
- void *data)
-{
- nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, data);
- flush_delayed_work(&flowtable->gc_work);
-}
-
-void nf_flow_table_cleanup(struct net *net, struct net_device *dev)
-{
- nft_flow_table_iterate(net, nf_flow_table_iterate_cleanup, dev);
-}
-EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
-
-void nf_flow_table_free(struct nf_flowtable *flow_table)
-{
- nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
- WARN_ON(!nf_flow_offload_gc_step(flow_table));
-}
-EXPORT_SYMBOL_GPL(nf_flow_table_free);
-
-static int nf_flow_table_netdev_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-
- if (event != NETDEV_DOWN)
- return NOTIFY_DONE;
-
- nf_flow_table_cleanup(dev_net(dev), dev);
-
- return NOTIFY_DONE;
-}
-
-static struct notifier_block flow_offload_netdev_notifier = {
- .notifier_call = nf_flow_table_netdev_event,
-};
-
-static int __init nf_flow_table_module_init(void)
-{
- return register_netdevice_notifier(&flow_offload_netdev_notifier);
-}
-
-static void __exit nf_flow_table_module_exit(void)
-{
- unregister_netdevice_notifier(&flow_offload_netdev_notifier);
-}
-
-module_init(nf_flow_table_module_init);
-module_exit(nf_flow_table_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
--- /dev/null
+++ b/net/netfilter/nf_flow_table_core.c
@@ -0,0 +1,462 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/ip6_route.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+
+struct flow_offload_entry {
+ struct flow_offload flow;
+ struct nf_conn *ct;
+ struct rcu_head rcu_head;
+};
+
+static void
+flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
+ struct nf_flow_route *route,
+ enum flow_offload_tuple_dir dir)
+{
+ struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
+ struct nf_conntrack_tuple *ctt = &ct->tuplehash[dir].tuple;
+ struct dst_entry *dst = route->tuple[dir].dst;
+
+ ft->dir = dir;
+
+ switch (ctt->src.l3num) {
+ case NFPROTO_IPV4:
+ ft->src_v4 = ctt->src.u3.in;
+ ft->dst_v4 = ctt->dst.u3.in;
+ ft->mtu = ip_dst_mtu_maybe_forward(dst, true);
+ break;
+ case NFPROTO_IPV6:
+ ft->src_v6 = ctt->src.u3.in6;
+ ft->dst_v6 = ctt->dst.u3.in6;
+ ft->mtu = ip6_dst_mtu_forward(dst);
+ break;
+ }
+
+ ft->l3proto = ctt->src.l3num;
+ ft->l4proto = ctt->dst.protonum;
+ ft->src_port = ctt->src.u.tcp.port;
+ ft->dst_port = ctt->dst.u.tcp.port;
+
+ ft->iifidx = route->tuple[dir].ifindex;
+ ft->oifidx = route->tuple[!dir].ifindex;
+ ft->dst_cache = dst;
+}
+
+struct flow_offload *
+flow_offload_alloc(struct nf_conn *ct, struct nf_flow_route *route)
+{
+ struct flow_offload_entry *entry;
+ struct flow_offload *flow;
+
+ if (unlikely(nf_ct_is_dying(ct) ||
+ !atomic_inc_not_zero(&ct->ct_general.use)))
+ return NULL;
+
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+ if (!entry)
+ goto err_ct_refcnt;
+
+ flow = &entry->flow;
+
+ if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst))
+ goto err_dst_cache_original;
+
+ if (!dst_hold_safe(route->tuple[FLOW_OFFLOAD_DIR_REPLY].dst))
+ goto err_dst_cache_reply;
+
+ entry->ct = ct;
+
+ flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_ORIGINAL);
+ flow_offload_fill_dir(flow, ct, route, FLOW_OFFLOAD_DIR_REPLY);
+
+ if (ct->status & IPS_SRC_NAT)
+ flow->flags |= FLOW_OFFLOAD_SNAT;
+ else if (ct->status & IPS_DST_NAT)
+ flow->flags |= FLOW_OFFLOAD_DNAT;
+
+ return flow;
+
+err_dst_cache_reply:
+ dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
+err_dst_cache_original:
+ kfree(entry);
+err_ct_refcnt:
+ nf_ct_put(ct);
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(flow_offload_alloc);
+
+void flow_offload_free(struct flow_offload *flow)
+{
+ struct flow_offload_entry *e;
+
+ dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
+ dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
+ e = container_of(flow, struct flow_offload_entry, flow);
+ nf_ct_delete(e->ct, 0, 0);
+ nf_ct_put(e->ct);
+ kfree_rcu(e, rcu_head);
+}
+EXPORT_SYMBOL_GPL(flow_offload_free);
+
+void flow_offload_dead(struct flow_offload *flow)
+{
+ flow->flags |= FLOW_OFFLOAD_DYING;
+}
+EXPORT_SYMBOL_GPL(flow_offload_dead);
+
+int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
+{
+ flow->timeout = (u32)jiffies;
+
+ rhashtable_insert_fast(&flow_table->rhashtable,
+ &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
+ *flow_table->type->params);
+ rhashtable_insert_fast(&flow_table->rhashtable,
+ &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
+ *flow_table->type->params);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(flow_offload_add);
+
+static void flow_offload_del(struct nf_flowtable *flow_table,
+ struct flow_offload *flow)
+{
+ rhashtable_remove_fast(&flow_table->rhashtable,
+ &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
+ *flow_table->type->params);
+ rhashtable_remove_fast(&flow_table->rhashtable,
+ &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
+ *flow_table->type->params);
+
+ flow_offload_free(flow);
+}
+
+struct flow_offload_tuple_rhash *
+flow_offload_lookup(struct nf_flowtable *flow_table,
+ struct flow_offload_tuple *tuple)
+{
+ return rhashtable_lookup_fast(&flow_table->rhashtable, tuple,
+ *flow_table->type->params);
+}
+EXPORT_SYMBOL_GPL(flow_offload_lookup);
+
+int nf_flow_table_iterate(struct nf_flowtable *flow_table,
+ void (*iter)(struct flow_offload *flow, void *data),
+ void *data)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct rhashtable_iter hti;
+ struct flow_offload *flow;
+ int err;
+
+ err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
+ if (err)
+ return err;
+
+ rhashtable_walk_start(&hti);
+
+ while ((tuplehash = rhashtable_walk_next(&hti))) {
+ if (IS_ERR(tuplehash)) {
+ err = PTR_ERR(tuplehash);
+ if (err != -EAGAIN)
+ goto out;
+
+ continue;
+ }
+ if (tuplehash->tuple.dir)
+ continue;
+
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
+
+ iter(flow, data);
+ }
+out:
+ rhashtable_walk_stop(&hti);
+ rhashtable_walk_exit(&hti);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_iterate);
+
+static inline bool nf_flow_has_expired(const struct flow_offload *flow)
+{
+ return (__s32)(flow->timeout - (u32)jiffies) <= 0;
+}
+
+static inline bool nf_flow_is_dying(const struct flow_offload *flow)
+{
+ return flow->flags & FLOW_OFFLOAD_DYING;
+}
+
+static int nf_flow_offload_gc_step(struct nf_flowtable *flow_table)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct rhashtable_iter hti;
+ struct flow_offload *flow;
+ int err;
+
+ err = rhashtable_walk_init(&flow_table->rhashtable, &hti, GFP_KERNEL);
+ if (err)
+ return 0;
+
+ rhashtable_walk_start(&hti);
+
+ while ((tuplehash = rhashtable_walk_next(&hti))) {
+ if (IS_ERR(tuplehash)) {
+ err = PTR_ERR(tuplehash);
+ if (err != -EAGAIN)
+ goto out;
+
+ continue;
+ }
+ if (tuplehash->tuple.dir)
+ continue;
+
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
+
+ if (nf_flow_has_expired(flow) ||
+ nf_flow_is_dying(flow))
+ flow_offload_del(flow_table, flow);
+ }
+out:
+ rhashtable_walk_stop(&hti);
+ rhashtable_walk_exit(&hti);
+
+ return 1;
+}
+
+void nf_flow_offload_work_gc(struct work_struct *work)
+{
+ struct nf_flowtable *flow_table;
+
+ flow_table = container_of(work, struct nf_flowtable, gc_work.work);
+ nf_flow_offload_gc_step(flow_table);
+ queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
+}
+EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc);
+
+static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
+{
+ const struct flow_offload_tuple *tuple = data;
+
+ return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
+}
+
+static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
+{
+ const struct flow_offload_tuple_rhash *tuplehash = data;
+
+ return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
+}
+
+static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct flow_offload_tuple *tuple = arg->key;
+ const struct flow_offload_tuple_rhash *x = ptr;
+
+ if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
+ return 1;
+
+ return 0;
+}
+
+const struct rhashtable_params nf_flow_offload_rhash_params = {
+ .head_offset = offsetof(struct flow_offload_tuple_rhash, node),
+ .hashfn = flow_offload_hash,
+ .obj_hashfn = flow_offload_hash_obj,
+ .obj_cmpfn = flow_offload_hash_cmp,
+ .automatic_shrinking = true,
+};
+EXPORT_SYMBOL_GPL(nf_flow_offload_rhash_params);
+
+static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
+ __be16 port, __be16 new_port)
+{
+ struct tcphdr *tcph;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*tcph)))
+ return -1;
+
+ tcph = (void *)(skb_network_header(skb) + thoff);
+ inet_proto_csum_replace2(&tcph->check, skb, port, new_port, true);
+
+ return 0;
+}
+
+static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
+ __be16 port, __be16 new_port)
+{
+ struct udphdr *udph;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*udph)))
+ return -1;
+
+ udph = (void *)(skb_network_header(skb) + thoff);
+ if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+ inet_proto_csum_replace2(&udph->check, skb, port,
+ new_port, true);
+ if (!udph->check)
+ udph->check = CSUM_MANGLED_0;
+ }
+
+ return 0;
+}
+
+static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, __be16 port, __be16 new_port)
+{
+ switch (protocol) {
+ case IPPROTO_TCP:
+ if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0)
+ return NF_DROP;
+ break;
+ case IPPROTO_UDP:
+ if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0)
+ return NF_DROP;
+ break;
+ }
+
+ return 0;
+}
+
+int nf_flow_snat_port(const struct flow_offload *flow,
+ struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, enum flow_offload_tuple_dir dir)
+{
+ struct flow_ports *hdr;
+ __be16 port, new_port;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*hdr)))
+ return -1;
+
+ hdr = (void *)(skb_network_header(skb) + thoff);
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ port = hdr->source;
+ new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
+ hdr->source = new_port;
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ port = hdr->dest;
+ new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
+ hdr->dest = new_port;
+ break;
+ default:
+ return -1;
+ }
+
+ return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
+}
+EXPORT_SYMBOL_GPL(nf_flow_snat_port);
+
+int nf_flow_dnat_port(const struct flow_offload *flow,
+ struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, enum flow_offload_tuple_dir dir)
+{
+ struct flow_ports *hdr;
+ __be16 port, new_port;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*hdr)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*hdr)))
+ return -1;
+
+ hdr = (void *)(skb_network_header(skb) + thoff);
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ port = hdr->dest;
+ new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port;
+ hdr->dest = new_port;
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ port = hdr->source;
+ new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
+ hdr->source = new_port;
+ break;
+ default:
+ return -1;
+ }
+
+ return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
+}
+EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
+
+static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
+{
+ struct net_device *dev = data;
+
+ if (dev && flow->tuplehash[0].tuple.iifidx != dev->ifindex)
+ return;
+
+ flow_offload_dead(flow);
+}
+
+static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
+ void *data)
+{
+ nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, data);
+ flush_delayed_work(&flowtable->gc_work);
+}
+
+void nf_flow_table_cleanup(struct net *net, struct net_device *dev)
+{
+ nft_flow_table_iterate(net, nf_flow_table_iterate_cleanup, dev);
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
+
+void nf_flow_table_free(struct nf_flowtable *flow_table)
+{
+ nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
+ WARN_ON(!nf_flow_offload_gc_step(flow_table));
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_free);
+
+static int nf_flow_table_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (event != NETDEV_DOWN)
+ return NOTIFY_DONE;
+
+ nf_flow_table_cleanup(dev_net(dev), dev);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block flow_offload_netdev_notifier = {
+ .notifier_call = nf_flow_table_netdev_event,
+};
+
+static int __init nf_flow_table_module_init(void)
+{
+ return register_netdevice_notifier(&flow_offload_netdev_notifier);
+}
+
+static void __exit nf_flow_table_module_exit(void)
+{
+ unregister_netdevice_notifier(&flow_offload_netdev_notifier);
+}
+
+module_init(nf_flow_table_module_init);
+module_exit(nf_flow_table_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");

View file

@ -0,0 +1,522 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 17 Feb 2018 11:49:44 +0100
Subject: [PATCH] netfilter: nf_flow_table: move ipv4 offload hook code to
nf_flow_table
Allows some minor code sharing with the ipv6 hook code and is also
useful as preparation for adding iptables support for offload
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
create mode 100644 net/netfilter/nf_flow_table_ip.c
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -2,248 +2,8 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netfilter.h>
-#include <linux/rhashtable.h>
-#include <linux/ip.h>
-#include <linux/netdevice.h>
-#include <net/ip.h>
-#include <net/neighbour.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_tables.h>
-/* For layer 4 checksum field offset. */
-#include <linux/tcp.h>
-#include <linux/udp.h>
-
-static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
- __be32 addr, __be32 new_addr)
-{
- struct tcphdr *tcph;
-
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
- tcph = (void *)(skb_network_header(skb) + thoff);
- inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
-
- return 0;
-}
-
-static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
- __be32 addr, __be32 new_addr)
-{
- struct udphdr *udph;
-
- if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
- udph = (void *)(skb_network_header(skb) + thoff);
- if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
- inet_proto_csum_replace4(&udph->check, skb, addr,
- new_addr, true);
- if (!udph->check)
- udph->check = CSUM_MANGLED_0;
- }
-
- return 0;
-}
-
-static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
- unsigned int thoff, __be32 addr,
- __be32 new_addr)
-{
- switch (iph->protocol) {
- case IPPROTO_TCP:
- if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
- break;
- case IPPROTO_UDP:
- if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
- break;
- }
-
- return 0;
-}
-
-static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- struct iphdr *iph, unsigned int thoff,
- enum flow_offload_tuple_dir dir)
-{
- __be32 addr, new_addr;
-
- switch (dir) {
- case FLOW_OFFLOAD_DIR_ORIGINAL:
- addr = iph->saddr;
- new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
- iph->saddr = new_addr;
- break;
- case FLOW_OFFLOAD_DIR_REPLY:
- addr = iph->daddr;
- new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
- iph->daddr = new_addr;
- break;
- default:
- return -1;
- }
- csum_replace4(&iph->check, addr, new_addr);
-
- return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
-}
-
-static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- struct iphdr *iph, unsigned int thoff,
- enum flow_offload_tuple_dir dir)
-{
- __be32 addr, new_addr;
-
- switch (dir) {
- case FLOW_OFFLOAD_DIR_ORIGINAL:
- addr = iph->daddr;
- new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr;
- iph->daddr = new_addr;
- break;
- case FLOW_OFFLOAD_DIR_REPLY:
- addr = iph->saddr;
- new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
- iph->saddr = new_addr;
- break;
- default:
- return -1;
- }
-
- return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
-}
-
-static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- enum flow_offload_tuple_dir dir)
-{
- struct iphdr *iph = ip_hdr(skb);
- unsigned int thoff = iph->ihl * 4;
-
- if (flow->flags & FLOW_OFFLOAD_SNAT &&
- (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0))
- return -1;
- if (flow->flags & FLOW_OFFLOAD_DNAT &&
- (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0))
- return -1;
-
- return 0;
-}
-
-static bool ip_has_options(unsigned int thoff)
-{
- return thoff != sizeof(struct iphdr);
-}
-
-static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
- struct flow_offload_tuple *tuple)
-{
- struct flow_ports *ports;
- unsigned int thoff;
- struct iphdr *iph;
-
- if (!pskb_may_pull(skb, sizeof(*iph)))
- return -1;
-
- iph = ip_hdr(skb);
- thoff = iph->ihl * 4;
-
- if (ip_is_fragment(iph) ||
- unlikely(ip_has_options(thoff)))
- return -1;
-
- if (iph->protocol != IPPROTO_TCP &&
- iph->protocol != IPPROTO_UDP)
- return -1;
-
- thoff = iph->ihl * 4;
- if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
- return -1;
-
- ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
-
- tuple->src_v4.s_addr = iph->saddr;
- tuple->dst_v4.s_addr = iph->daddr;
- tuple->src_port = ports->source;
- tuple->dst_port = ports->dest;
- tuple->l3proto = AF_INET;
- tuple->l4proto = iph->protocol;
- tuple->iifidx = dev->ifindex;
-
- return 0;
-}
-
-/* Based on ip_exceeds_mtu(). */
-static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
-{
- if (skb->len <= mtu)
- return false;
-
- if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)
- return false;
-
- if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
- return false;
-
- return true;
-}
-
-unsigned int
-nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct flow_offload_tuple_rhash *tuplehash;
- struct nf_flowtable *flow_table = priv;
- struct flow_offload_tuple tuple = {};
- enum flow_offload_tuple_dir dir;
- struct flow_offload *flow;
- struct net_device *outdev;
- const struct rtable *rt;
- struct iphdr *iph;
- __be32 nexthop;
-
- if (skb->protocol != htons(ETH_P_IP))
- return NF_ACCEPT;
-
- if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
- return NF_ACCEPT;
-
- tuplehash = flow_offload_lookup(flow_table, &tuple);
- if (tuplehash == NULL)
- return NF_ACCEPT;
-
- outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
- if (!outdev)
- return NF_ACCEPT;
-
- dir = tuplehash->tuple.dir;
- flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
- rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
-
- if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
- return NF_ACCEPT;
-
- if (skb_try_make_writable(skb, sizeof(*iph)))
- return NF_DROP;
-
- if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
- nf_flow_nat_ip(flow, skb, dir) < 0)
- return NF_DROP;
-
- flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
- iph = ip_hdr(skb);
- ip_decrease_ttl(iph);
-
- skb->dev = outdev;
- nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
- neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
-
- return NF_STOLEN;
-}
-EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
static struct nf_flowtable_type flowtable_ipv4 = {
.family = NFPROTO_IPV4,
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -113,7 +113,7 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_
# flow table infrastructure
obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o
-nf_flow_table-objs := nf_flow_table_core.o
+nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o
obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
--- /dev/null
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -0,0 +1,245 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/neighbour.h>
+#include <net/netfilter/nf_flow_table.h>
+/* For layer 4 checksum field offset. */
+#include <linux/tcp.h>
+#include <linux/udp.h>
+
+static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
+ __be32 addr, __be32 new_addr)
+{
+ struct tcphdr *tcph;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*tcph)))
+ return -1;
+
+ tcph = (void *)(skb_network_header(skb) + thoff);
+ inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
+
+ return 0;
+}
+
+static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
+ __be32 addr, __be32 new_addr)
+{
+ struct udphdr *udph;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*udph)))
+ return -1;
+
+ udph = (void *)(skb_network_header(skb) + thoff);
+ if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+ inet_proto_csum_replace4(&udph->check, skb, addr,
+ new_addr, true);
+ if (!udph->check)
+ udph->check = CSUM_MANGLED_0;
+ }
+
+ return 0;
+}
+
+static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
+ unsigned int thoff, __be32 addr,
+ __be32 new_addr)
+{
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0)
+ return NF_DROP;
+ break;
+ case IPPROTO_UDP:
+ if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0)
+ return NF_DROP;
+ break;
+ }
+
+ return 0;
+}
+
+static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+ struct iphdr *iph, unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
+{
+ __be32 addr, new_addr;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = iph->saddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr;
+ iph->saddr = new_addr;
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = iph->daddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
+ iph->daddr = new_addr;
+ break;
+ default:
+ return -1;
+ }
+ csum_replace4(&iph->check, addr, new_addr);
+
+ return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+}
+
+static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+ struct iphdr *iph, unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
+{
+ __be32 addr, new_addr;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = iph->daddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr;
+ iph->daddr = new_addr;
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = iph->saddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
+ iph->saddr = new_addr;
+ break;
+ default:
+ return -1;
+ }
+
+ return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+}
+
+static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+ enum flow_offload_tuple_dir dir)
+{
+ struct iphdr *iph = ip_hdr(skb);
+ unsigned int thoff = iph->ihl * 4;
+
+ if (flow->flags & FLOW_OFFLOAD_SNAT &&
+ (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
+ nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0))
+ return -1;
+ if (flow->flags & FLOW_OFFLOAD_DNAT &&
+ (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
+ nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0))
+ return -1;
+
+ return 0;
+}
+
+static bool ip_has_options(unsigned int thoff)
+{
+ return thoff != sizeof(struct iphdr);
+}
+
+static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
+ struct flow_offload_tuple *tuple)
+{
+ struct flow_ports *ports;
+ unsigned int thoff;
+ struct iphdr *iph;
+
+ if (!pskb_may_pull(skb, sizeof(*iph)))
+ return -1;
+
+ iph = ip_hdr(skb);
+ thoff = iph->ihl * 4;
+
+ if (ip_is_fragment(iph) ||
+ unlikely(ip_has_options(thoff)))
+ return -1;
+
+ if (iph->protocol != IPPROTO_TCP &&
+ iph->protocol != IPPROTO_UDP)
+ return -1;
+
+ thoff = iph->ihl * 4;
+ if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+ return -1;
+
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+
+ tuple->src_v4.s_addr = iph->saddr;
+ tuple->dst_v4.s_addr = iph->daddr;
+ tuple->src_port = ports->source;
+ tuple->dst_port = ports->dest;
+ tuple->l3proto = AF_INET;
+ tuple->l4proto = iph->protocol;
+ tuple->iifidx = dev->ifindex;
+
+ return 0;
+}
+
+/* Based on ip_exceeds_mtu(). */
+static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
+{
+ if (skb->len <= mtu)
+ return false;
+
+ if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)
+ return false;
+
+ if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
+ return false;
+
+ return true;
+}
+
+unsigned int
+nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct nf_flowtable *flow_table = priv;
+ struct flow_offload_tuple tuple = {};
+ enum flow_offload_tuple_dir dir;
+ struct flow_offload *flow;
+ struct net_device *outdev;
+ const struct rtable *rt;
+ struct iphdr *iph;
+ __be32 nexthop;
+
+ if (skb->protocol != htons(ETH_P_IP))
+ return NF_ACCEPT;
+
+ if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
+ return NF_ACCEPT;
+
+ tuplehash = flow_offload_lookup(flow_table, &tuple);
+ if (tuplehash == NULL)
+ return NF_ACCEPT;
+
+ outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
+ if (!outdev)
+ return NF_ACCEPT;
+
+ dir = tuplehash->tuple.dir;
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+ rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
+
+ if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
+ return NF_ACCEPT;
+
+ if (skb_try_make_writable(skb, sizeof(*iph)))
+ return NF_DROP;
+
+ if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
+ nf_flow_nat_ip(flow, skb, dir) < 0)
+ return NF_DROP;
+
+ flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
+ iph = ip_hdr(skb);
+ ip_decrease_ttl(iph);
+
+ skb->dev = outdev;
+ nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
+ neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
+
+ return NF_STOLEN;
+}
+EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);

View file

@ -0,0 +1,32 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 17 Feb 2018 11:51:20 +0100
Subject: [PATCH] netfilter: nf_flow_table: move ip header check out of
nf_flow_exceeds_mtu
Allows the function to be shared with the IPv6 hook code
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -181,9 +181,6 @@ static bool nf_flow_exceeds_mtu(const st
if (skb->len <= mtu)
return false;
- if ((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0)
- return false;
-
if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
return false;
@@ -222,7 +219,8 @@ nf_flow_offload_ip_hook(void *priv, stru
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
- if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
+ if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) &&
+ (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0)
return NF_ACCEPT;
if (skb_try_make_writable(skb, sizeof(*iph)))

View file

@ -0,0 +1,483 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 17 Feb 2018 11:55:51 +0100
Subject: [PATCH] netfilter: nf_flow_table: move ipv6 offload hook code to
nf_flow_table
Useful as preparation for adding iptables support for offload
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -3,240 +3,8 @@
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/rhashtable.h>
-#include <linux/ipv6.h>
-#include <linux/netdevice.h>
-#include <net/ipv6.h>
-#include <net/ip6_route.h>
-#include <net/neighbour.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_tables.h>
-/* For layer 4 checksum field offset. */
-#include <linux/tcp.h>
-#include <linux/udp.h>
-
-static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
- struct in6_addr *addr,
- struct in6_addr *new_addr)
-{
- struct tcphdr *tcph;
-
- if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
- tcph = (void *)(skb_network_header(skb) + thoff);
- inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32,
- new_addr->s6_addr32, true);
-
- return 0;
-}
-
-static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
- struct in6_addr *addr,
- struct in6_addr *new_addr)
-{
- struct udphdr *udph;
-
- if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
- skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
- udph = (void *)(skb_network_header(skb) + thoff);
- if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
- inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32,
- new_addr->s6_addr32, true);
- if (!udph->check)
- udph->check = CSUM_MANGLED_0;
- }
-
- return 0;
-}
-
-static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
- unsigned int thoff, struct in6_addr *addr,
- struct in6_addr *new_addr)
-{
- switch (ip6h->nexthdr) {
- case IPPROTO_TCP:
- if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
- break;
- case IPPROTO_UDP:
- if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
- break;
- }
-
- return 0;
-}
-
-static int nf_flow_snat_ipv6(const struct flow_offload *flow,
- struct sk_buff *skb, struct ipv6hdr *ip6h,
- unsigned int thoff,
- enum flow_offload_tuple_dir dir)
-{
- struct in6_addr addr, new_addr;
-
- switch (dir) {
- case FLOW_OFFLOAD_DIR_ORIGINAL:
- addr = ip6h->saddr;
- new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6;
- ip6h->saddr = new_addr;
- break;
- case FLOW_OFFLOAD_DIR_REPLY:
- addr = ip6h->daddr;
- new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6;
- ip6h->daddr = new_addr;
- break;
- default:
- return -1;
- }
-
- return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
-}
-
-static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
- struct sk_buff *skb, struct ipv6hdr *ip6h,
- unsigned int thoff,
- enum flow_offload_tuple_dir dir)
-{
- struct in6_addr addr, new_addr;
-
- switch (dir) {
- case FLOW_OFFLOAD_DIR_ORIGINAL:
- addr = ip6h->daddr;
- new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6;
- ip6h->daddr = new_addr;
- break;
- case FLOW_OFFLOAD_DIR_REPLY:
- addr = ip6h->saddr;
- new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6;
- ip6h->saddr = new_addr;
- break;
- default:
- return -1;
- }
-
- return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
-}
-
-static int nf_flow_nat_ipv6(const struct flow_offload *flow,
- struct sk_buff *skb,
- enum flow_offload_tuple_dir dir)
-{
- struct ipv6hdr *ip6h = ipv6_hdr(skb);
- unsigned int thoff = sizeof(*ip6h);
-
- if (flow->flags & FLOW_OFFLOAD_SNAT &&
- (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
- nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
- return -1;
- if (flow->flags & FLOW_OFFLOAD_DNAT &&
- (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
- nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
- return -1;
-
- return 0;
-}
-
-static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
- struct flow_offload_tuple *tuple)
-{
- struct flow_ports *ports;
- struct ipv6hdr *ip6h;
- unsigned int thoff;
-
- if (!pskb_may_pull(skb, sizeof(*ip6h)))
- return -1;
-
- ip6h = ipv6_hdr(skb);
-
- if (ip6h->nexthdr != IPPROTO_TCP &&
- ip6h->nexthdr != IPPROTO_UDP)
- return -1;
-
- thoff = sizeof(*ip6h);
- if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
- return -1;
-
- ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
-
- tuple->src_v6 = ip6h->saddr;
- tuple->dst_v6 = ip6h->daddr;
- tuple->src_port = ports->source;
- tuple->dst_port = ports->dest;
- tuple->l3proto = AF_INET6;
- tuple->l4proto = ip6h->nexthdr;
- tuple->iifidx = dev->ifindex;
-
- return 0;
-}
-
-/* Based on ip_exceeds_mtu(). */
-static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
-{
- if (skb->len <= mtu)
- return false;
-
- if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
- return false;
-
- return true;
-}
-
-unsigned int
-nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct flow_offload_tuple_rhash *tuplehash;
- struct nf_flowtable *flow_table = priv;
- struct flow_offload_tuple tuple = {};
- enum flow_offload_tuple_dir dir;
- struct flow_offload *flow;
- struct net_device *outdev;
- struct in6_addr *nexthop;
- struct ipv6hdr *ip6h;
- struct rt6_info *rt;
-
- if (skb->protocol != htons(ETH_P_IPV6))
- return NF_ACCEPT;
-
- if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0)
- return NF_ACCEPT;
-
- tuplehash = flow_offload_lookup(flow_table, &tuple);
- if (tuplehash == NULL)
- return NF_ACCEPT;
-
- outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
- if (!outdev)
- return NF_ACCEPT;
-
- dir = tuplehash->tuple.dir;
- flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
- rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
-
- if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
- return NF_ACCEPT;
-
- if (skb_try_make_writable(skb, sizeof(*ip6h)))
- return NF_DROP;
-
- if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
- nf_flow_nat_ipv6(flow, skb, dir) < 0)
- return NF_DROP;
-
- flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
- ip6h = ipv6_hdr(skb);
- ip6h->hop_limit--;
-
- skb->dev = outdev;
- nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
- neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
-
- return NF_STOLEN;
-}
-EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
static struct nf_flowtable_type flowtable_ipv6 = {
.family = NFPROTO_IPV6,
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -4,8 +4,11 @@
#include <linux/netfilter.h>
#include <linux/rhashtable.h>
#include <linux/ip.h>
+#include <linux/ipv6.h>
#include <linux/netdevice.h>
#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
#include <net/neighbour.h>
#include <net/netfilter/nf_flow_table.h>
/* For layer 4 checksum field offset. */
@@ -241,3 +244,215 @@ nf_flow_offload_ip_hook(void *priv, stru
return NF_STOLEN;
}
EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
+
+static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
+ struct in6_addr *addr,
+ struct in6_addr *new_addr)
+{
+ struct tcphdr *tcph;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*tcph)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*tcph)))
+ return -1;
+
+ tcph = (void *)(skb_network_header(skb) + thoff);
+ inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32,
+ new_addr->s6_addr32, true);
+
+ return 0;
+}
+
+static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
+ struct in6_addr *addr,
+ struct in6_addr *new_addr)
+{
+ struct udphdr *udph;
+
+ if (!pskb_may_pull(skb, thoff + sizeof(*udph)) ||
+ skb_try_make_writable(skb, thoff + sizeof(*udph)))
+ return -1;
+
+ udph = (void *)(skb_network_header(skb) + thoff);
+ if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+ inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32,
+ new_addr->s6_addr32, true);
+ if (!udph->check)
+ udph->check = CSUM_MANGLED_0;
+ }
+
+ return 0;
+}
+
+static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff, struct in6_addr *addr,
+ struct in6_addr *new_addr)
+{
+ switch (ip6h->nexthdr) {
+ case IPPROTO_TCP:
+ if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0)
+ return NF_DROP;
+ break;
+ case IPPROTO_UDP:
+ if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0)
+ return NF_DROP;
+ break;
+ }
+
+ return 0;
+}
+
+static int nf_flow_snat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
+{
+ struct in6_addr addr, new_addr;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = ip6h->saddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6;
+ ip6h->saddr = new_addr;
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = ip6h->daddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6;
+ ip6h->daddr = new_addr;
+ break;
+ default:
+ return -1;
+ }
+
+ return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+}
+
+static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
+{
+ struct in6_addr addr, new_addr;
+
+ switch (dir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ addr = ip6h->daddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6;
+ ip6h->daddr = new_addr;
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ addr = ip6h->saddr;
+ new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6;
+ ip6h->saddr = new_addr;
+ break;
+ default:
+ return -1;
+ }
+
+ return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+}
+
+static int nf_flow_nat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb,
+ enum flow_offload_tuple_dir dir)
+{
+ struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ unsigned int thoff = sizeof(*ip6h);
+
+ if (flow->flags & FLOW_OFFLOAD_SNAT &&
+ (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
+ nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
+ return -1;
+ if (flow->flags & FLOW_OFFLOAD_DNAT &&
+ (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
+ nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
+ return -1;
+
+ return 0;
+}
+
+static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
+ struct flow_offload_tuple *tuple)
+{
+ struct flow_ports *ports;
+ struct ipv6hdr *ip6h;
+ unsigned int thoff;
+
+ if (!pskb_may_pull(skb, sizeof(*ip6h)))
+ return -1;
+
+ ip6h = ipv6_hdr(skb);
+
+ if (ip6h->nexthdr != IPPROTO_TCP &&
+ ip6h->nexthdr != IPPROTO_UDP)
+ return -1;
+
+ thoff = sizeof(*ip6h);
+ if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
+ return -1;
+
+ ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
+
+ tuple->src_v6 = ip6h->saddr;
+ tuple->dst_v6 = ip6h->daddr;
+ tuple->src_port = ports->source;
+ tuple->dst_port = ports->dest;
+ tuple->l3proto = AF_INET6;
+ tuple->l4proto = ip6h->nexthdr;
+ tuple->iifidx = dev->ifindex;
+
+ return 0;
+}
+
+unsigned int
+nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct flow_offload_tuple_rhash *tuplehash;
+ struct nf_flowtable *flow_table = priv;
+ struct flow_offload_tuple tuple = {};
+ enum flow_offload_tuple_dir dir;
+ struct flow_offload *flow;
+ struct net_device *outdev;
+ struct in6_addr *nexthop;
+ struct ipv6hdr *ip6h;
+ struct rt6_info *rt;
+
+ if (skb->protocol != htons(ETH_P_IPV6))
+ return NF_ACCEPT;
+
+ if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0)
+ return NF_ACCEPT;
+
+ tuplehash = flow_offload_lookup(flow_table, &tuple);
+ if (tuplehash == NULL)
+ return NF_ACCEPT;
+
+ outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.oifidx);
+ if (!outdev)
+ return NF_ACCEPT;
+
+ dir = tuplehash->tuple.dir;
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+ rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
+
+ if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
+ return NF_ACCEPT;
+
+ if (skb_try_make_writable(skb, sizeof(*ip6h)))
+ return NF_DROP;
+
+ if (flow->flags & (FLOW_OFFLOAD_SNAT | FLOW_OFFLOAD_DNAT) &&
+ nf_flow_nat_ipv6(flow, skb, dir) < 0)
+ return NF_DROP;
+
+ flow->timeout = (u32)jiffies + NF_FLOW_TIMEOUT;
+ ip6h = ipv6_hdr(skb);
+ ip6h->hop_limit--;
+
+ skb->dev = outdev;
+ nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
+ neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
+
+ return NF_STOLEN;
+}
+EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);

View file

@ -0,0 +1,23 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 17 Feb 2018 12:02:28 +0100
Subject: [PATCH] netfilter: nf_flow_table: relax mixed ipv4/ipv6 flowtable
dependencies
Since the offload hook code was moved, this table no longer depends on
the IPv4 and IPv6 flowtable modules
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -670,8 +670,7 @@ endif # NF_TABLES
config NF_FLOW_TABLE_INET
tristate "Netfilter flow table mixed IPv4/IPv6 module"
- depends on NF_FLOW_TABLE_IPV4
- depends on NF_FLOW_TABLE_IPV6
+ depends on NF_FLOW_TABLE
help
This option adds the flow table mixed IPv4/IPv6 support.

View file

@ -0,0 +1,298 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Sun, 18 Feb 2018 18:16:31 +0100
Subject: [PATCH] netfilter: nf_flow_table: move init code to
nf_flow_table_core.c
Reduces duplication of .gc and .params in flowtable type definitions and
makes the API clearer
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -14,9 +14,8 @@ struct nf_flowtable;
struct nf_flowtable_type {
struct list_head list;
int family;
- void (*gc)(struct work_struct *work);
+ int (*init)(struct nf_flowtable *ft);
void (*free)(struct nf_flowtable *ft);
- const struct rhashtable_params *params;
nf_hookfn *hook;
struct module *owner;
};
@@ -100,9 +99,8 @@ int nf_flow_table_iterate(struct nf_flow
void nf_flow_table_cleanup(struct net *net, struct net_device *dev);
+int nf_flow_table_init(struct nf_flowtable *flow_table);
void nf_flow_table_free(struct nf_flowtable *flow_table);
-void nf_flow_offload_work_gc(struct work_struct *work);
-extern const struct rhashtable_params nf_flow_offload_rhash_params;
void flow_offload_dead(struct flow_offload *flow);
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ b/net/ipv4/netfilter/nf_flow_table_ipv4.c
@@ -7,8 +7,7 @@
static struct nf_flowtable_type flowtable_ipv4 = {
.family = NFPROTO_IPV4,
- .params = &nf_flow_offload_rhash_params,
- .gc = nf_flow_offload_work_gc,
+ .init = nf_flow_table_init,
.free = nf_flow_table_free,
.hook = nf_flow_offload_ip_hook,
.owner = THIS_MODULE,
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ b/net/ipv6/netfilter/nf_flow_table_ipv6.c
@@ -8,8 +8,7 @@
static struct nf_flowtable_type flowtable_ipv6 = {
.family = NFPROTO_IPV6,
- .params = &nf_flow_offload_rhash_params,
- .gc = nf_flow_offload_work_gc,
+ .init = nf_flow_table_init,
.free = nf_flow_table_free,
.hook = nf_flow_offload_ipv6_hook,
.owner = THIS_MODULE,
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -116,16 +116,50 @@ void flow_offload_dead(struct flow_offlo
}
EXPORT_SYMBOL_GPL(flow_offload_dead);
+static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
+{
+ const struct flow_offload_tuple *tuple = data;
+
+ return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
+}
+
+static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
+{
+ const struct flow_offload_tuple_rhash *tuplehash = data;
+
+ return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
+}
+
+static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
+{
+ const struct flow_offload_tuple *tuple = arg->key;
+ const struct flow_offload_tuple_rhash *x = ptr;
+
+ if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
+ return 1;
+
+ return 0;
+}
+
+static const struct rhashtable_params nf_flow_offload_rhash_params = {
+ .head_offset = offsetof(struct flow_offload_tuple_rhash, node),
+ .hashfn = flow_offload_hash,
+ .obj_hashfn = flow_offload_hash_obj,
+ .obj_cmpfn = flow_offload_hash_cmp,
+ .automatic_shrinking = true,
+};
+
int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
{
flow->timeout = (u32)jiffies;
rhashtable_insert_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
- *flow_table->type->params);
+ nf_flow_offload_rhash_params);
rhashtable_insert_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
- *flow_table->type->params);
+ nf_flow_offload_rhash_params);
return 0;
}
EXPORT_SYMBOL_GPL(flow_offload_add);
@@ -135,10 +169,10 @@ static void flow_offload_del(struct nf_f
{
rhashtable_remove_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
- *flow_table->type->params);
+ nf_flow_offload_rhash_params);
rhashtable_remove_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
- *flow_table->type->params);
+ nf_flow_offload_rhash_params);
flow_offload_free(flow);
}
@@ -148,7 +182,7 @@ flow_offload_lookup(struct nf_flowtable
struct flow_offload_tuple *tuple)
{
return rhashtable_lookup_fast(&flow_table->rhashtable, tuple,
- *flow_table->type->params);
+ nf_flow_offload_rhash_params);
}
EXPORT_SYMBOL_GPL(flow_offload_lookup);
@@ -237,7 +271,7 @@ out:
return 1;
}
-void nf_flow_offload_work_gc(struct work_struct *work)
+static void nf_flow_offload_work_gc(struct work_struct *work)
{
struct nf_flowtable *flow_table;
@@ -245,42 +279,6 @@ void nf_flow_offload_work_gc(struct work
nf_flow_offload_gc_step(flow_table);
queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}
-EXPORT_SYMBOL_GPL(nf_flow_offload_work_gc);
-
-static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
-{
- const struct flow_offload_tuple *tuple = data;
-
- return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
-}
-
-static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
-{
- const struct flow_offload_tuple_rhash *tuplehash = data;
-
- return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
-}
-
-static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
- const void *ptr)
-{
- const struct flow_offload_tuple *tuple = arg->key;
- const struct flow_offload_tuple_rhash *x = ptr;
-
- if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
- return 1;
-
- return 0;
-}
-
-const struct rhashtable_params nf_flow_offload_rhash_params = {
- .head_offset = offsetof(struct flow_offload_tuple_rhash, node),
- .hashfn = flow_offload_hash,
- .obj_hashfn = flow_offload_hash_obj,
- .obj_cmpfn = flow_offload_hash_cmp,
- .automatic_shrinking = true,
-};
-EXPORT_SYMBOL_GPL(nf_flow_offload_rhash_params);
static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
__be16 port, __be16 new_port)
@@ -398,6 +396,24 @@ int nf_flow_dnat_port(const struct flow_
}
EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
+int nf_flow_table_init(struct nf_flowtable *flowtable)
+{
+ int err;
+
+ INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
+
+ err = rhashtable_init(&flowtable->rhashtable,
+ &nf_flow_offload_rhash_params);
+ if (err < 0)
+ return err;
+
+ queue_delayed_work(system_power_efficient_wq,
+ &flowtable->gc_work, HZ);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_init);
+
static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
{
struct net_device *dev = data;
@@ -423,8 +439,10 @@ EXPORT_SYMBOL_GPL(nf_flow_table_cleanup)
void nf_flow_table_free(struct nf_flowtable *flow_table)
{
+ cancel_delayed_work_sync(&flow_table->gc_work);
nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
WARN_ON(!nf_flow_offload_gc_step(flow_table));
+ rhashtable_destroy(&flow_table->rhashtable);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);
--- a/net/netfilter/nf_flow_table_inet.c
+++ b/net/netfilter/nf_flow_table_inet.c
@@ -22,8 +22,7 @@ nf_flow_offload_inet_hook(void *priv, st
static struct nf_flowtable_type flowtable_inet = {
.family = NFPROTO_INET,
- .params = &nf_flow_offload_rhash_params,
- .gc = nf_flow_offload_work_gc,
+ .init = nf_flow_table_init,
.free = nf_flow_table_free,
.hook = nf_flow_offload_inet_hook,
.owner = THIS_MODULE,
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -5013,40 +5013,38 @@ static int nf_tables_newflowtable(struct
}
flowtable->data.type = type;
- err = rhashtable_init(&flowtable->data.rhashtable, type->params);
+ err = type->init(&flowtable->data);
if (err < 0)
goto err3;
err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK],
flowtable);
if (err < 0)
- goto err3;
+ goto err4;
for (i = 0; i < flowtable->ops_len; i++) {
err = nf_register_net_hook(net, &flowtable->ops[i]);
if (err < 0)
- goto err4;
+ goto err5;
}
err = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable);
if (err < 0)
- goto err5;
-
- INIT_DEFERRABLE_WORK(&flowtable->data.gc_work, type->gc);
- queue_delayed_work(system_power_efficient_wq,
- &flowtable->data.gc_work, HZ);
+ goto err6;
list_add_tail_rcu(&flowtable->list, &table->flowtables);
table->use++;
return 0;
-err5:
+err6:
i = flowtable->ops_len;
-err4:
+err5:
for (k = i - 1; k >= 0; k--)
nf_unregister_net_hook(net, &flowtable->ops[i]);
kfree(flowtable->ops);
+err4:
+ flowtable->data.type->free(&flowtable->data);
err3:
module_put(type->owner);
err2:
@@ -5325,10 +5323,8 @@ err:
static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
{
- cancel_delayed_work_sync(&flowtable->data.gc_work);
kfree(flowtable->name);
flowtable->data.type->free(&flowtable->data);
- rhashtable_destroy(&flowtable->data.rhashtable);
module_put(flowtable->data.type->owner);
}

View file

@ -0,0 +1,22 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 20 Feb 2018 14:48:51 +0100
Subject: [PATCH] netfilter: nf_flow_table: fix priv pointer for netdev hook
The offload ip hook expects a pointer to the flowtable, not to the
rhashtable. Since the rhashtable is the first member, this is safe for
the moment, but breaks as soon as the structure layout changes
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4879,7 +4879,7 @@ static int nf_tables_flowtable_parse_hoo
flowtable->ops[i].pf = NFPROTO_NETDEV;
flowtable->ops[i].hooknum = hooknum;
flowtable->ops[i].priority = priority;
- flowtable->ops[i].priv = &flowtable->data.rhashtable;
+ flowtable->ops[i].priv = &flowtable->data;
flowtable->ops[i].hook = flowtable->data.type->hook;
flowtable->ops[i].dev = dev_array[i];
}

View file

@ -0,0 +1,114 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Tue, 20 Feb 2018 14:08:14 +0100
Subject: [PATCH] netfilter: nf_flow_table: track flow tables in nf_flow_table
directly
Avoids having nf_flow_table depend on nftables (useful for future
iptables backport work)
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -21,6 +21,7 @@ struct nf_flowtable_type {
};
struct nf_flowtable {
+ struct list_head list;
struct rhashtable rhashtable;
const struct nf_flowtable_type *type;
struct delayed_work gc_work;
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1091,9 +1091,6 @@ struct nft_flowtable {
struct nft_flowtable *nf_tables_flowtable_lookup(const struct nft_table *table,
const struct nlattr *nla,
u8 genmask);
-void nft_flow_table_iterate(struct net *net,
- void (*iter)(struct nf_flowtable *flowtable, void *data),
- void *data);
void nft_register_flowtable_type(struct nf_flowtable_type *type);
void nft_unregister_flowtable_type(struct nf_flowtable_type *type);
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -18,6 +18,9 @@ struct flow_offload_entry {
struct rcu_head rcu_head;
};
+static DEFINE_MUTEX(flowtable_lock);
+static LIST_HEAD(flowtables);
+
static void
flow_offload_fill_dir(struct flow_offload *flow, struct nf_conn *ct,
struct nf_flow_route *route,
@@ -410,6 +413,10 @@ int nf_flow_table_init(struct nf_flowtab
queue_delayed_work(system_power_efficient_wq,
&flowtable->gc_work, HZ);
+ mutex_lock(&flowtable_lock);
+ list_add(&flowtable->list, &flowtables);
+ mutex_unlock(&flowtable_lock);
+
return 0;
}
EXPORT_SYMBOL_GPL(nf_flow_table_init);
@@ -425,20 +432,28 @@ static void nf_flow_table_do_cleanup(str
}
static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
- void *data)
+ struct net_device *dev)
{
- nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, data);
+ nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
flush_delayed_work(&flowtable->gc_work);
}
void nf_flow_table_cleanup(struct net *net, struct net_device *dev)
{
- nft_flow_table_iterate(net, nf_flow_table_iterate_cleanup, dev);
+ struct nf_flowtable *flowtable;
+
+ mutex_lock(&flowtable_lock);
+ list_for_each_entry(flowtable, &flowtables, list)
+ nf_flow_table_iterate_cleanup(flowtable, dev);
+ mutex_unlock(&flowtable_lock);
}
EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
void nf_flow_table_free(struct nf_flowtable *flow_table)
{
+ mutex_lock(&flowtable_lock);
+ list_del(&flow_table->list);
+ mutex_unlock(&flowtable_lock);
cancel_delayed_work_sync(&flow_table->gc_work);
nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
WARN_ON(!nf_flow_offload_gc_step(flow_table));
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4923,23 +4923,6 @@ static const struct nf_flowtable_type *n
return ERR_PTR(-ENOENT);
}
-void nft_flow_table_iterate(struct net *net,
- void (*iter)(struct nf_flowtable *flowtable, void *data),
- void *data)
-{
- struct nft_flowtable *flowtable;
- const struct nft_table *table;
-
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
- list_for_each_entry(table, &net->nft.tables, list) {
- list_for_each_entry(flowtable, &table->flowtables, list) {
- iter(&flowtable->data, data);
- }
- }
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
-}
-EXPORT_SYMBOL_GPL(nft_flow_table_iterate);
-
static void nft_unregister_flowtable_net_hooks(struct net *net,
struct nft_flowtable *flowtable)
{

View file

@ -0,0 +1,566 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Thu, 11 Jan 2018 16:32:00 +0100
Subject: [PATCH] netfilter: nf_flow_table: add hardware offload support
This patch adds the infrastructure to offload flows to hardware, in case
the nic/switch comes with built-in flow tables capabilities.
If the hardware comes with no hardware flow tables or they have
limitations in terms of features, the existing infrastructure falls back
to the software flow table implementation.
The software flow table garbage collector skips entries that resides in
the hardware, so the hardware will be responsible for releasing this
flow table entry too via flow_offload_dead().
Hardware configuration, either to add or to delete entries, is done from
the hardware offload workqueue, to ensure this is done from user context
given that we may sleep when grabbing the mdio mutex.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
create mode 100644 net/netfilter/nf_flow_table_hw.c
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -826,6 +826,13 @@ struct xfrmdev_ops {
};
#endif
+struct flow_offload;
+
+enum flow_offload_type {
+ FLOW_OFFLOAD_ADD = 0,
+ FLOW_OFFLOAD_DEL,
+};
+
/*
* This structure defines the management hooks for network devices.
* The following hooks can be defined; unless noted otherwise, they are
@@ -1057,6 +1064,10 @@ struct xfrmdev_ops {
* int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh,
* u16 flags);
*
+ * int (*ndo_flow_offload)(enum flow_offload_type type,
+ * struct flow_offload *flow);
+ * Adds/deletes flow entry to/from net device flowtable.
+ *
* int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier);
* Called to change device carrier. Soft-devices (like dummy, team, etc)
* which do not represent real hardware may define this to allow their
@@ -1281,6 +1292,8 @@ struct net_device_ops {
int (*ndo_bridge_dellink)(struct net_device *dev,
struct nlmsghdr *nlh,
u16 flags);
+ int (*ndo_flow_offload)(enum flow_offload_type type,
+ struct flow_offload *flow);
int (*ndo_change_carrier)(struct net_device *dev,
bool new_carrier);
int (*ndo_get_phys_port_id)(struct net_device *dev,
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -20,11 +20,17 @@ struct nf_flowtable_type {
struct module *owner;
};
+enum nf_flowtable_flags {
+ NF_FLOWTABLE_F_HW = 0x1,
+};
+
struct nf_flowtable {
struct list_head list;
struct rhashtable rhashtable;
const struct nf_flowtable_type *type;
+ u32 flags;
struct delayed_work gc_work;
+ possible_net_t ft_net;
};
enum flow_offload_tuple_dir {
@@ -68,6 +74,7 @@ struct flow_offload_tuple_rhash {
#define FLOW_OFFLOAD_SNAT 0x1
#define FLOW_OFFLOAD_DNAT 0x2
#define FLOW_OFFLOAD_DYING 0x4
+#define FLOW_OFFLOAD_HW 0x8
struct flow_offload {
struct flow_offload_tuple_rhash tuplehash[FLOW_OFFLOAD_DIR_MAX];
@@ -121,6 +128,22 @@ unsigned int nf_flow_offload_ip_hook(voi
unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state);
+void nf_flow_offload_hw_add(struct net *net, struct flow_offload *flow,
+ struct nf_conn *ct);
+void nf_flow_offload_hw_del(struct net *net, struct flow_offload *flow);
+
+struct nf_flow_table_hw {
+ struct module *owner;
+ void (*add)(struct net *net, struct flow_offload *flow,
+ struct nf_conn *ct);
+ void (*del)(struct net *net, struct flow_offload *flow);
+};
+
+int nf_flow_table_hw_register(const struct nf_flow_table_hw *offload);
+void nf_flow_table_hw_unregister(const struct nf_flow_table_hw *offload);
+
+extern struct work_struct nf_flow_offload_hw_work;
+
#define MODULE_ALIAS_NF_FLOWTABLE(family) \
MODULE_ALIAS("nf-flowtable-" __stringify(family))
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -1341,6 +1341,7 @@ enum nft_object_attributes {
* @NFTA_FLOWTABLE_HOOK: netfilter hook configuration(NLA_U32)
* @NFTA_FLOWTABLE_USE: number of references to this flow table (NLA_U32)
* @NFTA_FLOWTABLE_HANDLE: object handle (NLA_U64)
+ * @NFTA_FLOWTABLE_FLAGS: flags (NLA_U32)
*/
enum nft_flowtable_attributes {
NFTA_FLOWTABLE_UNSPEC,
@@ -1350,6 +1351,7 @@ enum nft_flowtable_attributes {
NFTA_FLOWTABLE_USE,
NFTA_FLOWTABLE_HANDLE,
NFTA_FLOWTABLE_PAD,
+ NFTA_FLOWTABLE_FLAGS,
__NFTA_FLOWTABLE_MAX
};
#define NFTA_FLOWTABLE_MAX (__NFTA_FLOWTABLE_MAX - 1)
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -686,6 +686,15 @@ config NF_FLOW_TABLE
To compile it as a module, choose M here.
+config NF_FLOW_TABLE_HW
+ tristate "Netfilter flow table hardware offload module"
+ depends on NF_FLOW_TABLE
+ help
+ This option adds hardware offload support for the flow table core
+ infrastructure.
+
+ To compile it as a module, choose M here.
+
config NETFILTER_XTABLES
tristate "Netfilter Xtables support (required for ip_tables)"
default m if NETFILTER_ADVANCED=n
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -116,6 +116,7 @@ obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_t
nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o
obj-$(CONFIG_NF_FLOW_TABLE_INET) += nf_flow_table_inet.o
+obj-$(CONFIG_NF_FLOW_TABLE_HW) += nf_flow_table_hw.o
# generic X tables
obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -167,9 +167,16 @@ int flow_offload_add(struct nf_flowtable
}
EXPORT_SYMBOL_GPL(flow_offload_add);
+static inline bool nf_flow_in_hw(const struct flow_offload *flow)
+{
+ return flow->flags & FLOW_OFFLOAD_HW;
+}
+
static void flow_offload_del(struct nf_flowtable *flow_table,
struct flow_offload *flow)
{
+ struct net *net = read_pnet(&flow_table->ft_net);
+
rhashtable_remove_fast(&flow_table->rhashtable,
&flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
nf_flow_offload_rhash_params);
@@ -177,6 +184,9 @@ static void flow_offload_del(struct nf_f
&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
nf_flow_offload_rhash_params);
+ if (nf_flow_in_hw(flow))
+ nf_flow_offload_hw_del(net, flow);
+
flow_offload_free(flow);
}
@@ -263,6 +273,10 @@ static int nf_flow_offload_gc_step(struc
flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
+ if (nf_flow_in_hw(flow) &&
+ !nf_flow_is_dying(flow))
+ continue;
+
if (nf_flow_has_expired(flow) ||
nf_flow_is_dying(flow))
flow_offload_del(flow_table, flow);
@@ -399,10 +413,43 @@ int nf_flow_dnat_port(const struct flow_
}
EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
+static const struct nf_flow_table_hw __rcu *nf_flow_table_hw_hook __read_mostly;
+
+static int nf_flow_offload_hw_init(struct nf_flowtable *flow_table)
+{
+ const struct nf_flow_table_hw *offload;
+
+ if (!rcu_access_pointer(nf_flow_table_hw_hook))
+ request_module("nf-flow-table-hw");
+
+ rcu_read_lock();
+ offload = rcu_dereference(nf_flow_table_hw_hook);
+ if (!offload)
+ goto err_no_hw_offload;
+
+ if (!try_module_get(offload->owner))
+ goto err_no_hw_offload;
+
+ rcu_read_unlock();
+
+ return 0;
+
+err_no_hw_offload:
+ rcu_read_unlock();
+
+ return -EOPNOTSUPP;
+}
+
int nf_flow_table_init(struct nf_flowtable *flowtable)
{
int err;
+ if (flowtable->flags & NF_FLOWTABLE_F_HW) {
+ err = nf_flow_offload_hw_init(flowtable);
+ if (err)
+ return err;
+ }
+
INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
err = rhashtable_init(&flowtable->rhashtable,
@@ -436,6 +483,8 @@ static void nf_flow_table_iterate_cleanu
{
nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
flush_delayed_work(&flowtable->gc_work);
+ if (flowtable->flags & NF_FLOWTABLE_F_HW)
+ flush_work(&nf_flow_offload_hw_work);
}
void nf_flow_table_cleanup(struct net *net, struct net_device *dev)
@@ -449,6 +498,26 @@ void nf_flow_table_cleanup(struct net *n
}
EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
+struct work_struct nf_flow_offload_hw_work;
+EXPORT_SYMBOL_GPL(nf_flow_offload_hw_work);
+
+/* Give the hardware workqueue the chance to remove entries from hardware.*/
+static void nf_flow_offload_hw_free(struct nf_flowtable *flowtable)
+{
+ const struct nf_flow_table_hw *offload;
+
+ flush_work(&nf_flow_offload_hw_work);
+
+ rcu_read_lock();
+ offload = rcu_dereference(nf_flow_table_hw_hook);
+ if (!offload) {
+ rcu_read_unlock();
+ return;
+ }
+ module_put(offload->owner);
+ rcu_read_unlock();
+}
+
void nf_flow_table_free(struct nf_flowtable *flow_table)
{
mutex_lock(&flowtable_lock);
@@ -458,9 +527,58 @@ void nf_flow_table_free(struct nf_flowta
nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
WARN_ON(!nf_flow_offload_gc_step(flow_table));
rhashtable_destroy(&flow_table->rhashtable);
+ if (flow_table->flags & NF_FLOWTABLE_F_HW)
+ nf_flow_offload_hw_free(flow_table);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);
+/* Must be called from user context. */
+void nf_flow_offload_hw_add(struct net *net, struct flow_offload *flow,
+ struct nf_conn *ct)
+{
+ const struct nf_flow_table_hw *offload;
+
+ rcu_read_lock();
+ offload = rcu_dereference(nf_flow_table_hw_hook);
+ if (offload)
+ offload->add(net, flow, ct);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(nf_flow_offload_hw_add);
+
+/* Must be called from user context. */
+void nf_flow_offload_hw_del(struct net *net, struct flow_offload *flow)
+{
+ const struct nf_flow_table_hw *offload;
+
+ rcu_read_lock();
+ offload = rcu_dereference(nf_flow_table_hw_hook);
+ if (offload)
+ offload->del(net, flow);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(nf_flow_offload_hw_del);
+
+int nf_flow_table_hw_register(const struct nf_flow_table_hw *offload)
+{
+ if (rcu_access_pointer(nf_flow_table_hw_hook))
+ return -EBUSY;
+
+ rcu_assign_pointer(nf_flow_table_hw_hook, offload);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_hw_register);
+
+void nf_flow_table_hw_unregister(const struct nf_flow_table_hw *offload)
+{
+ WARN_ON(rcu_access_pointer(nf_flow_table_hw_hook) != offload);
+ rcu_assign_pointer(nf_flow_table_hw_hook, NULL);
+
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(nf_flow_table_hw_unregister);
+
static int nf_flow_table_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
--- /dev/null
+++ b/net/netfilter/nf_flow_table_hw.c
@@ -0,0 +1,169 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/rhashtable.h>
+#include <linux/netdevice.h>
+#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_tuple.h>
+
+static DEFINE_SPINLOCK(flow_offload_hw_pending_list_lock);
+static LIST_HEAD(flow_offload_hw_pending_list);
+
+static DEFINE_MUTEX(nf_flow_offload_hw_mutex);
+
+struct flow_offload_hw {
+ struct list_head list;
+ enum flow_offload_type type;
+ struct flow_offload *flow;
+ struct nf_conn *ct;
+ possible_net_t flow_hw_net;
+};
+
+static int do_flow_offload_hw(struct net *net, struct flow_offload *flow,
+ int type)
+{
+ struct net_device *indev;
+ int ret, ifindex;
+
+ ifindex = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.iifidx;
+ indev = dev_get_by_index(net, ifindex);
+ if (WARN_ON(!indev))
+ return 0;
+
+ mutex_lock(&nf_flow_offload_hw_mutex);
+ ret = indev->netdev_ops->ndo_flow_offload(type, flow);
+ mutex_unlock(&nf_flow_offload_hw_mutex);
+
+ dev_put(indev);
+
+ return ret;
+}
+
+static void flow_offload_hw_work_add(struct flow_offload_hw *offload)
+{
+ struct net *net;
+ int ret;
+
+ if (nf_ct_is_dying(offload->ct))
+ return;
+
+ net = read_pnet(&offload->flow_hw_net);
+ ret = do_flow_offload_hw(net, offload->flow, FLOW_OFFLOAD_ADD);
+ if (ret >= 0)
+ offload->flow->flags |= FLOW_OFFLOAD_HW;
+}
+
+static void flow_offload_hw_work_del(struct flow_offload_hw *offload)
+{
+ struct net *net = read_pnet(&offload->flow_hw_net);
+
+ do_flow_offload_hw(net, offload->flow, FLOW_OFFLOAD_DEL);
+}
+
+static void flow_offload_hw_work(struct work_struct *work)
+{
+ struct flow_offload_hw *offload, *next;
+ LIST_HEAD(hw_offload_pending);
+
+ spin_lock_bh(&flow_offload_hw_pending_list_lock);
+ list_replace_init(&flow_offload_hw_pending_list, &hw_offload_pending);
+ spin_unlock_bh(&flow_offload_hw_pending_list_lock);
+
+ list_for_each_entry_safe(offload, next, &hw_offload_pending, list) {
+ switch (offload->type) {
+ case FLOW_OFFLOAD_ADD:
+ flow_offload_hw_work_add(offload);
+ break;
+ case FLOW_OFFLOAD_DEL:
+ flow_offload_hw_work_del(offload);
+ break;
+ }
+ if (offload->ct)
+ nf_conntrack_put(&offload->ct->ct_general);
+ list_del(&offload->list);
+ kfree(offload);
+ }
+}
+
+static void flow_offload_queue_work(struct flow_offload_hw *offload)
+{
+ spin_lock_bh(&flow_offload_hw_pending_list_lock);
+ list_add_tail(&offload->list, &flow_offload_hw_pending_list);
+ spin_unlock_bh(&flow_offload_hw_pending_list_lock);
+
+ schedule_work(&nf_flow_offload_hw_work);
+}
+
+static void flow_offload_hw_add(struct net *net, struct flow_offload *flow,
+ struct nf_conn *ct)
+{
+ struct flow_offload_hw *offload;
+
+ offload = kmalloc(sizeof(struct flow_offload_hw), GFP_ATOMIC);
+ if (!offload)
+ return;
+
+ nf_conntrack_get(&ct->ct_general);
+ offload->type = FLOW_OFFLOAD_ADD;
+ offload->ct = ct;
+ offload->flow = flow;
+ write_pnet(&offload->flow_hw_net, net);
+
+ flow_offload_queue_work(offload);
+}
+
+static void flow_offload_hw_del(struct net *net, struct flow_offload *flow)
+{
+ struct flow_offload_hw *offload;
+
+ offload = kmalloc(sizeof(struct flow_offload_hw), GFP_ATOMIC);
+ if (!offload)
+ return;
+
+ offload->type = FLOW_OFFLOAD_DEL;
+ offload->ct = NULL;
+ offload->flow = flow;
+ write_pnet(&offload->flow_hw_net, net);
+
+ flow_offload_queue_work(offload);
+}
+
+static const struct nf_flow_table_hw flow_offload_hw = {
+ .add = flow_offload_hw_add,
+ .del = flow_offload_hw_del,
+ .owner = THIS_MODULE,
+};
+
+static int __init nf_flow_table_hw_module_init(void)
+{
+ INIT_WORK(&nf_flow_offload_hw_work, flow_offload_hw_work);
+ nf_flow_table_hw_register(&flow_offload_hw);
+
+ return 0;
+}
+
+static void __exit nf_flow_table_hw_module_exit(void)
+{
+ struct flow_offload_hw *offload, *next;
+ LIST_HEAD(hw_offload_pending);
+
+ nf_flow_table_hw_unregister(&flow_offload_hw);
+ cancel_work_sync(&nf_flow_offload_hw_work);
+
+ list_for_each_entry_safe(offload, next, &hw_offload_pending, list) {
+ if (offload->ct)
+ nf_conntrack_put(&offload->ct->ct_general);
+ list_del(&offload->list);
+ kfree(offload);
+ }
+}
+
+module_init(nf_flow_table_hw_module_init);
+module_exit(nf_flow_table_hw_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS("nf-flow-table-hw");
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4866,6 +4866,14 @@ static int nf_tables_flowtable_parse_hoo
if (err < 0)
goto err1;
+ for (i = 0; i < n; i++) {
+ if (flowtable->data.flags & NF_FLOWTABLE_F_HW &&
+ !dev_array[i]->netdev_ops->ndo_flow_offload) {
+ err = -EOPNOTSUPP;
+ goto err1;
+ }
+ }
+
ops = kzalloc(sizeof(struct nf_hook_ops) * n, GFP_KERNEL);
if (!ops) {
err = -ENOMEM;
@@ -4996,10 +5004,19 @@ static int nf_tables_newflowtable(struct
}
flowtable->data.type = type;
+ write_pnet(&flowtable->data.ft_net, net);
+
err = type->init(&flowtable->data);
if (err < 0)
goto err3;
+ if (nla[NFTA_FLOWTABLE_FLAGS]) {
+ flowtable->data.flags =
+ ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
+ if (flowtable->data.flags & ~NF_FLOWTABLE_F_HW)
+ goto err4;
+ }
+
err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK],
flowtable);
if (err < 0)
@@ -5097,7 +5114,8 @@ static int nf_tables_fill_flowtable_info
nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) ||
nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) ||
nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle),
- NFTA_FLOWTABLE_PAD))
+ NFTA_FLOWTABLE_PAD) ||
+ nla_put_be32(skb, NFTA_FLOWTABLE_FLAGS, htonl(flowtable->data.flags)))
goto nla_put_failure;
nest = nla_nest_start(skb, NFTA_FLOWTABLE_HOOK);
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -110,6 +110,9 @@ static void nft_flow_offload_eval(const
if (ret < 0)
goto err_flow_add;
+ if (flowtable->flags & NF_FLOWTABLE_F_HW)
+ nf_flow_offload_hw_add(nft_net(pkt), flow, ct);
+
return;
err_flow_add:

View file

@ -3142,6 +3142,7 @@ CONFIG_NFS_V3=y
# CONFIG_NFT_DUP_IPV6 is not set
# CONFIG_NFT_FIB_IPV4 is not set
# CONFIG_NFT_FIB_IPV6 is not set
# CONFIG_NFT_FLOW_OFFLOAD is not set
# CONFIG_NFT_OBJREF is not set
# CONFIG_NFT_RT is not set
# CONFIG_NFT_SET_BITMAP is not set
@ -3174,6 +3175,7 @@ CONFIG_NF_CONNTRACK_PROCFS=y
# CONFIG_NF_DEFRAG_IPV4 is not set
# CONFIG_NF_DUP_IPV4 is not set
# CONFIG_NF_DUP_IPV6 is not set
# CONFIG_NF_FLOW_TABLE is not set
# CONFIG_NF_LOG_ARP is not set
# CONFIG_NF_LOG_IPV4 is not set
# CONFIG_NF_LOG_NETDEV is not set