kernel: backport fib_trie improvements/fixes from 4.0-rc

Signed-off-by: Felix Fietkau <nbd@openwrt.org>

SVN-Revision: 44695
This commit is contained in:
Felix Fietkau 2015-03-12 15:27:44 +00:00
parent 0e6ecf707d
commit 944612680d
25 changed files with 6099 additions and 0 deletions

View file

@ -0,0 +1,46 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Tue, 2 Dec 2014 10:58:21 -0800
Subject: [PATCH] fib_trie: Fix /proc/net/fib_trie when
CONFIG_IP_MULTIPLE_TABLES is not defined
In recent testing I had disabled CONFIG_IP_MULTIPLE_TABLES and as a result
when I ran "cat /proc/net/fib_trie" the main trie was displayed multiple
times. I found that the problem line of code was in the function
fib_trie_seq_next. Specifically the line below caused the indexes to go in
the opposite direction of our traversal:
h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
This issue was that the RT tables are defined such that RT_TABLE_LOCAL is ID
255, while it is located at TABLE_LOCAL_INDEX of 0, and RT_TABLE_MAIN is 254
with a TABLE_MAIN_INDEX of 1. This means that the above line will return 1
for the local table and 0 for main. The result is that fib_trie_seq_next
will return NULL at the end of the local table, fib_trie_seq_start will
return the start of the main table, and then fib_trie_seq_next will loop on
main forever as h will always return 0.
The fix for this is to reverse the ordering of the two tables. It has the
advantage of making it so that the tables now print in the same order
regardless of if multiple tables are enabled or not. In order to make the
definition consistent with the multiple tables case I simply masked the to
RT_TABLE_XXX values by (FIB_TABLE_HASHSZ - 1). This way the two table
layouts should always stay consistent.
Fixes: 93456b6 ("[IPV4]: Unify access to the routing tables")
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -201,8 +201,8 @@ void fib_free_table(struct fib_table *tb
#ifndef CONFIG_IP_MULTIPLE_TABLES
-#define TABLE_LOCAL_INDEX 0
-#define TABLE_MAIN_INDEX 1
+#define TABLE_LOCAL_INDEX (RT_TABLE_LOCAL & (FIB_TABLE_HASHSZ - 1))
+#define TABLE_MAIN_INDEX (RT_TABLE_MAIN & (FIB_TABLE_HASHSZ - 1))
static inline struct fib_table *fib_get_table(struct net *net, u32 id)
{

View file

@ -0,0 +1,72 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 10 Dec 2014 21:49:22 -0800
Subject: [PATCH] fib_trie: Fix trie balancing issue if new node pushes down
existing node
This patch addresses an issue with the level compression of the fib_trie.
Specifically in the case of adding a new leaf that triggers a new node to
be added that takes the place of the old node. The result is a trie where
the 1 child tnode is on one side and one leaf is on the other which gives
you a very deep trie. Below is the script I used to generate a trie on
dummy0 with a 10.X.X.X family of addresses.
ip link add type dummy
ipval=184549374
bit=2
for i in `seq 1 23`
do
ifconfig dummy0:$bit $ipval/8
ipval=`expr $ipval - $bit`
bit=`expr $bit \* 2`
done
cat /proc/net/fib_triestat
Running the script before the patch:
Local:
Aver depth: 10.82
Max depth: 23
Leaves: 29
Prefixes: 30
Internal nodes: 27
1: 26 2: 1
Pointers: 56
Null ptrs: 1
Total size: 5 kB
After applying the patch and repeating:
Local:
Aver depth: 4.72
Max depth: 9
Leaves: 29
Prefixes: 30
Internal nodes: 12
1: 3 2: 2 3: 7
Pointers: 70
Null ptrs: 30
Total size: 4 kB
What this fix does is start the rebalance at the newly created tnode
instead of at the parent tnode. This way if there is a gap between the
parent and the new node it doesn't prevent the new tnode from being
coalesced with any pre-existing nodes that may have been pushed into one
of the new nodes child branches.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1143,8 +1143,9 @@ static struct list_head *fib_insert_node
put_child(tp, cindex, (struct rt_trie_node *)tn);
} else {
rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
- tp = tn;
}
+
+ tp = tn;
}
if (tp && tp->pos + tp->bits > 32)

View file

@ -0,0 +1,200 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:55:29 -0800
Subject: [PATCH] fib_trie: Update usage stats to be percpu instead of
global variables
The trie usage stats were currently being shared by all threads that were
calling fib_table_lookup. As a result when multiple threads were
performing lookups simultaneously the trie would begin to cache bounce
between those threads.
In order to prevent this I have updated the usage stats to use a set of
percpu variables. By doing this we should be able to avoid the cache
bouncing and still make use of these stats.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -67,7 +67,7 @@ static int __net_init fib4_rules_init(st
return 0;
fail:
- kfree(local_table);
+ fib_free_table(local_table);
return -ENOMEM;
}
#else
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -153,7 +153,7 @@ struct trie_stat {
struct trie {
struct rt_trie_node __rcu *trie;
#ifdef CONFIG_IP_FIB_TRIE_STATS
- struct trie_use_stats stats;
+ struct trie_use_stats __percpu *stats;
#endif
};
@@ -631,7 +631,7 @@ static struct rt_trie_node *resize(struc
if (IS_ERR(tn)) {
tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.resize_node_skipped++;
+ this_cpu_inc(t->stats->resize_node_skipped);
#endif
break;
}
@@ -658,7 +658,7 @@ static struct rt_trie_node *resize(struc
if (IS_ERR(tn)) {
tn = old_tn;
#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.resize_node_skipped++;
+ this_cpu_inc(t->stats->resize_node_skipped);
#endif
break;
}
@@ -1357,7 +1357,7 @@ static int check_leaf(struct fib_table *
err = fib_props[fa->fa_type].error;
if (err) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.semantic_match_passed++;
+ this_cpu_inc(t->stats->semantic_match_passed);
#endif
return err;
}
@@ -1372,7 +1372,7 @@ static int check_leaf(struct fib_table *
continue;
#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.semantic_match_passed++;
+ this_cpu_inc(t->stats->semantic_match_passed);
#endif
res->prefixlen = li->plen;
res->nh_sel = nhsel;
@@ -1388,7 +1388,7 @@ static int check_leaf(struct fib_table *
}
#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.semantic_match_miss++;
+ this_cpu_inc(t->stats->semantic_match_miss);
#endif
}
@@ -1399,6 +1399,9 @@ int fib_table_lookup(struct fib_table *t
struct fib_result *res, int fib_flags)
{
struct trie *t = (struct trie *) tb->tb_data;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ struct trie_use_stats __percpu *stats = t->stats;
+#endif
int ret;
struct rt_trie_node *n;
struct tnode *pn;
@@ -1417,7 +1420,7 @@ int fib_table_lookup(struct fib_table *t
goto failed;
#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.gets++;
+ this_cpu_inc(stats->gets);
#endif
/* Just a leaf? */
@@ -1441,7 +1444,7 @@ int fib_table_lookup(struct fib_table *t
if (n == NULL) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.null_node_hit++;
+ this_cpu_inc(stats->null_node_hit);
#endif
goto backtrace;
}
@@ -1576,7 +1579,7 @@ backtrace:
chopped_off = 0;
#ifdef CONFIG_IP_FIB_TRIE_STATS
- t->stats.backtrack++;
+ this_cpu_inc(stats->backtrack);
#endif
goto backtrace;
}
@@ -1830,6 +1833,11 @@ int fib_table_flush(struct fib_table *tb
void fib_free_table(struct fib_table *tb)
{
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ struct trie *t = (struct trie *)tb->tb_data;
+
+ free_percpu(t->stats);
+#endif /* CONFIG_IP_FIB_TRIE_STATS */
kfree(tb);
}
@@ -1973,7 +1981,14 @@ struct fib_table *fib_trie_table(u32 id)
tb->tb_num_default = 0;
t = (struct trie *) tb->tb_data;
- memset(t, 0, sizeof(*t));
+ RCU_INIT_POINTER(t->trie, NULL);
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ t->stats = alloc_percpu(struct trie_use_stats);
+ if (!t->stats) {
+ kfree(tb);
+ tb = NULL;
+ }
+#endif
return tb;
}
@@ -2139,18 +2154,31 @@ static void trie_show_stats(struct seq_f
#ifdef CONFIG_IP_FIB_TRIE_STATS
static void trie_show_usage(struct seq_file *seq,
- const struct trie_use_stats *stats)
+ const struct trie_use_stats __percpu *stats)
{
+ struct trie_use_stats s = { 0 };
+ int cpu;
+
+ /* loop through all of the CPUs and gather up the stats */
+ for_each_possible_cpu(cpu) {
+ const struct trie_use_stats *pcpu = per_cpu_ptr(stats, cpu);
+
+ s.gets += pcpu->gets;
+ s.backtrack += pcpu->backtrack;
+ s.semantic_match_passed += pcpu->semantic_match_passed;
+ s.semantic_match_miss += pcpu->semantic_match_miss;
+ s.null_node_hit += pcpu->null_node_hit;
+ s.resize_node_skipped += pcpu->resize_node_skipped;
+ }
+
seq_printf(seq, "\nCounters:\n---------\n");
- seq_printf(seq, "gets = %u\n", stats->gets);
- seq_printf(seq, "backtracks = %u\n", stats->backtrack);
+ seq_printf(seq, "gets = %u\n", s.gets);
+ seq_printf(seq, "backtracks = %u\n", s.backtrack);
seq_printf(seq, "semantic match passed = %u\n",
- stats->semantic_match_passed);
- seq_printf(seq, "semantic match miss = %u\n",
- stats->semantic_match_miss);
- seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
- seq_printf(seq, "skipped node resize = %u\n\n",
- stats->resize_node_skipped);
+ s.semantic_match_passed);
+ seq_printf(seq, "semantic match miss = %u\n", s.semantic_match_miss);
+ seq_printf(seq, "null node hit= %u\n", s.null_node_hit);
+ seq_printf(seq, "skipped node resize = %u\n\n", s.resize_node_skipped);
}
#endif /* CONFIG_IP_FIB_TRIE_STATS */
@@ -2191,7 +2219,7 @@ static int fib_triestat_seq_show(struct
trie_collect_stats(t, &stat);
trie_show_stats(seq, &stat);
#ifdef CONFIG_IP_FIB_TRIE_STATS
- trie_show_usage(seq, &t->stats);
+ trie_show_usage(seq, t->stats);
#endif
}
}

View file

@ -0,0 +1,421 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:55:35 -0800
Subject: [PATCH] fib_trie: Make leaf and tnode more uniform
This change makes some fundamental changes to the way leaves and tnodes are
constructed. The big differences are:
1. Leaves now populate pos and bits indicating their full key size.
2. Trie nodes now mask out their lower bits to be consistent with the leaf
3. Both structures have been reordered so that rt_trie_node now consisists
of a much larger region including the pos, bits, and rcu portions of
the tnode structure.
On 32b systems this will result in the leaf being 4B larger as the pos and
bits values were added to a hole created by the key as it was only 4B in
length.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -87,24 +87,38 @@
typedef unsigned int t_key;
-#define T_TNODE 0
-#define T_LEAF 1
-#define NODE_TYPE_MASK 0x1UL
-#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
+#define IS_TNODE(n) ((n)->bits)
+#define IS_LEAF(n) (!(n)->bits)
-#define IS_TNODE(n) (!(n->parent & T_LEAF))
-#define IS_LEAF(n) (n->parent & T_LEAF)
+struct tnode {
+ t_key key;
+ unsigned char bits; /* 2log(KEYLENGTH) bits needed */
+ unsigned char pos; /* 2log(KEYLENGTH) bits needed */
+ struct tnode __rcu *parent;
+ union {
+ struct rcu_head rcu;
+ struct tnode *tnode_free;
+ };
+ unsigned int full_children; /* KEYLENGTH bits needed */
+ unsigned int empty_children; /* KEYLENGTH bits needed */
+ struct rt_trie_node __rcu *child[0];
+};
struct rt_trie_node {
- unsigned long parent;
t_key key;
+ unsigned char bits;
+ unsigned char pos;
+ struct tnode __rcu *parent;
+ struct rcu_head rcu;
};
struct leaf {
- unsigned long parent;
t_key key;
- struct hlist_head list;
+ unsigned char bits;
+ unsigned char pos;
+ struct tnode __rcu *parent;
struct rcu_head rcu;
+ struct hlist_head list;
};
struct leaf_info {
@@ -115,20 +129,6 @@ struct leaf_info {
struct rcu_head rcu;
};
-struct tnode {
- unsigned long parent;
- t_key key;
- unsigned char pos; /* 2log(KEYLENGTH) bits needed */
- unsigned char bits; /* 2log(KEYLENGTH) bits needed */
- unsigned int full_children; /* KEYLENGTH bits needed */
- unsigned int empty_children; /* KEYLENGTH bits needed */
- union {
- struct rcu_head rcu;
- struct tnode *tnode_free;
- };
- struct rt_trie_node __rcu *child[0];
-};
-
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats {
unsigned int gets;
@@ -176,38 +176,27 @@ static const int sync_pages = 128;
static struct kmem_cache *fn_alias_kmem __read_mostly;
static struct kmem_cache *trie_leaf_kmem __read_mostly;
-/*
- * caller must hold RTNL
- */
-static inline struct tnode *node_parent(const struct rt_trie_node *node)
-{
- unsigned long parent;
+/* caller must hold RTNL */
+#define node_parent(n) rtnl_dereference((n)->parent)
- parent = rcu_dereference_index_check(node->parent, lockdep_rtnl_is_held());
+/* caller must hold RCU read lock or RTNL */
+#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
- return (struct tnode *)(parent & ~NODE_TYPE_MASK);
-}
-
-/*
- * caller must hold RCU read lock or RTNL
- */
-static inline struct tnode *node_parent_rcu(const struct rt_trie_node *node)
+/* wrapper for rcu_assign_pointer */
+static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
{
- unsigned long parent;
-
- parent = rcu_dereference_index_check(node->parent, rcu_read_lock_held() ||
- lockdep_rtnl_is_held());
-
- return (struct tnode *)(parent & ~NODE_TYPE_MASK);
+ if (node)
+ rcu_assign_pointer(node->parent, ptr);
}
-/* Same as rcu_assign_pointer
- * but that macro() assumes that value is a pointer.
+#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
+
+/* This provides us with the number of children in this node, in the case of a
+ * leaf this will return 0 meaning none of the children are accessible.
*/
-static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
+static inline int tnode_child_length(const struct tnode *tn)
{
- smp_wmb();
- node->parent = (unsigned long)ptr | NODE_TYPE(node);
+ return (1ul << tn->bits) & ~(1ul);
}
/*
@@ -215,7 +204,7 @@ static inline void node_set_parent(struc
*/
static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
{
- BUG_ON(i >= 1U << tn->bits);
+ BUG_ON(i >= tnode_child_length(tn));
return rtnl_dereference(tn->child[i]);
}
@@ -225,16 +214,11 @@ static inline struct rt_trie_node *tnode
*/
static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
{
- BUG_ON(i >= 1U << tn->bits);
+ BUG_ON(i >= tnode_child_length(tn));
return rcu_dereference_rtnl(tn->child[i]);
}
-static inline int tnode_child_length(const struct tnode *tn)
-{
- return 1 << tn->bits;
-}
-
static inline t_key mask_pfx(t_key k, unsigned int l)
{
return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
@@ -336,11 +320,6 @@ static inline int tkey_mismatch(t_key a,
*/
-static inline void check_tnode(const struct tnode *tn)
-{
- WARN_ON(tn && tn->pos+tn->bits > 32);
-}
-
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
static const int halve_threshold_root = 15;
@@ -426,11 +405,20 @@ static void tnode_free_flush(void)
}
}
-static struct leaf *leaf_new(void)
+static struct leaf *leaf_new(t_key key)
{
struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
if (l) {
- l->parent = T_LEAF;
+ l->parent = NULL;
+ /* set key and pos to reflect full key value
+ * any trailing zeros in the key should be ignored
+ * as the nodes are searched
+ */
+ l->key = key;
+ l->pos = KEYLENGTH;
+ /* set bits to 0 indicating we are not a tnode */
+ l->bits = 0;
+
INIT_HLIST_HEAD(&l->list);
}
return l;
@@ -451,12 +439,16 @@ static struct tnode *tnode_new(t_key key
{
size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
struct tnode *tn = tnode_alloc(sz);
+ unsigned int shift = pos + bits;
+
+ /* verify bits and pos their msb bits clear and values are valid */
+ BUG_ON(!bits || (shift > KEYLENGTH));
if (tn) {
- tn->parent = T_TNODE;
+ tn->parent = NULL;
tn->pos = pos;
tn->bits = bits;
- tn->key = key;
+ tn->key = mask_pfx(key, pos);
tn->full_children = 0;
tn->empty_children = 1<<bits;
}
@@ -473,10 +465,7 @@ static struct tnode *tnode_new(t_key key
static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
{
- if (n == NULL || IS_LEAF(n))
- return 0;
-
- return ((struct tnode *) n)->pos == tn->pos + tn->bits;
+ return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits));
}
static inline void put_child(struct tnode *tn, int i,
@@ -514,8 +503,7 @@ static void tnode_put_child_reorg(struct
else if (!wasfull && isfull)
tn->full_children++;
- if (n)
- node_set_parent(n, tn);
+ node_set_parent(n, tn);
rcu_assign_pointer(tn->child[i], n);
}
@@ -523,7 +511,7 @@ static void tnode_put_child_reorg(struct
#define MAX_WORK 10
static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
{
- int i;
+ struct rt_trie_node *n = NULL;
struct tnode *old_tn;
int inflate_threshold_use;
int halve_threshold_use;
@@ -536,12 +524,11 @@ static struct rt_trie_node *resize(struc
tn, inflate_threshold, halve_threshold);
/* No children */
- if (tn->empty_children == tnode_child_length(tn)) {
- tnode_free_safe(tn);
- return NULL;
- }
+ if (tn->empty_children > (tnode_child_length(tn) - 1))
+ goto no_children;
+
/* One child */
- if (tn->empty_children == tnode_child_length(tn) - 1)
+ if (tn->empty_children == (tnode_child_length(tn) - 1))
goto one_child;
/*
* Double as long as the resulting node has a number of
@@ -607,11 +594,9 @@ static struct rt_trie_node *resize(struc
*
*/
- check_tnode(tn);
-
/* Keep root node larger */
- if (!node_parent((struct rt_trie_node *)tn)) {
+ if (!node_parent(tn)) {
inflate_threshold_use = inflate_threshold_root;
halve_threshold_use = halve_threshold_root;
} else {
@@ -637,8 +622,6 @@ static struct rt_trie_node *resize(struc
}
}
- check_tnode(tn);
-
/* Return if at least one inflate is run */
if (max_work != MAX_WORK)
return (struct rt_trie_node *) tn;
@@ -666,21 +649,16 @@ static struct rt_trie_node *resize(struc
/* Only one child remains */
- if (tn->empty_children == tnode_child_length(tn) - 1) {
+ if (tn->empty_children == (tnode_child_length(tn) - 1)) {
+ unsigned long i;
one_child:
- for (i = 0; i < tnode_child_length(tn); i++) {
- struct rt_trie_node *n;
-
- n = rtnl_dereference(tn->child[i]);
- if (!n)
- continue;
-
- /* compress one level */
-
- node_set_parent(n, NULL);
- tnode_free_safe(tn);
- return n;
- }
+ for (i = tnode_child_length(tn); !n && i;)
+ n = tnode_get_child(tn, --i);
+no_children:
+ /* compress one level */
+ node_set_parent(n, NULL);
+ tnode_free_safe(tn);
+ return n;
}
return (struct rt_trie_node *) tn;
}
@@ -760,8 +738,7 @@ static struct tnode *inflate(struct trie
/* A leaf or an internal node with skipped bits */
- if (IS_LEAF(node) || ((struct tnode *) node)->pos >
- tn->pos + tn->bits - 1) {
+ if (IS_LEAF(node) || (node->pos > (tn->pos + tn->bits - 1))) {
put_child(tn,
tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1),
node);
@@ -958,11 +935,9 @@ fib_find_node(struct trie *t, u32 key)
pos = 0;
n = rcu_dereference_rtnl(t->trie);
- while (n != NULL && NODE_TYPE(n) == T_TNODE) {
+ while (n && IS_TNODE(n)) {
tn = (struct tnode *) n;
- check_tnode(tn);
-
if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
pos = tn->pos + tn->bits;
n = tnode_get_child_rcu(tn,
@@ -988,7 +963,7 @@ static void trie_rebalance(struct trie *
key = tn->key;
- while (tn != NULL && (tp = node_parent((struct rt_trie_node *)tn)) != NULL) {
+ while (tn != NULL && (tp = node_parent(tn)) != NULL) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
tn = (struct tnode *)resize(t, tn);
@@ -996,7 +971,7 @@ static void trie_rebalance(struct trie *
tnode_put_child_reorg(tp, cindex,
(struct rt_trie_node *)tn, wasfull);
- tp = node_parent((struct rt_trie_node *) tn);
+ tp = node_parent(tn);
if (!tp)
rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
@@ -1048,11 +1023,9 @@ static struct list_head *fib_insert_node
* If it doesn't, we need to replace it with a T_TNODE.
*/
- while (n != NULL && NODE_TYPE(n) == T_TNODE) {
+ while (n && IS_TNODE(n)) {
tn = (struct tnode *) n;
- check_tnode(tn);
-
if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
tp = tn;
pos = tn->pos + tn->bits;
@@ -1087,12 +1060,11 @@ static struct list_head *fib_insert_node
insert_leaf_info(&l->list, li);
goto done;
}
- l = leaf_new();
+ l = leaf_new(key);
if (!l)
return NULL;
- l->key = key;
li = leaf_info_new(plen);
if (!li) {
@@ -1569,7 +1541,7 @@ backtrace:
if (chopped_off <= pn->bits) {
cindex &= ~(1 << (chopped_off-1));
} else {
- struct tnode *parent = node_parent_rcu((struct rt_trie_node *) pn);
+ struct tnode *parent = node_parent_rcu(pn);
if (!parent)
goto failed;
@@ -1597,7 +1569,7 @@ EXPORT_SYMBOL_GPL(fib_table_lookup);
*/
static void trie_leaf_remove(struct trie *t, struct leaf *l)
{
- struct tnode *tp = node_parent((struct rt_trie_node *) l);
+ struct tnode *tp = node_parent(l);
pr_debug("entering trie_leaf_remove(%p)\n", l);
@@ -2375,7 +2347,7 @@ static int fib_trie_seq_show(struct seq_
if (IS_TNODE(n)) {
struct tnode *tn = (struct tnode *) n;
- __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
+ __be32 prf = htonl(tn->key);
seq_indent(seq, iter->depth-1);
seq_printf(seq, " +-- %pI4/%d %d %d %d\n",

View file

@ -0,0 +1,209 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:55:41 -0800
Subject: [PATCH] fib_trie: Merge tnode_free and leaf_free into node_free
Both the leaf and the tnode had an rcu_head in them, but they had them in
slightly different places. Since we now have them in the same spot and
know that any node with bits == 0 is a leaf and the rest are either vmalloc
or kmalloc tnodes depending on the value of bits it makes it easy to combine
the functions and reduce overhead.
In addition I have taken advantage of the rcu_head pointer to go ahead and
put together a simple linked list instead of using the tnode pointer as
this way we can merge either type of structure for freeing.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -95,15 +95,17 @@ struct tnode {
unsigned char bits; /* 2log(KEYLENGTH) bits needed */
unsigned char pos; /* 2log(KEYLENGTH) bits needed */
struct tnode __rcu *parent;
- union {
- struct rcu_head rcu;
- struct tnode *tnode_free;
- };
+ struct rcu_head rcu;
+ /* everything above this comment must be the same as rt_trie_node */
unsigned int full_children; /* KEYLENGTH bits needed */
unsigned int empty_children; /* KEYLENGTH bits needed */
struct rt_trie_node __rcu *child[0];
};
+/* This struct represents the shared bits between tnode and leaf. If any
+ * ordering is changed here is must also be updated in tnode and leaf as
+ * well.
+ */
struct rt_trie_node {
t_key key;
unsigned char bits;
@@ -118,6 +120,7 @@ struct leaf {
unsigned char pos;
struct tnode __rcu *parent;
struct rcu_head rcu;
+ /* everything above this comment must be the same as rt_trie_node */
struct hlist_head list;
};
@@ -163,7 +166,7 @@ static struct rt_trie_node *resize(struc
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
/* tnodes to free after resize(); protected by RTNL */
-static struct tnode *tnode_free_head;
+static struct callback_head *tnode_free_head;
static size_t tnode_free_size;
/*
@@ -336,17 +339,23 @@ static inline void alias_free_mem_rcu(st
call_rcu(&fa->rcu, __alias_free_mem);
}
-static void __leaf_free_rcu(struct rcu_head *head)
-{
- struct leaf *l = container_of(head, struct leaf, rcu);
- kmem_cache_free(trie_leaf_kmem, l);
-}
+#define TNODE_KMALLOC_MAX \
+ ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct rt_trie_node *))
-static inline void free_leaf(struct leaf *l)
+static void __node_free_rcu(struct rcu_head *head)
{
- call_rcu(&l->rcu, __leaf_free_rcu);
+ struct rt_trie_node *n = container_of(head, struct rt_trie_node, rcu);
+
+ if (IS_LEAF(n))
+ kmem_cache_free(trie_leaf_kmem, n);
+ else if (n->bits <= TNODE_KMALLOC_MAX)
+ kfree(n);
+ else
+ vfree(n);
}
+#define node_free(n) call_rcu(&n->rcu, __node_free_rcu)
+
static inline void free_leaf_info(struct leaf_info *leaf)
{
kfree_rcu(leaf, rcu);
@@ -360,43 +369,24 @@ static struct tnode *tnode_alloc(size_t
return vzalloc(size);
}
-static void __tnode_free_rcu(struct rcu_head *head)
-{
- struct tnode *tn = container_of(head, struct tnode, rcu);
- size_t size = sizeof(struct tnode) +
- (sizeof(struct rt_trie_node *) << tn->bits);
-
- if (size <= PAGE_SIZE)
- kfree(tn);
- else
- vfree(tn);
-}
-
-static inline void tnode_free(struct tnode *tn)
-{
- if (IS_LEAF(tn))
- free_leaf((struct leaf *) tn);
- else
- call_rcu(&tn->rcu, __tnode_free_rcu);
-}
-
static void tnode_free_safe(struct tnode *tn)
{
BUG_ON(IS_LEAF(tn));
- tn->tnode_free = tnode_free_head;
- tnode_free_head = tn;
- tnode_free_size += sizeof(struct tnode) +
- (sizeof(struct rt_trie_node *) << tn->bits);
+ tn->rcu.next = tnode_free_head;
+ tnode_free_head = &tn->rcu;
}
static void tnode_free_flush(void)
{
- struct tnode *tn;
+ struct callback_head *head;
+
+ while ((head = tnode_free_head)) {
+ struct tnode *tn = container_of(head, struct tnode, rcu);
+
+ tnode_free_head = head->next;
+ tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
- while ((tn = tnode_free_head)) {
- tnode_free_head = tn->tnode_free;
- tn->tnode_free = NULL;
- tnode_free(tn);
+ node_free(tn);
}
if (tnode_free_size >= PAGE_SIZE * sync_pages) {
@@ -437,7 +427,7 @@ static struct leaf_info *leaf_info_new(i
static struct tnode *tnode_new(t_key key, int pos, int bits)
{
- size_t sz = sizeof(struct tnode) + (sizeof(struct rt_trie_node *) << bits);
+ size_t sz = offsetof(struct tnode, child[1 << bits]);
struct tnode *tn = tnode_alloc(sz);
unsigned int shift = pos + bits;
@@ -666,15 +656,15 @@ no_children:
static void tnode_clean_free(struct tnode *tn)
{
+ struct rt_trie_node *tofree;
int i;
- struct tnode *tofree;
for (i = 0; i < tnode_child_length(tn); i++) {
- tofree = (struct tnode *)rtnl_dereference(tn->child[i]);
+ tofree = rtnl_dereference(tn->child[i]);
if (tofree)
- tnode_free(tofree);
+ node_free(tofree);
}
- tnode_free(tn);
+ node_free(tn);
}
static struct tnode *inflate(struct trie *t, struct tnode *tn)
@@ -717,7 +707,7 @@ static struct tnode *inflate(struct trie
inode->bits - 1);
if (!right) {
- tnode_free(left);
+ node_free(left);
goto nomem;
}
@@ -1068,7 +1058,7 @@ static struct list_head *fib_insert_node
li = leaf_info_new(plen);
if (!li) {
- free_leaf(l);
+ node_free(l);
return NULL;
}
@@ -1100,7 +1090,7 @@ static struct list_head *fib_insert_node
if (!tn) {
free_leaf_info(li);
- free_leaf(l);
+ node_free(l);
return NULL;
}
@@ -1580,7 +1570,7 @@ static void trie_leaf_remove(struct trie
} else
RCU_INIT_POINTER(t->trie, NULL);
- free_leaf(l);
+ node_free(l);
}
/*

View file

@ -0,0 +1,928 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:55:47 -0800
Subject: [PATCH] fib_trie: Merge leaf into tnode
This change makes it so that leaf and tnode are the same struct. As a
result there is no need for rt_trie_node anymore since everyting can be
merged into tnode.
On 32b systems this results in the leaf being 4 bytes larger, however I
don't know if that is really an issue as this and an eariler patch that
added bits & pos have increased the size from 20 to 28. If I am not
mistaken slub/slab allocate on power of 2 sizes so 20 was likely being
rounded up to 32 anyway.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -96,32 +96,16 @@ struct tnode {
unsigned char pos; /* 2log(KEYLENGTH) bits needed */
struct tnode __rcu *parent;
struct rcu_head rcu;
- /* everything above this comment must be the same as rt_trie_node */
- unsigned int full_children; /* KEYLENGTH bits needed */
- unsigned int empty_children; /* KEYLENGTH bits needed */
- struct rt_trie_node __rcu *child[0];
-};
-
-/* This struct represents the shared bits between tnode and leaf. If any
- * ordering is changed here is must also be updated in tnode and leaf as
- * well.
- */
-struct rt_trie_node {
- t_key key;
- unsigned char bits;
- unsigned char pos;
- struct tnode __rcu *parent;
- struct rcu_head rcu;
-};
-
-struct leaf {
- t_key key;
- unsigned char bits;
- unsigned char pos;
- struct tnode __rcu *parent;
- struct rcu_head rcu;
- /* everything above this comment must be the same as rt_trie_node */
- struct hlist_head list;
+ union {
+ /* The fields in this struct are valid if bits > 0 (TNODE) */
+ struct {
+ unsigned int full_children; /* KEYLENGTH bits needed */
+ unsigned int empty_children; /* KEYLENGTH bits needed */
+ struct tnode __rcu *child[0];
+ };
+ /* This list pointer if valid if bits == 0 (LEAF) */
+ struct hlist_head list;
+ };
};
struct leaf_info {
@@ -154,15 +138,15 @@ struct trie_stat {
};
struct trie {
- struct rt_trie_node __rcu *trie;
+ struct tnode __rcu *trie;
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats __percpu *stats;
#endif
};
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
int wasfull);
-static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
+static struct tnode *resize(struct trie *t, struct tnode *tn);
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
/* tnodes to free after resize(); protected by RTNL */
@@ -186,10 +170,10 @@ static struct kmem_cache *trie_leaf_kmem
#define node_parent_rcu(n) rcu_dereference_rtnl((n)->parent)
/* wrapper for rcu_assign_pointer */
-static inline void node_set_parent(struct rt_trie_node *node, struct tnode *ptr)
+static inline void node_set_parent(struct tnode *n, struct tnode *tp)
{
- if (node)
- rcu_assign_pointer(node->parent, ptr);
+ if (n)
+ rcu_assign_pointer(n->parent, tp);
}
#define NODE_INIT_PARENT(n, p) RCU_INIT_POINTER((n)->parent, p)
@@ -205,7 +189,7 @@ static inline int tnode_child_length(con
/*
* caller must hold RTNL
*/
-static inline struct rt_trie_node *tnode_get_child(const struct tnode *tn, unsigned int i)
+static inline struct tnode *tnode_get_child(const struct tnode *tn, unsigned int i)
{
BUG_ON(i >= tnode_child_length(tn));
@@ -215,7 +199,7 @@ static inline struct rt_trie_node *tnode
/*
* caller must hold RCU read lock or RTNL
*/
-static inline struct rt_trie_node *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
+static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
{
BUG_ON(i >= tnode_child_length(tn));
@@ -340,11 +324,11 @@ static inline void alias_free_mem_rcu(st
}
#define TNODE_KMALLOC_MAX \
- ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct rt_trie_node *))
+ ilog2((PAGE_SIZE - sizeof(struct tnode)) / sizeof(struct tnode *))
static void __node_free_rcu(struct rcu_head *head)
{
- struct rt_trie_node *n = container_of(head, struct rt_trie_node, rcu);
+ struct tnode *n = container_of(head, struct tnode, rcu);
if (IS_LEAF(n))
kmem_cache_free(trie_leaf_kmem, n);
@@ -395,9 +379,9 @@ static void tnode_free_flush(void)
}
}
-static struct leaf *leaf_new(t_key key)
+static struct tnode *leaf_new(t_key key)
{
- struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
+ struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
if (l) {
l->parent = NULL;
/* set key and pos to reflect full key value
@@ -444,7 +428,7 @@ static struct tnode *tnode_new(t_key key
}
pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
- sizeof(struct rt_trie_node *) << bits);
+ sizeof(struct tnode *) << bits);
return tn;
}
@@ -453,13 +437,13 @@ static struct tnode *tnode_new(t_key key
* and no bits are skipped. See discussion in dyntree paper p. 6
*/
-static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *n)
+static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
{
return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits));
}
static inline void put_child(struct tnode *tn, int i,
- struct rt_trie_node *n)
+ struct tnode *n)
{
tnode_put_child_reorg(tn, i, n, -1);
}
@@ -469,10 +453,10 @@ static inline void put_child(struct tnod
* Update the value of full_children and empty_children.
*/
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
+static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
int wasfull)
{
- struct rt_trie_node *chi = rtnl_dereference(tn->child[i]);
+ struct tnode *chi = rtnl_dereference(tn->child[i]);
int isfull;
BUG_ON(i >= 1<<tn->bits);
@@ -499,10 +483,9 @@ static void tnode_put_child_reorg(struct
}
#define MAX_WORK 10
-static struct rt_trie_node *resize(struct trie *t, struct tnode *tn)
+static struct tnode *resize(struct trie *t, struct tnode *tn)
{
- struct rt_trie_node *n = NULL;
- struct tnode *old_tn;
+ struct tnode *old_tn, *n = NULL;
int inflate_threshold_use;
int halve_threshold_use;
int max_work;
@@ -614,7 +597,7 @@ static struct rt_trie_node *resize(struc
/* Return if at least one inflate is run */
if (max_work != MAX_WORK)
- return (struct rt_trie_node *) tn;
+ return tn;
/*
* Halve as long as the number of empty children in this
@@ -650,13 +633,13 @@ no_children:
tnode_free_safe(tn);
return n;
}
- return (struct rt_trie_node *) tn;
+ return tn;
}
static void tnode_clean_free(struct tnode *tn)
{
- struct rt_trie_node *tofree;
+ struct tnode *tofree;
int i;
for (i = 0; i < tnode_child_length(tn); i++) {
@@ -667,10 +650,10 @@ static void tnode_clean_free(struct tnod
node_free(tn);
}
-static struct tnode *inflate(struct trie *t, struct tnode *tn)
+static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
{
- struct tnode *oldtnode = tn;
- int olen = tnode_child_length(tn);
+ int olen = tnode_child_length(oldtnode);
+ struct tnode *tn;
int i;
pr_debug("In inflate\n");
@@ -690,11 +673,8 @@ static struct tnode *inflate(struct trie
for (i = 0; i < olen; i++) {
struct tnode *inode;
- inode = (struct tnode *) tnode_get_child(oldtnode, i);
- if (inode &&
- IS_TNODE(inode) &&
- inode->pos == oldtnode->pos + oldtnode->bits &&
- inode->bits > 1) {
+ inode = tnode_get_child(oldtnode, i);
+ if (tnode_full(oldtnode, inode) && inode->bits > 1) {
struct tnode *left, *right;
t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
@@ -711,33 +691,29 @@ static struct tnode *inflate(struct trie
goto nomem;
}
- put_child(tn, 2*i, (struct rt_trie_node *) left);
- put_child(tn, 2*i+1, (struct rt_trie_node *) right);
+ put_child(tn, 2*i, left);
+ put_child(tn, 2*i+1, right);
}
}
for (i = 0; i < olen; i++) {
- struct tnode *inode;
- struct rt_trie_node *node = tnode_get_child(oldtnode, i);
+ struct tnode *inode = tnode_get_child(oldtnode, i);
struct tnode *left, *right;
int size, j;
/* An empty child */
- if (node == NULL)
+ if (inode == NULL)
continue;
/* A leaf or an internal node with skipped bits */
-
- if (IS_LEAF(node) || (node->pos > (tn->pos + tn->bits - 1))) {
+ if (!tnode_full(oldtnode, inode)) {
put_child(tn,
- tkey_extract_bits(node->key, oldtnode->pos, oldtnode->bits + 1),
- node);
+ tkey_extract_bits(inode->key, tn->pos, tn->bits),
+ inode);
continue;
}
/* An internal node with two children */
- inode = (struct tnode *) node;
-
if (inode->bits == 1) {
put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
@@ -769,12 +745,12 @@ static struct tnode *inflate(struct trie
* bit to zero.
*/
- left = (struct tnode *) tnode_get_child(tn, 2*i);
+ left = tnode_get_child(tn, 2*i);
put_child(tn, 2*i, NULL);
BUG_ON(!left);
- right = (struct tnode *) tnode_get_child(tn, 2*i+1);
+ right = tnode_get_child(tn, 2*i+1);
put_child(tn, 2*i+1, NULL);
BUG_ON(!right);
@@ -796,12 +772,11 @@ nomem:
return ERR_PTR(-ENOMEM);
}
-static struct tnode *halve(struct trie *t, struct tnode *tn)
+static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
{
- struct tnode *oldtnode = tn;
- struct rt_trie_node *left, *right;
+ int olen = tnode_child_length(oldtnode);
+ struct tnode *tn, *left, *right;
int i;
- int olen = tnode_child_length(tn);
pr_debug("In halve\n");
@@ -830,7 +805,7 @@ static struct tnode *halve(struct trie *
if (!newn)
goto nomem;
- put_child(tn, i/2, (struct rt_trie_node *)newn);
+ put_child(tn, i/2, newn);
}
}
@@ -855,7 +830,7 @@ static struct tnode *halve(struct trie *
}
/* Two nonempty children */
- newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
+ newBinNode = tnode_get_child(tn, i/2);
put_child(tn, i/2, NULL);
put_child(newBinNode, 0, left);
put_child(newBinNode, 1, right);
@@ -871,7 +846,7 @@ nomem:
/* readside must use rcu_read_lock currently dump routines
via get_fa_head and dump */
-static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
+static struct leaf_info *find_leaf_info(struct tnode *l, int plen)
{
struct hlist_head *head = &l->list;
struct leaf_info *li;
@@ -883,7 +858,7 @@ static struct leaf_info *find_leaf_info(
return NULL;
}
-static inline struct list_head *get_fa_head(struct leaf *l, int plen)
+static inline struct list_head *get_fa_head(struct tnode *l, int plen)
{
struct leaf_info *li = find_leaf_info(l, plen);
@@ -915,32 +890,25 @@ static void insert_leaf_info(struct hlis
/* rcu_read_lock needs to be hold by caller from readside */
-static struct leaf *
-fib_find_node(struct trie *t, u32 key)
+static struct tnode *fib_find_node(struct trie *t, u32 key)
{
- int pos;
- struct tnode *tn;
- struct rt_trie_node *n;
-
- pos = 0;
- n = rcu_dereference_rtnl(t->trie);
+ struct tnode *n = rcu_dereference_rtnl(t->trie);
+ int pos = 0;
while (n && IS_TNODE(n)) {
- tn = (struct tnode *) n;
-
- if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
- pos = tn->pos + tn->bits;
- n = tnode_get_child_rcu(tn,
+ if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
+ pos = n->pos + n->bits;
+ n = tnode_get_child_rcu(n,
tkey_extract_bits(key,
- tn->pos,
- tn->bits));
+ n->pos,
+ n->bits));
} else
break;
}
/* Case we have found a leaf. Compare prefixes */
if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
- return (struct leaf *)n;
+ return n;
return NULL;
}
@@ -956,14 +924,13 @@ static void trie_rebalance(struct trie *
while (tn != NULL && (tp = node_parent(tn)) != NULL) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
- tn = (struct tnode *)resize(t, tn);
+ tn = resize(t, tn);
- tnode_put_child_reorg(tp, cindex,
- (struct rt_trie_node *)tn, wasfull);
+ tnode_put_child_reorg(tp, cindex, tn, wasfull);
tp = node_parent(tn);
if (!tp)
- rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
+ rcu_assign_pointer(t->trie, tn);
tnode_free_flush();
if (!tp)
@@ -973,9 +940,9 @@ static void trie_rebalance(struct trie *
/* Handle last (top) tnode */
if (IS_TNODE(tn))
- tn = (struct tnode *)resize(t, tn);
+ tn = resize(t, tn);
- rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
+ rcu_assign_pointer(t->trie, tn);
tnode_free_flush();
}
@@ -985,8 +952,8 @@ static struct list_head *fib_insert_node
{
int pos, newpos;
struct tnode *tp = NULL, *tn = NULL;
- struct rt_trie_node *n;
- struct leaf *l;
+ struct tnode *n;
+ struct tnode *l;
int missbit;
struct list_head *fa_head = NULL;
struct leaf_info *li;
@@ -1014,17 +981,15 @@ static struct list_head *fib_insert_node
*/
while (n && IS_TNODE(n)) {
- tn = (struct tnode *) n;
-
- if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
- tp = tn;
- pos = tn->pos + tn->bits;
- n = tnode_get_child(tn,
+ if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
+ tp = n;
+ pos = n->pos + n->bits;
+ n = tnode_get_child(n,
tkey_extract_bits(key,
- tn->pos,
- tn->bits));
+ n->pos,
+ n->bits));
- BUG_ON(n && node_parent(n) != tn);
+ BUG_ON(n && node_parent(n) != tp);
} else
break;
}
@@ -1040,14 +1005,13 @@ static struct list_head *fib_insert_node
/* Case 1: n is a leaf. Compare prefixes */
if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
- l = (struct leaf *) n;
li = leaf_info_new(plen);
if (!li)
return NULL;
fa_head = &li->falh;
- insert_leaf_info(&l->list, li);
+ insert_leaf_info(&n->list, li);
goto done;
}
l = leaf_new(key);
@@ -1068,10 +1032,10 @@ static struct list_head *fib_insert_node
if (t->trie && n == NULL) {
/* Case 2: n is NULL, and will just insert a new leaf */
- node_set_parent((struct rt_trie_node *)l, tp);
+ node_set_parent(l, tp);
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(tp, cindex, (struct rt_trie_node *)l);
+ put_child(tp, cindex, l);
} else {
/* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
/*
@@ -1094,17 +1058,17 @@ static struct list_head *fib_insert_node
return NULL;
}
- node_set_parent((struct rt_trie_node *)tn, tp);
+ node_set_parent(tn, tp);
missbit = tkey_extract_bits(key, newpos, 1);
- put_child(tn, missbit, (struct rt_trie_node *)l);
+ put_child(tn, missbit, l);
put_child(tn, 1-missbit, n);
if (tp) {
cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(tp, cindex, (struct rt_trie_node *)tn);
+ put_child(tp, cindex, tn);
} else {
- rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
+ rcu_assign_pointer(t->trie, tn);
}
tp = tn;
@@ -1134,7 +1098,7 @@ int fib_table_insert(struct fib_table *t
u8 tos = cfg->fc_tos;
u32 key, mask;
int err;
- struct leaf *l;
+ struct tnode *l;
if (plen > 32)
return -EINVAL;
@@ -1292,7 +1256,7 @@ err:
}
/* should be called with rcu_read_lock */
-static int check_leaf(struct fib_table *tb, struct trie *t, struct leaf *l,
+static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
t_key key, const struct flowi4 *flp,
struct fib_result *res, int fib_flags)
{
@@ -1365,7 +1329,7 @@ int fib_table_lookup(struct fib_table *t
struct trie_use_stats __percpu *stats = t->stats;
#endif
int ret;
- struct rt_trie_node *n;
+ struct tnode *n;
struct tnode *pn;
unsigned int pos, bits;
t_key key = ntohl(flp->daddr);
@@ -1387,11 +1351,11 @@ int fib_table_lookup(struct fib_table *t
/* Just a leaf? */
if (IS_LEAF(n)) {
- ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
+ ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
goto found;
}
- pn = (struct tnode *) n;
+ pn = n;
chopped_off = 0;
while (pn) {
@@ -1412,13 +1376,13 @@ int fib_table_lookup(struct fib_table *t
}
if (IS_LEAF(n)) {
- ret = check_leaf(tb, t, (struct leaf *)n, key, flp, res, fib_flags);
+ ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
if (ret > 0)
goto backtrace;
goto found;
}
- cn = (struct tnode *)n;
+ cn = n;
/*
* It's a tnode, and we can do some extra checks here if we
@@ -1506,7 +1470,7 @@ int fib_table_lookup(struct fib_table *t
current_prefix_length = mp;
}
- pn = (struct tnode *)n; /* Descend */
+ pn = n; /* Descend */
chopped_off = 0;
continue;
@@ -1557,7 +1521,7 @@ EXPORT_SYMBOL_GPL(fib_table_lookup);
/*
* Remove the leaf and return parent.
*/
-static void trie_leaf_remove(struct trie *t, struct leaf *l)
+static void trie_leaf_remove(struct trie *t, struct tnode *l)
{
struct tnode *tp = node_parent(l);
@@ -1584,7 +1548,7 @@ int fib_table_delete(struct fib_table *t
u8 tos = cfg->fc_tos;
struct fib_alias *fa, *fa_to_delete;
struct list_head *fa_head;
- struct leaf *l;
+ struct tnode *l;
struct leaf_info *li;
if (plen > 32)
@@ -1682,7 +1646,7 @@ static int trie_flush_list(struct list_h
return found;
}
-static int trie_flush_leaf(struct leaf *l)
+static int trie_flush_leaf(struct tnode *l)
{
int found = 0;
struct hlist_head *lih = &l->list;
@@ -1704,7 +1668,7 @@ static int trie_flush_leaf(struct leaf *
* Scan for the next right leaf starting at node p->child[idx]
* Since we have back pointer, no recursion necessary.
*/
-static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c)
+static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
{
do {
t_key idx;
@@ -1720,47 +1684,46 @@ static struct leaf *leaf_walk_rcu(struct
continue;
if (IS_LEAF(c))
- return (struct leaf *) c;
+ return c;
/* Rescan start scanning in new node */
- p = (struct tnode *) c;
+ p = c;
idx = 0;
}
/* Node empty, walk back up to parent */
- c = (struct rt_trie_node *) p;
+ c = p;
} while ((p = node_parent_rcu(c)) != NULL);
return NULL; /* Root of trie */
}
-static struct leaf *trie_firstleaf(struct trie *t)
+static struct tnode *trie_firstleaf(struct trie *t)
{
- struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
+ struct tnode *n = rcu_dereference_rtnl(t->trie);
if (!n)
return NULL;
if (IS_LEAF(n)) /* trie is just a leaf */
- return (struct leaf *) n;
+ return n;
return leaf_walk_rcu(n, NULL);
}
-static struct leaf *trie_nextleaf(struct leaf *l)
+static struct tnode *trie_nextleaf(struct tnode *l)
{
- struct rt_trie_node *c = (struct rt_trie_node *) l;
- struct tnode *p = node_parent_rcu(c);
+ struct tnode *p = node_parent_rcu(l);
if (!p)
return NULL; /* trie with just one leaf */
- return leaf_walk_rcu(p, c);
+ return leaf_walk_rcu(p, l);
}
-static struct leaf *trie_leafindex(struct trie *t, int index)
+static struct tnode *trie_leafindex(struct trie *t, int index)
{
- struct leaf *l = trie_firstleaf(t);
+ struct tnode *l = trie_firstleaf(t);
while (l && index-- > 0)
l = trie_nextleaf(l);
@@ -1775,7 +1738,7 @@ static struct leaf *trie_leafindex(struc
int fib_table_flush(struct fib_table *tb)
{
struct trie *t = (struct trie *) tb->tb_data;
- struct leaf *l, *ll = NULL;
+ struct tnode *l, *ll = NULL;
int found = 0;
for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
@@ -1840,7 +1803,7 @@ static int fn_trie_dump_fa(t_key key, in
return skb->len;
}
-static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
+static int fn_trie_dump_leaf(struct tnode *l, struct fib_table *tb,
struct sk_buff *skb, struct netlink_callback *cb)
{
struct leaf_info *li;
@@ -1876,7 +1839,7 @@ static int fn_trie_dump_leaf(struct leaf
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
struct netlink_callback *cb)
{
- struct leaf *l;
+ struct tnode *l;
struct trie *t = (struct trie *) tb->tb_data;
t_key key = cb->args[2];
int count = cb->args[3];
@@ -1922,7 +1885,7 @@ void __init fib_trie_init(void)
0, SLAB_PANIC, NULL);
trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
- max(sizeof(struct leaf),
+ max(sizeof(struct tnode),
sizeof(struct leaf_info)),
0, SLAB_PANIC, NULL);
}
@@ -1965,7 +1928,7 @@ struct fib_trie_iter {
unsigned int depth;
};
-static struct rt_trie_node *fib_trie_get_next(struct fib_trie_iter *iter)
+static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
{
struct tnode *tn = iter->tnode;
unsigned int cindex = iter->index;
@@ -1979,7 +1942,7 @@ static struct rt_trie_node *fib_trie_get
iter->tnode, iter->index, iter->depth);
rescan:
while (cindex < (1<<tn->bits)) {
- struct rt_trie_node *n = tnode_get_child_rcu(tn, cindex);
+ struct tnode *n = tnode_get_child_rcu(tn, cindex);
if (n) {
if (IS_LEAF(n)) {
@@ -1987,7 +1950,7 @@ rescan:
iter->index = cindex + 1;
} else {
/* push down one level */
- iter->tnode = (struct tnode *) n;
+ iter->tnode = n;
iter->index = 0;
++iter->depth;
}
@@ -1998,7 +1961,7 @@ rescan:
}
/* Current node exhausted, pop back up */
- p = node_parent_rcu((struct rt_trie_node *)tn);
+ p = node_parent_rcu(tn);
if (p) {
cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
tn = p;
@@ -2010,10 +1973,10 @@ rescan:
return NULL;
}
-static struct rt_trie_node *fib_trie_get_first(struct fib_trie_iter *iter,
+static struct tnode *fib_trie_get_first(struct fib_trie_iter *iter,
struct trie *t)
{
- struct rt_trie_node *n;
+ struct tnode *n;
if (!t)
return NULL;
@@ -2023,7 +1986,7 @@ static struct rt_trie_node *fib_trie_get
return NULL;
if (IS_TNODE(n)) {
- iter->tnode = (struct tnode *) n;
+ iter->tnode = n;
iter->index = 0;
iter->depth = 1;
} else {
@@ -2037,7 +2000,7 @@ static struct rt_trie_node *fib_trie_get
static void trie_collect_stats(struct trie *t, struct trie_stat *s)
{
- struct rt_trie_node *n;
+ struct tnode *n;
struct fib_trie_iter iter;
memset(s, 0, sizeof(*s));
@@ -2045,7 +2008,6 @@ static void trie_collect_stats(struct tr
rcu_read_lock();
for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
if (IS_LEAF(n)) {
- struct leaf *l = (struct leaf *)n;
struct leaf_info *li;
s->leaves++;
@@ -2053,18 +2015,17 @@ static void trie_collect_stats(struct tr
if (iter.depth > s->maxdepth)
s->maxdepth = iter.depth;
- hlist_for_each_entry_rcu(li, &l->list, hlist)
+ hlist_for_each_entry_rcu(li, &n->list, hlist)
++s->prefixes;
} else {
- const struct tnode *tn = (const struct tnode *) n;
int i;
s->tnodes++;
- if (tn->bits < MAX_STAT_DEPTH)
- s->nodesizes[tn->bits]++;
+ if (n->bits < MAX_STAT_DEPTH)
+ s->nodesizes[n->bits]++;
- for (i = 0; i < (1<<tn->bits); i++)
- if (!tn->child[i])
+ for (i = 0; i < tnode_child_length(n); i++)
+ if (!rcu_access_pointer(n->child[i]))
s->nullpointers++;
}
}
@@ -2088,7 +2049,7 @@ static void trie_show_stats(struct seq_f
seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
- bytes = sizeof(struct leaf) * stat->leaves;
+ bytes = sizeof(struct tnode) * stat->leaves;
seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
bytes += sizeof(struct leaf_info) * stat->prefixes;
@@ -2109,7 +2070,7 @@ static void trie_show_stats(struct seq_f
seq_putc(seq, '\n');
seq_printf(seq, "\tPointers: %u\n", pointers);
- bytes += sizeof(struct rt_trie_node *) * pointers;
+ bytes += sizeof(struct tnode *) * pointers;
seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
}
@@ -2163,7 +2124,7 @@ static int fib_triestat_seq_show(struct
seq_printf(seq,
"Basic info: size of leaf:"
" %Zd bytes, size of tnode: %Zd bytes.\n",
- sizeof(struct leaf), sizeof(struct tnode));
+ sizeof(struct tnode), sizeof(struct tnode));
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
@@ -2202,7 +2163,7 @@ static const struct file_operations fib_
.release = single_release_net,
};
-static struct rt_trie_node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
+static struct tnode *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
{
struct fib_trie_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
@@ -2214,7 +2175,7 @@ static struct rt_trie_node *fib_trie_get
struct fib_table *tb;
hlist_for_each_entry_rcu(tb, head, tb_hlist) {
- struct rt_trie_node *n;
+ struct tnode *n;
for (n = fib_trie_get_first(iter,
(struct trie *) tb->tb_data);
@@ -2243,7 +2204,7 @@ static void *fib_trie_seq_next(struct se
struct fib_table *tb = iter->tb;
struct hlist_node *tb_node;
unsigned int h;
- struct rt_trie_node *n;
+ struct tnode *n;
++*pos;
/* next node in same table */
@@ -2330,29 +2291,26 @@ static inline const char *rtn_type(char
static int fib_trie_seq_show(struct seq_file *seq, void *v)
{
const struct fib_trie_iter *iter = seq->private;
- struct rt_trie_node *n = v;
+ struct tnode *n = v;
if (!node_parent_rcu(n))
fib_table_print(seq, iter->tb);
if (IS_TNODE(n)) {
- struct tnode *tn = (struct tnode *) n;
- __be32 prf = htonl(tn->key);
+ __be32 prf = htonl(n->key);
- seq_indent(seq, iter->depth-1);
+ seq_indent(seq, iter->depth - 1);
seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
- &prf, tn->pos, tn->bits, tn->full_children,
- tn->empty_children);
-
+ &prf, n->pos, n->bits, n->full_children,
+ n->empty_children);
} else {
- struct leaf *l = (struct leaf *) n;
struct leaf_info *li;
- __be32 val = htonl(l->key);
+ __be32 val = htonl(n->key);
seq_indent(seq, iter->depth);
seq_printf(seq, " |-- %pI4\n", &val);
- hlist_for_each_entry_rcu(li, &l->list, hlist) {
+ hlist_for_each_entry_rcu(li, &n->list, hlist) {
struct fib_alias *fa;
list_for_each_entry_rcu(fa, &li->falh, fa_list) {
@@ -2402,9 +2360,9 @@ struct fib_route_iter {
t_key key;
};
-static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
+static struct tnode *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
{
- struct leaf *l = NULL;
+ struct tnode *l = NULL;
struct trie *t = iter->main_trie;
/* use cache location of last found key */
@@ -2449,7 +2407,7 @@ static void *fib_route_seq_start(struct
static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct fib_route_iter *iter = seq->private;
- struct leaf *l = v;
+ struct tnode *l = v;
++*pos;
if (v == SEQ_START_TOKEN) {
@@ -2495,7 +2453,7 @@ static unsigned int fib_flag_trans(int t
*/
static int fib_route_seq_show(struct seq_file *seq, void *v)
{
- struct leaf *l = v;
+ struct tnode *l = v;
struct leaf_info *li;
if (v == SEQ_START_TOKEN) {

View file

@ -0,0 +1,343 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:55:54 -0800
Subject: [PATCH] fib_trie: Optimize fib_table_lookup to avoid wasting
time on loops/variables
This patch is meant to reduce the complexity of fib_table_lookup by reducing
the number of variables to the bare minimum while still keeping the same if
not improved functionality versus the original.
Most of this change was started off by the desire to rid the function of
chopped_off and current_prefix_length as they actually added very little to
the function since they only applied when computing the cindex. I was able
to replace them mostly with just a check for the prefix match. As long as
the prefix between the key and the node being tested was the same we know
we can search the tnode fully versus just testing cindex 0.
The second portion of the change ended up being a massive reordering.
Originally the calls to check_leaf were up near the start of the loop, and
the backtracing and descending into lower levels of tnodes was later. This
didn't make much sense as the structure of the tree means the leaves are
always the last thing to be tested. As such I reordered things so that we
instead have a loop that will delve into the tree and only exit when we
have either found a leaf or we have exhausted the tree. The advantage of
rearranging things like this is that we can fully inline check_leaf since
there is now only one reference to it in the function.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -90,6 +90,9 @@ typedef unsigned int t_key;
#define IS_TNODE(n) ((n)->bits)
#define IS_LEAF(n) (!(n)->bits)
+#define get_shift(_kv) (KEYLENGTH - (_kv)->pos - (_kv)->bits)
+#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> get_shift(_kv))
+
struct tnode {
t_key key;
unsigned char bits; /* 2log(KEYLENGTH) bits needed */
@@ -1281,7 +1284,7 @@ static int check_leaf(struct fib_table *
continue;
fib_alias_accessed(fa);
err = fib_props[fa->fa_type].error;
- if (err) {
+ if (unlikely(err < 0)) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(t->stats->semantic_match_passed);
#endif
@@ -1303,7 +1306,7 @@ static int check_leaf(struct fib_table *
res->prefixlen = li->plen;
res->nh_sel = nhsel;
res->type = fa->fa_type;
- res->scope = fa->fa_info->fib_scope;
+ res->scope = fi->fib_scope;
res->fi = fi;
res->table = tb;
res->fa_head = &li->falh;
@@ -1321,23 +1324,24 @@ static int check_leaf(struct fib_table *
return 1;
}
+static inline t_key prefix_mismatch(t_key key, struct tnode *n)
+{
+ t_key prefix = n->key;
+
+ return (key ^ prefix) & (prefix | -prefix);
+}
+
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
struct fib_result *res, int fib_flags)
{
- struct trie *t = (struct trie *) tb->tb_data;
+ struct trie *t = (struct trie *)tb->tb_data;
#ifdef CONFIG_IP_FIB_TRIE_STATS
struct trie_use_stats __percpu *stats = t->stats;
#endif
- int ret;
- struct tnode *n;
- struct tnode *pn;
- unsigned int pos, bits;
- t_key key = ntohl(flp->daddr);
- unsigned int chopped_off;
- t_key cindex = 0;
- unsigned int current_prefix_length = KEYLENGTH;
- struct tnode *cn;
- t_key pref_mismatch;
+ const t_key key = ntohl(flp->daddr);
+ struct tnode *n, *pn;
+ t_key cindex;
+ int ret = 1;
rcu_read_lock();
@@ -1349,170 +1353,102 @@ int fib_table_lookup(struct fib_table *t
this_cpu_inc(stats->gets);
#endif
- /* Just a leaf? */
- if (IS_LEAF(n)) {
- ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
- goto found;
- }
-
pn = n;
- chopped_off = 0;
-
- while (pn) {
- pos = pn->pos;
- bits = pn->bits;
+ cindex = 0;
- if (!chopped_off)
- cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
- pos, bits);
-
- n = tnode_get_child_rcu(pn, cindex);
-
- if (n == NULL) {
-#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(stats->null_node_hit);
-#endif
- goto backtrace;
- }
+ /* Step 1: Travel to the longest prefix match in the trie */
+ for (;;) {
+ unsigned long index = get_index(key, n);
+
+ /* This bit of code is a bit tricky but it combines multiple
+ * checks into a single check. The prefix consists of the
+ * prefix plus zeros for the "bits" in the prefix. The index
+ * is the difference between the key and this value. From
+ * this we can actually derive several pieces of data.
+ * if !(index >> bits)
+ * we know the value is child index
+ * else
+ * we have a mismatch in skip bits and failed
+ */
+ if (index >> n->bits)
+ break;
- if (IS_LEAF(n)) {
- ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
- if (ret > 0)
- goto backtrace;
+ /* we have found a leaf. Prefixes have already been compared */
+ if (IS_LEAF(n))
goto found;
- }
- cn = n;
-
- /*
- * It's a tnode, and we can do some extra checks here if we
- * like, to avoid descending into a dead-end branch.
- * This tnode is in the parent's child array at index
- * key[p_pos..p_pos+p_bits] but potentially with some bits
- * chopped off, so in reality the index may be just a
- * subprefix, padded with zero at the end.
- * We can also take a look at any skipped bits in this
- * tnode - everything up to p_pos is supposed to be ok,
- * and the non-chopped bits of the index (se previous
- * paragraph) are also guaranteed ok, but the rest is
- * considered unknown.
- *
- * The skipped bits are key[pos+bits..cn->pos].
- */
-
- /* If current_prefix_length < pos+bits, we are already doing
- * actual prefix matching, which means everything from
- * pos+(bits-chopped_off) onward must be zero along some
- * branch of this subtree - otherwise there is *no* valid
- * prefix present. Here we can only check the skipped
- * bits. Remember, since we have already indexed into the
- * parent's child array, we know that the bits we chopped of
- * *are* zero.
+ /* only record pn and cindex if we are going to be chopping
+ * bits later. Otherwise we are just wasting cycles.
*/
-
- /* NOTA BENE: Checking only skipped bits
- for the new node here */
-
- if (current_prefix_length < pos+bits) {
- if (tkey_extract_bits(cn->key, current_prefix_length,
- cn->pos - current_prefix_length)
- || !(cn->child[0]))
- goto backtrace;
+ if (index) {
+ pn = n;
+ cindex = index;
}
- /*
- * If chopped_off=0, the index is fully validated and we
- * only need to look at the skipped bits for this, the new,
- * tnode. What we actually want to do is to find out if
- * these skipped bits match our key perfectly, or if we will
- * have to count on finding a matching prefix further down,
- * because if we do, we would like to have some way of
- * verifying the existence of such a prefix at this point.
- */
-
- /* The only thing we can do at this point is to verify that
- * any such matching prefix can indeed be a prefix to our
- * key, and if the bits in the node we are inspecting that
- * do not match our key are not ZERO, this cannot be true.
- * Thus, find out where there is a mismatch (before cn->pos)
- * and verify that all the mismatching bits are zero in the
- * new tnode's key.
- */
+ n = rcu_dereference(n->child[index]);
+ if (unlikely(!n))
+ goto backtrace;
+ }
- /*
- * Note: We aren't very concerned about the piece of
- * the key that precede pn->pos+pn->bits, since these
- * have already been checked. The bits after cn->pos
- * aren't checked since these are by definition
- * "unknown" at this point. Thus, what we want to see
- * is if we are about to enter the "prefix matching"
- * state, and in that case verify that the skipped
- * bits that will prevail throughout this subtree are
- * zero, as they have to be if we are to find a
- * matching prefix.
+ /* Step 2: Sort out leaves and begin backtracing for longest prefix */
+ for (;;) {
+ /* record the pointer where our next node pointer is stored */
+ struct tnode __rcu **cptr = n->child;
+
+ /* This test verifies that none of the bits that differ
+ * between the key and the prefix exist in the region of
+ * the lsb and higher in the prefix.
*/
+ if (unlikely(prefix_mismatch(key, n)))
+ goto backtrace;
- pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
+ /* exit out and process leaf */
+ if (unlikely(IS_LEAF(n)))
+ break;
- /*
- * In short: If skipped bits in this node do not match
- * the search key, enter the "prefix matching"
- * state.directly.
+ /* Don't bother recording parent info. Since we are in
+ * prefix match mode we will have to come back to wherever
+ * we started this traversal anyway
*/
- if (pref_mismatch) {
- /* fls(x) = __fls(x) + 1 */
- int mp = KEYLENGTH - __fls(pref_mismatch) - 1;
-
- if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
- goto backtrace;
-
- if (current_prefix_length >= cn->pos)
- current_prefix_length = mp;
- }
-
- pn = n; /* Descend */
- chopped_off = 0;
- continue;
+ while ((n = rcu_dereference(*cptr)) == NULL) {
backtrace:
- chopped_off++;
-
- /* As zero don't change the child key (cindex) */
- while ((chopped_off <= pn->bits)
- && !(cindex & (1<<(chopped_off-1))))
- chopped_off++;
-
- /* Decrease current_... with bits chopped off */
- if (current_prefix_length > pn->pos + pn->bits - chopped_off)
- current_prefix_length = pn->pos + pn->bits
- - chopped_off;
-
- /*
- * Either we do the actual chop off according or if we have
- * chopped off all bits in this tnode walk up to our parent.
- */
-
- if (chopped_off <= pn->bits) {
- cindex &= ~(1 << (chopped_off-1));
- } else {
- struct tnode *parent = node_parent_rcu(pn);
- if (!parent)
- goto failed;
-
- /* Get Child's index */
- cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
- pn = parent;
- chopped_off = 0;
-
#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(stats->backtrack);
+ if (!n)
+ this_cpu_inc(stats->null_node_hit);
#endif
- goto backtrace;
+ /* If we are at cindex 0 there are no more bits for
+ * us to strip at this level so we must ascend back
+ * up one level to see if there are any more bits to
+ * be stripped there.
+ */
+ while (!cindex) {
+ t_key pkey = pn->key;
+
+ pn = node_parent_rcu(pn);
+ if (unlikely(!pn))
+ goto failed;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->backtrack);
+#endif
+ /* Get Child's index */
+ cindex = get_index(pkey, pn);
+ }
+
+ /* strip the least significant bit from the cindex */
+ cindex &= cindex - 1;
+
+ /* grab pointer for next child node */
+ cptr = &pn->child[cindex];
}
}
-failed:
- ret = 1;
+
found:
+ /* Step 3: Process the leaf, if that fails fall back to backtracing */
+ ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
+ if (unlikely(ret > 0))
+ goto backtrace;
+failed:
rcu_read_unlock();
return ret;
}

View file

@ -0,0 +1,64 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:56:00 -0800
Subject: [PATCH] fib_trie: Optimize fib_find_node
This patch makes use of the same features I made use of for
fib_table_lookup to streamline fib_find_node. The resultant code should be
smaller and run faster than the original.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -892,28 +892,34 @@ static void insert_leaf_info(struct hlis
}
/* rcu_read_lock needs to be hold by caller from readside */
-
static struct tnode *fib_find_node(struct trie *t, u32 key)
{
struct tnode *n = rcu_dereference_rtnl(t->trie);
- int pos = 0;
- while (n && IS_TNODE(n)) {
- if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
- pos = n->pos + n->bits;
- n = tnode_get_child_rcu(n,
- tkey_extract_bits(key,
- n->pos,
- n->bits));
- } else
+ while (n) {
+ unsigned long index = get_index(key, n);
+
+ /* This bit of code is a bit tricky but it combines multiple
+ * checks into a single check. The prefix consists of the
+ * prefix plus zeros for the bits in the cindex. The index
+ * is the difference between the key and this value. From
+ * this we can actually derive several pieces of data.
+ * if !(index >> bits)
+ * we know the value is cindex
+ * else
+ * we have a mismatch in skip bits and failed
+ */
+ if (index >> n->bits)
+ return NULL;
+
+ /* we have found a leaf. Prefixes have already been compared */
+ if (IS_LEAF(n))
break;
- }
- /* Case we have found a leaf. Compare prefixes */
- if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
- return n;
+ n = rcu_dereference_rtnl(n->child[index]);
+ }
- return NULL;
+ return n;
}
static void trie_rebalance(struct trie *t, struct tnode *tn)

View file

@ -0,0 +1,276 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:56:06 -0800
Subject: [PATCH] fib_trie: Optimize fib_table_insert
This patch updates the fib_table_insert function to take advantage of the
changes made to improve the performance of fib_table_lookup. As a result
the code should be smaller and run faster then the original.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -222,31 +222,6 @@ static inline t_key tkey_extract_bits(t_
return 0;
}
-static inline int tkey_equals(t_key a, t_key b)
-{
- return a == b;
-}
-
-static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
-{
- if (bits == 0 || offset >= KEYLENGTH)
- return 1;
- bits = bits > KEYLENGTH ? KEYLENGTH : bits;
- return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
-}
-
-static inline int tkey_mismatch(t_key a, int offset, t_key b)
-{
- t_key diff = a ^ b;
- int i = offset;
-
- if (!diff)
- return 0;
- while ((diff << i) >> (KEYLENGTH-1) == 0)
- i++;
- return i;
-}
-
/*
To understand this stuff, an understanding of keys and all their bits is
necessary. Every node in the trie has a key associated with it, but not
@@ -485,6 +460,15 @@ static void tnode_put_child_reorg(struct
rcu_assign_pointer(tn->child[i], n);
}
+static void put_child_root(struct tnode *tp, struct trie *t,
+ t_key key, struct tnode *n)
+{
+ if (tp)
+ put_child(tp, get_index(key, tp), n);
+ else
+ rcu_assign_pointer(t->trie, n);
+}
+
#define MAX_WORK 10
static struct tnode *resize(struct trie *t, struct tnode *tn)
{
@@ -959,138 +943,100 @@ static void trie_rebalance(struct trie *
static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
{
- int pos, newpos;
- struct tnode *tp = NULL, *tn = NULL;
- struct tnode *n;
- struct tnode *l;
- int missbit;
struct list_head *fa_head = NULL;
+ struct tnode *l, *n, *tp = NULL;
struct leaf_info *li;
- t_key cindex;
- pos = 0;
+ li = leaf_info_new(plen);
+ if (!li)
+ return NULL;
+ fa_head = &li->falh;
+
n = rtnl_dereference(t->trie);
/* If we point to NULL, stop. Either the tree is empty and we should
* just put a new leaf in if, or we have reached an empty child slot,
* and we should just put our new leaf in that.
- * If we point to a T_TNODE, check if it matches our key. Note that
- * a T_TNODE might be skipping any number of bits - its 'pos' need
- * not be the parent's 'pos'+'bits'!
*
- * If it does match the current key, get pos/bits from it, extract
- * the index from our key, push the T_TNODE and walk the tree.
- *
- * If it doesn't, we have to replace it with a new T_TNODE.
- *
- * If we point to a T_LEAF, it might or might not have the same key
- * as we do. If it does, just change the value, update the T_LEAF's
- * value, and return it.
- * If it doesn't, we need to replace it with a T_TNODE.
+ * If we hit a node with a key that does't match then we should stop
+ * and create a new tnode to replace that node and insert ourselves
+ * and the other node into the new tnode.
*/
+ while (n) {
+ unsigned long index = get_index(key, n);
- while (n && IS_TNODE(n)) {
- if (tkey_sub_equals(n->key, pos, n->pos-pos, key)) {
- tp = n;
- pos = n->pos + n->bits;
- n = tnode_get_child(n,
- tkey_extract_bits(key,
- n->pos,
- n->bits));
-
- BUG_ON(n && node_parent(n) != tp);
- } else
+ /* This bit of code is a bit tricky but it combines multiple
+ * checks into a single check. The prefix consists of the
+ * prefix plus zeros for the "bits" in the prefix. The index
+ * is the difference between the key and this value. From
+ * this we can actually derive several pieces of data.
+ * if !(index >> bits)
+ * we know the value is child index
+ * else
+ * we have a mismatch in skip bits and failed
+ */
+ if (index >> n->bits)
break;
- }
- /*
- * n ----> NULL, LEAF or TNODE
- *
- * tp is n's (parent) ----> NULL or TNODE
- */
-
- BUG_ON(tp && IS_LEAF(tp));
-
- /* Case 1: n is a leaf. Compare prefixes */
-
- if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
- li = leaf_info_new(plen);
-
- if (!li)
- return NULL;
+ /* we have found a leaf. Prefixes have already been compared */
+ if (IS_LEAF(n)) {
+ /* Case 1: n is a leaf, and prefixes match*/
+ insert_leaf_info(&n->list, li);
+ return fa_head;
+ }
- fa_head = &li->falh;
- insert_leaf_info(&n->list, li);
- goto done;
+ tp = n;
+ n = rcu_dereference_rtnl(n->child[index]);
}
- l = leaf_new(key);
-
- if (!l)
- return NULL;
-
- li = leaf_info_new(plen);
- if (!li) {
- node_free(l);
+ l = leaf_new(key);
+ if (!l) {
+ free_leaf_info(li);
return NULL;
}
- fa_head = &li->falh;
insert_leaf_info(&l->list, li);
- if (t->trie && n == NULL) {
- /* Case 2: n is NULL, and will just insert a new leaf */
-
- node_set_parent(l, tp);
-
- cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(tp, cindex, l);
- } else {
- /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
- /*
- * Add a new tnode here
- * first tnode need some special handling
- */
+ /* Case 2: n is a LEAF or a TNODE and the key doesn't match.
+ *
+ * Add a new tnode here
+ * first tnode need some special handling
+ * leaves us in position for handling as case 3
+ */
+ if (n) {
+ struct tnode *tn;
+ int newpos;
- if (n) {
- pos = tp ? tp->pos+tp->bits : 0;
- newpos = tkey_mismatch(key, pos, n->key);
- tn = tnode_new(n->key, newpos, 1);
- } else {
- newpos = 0;
- tn = tnode_new(key, newpos, 1); /* First tnode */
- }
+ newpos = KEYLENGTH - __fls(n->key ^ key) - 1;
+ tn = tnode_new(key, newpos, 1);
if (!tn) {
free_leaf_info(li);
node_free(l);
return NULL;
}
- node_set_parent(tn, tp);
-
- missbit = tkey_extract_bits(key, newpos, 1);
- put_child(tn, missbit, l);
- put_child(tn, 1-missbit, n);
-
- if (tp) {
- cindex = tkey_extract_bits(key, tp->pos, tp->bits);
- put_child(tp, cindex, tn);
- } else {
- rcu_assign_pointer(t->trie, tn);
- }
+ /* initialize routes out of node */
+ NODE_INIT_PARENT(tn, tp);
+ put_child(tn, get_index(key, tn) ^ 1, n);
+
+ /* start adding routes into the node */
+ put_child_root(tp, t, key, tn);
+ node_set_parent(n, tn);
+ /* parent now has a NULL spot where the leaf can go */
tp = tn;
}
- if (tp && tp->pos + tp->bits > 32)
- pr_warn("fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
- tp, tp->pos, tp->bits, key, plen);
-
- /* Rebalance the trie */
+ /* Case 3: n is NULL, and will just insert a new leaf */
+ if (tp) {
+ NODE_INIT_PARENT(l, tp);
+ put_child(tp, get_index(key, tp), l);
+ trie_rebalance(t, tp);
+ } else {
+ rcu_assign_pointer(t->trie, l);
+ }
- trie_rebalance(t, tp);
-done:
return fa_head;
}
@@ -1470,11 +1416,11 @@ static void trie_leaf_remove(struct trie
pr_debug("entering trie_leaf_remove(%p)\n", l);
if (tp) {
- t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
- put_child(tp, cindex, NULL);
+ put_child(tp, get_index(l->key, tp), NULL);
trie_rebalance(t, tp);
- } else
+ } else {
RCU_INIT_POINTER(t->trie, NULL);
+ }
node_free(l);
}

View file

@ -0,0 +1,346 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:56:12 -0800
Subject: [PATCH] fib_trie: Update meaning of pos to represent unchecked
bits
This change moves the pos value to the other side of the "bits" field. By
doing this it actually simplifies a significant amount of code in the trie.
For example when halving a tree we know that the bit lost exists at
oldnode->pos, and if we inflate the tree the new bit being add is at
tn->pos. Previously to find those bits you would have to subtract pos and
bits from the keylength or start with a value of (1 << 31) and then shift
that.
There are a number of spots throughout the code that benefit from this. In
the case of the hot-path searches the main advantage is that we can drop 2
or more operations from the search path as we no longer need to compute the
value for the index to be shifted by and can instead just use the raw pos
value.
In addition the tkey_extract_bits is now defunct and can be replaced by
get_index since the two operations were doing the same thing, but now
get_index does it much more quickly as it is only an xor and shift versus a
pair of shifts and a subtraction.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -90,8 +90,7 @@ typedef unsigned int t_key;
#define IS_TNODE(n) ((n)->bits)
#define IS_LEAF(n) (!(n)->bits)
-#define get_shift(_kv) (KEYLENGTH - (_kv)->pos - (_kv)->bits)
-#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> get_shift(_kv))
+#define get_index(_key, _kv) (((_key) ^ (_kv)->key) >> (_kv)->pos)
struct tnode {
t_key key;
@@ -209,81 +208,64 @@ static inline struct tnode *tnode_get_ch
return rcu_dereference_rtnl(tn->child[i]);
}
-static inline t_key mask_pfx(t_key k, unsigned int l)
-{
- return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
-}
-
-static inline t_key tkey_extract_bits(t_key a, unsigned int offset, unsigned int bits)
-{
- if (offset < KEYLENGTH)
- return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
- else
- return 0;
-}
-
-/*
- To understand this stuff, an understanding of keys and all their bits is
- necessary. Every node in the trie has a key associated with it, but not
- all of the bits in that key are significant.
-
- Consider a node 'n' and its parent 'tp'.
-
- If n is a leaf, every bit in its key is significant. Its presence is
- necessitated by path compression, since during a tree traversal (when
- searching for a leaf - unless we are doing an insertion) we will completely
- ignore all skipped bits we encounter. Thus we need to verify, at the end of
- a potentially successful search, that we have indeed been walking the
- correct key path.
-
- Note that we can never "miss" the correct key in the tree if present by
- following the wrong path. Path compression ensures that segments of the key
- that are the same for all keys with a given prefix are skipped, but the
- skipped part *is* identical for each node in the subtrie below the skipped
- bit! trie_insert() in this implementation takes care of that - note the
- call to tkey_sub_equals() in trie_insert().
-
- if n is an internal node - a 'tnode' here, the various parts of its key
- have many different meanings.
-
- Example:
- _________________________________________________________________
- | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
- -----------------------------------------------------------------
- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
-
- _________________________________________________________________
- | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
- -----------------------------------------------------------------
- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
-
- tp->pos = 7
- tp->bits = 3
- n->pos = 15
- n->bits = 4
-
- First, let's just ignore the bits that come before the parent tp, that is
- the bits from 0 to (tp->pos-1). They are *known* but at this point we do
- not use them for anything.
-
- The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
- index into the parent's child array. That is, they will be used to find
- 'n' among tp's children.
-
- The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
- for the node n.
-
- All the bits we have seen so far are significant to the node n. The rest
- of the bits are really not needed or indeed known in n->key.
-
- The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
- n's child array, and will of course be different for each child.
-
-
- The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
- at this point.
-
-*/
+/* To understand this stuff, an understanding of keys and all their bits is
+ * necessary. Every node in the trie has a key associated with it, but not
+ * all of the bits in that key are significant.
+ *
+ * Consider a node 'n' and its parent 'tp'.
+ *
+ * If n is a leaf, every bit in its key is significant. Its presence is
+ * necessitated by path compression, since during a tree traversal (when
+ * searching for a leaf - unless we are doing an insertion) we will completely
+ * ignore all skipped bits we encounter. Thus we need to verify, at the end of
+ * a potentially successful search, that we have indeed been walking the
+ * correct key path.
+ *
+ * Note that we can never "miss" the correct key in the tree if present by
+ * following the wrong path. Path compression ensures that segments of the key
+ * that are the same for all keys with a given prefix are skipped, but the
+ * skipped part *is* identical for each node in the subtrie below the skipped
+ * bit! trie_insert() in this implementation takes care of that.
+ *
+ * if n is an internal node - a 'tnode' here, the various parts of its key
+ * have many different meanings.
+ *
+ * Example:
+ * _________________________________________________________________
+ * | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
+ * -----------------------------------------------------------------
+ * 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16
+ *
+ * _________________________________________________________________
+ * | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
+ * -----------------------------------------------------------------
+ * 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+ *
+ * tp->pos = 22
+ * tp->bits = 3
+ * n->pos = 13
+ * n->bits = 4
+ *
+ * First, let's just ignore the bits that come before the parent tp, that is
+ * the bits from (tp->pos + tp->bits) to 31. They are *known* but at this
+ * point we do not use them for anything.
+ *
+ * The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
+ * index into the parent's child array. That is, they will be used to find
+ * 'n' among tp's children.
+ *
+ * The bits from (n->pos + n->bits) to (tn->pos - 1) - "S" - are skipped bits
+ * for the node n.
+ *
+ * All the bits we have seen so far are significant to the node n. The rest
+ * of the bits are really not needed or indeed known in n->key.
+ *
+ * The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
+ * n's child array, and will of course be different for each child.
+ *
+ * The rest of the bits, from 0 to (n->pos + n->bits), are completely unknown
+ * at this point.
+ */
static const int halve_threshold = 25;
static const int inflate_threshold = 50;
@@ -367,7 +349,7 @@ static struct tnode *leaf_new(t_key key)
* as the nodes are searched
*/
l->key = key;
- l->pos = KEYLENGTH;
+ l->pos = 0;
/* set bits to 0 indicating we are not a tnode */
l->bits = 0;
@@ -400,7 +382,7 @@ static struct tnode *tnode_new(t_key key
tn->parent = NULL;
tn->pos = pos;
tn->bits = bits;
- tn->key = mask_pfx(key, pos);
+ tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
tn->full_children = 0;
tn->empty_children = 1<<bits;
}
@@ -410,14 +392,12 @@ static struct tnode *tnode_new(t_key key
return tn;
}
-/*
- * Check whether a tnode 'n' is "full", i.e. it is an internal node
+/* Check whether a tnode 'n' is "full", i.e. it is an internal node
* and no bits are skipped. See discussion in dyntree paper p. 6
*/
-
static inline int tnode_full(const struct tnode *tn, const struct tnode *n)
{
- return n && IS_TNODE(n) && (n->pos == (tn->pos + tn->bits));
+ return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
}
static inline void put_child(struct tnode *tn, int i,
@@ -641,11 +621,12 @@ static struct tnode *inflate(struct trie
{
int olen = tnode_child_length(oldtnode);
struct tnode *tn;
+ t_key m;
int i;
pr_debug("In inflate\n");
- tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
+ tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
if (!tn)
return ERR_PTR(-ENOMEM);
@@ -656,21 +637,18 @@ static struct tnode *inflate(struct trie
* fails. In case of failure we return the oldnode and inflate
* of tnode is ignored.
*/
+ for (i = 0, m = 1u << tn->pos; i < olen; i++) {
+ struct tnode *inode = tnode_get_child(oldtnode, i);
- for (i = 0; i < olen; i++) {
- struct tnode *inode;
-
- inode = tnode_get_child(oldtnode, i);
- if (tnode_full(oldtnode, inode) && inode->bits > 1) {
+ if (tnode_full(oldtnode, inode) && (inode->bits > 1)) {
struct tnode *left, *right;
- t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
- left = tnode_new(inode->key&(~m), inode->pos + 1,
+ left = tnode_new(inode->key & ~m, inode->pos,
inode->bits - 1);
if (!left)
goto nomem;
- right = tnode_new(inode->key|m, inode->pos + 1,
+ right = tnode_new(inode->key | m, inode->pos,
inode->bits - 1);
if (!right) {
@@ -694,9 +672,7 @@ static struct tnode *inflate(struct trie
/* A leaf or an internal node with skipped bits */
if (!tnode_full(oldtnode, inode)) {
- put_child(tn,
- tkey_extract_bits(inode->key, tn->pos, tn->bits),
- inode);
+ put_child(tn, get_index(inode->key, tn), inode);
continue;
}
@@ -767,7 +743,7 @@ static struct tnode *halve(struct trie *
pr_debug("In halve\n");
- tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
+ tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
if (!tn)
return ERR_PTR(-ENOMEM);
@@ -787,7 +763,7 @@ static struct tnode *halve(struct trie *
if (left && right) {
struct tnode *newn;
- newn = tnode_new(left->key, tn->pos + tn->bits, 1);
+ newn = tnode_new(left->key, oldtnode->pos, 1);
if (!newn)
goto nomem;
@@ -915,7 +891,7 @@ static void trie_rebalance(struct trie *
key = tn->key;
while (tn != NULL && (tp = node_parent(tn)) != NULL) {
- cindex = tkey_extract_bits(key, tp->pos, tp->bits);
+ cindex = get_index(key, tp);
wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
tn = resize(t, tn);
@@ -1005,11 +981,8 @@ static struct list_head *fib_insert_node
*/
if (n) {
struct tnode *tn;
- int newpos;
-
- newpos = KEYLENGTH - __fls(n->key ^ key) - 1;
- tn = tnode_new(key, newpos, 1);
+ tn = tnode_new(key, __fls(key ^ n->key), 1);
if (!tn) {
free_leaf_info(li);
node_free(l);
@@ -1559,12 +1532,7 @@ static int trie_flush_leaf(struct tnode
static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
{
do {
- t_key idx;
-
- if (c)
- idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
- else
- idx = 0;
+ t_key idx = c ? idx = get_index(c->key, p) + 1 : 0;
while (idx < 1u << p->bits) {
c = tnode_get_child_rcu(p, idx++);
@@ -1851,7 +1819,7 @@ rescan:
/* Current node exhausted, pop back up */
p = node_parent_rcu(tn);
if (p) {
- cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
+ cindex = get_index(tn->key, p) + 1;
tn = p;
--iter->depth;
goto rescan;
@@ -2187,10 +2155,10 @@ static int fib_trie_seq_show(struct seq_
if (IS_TNODE(n)) {
__be32 prf = htonl(n->key);
- seq_indent(seq, iter->depth - 1);
- seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
- &prf, n->pos, n->bits, n->full_children,
- n->empty_children);
+ seq_indent(seq, iter->depth-1);
+ seq_printf(seq, " +-- %pI4/%zu %u %u %u\n",
+ &prf, KEYLENGTH - n->pos - n->bits, n->bits,
+ n->full_children, n->empty_children);
} else {
struct leaf_info *li;
__be32 val = htonl(n->key);

View file

@ -0,0 +1,186 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:56:18 -0800
Subject: [PATCH] fib_trie: Use unsigned long for anything dealing with a
shift by bits
This change makes it so that anything that can be shifted by, or compared
to a value shifted by bits is updated to be an unsigned long. This is
mostly a precaution against an insanely huge address space that somehow
starts coming close to the 2^32 root node size which would require
something like 1.5 billion addresses.
I chose unsigned long instead of unsigned long long since I do not believe
it is possible to allocate a 32 bit tnode on a 32 bit system as the memory
consumed would be 16GB + 28B which exceeds the addressible space for any
one process.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -146,8 +146,8 @@ struct trie {
#endif
};
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
- int wasfull);
+static void tnode_put_child_reorg(struct tnode *tn, unsigned long i,
+ struct tnode *n, int wasfull);
static struct tnode *resize(struct trie *t, struct tnode *tn);
static struct tnode *inflate(struct trie *t, struct tnode *tn);
static struct tnode *halve(struct trie *t, struct tnode *tn);
@@ -183,25 +183,23 @@ static inline void node_set_parent(struc
/* This provides us with the number of children in this node, in the case of a
* leaf this will return 0 meaning none of the children are accessible.
*/
-static inline int tnode_child_length(const struct tnode *tn)
+static inline unsigned long tnode_child_length(const struct tnode *tn)
{
return (1ul << tn->bits) & ~(1ul);
}
-/*
- * caller must hold RTNL
- */
-static inline struct tnode *tnode_get_child(const struct tnode *tn, unsigned int i)
+/* caller must hold RTNL */
+static inline struct tnode *tnode_get_child(const struct tnode *tn,
+ unsigned long i)
{
BUG_ON(i >= tnode_child_length(tn));
return rtnl_dereference(tn->child[i]);
}
-/*
- * caller must hold RCU read lock or RTNL
- */
-static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn, unsigned int i)
+/* caller must hold RCU read lock or RTNL */
+static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn,
+ unsigned long i)
{
BUG_ON(i >= tnode_child_length(tn));
@@ -400,7 +398,7 @@ static inline int tnode_full(const struc
return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
}
-static inline void put_child(struct tnode *tn, int i,
+static inline void put_child(struct tnode *tn, unsigned long i,
struct tnode *n)
{
tnode_put_child_reorg(tn, i, n, -1);
@@ -411,13 +409,13 @@ static inline void put_child(struct tnod
* Update the value of full_children and empty_children.
*/
-static void tnode_put_child_reorg(struct tnode *tn, int i, struct tnode *n,
- int wasfull)
+static void tnode_put_child_reorg(struct tnode *tn, unsigned long i,
+ struct tnode *n, int wasfull)
{
struct tnode *chi = rtnl_dereference(tn->child[i]);
int isfull;
- BUG_ON(i >= 1<<tn->bits);
+ BUG_ON(i >= tnode_child_length(tn));
/* update emptyChildren */
if (n == NULL && chi != NULL)
@@ -607,10 +605,10 @@ no_children:
static void tnode_clean_free(struct tnode *tn)
{
struct tnode *tofree;
- int i;
+ unsigned long i;
for (i = 0; i < tnode_child_length(tn); i++) {
- tofree = rtnl_dereference(tn->child[i]);
+ tofree = tnode_get_child(tn, i);
if (tofree)
node_free(tofree);
}
@@ -619,10 +617,10 @@ static void tnode_clean_free(struct tnod
static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
{
- int olen = tnode_child_length(oldtnode);
+ unsigned long olen = tnode_child_length(oldtnode);
struct tnode *tn;
+ unsigned long i;
t_key m;
- int i;
pr_debug("In inflate\n");
@@ -664,7 +662,7 @@ static struct tnode *inflate(struct trie
for (i = 0; i < olen; i++) {
struct tnode *inode = tnode_get_child(oldtnode, i);
struct tnode *left, *right;
- int size, j;
+ unsigned long size, j;
/* An empty child */
if (inode == NULL)
@@ -737,7 +735,7 @@ nomem:
static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
{
- int olen = tnode_child_length(oldtnode);
+ unsigned long olen = tnode_child_length(oldtnode);
struct tnode *tn, *left, *right;
int i;
@@ -1532,9 +1530,9 @@ static int trie_flush_leaf(struct tnode
static struct tnode *leaf_walk_rcu(struct tnode *p, struct tnode *c)
{
do {
- t_key idx = c ? idx = get_index(c->key, p) + 1 : 0;
+ unsigned long idx = c ? idx = get_index(c->key, p) + 1 : 0;
- while (idx < 1u << p->bits) {
+ while (idx < tnode_child_length(p)) {
c = tnode_get_child_rcu(p, idx++);
if (!c)
continue;
@@ -1786,8 +1784,8 @@ struct fib_trie_iter {
static struct tnode *fib_trie_get_next(struct fib_trie_iter *iter)
{
+ unsigned long cindex = iter->index;
struct tnode *tn = iter->tnode;
- unsigned int cindex = iter->index;
struct tnode *p;
/* A single entry routing table */
@@ -1797,7 +1795,7 @@ static struct tnode *fib_trie_get_next(s
pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
iter->tnode, iter->index, iter->depth);
rescan:
- while (cindex < (1<<tn->bits)) {
+ while (cindex < tnode_child_length(tn)) {
struct tnode *n = tnode_get_child_rcu(tn, cindex);
if (n) {
@@ -1874,15 +1872,16 @@ static void trie_collect_stats(struct tr
hlist_for_each_entry_rcu(li, &n->list, hlist)
++s->prefixes;
} else {
- int i;
+ unsigned long i;
s->tnodes++;
if (n->bits < MAX_STAT_DEPTH)
s->nodesizes[n->bits]++;
- for (i = 0; i < tnode_child_length(n); i++)
+ for (i = 0; i < tnode_child_length(n); i++) {
if (!rcu_access_pointer(n->child[i]))
s->nullpointers++;
+ }
}
}
rcu_read_unlock();

View file

@ -0,0 +1,403 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:56:24 -0800
Subject: [PATCH] fib_trie: Push rcu_read_lock/unlock to callers
This change is to start cleaning up some of the rcu_read_lock/unlock
handling. I realized while reviewing the code there are several spots that
I don't believe are being handled correctly or are masking warnings by
locally calling rcu_read_lock/unlock instead of calling them at the correct
level.
A common example is a call to fib_get_table followed by fib_table_lookup.
The rcu_read_lock/unlock ought to wrap both but there are several spots where
they were not wrapped.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -222,16 +222,19 @@ static inline struct fib_table *fib_new_
static inline int fib_lookup(struct net *net, const struct flowi4 *flp,
struct fib_result *res)
{
- struct fib_table *table;
+ int err = -ENETUNREACH;
- table = fib_get_table(net, RT_TABLE_LOCAL);
- if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF))
- return 0;
-
- table = fib_get_table(net, RT_TABLE_MAIN);
- if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF))
- return 0;
- return -ENETUNREACH;
+ rcu_read_lock();
+
+ if (!fib_table_lookup(fib_get_table(net, RT_TABLE_LOCAL), flp, res,
+ FIB_LOOKUP_NOREF) ||
+ !fib_table_lookup(fib_get_table(net, RT_TABLE_MAIN), flp, res,
+ FIB_LOOKUP_NOREF))
+ err = 0;
+
+ rcu_read_unlock();
+
+ return err;
}
#else /* CONFIG_IP_MULTIPLE_TABLES */
@@ -247,20 +250,25 @@ static inline int fib_lookup(struct net
struct fib_result *res)
{
if (!net->ipv4.fib_has_custom_rules) {
+ int err = -ENETUNREACH;
+
+ rcu_read_lock();
+
res->tclassid = 0;
- if (net->ipv4.fib_local &&
- !fib_table_lookup(net->ipv4.fib_local, flp, res,
- FIB_LOOKUP_NOREF))
- return 0;
- if (net->ipv4.fib_main &&
- !fib_table_lookup(net->ipv4.fib_main, flp, res,
- FIB_LOOKUP_NOREF))
- return 0;
- if (net->ipv4.fib_default &&
- !fib_table_lookup(net->ipv4.fib_default, flp, res,
- FIB_LOOKUP_NOREF))
- return 0;
- return -ENETUNREACH;
+ if ((net->ipv4.fib_local &&
+ !fib_table_lookup(net->ipv4.fib_local, flp, res,
+ FIB_LOOKUP_NOREF)) ||
+ (net->ipv4.fib_main &&
+ !fib_table_lookup(net->ipv4.fib_main, flp, res,
+ FIB_LOOKUP_NOREF)) ||
+ (net->ipv4.fib_default &&
+ !fib_table_lookup(net->ipv4.fib_default, flp, res,
+ FIB_LOOKUP_NOREF)))
+ err = 0;
+
+ rcu_read_unlock();
+
+ return err;
}
return __fib_lookup(net, flp, res);
}
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -109,6 +109,7 @@ struct fib_table *fib_new_table(struct n
return tb;
}
+/* caller must hold either rtnl or rcu read lock */
struct fib_table *fib_get_table(struct net *net, u32 id)
{
struct fib_table *tb;
@@ -119,15 +120,11 @@ struct fib_table *fib_get_table(struct n
id = RT_TABLE_MAIN;
h = id & (FIB_TABLE_HASHSZ - 1);
- rcu_read_lock();
head = &net->ipv4.fib_table_hash[h];
hlist_for_each_entry_rcu(tb, head, tb_hlist) {
- if (tb->tb_id == id) {
- rcu_read_unlock();
+ if (tb->tb_id == id)
return tb;
- }
}
- rcu_read_unlock();
return NULL;
}
#endif /* CONFIG_IP_MULTIPLE_TABLES */
@@ -167,16 +164,18 @@ static inline unsigned int __inet_dev_ad
if (ipv4_is_multicast(addr))
return RTN_MULTICAST;
+ rcu_read_lock();
+
local_table = fib_get_table(net, RT_TABLE_LOCAL);
if (local_table) {
ret = RTN_UNICAST;
- rcu_read_lock();
if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) {
if (!dev || dev == res.fi->fib_dev)
ret = res.type;
}
- rcu_read_unlock();
}
+
+ rcu_read_unlock();
return ret;
}
@@ -919,7 +918,7 @@ void fib_del_ifaddr(struct in_ifaddr *if
#undef BRD1_OK
}
-static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
+static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
{
struct fib_result res;
@@ -929,6 +928,11 @@ static void nl_fib_lookup(struct fib_res
.flowi4_tos = frn->fl_tos,
.flowi4_scope = frn->fl_scope,
};
+ struct fib_table *tb;
+
+ rcu_read_lock();
+
+ tb = fib_get_table(net, frn->tb_id_in);
frn->err = -ENOENT;
if (tb) {
@@ -945,6 +949,8 @@ static void nl_fib_lookup(struct fib_res
}
local_bh_enable();
}
+
+ rcu_read_unlock();
}
static void nl_fib_input(struct sk_buff *skb)
@@ -952,7 +958,6 @@ static void nl_fib_input(struct sk_buff
struct net *net;
struct fib_result_nl *frn;
struct nlmsghdr *nlh;
- struct fib_table *tb;
u32 portid;
net = sock_net(skb->sk);
@@ -967,9 +972,7 @@ static void nl_fib_input(struct sk_buff
nlh = nlmsg_hdr(skb);
frn = (struct fib_result_nl *) nlmsg_data(nlh);
- tb = fib_get_table(net, frn->tb_id_in);
-
- nl_fib_lookup(frn, tb);
+ nl_fib_lookup(net, frn);
portid = NETLINK_CB(skb).portid; /* netlink portid */
NETLINK_CB(skb).portid = 0; /* from kernel */
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -81,27 +81,25 @@ static int fib4_rule_action(struct fib_r
break;
case FR_ACT_UNREACHABLE:
- err = -ENETUNREACH;
- goto errout;
+ return -ENETUNREACH;
case FR_ACT_PROHIBIT:
- err = -EACCES;
- goto errout;
+ return -EACCES;
case FR_ACT_BLACKHOLE:
default:
- err = -EINVAL;
- goto errout;
+ return -EINVAL;
}
+ rcu_read_lock();
+
tbl = fib_get_table(rule->fr_net, rule->table);
- if (!tbl)
- goto errout;
+ if (tbl)
+ err = fib_table_lookup(tbl, &flp->u.ip4,
+ (struct fib_result *)arg->result,
+ arg->flags);
- err = fib_table_lookup(tbl, &flp->u.ip4, (struct fib_result *) arg->result, arg->flags);
- if (err > 0)
- err = -EAGAIN;
-errout:
+ rcu_read_unlock();
return err;
}
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1181,72 +1181,6 @@ err:
return err;
}
-/* should be called with rcu_read_lock */
-static int check_leaf(struct fib_table *tb, struct trie *t, struct tnode *l,
- t_key key, const struct flowi4 *flp,
- struct fib_result *res, int fib_flags)
-{
- struct leaf_info *li;
- struct hlist_head *hhead = &l->list;
-
- hlist_for_each_entry_rcu(li, hhead, hlist) {
- struct fib_alias *fa;
-
- if (l->key != (key & li->mask_plen))
- continue;
-
- list_for_each_entry_rcu(fa, &li->falh, fa_list) {
- struct fib_info *fi = fa->fa_info;
- int nhsel, err;
-
- if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
- continue;
- if (fi->fib_dead)
- continue;
- if (fa->fa_info->fib_scope < flp->flowi4_scope)
- continue;
- fib_alias_accessed(fa);
- err = fib_props[fa->fa_type].error;
- if (unlikely(err < 0)) {
-#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(t->stats->semantic_match_passed);
-#endif
- return err;
- }
- if (fi->fib_flags & RTNH_F_DEAD)
- continue;
- for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
- const struct fib_nh *nh = &fi->fib_nh[nhsel];
-
- if (nh->nh_flags & RTNH_F_DEAD)
- continue;
- if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
- continue;
-
-#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(t->stats->semantic_match_passed);
-#endif
- res->prefixlen = li->plen;
- res->nh_sel = nhsel;
- res->type = fa->fa_type;
- res->scope = fi->fib_scope;
- res->fi = fi;
- res->table = tb;
- res->fa_head = &li->falh;
- if (!(fib_flags & FIB_LOOKUP_NOREF))
- atomic_inc(&fi->fib_clntref);
- return 0;
- }
- }
-
-#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(t->stats->semantic_match_miss);
-#endif
- }
-
- return 1;
-}
-
static inline t_key prefix_mismatch(t_key key, struct tnode *n)
{
t_key prefix = n->key;
@@ -1254,6 +1188,7 @@ static inline t_key prefix_mismatch(t_ke
return (key ^ prefix) & (prefix | -prefix);
}
+/* should be called with rcu_read_lock */
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
struct fib_result *res, int fib_flags)
{
@@ -1263,14 +1198,12 @@ int fib_table_lookup(struct fib_table *t
#endif
const t_key key = ntohl(flp->daddr);
struct tnode *n, *pn;
+ struct leaf_info *li;
t_key cindex;
- int ret = 1;
-
- rcu_read_lock();
n = rcu_dereference(t->trie);
if (!n)
- goto failed;
+ return -EAGAIN;
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->gets);
@@ -1350,7 +1283,7 @@ backtrace:
pn = node_parent_rcu(pn);
if (unlikely(!pn))
- goto failed;
+ return -EAGAIN;
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->backtrack);
#endif
@@ -1368,12 +1301,62 @@ backtrace:
found:
/* Step 3: Process the leaf, if that fails fall back to backtracing */
- ret = check_leaf(tb, t, n, key, flp, res, fib_flags);
- if (unlikely(ret > 0))
- goto backtrace;
-failed:
- rcu_read_unlock();
- return ret;
+ hlist_for_each_entry_rcu(li, &n->list, hlist) {
+ struct fib_alias *fa;
+
+ if ((key ^ n->key) & li->mask_plen)
+ continue;
+
+ list_for_each_entry_rcu(fa, &li->falh, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+ int nhsel, err;
+
+ if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ continue;
+ if (fi->fib_dead)
+ continue;
+ if (fa->fa_info->fib_scope < flp->flowi4_scope)
+ continue;
+ fib_alias_accessed(fa);
+ err = fib_props[fa->fa_type].error;
+ if (unlikely(err < 0)) {
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->semantic_match_passed);
+#endif
+ return err;
+ }
+ if (fi->fib_flags & RTNH_F_DEAD)
+ continue;
+ for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
+ const struct fib_nh *nh = &fi->fib_nh[nhsel];
+
+ if (nh->nh_flags & RTNH_F_DEAD)
+ continue;
+ if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif)
+ continue;
+
+ if (!(fib_flags & FIB_LOOKUP_NOREF))
+ atomic_inc(&fi->fib_clntref);
+
+ res->prefixlen = li->plen;
+ res->nh_sel = nhsel;
+ res->type = fa->fa_type;
+ res->scope = fi->fib_scope;
+ res->fi = fi;
+ res->table = tb;
+ res->fa_head = &li->falh;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->semantic_match_passed);
+#endif
+ return err;
+ }
+ }
+
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(stats->semantic_match_miss);
+#endif
+ }
+ goto backtrace;
}
EXPORT_SYMBOL_GPL(fib_table_lookup);

View file

@ -0,0 +1,345 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:56:31 -0800
Subject: [PATCH] fib_trie: Move resize to after inflate/halve
This change consists of a cut/paste of resize to behind inflate and halve
so that I could remove the two function prototypes.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -149,8 +149,6 @@ struct trie {
static void tnode_put_child_reorg(struct tnode *tn, unsigned long i,
struct tnode *n, int wasfull);
static struct tnode *resize(struct trie *t, struct tnode *tn);
-static struct tnode *inflate(struct trie *t, struct tnode *tn);
-static struct tnode *halve(struct trie *t, struct tnode *tn);
/* tnodes to free after resize(); protected by RTNL */
static struct callback_head *tnode_free_head;
static size_t tnode_free_size;
@@ -447,161 +445,6 @@ static void put_child_root(struct tnode
rcu_assign_pointer(t->trie, n);
}
-#define MAX_WORK 10
-static struct tnode *resize(struct trie *t, struct tnode *tn)
-{
- struct tnode *old_tn, *n = NULL;
- int inflate_threshold_use;
- int halve_threshold_use;
- int max_work;
-
- if (!tn)
- return NULL;
-
- pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
- tn, inflate_threshold, halve_threshold);
-
- /* No children */
- if (tn->empty_children > (tnode_child_length(tn) - 1))
- goto no_children;
-
- /* One child */
- if (tn->empty_children == (tnode_child_length(tn) - 1))
- goto one_child;
- /*
- * Double as long as the resulting node has a number of
- * nonempty nodes that are above the threshold.
- */
-
- /*
- * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
- * the Helsinki University of Technology and Matti Tikkanen of Nokia
- * Telecommunications, page 6:
- * "A node is doubled if the ratio of non-empty children to all
- * children in the *doubled* node is at least 'high'."
- *
- * 'high' in this instance is the variable 'inflate_threshold'. It
- * is expressed as a percentage, so we multiply it with
- * tnode_child_length() and instead of multiplying by 2 (since the
- * child array will be doubled by inflate()) and multiplying
- * the left-hand side by 100 (to handle the percentage thing) we
- * multiply the left-hand side by 50.
- *
- * The left-hand side may look a bit weird: tnode_child_length(tn)
- * - tn->empty_children is of course the number of non-null children
- * in the current node. tn->full_children is the number of "full"
- * children, that is non-null tnodes with a skip value of 0.
- * All of those will be doubled in the resulting inflated tnode, so
- * we just count them one extra time here.
- *
- * A clearer way to write this would be:
- *
- * to_be_doubled = tn->full_children;
- * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
- * tn->full_children;
- *
- * new_child_length = tnode_child_length(tn) * 2;
- *
- * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
- * new_child_length;
- * if (new_fill_factor >= inflate_threshold)
- *
- * ...and so on, tho it would mess up the while () loop.
- *
- * anyway,
- * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
- * inflate_threshold
- *
- * avoid a division:
- * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
- * inflate_threshold * new_child_length
- *
- * expand not_to_be_doubled and to_be_doubled, and shorten:
- * 100 * (tnode_child_length(tn) - tn->empty_children +
- * tn->full_children) >= inflate_threshold * new_child_length
- *
- * expand new_child_length:
- * 100 * (tnode_child_length(tn) - tn->empty_children +
- * tn->full_children) >=
- * inflate_threshold * tnode_child_length(tn) * 2
- *
- * shorten again:
- * 50 * (tn->full_children + tnode_child_length(tn) -
- * tn->empty_children) >= inflate_threshold *
- * tnode_child_length(tn)
- *
- */
-
- /* Keep root node larger */
-
- if (!node_parent(tn)) {
- inflate_threshold_use = inflate_threshold_root;
- halve_threshold_use = halve_threshold_root;
- } else {
- inflate_threshold_use = inflate_threshold;
- halve_threshold_use = halve_threshold;
- }
-
- max_work = MAX_WORK;
- while ((tn->full_children > 0 && max_work-- &&
- 50 * (tn->full_children + tnode_child_length(tn)
- - tn->empty_children)
- >= inflate_threshold_use * tnode_child_length(tn))) {
-
- old_tn = tn;
- tn = inflate(t, tn);
-
- if (IS_ERR(tn)) {
- tn = old_tn;
-#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(t->stats->resize_node_skipped);
-#endif
- break;
- }
- }
-
- /* Return if at least one inflate is run */
- if (max_work != MAX_WORK)
- return tn;
-
- /*
- * Halve as long as the number of empty children in this
- * node is above threshold.
- */
-
- max_work = MAX_WORK;
- while (tn->bits > 1 && max_work-- &&
- 100 * (tnode_child_length(tn) - tn->empty_children) <
- halve_threshold_use * tnode_child_length(tn)) {
-
- old_tn = tn;
- tn = halve(t, tn);
- if (IS_ERR(tn)) {
- tn = old_tn;
-#ifdef CONFIG_IP_FIB_TRIE_STATS
- this_cpu_inc(t->stats->resize_node_skipped);
-#endif
- break;
- }
- }
-
-
- /* Only one child remains */
- if (tn->empty_children == (tnode_child_length(tn) - 1)) {
- unsigned long i;
-one_child:
- for (i = tnode_child_length(tn); !n && i;)
- n = tnode_get_child(tn, --i);
-no_children:
- /* compress one level */
- node_set_parent(n, NULL);
- tnode_free_safe(tn);
- return n;
- }
- return tn;
-}
-
-
static void tnode_clean_free(struct tnode *tn)
{
struct tnode *tofree;
@@ -804,6 +647,160 @@ nomem:
return ERR_PTR(-ENOMEM);
}
+#define MAX_WORK 10
+static struct tnode *resize(struct trie *t, struct tnode *tn)
+{
+ struct tnode *old_tn, *n = NULL;
+ int inflate_threshold_use;
+ int halve_threshold_use;
+ int max_work;
+
+ if (!tn)
+ return NULL;
+
+ pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
+ tn, inflate_threshold, halve_threshold);
+
+ /* No children */
+ if (tn->empty_children > (tnode_child_length(tn) - 1))
+ goto no_children;
+
+ /* One child */
+ if (tn->empty_children == (tnode_child_length(tn) - 1))
+ goto one_child;
+ /*
+ * Double as long as the resulting node has a number of
+ * nonempty nodes that are above the threshold.
+ */
+
+ /*
+ * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
+ * the Helsinki University of Technology and Matti Tikkanen of Nokia
+ * Telecommunications, page 6:
+ * "A node is doubled if the ratio of non-empty children to all
+ * children in the *doubled* node is at least 'high'."
+ *
+ * 'high' in this instance is the variable 'inflate_threshold'. It
+ * is expressed as a percentage, so we multiply it with
+ * tnode_child_length() and instead of multiplying by 2 (since the
+ * child array will be doubled by inflate()) and multiplying
+ * the left-hand side by 100 (to handle the percentage thing) we
+ * multiply the left-hand side by 50.
+ *
+ * The left-hand side may look a bit weird: tnode_child_length(tn)
+ * - tn->empty_children is of course the number of non-null children
+ * in the current node. tn->full_children is the number of "full"
+ * children, that is non-null tnodes with a skip value of 0.
+ * All of those will be doubled in the resulting inflated tnode, so
+ * we just count them one extra time here.
+ *
+ * A clearer way to write this would be:
+ *
+ * to_be_doubled = tn->full_children;
+ * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
+ * tn->full_children;
+ *
+ * new_child_length = tnode_child_length(tn) * 2;
+ *
+ * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
+ * new_child_length;
+ * if (new_fill_factor >= inflate_threshold)
+ *
+ * ...and so on, tho it would mess up the while () loop.
+ *
+ * anyway,
+ * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
+ * inflate_threshold
+ *
+ * avoid a division:
+ * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
+ * inflate_threshold * new_child_length
+ *
+ * expand not_to_be_doubled and to_be_doubled, and shorten:
+ * 100 * (tnode_child_length(tn) - tn->empty_children +
+ * tn->full_children) >= inflate_threshold * new_child_length
+ *
+ * expand new_child_length:
+ * 100 * (tnode_child_length(tn) - tn->empty_children +
+ * tn->full_children) >=
+ * inflate_threshold * tnode_child_length(tn) * 2
+ *
+ * shorten again:
+ * 50 * (tn->full_children + tnode_child_length(tn) -
+ * tn->empty_children) >= inflate_threshold *
+ * tnode_child_length(tn)
+ *
+ */
+
+ /* Keep root node larger */
+
+ if (!node_parent(tn)) {
+ inflate_threshold_use = inflate_threshold_root;
+ halve_threshold_use = halve_threshold_root;
+ } else {
+ inflate_threshold_use = inflate_threshold;
+ halve_threshold_use = halve_threshold;
+ }
+
+ max_work = MAX_WORK;
+ while ((tn->full_children > 0 && max_work-- &&
+ 50 * (tn->full_children + tnode_child_length(tn)
+ - tn->empty_children)
+ >= inflate_threshold_use * tnode_child_length(tn))) {
+
+ old_tn = tn;
+ tn = inflate(t, tn);
+
+ if (IS_ERR(tn)) {
+ tn = old_tn;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(t->stats->resize_node_skipped);
+#endif
+ break;
+ }
+ }
+
+ /* Return if at least one inflate is run */
+ if (max_work != MAX_WORK)
+ return tn;
+
+ /*
+ * Halve as long as the number of empty children in this
+ * node is above threshold.
+ */
+
+ max_work = MAX_WORK;
+ while (tn->bits > 1 && max_work-- &&
+ 100 * (tnode_child_length(tn) - tn->empty_children) <
+ halve_threshold_use * tnode_child_length(tn)) {
+
+ old_tn = tn;
+ tn = halve(t, tn);
+ if (IS_ERR(tn)) {
+ tn = old_tn;
+#ifdef CONFIG_IP_FIB_TRIE_STATS
+ this_cpu_inc(t->stats->resize_node_skipped);
+#endif
+ break;
+ }
+ }
+
+
+ /* Only one child remains */
+ if (tn->empty_children == (tnode_child_length(tn) - 1)) {
+ unsigned long i;
+one_child:
+ for (i = tnode_child_length(tn); !n && i;)
+ n = tnode_get_child(tn, --i);
+no_children:
+ /* compress one level */
+ node_set_parent(n, NULL);
+ tnode_free_safe(tn);
+ return n;
+ }
+ return tn;
+}
+
/* readside must use rcu_read_lock currently dump routines
via get_fa_head and dump */

View file

@ -0,0 +1,250 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:56:37 -0800
Subject: [PATCH] fib_trie: Add functions should_inflate and should_halve
This change pulls the logic for if we should inflate/halve the nodes out
into separate functions. It also addresses what I believe is a bug where 1
full node is all that is needed to keep a node from ever being halved.
Simple script to reproduce the issue:
modprobe dummy; ifconfig dummy0 up
for i in `seq 0 255`; do ifconfig dummy0:$i 10.0.${i}.1/24 up; done
ifconfig dummy0:256 10.0.255.33/16 up
for i in `seq 0 254`; do ifconfig dummy0:$i down; done
Results from /proc/net/fib_triestat
Before:
Local:
Aver depth: 3.00
Max depth: 4
Leaves: 17
Prefixes: 18
Internal nodes: 11
1: 8 2: 2 10: 1
Pointers: 1048
Null ptrs: 1021
Total size: 11 kB
After:
Local:
Aver depth: 3.41
Max depth: 5
Leaves: 17
Prefixes: 18
Internal nodes: 12
1: 8 2: 3 3: 1
Pointers: 36
Null ptrs: 8
Total size: 3 kB
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -647,12 +647,94 @@ nomem:
return ERR_PTR(-ENOMEM);
}
+/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
+ * the Helsinki University of Technology and Matti Tikkanen of Nokia
+ * Telecommunications, page 6:
+ * "A node is doubled if the ratio of non-empty children to all
+ * children in the *doubled* node is at least 'high'."
+ *
+ * 'high' in this instance is the variable 'inflate_threshold'. It
+ * is expressed as a percentage, so we multiply it with
+ * tnode_child_length() and instead of multiplying by 2 (since the
+ * child array will be doubled by inflate()) and multiplying
+ * the left-hand side by 100 (to handle the percentage thing) we
+ * multiply the left-hand side by 50.
+ *
+ * The left-hand side may look a bit weird: tnode_child_length(tn)
+ * - tn->empty_children is of course the number of non-null children
+ * in the current node. tn->full_children is the number of "full"
+ * children, that is non-null tnodes with a skip value of 0.
+ * All of those will be doubled in the resulting inflated tnode, so
+ * we just count them one extra time here.
+ *
+ * A clearer way to write this would be:
+ *
+ * to_be_doubled = tn->full_children;
+ * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
+ * tn->full_children;
+ *
+ * new_child_length = tnode_child_length(tn) * 2;
+ *
+ * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
+ * new_child_length;
+ * if (new_fill_factor >= inflate_threshold)
+ *
+ * ...and so on, tho it would mess up the while () loop.
+ *
+ * anyway,
+ * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
+ * inflate_threshold
+ *
+ * avoid a division:
+ * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
+ * inflate_threshold * new_child_length
+ *
+ * expand not_to_be_doubled and to_be_doubled, and shorten:
+ * 100 * (tnode_child_length(tn) - tn->empty_children +
+ * tn->full_children) >= inflate_threshold * new_child_length
+ *
+ * expand new_child_length:
+ * 100 * (tnode_child_length(tn) - tn->empty_children +
+ * tn->full_children) >=
+ * inflate_threshold * tnode_child_length(tn) * 2
+ *
+ * shorten again:
+ * 50 * (tn->full_children + tnode_child_length(tn) -
+ * tn->empty_children) >= inflate_threshold *
+ * tnode_child_length(tn)
+ *
+ */
+static bool should_inflate(const struct tnode *tn)
+{
+ unsigned long used = tnode_child_length(tn);
+ unsigned long threshold = used;
+
+ /* Keep root node larger */
+ threshold *= node_parent(tn) ? inflate_threshold :
+ inflate_threshold_root;
+ used += tn->full_children;
+ used -= tn->empty_children;
+
+ return tn->pos && ((50 * used) >= threshold);
+}
+
+static bool should_halve(const struct tnode *tn)
+{
+ unsigned long used = tnode_child_length(tn);
+ unsigned long threshold = used;
+
+ /* Keep root node larger */
+ threshold *= node_parent(tn) ? halve_threshold :
+ halve_threshold_root;
+ used -= tn->empty_children;
+
+ return (tn->bits > 1) && ((100 * used) < threshold);
+}
+
#define MAX_WORK 10
static struct tnode *resize(struct trie *t, struct tnode *tn)
{
struct tnode *old_tn, *n = NULL;
- int inflate_threshold_use;
- int halve_threshold_use;
int max_work;
if (!tn)
@@ -668,86 +750,12 @@ static struct tnode *resize(struct trie
/* One child */
if (tn->empty_children == (tnode_child_length(tn) - 1))
goto one_child;
- /*
- * Double as long as the resulting node has a number of
- * nonempty nodes that are above the threshold.
- */
- /*
- * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
- * the Helsinki University of Technology and Matti Tikkanen of Nokia
- * Telecommunications, page 6:
- * "A node is doubled if the ratio of non-empty children to all
- * children in the *doubled* node is at least 'high'."
- *
- * 'high' in this instance is the variable 'inflate_threshold'. It
- * is expressed as a percentage, so we multiply it with
- * tnode_child_length() and instead of multiplying by 2 (since the
- * child array will be doubled by inflate()) and multiplying
- * the left-hand side by 100 (to handle the percentage thing) we
- * multiply the left-hand side by 50.
- *
- * The left-hand side may look a bit weird: tnode_child_length(tn)
- * - tn->empty_children is of course the number of non-null children
- * in the current node. tn->full_children is the number of "full"
- * children, that is non-null tnodes with a skip value of 0.
- * All of those will be doubled in the resulting inflated tnode, so
- * we just count them one extra time here.
- *
- * A clearer way to write this would be:
- *
- * to_be_doubled = tn->full_children;
- * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
- * tn->full_children;
- *
- * new_child_length = tnode_child_length(tn) * 2;
- *
- * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
- * new_child_length;
- * if (new_fill_factor >= inflate_threshold)
- *
- * ...and so on, tho it would mess up the while () loop.
- *
- * anyway,
- * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
- * inflate_threshold
- *
- * avoid a division:
- * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
- * inflate_threshold * new_child_length
- *
- * expand not_to_be_doubled and to_be_doubled, and shorten:
- * 100 * (tnode_child_length(tn) - tn->empty_children +
- * tn->full_children) >= inflate_threshold * new_child_length
- *
- * expand new_child_length:
- * 100 * (tnode_child_length(tn) - tn->empty_children +
- * tn->full_children) >=
- * inflate_threshold * tnode_child_length(tn) * 2
- *
- * shorten again:
- * 50 * (tn->full_children + tnode_child_length(tn) -
- * tn->empty_children) >= inflate_threshold *
- * tnode_child_length(tn)
- *
+ /* Double as long as the resulting node has a number of
+ * nonempty nodes that are above the threshold.
*/
-
- /* Keep root node larger */
-
- if (!node_parent(tn)) {
- inflate_threshold_use = inflate_threshold_root;
- halve_threshold_use = halve_threshold_root;
- } else {
- inflate_threshold_use = inflate_threshold;
- halve_threshold_use = halve_threshold;
- }
-
max_work = MAX_WORK;
- while ((tn->full_children > 0 && max_work-- &&
- 50 * (tn->full_children + tnode_child_length(tn)
- - tn->empty_children)
- >= inflate_threshold_use * tnode_child_length(tn))) {
-
+ while (should_inflate(tn) && max_work--) {
old_tn = tn;
tn = inflate(t, tn);
@@ -764,16 +772,11 @@ static struct tnode *resize(struct trie
if (max_work != MAX_WORK)
return tn;
- /*
- * Halve as long as the number of empty children in this
+ /* Halve as long as the number of empty children in this
* node is above threshold.
*/
-
max_work = MAX_WORK;
- while (tn->bits > 1 && max_work-- &&
- 100 * (tnode_child_length(tn) - tn->empty_children) <
- halve_threshold_use * tnode_child_length(tn)) {
-
+ while (should_halve(tn) && max_work--) {
old_tn = tn;
tn = halve(t, tn);
if (IS_ERR(tn)) {

View file

@ -0,0 +1,336 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:56:43 -0800
Subject: [PATCH] fib_trie: Push assignment of child to parent down into
inflate/halve
This change makes it so that the assignment of the tnode to the parent is
handled directly within whatever function is currently handling the node be
it inflate, halve, or resize. By doing this we can avoid some of the need
to set NULL pointers in the tree while we are resizing the subnodes.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -146,9 +146,7 @@ struct trie {
#endif
};
-static void tnode_put_child_reorg(struct tnode *tn, unsigned long i,
- struct tnode *n, int wasfull);
-static struct tnode *resize(struct trie *t, struct tnode *tn);
+static void resize(struct trie *t, struct tnode *tn);
/* tnodes to free after resize(); protected by RTNL */
static struct callback_head *tnode_free_head;
static size_t tnode_free_size;
@@ -396,22 +394,13 @@ static inline int tnode_full(const struc
return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
}
-static inline void put_child(struct tnode *tn, unsigned long i,
- struct tnode *n)
-{
- tnode_put_child_reorg(tn, i, n, -1);
-}
-
- /*
- * Add a child at position i overwriting the old value.
- * Update the value of full_children and empty_children.
- */
-
-static void tnode_put_child_reorg(struct tnode *tn, unsigned long i,
- struct tnode *n, int wasfull)
+/* Add a child at position i overwriting the old value.
+ * Update the value of full_children and empty_children.
+ */
+static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
{
struct tnode *chi = rtnl_dereference(tn->child[i]);
- int isfull;
+ int isfull, wasfull;
BUG_ON(i >= tnode_child_length(tn));
@@ -422,10 +411,9 @@ static void tnode_put_child_reorg(struct
tn->empty_children--;
/* update fullChildren */
- if (wasfull == -1)
- wasfull = tnode_full(tn, chi);
-
+ wasfull = tnode_full(tn, chi);
isfull = tnode_full(tn, n);
+
if (wasfull && !isfull)
tn->full_children--;
else if (!wasfull && isfull)
@@ -458,9 +446,10 @@ static void tnode_clean_free(struct tnod
node_free(tn);
}
-static struct tnode *inflate(struct trie *t, struct tnode *oldtnode)
+static int inflate(struct trie *t, struct tnode *oldtnode)
{
unsigned long olen = tnode_child_length(oldtnode);
+ struct tnode *tp = node_parent(oldtnode);
struct tnode *tn;
unsigned long i;
t_key m;
@@ -468,9 +457,8 @@ static struct tnode *inflate(struct trie
pr_debug("In inflate\n");
tn = tnode_new(oldtnode->key, oldtnode->pos - 1, oldtnode->bits + 1);
-
if (!tn)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
/*
* Preallocate and store tnodes before the actual work so we
@@ -564,30 +552,36 @@ static struct tnode *inflate(struct trie
put_child(left, j, rtnl_dereference(inode->child[j]));
put_child(right, j, rtnl_dereference(inode->child[j + size]));
}
- put_child(tn, 2*i, resize(t, left));
- put_child(tn, 2*i+1, resize(t, right));
+
+ put_child(tn, 2 * i, left);
+ put_child(tn, 2 * i + 1, right);
tnode_free_safe(inode);
+
+ resize(t, left);
+ resize(t, right);
}
+
+ put_child_root(tp, t, tn->key, tn);
tnode_free_safe(oldtnode);
- return tn;
+ return 0;
nomem:
tnode_clean_free(tn);
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
}
-static struct tnode *halve(struct trie *t, struct tnode *oldtnode)
+static int halve(struct trie *t, struct tnode *oldtnode)
{
unsigned long olen = tnode_child_length(oldtnode);
+ struct tnode *tp = node_parent(oldtnode);
struct tnode *tn, *left, *right;
int i;
pr_debug("In halve\n");
tn = tnode_new(oldtnode->key, oldtnode->pos + 1, oldtnode->bits - 1);
-
if (!tn)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
/*
* Preallocate and store tnodes before the actual work so we
@@ -606,8 +600,10 @@ static struct tnode *halve(struct trie *
newn = tnode_new(left->key, oldtnode->pos, 1);
- if (!newn)
- goto nomem;
+ if (!newn) {
+ tnode_clean_free(tn);
+ return -ENOMEM;
+ }
put_child(tn, i/2, newn);
}
@@ -635,16 +631,18 @@ static struct tnode *halve(struct trie *
/* Two nonempty children */
newBinNode = tnode_get_child(tn, i/2);
- put_child(tn, i/2, NULL);
put_child(newBinNode, 0, left);
put_child(newBinNode, 1, right);
- put_child(tn, i/2, resize(t, newBinNode));
+
+ put_child(tn, i / 2, newBinNode);
+
+ resize(t, newBinNode);
}
+
+ put_child_root(tp, t, tn->key, tn);
tnode_free_safe(oldtnode);
- return tn;
-nomem:
- tnode_clean_free(tn);
- return ERR_PTR(-ENOMEM);
+
+ return 0;
}
/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
@@ -704,45 +702,48 @@ nomem:
* tnode_child_length(tn)
*
*/
-static bool should_inflate(const struct tnode *tn)
+static bool should_inflate(const struct tnode *tp, const struct tnode *tn)
{
unsigned long used = tnode_child_length(tn);
unsigned long threshold = used;
/* Keep root node larger */
- threshold *= node_parent(tn) ? inflate_threshold :
- inflate_threshold_root;
+ threshold *= tp ? inflate_threshold : inflate_threshold_root;
used += tn->full_children;
used -= tn->empty_children;
return tn->pos && ((50 * used) >= threshold);
}
-static bool should_halve(const struct tnode *tn)
+static bool should_halve(const struct tnode *tp, const struct tnode *tn)
{
unsigned long used = tnode_child_length(tn);
unsigned long threshold = used;
/* Keep root node larger */
- threshold *= node_parent(tn) ? halve_threshold :
- halve_threshold_root;
+ threshold *= tp ? halve_threshold : halve_threshold_root;
used -= tn->empty_children;
return (tn->bits > 1) && ((100 * used) < threshold);
}
#define MAX_WORK 10
-static struct tnode *resize(struct trie *t, struct tnode *tn)
+static void resize(struct trie *t, struct tnode *tn)
{
- struct tnode *old_tn, *n = NULL;
+ struct tnode *tp = node_parent(tn), *n = NULL;
+ struct tnode __rcu **cptr;
int max_work;
- if (!tn)
- return NULL;
-
pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
tn, inflate_threshold, halve_threshold);
+ /* track the tnode via the pointer from the parent instead of
+ * doing it ourselves. This way we can let RCU fully do its
+ * thing without us interfering
+ */
+ cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie;
+ BUG_ON(tn != rtnl_dereference(*cptr));
+
/* No children */
if (tn->empty_children > (tnode_child_length(tn) - 1))
goto no_children;
@@ -755,39 +756,35 @@ static struct tnode *resize(struct trie
* nonempty nodes that are above the threshold.
*/
max_work = MAX_WORK;
- while (should_inflate(tn) && max_work--) {
- old_tn = tn;
- tn = inflate(t, tn);
-
- if (IS_ERR(tn)) {
- tn = old_tn;
+ while (should_inflate(tp, tn) && max_work--) {
+ if (inflate(t, tn)) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(t->stats->resize_node_skipped);
#endif
break;
}
+
+ tn = rtnl_dereference(*cptr);
}
/* Return if at least one inflate is run */
if (max_work != MAX_WORK)
- return tn;
+ return;
/* Halve as long as the number of empty children in this
* node is above threshold.
*/
max_work = MAX_WORK;
- while (should_halve(tn) && max_work--) {
- old_tn = tn;
- tn = halve(t, tn);
- if (IS_ERR(tn)) {
- tn = old_tn;
+ while (should_halve(tp, tn) && max_work--) {
+ if (halve(t, tn)) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(t->stats->resize_node_skipped);
#endif
break;
}
- }
+ tn = rtnl_dereference(*cptr);
+ }
/* Only one child remains */
if (tn->empty_children == (tnode_child_length(tn) - 1)) {
@@ -797,11 +794,12 @@ one_child:
n = tnode_get_child(tn, --i);
no_children:
/* compress one level */
- node_set_parent(n, NULL);
+ put_child_root(tp, t, tn->key, n);
+ node_set_parent(n, tp);
+
+ /* drop dead node */
tnode_free_safe(tn);
- return n;
}
- return tn;
}
/* readside must use rcu_read_lock currently dump routines
@@ -882,34 +880,19 @@ static struct tnode *fib_find_node(struc
static void trie_rebalance(struct trie *t, struct tnode *tn)
{
- int wasfull;
- t_key cindex, key;
struct tnode *tp;
- key = tn->key;
-
- while (tn != NULL && (tp = node_parent(tn)) != NULL) {
- cindex = get_index(key, tp);
- wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
- tn = resize(t, tn);
-
- tnode_put_child_reorg(tp, cindex, tn, wasfull);
-
- tp = node_parent(tn);
- if (!tp)
- rcu_assign_pointer(t->trie, tn);
+ while ((tp = node_parent(tn)) != NULL) {
+ resize(t, tn);
tnode_free_flush();
- if (!tp)
- break;
tn = tp;
}
/* Handle last (top) tnode */
if (IS_TNODE(tn))
- tn = resize(t, tn);
+ resize(t, tn);
- rcu_assign_pointer(t->trie, tn);
tnode_free_flush();
}

View file

@ -0,0 +1,237 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:56:49 -0800
Subject: [PATCH] fib_trie: Push tnode flushing down to inflate/halve
This change pushes the tnode freeing down into the inflate and halve
functions. It makes more sense here as we have a better grasp of what is
going on and when a given cluster of nodes is ready to be freed.
I believe this may address a bug in the freeing logic as well. For some
reason if the freelist got to a certain size we would call
synchronize_rcu(). I'm assuming that what they meant to do is call
synchronize_rcu() after they had handed off that much memory via
call_rcu(). As such that is what I have updated the behavior to be.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -147,8 +147,6 @@ struct trie {
};
static void resize(struct trie *t, struct tnode *tn);
-/* tnodes to free after resize(); protected by RTNL */
-static struct callback_head *tnode_free_head;
static size_t tnode_free_size;
/*
@@ -307,32 +305,6 @@ static struct tnode *tnode_alloc(size_t
return vzalloc(size);
}
-static void tnode_free_safe(struct tnode *tn)
-{
- BUG_ON(IS_LEAF(tn));
- tn->rcu.next = tnode_free_head;
- tnode_free_head = &tn->rcu;
-}
-
-static void tnode_free_flush(void)
-{
- struct callback_head *head;
-
- while ((head = tnode_free_head)) {
- struct tnode *tn = container_of(head, struct tnode, rcu);
-
- tnode_free_head = head->next;
- tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
-
- node_free(tn);
- }
-
- if (tnode_free_size >= PAGE_SIZE * sync_pages) {
- tnode_free_size = 0;
- synchronize_rcu();
- }
-}
-
static struct tnode *leaf_new(t_key key)
{
struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
@@ -433,17 +405,33 @@ static void put_child_root(struct tnode
rcu_assign_pointer(t->trie, n);
}
-static void tnode_clean_free(struct tnode *tn)
+static inline void tnode_free_init(struct tnode *tn)
{
- struct tnode *tofree;
- unsigned long i;
+ tn->rcu.next = NULL;
+}
+
+static inline void tnode_free_append(struct tnode *tn, struct tnode *n)
+{
+ n->rcu.next = tn->rcu.next;
+ tn->rcu.next = &n->rcu;
+}
- for (i = 0; i < tnode_child_length(tn); i++) {
- tofree = tnode_get_child(tn, i);
- if (tofree)
- node_free(tofree);
+static void tnode_free(struct tnode *tn)
+{
+ struct callback_head *head = &tn->rcu;
+
+ while (head) {
+ head = head->next;
+ tnode_free_size += offsetof(struct tnode, child[1 << tn->bits]);
+ node_free(tn);
+
+ tn = container_of(head, struct tnode, rcu);
+ }
+
+ if (tnode_free_size >= PAGE_SIZE * sync_pages) {
+ tnode_free_size = 0;
+ synchronize_rcu();
}
- node_free(tn);
}
static int inflate(struct trie *t, struct tnode *oldtnode)
@@ -476,20 +464,23 @@ static int inflate(struct trie *t, struc
inode->bits - 1);
if (!left)
goto nomem;
+ tnode_free_append(tn, left);
right = tnode_new(inode->key | m, inode->pos,
inode->bits - 1);
- if (!right) {
- node_free(left);
+ if (!right)
goto nomem;
- }
+ tnode_free_append(tn, right);
put_child(tn, 2*i, left);
put_child(tn, 2*i+1, right);
}
}
+ /* prepare oldtnode to be freed */
+ tnode_free_init(oldtnode);
+
for (i = 0; i < olen; i++) {
struct tnode *inode = tnode_get_child(oldtnode, i);
struct tnode *left, *right;
@@ -505,12 +496,13 @@ static int inflate(struct trie *t, struc
continue;
}
+ /* drop the node in the old tnode free list */
+ tnode_free_append(oldtnode, inode);
+
/* An internal node with two children */
if (inode->bits == 1) {
put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
-
- tnode_free_safe(inode);
continue;
}
@@ -556,17 +548,19 @@ static int inflate(struct trie *t, struc
put_child(tn, 2 * i, left);
put_child(tn, 2 * i + 1, right);
- tnode_free_safe(inode);
-
+ /* resize child nodes */
resize(t, left);
resize(t, right);
}
put_child_root(tp, t, tn->key, tn);
- tnode_free_safe(oldtnode);
+
+ /* we completed without error, prepare to free old node */
+ tnode_free(oldtnode);
return 0;
nomem:
- tnode_clean_free(tn);
+ /* all pointers should be clean so we are done */
+ tnode_free(tn);
return -ENOMEM;
}
@@ -599,17 +593,20 @@ static int halve(struct trie *t, struct
struct tnode *newn;
newn = tnode_new(left->key, oldtnode->pos, 1);
-
if (!newn) {
- tnode_clean_free(tn);
+ tnode_free(tn);
return -ENOMEM;
}
+ tnode_free_append(tn, newn);
put_child(tn, i/2, newn);
}
}
+ /* prepare oldtnode to be freed */
+ tnode_free_init(oldtnode);
+
for (i = 0; i < olen; i += 2) {
struct tnode *newBinNode;
@@ -636,11 +633,14 @@ static int halve(struct trie *t, struct
put_child(tn, i / 2, newBinNode);
+ /* resize child node */
resize(t, newBinNode);
}
put_child_root(tp, t, tn->key, tn);
- tnode_free_safe(oldtnode);
+
+ /* all pointers should be clean so we are done */
+ tnode_free(oldtnode);
return 0;
}
@@ -798,7 +798,8 @@ no_children:
node_set_parent(n, tp);
/* drop dead node */
- tnode_free_safe(tn);
+ tnode_free_init(tn);
+ tnode_free(tn);
}
}
@@ -884,16 +885,12 @@ static void trie_rebalance(struct trie *
while ((tp = node_parent(tn)) != NULL) {
resize(t, tn);
-
- tnode_free_flush();
tn = tp;
}
/* Handle last (top) tnode */
if (IS_TNODE(tn))
resize(t, tn);
-
- tnode_free_flush();
}
/* only used from updater-side */

View file

@ -0,0 +1,345 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:56:55 -0800
Subject: [PATCH] fib_trie: inflate/halve nodes in a more RCU friendly
way
This change pulls the node_set_parent functionality out of put_child_reorg
and instead leaves that to the function to take care of as well. By doing
this we can fully construct the new cluster of tnodes and all of the
pointers out of it before we start routing pointers into it.
I am suspecting this will likely fix some concurency issues though I don't
have a good test to show as such.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -391,8 +391,6 @@ static void put_child(struct tnode *tn,
else if (!wasfull && isfull)
tn->full_children++;
- node_set_parent(n, tn);
-
rcu_assign_pointer(tn->child[i], n);
}
@@ -436,10 +434,8 @@ static void tnode_free(struct tnode *tn)
static int inflate(struct trie *t, struct tnode *oldtnode)
{
- unsigned long olen = tnode_child_length(oldtnode);
- struct tnode *tp = node_parent(oldtnode);
- struct tnode *tn;
- unsigned long i;
+ struct tnode *inode, *node0, *node1, *tn, *tp;
+ unsigned long i, j, k;
t_key m;
pr_debug("In inflate\n");
@@ -448,43 +444,13 @@ static int inflate(struct trie *t, struc
if (!tn)
return -ENOMEM;
- /*
- * Preallocate and store tnodes before the actual work so we
- * don't get into an inconsistent state if memory allocation
- * fails. In case of failure we return the oldnode and inflate
- * of tnode is ignored.
+ /* Assemble all of the pointers in our cluster, in this case that
+ * represents all of the pointers out of our allocated nodes that
+ * point to existing tnodes and the links between our allocated
+ * nodes.
*/
- for (i = 0, m = 1u << tn->pos; i < olen; i++) {
- struct tnode *inode = tnode_get_child(oldtnode, i);
-
- if (tnode_full(oldtnode, inode) && (inode->bits > 1)) {
- struct tnode *left, *right;
-
- left = tnode_new(inode->key & ~m, inode->pos,
- inode->bits - 1);
- if (!left)
- goto nomem;
- tnode_free_append(tn, left);
-
- right = tnode_new(inode->key | m, inode->pos,
- inode->bits - 1);
-
- if (!right)
- goto nomem;
- tnode_free_append(tn, right);
-
- put_child(tn, 2*i, left);
- put_child(tn, 2*i+1, right);
- }
- }
-
- /* prepare oldtnode to be freed */
- tnode_free_init(oldtnode);
-
- for (i = 0; i < olen; i++) {
- struct tnode *inode = tnode_get_child(oldtnode, i);
- struct tnode *left, *right;
- unsigned long size, j;
+ for (i = tnode_child_length(oldtnode), m = 1u << tn->pos; i;) {
+ inode = tnode_get_child(oldtnode, --i);
/* An empty child */
if (inode == NULL)
@@ -496,65 +462,99 @@ static int inflate(struct trie *t, struc
continue;
}
- /* drop the node in the old tnode free list */
- tnode_free_append(oldtnode, inode);
-
/* An internal node with two children */
if (inode->bits == 1) {
- put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
- put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
+ put_child(tn, 2 * i + 1, tnode_get_child(inode, 1));
+ put_child(tn, 2 * i, tnode_get_child(inode, 0));
continue;
}
- /* An internal node with more than two children */
-
/* We will replace this node 'inode' with two new
- * ones, 'left' and 'right', each with half of the
+ * ones, 'node0' and 'node1', each with half of the
* original children. The two new nodes will have
* a position one bit further down the key and this
* means that the "significant" part of their keys
* (see the discussion near the top of this file)
* will differ by one bit, which will be "0" in
- * left's key and "1" in right's key. Since we are
+ * node0's key and "1" in node1's key. Since we are
* moving the key position by one step, the bit that
* we are moving away from - the bit at position
- * (inode->pos) - is the one that will differ between
- * left and right. So... we synthesize that bit in the
- * two new keys.
- * The mask 'm' below will be a single "one" bit at
- * the position (inode->pos)
+ * (tn->pos) - is the one that will differ between
+ * node0 and node1. So... we synthesize that bit in the
+ * two new keys.
*/
+ node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
+ if (!node1)
+ goto nomem;
+ tnode_free_append(tn, node1);
+
+ node0 = tnode_new(inode->key & ~m, inode->pos, inode->bits - 1);
+ if (!node0)
+ goto nomem;
+ tnode_free_append(tn, node0);
+
+ /* populate child pointers in new nodes */
+ for (k = tnode_child_length(inode), j = k / 2; j;) {
+ put_child(node1, --j, tnode_get_child(inode, --k));
+ put_child(node0, j, tnode_get_child(inode, j));
+ put_child(node1, --j, tnode_get_child(inode, --k));
+ put_child(node0, j, tnode_get_child(inode, j));
+ }
+
+ /* link new nodes to parent */
+ NODE_INIT_PARENT(node1, tn);
+ NODE_INIT_PARENT(node0, tn);
+
+ /* link parent to nodes */
+ put_child(tn, 2 * i + 1, node1);
+ put_child(tn, 2 * i, node0);
+ }
+
+ /* setup the parent pointer into and out of this node */
+ tp = node_parent(oldtnode);
+ NODE_INIT_PARENT(tn, tp);
+ put_child_root(tp, t, tn->key, tn);
- /* Use the old key, but set the new significant
- * bit to zero.
- */
+ /* prepare oldtnode to be freed */
+ tnode_free_init(oldtnode);
- left = tnode_get_child(tn, 2*i);
- put_child(tn, 2*i, NULL);
+ /* update all child nodes parent pointers to route to us */
+ for (i = tnode_child_length(oldtnode); i;) {
+ inode = tnode_get_child(oldtnode, --i);
- BUG_ON(!left);
+ /* A leaf or an internal node with skipped bits */
+ if (!tnode_full(oldtnode, inode)) {
+ node_set_parent(inode, tn);
+ continue;
+ }
- right = tnode_get_child(tn, 2*i+1);
- put_child(tn, 2*i+1, NULL);
+ /* drop the node in the old tnode free list */
+ tnode_free_append(oldtnode, inode);
- BUG_ON(!right);
+ /* fetch new nodes */
+ node1 = tnode_get_child(tn, 2 * i + 1);
+ node0 = tnode_get_child(tn, 2 * i);
- size = tnode_child_length(left);
- for (j = 0; j < size; j++) {
- put_child(left, j, rtnl_dereference(inode->child[j]));
- put_child(right, j, rtnl_dereference(inode->child[j + size]));
+ /* bits == 1 then node0 and node1 represent inode's children */
+ if (inode->bits == 1) {
+ node_set_parent(node1, tn);
+ node_set_parent(node0, tn);
+ continue;
}
- put_child(tn, 2 * i, left);
- put_child(tn, 2 * i + 1, right);
+ /* update parent pointers in child node's children */
+ for (k = tnode_child_length(inode), j = k / 2; j;) {
+ node_set_parent(tnode_get_child(inode, --k), node1);
+ node_set_parent(tnode_get_child(inode, --j), node0);
+ node_set_parent(tnode_get_child(inode, --k), node1);
+ node_set_parent(tnode_get_child(inode, --j), node0);
+ }
/* resize child nodes */
- resize(t, left);
- resize(t, right);
+ resize(t, node1);
+ resize(t, node0);
}
- put_child_root(tp, t, tn->key, tn);
-
/* we completed without error, prepare to free old node */
tnode_free(oldtnode);
return 0;
@@ -566,10 +566,8 @@ nomem:
static int halve(struct trie *t, struct tnode *oldtnode)
{
- unsigned long olen = tnode_child_length(oldtnode);
- struct tnode *tp = node_parent(oldtnode);
- struct tnode *tn, *left, *right;
- int i;
+ struct tnode *tn, *tp, *inode, *node0, *node1;
+ unsigned long i;
pr_debug("In halve\n");
@@ -577,68 +575,64 @@ static int halve(struct trie *t, struct
if (!tn)
return -ENOMEM;
- /*
- * Preallocate and store tnodes before the actual work so we
- * don't get into an inconsistent state if memory allocation
- * fails. In case of failure we return the oldnode and halve
- * of tnode is ignored.
+ /* Assemble all of the pointers in our cluster, in this case that
+ * represents all of the pointers out of our allocated nodes that
+ * point to existing tnodes and the links between our allocated
+ * nodes.
*/
+ for (i = tnode_child_length(oldtnode); i;) {
+ node1 = tnode_get_child(oldtnode, --i);
+ node0 = tnode_get_child(oldtnode, --i);
- for (i = 0; i < olen; i += 2) {
- left = tnode_get_child(oldtnode, i);
- right = tnode_get_child(oldtnode, i+1);
+ /* At least one of the children is empty */
+ if (!node1 || !node0) {
+ put_child(tn, i / 2, node1 ? : node0);
+ continue;
+ }
/* Two nonempty children */
- if (left && right) {
- struct tnode *newn;
-
- newn = tnode_new(left->key, oldtnode->pos, 1);
- if (!newn) {
- tnode_free(tn);
- return -ENOMEM;
- }
- tnode_free_append(tn, newn);
-
- put_child(tn, i/2, newn);
+ inode = tnode_new(node0->key, oldtnode->pos, 1);
+ if (!inode) {
+ tnode_free(tn);
+ return -ENOMEM;
}
+ tnode_free_append(tn, inode);
+ /* initialize pointers out of node */
+ put_child(inode, 1, node1);
+ put_child(inode, 0, node0);
+ NODE_INIT_PARENT(inode, tn);
+
+ /* link parent to node */
+ put_child(tn, i / 2, inode);
}
+ /* setup the parent pointer out of and back into this node */
+ tp = node_parent(oldtnode);
+ NODE_INIT_PARENT(tn, tp);
+ put_child_root(tp, t, tn->key, tn);
+
/* prepare oldtnode to be freed */
tnode_free_init(oldtnode);
- for (i = 0; i < olen; i += 2) {
- struct tnode *newBinNode;
-
- left = tnode_get_child(oldtnode, i);
- right = tnode_get_child(oldtnode, i+1);
-
- /* At least one of the children is empty */
- if (left == NULL) {
- if (right == NULL) /* Both are empty */
- continue;
- put_child(tn, i/2, right);
- continue;
- }
-
- if (right == NULL) {
- put_child(tn, i/2, left);
+ /* update all of the child parent pointers */
+ for (i = tnode_child_length(tn); i;) {
+ inode = tnode_get_child(tn, --i);
+
+ /* only new tnodes will be considered "full" nodes */
+ if (!tnode_full(tn, inode)) {
+ node_set_parent(inode, tn);
continue;
}
/* Two nonempty children */
- newBinNode = tnode_get_child(tn, i/2);
- put_child(newBinNode, 0, left);
- put_child(newBinNode, 1, right);
-
- put_child(tn, i / 2, newBinNode);
+ node_set_parent(tnode_get_child(inode, 1), inode);
+ node_set_parent(tnode_get_child(inode, 0), inode);
/* resize child node */
- resize(t, newBinNode);
+ resize(t, inode);
}
- put_child_root(tp, t, tn->key, tn);
-
/* all pointers should be clean so we are done */
tnode_free(oldtnode);

View file

@ -0,0 +1,95 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:57:02 -0800
Subject: [PATCH] fib_trie: Remove checks for index >= tnode_child_length
from tnode_get_child
For some reason the compiler doesn't seem to understand that when we are in
a loop that runs from tnode_child_length - 1 to 0 we don't expect the value
of tn->bits to change. As such every call to tnode_get_child was rerunning
tnode_chile_length which ended up consuming quite a bit of space in the
resultant assembly code.
I have gone though and verified that in all cases where tnode_get_child
is used we are either winding though a fixed loop from tnode_child_length -
1 to 0, or are in a fastpath case where we are verifying the value by
either checking for any remaining bits after shifting index by bits and
testing for leaf, or by using tnode_child_length.
size net/ipv4/fib_trie.o
Before:
text data bss dec hex filename
15506 376 8 15890 3e12 net/ipv4/fib_trie.o
After:
text data bss dec hex filename
14827 376 8 15211 3b6b net/ipv4/fib_trie.o
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -186,8 +186,6 @@ static inline unsigned long tnode_child_
static inline struct tnode *tnode_get_child(const struct tnode *tn,
unsigned long i)
{
- BUG_ON(i >= tnode_child_length(tn));
-
return rtnl_dereference(tn->child[i]);
}
@@ -195,8 +193,6 @@ static inline struct tnode *tnode_get_ch
static inline struct tnode *tnode_get_child_rcu(const struct tnode *tn,
unsigned long i)
{
- BUG_ON(i >= tnode_child_length(tn));
-
return rcu_dereference_rtnl(tn->child[i]);
}
@@ -371,7 +367,7 @@ static inline int tnode_full(const struc
*/
static void put_child(struct tnode *tn, unsigned long i, struct tnode *n)
{
- struct tnode *chi = rtnl_dereference(tn->child[i]);
+ struct tnode *chi = tnode_get_child(tn, i);
int isfull, wasfull;
BUG_ON(i >= tnode_child_length(tn));
@@ -867,7 +863,7 @@ static struct tnode *fib_find_node(struc
if (IS_LEAF(n))
break;
- n = rcu_dereference_rtnl(n->child[index]);
+ n = tnode_get_child_rcu(n, index);
}
return n;
@@ -934,7 +930,7 @@ static struct list_head *fib_insert_node
}
tp = n;
- n = rcu_dereference_rtnl(n->child[index]);
+ n = tnode_get_child_rcu(n, index);
}
l = leaf_new(key);
@@ -1215,7 +1211,7 @@ int fib_table_lookup(struct fib_table *t
cindex = index;
}
- n = rcu_dereference(n->child[index]);
+ n = tnode_get_child_rcu(n, index);
if (unlikely(!n))
goto backtrace;
}
@@ -1835,7 +1831,7 @@ static void trie_collect_stats(struct tr
if (n->bits < MAX_STAT_DEPTH)
s->nodesizes[n->bits]++;
- for (i = 0; i < tnode_child_length(n); i++) {
+ for (i = tnode_child_length(n); i--;) {
if (!rcu_access_pointer(n->child[i]))
s->nullpointers++;
}

View file

@ -0,0 +1,234 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Wed, 31 Dec 2014 10:57:08 -0800
Subject: [PATCH] fib_trie: Add tracking value for suffix length
This change adds a tracking value for the maximum suffix length of all
prefixes stored in any given tnode. With this value we can determine if we
need to backtrace or not based on if the suffix is greater than the pos
value.
By doing this we can reduce the CPU overhead for lookups in the local table
as many of the prefixes there are 32b long and have a suffix length of 0
meaning we can immediately backtrace to the root node without needing to
test any of the nodes between it and where we ended up.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -96,6 +96,7 @@ struct tnode {
t_key key;
unsigned char bits; /* 2log(KEYLENGTH) bits needed */
unsigned char pos; /* 2log(KEYLENGTH) bits needed */
+ unsigned char slen;
struct tnode __rcu *parent;
struct rcu_head rcu;
union {
@@ -311,6 +312,7 @@ static struct tnode *leaf_new(t_key key)
* as the nodes are searched
*/
l->key = key;
+ l->slen = 0;
l->pos = 0;
/* set bits to 0 indicating we are not a tnode */
l->bits = 0;
@@ -342,6 +344,7 @@ static struct tnode *tnode_new(t_key key
if (tn) {
tn->parent = NULL;
+ tn->slen = pos;
tn->pos = pos;
tn->bits = bits;
tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
@@ -387,6 +390,9 @@ static void put_child(struct tnode *tn,
else if (!wasfull && isfull)
tn->full_children++;
+ if (n && (tn->slen < n->slen))
+ tn->slen = n->slen;
+
rcu_assign_pointer(tn->child[i], n);
}
@@ -635,6 +641,41 @@ static int halve(struct trie *t, struct
return 0;
}
+static unsigned char update_suffix(struct tnode *tn)
+{
+ unsigned char slen = tn->pos;
+ unsigned long stride, i;
+
+ /* search though the list of children looking for nodes that might
+ * have a suffix greater than the one we currently have. This is
+ * why we start with a stride of 2 since a stride of 1 would
+ * represent the nodes with suffix length equal to tn->pos
+ */
+ for (i = 0, stride = 0x2ul ; i < tnode_child_length(tn); i += stride) {
+ struct tnode *n = tnode_get_child(tn, i);
+
+ if (!n || (n->slen <= slen))
+ continue;
+
+ /* update stride and slen based on new value */
+ stride <<= (n->slen - slen);
+ slen = n->slen;
+ i &= ~(stride - 1);
+
+ /* if slen covers all but the last bit we can stop here
+ * there will be nothing longer than that since only node
+ * 0 and 1 << (bits - 1) could have that as their suffix
+ * length.
+ */
+ if ((slen + 1) >= (tn->pos + tn->bits))
+ break;
+ }
+
+ tn->slen = slen;
+
+ return slen;
+}
+
/* From "Implementing a dynamic compressed trie" by Stefan Nilsson of
* the Helsinki University of Technology and Matti Tikkanen of Nokia
* Telecommunications, page 6:
@@ -790,6 +831,19 @@ no_children:
/* drop dead node */
tnode_free_init(tn);
tnode_free(tn);
+ return;
+ }
+
+ /* Return if at least one deflate was run */
+ if (max_work != MAX_WORK)
+ return;
+
+ /* push the suffix length to the parent node */
+ if (tn->slen > tn->pos) {
+ unsigned char slen = update_suffix(tn);
+
+ if (tp && (slen > tp->slen))
+ tp->slen = slen;
}
}
@@ -818,8 +872,58 @@ static inline struct list_head *get_fa_h
return &li->falh;
}
-static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
+static void leaf_pull_suffix(struct tnode *l)
{
+ struct tnode *tp = node_parent(l);
+
+ while (tp && (tp->slen > tp->pos) && (tp->slen > l->slen)) {
+ if (update_suffix(tp) > l->slen)
+ break;
+ tp = node_parent(tp);
+ }
+}
+
+static void leaf_push_suffix(struct tnode *l)
+{
+ struct tnode *tn = node_parent(l);
+
+ /* if this is a new leaf then tn will be NULL and we can sort
+ * out parent suffix lengths as a part of trie_rebalance
+ */
+ while (tn && (tn->slen < l->slen)) {
+ tn->slen = l->slen;
+ tn = node_parent(tn);
+ }
+}
+
+static void remove_leaf_info(struct tnode *l, struct leaf_info *old)
+{
+ struct hlist_node *prev;
+
+ /* record the location of the pointer to this object */
+ prev = rtnl_dereference(hlist_pprev_rcu(&old->hlist));
+
+ /* remove the leaf info from the list */
+ hlist_del_rcu(&old->hlist);
+
+ /* if we emptied the list this leaf will be freed and we can sort
+ * out parent suffix lengths as a part of trie_rebalance
+ */
+ if (hlist_empty(&l->list))
+ return;
+
+ /* if we removed the tail then we need to update slen */
+ if (!rcu_access_pointer(hlist_next_rcu(prev))) {
+ struct leaf_info *li = hlist_entry(prev, typeof(*li), hlist);
+
+ l->slen = KEYLENGTH - li->plen;
+ leaf_pull_suffix(l);
+ }
+}
+
+static void insert_leaf_info(struct tnode *l, struct leaf_info *new)
+{
+ struct hlist_head *head = &l->list;
struct leaf_info *li = NULL, *last = NULL;
if (hlist_empty(head)) {
@@ -836,6 +940,12 @@ static void insert_leaf_info(struct hlis
else
hlist_add_before_rcu(&new->hlist, &li->hlist);
}
+
+ /* if we added to the tail node then we need to update slen */
+ if (!rcu_access_pointer(hlist_next_rcu(&new->hlist))) {
+ l->slen = KEYLENGTH - new->plen;
+ leaf_push_suffix(l);
+ }
}
/* rcu_read_lock needs to be hold by caller from readside */
@@ -925,7 +1035,7 @@ static struct list_head *fib_insert_node
/* we have found a leaf. Prefixes have already been compared */
if (IS_LEAF(n)) {
/* Case 1: n is a leaf, and prefixes match*/
- insert_leaf_info(&n->list, li);
+ insert_leaf_info(n, li);
return fa_head;
}
@@ -939,7 +1049,7 @@ static struct list_head *fib_insert_node
return NULL;
}
- insert_leaf_info(&l->list, li);
+ insert_leaf_info(l, li);
/* Case 2: n is a LEAF or a TNODE and the key doesn't match.
*
@@ -1206,7 +1316,7 @@ int fib_table_lookup(struct fib_table *t
/* only record pn and cindex if we are going to be chopping
* bits later. Otherwise we are just wasting cycles.
*/
- if (index) {
+ if (n->slen > n->pos) {
pn = n;
cindex = index;
}
@@ -1225,7 +1335,7 @@ int fib_table_lookup(struct fib_table *t
* between the key and the prefix exist in the region of
* the lsb and higher in the prefix.
*/
- if (unlikely(prefix_mismatch(key, n)))
+ if (unlikely(prefix_mismatch(key, n)) || (n->slen == n->pos))
goto backtrace;
/* exit out and process leaf */
@@ -1425,7 +1535,7 @@ int fib_table_delete(struct fib_table *t
tb->tb_num_default--;
if (list_empty(fa_head)) {
- hlist_del_rcu(&li->hlist);
+ remove_leaf_info(l, li);
free_leaf_info(li);
}

View file

@ -0,0 +1,267 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Thu, 22 Jan 2015 15:51:14 -0800
Subject: [PATCH] fib_trie: Fix RCU bug and merge similar bits of inflate/halve
This patch addresses two issues.
The first issue is the fact that I believe I had the RCU freeing sequence
slightly out of order. As a result we could get into an issue if a caller
went into a child of a child of the new node, then backtraced into the to be
freed parent, and then attempted to access a child of a child that may have
been consumed in a resize of one of the new nodes children. To resolve this I
have moved the resize after we have freed the oldtnode. The only side effect
of this is that we will now be calling resize on more nodes in the case of
inflate due to the fact that we don't have a good way to test to see if a
full_tnode on the new node was there before or after the allocation. This
should have minimal impact however since the node should already be
correctly size so it is just the cost of calling should_inflate that we
will be taking on the node which is only a couple of cycles.
The second issue is the fact that inflate and halve were essentially doing
the same thing after the new node was added to the trie replacing the old
one. As such it wasn't really necessary to keep the code in both functions
so I have split it out into two other functions, called replace and
update_children.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -396,8 +396,30 @@ static void put_child(struct tnode *tn,
rcu_assign_pointer(tn->child[i], n);
}
-static void put_child_root(struct tnode *tp, struct trie *t,
- t_key key, struct tnode *n)
+static void update_children(struct tnode *tn)
+{
+ unsigned long i;
+
+ /* update all of the child parent pointers */
+ for (i = tnode_child_length(tn); i;) {
+ struct tnode *inode = tnode_get_child(tn, --i);
+
+ if (!inode)
+ continue;
+
+ /* Either update the children of a tnode that
+ * already belongs to us or update the child
+ * to point to ourselves.
+ */
+ if (node_parent(inode) == tn)
+ update_children(inode);
+ else
+ node_set_parent(inode, tn);
+ }
+}
+
+static inline void put_child_root(struct tnode *tp, struct trie *t,
+ t_key key, struct tnode *n)
{
if (tp)
put_child(tp, get_index(key, tp), n);
@@ -434,10 +456,35 @@ static void tnode_free(struct tnode *tn)
}
}
+static void replace(struct trie *t, struct tnode *oldtnode, struct tnode *tn)
+{
+ struct tnode *tp = node_parent(oldtnode);
+ unsigned long i;
+
+ /* setup the parent pointer out of and back into this node */
+ NODE_INIT_PARENT(tn, tp);
+ put_child_root(tp, t, tn->key, tn);
+
+ /* update all of the child parent pointers */
+ update_children(tn);
+
+ /* all pointers should be clean so we are done */
+ tnode_free(oldtnode);
+
+ /* resize children now that oldtnode is freed */
+ for (i = tnode_child_length(tn); i;) {
+ struct tnode *inode = tnode_get_child(tn, --i);
+
+ /* resize child node */
+ if (tnode_full(tn, inode))
+ resize(t, inode);
+ }
+}
+
static int inflate(struct trie *t, struct tnode *oldtnode)
{
- struct tnode *inode, *node0, *node1, *tn, *tp;
- unsigned long i, j, k;
+ struct tnode *tn;
+ unsigned long i;
t_key m;
pr_debug("In inflate\n");
@@ -446,13 +493,18 @@ static int inflate(struct trie *t, struc
if (!tn)
return -ENOMEM;
+ /* prepare oldtnode to be freed */
+ tnode_free_init(oldtnode);
+
/* Assemble all of the pointers in our cluster, in this case that
* represents all of the pointers out of our allocated nodes that
* point to existing tnodes and the links between our allocated
* nodes.
*/
for (i = tnode_child_length(oldtnode), m = 1u << tn->pos; i;) {
- inode = tnode_get_child(oldtnode, --i);
+ struct tnode *inode = tnode_get_child(oldtnode, --i);
+ struct tnode *node0, *node1;
+ unsigned long j, k;
/* An empty child */
if (inode == NULL)
@@ -464,6 +516,9 @@ static int inflate(struct trie *t, struc
continue;
}
+ /* drop the node in the old tnode free list */
+ tnode_free_append(oldtnode, inode);
+
/* An internal node with two children */
if (inode->bits == 1) {
put_child(tn, 2 * i + 1, tnode_get_child(inode, 1));
@@ -488,9 +543,9 @@ static int inflate(struct trie *t, struc
node1 = tnode_new(inode->key | m, inode->pos, inode->bits - 1);
if (!node1)
goto nomem;
- tnode_free_append(tn, node1);
+ node0 = tnode_new(inode->key, inode->pos, inode->bits - 1);
- node0 = tnode_new(inode->key & ~m, inode->pos, inode->bits - 1);
+ tnode_free_append(tn, node1);
if (!node0)
goto nomem;
tnode_free_append(tn, node0);
@@ -512,53 +567,9 @@ static int inflate(struct trie *t, struc
put_child(tn, 2 * i, node0);
}
- /* setup the parent pointer into and out of this node */
- tp = node_parent(oldtnode);
- NODE_INIT_PARENT(tn, tp);
- put_child_root(tp, t, tn->key, tn);
-
- /* prepare oldtnode to be freed */
- tnode_free_init(oldtnode);
-
- /* update all child nodes parent pointers to route to us */
- for (i = tnode_child_length(oldtnode); i;) {
- inode = tnode_get_child(oldtnode, --i);
-
- /* A leaf or an internal node with skipped bits */
- if (!tnode_full(oldtnode, inode)) {
- node_set_parent(inode, tn);
- continue;
- }
-
- /* drop the node in the old tnode free list */
- tnode_free_append(oldtnode, inode);
-
- /* fetch new nodes */
- node1 = tnode_get_child(tn, 2 * i + 1);
- node0 = tnode_get_child(tn, 2 * i);
+ /* setup the parent pointers into and out of this node */
+ replace(t, oldtnode, tn);
- /* bits == 1 then node0 and node1 represent inode's children */
- if (inode->bits == 1) {
- node_set_parent(node1, tn);
- node_set_parent(node0, tn);
- continue;
- }
-
- /* update parent pointers in child node's children */
- for (k = tnode_child_length(inode), j = k / 2; j;) {
- node_set_parent(tnode_get_child(inode, --k), node1);
- node_set_parent(tnode_get_child(inode, --j), node0);
- node_set_parent(tnode_get_child(inode, --k), node1);
- node_set_parent(tnode_get_child(inode, --j), node0);
- }
-
- /* resize child nodes */
- resize(t, node1);
- resize(t, node0);
- }
-
- /* we completed without error, prepare to free old node */
- tnode_free(oldtnode);
return 0;
nomem:
/* all pointers should be clean so we are done */
@@ -568,7 +579,7 @@ nomem:
static int halve(struct trie *t, struct tnode *oldtnode)
{
- struct tnode *tn, *tp, *inode, *node0, *node1;
+ struct tnode *tn;
unsigned long i;
pr_debug("In halve\n");
@@ -577,14 +588,18 @@ static int halve(struct trie *t, struct
if (!tn)
return -ENOMEM;
+ /* prepare oldtnode to be freed */
+ tnode_free_init(oldtnode);
+
/* Assemble all of the pointers in our cluster, in this case that
* represents all of the pointers out of our allocated nodes that
* point to existing tnodes and the links between our allocated
* nodes.
*/
for (i = tnode_child_length(oldtnode); i;) {
- node1 = tnode_get_child(oldtnode, --i);
- node0 = tnode_get_child(oldtnode, --i);
+ struct tnode *node1 = tnode_get_child(oldtnode, --i);
+ struct tnode *node0 = tnode_get_child(oldtnode, --i);
+ struct tnode *inode;
/* At least one of the children is empty */
if (!node1 || !node0) {
@@ -609,34 +624,8 @@ static int halve(struct trie *t, struct
put_child(tn, i / 2, inode);
}
- /* setup the parent pointer out of and back into this node */
- tp = node_parent(oldtnode);
- NODE_INIT_PARENT(tn, tp);
- put_child_root(tp, t, tn->key, tn);
-
- /* prepare oldtnode to be freed */
- tnode_free_init(oldtnode);
-
- /* update all of the child parent pointers */
- for (i = tnode_child_length(tn); i;) {
- inode = tnode_get_child(tn, --i);
-
- /* only new tnodes will be considered "full" nodes */
- if (!tnode_full(tn, inode)) {
- node_set_parent(inode, tn);
- continue;
- }
-
- /* Two nonempty children */
- node_set_parent(tnode_get_child(inode, 1), inode);
- node_set_parent(tnode_get_child(inode, 0), inode);
-
- /* resize child node */
- resize(t, inode);
- }
-
- /* all pointers should be clean so we are done */
- tnode_free(oldtnode);
+ /* setup the parent pointers into and out of this node */
+ replace(t, oldtnode, tn);
return 0;
}

View file

@ -0,0 +1,61 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Thu, 22 Jan 2015 15:51:20 -0800
Subject: [PATCH] fib_trie: Fall back to slen update on inflate/halve failure
This change corrects an issue where if inflate or halve fails we were
exiting the resize function without at least updating the slen for the
node. To correct this I have moved the update of max_size into the while
loop so that it is only decremented on a successful call to either inflate
or halve.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -752,7 +752,7 @@ static void resize(struct trie *t, struc
{
struct tnode *tp = node_parent(tn), *n = NULL;
struct tnode __rcu **cptr;
- int max_work;
+ int max_work = MAX_WORK;
pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
tn, inflate_threshold, halve_threshold);
@@ -775,8 +775,7 @@ static void resize(struct trie *t, struc
/* Double as long as the resulting node has a number of
* nonempty nodes that are above the threshold.
*/
- max_work = MAX_WORK;
- while (should_inflate(tp, tn) && max_work--) {
+ while (should_inflate(tp, tn) && max_work) {
if (inflate(t, tn)) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(t->stats->resize_node_skipped);
@@ -784,6 +783,7 @@ static void resize(struct trie *t, struc
break;
}
+ max_work--;
tn = rtnl_dereference(*cptr);
}
@@ -794,8 +794,7 @@ static void resize(struct trie *t, struc
/* Halve as long as the number of empty children in this
* node is above threshold.
*/
- max_work = MAX_WORK;
- while (should_halve(tp, tn) && max_work--) {
+ while (should_halve(tp, tn) && max_work) {
if (halve(t, tn)) {
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(t->stats->resize_node_skipped);
@@ -803,6 +802,7 @@ static void resize(struct trie *t, struc
break;
}
+ max_work--;
tn = rtnl_dereference(*cptr);
}

View file

@ -0,0 +1,206 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Thu, 22 Jan 2015 15:51:26 -0800
Subject: [PATCH] fib_trie: Add collapse() and should_collapse() to resize
This patch really does two things.
First it pulls the logic for determining if we should collapse one node out
of the tree and the actual code doing the collapse into a separate pair of
functions. This helps to make the changes to these areas more readable.
Second it encodes the upper 32b of the empty_children value onto the
full_children value in the case of bits == KEYLENGTH. By doing this we are
able to handle the case of a 32b node where empty_children would appear to
be 0 when it was actually 1ul << 32.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -83,7 +83,8 @@
#define MAX_STAT_DEPTH 32
-#define KEYLENGTH (8*sizeof(t_key))
+#define KEYLENGTH (8*sizeof(t_key))
+#define KEY_MAX ((t_key)~0)
typedef unsigned int t_key;
@@ -102,8 +103,8 @@ struct tnode {
union {
/* The fields in this struct are valid if bits > 0 (TNODE) */
struct {
- unsigned int full_children; /* KEYLENGTH bits needed */
- unsigned int empty_children; /* KEYLENGTH bits needed */
+ t_key empty_children; /* KEYLENGTH bits needed */
+ t_key full_children; /* KEYLENGTH bits needed */
struct tnode __rcu *child[0];
};
/* This list pointer if valid if bits == 0 (LEAF) */
@@ -302,6 +303,16 @@ static struct tnode *tnode_alloc(size_t
return vzalloc(size);
}
+static inline void empty_child_inc(struct tnode *n)
+{
+ ++n->empty_children ? : ++n->full_children;
+}
+
+static inline void empty_child_dec(struct tnode *n)
+{
+ n->empty_children-- ? : n->full_children--;
+}
+
static struct tnode *leaf_new(t_key key)
{
struct tnode *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
@@ -335,7 +346,7 @@ static struct leaf_info *leaf_info_new(i
static struct tnode *tnode_new(t_key key, int pos, int bits)
{
- size_t sz = offsetof(struct tnode, child[1 << bits]);
+ size_t sz = offsetof(struct tnode, child[1ul << bits]);
struct tnode *tn = tnode_alloc(sz);
unsigned int shift = pos + bits;
@@ -348,8 +359,10 @@ static struct tnode *tnode_new(t_key key
tn->pos = pos;
tn->bits = bits;
tn->key = (shift < KEYLENGTH) ? (key >> shift) << shift : 0;
- tn->full_children = 0;
- tn->empty_children = 1<<bits;
+ if (bits == KEYLENGTH)
+ tn->full_children = 1;
+ else
+ tn->empty_children = 1ul << bits;
}
pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
@@ -375,11 +388,11 @@ static void put_child(struct tnode *tn,
BUG_ON(i >= tnode_child_length(tn));
- /* update emptyChildren */
+ /* update emptyChildren, overflow into fullChildren */
if (n == NULL && chi != NULL)
- tn->empty_children++;
- else if (n != NULL && chi == NULL)
- tn->empty_children--;
+ empty_child_inc(tn);
+ if (n != NULL && chi == NULL)
+ empty_child_dec(tn);
/* update fullChildren */
wasfull = tnode_full(tn, chi);
@@ -630,6 +643,24 @@ static int halve(struct trie *t, struct
return 0;
}
+static void collapse(struct trie *t, struct tnode *oldtnode)
+{
+ struct tnode *n, *tp;
+ unsigned long i;
+
+ /* scan the tnode looking for that one child that might still exist */
+ for (n = NULL, i = tnode_child_length(oldtnode); !n && i;)
+ n = tnode_get_child(oldtnode, --i);
+
+ /* compress one level */
+ tp = node_parent(oldtnode);
+ put_child_root(tp, t, oldtnode->key, n);
+ node_set_parent(n, tp);
+
+ /* drop dead node */
+ node_free(oldtnode);
+}
+
static unsigned char update_suffix(struct tnode *tn)
{
unsigned char slen = tn->pos;
@@ -729,10 +760,12 @@ static bool should_inflate(const struct
/* Keep root node larger */
threshold *= tp ? inflate_threshold : inflate_threshold_root;
- used += tn->full_children;
used -= tn->empty_children;
+ used += tn->full_children;
- return tn->pos && ((50 * used) >= threshold);
+ /* if bits == KEYLENGTH then pos = 0, and will fail below */
+
+ return (used > 1) && tn->pos && ((50 * used) >= threshold);
}
static bool should_halve(const struct tnode *tp, const struct tnode *tn)
@@ -744,13 +777,29 @@ static bool should_halve(const struct tn
threshold *= tp ? halve_threshold : halve_threshold_root;
used -= tn->empty_children;
- return (tn->bits > 1) && ((100 * used) < threshold);
+ /* if bits == KEYLENGTH then used = 100% on wrap, and will fail below */
+
+ return (used > 1) && (tn->bits > 1) && ((100 * used) < threshold);
+}
+
+static bool should_collapse(const struct tnode *tn)
+{
+ unsigned long used = tnode_child_length(tn);
+
+ used -= tn->empty_children;
+
+ /* account for bits == KEYLENGTH case */
+ if ((tn->bits == KEYLENGTH) && tn->full_children)
+ used -= KEY_MAX;
+
+ /* One child or none, time to drop us from the trie */
+ return used < 2;
}
#define MAX_WORK 10
static void resize(struct trie *t, struct tnode *tn)
{
- struct tnode *tp = node_parent(tn), *n = NULL;
+ struct tnode *tp = node_parent(tn);
struct tnode __rcu **cptr;
int max_work = MAX_WORK;
@@ -764,14 +813,6 @@ static void resize(struct trie *t, struc
cptr = tp ? &tp->child[get_index(tn->key, tp)] : &t->trie;
BUG_ON(tn != rtnl_dereference(*cptr));
- /* No children */
- if (tn->empty_children > (tnode_child_length(tn) - 1))
- goto no_children;
-
- /* One child */
- if (tn->empty_children == (tnode_child_length(tn) - 1))
- goto one_child;
-
/* Double as long as the resulting node has a number of
* nonempty nodes that are above the threshold.
*/
@@ -807,19 +848,8 @@ static void resize(struct trie *t, struc
}
/* Only one child remains */
- if (tn->empty_children == (tnode_child_length(tn) - 1)) {
- unsigned long i;
-one_child:
- for (i = tnode_child_length(tn); !n && i;)
- n = tnode_get_child(tn, --i);
-no_children:
- /* compress one level */
- put_child_root(tp, t, tn->key, n);
- node_set_parent(n, tp);
-
- /* drop dead node */
- tnode_free_init(tn);
- tnode_free(tn);
+ if (should_collapse(tn)) {
+ collapse(t, tn);
return;
}

View file

@ -0,0 +1,34 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Thu, 22 Jan 2015 15:51:33 -0800
Subject: [PATCH] fib_trie: Use empty_children instead of counting empty nodes
in stats collection
It doesn't make much sense to count the pointers ourselves when
empty_children already has a count for the number of NULL pointers stored
in the tnode. As such save ourselves the cycles and just use
empty_children.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1954,16 +1954,10 @@ static void trie_collect_stats(struct tr
hlist_for_each_entry_rcu(li, &n->list, hlist)
++s->prefixes;
} else {
- unsigned long i;
-
s->tnodes++;
if (n->bits < MAX_STAT_DEPTH)
s->nodesizes[n->bits]++;
-
- for (i = tnode_child_length(n); i--;) {
- if (!rcu_access_pointer(n->child[i]))
- s->nullpointers++;
- }
+ s->nullpointers += n->empty_children;
}
}
rcu_read_unlock();

View file

@ -0,0 +1,79 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Thu, 22 Jan 2015 15:51:39 -0800
Subject: [PATCH] fib_trie: Move fib_find_alias to file where it is used
The function fib_find_alias is only accessed by functions in fib_trie.c as
such it makes sense to relocate it and cast it as static so that the
compiler can take advantage of optimizations it can do to it as a local
function.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -32,7 +32,6 @@ int fib_dump_info(struct sk_buff *skb, u
unsigned int);
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
-struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
static inline void fib_result_assign(struct fib_result *res,
struct fib_info *fi)
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -414,24 +414,6 @@ errout:
rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}
-/* Return the first fib alias matching TOS with
- * priority less than or equal to PRIO.
- */
-struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
-{
- if (fah) {
- struct fib_alias *fa;
- list_for_each_entry(fa, fah, fa_list) {
- if (fa->fa_tos > tos)
- continue;
- if (fa->fa_info->fib_priority >= prio ||
- fa->fa_tos < tos)
- return fa;
- }
- }
- return NULL;
-}
-
static int fib_detect_death(struct fib_info *fi, int order,
struct fib_info **last_resort, int *last_idx,
int dflt)
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -998,6 +998,26 @@ static struct tnode *fib_find_node(struc
return n;
}
+/* Return the first fib alias matching TOS with
+ * priority less than or equal to PRIO.
+ */
+static struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
+{
+ struct fib_alias *fa;
+
+ if (!fah)
+ return NULL;
+
+ list_for_each_entry(fa, fah, fa_list) {
+ if (fa->fa_tos > tos)
+ continue;
+ if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
+ return fa;
+ }
+
+ return NULL;
+}
+
static void trie_rebalance(struct trie *t, struct tnode *tn)
{
struct tnode *tp;

View file

@ -0,0 +1,116 @@
From: Alexander Duyck <alexander.h.duyck@redhat.com>
Date: Thu, 22 Jan 2015 15:51:45 -0800
Subject: [PATCH] fib_trie: Various clean-ups for handling slen
While doing further work on the fib_trie I noted a few items.
First I was using calls that were far more complicated than they needed to
be for determining when to push/pull the suffix length. I have updated the
code to reflect the simplier logic.
The second issue is that I realised we weren't necessarily handling the
case of a leaf_info struct surviving a flush. I have updated the logic so
that now we will call pull_suffix in the event of having a leaf info value
left in the leaf after flushing it.
Signed-off-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -917,27 +917,20 @@ static void leaf_push_suffix(struct tnod
static void remove_leaf_info(struct tnode *l, struct leaf_info *old)
{
- struct hlist_node *prev;
-
- /* record the location of the pointer to this object */
- prev = rtnl_dereference(hlist_pprev_rcu(&old->hlist));
+ /* record the location of the previous list_info entry */
+ struct hlist_node **pprev = old->hlist.pprev;
+ struct leaf_info *li = hlist_entry(pprev, typeof(*li), hlist.next);
/* remove the leaf info from the list */
hlist_del_rcu(&old->hlist);
- /* if we emptied the list this leaf will be freed and we can sort
- * out parent suffix lengths as a part of trie_rebalance
- */
- if (hlist_empty(&l->list))
+ /* only access li if it is pointing at the last valid hlist_node */
+ if (hlist_empty(&l->list) || (*pprev))
return;
- /* if we removed the tail then we need to update slen */
- if (!rcu_access_pointer(hlist_next_rcu(prev))) {
- struct leaf_info *li = hlist_entry(prev, typeof(*li), hlist);
-
- l->slen = KEYLENGTH - li->plen;
- leaf_pull_suffix(l);
- }
+ /* update the trie with the latest suffix length */
+ l->slen = KEYLENGTH - li->plen;
+ leaf_pull_suffix(l);
}
static void insert_leaf_info(struct tnode *l, struct leaf_info *new)
@@ -961,7 +954,7 @@ static void insert_leaf_info(struct tnod
}
/* if we added to the tail node then we need to update slen */
- if (!rcu_access_pointer(hlist_next_rcu(&new->hlist))) {
+ if (l->slen < (KEYLENGTH - new->plen)) {
l->slen = KEYLENGTH - new->plen;
leaf_push_suffix(l);
}
@@ -1613,6 +1606,7 @@ static int trie_flush_leaf(struct tnode
struct hlist_head *lih = &l->list;
struct hlist_node *tmp;
struct leaf_info *li = NULL;
+ unsigned char plen = KEYLENGTH;
hlist_for_each_entry_safe(li, tmp, lih, hlist) {
found += trie_flush_list(&li->falh);
@@ -1620,8 +1614,14 @@ static int trie_flush_leaf(struct tnode
if (list_empty(&li->falh)) {
hlist_del_rcu(&li->hlist);
free_leaf_info(li);
+ continue;
}
+
+ plen = li->plen;
}
+
+ l->slen = KEYLENGTH - plen;
+
return found;
}
@@ -1700,13 +1700,22 @@ int fib_table_flush(struct fib_table *tb
for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
found += trie_flush_leaf(l);
- if (ll && hlist_empty(&ll->list))
- trie_leaf_remove(t, ll);
+ if (ll) {
+ if (hlist_empty(&ll->list))
+ trie_leaf_remove(t, ll);
+ else
+ leaf_pull_suffix(ll);
+ }
+
ll = l;
}
- if (ll && hlist_empty(&ll->list))
- trie_leaf_remove(t, ll);
+ if (ll) {
+ if (hlist_empty(&ll->list))
+ trie_leaf_remove(t, ll);
+ else
+ leaf_pull_suffix(ll);
+ }
pr_debug("trie_flush found=%d\n", found);
return found;