]>
Commit | Line | Data |
---|---|---|
1da177e4 LT |
1 | /* |
2 | * net/core/dst.c Protocol independent destination cache. | |
3 | * | |
4 | * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> | |
5 | * | |
6 | */ | |
7 | ||
8 | #include <linux/bitops.h> | |
9 | #include <linux/errno.h> | |
10 | #include <linux/init.h> | |
11 | #include <linux/kernel.h> | |
86bba269 | 12 | #include <linux/workqueue.h> |
1da177e4 LT |
13 | #include <linux/mm.h> |
14 | #include <linux/module.h> | |
5a0e3ad6 | 15 | #include <linux/slab.h> |
1da177e4 | 16 | #include <linux/netdevice.h> |
1da177e4 LT |
17 | #include <linux/skbuff.h> |
18 | #include <linux/string.h> | |
19 | #include <linux/types.h> | |
e9dc8653 | 20 | #include <net/net_namespace.h> |
2fc1b5dd | 21 | #include <linux/sched.h> |
268bb0ce | 22 | #include <linux/prefetch.h> |
61adedf3 | 23 | #include <net/lwtunnel.h> |
1da177e4 LT |
24 | |
25 | #include <net/dst.h> | |
f38a9eb1 | 26 | #include <net/dst_metadata.h> |
1da177e4 | 27 | |
86bba269 ED |
28 | /* |
29 | * Theory of operations: | |
30 | * 1) We use a list, protected by a spinlock, to add | |
31 | * new entries from both BH and non-BH context. | |
32 | * 2) In order to keep the spinlock held only for a short time, | |
33 | * we use a second list where long-lived entries are | |
34 | * stored; these are handled by the garbage collect thread | |
35 | * fired by a workqueue. | |
36 | * 3) This list is guarded by a mutex, | |
37 | * so that the gc_task and dst_dev_event() can be synchronized. | |
1da177e4 | 38 | */ |
1da177e4 | 39 | |
86bba269 ED |
40 | /* |
41 | * We want to keep lock & list close together | |
42 | * to dirty as few cache lines as possible in __dst_free(). | |
43 | * As this is not a very strong hint, we don't force an alignment on SMP. | |
44 | */ | |
/*
 * Output handler that simply drops the packet: @skb is freed and 0 is
 * returned. @net and @sk are not used.
 */
int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);

	return 0;
}
EXPORT_SYMBOL(dst_discard_out);
1da177e4 | 51 | |
3fb07daf | 52 | const struct dst_metrics dst_default_metrics = { |
a37e6e34 ED |
53 | /* This initializer is needed to force linker to place this variable |
54 | * into const section. Otherwise it might end into bss section. | |
55 | * We really want to avoid false sharing on this variable, and catch | |
56 | * any writes on it. | |
57 | */ | |
9620fef2 | 58 | .refcnt = REFCOUNT_INIT(1), |
a37e6e34 ED |
59 | }; |
60 | ||
f38a9eb1 TG |
61 | void dst_init(struct dst_entry *dst, struct dst_ops *ops, |
62 | struct net_device *dev, int initial_ref, int initial_obsolete, | |
63 | unsigned short flags) | |
1da177e4 | 64 | { |
cf911662 | 65 | dst->child = NULL; |
5c1e6aa3 DM |
66 | dst->dev = dev; |
67 | if (dev) | |
68 | dev_hold(dev); | |
1da177e4 | 69 | dst->ops = ops; |
3fb07daf | 70 | dst_init_metrics(dst, dst_default_metrics.metrics, true); |
cf911662 | 71 | dst->expires = 0UL; |
5c1e6aa3 | 72 | dst->path = dst; |
ecd98837 | 73 | dst->from = NULL; |
cf911662 DM |
74 | #ifdef CONFIG_XFRM |
75 | dst->xfrm = NULL; | |
76 | #endif | |
5c1e6aa3 | 77 | dst->input = dst_discard; |
ede2059d | 78 | dst->output = dst_discard_out; |
cf911662 | 79 | dst->error = 0; |
5c1e6aa3 | 80 | dst->obsolete = initial_obsolete; |
cf911662 DM |
81 | dst->header_len = 0; |
82 | dst->trailer_len = 0; | |
83 | #ifdef CONFIG_IP_ROUTE_CLASSID | |
84 | dst->tclassid = 0; | |
1da177e4 | 85 | #endif |
61adedf3 | 86 | dst->lwtstate = NULL; |
5c1e6aa3 | 87 | atomic_set(&dst->__refcnt, initial_ref); |
cf911662 | 88 | dst->__use = 0; |
5c1e6aa3 DM |
89 | dst->lastuse = jiffies; |
90 | dst->flags = flags; | |
cf911662 | 91 | dst->next = NULL; |
957c665f DM |
92 | if (!(flags & DST_NOCOUNT)) |
93 | dst_entries_add(ops, 1); | |
f38a9eb1 TG |
94 | } |
95 | EXPORT_SYMBOL(dst_init); | |
96 | ||
97 | void *dst_alloc(struct dst_ops *ops, struct net_device *dev, | |
98 | int initial_ref, int initial_obsolete, unsigned short flags) | |
99 | { | |
100 | struct dst_entry *dst; | |
101 | ||
102 | if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { | |
103 | if (ops->gc(ops)) | |
104 | return NULL; | |
105 | } | |
106 | ||
107 | dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); | |
108 | if (!dst) | |
109 | return NULL; | |
110 | ||
111 | dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags); | |
112 | ||
1da177e4 LT |
113 | return dst; |
114 | } | |
598ed936 | 115 | EXPORT_SYMBOL(dst_alloc); |
1da177e4 | 116 | |
1da177e4 LT |
117 | struct dst_entry *dst_destroy(struct dst_entry * dst) |
118 | { | |
119 | struct dst_entry *child; | |
1da177e4 LT |
120 | |
121 | smp_rmb(); | |
122 | ||
1da177e4 LT |
123 | child = dst->child; |
124 | ||
957c665f DM |
125 | if (!(dst->flags & DST_NOCOUNT)) |
126 | dst_entries_add(dst->ops, -1); | |
1da177e4 LT |
127 | |
128 | if (dst->ops->destroy) | |
129 | dst->ops->destroy(dst); | |
130 | if (dst->dev) | |
131 | dev_put(dst->dev); | |
f38a9eb1 | 132 | |
e252b3d1 WC |
133 | lwtstate_put(dst->lwtstate); |
134 | ||
f38a9eb1 | 135 | if (dst->flags & DST_METADATA) |
d71785ff | 136 | metadata_dst_free((struct metadata_dst *)dst); |
f38a9eb1 TG |
137 | else |
138 | kmem_cache_free(dst->ops->kmem_cachep, dst); | |
1da177e4 LT |
139 | |
140 | dst = child; | |
52df157f WW |
141 | if (dst) |
142 | dst_release_immediate(dst); | |
1da177e4 LT |
143 | return NULL; |
144 | } | |
598ed936 | 145 | EXPORT_SYMBOL(dst_destroy); |
1da177e4 | 146 | |
f8864972 ED |
147 | static void dst_destroy_rcu(struct rcu_head *head) |
148 | { | |
149 | struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); | |
150 | ||
151 | dst = dst_destroy(dst); | |
f8864972 ED |
152 | } |
153 | ||
4a6ce2b6 WW |
154 | /* Operations to mark dst as DEAD and clean up the net device referenced |
155 | * by dst: | |
156 | * 1. put the dst under loopback interface and discard all tx/rx packets | |
157 | * on this route. | |
158 | * 2. release the net_device | |
159 | * This function should be called when removing routes from the fib tree | |
160 | * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to | |
161 | * make the next dst_ops->check() fail. | |
162 | */ | |
163 | void dst_dev_put(struct dst_entry *dst) | |
164 | { | |
165 | struct net_device *dev = dst->dev; | |
166 | ||
167 | dst->obsolete = DST_OBSOLETE_DEAD; | |
168 | if (dst->ops->ifdown) | |
169 | dst->ops->ifdown(dst, dev, true); | |
170 | dst->input = dst_discard; | |
171 | dst->output = dst_discard_out; | |
172 | dst->dev = dev_net(dst->dev)->loopback_dev; | |
173 | dev_hold(dst->dev); | |
174 | dev_put(dev); | |
175 | } | |
176 | EXPORT_SYMBOL(dst_dev_put); | |
177 | ||
8d330868 IJ |
178 | void dst_release(struct dst_entry *dst) |
179 | { | |
180 | if (dst) { | |
598ed936 | 181 | int newrefcnt; |
ef711cf1 | 182 | |
598ed936 | 183 | newrefcnt = atomic_dec_return(&dst->__refcnt); |
8bf4ada2 KK |
184 | if (unlikely(newrefcnt < 0)) |
185 | net_warn_ratelimited("%s: dst:%p refcnt:%d\n", | |
186 | __func__, dst, newrefcnt); | |
b2a9c0ed | 187 | if (!newrefcnt) |
f8864972 | 188 | call_rcu(&dst->rcu_head, dst_destroy_rcu); |
8d330868 IJ |
189 | } |
190 | } | |
191 | EXPORT_SYMBOL(dst_release); | |
192 | ||
5f56f409 WW |
193 | void dst_release_immediate(struct dst_entry *dst) |
194 | { | |
195 | if (dst) { | |
196 | int newrefcnt; | |
197 | ||
198 | newrefcnt = atomic_dec_return(&dst->__refcnt); | |
199 | if (unlikely(newrefcnt < 0)) | |
200 | net_warn_ratelimited("%s: dst:%p refcnt:%d\n", | |
201 | __func__, dst, newrefcnt); | |
202 | if (!newrefcnt) | |
203 | dst_destroy(dst); | |
204 | } | |
205 | } | |
206 | EXPORT_SYMBOL(dst_release_immediate); | |
207 | ||
62fa8a84 DM |
208 | u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) |
209 | { | |
3fb07daf | 210 | struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC); |
62fa8a84 DM |
211 | |
212 | if (p) { | |
3fb07daf | 213 | struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old); |
62fa8a84 DM |
214 | unsigned long prev, new; |
215 | ||
9620fef2 | 216 | refcount_set(&p->refcnt, 1); |
3fb07daf | 217 | memcpy(p->metrics, old_p->metrics, sizeof(p->metrics)); |
62fa8a84 DM |
218 | |
219 | new = (unsigned long) p; | |
220 | prev = cmpxchg(&dst->_metrics, old, new); | |
221 | ||
222 | if (prev != old) { | |
223 | kfree(p); | |
3fb07daf | 224 | p = (struct dst_metrics *)__DST_METRICS_PTR(prev); |
62fa8a84 DM |
225 | if (prev & DST_METRICS_READ_ONLY) |
226 | p = NULL; | |
3fb07daf | 227 | } else if (prev & DST_METRICS_REFCOUNTED) { |
9620fef2 | 228 | if (refcount_dec_and_test(&old_p->refcnt)) |
3fb07daf | 229 | kfree(old_p); |
62fa8a84 DM |
230 | } |
231 | } | |
3fb07daf ED |
232 | BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0); |
233 | return (u32 *)p; | |
62fa8a84 DM |
234 | } |
235 | EXPORT_SYMBOL(dst_cow_metrics_generic); | |
236 | ||
237 | /* Caller asserts that dst_metrics_read_only(dst) is false. */ | |
238 | void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) | |
239 | { | |
240 | unsigned long prev, new; | |
241 | ||
3fb07daf | 242 | new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY; |
62fa8a84 DM |
243 | prev = cmpxchg(&dst->_metrics, old, new); |
244 | if (prev == old) | |
245 | kfree(__DST_METRICS_PTR(old)); | |
246 | } | |
247 | EXPORT_SYMBOL(__dst_destroy_metrics_generic); | |
248 | ||
f38a9eb1 TG |
249 | static struct dst_ops md_dst_ops = { |
250 | .family = AF_UNSPEC, | |
251 | }; | |
252 | ||
/*
 * Output handler for metadata dsts: warns once, frees @skb and returns 0.
 * @net and @sk are not used.
 */
static int dst_md_discard_out(struct net *net, struct sock *sk,
			      struct sk_buff *skb)
{
	WARN_ONCE(1, "Attempting to call output on metadata dst\n");
	kfree_skb(skb);

	return 0;
}
259 | ||
/* Input handler for metadata dsts: warns once, frees @skb and returns 0. */
static int dst_md_discard(struct sk_buff *skb)
{
	WARN_ONCE(1, "Attempting to call input on metadata dst\n");
	kfree_skb(skb);

	return 0;
}
266 | ||
3fcece12 JK |
267 | static void __metadata_dst_init(struct metadata_dst *md_dst, |
268 | enum metadata_type type, u8 optslen) | |
269 | ||
f38a9eb1 | 270 | { |
f38a9eb1 TG |
271 | struct dst_entry *dst; |
272 | ||
f38a9eb1 TG |
273 | dst = &md_dst->dst; |
274 | dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE, | |
a4c2fd7f | 275 | DST_METADATA | DST_NOCOUNT); |
f38a9eb1 TG |
276 | |
277 | dst->input = dst_md_discard; | |
ede2059d | 278 | dst->output = dst_md_discard_out; |
f38a9eb1 TG |
279 | |
280 | memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); | |
3fcece12 | 281 | md_dst->type = type; |
d3aa45ce AS |
282 | } |
283 | ||
3fcece12 JK |
284 | struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, |
285 | gfp_t flags) | |
d3aa45ce AS |
286 | { |
287 | struct metadata_dst *md_dst; | |
288 | ||
289 | md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); | |
290 | if (!md_dst) | |
291 | return NULL; | |
292 | ||
3fcece12 | 293 | __metadata_dst_init(md_dst, type, optslen); |
f38a9eb1 TG |
294 | |
295 | return md_dst; | |
296 | } | |
297 | EXPORT_SYMBOL_GPL(metadata_dst_alloc); | |
298 | ||
d71785ff PA |
/*
 * Free a metadata dst obtained with kmalloc(). With CONFIG_DST_CACHE, the
 * dst cache embedded in IP tunnel metadata is destroyed first.
 */
void metadata_dst_free(struct metadata_dst *md_dst)
{
#ifdef CONFIG_DST_CACHE
	if (md_dst->type == METADATA_IP_TUNNEL)
		dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
#endif

	kfree(md_dst);
}
307 | ||
3fcece12 JK |
308 | struct metadata_dst __percpu * |
309 | metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) | |
d3aa45ce AS |
310 | { |
311 | int cpu; | |
312 | struct metadata_dst __percpu *md_dst; | |
313 | ||
314 | md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen, | |
315 | __alignof__(struct metadata_dst), flags); | |
316 | if (!md_dst) | |
317 | return NULL; | |
318 | ||
319 | for_each_possible_cpu(cpu) | |
3fcece12 | 320 | __metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen); |
d3aa45ce AS |
321 | |
322 | return md_dst; | |
323 | } | |
324 | EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); | |
d66f2b91 JK |
325 | |
326 | void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst) | |
327 | { | |
833e0e2f | 328 | #ifdef CONFIG_DST_CACHE |
d66f2b91 JK |
329 | int cpu; |
330 | ||
d66f2b91 JK |
331 | for_each_possible_cpu(cpu) { |
332 | struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu); | |
333 | ||
334 | if (one_md_dst->type == METADATA_IP_TUNNEL) | |
335 | dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache); | |
336 | } | |
337 | #endif | |
338 | free_percpu(md_dst); | |
339 | } | |
340 | EXPORT_SYMBOL_GPL(metadata_dst_free_percpu); |