]>
Commit | Line | Data |
---|---|---|
73e141f9 | 1 | /* Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc. |
e1ec7dd4 EJ |
2 | * |
3 | * Licensed under the Apache License, Version 2.0 (the "License"); | |
4 | * you may not use this file except in compliance with the License. | |
5 | * You may obtain a copy of the License at: | |
6 | * | |
7 | * http://www.apache.org/licenses/LICENSE-2.0 | |
8 | * | |
9 | * Unless required by applicable law or agreed to in writing, software | |
10 | * distributed under the License is distributed on an "AS IS" BASIS, | |
11 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
12 | * See the License for the specific language governing permissions and | |
13 | * limitations under the License. */ | |
14 | ||
15 | #include <config.h> | |
16 | #include "ofproto-dpif-upcall.h" | |
17 | ||
18 | #include <errno.h> | |
19 | #include <stdbool.h> | |
20 | #include <inttypes.h> | |
21 | ||
0fb7792a | 22 | #include "connmgr.h" |
e1ec7dd4 | 23 | #include "coverage.h" |
e1ec7dd4 | 24 | #include "dpif.h" |
e22d52ee | 25 | #include "dynamic-string.h" |
e1ec7dd4 | 26 | #include "fail-open.h" |
05067881 | 27 | #include "guarded-list.h" |
e1ec7dd4 | 28 | #include "latch.h" |
e1ec7dd4 EJ |
29 | #include "list.h" |
30 | #include "netlink.h" | |
31 | #include "ofpbuf.h" | |
10e57640 EJ |
32 | #include "ofproto-dpif-ipfix.h" |
33 | #include "ofproto-dpif-sflow.h" | |
e79a6c83 | 34 | #include "ofproto-dpif-xlate.h" |
0f2ea848 | 35 | #include "ovs-rcu.h" |
e1ec7dd4 EJ |
36 | #include "packets.h" |
37 | #include "poll-loop.h" | |
e22d52ee EJ |
38 | #include "seq.h" |
39 | #include "unixctl.h" | |
e1ec7dd4 EJ |
40 | #include "vlog.h" |
41 | ||
42 | #define MAX_QUEUE_LENGTH 512 | |
e79a6c83 EJ |
43 | #define FLOW_MISS_MAX_BATCH 50 |
44 | #define REVALIDATE_MAX_BATCH 50 | |
e1ec7dd4 EJ |
45 | |
46 | VLOG_DEFINE_THIS_MODULE(ofproto_dpif_upcall); | |
47 | ||
10e57640 | 48 | COVERAGE_DEFINE(upcall_queue_overflow); |
e1ec7dd4 EJ |
49 | |
50 | /* A thread that processes each upcall handed to it by the dispatcher thread, | |
e79a6c83 EJ |
51 | * forwards the upcall's packet, and possibly sets up a kernel flow as a |
52 | * cache. */ | |
e1ec7dd4 EJ |
53 | struct handler { |
54 | struct udpif *udpif; /* Parent udpif. */ | |
55 | pthread_t thread; /* Thread ID. */ | |
e22d52ee | 56 | char *name; /* Thread name. */ |
e1ec7dd4 EJ |
57 | |
58 | struct ovs_mutex mutex; /* Mutex guarding the following. */ | |
59 | ||
10e57640 | 60 | /* Atomic queue of unprocessed upcalls. */ |
e1ec7dd4 EJ |
61 | struct list upcalls OVS_GUARDED; |
62 | size_t n_upcalls OVS_GUARDED; | |
63 | ||
9b32ece6 | 64 | bool need_signal; /* Only changed by the dispatcher. */ |
caf6491f | 65 | |
e1ec7dd4 EJ |
66 | pthread_cond_t wake_cond; /* Wakes 'thread' while holding |
67 | 'mutex'. */ | |
68 | }; | |
69 | ||
e79a6c83 EJ |
70 | /* A thread that processes each kernel flow handed to it by the flow_dumper |
71 | * thread, updates OpenFlow statistics, and updates or removes the kernel flow | |
72 | * as necessary. */ | |
73 | struct revalidator { | |
74 | struct udpif *udpif; /* Parent udpif. */ | |
75 | char *name; /* Thread name. */ | |
76 | ||
77 | pthread_t thread; /* Thread ID. */ | |
78 | struct hmap ukeys; /* Datapath flow keys. */ | |
79 | ||
80 | uint64_t dump_seq; | |
81 | ||
82 | struct ovs_mutex mutex; /* Mutex guarding the following. */ | |
83 | pthread_cond_t wake_cond; | |
84 | struct list udumps OVS_GUARDED; /* Unprocessed udumps. */ | |
85 | size_t n_udumps OVS_GUARDED; /* Number of unprocessed udumps. */ | |
86 | }; | |
87 | ||
e1ec7dd4 EJ |
88 | /* An upcall handler for ofproto_dpif. |
89 | * | |
e79a6c83 EJ |
90 | * udpif has two logically separate pieces: |
91 | * | |
92 | * - A "dispatcher" thread that reads upcalls from the kernel and dispatches | |
93 | * them to one of several "handler" threads (see struct handler). | |
94 | * | |
95 | * - A "flow_dumper" thread that reads the kernel flow table and dispatches | |
96 | * flows to one of several "revalidator" threads (see struct | |
97 | * revalidator). */ | |
e1ec7dd4 | 98 | struct udpif { |
e22d52ee EJ |
99 | struct list list_node; /* In all_udpifs list. */ |
100 | ||
e1ec7dd4 EJ |
101 | struct dpif *dpif; /* Datapath handle. */ |
102 | struct dpif_backer *backer; /* Opaque dpif_backer pointer. */ | |
103 | ||
104 | uint32_t secret; /* Random seed for upcall hash. */ | |
105 | ||
106 | pthread_t dispatcher; /* Dispatcher thread ID. */ | |
e79a6c83 | 107 | pthread_t flow_dumper; /* Flow dumper thread ID. */ |
e1ec7dd4 | 108 | |
10e57640 | 109 | struct handler *handlers; /* Upcall handlers. */ |
e1ec7dd4 EJ |
110 | size_t n_handlers; |
111 | ||
e79a6c83 EJ |
112 | struct revalidator *revalidators; /* Flow revalidators. */ |
113 | size_t n_revalidators; | |
114 | ||
115 | uint64_t last_reval_seq; /* 'reval_seq' at last revalidation. */ | |
116 | struct seq *reval_seq; /* Incremented to force revalidation. */ | |
117 | ||
118 | struct seq *dump_seq; /* Increments each dump iteration. */ | |
119 | ||
120 | struct latch exit_latch; /* Tells child threads to exit. */ | |
121 | ||
122 | long long int dump_duration; /* Duration of the last flow dump. */ | |
e1ec7dd4 | 123 | |
e79a6c83 EJ |
124 | /* Datapath flow statistics. */ |
125 | unsigned int max_n_flows; | |
126 | unsigned int avg_n_flows; | |
e1ec7dd4 | 127 | |
e79a6c83 | 128 | /* Following fields are accessed and modified by different threads. */ |
e79a6c83 | 129 | atomic_uint flow_limit; /* Datapath flow hard limit. */ |
64ca9472 JS |
130 | |
131 | /* n_flows_mutex prevents multiple threads updating these concurrently. */ | |
132 | atomic_uint64_t n_flows; /* Number of flows in the datapath. */ | |
133 | atomic_llong n_flows_timestamp; /* Last time n_flows was updated. */ | |
134 | struct ovs_mutex n_flows_mutex; | |
e1ec7dd4 EJ |
135 | }; |
136 | ||
/* Classification of an upcall, decided by classify_upcall(). */
enum upcall_type {
    BAD_UPCALL,                 /* Some kind of bug somewhere. */
    MISS_UPCALL,                /* A flow miss. */
    SFLOW_UPCALL,               /* sFlow sample. */
    FLOW_SAMPLE_UPCALL,         /* Per-flow sampling. */
    IPFIX_UPCALL                /* Per-bridge sampling. */
};
144 | ||
145 | struct upcall { | |
146 | struct list list_node; /* For queuing upcalls. */ | |
147 | struct flow_miss *flow_miss; /* This upcall's flow_miss. */ | |
148 | ||
149 | /* Raw upcall plus data for keeping track of the memory backing it. */ | |
150 | struct dpif_upcall dpif_upcall; /* As returned by dpif_recv() */ | |
151 | struct ofpbuf upcall_buf; /* Owns some data in 'dpif_upcall'. */ | |
152 | uint64_t upcall_stub[512 / 8]; /* Buffer to reduce need for malloc(). */ | |
153 | }; | |
154 | ||
e79a6c83 EJ |
155 | /* 'udpif_key's are responsible for tracking the little bit of state udpif |
156 | * needs to do flow expiration which can't be pulled directly from the | |
157 | * datapath. They are owned, created by, maintained, and destroyed by a single | |
158 | * revalidator making them easy to efficiently handle with multiple threads. */ | |
159 | struct udpif_key { | |
160 | struct hmap_node hmap_node; /* In parent revalidator 'ukeys' map. */ | |
161 | ||
162 | struct nlattr *key; /* Datapath flow key. */ | |
163 | size_t key_len; /* Length of 'key'. */ | |
164 | ||
165 | struct dpif_flow_stats stats; /* Stats at most recent flow dump. */ | |
166 | long long int created; /* Estimation of creation time. */ | |
167 | ||
168 | bool mark; /* Used by mark and sweep GC algorithm. */ | |
169 | ||
170 | struct odputil_keybuf key_buf; /* Memory for 'key'. */ | |
171 | }; | |
172 | ||
173 | /* 'udpif_flow_dump's hold the state associated with one iteration in a flow | |
174 | * dump operation. This is created by the flow_dumper thread and handed to the | |
175 | * appropriate revalidator thread to be processed. */ | |
176 | struct udpif_flow_dump { | |
177 | struct list list_node; | |
178 | ||
179 | struct nlattr *key; /* Datapath flow key. */ | |
180 | size_t key_len; /* Length of 'key'. */ | |
181 | uint32_t key_hash; /* Hash of 'key'. */ | |
182 | ||
183 | struct odputil_keybuf mask_buf; | |
184 | struct nlattr *mask; /* Datapath mask for 'key'. */ | |
185 | size_t mask_len; /* Length of 'mask'. */ | |
186 | ||
187 | struct dpif_flow_stats stats; /* Stats pulled from the datapath. */ | |
188 | ||
189 | bool need_revalidate; /* Key needs revalidation? */ | |
190 | ||
191 | struct odputil_keybuf key_buf; | |
192 | }; | |
193 | ||
194 | /* Flow miss batching. | |
195 | * | |
196 | * Some dpifs implement operations faster when you hand them off in a batch. | |
197 | * To allow batching, "struct flow_miss" queues the dpif-related work needed | |
198 | * for a given flow. Each "struct flow_miss" corresponds to sending one or | |
199 | * more packets, plus possibly installing the flow in the dpif. */ | |
200 | struct flow_miss { | |
201 | struct hmap_node hmap_node; | |
202 | struct ofproto_dpif *ofproto; | |
203 | ||
204 | struct flow flow; | |
e79a6c83 EJ |
205 | const struct nlattr *key; |
206 | size_t key_len; | |
207 | enum dpif_upcall_type upcall_type; | |
208 | struct dpif_flow_stats stats; | |
209 | odp_port_t odp_in_port; | |
210 | ||
211 | uint64_t slow_path_buf[128 / 8]; | |
212 | struct odputil_keybuf mask_buf; | |
213 | ||
214 | struct xlate_out xout; | |
73e141f9 BP |
215 | |
216 | bool put; | |
e79a6c83 EJ |
217 | }; |
218 | ||
10e57640 EJ |
219 | static void upcall_destroy(struct upcall *); |
220 | ||
e1ec7dd4 | 221 | static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); |
e22d52ee | 222 | static struct list all_udpifs = LIST_INITIALIZER(&all_udpifs); |
e1ec7dd4 EJ |
223 | |
224 | static void recv_upcalls(struct udpif *); | |
e79a6c83 EJ |
225 | static void handle_upcalls(struct handler *handler, struct list *upcalls); |
226 | static void *udpif_flow_dumper(void *); | |
e1ec7dd4 | 227 | static void *udpif_dispatcher(void *); |
10e57640 | 228 | static void *udpif_upcall_handler(void *); |
e79a6c83 | 229 | static void *udpif_revalidator(void *); |
64ca9472 | 230 | static uint64_t udpif_get_n_flows(struct udpif *); |
e79a6c83 EJ |
231 | static void revalidate_udumps(struct revalidator *, struct list *udumps); |
232 | static void revalidator_sweep(struct revalidator *); | |
e96a5c24 | 233 | static void revalidator_purge(struct revalidator *); |
e22d52ee EJ |
234 | static void upcall_unixctl_show(struct unixctl_conn *conn, int argc, |
235 | const char *argv[], void *aux); | |
e79a6c83 EJ |
236 | static void upcall_unixctl_disable_megaflows(struct unixctl_conn *, int argc, |
237 | const char *argv[], void *aux); | |
238 | static void upcall_unixctl_enable_megaflows(struct unixctl_conn *, int argc, | |
239 | const char *argv[], void *aux); | |
94b8c324 JS |
240 | static void upcall_unixctl_set_flow_limit(struct unixctl_conn *conn, int argc, |
241 | const char *argv[], void *aux); | |
e79a6c83 EJ |
242 | static void ukey_delete(struct revalidator *, struct udpif_key *); |
243 | ||
244 | static atomic_bool enable_megaflows = ATOMIC_VAR_INIT(true); | |
e1ec7dd4 EJ |
245 | |
246 | struct udpif * | |
247 | udpif_create(struct dpif_backer *backer, struct dpif *dpif) | |
248 | { | |
e22d52ee | 249 | static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; |
e1ec7dd4 EJ |
250 | struct udpif *udpif = xzalloc(sizeof *udpif); |
251 | ||
e22d52ee EJ |
252 | if (ovsthread_once_start(&once)) { |
253 | unixctl_command_register("upcall/show", "", 0, 0, upcall_unixctl_show, | |
254 | NULL); | |
e79a6c83 EJ |
255 | unixctl_command_register("upcall/disable-megaflows", "", 0, 0, |
256 | upcall_unixctl_disable_megaflows, NULL); | |
257 | unixctl_command_register("upcall/enable-megaflows", "", 0, 0, | |
258 | upcall_unixctl_enable_megaflows, NULL); | |
94b8c324 JS |
259 | unixctl_command_register("upcall/set-flow-limit", "", 1, 1, |
260 | upcall_unixctl_set_flow_limit, NULL); | |
e22d52ee EJ |
261 | ovsthread_once_done(&once); |
262 | } | |
263 | ||
e1ec7dd4 EJ |
264 | udpif->dpif = dpif; |
265 | udpif->backer = backer; | |
e79a6c83 | 266 | atomic_init(&udpif->flow_limit, MIN(ofproto_flow_limit, 10000)); |
e1ec7dd4 | 267 | udpif->secret = random_uint32(); |
d7285d74 | 268 | udpif->reval_seq = seq_create(); |
e79a6c83 | 269 | udpif->dump_seq = seq_create(); |
e1ec7dd4 | 270 | latch_init(&udpif->exit_latch); |
e22d52ee | 271 | list_push_back(&all_udpifs, &udpif->list_node); |
64ca9472 JS |
272 | atomic_init(&udpif->n_flows, 0); |
273 | atomic_init(&udpif->n_flows_timestamp, LLONG_MIN); | |
274 | ovs_mutex_init(&udpif->n_flows_mutex); | |
e1ec7dd4 EJ |
275 | |
276 | return udpif; | |
277 | } | |
278 | ||
279 | void | |
280 | udpif_destroy(struct udpif *udpif) | |
281 | { | |
e79a6c83 | 282 | udpif_set_threads(udpif, 0, 0); |
1b5b5071 | 283 | udpif_flush(udpif); |
e1ec7dd4 | 284 | |
e22d52ee | 285 | list_remove(&udpif->list_node); |
e1ec7dd4 | 286 | latch_destroy(&udpif->exit_latch); |
d7285d74 | 287 | seq_destroy(udpif->reval_seq); |
e79a6c83 | 288 | seq_destroy(udpif->dump_seq); |
64ca9472 | 289 | ovs_mutex_destroy(&udpif->n_flows_mutex); |
e1ec7dd4 EJ |
290 | free(udpif); |
291 | } | |
292 | ||
6567010f | 293 | /* Tells 'udpif' how many threads it should use to handle upcalls. Disables |
e79a6c83 EJ |
294 | * all threads if 'n_handlers' and 'n_revalidators' is zero. 'udpif''s |
295 | * datapath handle must have packet reception enabled before starting threads. | |
296 | */ | |
e1ec7dd4 | 297 | void |
e79a6c83 EJ |
298 | udpif_set_threads(struct udpif *udpif, size_t n_handlers, |
299 | size_t n_revalidators) | |
e1ec7dd4 | 300 | { |
1954e6bb | 301 | int error; |
0f2ea848 | 302 | |
1954e6bb | 303 | ovsrcu_quiesce_start(); |
e1ec7dd4 | 304 | /* Stop the old threads (if any). */ |
e79a6c83 EJ |
305 | if (udpif->handlers && |
306 | (udpif->n_handlers != n_handlers | |
307 | || udpif->n_revalidators != n_revalidators)) { | |
e1ec7dd4 EJ |
308 | size_t i; |
309 | ||
310 | latch_set(&udpif->exit_latch); | |
311 | ||
e1ec7dd4 EJ |
312 | for (i = 0; i < udpif->n_handlers; i++) { |
313 | struct handler *handler = &udpif->handlers[i]; | |
314 | ||
315 | ovs_mutex_lock(&handler->mutex); | |
316 | xpthread_cond_signal(&handler->wake_cond); | |
317 | ovs_mutex_unlock(&handler->mutex); | |
e79a6c83 EJ |
318 | xpthread_join(handler->thread, NULL); |
319 | } | |
320 | ||
321 | for (i = 0; i < udpif->n_revalidators; i++) { | |
322 | struct revalidator *revalidator = &udpif->revalidators[i]; | |
323 | ||
324 | ovs_mutex_lock(&revalidator->mutex); | |
325 | xpthread_cond_signal(&revalidator->wake_cond); | |
326 | ovs_mutex_unlock(&revalidator->mutex); | |
327 | xpthread_join(revalidator->thread, NULL); | |
e1ec7dd4 EJ |
328 | } |
329 | ||
e79a6c83 | 330 | xpthread_join(udpif->flow_dumper, NULL); |
e1ec7dd4 | 331 | xpthread_join(udpif->dispatcher, NULL); |
e79a6c83 EJ |
332 | |
333 | for (i = 0; i < udpif->n_revalidators; i++) { | |
334 | struct revalidator *revalidator = &udpif->revalidators[i]; | |
335 | struct udpif_flow_dump *udump, *next_udump; | |
e79a6c83 EJ |
336 | |
337 | LIST_FOR_EACH_SAFE (udump, next_udump, list_node, | |
338 | &revalidator->udumps) { | |
339 | list_remove(&udump->list_node); | |
340 | free(udump); | |
341 | } | |
342 | ||
e96a5c24 JS |
343 | /* Delete ukeys, and delete all flows from the datapath to prevent |
344 | * double-counting stats. */ | |
345 | revalidator_purge(revalidator); | |
e79a6c83 EJ |
346 | hmap_destroy(&revalidator->ukeys); |
347 | ovs_mutex_destroy(&revalidator->mutex); | |
348 | ||
349 | free(revalidator->name); | |
350 | } | |
351 | ||
e1ec7dd4 EJ |
352 | for (i = 0; i < udpif->n_handlers; i++) { |
353 | struct handler *handler = &udpif->handlers[i]; | |
354 | struct upcall *miss, *next; | |
355 | ||
e1ec7dd4 EJ |
356 | LIST_FOR_EACH_SAFE (miss, next, list_node, &handler->upcalls) { |
357 | list_remove(&miss->list_node); | |
358 | upcall_destroy(miss); | |
359 | } | |
e1ec7dd4 EJ |
360 | ovs_mutex_destroy(&handler->mutex); |
361 | ||
362 | xpthread_cond_destroy(&handler->wake_cond); | |
e22d52ee | 363 | free(handler->name); |
e1ec7dd4 EJ |
364 | } |
365 | latch_poll(&udpif->exit_latch); | |
366 | ||
e79a6c83 EJ |
367 | free(udpif->revalidators); |
368 | udpif->revalidators = NULL; | |
369 | udpif->n_revalidators = 0; | |
370 | ||
e1ec7dd4 EJ |
371 | free(udpif->handlers); |
372 | udpif->handlers = NULL; | |
373 | udpif->n_handlers = 0; | |
374 | } | |
375 | ||
1954e6bb AW |
376 | error = dpif_handlers_set(udpif->dpif, 1); |
377 | if (error) { | |
378 | VLOG_ERR("failed to configure handlers in dpif %s: %s", | |
379 | dpif_name(udpif->dpif), ovs_strerror(error)); | |
380 | return; | |
381 | } | |
382 | ||
e1ec7dd4 EJ |
383 | /* Start new threads (if necessary). */ |
384 | if (!udpif->handlers && n_handlers) { | |
385 | size_t i; | |
386 | ||
387 | udpif->n_handlers = n_handlers; | |
e79a6c83 EJ |
388 | udpif->n_revalidators = n_revalidators; |
389 | ||
e1ec7dd4 EJ |
390 | udpif->handlers = xzalloc(udpif->n_handlers * sizeof *udpif->handlers); |
391 | for (i = 0; i < udpif->n_handlers; i++) { | |
392 | struct handler *handler = &udpif->handlers[i]; | |
393 | ||
394 | handler->udpif = udpif; | |
395 | list_init(&handler->upcalls); | |
9b32ece6 | 396 | handler->need_signal = false; |
e1ec7dd4 | 397 | xpthread_cond_init(&handler->wake_cond, NULL); |
834d6caf | 398 | ovs_mutex_init(&handler->mutex); |
10e57640 EJ |
399 | xpthread_create(&handler->thread, NULL, udpif_upcall_handler, |
400 | handler); | |
e1ec7dd4 | 401 | } |
e1ec7dd4 | 402 | |
e79a6c83 EJ |
403 | udpif->revalidators = xzalloc(udpif->n_revalidators |
404 | * sizeof *udpif->revalidators); | |
405 | for (i = 0; i < udpif->n_revalidators; i++) { | |
406 | struct revalidator *revalidator = &udpif->revalidators[i]; | |
407 | ||
408 | revalidator->udpif = udpif; | |
409 | list_init(&revalidator->udumps); | |
410 | hmap_init(&revalidator->ukeys); | |
411 | ovs_mutex_init(&revalidator->mutex); | |
412 | xpthread_cond_init(&revalidator->wake_cond, NULL); | |
413 | xpthread_create(&revalidator->thread, NULL, udpif_revalidator, | |
414 | revalidator); | |
415 | } | |
416 | xpthread_create(&udpif->dispatcher, NULL, udpif_dispatcher, udpif); | |
417 | xpthread_create(&udpif->flow_dumper, NULL, udpif_flow_dumper, udpif); | |
e1ec7dd4 | 418 | } |
0f2ea848 BP |
419 | |
420 | ovsrcu_quiesce_end(); | |
e1ec7dd4 EJ |
421 | } |
422 | ||
3f142f59 BP |
423 | /* Waits for all ongoing upcall translations to complete. This ensures that |
424 | * there are no transient references to any removed ofprotos (or other | |
425 | * objects). In particular, this should be called after an ofproto is removed | |
426 | * (e.g. via xlate_remove_ofproto()) but before it is destroyed. */ | |
427 | void | |
428 | udpif_synchronize(struct udpif *udpif) | |
429 | { | |
430 | /* This is stronger than necessary. It would be sufficient to ensure | |
431 | * (somehow) that each handler and revalidator thread had passed through | |
432 | * its main loop once. */ | |
433 | size_t n_handlers = udpif->n_handlers; | |
434 | size_t n_revalidators = udpif->n_revalidators; | |
435 | udpif_set_threads(udpif, 0, 0); | |
436 | udpif_set_threads(udpif, n_handlers, n_revalidators); | |
437 | } | |
438 | ||
e1ec7dd4 EJ |
439 | /* Notifies 'udpif' that something changed which may render previous |
440 | * xlate_actions() results invalid. */ | |
441 | void | |
442 | udpif_revalidate(struct udpif *udpif) | |
443 | { | |
d7285d74 | 444 | seq_change(udpif->reval_seq); |
e79a6c83 | 445 | } |
05067881 | 446 | |
e79a6c83 EJ |
447 | /* Returns a seq which increments every time 'udpif' pulls stats from the |
448 | * datapath. Callers can use this to get a sense of when might be a good time | |
449 | * to do periodic work which relies on relatively up to date statistics. */ | |
450 | struct seq * | |
451 | udpif_dump_seq(struct udpif *udpif) | |
452 | { | |
453 | return udpif->dump_seq; | |
e1ec7dd4 EJ |
454 | } |
455 | ||
1c030aa5 EJ |
456 | void |
457 | udpif_get_memory_usage(struct udpif *udpif, struct simap *usage) | |
458 | { | |
459 | size_t i; | |
460 | ||
461 | simap_increase(usage, "dispatchers", 1); | |
462 | simap_increase(usage, "flow_dumpers", 1); | |
463 | ||
464 | simap_increase(usage, "handlers", udpif->n_handlers); | |
465 | for (i = 0; i < udpif->n_handlers; i++) { | |
466 | struct handler *handler = &udpif->handlers[i]; | |
467 | ovs_mutex_lock(&handler->mutex); | |
468 | simap_increase(usage, "handler upcalls", handler->n_upcalls); | |
469 | ovs_mutex_unlock(&handler->mutex); | |
470 | } | |
e79a6c83 EJ |
471 | |
472 | simap_increase(usage, "revalidators", udpif->n_revalidators); | |
473 | for (i = 0; i < udpif->n_revalidators; i++) { | |
474 | struct revalidator *revalidator = &udpif->revalidators[i]; | |
475 | ovs_mutex_lock(&revalidator->mutex); | |
476 | simap_increase(usage, "revalidator dumps", revalidator->n_udumps); | |
477 | ||
478 | /* XXX: This isn't technically thread safe because the revalidator | |
479 | * ukeys maps isn't protected by a mutex since it's per thread. */ | |
480 | simap_increase(usage, "revalidator keys", | |
481 | hmap_count(&revalidator->ukeys)); | |
482 | ovs_mutex_unlock(&revalidator->mutex); | |
483 | } | |
1c030aa5 EJ |
484 | } |
485 | ||
1b5b5071 | 486 | /* Remove flows from a single datapath. */ |
e79a6c83 | 487 | void |
1b5b5071 AZ |
488 | udpif_flush(struct udpif *udpif) |
489 | { | |
490 | size_t n_handlers, n_revalidators; | |
491 | ||
492 | n_handlers = udpif->n_handlers; | |
493 | n_revalidators = udpif->n_revalidators; | |
494 | ||
495 | udpif_set_threads(udpif, 0, 0); | |
496 | dpif_flow_flush(udpif->dpif); | |
497 | udpif_set_threads(udpif, n_handlers, n_revalidators); | |
498 | } | |
499 | ||
500 | /* Removes all flows from all datapaths. */ | |
501 | static void | |
502 | udpif_flush_all_datapaths(void) | |
e79a6c83 EJ |
503 | { |
504 | struct udpif *udpif; | |
505 | ||
506 | LIST_FOR_EACH (udpif, list_node, &all_udpifs) { | |
1b5b5071 | 507 | udpif_flush(udpif); |
e79a6c83 EJ |
508 | } |
509 | } | |
1b5b5071 | 510 | |
e79a6c83 | 511 | \f |
e1ec7dd4 | 512 | /* Destroys and deallocates 'upcall'. */ |
10e57640 | 513 | static void |
e1ec7dd4 EJ |
514 | upcall_destroy(struct upcall *upcall) |
515 | { | |
516 | if (upcall) { | |
da546e07 | 517 | ofpbuf_uninit(&upcall->dpif_upcall.packet); |
e1ec7dd4 EJ |
518 | ofpbuf_uninit(&upcall->upcall_buf); |
519 | free(upcall); | |
520 | } | |
521 | } | |
522 | ||
e79a6c83 | 523 | static uint64_t |
64ca9472 | 524 | udpif_get_n_flows(struct udpif *udpif) |
e1ec7dd4 | 525 | { |
64ca9472 JS |
526 | long long int time, now; |
527 | uint64_t flow_count; | |
528 | ||
529 | now = time_msec(); | |
530 | atomic_read(&udpif->n_flows_timestamp, &time); | |
531 | if (time < now - 100 && !ovs_mutex_trylock(&udpif->n_flows_mutex)) { | |
532 | struct dpif_dp_stats stats; | |
533 | ||
534 | atomic_store(&udpif->n_flows_timestamp, now); | |
535 | dpif_get_dp_stats(udpif->dpif, &stats); | |
536 | flow_count = stats.n_flows; | |
537 | atomic_store(&udpif->n_flows, flow_count); | |
538 | ovs_mutex_unlock(&udpif->n_flows_mutex); | |
539 | } else { | |
540 | atomic_read(&udpif->n_flows, &flow_count); | |
541 | } | |
542 | return flow_count; | |
e79a6c83 | 543 | } |
e1ec7dd4 | 544 | |
e79a6c83 EJ |
545 | /* The dispatcher thread is responsible for receiving upcalls from the kernel, |
546 | * assigning them to a upcall_handler thread. */ | |
547 | static void * | |
548 | udpif_dispatcher(void *arg) | |
549 | { | |
550 | struct udpif *udpif = arg; | |
05067881 | 551 | |
e79a6c83 EJ |
552 | set_subprogram_name("dispatcher"); |
553 | while (!latch_is_set(&udpif->exit_latch)) { | |
554 | recv_upcalls(udpif); | |
1954e6bb | 555 | dpif_recv_wait(udpif->dpif, 0); |
e79a6c83 EJ |
556 | latch_wait(&udpif->exit_latch); |
557 | poll_block(); | |
e1ec7dd4 | 558 | } |
05067881 BP |
559 | |
560 | return NULL; | |
e1ec7dd4 EJ |
561 | } |
562 | ||
e79a6c83 EJ |
563 | static void * |
564 | udpif_flow_dumper(void *arg) | |
e1ec7dd4 | 565 | { |
e79a6c83 | 566 | struct udpif *udpif = arg; |
ddeca9a4 | 567 | |
e79a6c83 EJ |
568 | set_subprogram_name("flow_dumper"); |
569 | while (!latch_is_set(&udpif->exit_latch)) { | |
570 | const struct dpif_flow_stats *stats; | |
571 | long long int start_time, duration; | |
572 | const struct nlattr *key, *mask; | |
573 | struct dpif_flow_dump dump; | |
574 | size_t key_len, mask_len; | |
575 | unsigned int flow_limit; | |
e79a6c83 EJ |
576 | bool need_revalidate; |
577 | uint64_t reval_seq; | |
578 | size_t n_flows, i; | |
938eaa50 | 579 | int error; |
d2ad7ef1 | 580 | void *state = NULL; |
e79a6c83 EJ |
581 | |
582 | reval_seq = seq_read(udpif->reval_seq); | |
583 | need_revalidate = udpif->last_reval_seq != reval_seq; | |
584 | udpif->last_reval_seq = reval_seq; | |
585 | ||
586 | n_flows = udpif_get_n_flows(udpif); | |
587 | udpif->max_n_flows = MAX(n_flows, udpif->max_n_flows); | |
588 | udpif->avg_n_flows = (udpif->avg_n_flows + n_flows) / 2; | |
589 | ||
e79a6c83 | 590 | start_time = time_msec(); |
938eaa50 JS |
591 | error = dpif_flow_dump_start(&dump, udpif->dpif); |
592 | if (error) { | |
593 | VLOG_INFO("Failed to start flow dump (%s)", ovs_strerror(error)); | |
594 | goto skip; | |
595 | } | |
d2ad7ef1 JS |
596 | dpif_flow_dump_state_init(udpif->dpif, &state); |
597 | while (dpif_flow_dump_next(&dump, state, &key, &key_len, | |
598 | &mask, &mask_len, NULL, NULL, &stats) | |
e79a6c83 EJ |
599 | && !latch_is_set(&udpif->exit_latch)) { |
600 | struct udpif_flow_dump *udump = xmalloc(sizeof *udump); | |
601 | struct revalidator *revalidator; | |
602 | ||
603 | udump->key_hash = hash_bytes(key, key_len, udpif->secret); | |
604 | memcpy(&udump->key_buf, key, key_len); | |
605 | udump->key = (struct nlattr *) &udump->key_buf; | |
606 | udump->key_len = key_len; | |
607 | ||
608 | memcpy(&udump->mask_buf, mask, mask_len); | |
609 | udump->mask = (struct nlattr *) &udump->mask_buf; | |
610 | udump->mask_len = mask_len; | |
611 | ||
612 | udump->stats = *stats; | |
613 | udump->need_revalidate = need_revalidate; | |
614 | ||
615 | revalidator = &udpif->revalidators[udump->key_hash | |
616 | % udpif->n_revalidators]; | |
617 | ||
618 | ovs_mutex_lock(&revalidator->mutex); | |
619 | while (revalidator->n_udumps >= REVALIDATE_MAX_BATCH * 3 | |
620 | && !latch_is_set(&udpif->exit_latch)) { | |
621 | ovs_mutex_cond_wait(&revalidator->wake_cond, | |
622 | &revalidator->mutex); | |
623 | } | |
624 | list_push_back(&revalidator->udumps, &udump->list_node); | |
625 | revalidator->n_udumps++; | |
626 | xpthread_cond_signal(&revalidator->wake_cond); | |
627 | ovs_mutex_unlock(&revalidator->mutex); | |
628 | } | |
d2ad7ef1 | 629 | dpif_flow_dump_state_uninit(udpif->dpif, state); |
e79a6c83 EJ |
630 | dpif_flow_dump_done(&dump); |
631 | ||
632 | /* Let all the revalidators finish and garbage collect. */ | |
633 | seq_change(udpif->dump_seq); | |
634 | for (i = 0; i < udpif->n_revalidators; i++) { | |
635 | struct revalidator *revalidator = &udpif->revalidators[i]; | |
636 | ovs_mutex_lock(&revalidator->mutex); | |
637 | xpthread_cond_signal(&revalidator->wake_cond); | |
638 | ovs_mutex_unlock(&revalidator->mutex); | |
639 | } | |
e1ec7dd4 | 640 | |
e79a6c83 EJ |
641 | for (i = 0; i < udpif->n_revalidators; i++) { |
642 | struct revalidator *revalidator = &udpif->revalidators[i]; | |
e1ec7dd4 | 643 | |
e79a6c83 EJ |
644 | ovs_mutex_lock(&revalidator->mutex); |
645 | while (revalidator->dump_seq != seq_read(udpif->dump_seq) | |
646 | && !latch_is_set(&udpif->exit_latch)) { | |
647 | ovs_mutex_cond_wait(&revalidator->wake_cond, | |
648 | &revalidator->mutex); | |
649 | } | |
650 | ovs_mutex_unlock(&revalidator->mutex); | |
651 | } | |
e1ec7dd4 | 652 | |
08d74a96 | 653 | duration = MAX(time_msec() - start_time, 1); |
e79a6c83 | 654 | udpif->dump_duration = duration; |
0a8763fc | 655 | atomic_read(&udpif->flow_limit, &flow_limit); |
e79a6c83 EJ |
656 | if (duration > 2000) { |
657 | flow_limit /= duration / 1000; | |
658 | } else if (duration > 1300) { | |
659 | flow_limit = flow_limit * 3 / 4; | |
660 | } else if (duration < 1000 && n_flows > 2000 | |
661 | && flow_limit < n_flows * 1000 / duration) { | |
662 | flow_limit += 1000; | |
663 | } | |
664 | flow_limit = MIN(ofproto_flow_limit, MAX(flow_limit, 1000)); | |
665 | atomic_store(&udpif->flow_limit, flow_limit); | |
e1ec7dd4 | 666 | |
e79a6c83 | 667 | if (duration > 2000) { |
f8b92eb8 | 668 | VLOG_INFO("Spent an unreasonably long %lldms dumping flows", |
e79a6c83 EJ |
669 | duration); |
670 | } | |
e1ec7dd4 | 671 | |
938eaa50 | 672 | skip: |
72310b04 | 673 | poll_timer_wait_until(start_time + MIN(ofproto_max_idle, 500)); |
e79a6c83 | 674 | seq_wait(udpif->reval_seq, udpif->last_reval_seq); |
e1ec7dd4 EJ |
675 | latch_wait(&udpif->exit_latch); |
676 | poll_block(); | |
677 | } | |
678 | ||
679 | return NULL; | |
680 | } | |
681 | ||
5f37b938 | 682 | /* The miss handler thread is responsible for processing miss upcalls retrieved |
e1ec7dd4 EJ |
683 | * by the dispatcher thread. Once finished it passes the processed miss |
684 | * upcalls to ofproto-dpif where they're installed in the datapath. */ | |
685 | static void * | |
10e57640 | 686 | udpif_upcall_handler(void *arg) |
e1ec7dd4 | 687 | { |
e1ec7dd4 EJ |
688 | struct handler *handler = arg; |
689 | ||
e22d52ee EJ |
690 | handler->name = xasprintf("handler_%u", ovsthread_id_self()); |
691 | set_subprogram_name("%s", handler->name); | |
692 | ||
61057e88 | 693 | while (!latch_is_set(&handler->udpif->exit_latch)) { |
04a19fb8 | 694 | struct list misses = LIST_INITIALIZER(&misses); |
e1ec7dd4 EJ |
695 | size_t i; |
696 | ||
697 | ovs_mutex_lock(&handler->mutex); | |
5878877a AW |
698 | /* Must check the 'exit_latch' again to make sure the main thread is |
699 | * not joining on the handler thread. */ | |
700 | if (!handler->n_upcalls | |
701 | && !latch_is_set(&handler->udpif->exit_latch)) { | |
e1ec7dd4 EJ |
702 | ovs_mutex_cond_wait(&handler->wake_cond, &handler->mutex); |
703 | } | |
704 | ||
705 | for (i = 0; i < FLOW_MISS_MAX_BATCH; i++) { | |
706 | if (handler->n_upcalls) { | |
707 | handler->n_upcalls--; | |
708 | list_push_back(&misses, list_pop_front(&handler->upcalls)); | |
709 | } else { | |
710 | break; | |
711 | } | |
712 | } | |
713 | ovs_mutex_unlock(&handler->mutex); | |
714 | ||
e79a6c83 | 715 | handle_upcalls(handler, &misses); |
de80e4b6 BP |
716 | |
717 | coverage_clear(); | |
e1ec7dd4 | 718 | } |
61057e88 BP |
719 | |
720 | return NULL; | |
e1ec7dd4 | 721 | } |
e79a6c83 EJ |
722 | |
723 | static void * | |
724 | udpif_revalidator(void *arg) | |
e1ec7dd4 | 725 | { |
e79a6c83 | 726 | struct revalidator *revalidator = arg; |
e1ec7dd4 | 727 | |
e79a6c83 EJ |
728 | revalidator->name = xasprintf("revalidator_%u", ovsthread_id_self()); |
729 | set_subprogram_name("%s", revalidator->name); | |
730 | for (;;) { | |
731 | struct list udumps = LIST_INITIALIZER(&udumps); | |
732 | struct udpif *udpif = revalidator->udpif; | |
733 | size_t i; | |
734 | ||
735 | ovs_mutex_lock(&revalidator->mutex); | |
736 | if (latch_is_set(&udpif->exit_latch)) { | |
737 | ovs_mutex_unlock(&revalidator->mutex); | |
738 | return NULL; | |
739 | } | |
740 | ||
741 | if (!revalidator->n_udumps) { | |
742 | if (revalidator->dump_seq != seq_read(udpif->dump_seq)) { | |
743 | revalidator->dump_seq = seq_read(udpif->dump_seq); | |
744 | revalidator_sweep(revalidator); | |
745 | } else { | |
746 | ovs_mutex_cond_wait(&revalidator->wake_cond, | |
747 | &revalidator->mutex); | |
748 | } | |
749 | } | |
750 | ||
751 | for (i = 0; i < REVALIDATE_MAX_BATCH && revalidator->n_udumps; i++) { | |
752 | list_push_back(&udumps, list_pop_front(&revalidator->udumps)); | |
753 | revalidator->n_udumps--; | |
754 | } | |
755 | ||
756 | /* Wake up the flow dumper. */ | |
757 | xpthread_cond_signal(&revalidator->wake_cond); | |
758 | ovs_mutex_unlock(&revalidator->mutex); | |
759 | ||
760 | if (!list_is_empty(&udumps)) { | |
761 | revalidate_udumps(revalidator, &udumps); | |
762 | } | |
763 | } | |
764 | ||
765 | return NULL; | |
766 | } | |
767 | \f | |
e1ec7dd4 EJ |
768 | static enum upcall_type |
769 | classify_upcall(const struct upcall *upcall) | |
770 | { | |
771 | const struct dpif_upcall *dpif_upcall = &upcall->dpif_upcall; | |
772 | union user_action_cookie cookie; | |
773 | size_t userdata_len; | |
774 | ||
775 | /* First look at the upcall type. */ | |
776 | switch (dpif_upcall->type) { | |
777 | case DPIF_UC_ACTION: | |
778 | break; | |
779 | ||
780 | case DPIF_UC_MISS: | |
781 | return MISS_UPCALL; | |
782 | ||
783 | case DPIF_N_UC_TYPES: | |
784 | default: | |
785 | VLOG_WARN_RL(&rl, "upcall has unexpected type %"PRIu32, | |
786 | dpif_upcall->type); | |
787 | return BAD_UPCALL; | |
788 | } | |
789 | ||
790 | /* "action" upcalls need a closer look. */ | |
791 | if (!dpif_upcall->userdata) { | |
792 | VLOG_WARN_RL(&rl, "action upcall missing cookie"); | |
793 | return BAD_UPCALL; | |
794 | } | |
795 | userdata_len = nl_attr_get_size(dpif_upcall->userdata); | |
796 | if (userdata_len < sizeof cookie.type | |
797 | || userdata_len > sizeof cookie) { | |
34582733 | 798 | VLOG_WARN_RL(&rl, "action upcall cookie has unexpected size %"PRIuSIZE, |
e1ec7dd4 EJ |
799 | userdata_len); |
800 | return BAD_UPCALL; | |
801 | } | |
802 | memset(&cookie, 0, sizeof cookie); | |
803 | memcpy(&cookie, nl_attr_get(dpif_upcall->userdata), userdata_len); | |
f5790bf6 | 804 | if (userdata_len == MAX(8, sizeof cookie.sflow) |
e1ec7dd4 EJ |
805 | && cookie.type == USER_ACTION_COOKIE_SFLOW) { |
806 | return SFLOW_UPCALL; | |
f5790bf6 | 807 | } else if (userdata_len == MAX(8, sizeof cookie.slow_path) |
e1ec7dd4 EJ |
808 | && cookie.type == USER_ACTION_COOKIE_SLOW_PATH) { |
809 | return MISS_UPCALL; | |
f5790bf6 | 810 | } else if (userdata_len == MAX(8, sizeof cookie.flow_sample) |
e1ec7dd4 EJ |
811 | && cookie.type == USER_ACTION_COOKIE_FLOW_SAMPLE) { |
812 | return FLOW_SAMPLE_UPCALL; | |
f5790bf6 | 813 | } else if (userdata_len == MAX(8, sizeof cookie.ipfix) |
e1ec7dd4 EJ |
814 | && cookie.type == USER_ACTION_COOKIE_IPFIX) { |
815 | return IPFIX_UPCALL; | |
816 | } else { | |
817 | VLOG_WARN_RL(&rl, "invalid user cookie of type %"PRIu16 | |
34582733 | 818 | " and size %"PRIuSIZE, cookie.type, userdata_len); |
e1ec7dd4 EJ |
819 | return BAD_UPCALL; |
820 | } | |
821 | } | |
822 | ||
823 | static void | |
824 | recv_upcalls(struct udpif *udpif) | |
825 | { | |
caf6491f JR |
826 | int n; |
827 | ||
e1ec7dd4 | 828 | for (;;) { |
10e57640 EJ |
829 | uint32_t hash = udpif->secret; |
830 | struct handler *handler; | |
e1ec7dd4 | 831 | struct upcall *upcall; |
10e57640 EJ |
832 | size_t n_bytes, left; |
833 | struct nlattr *nla; | |
e1ec7dd4 EJ |
834 | int error; |
835 | ||
836 | upcall = xmalloc(sizeof *upcall); | |
837 | ofpbuf_use_stub(&upcall->upcall_buf, upcall->upcall_stub, | |
838 | sizeof upcall->upcall_stub); | |
1954e6bb | 839 | error = dpif_recv(udpif->dpif, 0, &upcall->dpif_upcall, |
e1ec7dd4 EJ |
840 | &upcall->upcall_buf); |
841 | if (error) { | |
837a88dc JR |
842 | /* upcall_destroy() can only be called on successfully received |
843 | * upcalls. */ | |
844 | ofpbuf_uninit(&upcall->upcall_buf); | |
845 | free(upcall); | |
e1ec7dd4 EJ |
846 | break; |
847 | } | |
848 | ||
10e57640 EJ |
849 | n_bytes = 0; |
850 | NL_ATTR_FOR_EACH (nla, left, upcall->dpif_upcall.key, | |
851 | upcall->dpif_upcall.key_len) { | |
852 | enum ovs_key_attr type = nl_attr_type(nla); | |
853 | if (type == OVS_KEY_ATTR_IN_PORT | |
854 | || type == OVS_KEY_ATTR_TCP | |
855 | || type == OVS_KEY_ATTR_UDP) { | |
856 | if (nl_attr_get_size(nla) == 4) { | |
be58eabb | 857 | hash = mhash_add(hash, nl_attr_get_u32(nla)); |
10e57640 EJ |
858 | n_bytes += 4; |
859 | } else { | |
860 | VLOG_WARN_RL(&rl, | |
861 | "Netlink attribute with incorrect size."); | |
e1ec7dd4 EJ |
862 | } |
863 | } | |
10e57640 EJ |
864 | } |
865 | hash = mhash_finish(hash, n_bytes); | |
dfbdea46 | 866 | |
10e57640 | 867 | handler = &udpif->handlers[hash % udpif->n_handlers]; |
dfbdea46 | 868 | |
10e57640 EJ |
869 | ovs_mutex_lock(&handler->mutex); |
870 | if (handler->n_upcalls < MAX_QUEUE_LENGTH) { | |
871 | list_push_back(&handler->upcalls, &upcall->list_node); | |
9b32ece6 YT |
872 | if (handler->n_upcalls == 0) { |
873 | handler->need_signal = true; | |
874 | } | |
875 | handler->n_upcalls++; | |
876 | if (handler->need_signal && | |
877 | handler->n_upcalls >= FLOW_MISS_MAX_BATCH) { | |
878 | handler->need_signal = false; | |
10e57640 | 879 | xpthread_cond_signal(&handler->wake_cond); |
dfbdea46 | 880 | } |
10e57640 EJ |
881 | ovs_mutex_unlock(&handler->mutex); |
882 | if (!VLOG_DROP_DBG(&rl)) { | |
883 | struct ds ds = DS_EMPTY_INITIALIZER; | |
884 | ||
885 | odp_flow_key_format(upcall->dpif_upcall.key, | |
886 | upcall->dpif_upcall.key_len, | |
887 | &ds); | |
888 | VLOG_DBG("dispatcher: enqueue (%s)", ds_cstr(&ds)); | |
889 | ds_destroy(&ds); | |
e1ec7dd4 | 890 | } |
10e57640 EJ |
891 | } else { |
892 | ovs_mutex_unlock(&handler->mutex); | |
893 | COVERAGE_INC(upcall_queue_overflow); | |
894 | upcall_destroy(upcall); | |
e1ec7dd4 EJ |
895 | } |
896 | } | |
10e57640 | 897 | |
caf6491f | 898 | for (n = 0; n < udpif->n_handlers; ++n) { |
10e57640 EJ |
899 | struct handler *handler = &udpif->handlers[n]; |
900 | ||
9b32ece6 YT |
901 | if (handler->need_signal) { |
902 | handler->need_signal = false; | |
caf6491f JR |
903 | ovs_mutex_lock(&handler->mutex); |
904 | xpthread_cond_signal(&handler->wake_cond); | |
905 | ovs_mutex_unlock(&handler->mutex); | |
906 | } | |
907 | } | |
e1ec7dd4 EJ |
908 | } |
909 | ||
e79a6c83 EJ |
910 | /* Calculates slow path actions for 'xout'. 'buf' must statically be |
911 | * initialized with at least 128 bytes of space. */ | |
912 | static void | |
913 | compose_slow_path(struct udpif *udpif, struct xlate_out *xout, | |
914 | odp_port_t odp_in_port, struct ofpbuf *buf) | |
915 | { | |
916 | union user_action_cookie cookie; | |
917 | odp_port_t port; | |
918 | uint32_t pid; | |
919 | ||
920 | cookie.type = USER_ACTION_COOKIE_SLOW_PATH; | |
921 | cookie.slow_path.unused = 0; | |
922 | cookie.slow_path.reason = xout->slow; | |
923 | ||
924 | port = xout->slow & (SLOW_CFM | SLOW_BFD | SLOW_LACP | SLOW_STP) | |
925 | ? ODPP_NONE | |
926 | : odp_in_port; | |
1954e6bb | 927 | pid = dpif_port_get_pid(udpif->dpif, port, 0); |
e79a6c83 EJ |
928 | odp_put_userspace_action(pid, &cookie, sizeof cookie.slow_path, buf); |
929 | } | |
930 | ||
e1ec7dd4 EJ |
931 | static struct flow_miss * |
932 | flow_miss_find(struct hmap *todo, const struct ofproto_dpif *ofproto, | |
933 | const struct flow *flow, uint32_t hash) | |
934 | { | |
935 | struct flow_miss *miss; | |
936 | ||
937 | HMAP_FOR_EACH_WITH_HASH (miss, hmap_node, hash, todo) { | |
938 | if (miss->ofproto == ofproto && flow_equal(&miss->flow, flow)) { | |
939 | return miss; | |
940 | } | |
941 | } | |
942 | ||
943 | return NULL; | |
944 | } | |
945 | ||
e1ec7dd4 | 946 | static void |
e79a6c83 | 947 | handle_upcalls(struct handler *handler, struct list *upcalls) |
e1ec7dd4 | 948 | { |
e79a6c83 EJ |
949 | struct hmap misses = HMAP_INITIALIZER(&misses); |
950 | struct udpif *udpif = handler->udpif; | |
951 | ||
952 | struct flow_miss miss_buf[FLOW_MISS_MAX_BATCH]; | |
953 | struct dpif_op *opsp[FLOW_MISS_MAX_BATCH * 2]; | |
954 | struct dpif_op ops[FLOW_MISS_MAX_BATCH * 2]; | |
955 | struct flow_miss *miss, *next_miss; | |
e1ec7dd4 | 956 | struct upcall *upcall, *next; |
ddeca9a4 | 957 | size_t n_misses, n_ops, i; |
e79a6c83 EJ |
958 | unsigned int flow_limit; |
959 | bool fail_open, may_put; | |
10e57640 | 960 | enum upcall_type type; |
e1ec7dd4 | 961 | |
e79a6c83 EJ |
962 | atomic_read(&udpif->flow_limit, &flow_limit); |
963 | may_put = udpif_get_n_flows(udpif) < flow_limit; | |
964 | ||
965 | /* Extract the flow from each upcall. Construct in 'misses' a hash table | |
966 | * that maps each unique flow to a 'struct flow_miss'. | |
04a19fb8 BP |
967 | * |
968 | * Most commonly there is a single packet per flow_miss, but there are | |
969 | * several reasons why there might be more than one, e.g.: | |
970 | * | |
971 | * - The dpif packet interface does not support TSO (or UFO, etc.), so a | |
972 | * large packet sent to userspace is split into a sequence of smaller | |
973 | * ones. | |
e1ec7dd4 | 974 | * |
04a19fb8 BP |
975 | * - A stream of quickly arriving packets in an established "slow-pathed" |
976 | * flow. | |
977 | * | |
978 | * - Rarely, a stream of quickly arriving packets in a flow not yet | |
979 | * established. (This is rare because most protocols do not send | |
980 | * multiple back-to-back packets before receiving a reply from the | |
981 | * other end of the connection, which gives OVS a chance to set up a | |
982 | * datapath flow.) | |
983 | */ | |
ddeca9a4 | 984 | n_misses = 0; |
e1ec7dd4 EJ |
985 | LIST_FOR_EACH_SAFE (upcall, next, list_node, upcalls) { |
986 | struct dpif_upcall *dupcall = &upcall->dpif_upcall; | |
e79a6c83 | 987 | struct flow_miss *miss = &miss_buf[n_misses]; |
da546e07 | 988 | struct ofpbuf *packet = &dupcall->packet; |
e1ec7dd4 EJ |
989 | struct flow_miss *existing_miss; |
990 | struct ofproto_dpif *ofproto; | |
10e57640 EJ |
991 | struct dpif_sflow *sflow; |
992 | struct dpif_ipfix *ipfix; | |
e1ec7dd4 EJ |
993 | odp_port_t odp_in_port; |
994 | struct flow flow; | |
e1ec7dd4 EJ |
995 | int error; |
996 | ||
04a19fb8 | 997 | error = xlate_receive(udpif->backer, packet, dupcall->key, |
836fbda7 | 998 | dupcall->key_len, &flow, |
1dfdb9b3 | 999 | &ofproto, &ipfix, &sflow, NULL, &odp_in_port); |
10e57640 EJ |
1000 | if (error) { |
1001 | if (error == ENODEV) { | |
10e57640 EJ |
1002 | /* Received packet on datapath port for which we couldn't |
1003 | * associate an ofproto. This can happen if a port is removed | |
1004 | * while traffic is being received. Print a rate-limited | |
1005 | * message in case it happens frequently. Install a drop flow | |
1006 | * so that future packets of the flow are inexpensively dropped | |
1007 | * in the kernel. */ | |
1008 | VLOG_INFO_RL(&rl, "received packet on unassociated datapath " | |
1009 | "port %"PRIu32, odp_in_port); | |
e79a6c83 EJ |
1010 | dpif_flow_put(udpif->dpif, DPIF_FP_CREATE | DPIF_FP_MODIFY, |
1011 | dupcall->key, dupcall->key_len, NULL, 0, NULL, 0, | |
1012 | NULL); | |
10e57640 EJ |
1013 | } |
1014 | list_remove(&upcall->list_node); | |
1015 | upcall_destroy(upcall); | |
1016 | continue; | |
1017 | } | |
1018 | ||
1019 | type = classify_upcall(upcall); | |
1020 | if (type == MISS_UPCALL) { | |
04a19fb8 | 1021 | uint32_t hash; |
fc3431c6 | 1022 | struct pkt_metadata md = pkt_metadata_from_flow(&flow); |
04a19fb8 | 1023 | |
b5e7e61a | 1024 | flow_extract(packet, &md, &miss->flow); |
04a19fb8 | 1025 | hash = flow_hash(&miss->flow, 0); |
e79a6c83 | 1026 | existing_miss = flow_miss_find(&misses, ofproto, &miss->flow, |
04a19fb8 BP |
1027 | hash); |
1028 | if (!existing_miss) { | |
e79a6c83 | 1029 | hmap_insert(&misses, &miss->hmap_node, hash); |
04a19fb8 BP |
1030 | miss->ofproto = ofproto; |
1031 | miss->key = dupcall->key; | |
1032 | miss->key_len = dupcall->key_len; | |
1033 | miss->upcall_type = dupcall->type; | |
1034 | miss->stats.n_packets = 0; | |
1035 | miss->stats.n_bytes = 0; | |
1036 | miss->stats.used = time_msec(); | |
1037 | miss->stats.tcp_flags = 0; | |
e79a6c83 | 1038 | miss->odp_in_port = odp_in_port; |
73e141f9 | 1039 | miss->put = false; |
04a19fb8 | 1040 | |
ddeca9a4 | 1041 | n_misses++; |
e1ec7dd4 | 1042 | } else { |
04a19fb8 | 1043 | miss = existing_miss; |
e1ec7dd4 | 1044 | } |
e0eecb1c | 1045 | miss->stats.tcp_flags |= ntohs(miss->flow.tcp_flags); |
1f317cb5 | 1046 | miss->stats.n_bytes += ofpbuf_size(packet); |
04a19fb8 | 1047 | miss->stats.n_packets++; |
e1ec7dd4 | 1048 | |
04a19fb8 | 1049 | upcall->flow_miss = miss; |
10e57640 EJ |
1050 | continue; |
1051 | } | |
04a19fb8 | 1052 | |
10e57640 EJ |
1053 | switch (type) { |
1054 | case SFLOW_UPCALL: | |
10e57640 EJ |
1055 | if (sflow) { |
1056 | union user_action_cookie cookie; | |
1057 | ||
1058 | memset(&cookie, 0, sizeof cookie); | |
1059 | memcpy(&cookie, nl_attr_get(dupcall->userdata), | |
1060 | sizeof cookie.sflow); | |
da546e07 | 1061 | dpif_sflow_received(sflow, packet, &flow, odp_in_port, |
10e57640 | 1062 | &cookie); |
04a19fb8 | 1063 | } |
10e57640 EJ |
1064 | break; |
1065 | case IPFIX_UPCALL: | |
10e57640 | 1066 | if (ipfix) { |
da546e07 | 1067 | dpif_ipfix_bridge_sample(ipfix, packet, &flow); |
10e57640 EJ |
1068 | } |
1069 | break; | |
1070 | case FLOW_SAMPLE_UPCALL: | |
10e57640 EJ |
1071 | if (ipfix) { |
1072 | union user_action_cookie cookie; | |
1073 | ||
1074 | memset(&cookie, 0, sizeof cookie); | |
1075 | memcpy(&cookie, nl_attr_get(dupcall->userdata), | |
1076 | sizeof cookie.flow_sample); | |
1077 | ||
1078 | /* The flow reflects exactly the contents of the packet. | |
1079 | * Sample the packet using it. */ | |
da546e07 | 1080 | dpif_ipfix_flow_sample(ipfix, packet, &flow, |
10e57640 EJ |
1081 | cookie.flow_sample.collector_set_id, |
1082 | cookie.flow_sample.probability, | |
1083 | cookie.flow_sample.obs_domain_id, | |
1084 | cookie.flow_sample.obs_point_id); | |
10e57640 EJ |
1085 | } |
1086 | break; | |
1087 | case BAD_UPCALL: | |
1088 | break; | |
1089 | case MISS_UPCALL: | |
428b2edd | 1090 | OVS_NOT_REACHED(); |
e1ec7dd4 | 1091 | } |
10e57640 | 1092 | |
1dfdb9b3 EJ |
1093 | dpif_ipfix_unref(ipfix); |
1094 | dpif_sflow_unref(sflow); | |
1095 | ||
10e57640 EJ |
1096 | list_remove(&upcall->list_node); |
1097 | upcall_destroy(upcall); | |
e1ec7dd4 EJ |
1098 | } |
1099 | ||
04a19fb8 BP |
1100 | /* Initialize each 'struct flow_miss's ->xout. |
1101 | * | |
1102 | * We do this per-flow_miss rather than per-packet because, most commonly, | |
1103 | * all the packets in a flow can use the same translation. | |
1104 | * | |
1105 | * We can't do this in the previous loop because we need the TCP flags for | |
1106 | * all the packets in each miss. */ | |
1107 | fail_open = false; | |
e79a6c83 | 1108 | HMAP_FOR_EACH (miss, hmap_node, &misses) { |
04a19fb8 BP |
1109 | struct xlate_in xin; |
1110 | ||
10c44245 | 1111 | xlate_in_init(&xin, miss->ofproto, &miss->flow, NULL, |
04a19fb8 BP |
1112 | miss->stats.tcp_flags, NULL); |
1113 | xin.may_learn = true; | |
e79a6c83 EJ |
1114 | |
1115 | if (miss->upcall_type == DPIF_UC_MISS) { | |
1116 | xin.resubmit_stats = &miss->stats; | |
1117 | } else { | |
1118 | /* For non-miss upcalls, there's a flow in the datapath which this | |
1119 | * packet was accounted to. Presumably the revalidators will deal | |
1120 | * with pushing its stats eventually. */ | |
1121 | } | |
1122 | ||
04a19fb8 | 1123 | xlate_actions(&xin, &miss->xout); |
10c44245 | 1124 | fail_open = fail_open || miss->xout.fail_open; |
04a19fb8 BP |
1125 | } |
1126 | ||
1127 | /* Now handle the packets individually in order of arrival. In the common | |
1128 | * case each packet of a miss can share the same actions, but slow-pathed | |
1129 | * packets need to be translated individually: | |
1130 | * | |
1131 | * - For SLOW_CFM, SLOW_LACP, SLOW_STP, and SLOW_BFD, translation is what | |
1132 | * processes received packets for these protocols. | |
1133 | * | |
1134 | * - For SLOW_CONTROLLER, translation sends the packet to the OpenFlow | |
1135 | * controller. | |
1136 | * | |
1137 | * The loop fills 'ops' with an array of operations to execute in the | |
1138 | * datapath. */ | |
1139 | n_ops = 0; | |
1140 | LIST_FOR_EACH (upcall, list_node, upcalls) { | |
1141 | struct flow_miss *miss = upcall->flow_miss; | |
da546e07 | 1142 | struct ofpbuf *packet = &upcall->dpif_upcall.packet; |
e79a6c83 | 1143 | struct dpif_op *op; |
d02c42bf AZ |
1144 | ovs_be16 flow_vlan_tci; |
1145 | ||
1146 | /* Save a copy of flow.vlan_tci in case it is changed to | |
1147 | * generate proper mega flow masks for VLAN splinter flows. */ | |
1148 | flow_vlan_tci = miss->flow.vlan_tci; | |
04a19fb8 BP |
1149 | |
1150 | if (miss->xout.slow) { | |
04a19fb8 BP |
1151 | struct xlate_in xin; |
1152 | ||
10c44245 | 1153 | xlate_in_init(&xin, miss->ofproto, &miss->flow, NULL, 0, packet); |
04a19fb8 | 1154 | xlate_actions_for_side_effects(&xin); |
04a19fb8 BP |
1155 | } |
1156 | ||
d02c42bf AZ |
1157 | if (miss->flow.in_port.ofp_port |
1158 | != vsp_realdev_to_vlandev(miss->ofproto, | |
1159 | miss->flow.in_port.ofp_port, | |
1160 | miss->flow.vlan_tci)) { | |
1161 | /* This packet was received on a VLAN splinter port. We | |
1162 | * added a VLAN to the packet to make the packet resemble | |
1163 | * the flow, but the actions were composed assuming that | |
1164 | * the packet contained no VLAN. So, we must remove the | |
1165 | * VLAN header from the packet before trying to execute the | |
1166 | * actions. */ | |
1f317cb5 | 1167 | if (ofpbuf_size(&miss->xout.odp_actions)) { |
d02c42bf AZ |
1168 | eth_pop_vlan(packet); |
1169 | } | |
1170 | ||
1171 | /* Remove the flow vlan tags inserted by vlan splinter logic | |
1172 | * to ensure megaflow masks generated match the data path flow. */ | |
1173 | miss->flow.vlan_tci = 0; | |
e79a6c83 | 1174 | } |
04a19fb8 | 1175 | |
73e141f9 BP |
1176 | /* Do not install a flow into the datapath if: |
1177 | * | |
1178 | * - The datapath already has too many flows. | |
1179 | * | |
1180 | * - An earlier iteration of this loop already put the same flow. | |
1181 | * | |
1182 | * - We received this packet via some flow installed in the kernel | |
1183 | * already. */ | |
1184 | if (may_put | |
1185 | && !miss->put | |
1186 | && upcall->dpif_upcall.type == DPIF_UC_MISS) { | |
d02c42bf AZ |
1187 | struct ofpbuf mask; |
1188 | bool megaflow; | |
1189 | ||
73e141f9 BP |
1190 | miss->put = true; |
1191 | ||
d02c42bf AZ |
1192 | atomic_read(&enable_megaflows, &megaflow); |
1193 | ofpbuf_use_stack(&mask, &miss->mask_buf, sizeof miss->mask_buf); | |
1194 | if (megaflow) { | |
8bfd0fda BP |
1195 | size_t max_mpls; |
1196 | ||
1197 | max_mpls = ofproto_dpif_get_max_mpls_depth(miss->ofproto); | |
d02c42bf | 1198 | odp_flow_key_from_mask(&mask, &miss->xout.wc.masks, |
8bfd0fda | 1199 | &miss->flow, UINT32_MAX, max_mpls); |
d02c42bf AZ |
1200 | } |
1201 | ||
e79a6c83 EJ |
1202 | op = &ops[n_ops++]; |
1203 | op->type = DPIF_OP_FLOW_PUT; | |
1204 | op->u.flow_put.flags = DPIF_FP_CREATE | DPIF_FP_MODIFY; | |
1205 | op->u.flow_put.key = miss->key; | |
1206 | op->u.flow_put.key_len = miss->key_len; | |
1f317cb5 PS |
1207 | op->u.flow_put.mask = ofpbuf_data(&mask); |
1208 | op->u.flow_put.mask_len = ofpbuf_size(&mask); | |
e79a6c83 EJ |
1209 | op->u.flow_put.stats = NULL; |
1210 | ||
1211 | if (!miss->xout.slow) { | |
1f317cb5 PS |
1212 | op->u.flow_put.actions = ofpbuf_data(&miss->xout.odp_actions); |
1213 | op->u.flow_put.actions_len = ofpbuf_size(&miss->xout.odp_actions); | |
e79a6c83 EJ |
1214 | } else { |
1215 | struct ofpbuf buf; | |
1216 | ||
1217 | ofpbuf_use_stack(&buf, miss->slow_path_buf, | |
1218 | sizeof miss->slow_path_buf); | |
1219 | compose_slow_path(udpif, &miss->xout, miss->odp_in_port, &buf); | |
1f317cb5 PS |
1220 | op->u.flow_put.actions = ofpbuf_data(&buf); |
1221 | op->u.flow_put.actions_len = ofpbuf_size(&buf); | |
e79a6c83 EJ |
1222 | } |
1223 | } | |
1224 | ||
d02c42bf AZ |
1225 | /* |
1226 | * The 'miss' may be shared by multiple upcalls. Restore | |
1227 | * the saved flow vlan_tci field before processing the next | |
1228 | * upcall. */ | |
1229 | miss->flow.vlan_tci = flow_vlan_tci; | |
1230 | ||
1f317cb5 | 1231 | if (ofpbuf_size(&miss->xout.odp_actions)) { |
04a19fb8 BP |
1232 | |
1233 | op = &ops[n_ops++]; | |
1234 | op->type = DPIF_OP_EXECUTE; | |
04a19fb8 | 1235 | op->u.execute.packet = packet; |
758c456d JR |
1236 | odp_key_to_pkt_metadata(miss->key, miss->key_len, |
1237 | &op->u.execute.md); | |
1f317cb5 PS |
1238 | op->u.execute.actions = ofpbuf_data(&miss->xout.odp_actions); |
1239 | op->u.execute.actions_len = ofpbuf_size(&miss->xout.odp_actions); | |
7fd91025 | 1240 | op->u.execute.needs_help = (miss->xout.slow & SLOW_ACTION) != 0; |
04a19fb8 | 1241 | } |
e1ec7dd4 | 1242 | } |
e1ec7dd4 | 1243 | |
04a19fb8 BP |
1244 | /* Special case for fail-open mode. |
1245 | * | |
1246 | * If we are in fail-open mode, but we are connected to a controller too, | |
1247 | * then we should send the packet up to the controller in the hope that it | |
1248 | * will try to set up a flow and thereby allow us to exit fail-open. | |
1249 | * | |
da546e07 JR |
1250 | * See the top-level comment in fail-open.c for more information. |
1251 | * | |
1252 | * Copy packets before they are modified by execution. */ | |
04a19fb8 BP |
1253 | if (fail_open) { |
1254 | LIST_FOR_EACH (upcall, list_node, upcalls) { | |
1255 | struct flow_miss *miss = upcall->flow_miss; | |
da546e07 | 1256 | struct ofpbuf *packet = &upcall->dpif_upcall.packet; |
0fb7792a | 1257 | struct ofproto_packet_in *pin; |
04a19fb8 BP |
1258 | |
1259 | pin = xmalloc(sizeof *pin); | |
1f317cb5 PS |
1260 | pin->up.packet = xmemdup(ofpbuf_data(packet), ofpbuf_size(packet)); |
1261 | pin->up.packet_len = ofpbuf_size(packet); | |
0fb7792a | 1262 | pin->up.reason = OFPR_NO_MATCH; |
0fb7792a | 1263 | pin->up.table_id = 0; |
d4fa4e79 | 1264 | pin->up.cookie = OVS_BE64_MAX; |
0fb7792a | 1265 | flow_get_metadata(&miss->flow, &pin->up.fmd); |
d38a3c7b | 1266 | pin->send_len = 0; /* Not used for flow table misses. */ |
32260212 | 1267 | pin->miss_type = OFPROTO_PACKET_IN_NO_MISS; |
04a19fb8 BP |
1268 | ofproto_dpif_send_packet_in(miss->ofproto, pin); |
1269 | } | |
1270 | } | |
1271 | ||
da546e07 JR |
1272 | /* Execute batch. */ |
1273 | for (i = 0; i < n_ops; i++) { | |
1274 | opsp[i] = &ops[i]; | |
1275 | } | |
1276 | dpif_operate(udpif->dpif, opsp, n_ops); | |
1277 | ||
e79a6c83 EJ |
1278 | HMAP_FOR_EACH_SAFE (miss, next_miss, hmap_node, &misses) { |
1279 | hmap_remove(&misses, &miss->hmap_node); | |
1280 | xlate_out_uninit(&miss->xout); | |
1281 | } | |
1282 | hmap_destroy(&misses); | |
1283 | ||
1284 | LIST_FOR_EACH_SAFE (upcall, next, list_node, upcalls) { | |
1285 | list_remove(&upcall->list_node); | |
1286 | upcall_destroy(upcall); | |
1287 | } | |
1288 | } | |
1289 | ||
1290 | static struct udpif_key * | |
1291 | ukey_lookup(struct revalidator *revalidator, struct udpif_flow_dump *udump) | |
1292 | { | |
1293 | struct udpif_key *ukey; | |
1294 | ||
1295 | HMAP_FOR_EACH_WITH_HASH (ukey, hmap_node, udump->key_hash, | |
1296 | &revalidator->ukeys) { | |
1297 | if (ukey->key_len == udump->key_len | |
1298 | && !memcmp(ukey->key, udump->key, udump->key_len)) { | |
1299 | return ukey; | |
1300 | } | |
1301 | } | |
1302 | return NULL; | |
1303 | } | |
1304 | ||
13bb6ed0 JS |
1305 | static struct udpif_key * |
1306 | ukey_create(const struct nlattr *key, size_t key_len, long long int used) | |
1307 | { | |
1308 | struct udpif_key *ukey = xmalloc(sizeof *ukey); | |
1309 | ||
1310 | ukey->key = (struct nlattr *) &ukey->key_buf; | |
1311 | memcpy(&ukey->key_buf, key, key_len); | |
1312 | ukey->key_len = key_len; | |
1313 | ||
1314 | ukey->mark = false; | |
1315 | ukey->created = used ? used : time_msec(); | |
1316 | memset(&ukey->stats, 0, sizeof ukey->stats); | |
1317 | ||
1318 | return ukey; | |
1319 | } | |
1320 | ||
e79a6c83 EJ |
1321 | static void |
1322 | ukey_delete(struct revalidator *revalidator, struct udpif_key *ukey) | |
1323 | { | |
1324 | hmap_remove(&revalidator->ukeys, &ukey->hmap_node); | |
1325 | free(ukey); | |
1326 | } | |
1327 | ||
1328 | static bool | |
1329 | revalidate_ukey(struct udpif *udpif, struct udpif_flow_dump *udump, | |
1330 | struct udpif_key *ukey) | |
1331 | { | |
1332 | struct ofpbuf xout_actions, *actions; | |
1333 | uint64_t slow_path_buf[128 / 8]; | |
1334 | struct xlate_out xout, *xoutp; | |
1335 | struct flow flow, udump_mask; | |
1336 | struct ofproto_dpif *ofproto; | |
1337 | struct dpif_flow_stats push; | |
1338 | uint32_t *udump32, *xout32; | |
1339 | odp_port_t odp_in_port; | |
1340 | struct xlate_in xin; | |
1341 | int error; | |
1342 | size_t i; | |
1343 | bool ok; | |
1344 | ||
1345 | ok = false; | |
1346 | xoutp = NULL; | |
1347 | actions = NULL; | |
1348 | ||
1349 | /* If we don't need to revalidate, we can simply push the stats contained | |
1350 | * in the udump, otherwise we'll have to get the actions so we can check | |
1351 | * them. */ | |
1352 | if (udump->need_revalidate) { | |
1353 | if (dpif_flow_get(udpif->dpif, ukey->key, ukey->key_len, &actions, | |
1354 | &udump->stats)) { | |
1355 | goto exit; | |
1356 | } | |
1357 | } | |
1358 | ||
1359 | push.used = udump->stats.used; | |
1360 | push.tcp_flags = udump->stats.tcp_flags; | |
1361 | push.n_packets = udump->stats.n_packets > ukey->stats.n_packets | |
1362 | ? udump->stats.n_packets - ukey->stats.n_packets | |
1363 | : 0; | |
1364 | push.n_bytes = udump->stats.n_bytes > ukey->stats.n_bytes | |
1365 | ? udump->stats.n_bytes - ukey->stats.n_bytes | |
1366 | : 0; | |
1367 | ukey->stats = udump->stats; | |
1368 | ||
1369 | if (!push.n_packets && !udump->need_revalidate) { | |
1370 | ok = true; | |
1371 | goto exit; | |
1372 | } | |
1373 | ||
1374 | error = xlate_receive(udpif->backer, NULL, ukey->key, ukey->key_len, &flow, | |
836fbda7 | 1375 | &ofproto, NULL, NULL, NULL, &odp_in_port); |
e79a6c83 EJ |
1376 | if (error) { |
1377 | goto exit; | |
1378 | } | |
1379 | ||
1380 | xlate_in_init(&xin, ofproto, &flow, NULL, push.tcp_flags, NULL); | |
1381 | xin.resubmit_stats = push.n_packets ? &push : NULL; | |
1382 | xin.may_learn = push.n_packets > 0; | |
1383 | xin.skip_wildcards = !udump->need_revalidate; | |
1384 | xlate_actions(&xin, &xout); | |
1385 | xoutp = &xout; | |
ddeca9a4 | 1386 | |
e79a6c83 EJ |
1387 | if (!udump->need_revalidate) { |
1388 | ok = true; | |
1389 | goto exit; | |
1390 | } | |
1391 | ||
1392 | if (!xout.slow) { | |
1f317cb5 PS |
1393 | ofpbuf_use_const(&xout_actions, ofpbuf_data(&xout.odp_actions), |
1394 | ofpbuf_size(&xout.odp_actions)); | |
05067881 | 1395 | } else { |
e79a6c83 EJ |
1396 | ofpbuf_use_stack(&xout_actions, slow_path_buf, sizeof slow_path_buf); |
1397 | compose_slow_path(udpif, &xout, odp_in_port, &xout_actions); | |
1398 | } | |
1399 | ||
1400 | if (!ofpbuf_equal(&xout_actions, actions)) { | |
1401 | goto exit; | |
1402 | } | |
1403 | ||
1404 | if (odp_flow_key_to_mask(udump->mask, udump->mask_len, &udump_mask, &flow) | |
1405 | == ODP_FIT_ERROR) { | |
1406 | goto exit; | |
1407 | } | |
1408 | ||
1409 | /* Since the kernel is free to ignore wildcarded bits in the mask, we can't | |
1410 | * directly check that the masks are the same. Instead we check that the | |
1411 | * mask in the kernel is more specific i.e. less wildcarded, than what | |
1412 | * we've calculated here. This guarantees we don't catch any packets we | |
1413 | * shouldn't with the megaflow. */ | |
1414 | udump32 = (uint32_t *) &udump_mask; | |
1415 | xout32 = (uint32_t *) &xout.wc.masks; | |
1416 | for (i = 0; i < FLOW_U32S; i++) { | |
1417 | if ((udump32[i] | xout32[i]) != udump32[i]) { | |
1418 | goto exit; | |
1419 | } | |
1420 | } | |
1421 | ok = true; | |
1422 | ||
1423 | exit: | |
1424 | ofpbuf_delete(actions); | |
1425 | xlate_out_uninit(xoutp); | |
1426 | return ok; | |
1427 | } | |
1428 | ||
13bb6ed0 JS |
1429 | struct dump_op { |
1430 | struct udpif_key *ukey; | |
1431 | struct udpif_flow_dump *udump; | |
1432 | struct dpif_flow_stats stats; /* Stats for 'op'. */ | |
1433 | struct dpif_op op; /* Flow del operation. */ | |
1434 | }; | |
1435 | ||
e79a6c83 | 1436 | static void |
13bb6ed0 JS |
1437 | dump_op_init(struct dump_op *op, const struct nlattr *key, size_t key_len, |
1438 | struct udpif_key *ukey, struct udpif_flow_dump *udump) | |
1439 | { | |
1440 | op->ukey = ukey; | |
1441 | op->udump = udump; | |
1442 | op->op.type = DPIF_OP_FLOW_DEL; | |
1443 | op->op.u.flow_del.key = key; | |
1444 | op->op.u.flow_del.key_len = key_len; | |
1445 | op->op.u.flow_del.stats = &op->stats; | |
1446 | } | |
1447 | ||
1448 | static void | |
1449 | push_dump_ops(struct revalidator *revalidator, | |
1450 | struct dump_op *ops, size_t n_ops) | |
e79a6c83 EJ |
1451 | { |
1452 | struct udpif *udpif = revalidator->udpif; | |
13bb6ed0 JS |
1453 | struct dpif_op *opsp[REVALIDATE_MAX_BATCH]; |
1454 | size_t i; | |
e79a6c83 | 1455 | |
13bb6ed0 JS |
1456 | ovs_assert(n_ops <= REVALIDATE_MAX_BATCH); |
1457 | for (i = 0; i < n_ops; i++) { | |
1458 | opsp[i] = &ops[i].op; | |
1459 | } | |
1460 | dpif_operate(udpif->dpif, opsp, n_ops); | |
1461 | ||
1462 | for (i = 0; i < n_ops; i++) { | |
1463 | struct dump_op *op = &ops[i]; | |
1464 | struct dpif_flow_stats *push, *stats, push_buf; | |
1465 | ||
1466 | stats = op->op.u.flow_del.stats; | |
1467 | if (op->ukey) { | |
1468 | push = &push_buf; | |
1469 | push->used = MAX(stats->used, op->ukey->stats.used); | |
1470 | push->tcp_flags = stats->tcp_flags | op->ukey->stats.tcp_flags; | |
1471 | push->n_packets = stats->n_packets - op->ukey->stats.n_packets; | |
1472 | push->n_bytes = stats->n_bytes - op->ukey->stats.n_bytes; | |
1473 | } else { | |
1474 | push = stats; | |
1475 | } | |
1476 | ||
1477 | if (push->n_packets || netflow_exists()) { | |
1478 | struct ofproto_dpif *ofproto; | |
1479 | struct netflow *netflow; | |
1480 | struct flow flow; | |
1481 | ||
1482 | if (!xlate_receive(udpif->backer, NULL, op->op.u.flow_del.key, | |
1483 | op->op.u.flow_del.key_len, &flow, &ofproto, | |
1484 | NULL, NULL, &netflow, NULL)) { | |
1485 | struct xlate_in xin; | |
1486 | ||
1487 | xlate_in_init(&xin, ofproto, &flow, NULL, push->tcp_flags, | |
1488 | NULL); | |
1489 | xin.resubmit_stats = push->n_packets ? push : NULL; | |
1490 | xin.may_learn = push->n_packets > 0; | |
1491 | xin.skip_wildcards = true; | |
1492 | xlate_actions_for_side_effects(&xin); | |
1493 | ||
1494 | if (netflow) { | |
1495 | netflow_expire(netflow, &flow); | |
1496 | netflow_flow_clear(netflow, &flow); | |
1497 | netflow_unref(netflow); | |
1498 | } | |
1499 | } | |
1500 | } | |
1501 | } | |
1502 | ||
1503 | for (i = 0; i < n_ops; i++) { | |
66fb5cc7 | 1504 | struct udpif_key *ukey; |
13bb6ed0 | 1505 | |
66fb5cc7 BP |
1506 | /* If there's a udump, this ukey came directly from a datapath flow |
1507 | * dump. Sometimes a datapath can send duplicates in flow dumps, in | |
1508 | * which case we wouldn't want to double-free a ukey, so avoid that by | |
1509 | * looking up the ukey again. | |
1510 | * | |
1511 | * If there's no udump then we know what we're doing. */ | |
1512 | ukey = (ops[i].udump | |
1513 | ? ukey_lookup(revalidator, ops[i].udump) | |
1514 | : ops[i].ukey); | |
13bb6ed0 JS |
1515 | if (ukey) { |
1516 | ukey_delete(revalidator, ukey); | |
1517 | } | |
1518 | } | |
1519 | } | |
1520 | ||
1521 | static void | |
1522 | revalidate_udumps(struct revalidator *revalidator, struct list *udumps) | |
1523 | { | |
1524 | struct udpif *udpif = revalidator->udpif; | |
e79a6c83 | 1525 | |
ad3415c0 | 1526 | struct dump_op ops[REVALIDATE_MAX_BATCH]; |
e79a6c83 | 1527 | struct udpif_flow_dump *udump, *next_udump; |
13bb6ed0 | 1528 | size_t n_ops, n_flows; |
e79a6c83 EJ |
1529 | unsigned int flow_limit; |
1530 | long long int max_idle; | |
1531 | bool must_del; | |
1532 | ||
e79a6c83 EJ |
1533 | atomic_read(&udpif->flow_limit, &flow_limit); |
1534 | ||
1535 | n_flows = udpif_get_n_flows(udpif); | |
1536 | ||
1537 | must_del = false; | |
72310b04 | 1538 | max_idle = ofproto_max_idle; |
e79a6c83 EJ |
1539 | if (n_flows > flow_limit) { |
1540 | must_del = n_flows > 2 * flow_limit; | |
1541 | max_idle = 100; | |
1542 | } | |
1543 | ||
1544 | n_ops = 0; | |
1545 | LIST_FOR_EACH_SAFE (udump, next_udump, list_node, udumps) { | |
1546 | long long int used, now; | |
1547 | struct udpif_key *ukey; | |
1548 | ||
1549 | now = time_msec(); | |
1550 | ukey = ukey_lookup(revalidator, udump); | |
1551 | ||
1552 | used = udump->stats.used; | |
1553 | if (!used && ukey) { | |
1554 | used = ukey->created; | |
1555 | } | |
1556 | ||
1557 | if (must_del || (used && used < now - max_idle)) { | |
ad3415c0 | 1558 | struct dump_op *dop = &ops[n_ops++]; |
e79a6c83 | 1559 | |
13bb6ed0 | 1560 | dump_op_init(dop, udump->key, udump->key_len, ukey, udump); |
e79a6c83 EJ |
1561 | continue; |
1562 | } | |
1563 | ||
1564 | if (!ukey) { | |
13bb6ed0 | 1565 | ukey = ukey_create(udump->key, udump->key_len, used); |
e79a6c83 EJ |
1566 | hmap_insert(&revalidator->ukeys, &ukey->hmap_node, |
1567 | udump->key_hash); | |
1568 | } | |
1569 | ukey->mark = true; | |
1570 | ||
1571 | if (!revalidate_ukey(udpif, udump, ukey)) { | |
1572 | dpif_flow_del(udpif->dpif, udump->key, udump->key_len, NULL); | |
1573 | ukey_delete(revalidator, ukey); | |
1574 | } | |
1575 | ||
1576 | list_remove(&udump->list_node); | |
1577 | free(udump); | |
1578 | } | |
1579 | ||
13bb6ed0 | 1580 | push_dump_ops(revalidator, ops, n_ops); |
ad3415c0 | 1581 | |
e79a6c83 EJ |
1582 | LIST_FOR_EACH_SAFE (udump, next_udump, list_node, udumps) { |
1583 | list_remove(&udump->list_node); | |
1584 | free(udump); | |
1585 | } | |
1586 | } | |
1587 | ||
1588 | static void | |
e96a5c24 | 1589 | revalidator_sweep__(struct revalidator *revalidator, bool purge) |
e79a6c83 | 1590 | { |
e4b79342 | 1591 | struct dump_op ops[REVALIDATE_MAX_BATCH]; |
e79a6c83 | 1592 | struct udpif_key *ukey, *next; |
e4b79342 JS |
1593 | size_t n_ops; |
1594 | ||
1595 | n_ops = 0; | |
e79a6c83 EJ |
1596 | |
1597 | HMAP_FOR_EACH_SAFE (ukey, next, hmap_node, &revalidator->ukeys) { | |
e96a5c24 | 1598 | if (!purge && ukey->mark) { |
e79a6c83 EJ |
1599 | ukey->mark = false; |
1600 | } else { | |
e4b79342 JS |
1601 | struct dump_op *op = &ops[n_ops++]; |
1602 | ||
1603 | /* If we have previously seen a flow in the datapath, but didn't | |
1604 | * see it during the most recent dump, delete it. This allows us | |
1605 | * to clean up the ukey and keep the statistics consistent. */ | |
1606 | dump_op_init(op, ukey->key, ukey->key_len, ukey, NULL); | |
1607 | if (n_ops == REVALIDATE_MAX_BATCH) { | |
1608 | push_dump_ops(revalidator, ops, n_ops); | |
1609 | n_ops = 0; | |
1610 | } | |
e79a6c83 | 1611 | } |
e1ec7dd4 | 1612 | } |
e4b79342 JS |
1613 | |
1614 | if (n_ops) { | |
1615 | push_dump_ops(revalidator, ops, n_ops); | |
1616 | } | |
e1ec7dd4 | 1617 | } |
e96a5c24 JS |
1618 | |
/* Deletes datapath flows (and their ukeys) that were not marked as seen
 * during the most recent flow dump. */
static void
revalidator_sweep(struct revalidator *revalidator)
{
    revalidator_sweep__(revalidator, false);
}
1624 | ||
/* Unconditionally deletes all of 'revalidator''s ukeys and their datapath
 * flows, regardless of whether they appeared in the latest dump. */
static void
revalidator_purge(struct revalidator *revalidator)
{
    revalidator_sweep__(revalidator, true);
}
e22d52ee EJ |
1630 | \f |
1631 | static void | |
1632 | upcall_unixctl_show(struct unixctl_conn *conn, int argc OVS_UNUSED, | |
1633 | const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED) | |
1634 | { | |
1635 | struct ds ds = DS_EMPTY_INITIALIZER; | |
1636 | struct udpif *udpif; | |
1637 | ||
1638 | LIST_FOR_EACH (udpif, list_node, &all_udpifs) { | |
e79a6c83 | 1639 | unsigned int flow_limit; |
e22d52ee EJ |
1640 | size_t i; |
1641 | ||
e79a6c83 | 1642 | atomic_read(&udpif->flow_limit, &flow_limit); |
e79a6c83 | 1643 | |
e22d52ee | 1644 | ds_put_format(&ds, "%s:\n", dpif_name(udpif->dpif)); |
e79a6c83 EJ |
1645 | ds_put_format(&ds, "\tflows : (current %"PRIu64")" |
1646 | " (avg %u) (max %u) (limit %u)\n", udpif_get_n_flows(udpif), | |
1647 | udpif->avg_n_flows, udpif->max_n_flows, flow_limit); | |
e79a6c83 EJ |
1648 | ds_put_format(&ds, "\tdump duration : %lldms\n", udpif->dump_duration); |
1649 | ||
1650 | ds_put_char(&ds, '\n'); | |
e22d52ee EJ |
1651 | for (i = 0; i < udpif->n_handlers; i++) { |
1652 | struct handler *handler = &udpif->handlers[i]; | |
1653 | ||
1654 | ovs_mutex_lock(&handler->mutex); | |
1655 | ds_put_format(&ds, "\t%s: (upcall queue %"PRIuSIZE")\n", | |
1656 | handler->name, handler->n_upcalls); | |
1657 | ovs_mutex_unlock(&handler->mutex); | |
1658 | } | |
e79a6c83 EJ |
1659 | |
1660 | ds_put_char(&ds, '\n'); | |
1661 | for (i = 0; i < n_revalidators; i++) { | |
1662 | struct revalidator *revalidator = &udpif->revalidators[i]; | |
1663 | ||
1664 | /* XXX: The result of hmap_count(&revalidator->ukeys) may not be | |
1665 | * accurate because it's not protected by the revalidator mutex. */ | |
1666 | ovs_mutex_lock(&revalidator->mutex); | |
1667 | ds_put_format(&ds, "\t%s: (dump queue %"PRIuSIZE") (keys %"PRIuSIZE | |
1668 | ")\n", revalidator->name, revalidator->n_udumps, | |
1669 | hmap_count(&revalidator->ukeys)); | |
1670 | ovs_mutex_unlock(&revalidator->mutex); | |
1671 | } | |
e22d52ee EJ |
1672 | } |
1673 | ||
1674 | unixctl_command_reply(conn, ds_cstr(&ds)); | |
1675 | ds_destroy(&ds); | |
1676 | } | |
e79a6c83 EJ |
1677 | |
1678 | /* Disable using the megaflows. | |
1679 | * | |
1680 | * This command is only needed for advanced debugging, so it's not | |
1681 | * documented in the man page. */ | |
1682 | static void | |
1683 | upcall_unixctl_disable_megaflows(struct unixctl_conn *conn, | |
1684 | int argc OVS_UNUSED, | |
1685 | const char *argv[] OVS_UNUSED, | |
1686 | void *aux OVS_UNUSED) | |
1687 | { | |
1688 | atomic_store(&enable_megaflows, false); | |
1b5b5071 | 1689 | udpif_flush_all_datapaths(); |
e79a6c83 EJ |
1690 | unixctl_command_reply(conn, "megaflows disabled"); |
1691 | } | |
1692 | ||
1693 | /* Re-enable using megaflows. | |
1694 | * | |
1695 | * This command is only needed for advanced debugging, so it's not | |
1696 | * documented in the man page. */ | |
1697 | static void | |
1698 | upcall_unixctl_enable_megaflows(struct unixctl_conn *conn, | |
1699 | int argc OVS_UNUSED, | |
1700 | const char *argv[] OVS_UNUSED, | |
1701 | void *aux OVS_UNUSED) | |
1702 | { | |
1703 | atomic_store(&enable_megaflows, true); | |
1b5b5071 | 1704 | udpif_flush_all_datapaths(); |
e79a6c83 EJ |
1705 | unixctl_command_reply(conn, "megaflows enabled"); |
1706 | } | |
94b8c324 JS |
1707 | |
1708 | /* Set the flow limit. | |
1709 | * | |
1710 | * This command is only needed for advanced debugging, so it's not | |
1711 | * documented in the man page. */ | |
1712 | static void | |
1713 | upcall_unixctl_set_flow_limit(struct unixctl_conn *conn, | |
1714 | int argc OVS_UNUSED, | |
1715 | const char *argv[] OVS_UNUSED, | |
1716 | void *aux OVS_UNUSED) | |
1717 | { | |
1718 | struct ds ds = DS_EMPTY_INITIALIZER; | |
1719 | struct udpif *udpif; | |
1720 | unsigned int flow_limit = atoi(argv[1]); | |
1721 | ||
1722 | LIST_FOR_EACH (udpif, list_node, &all_udpifs) { | |
1723 | atomic_store(&udpif->flow_limit, flow_limit); | |
1724 | } | |
1725 | ds_put_format(&ds, "set flow_limit to %u\n", flow_limit); | |
1726 | unixctl_command_reply(conn, ds_cstr(&ds)); | |
1727 | ds_destroy(&ds); | |
1728 | } |