2 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
24 #include <netinet/in.h>
25 #include <sys/socket.h>
30 #include <sys/ioctl.h>
34 #include "classifier.h"
37 #include "dpif-provider.h"
39 #include "dynamic-string.h"
43 #include "meta-flow.h"
45 #include "netdev-vport.h"
47 #include "odp-execute.h"
49 #include "ofp-print.h"
52 #include "poll-loop.h"
62 VLOG_DEFINE_THIS_MODULE(dpif_netdev
);
64 /* By default, choose a priority in the middle. */
65 #define NETDEV_RULE_PRIORITY 0x8000
67 /* Configuration parameters. */
68 enum { MAX_PORTS
= 256 }; /* Maximum number of ports. */
69 enum { MAX_FLOWS
= 65536 }; /* Maximum number of flows in flow table. */
71 /* Enough headroom to add a vlan tag, plus an extra 2 bytes to allow IP
72 * headers to be aligned on a 4-byte boundary. */
73 enum { DP_NETDEV_HEADROOM
= 2 + VLAN_HEADER_LEN
};
76 enum { N_QUEUES
= 2 }; /* Number of queues for dpif_recv(). */
77 enum { MAX_QUEUE_LEN
= 128 }; /* Maximum number of packets per queue. */
78 enum { QUEUE_MASK
= MAX_QUEUE_LEN
- 1 };
79 BUILD_ASSERT_DECL(IS_POW2(MAX_QUEUE_LEN
));
81 struct dp_netdev_upcall
{
82 struct dpif_upcall upcall
; /* Queued upcall information. */
83 struct ofpbuf buf
; /* ofpbuf instance for upcall.packet. */
86 struct dp_netdev_queue
{
87 struct dp_netdev_upcall upcalls
[MAX_QUEUE_LEN
];
88 unsigned int head
, tail
;
91 /* Datapath based on the network device interface from netdev.h. */
93 const struct dpif_class
*class;
97 int max_mtu
; /* Maximum MTU of any port added so far. */
99 struct dp_netdev_queue queues
[N_QUEUES
];
100 struct classifier cls
; /* Classifier. */
101 struct hmap flow_table
; /* Flow table. */
102 struct seq
*queue_seq
; /* Incremented whenever a packet is queued. */
105 long long int n_hit
; /* Number of flow table matches. */
106 long long int n_missed
; /* Number of flow table misses. */
107 long long int n_lost
; /* Number of misses not passed to client. */
110 struct dp_netdev_port
*ports
[MAX_PORTS
];
111 struct list port_list
;
112 struct seq
*port_seq
; /* Incremented whenever a port changes. */
115 /* A port in a netdev-based datapath. */
116 struct dp_netdev_port
{
117 odp_port_t port_no
; /* Index into dp_netdev's 'ports'. */
118 struct list node
; /* Element in dp_netdev's 'port_list'. */
119 struct netdev
*netdev
;
120 struct netdev_saved_flags
*sf
;
121 struct netdev_rx
*rx
;
122 char *type
; /* Port type as requested by user. */
125 /* A flow in dp_netdev's 'flow_table'. */
126 struct dp_netdev_flow
{
127 /* Packet classification. */
128 struct cls_rule cr
; /* In owning dp_netdev's 'cls'. */
130 /* Hash table index by unmasked flow.*/
131 struct hmap_node node
; /* In owning dp_netdev's 'flow_table'. */
132 struct flow flow
; /* The flow that created this entry. */
135 long long int used
; /* Last used time, in monotonic msecs. */
136 long long int packet_count
; /* Number of packets matched. */
137 long long int byte_count
; /* Number of bytes matched. */
138 uint16_t tcp_flags
; /* Bitwise-OR of seen tcp_flags values. */
141 struct nlattr
*actions
;
145 /* Interface to netdev-based datapath. */
148 struct dp_netdev
*dp
;
149 uint64_t last_port_seq
;
152 /* All netdev-based datapaths. */
153 static struct shash dp_netdevs
= SHASH_INITIALIZER(&dp_netdevs
);
155 /* Global lock for all data. */
156 static struct ovs_mutex dp_netdev_mutex
= OVS_MUTEX_INITIALIZER
;
158 static int get_port_by_number(struct dp_netdev
*, odp_port_t port_no
,
159 struct dp_netdev_port
**portp
);
160 static int get_port_by_name(struct dp_netdev
*, const char *devname
,
161 struct dp_netdev_port
**portp
);
162 static void dp_netdev_free(struct dp_netdev
*);
163 static void dp_netdev_flow_flush(struct dp_netdev
*);
164 static int do_add_port(struct dp_netdev
*, const char *devname
,
165 const char *type
, odp_port_t port_no
);
166 static int do_del_port(struct dp_netdev
*, odp_port_t port_no
);
167 static int dpif_netdev_open(const struct dpif_class
*, const char *name
,
168 bool create
, struct dpif
**);
169 static int dp_netdev_output_userspace(struct dp_netdev
*, struct ofpbuf
*,
170 int queue_no
, const struct flow
*,
171 const struct nlattr
*userdata
);
172 static void dp_netdev_execute_actions(struct dp_netdev
*, const struct flow
*,
173 struct ofpbuf
*, struct pkt_metadata
*,
174 const struct nlattr
*actions
,
176 static void dp_netdev_port_input(struct dp_netdev
*dp
, struct ofpbuf
*packet
,
177 struct pkt_metadata
*md
);
179 static struct dpif_netdev
*
180 dpif_netdev_cast(const struct dpif
*dpif
)
182 ovs_assert(dpif
->dpif_class
->open
== dpif_netdev_open
);
183 return CONTAINER_OF(dpif
, struct dpif_netdev
, dpif
);
186 static struct dp_netdev
*
187 get_dp_netdev(const struct dpif
*dpif
)
189 return dpif_netdev_cast(dpif
)->dp
;
193 dpif_netdev_enumerate(struct sset
*all_dps
)
195 struct shash_node
*node
;
197 ovs_mutex_lock(&dp_netdev_mutex
);
198 SHASH_FOR_EACH(node
, &dp_netdevs
) {
199 sset_add(all_dps
, node
->name
);
201 ovs_mutex_unlock(&dp_netdev_mutex
);
207 dpif_netdev_class_is_dummy(const struct dpif_class
*class)
209 return class != &dpif_netdev_class
;
/* Translates the user-requested port 'type' into the netdev type actually
 * opened: "internal" becomes "dummy" for dummy datapaths and "tap" for the
 * real one; every other type passes through unchanged.
 * NOTE(review): the "tap" fallback is not visible in the fragment — confirm
 * against upstream. */
static const char *
dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
{
    return strcmp(type, "internal") ? type
           : dpif_netdev_class_is_dummy(class) ? "dummy"
           : "tap";
}
221 create_dpif_netdev(struct dp_netdev
*dp
)
223 uint16_t netflow_id
= hash_string(dp
->name
, 0);
224 struct dpif_netdev
*dpif
;
228 dpif
= xmalloc(sizeof *dpif
);
229 dpif_init(&dpif
->dpif
, dp
->class, dp
->name
, netflow_id
>> 8, netflow_id
);
231 dpif
->last_port_seq
= seq_read(dp
->port_seq
);
236 /* Choose an unused, non-zero port number and return it on success.
237 * Return ODPP_NONE on failure. */
239 choose_port(struct dp_netdev
*dp
, const char *name
)
243 if (dp
->class != &dpif_netdev_class
) {
247 /* If the port name begins with "br", start the number search at
248 * 100 to make writing tests easier. */
249 if (!strncmp(name
, "br", 2)) {
253 /* If the port name contains a number, try to assign that port number.
254 * This can make writing unit tests easier because port numbers are
256 for (p
= name
; *p
!= '\0'; p
++) {
257 if (isdigit((unsigned char) *p
)) {
258 port_no
= start_no
+ strtol(p
, NULL
, 10);
259 if (port_no
> 0 && port_no
< MAX_PORTS
260 && !dp
->ports
[port_no
]) {
261 return u32_to_odp(port_no
);
268 for (port_no
= 1; port_no
< MAX_PORTS
; port_no
++) {
269 if (!dp
->ports
[port_no
]) {
270 return u32_to_odp(port_no
);
278 create_dp_netdev(const char *name
, const struct dpif_class
*class,
279 struct dp_netdev
**dpp
)
281 struct dp_netdev
*dp
;
285 dp
= xzalloc(sizeof *dp
);
287 dp
->name
= xstrdup(name
);
289 dp
->max_mtu
= ETH_PAYLOAD_MAX
;
290 for (i
= 0; i
< N_QUEUES
; i
++) {
291 dp
->queues
[i
].head
= dp
->queues
[i
].tail
= 0;
293 dp
->queue_seq
= seq_create();
294 classifier_init(&dp
->cls
, NULL
);
295 hmap_init(&dp
->flow_table
);
296 list_init(&dp
->port_list
);
297 dp
->port_seq
= seq_create();
299 error
= do_add_port(dp
, name
, "internal", ODPP_LOCAL
);
305 shash_add(&dp_netdevs
, name
, dp
);
312 dpif_netdev_open(const struct dpif_class
*class, const char *name
,
313 bool create
, struct dpif
**dpifp
)
315 struct dp_netdev
*dp
;
318 ovs_mutex_lock(&dp_netdev_mutex
);
319 dp
= shash_find_data(&dp_netdevs
, name
);
321 error
= create
? create_dp_netdev(name
, class, &dp
) : ENODEV
;
323 error
= (dp
->class != class ? EINVAL
328 *dpifp
= create_dpif_netdev(dp
);
330 ovs_mutex_unlock(&dp_netdev_mutex
);
336 dp_netdev_purge_queues(struct dp_netdev
*dp
)
340 for (i
= 0; i
< N_QUEUES
; i
++) {
341 struct dp_netdev_queue
*q
= &dp
->queues
[i
];
343 while (q
->tail
!= q
->head
) {
344 struct dp_netdev_upcall
*u
= &q
->upcalls
[q
->tail
++ & QUEUE_MASK
];
345 ofpbuf_uninit(&u
->upcall
.packet
);
346 ofpbuf_uninit(&u
->buf
);
352 dp_netdev_free(struct dp_netdev
*dp
)
354 struct dp_netdev_port
*port
, *next
;
356 dp_netdev_flow_flush(dp
);
357 LIST_FOR_EACH_SAFE (port
, next
, node
, &dp
->port_list
) {
358 do_del_port(dp
, port
->port_no
);
360 dp_netdev_purge_queues(dp
);
361 seq_destroy(dp
->queue_seq
);
362 classifier_destroy(&dp
->cls
);
363 hmap_destroy(&dp
->flow_table
);
364 seq_destroy(dp
->port_seq
);
370 dpif_netdev_close(struct dpif
*dpif
)
372 struct dp_netdev
*dp
= get_dp_netdev(dpif
);
374 ovs_mutex_lock(&dp_netdev_mutex
);
376 ovs_assert(dp
->open_cnt
> 0);
377 if (--dp
->open_cnt
== 0 && dp
->destroyed
) {
378 shash_find_and_delete(&dp_netdevs
, dp
->name
);
383 ovs_mutex_unlock(&dp_netdev_mutex
);
387 dpif_netdev_destroy(struct dpif
*dpif
)
389 struct dp_netdev
*dp
= get_dp_netdev(dpif
);
391 ovs_mutex_lock(&dp_netdev_mutex
);
392 dp
->destroyed
= true;
393 ovs_mutex_unlock(&dp_netdev_mutex
);
399 dpif_netdev_get_stats(const struct dpif
*dpif
, struct dpif_dp_stats
*stats
)
401 struct dp_netdev
*dp
= get_dp_netdev(dpif
);
403 ovs_mutex_lock(&dp_netdev_mutex
);
404 stats
->n_flows
= hmap_count(&dp
->flow_table
);
405 stats
->n_hit
= dp
->n_hit
;
406 stats
->n_missed
= dp
->n_missed
;
407 stats
->n_lost
= dp
->n_lost
;
408 stats
->n_masks
= UINT32_MAX
;
409 stats
->n_mask_hit
= UINT64_MAX
;
410 ovs_mutex_unlock(&dp_netdev_mutex
);
416 do_add_port(struct dp_netdev
*dp
, const char *devname
, const char *type
,
419 struct netdev_saved_flags
*sf
;
420 struct dp_netdev_port
*port
;
421 struct netdev
*netdev
;
422 struct netdev_rx
*rx
;
423 enum netdev_flags flags
;
424 const char *open_type
;
428 /* XXX reject devices already in some dp_netdev. */
430 /* Open and validate network device. */
431 open_type
= dpif_netdev_port_open_type(dp
->class, type
);
432 error
= netdev_open(devname
, open_type
, &netdev
);
436 /* XXX reject non-Ethernet devices */
438 netdev_get_flags(netdev
, &flags
);
439 if (flags
& NETDEV_LOOPBACK
) {
440 VLOG_ERR("%s: cannot add a loopback device", devname
);
441 netdev_close(netdev
);
445 error
= netdev_rx_open(netdev
, &rx
);
447 && !(error
== EOPNOTSUPP
&& dpif_netdev_class_is_dummy(dp
->class))) {
448 VLOG_ERR("%s: cannot receive packets on this network device (%s)",
449 devname
, ovs_strerror(errno
));
450 netdev_close(netdev
);
454 error
= netdev_turn_flags_on(netdev
, NETDEV_PROMISC
, &sf
);
457 netdev_close(netdev
);
461 port
= xmalloc(sizeof *port
);
462 port
->port_no
= port_no
;
463 port
->netdev
= netdev
;
466 port
->type
= xstrdup(type
);
468 error
= netdev_get_mtu(netdev
, &mtu
);
469 if (!error
&& mtu
> dp
->max_mtu
) {
473 list_push_back(&dp
->port_list
, &port
->node
);
474 dp
->ports
[odp_to_u32(port_no
)] = port
;
475 seq_change(dp
->port_seq
);
481 dpif_netdev_port_add(struct dpif
*dpif
, struct netdev
*netdev
,
482 odp_port_t
*port_nop
)
484 struct dp_netdev
*dp
= get_dp_netdev(dpif
);
485 char namebuf
[NETDEV_VPORT_NAME_BUFSIZE
];
486 const char *dpif_port
;
490 ovs_mutex_lock(&dp_netdev_mutex
);
491 dpif_port
= netdev_vport_get_dpif_port(netdev
, namebuf
, sizeof namebuf
);
492 if (*port_nop
!= ODPP_NONE
) {
493 uint32_t port_idx
= odp_to_u32(*port_nop
);
494 if (port_idx
>= MAX_PORTS
) {
496 } else if (dp
->ports
[port_idx
]) {
503 port_no
= choose_port(dp
, dpif_port
);
504 error
= port_no
== ODPP_NONE
? EFBIG
: 0;
508 error
= do_add_port(dp
, dpif_port
, netdev_get_type(netdev
), port_no
);
510 ovs_mutex_unlock(&dp_netdev_mutex
);
516 dpif_netdev_port_del(struct dpif
*dpif
, odp_port_t port_no
)
518 struct dp_netdev
*dp
= get_dp_netdev(dpif
);
521 ovs_mutex_lock(&dp_netdev_mutex
);
522 error
= port_no
== ODPP_LOCAL
? EINVAL
: do_del_port(dp
, port_no
);
523 ovs_mutex_unlock(&dp_netdev_mutex
);
529 is_valid_port_number(odp_port_t port_no
)
531 return odp_to_u32(port_no
) < MAX_PORTS
;
535 get_port_by_number(struct dp_netdev
*dp
,
536 odp_port_t port_no
, struct dp_netdev_port
**portp
)
538 if (!is_valid_port_number(port_no
)) {
542 *portp
= dp
->ports
[odp_to_u32(port_no
)];
543 return *portp
? 0 : ENOENT
;
548 get_port_by_name(struct dp_netdev
*dp
,
549 const char *devname
, struct dp_netdev_port
**portp
)
551 struct dp_netdev_port
*port
;
553 LIST_FOR_EACH (port
, node
, &dp
->port_list
) {
554 if (!strcmp(netdev_get_name(port
->netdev
), devname
)) {
563 do_del_port(struct dp_netdev
*dp
, odp_port_t port_no
)
565 struct dp_netdev_port
*port
;
568 error
= get_port_by_number(dp
, port_no
, &port
);
573 list_remove(&port
->node
);
574 dp
->ports
[odp_to_u32(port_no
)] = NULL
;
575 seq_change(dp
->port_seq
);
577 netdev_close(port
->netdev
);
578 netdev_restore_flags(port
->sf
);
579 netdev_rx_close(port
->rx
);
587 answer_port_query(const struct dp_netdev_port
*port
,
588 struct dpif_port
*dpif_port
)
590 dpif_port
->name
= xstrdup(netdev_get_name(port
->netdev
));
591 dpif_port
->type
= xstrdup(port
->type
);
592 dpif_port
->port_no
= port
->port_no
;
596 dpif_netdev_port_query_by_number(const struct dpif
*dpif
, odp_port_t port_no
,
597 struct dpif_port
*dpif_port
)
599 struct dp_netdev
*dp
= get_dp_netdev(dpif
);
600 struct dp_netdev_port
*port
;
603 ovs_mutex_lock(&dp_netdev_mutex
);
604 error
= get_port_by_number(dp
, port_no
, &port
);
605 if (!error
&& dpif_port
) {
606 answer_port_query(port
, dpif_port
);
608 ovs_mutex_unlock(&dp_netdev_mutex
);
614 dpif_netdev_port_query_by_name(const struct dpif
*dpif
, const char *devname
,
615 struct dpif_port
*dpif_port
)
617 struct dp_netdev
*dp
= get_dp_netdev(dpif
);
618 struct dp_netdev_port
*port
;
621 ovs_mutex_lock(&dp_netdev_mutex
);
622 error
= get_port_by_name(dp
, devname
, &port
);
623 if (!error
&& dpif_port
) {
624 answer_port_query(port
, dpif_port
);
626 ovs_mutex_unlock(&dp_netdev_mutex
);
632 dpif_netdev_get_max_ports(const struct dpif
*dpif OVS_UNUSED
)
638 dp_netdev_free_flow(struct dp_netdev
*dp
, struct dp_netdev_flow
*netdev_flow
)
640 ovs_rwlock_wrlock(&dp
->cls
.rwlock
);
641 classifier_remove(&dp
->cls
, &netdev_flow
->cr
);
642 ovs_rwlock_unlock(&dp
->cls
.rwlock
);
643 cls_rule_destroy(&netdev_flow
->cr
);
645 hmap_remove(&dp
->flow_table
, &netdev_flow
->node
);
646 free(netdev_flow
->actions
);
651 dp_netdev_flow_flush(struct dp_netdev
*dp
)
653 struct dp_netdev_flow
*netdev_flow
, *next
;
655 HMAP_FOR_EACH_SAFE (netdev_flow
, next
, node
, &dp
->flow_table
) {
656 dp_netdev_free_flow(dp
, netdev_flow
);
661 dpif_netdev_flow_flush(struct dpif
*dpif
)
663 struct dp_netdev
*dp
= get_dp_netdev(dpif
);
665 ovs_mutex_lock(&dp_netdev_mutex
);
666 dp_netdev_flow_flush(dp
);
667 ovs_mutex_unlock(&dp_netdev_mutex
);
672 struct dp_netdev_port_state
{
678 dpif_netdev_port_dump_start(const struct dpif
*dpif OVS_UNUSED
, void **statep
)
680 *statep
= xzalloc(sizeof(struct dp_netdev_port_state
));
685 dpif_netdev_port_dump_next(const struct dpif
*dpif
, void *state_
,
686 struct dpif_port
*dpif_port
)
688 struct dp_netdev_port_state
*state
= state_
;
689 struct dp_netdev
*dp
= get_dp_netdev(dpif
);
692 ovs_mutex_lock(&dp_netdev_mutex
);
693 for (port_idx
= odp_to_u32(state
->port_no
);
694 port_idx
< MAX_PORTS
; port_idx
++) {
695 struct dp_netdev_port
*port
= dp
->ports
[port_idx
];
698 state
->name
= xstrdup(netdev_get_name(port
->netdev
));
699 dpif_port
->name
= state
->name
;
700 dpif_port
->type
= port
->type
;
701 dpif_port
->port_no
= port
->port_no
;
702 state
->port_no
= u32_to_odp(port_idx
+ 1);
703 ovs_mutex_unlock(&dp_netdev_mutex
);
708 ovs_mutex_unlock(&dp_netdev_mutex
);
714 dpif_netdev_port_dump_done(const struct dpif
*dpif OVS_UNUSED
, void *state_
)
716 struct dp_netdev_port_state
*state
= state_
;
723 dpif_netdev_port_poll(const struct dpif
*dpif_
, char **devnamep OVS_UNUSED
)
725 struct dpif_netdev
*dpif
= dpif_netdev_cast(dpif_
);
726 uint64_t new_port_seq
;
729 ovs_mutex_lock(&dp_netdev_mutex
);
730 new_port_seq
= seq_read(dpif
->dp
->port_seq
);
731 if (dpif
->last_port_seq
!= new_port_seq
) {
732 dpif
->last_port_seq
= new_port_seq
;
737 ovs_mutex_unlock(&dp_netdev_mutex
);
743 dpif_netdev_port_poll_wait(const struct dpif
*dpif_
)
745 struct dpif_netdev
*dpif
= dpif_netdev_cast(dpif_
);
747 ovs_mutex_lock(&dp_netdev_mutex
);
748 seq_wait(dpif
->dp
->port_seq
, dpif
->last_port_seq
);
749 ovs_mutex_unlock(&dp_netdev_mutex
);
752 static struct dp_netdev_flow
*
753 dp_netdev_lookup_flow(const struct dp_netdev
*dp
, const struct flow
*flow
)
757 ovs_rwlock_wrlock(&dp
->cls
.rwlock
);
758 cr
= classifier_lookup(&dp
->cls
, flow
, NULL
);
759 ovs_rwlock_unlock(&dp
->cls
.rwlock
);
762 ? CONTAINER_OF(cr
, struct dp_netdev_flow
, cr
)
766 static struct dp_netdev_flow
*
767 dp_netdev_find_flow(const struct dp_netdev
*dp
, const struct flow
*flow
)
769 struct dp_netdev_flow
*netdev_flow
;
771 HMAP_FOR_EACH_WITH_HASH (netdev_flow
, node
, flow_hash(flow
, 0),
773 if (flow_equal(&netdev_flow
->flow
, flow
)) {
781 get_dpif_flow_stats(struct dp_netdev_flow
*netdev_flow
,
782 struct dpif_flow_stats
*stats
)
784 stats
->n_packets
= netdev_flow
->packet_count
;
785 stats
->n_bytes
= netdev_flow
->byte_count
;
786 stats
->used
= netdev_flow
->used
;
787 stats
->tcp_flags
= netdev_flow
->tcp_flags
;
791 dpif_netdev_mask_from_nlattrs(const struct nlattr
*key
, uint32_t key_len
,
792 const struct nlattr
*mask_key
,
793 uint32_t mask_key_len
, const struct flow
*flow
,
797 if (odp_flow_key_to_mask(mask_key
, mask_key_len
, mask
, flow
)) {
798 /* This should not happen: it indicates that
799 * odp_flow_key_from_mask() and odp_flow_key_to_mask()
800 * disagree on the acceptable form of a mask. Log the problem
801 * as an error, with enough details to enable debugging. */
802 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(1, 5);
804 if (!VLOG_DROP_ERR(&rl
)) {
808 odp_flow_format(key
, key_len
, mask_key
, mask_key_len
, NULL
, &s
,
810 VLOG_ERR("internal error parsing flow mask %s", ds_cstr(&s
));
816 /* Force unwildcard the in_port. */
817 mask
->in_port
.odp_port
= u32_to_odp(UINT32_MAX
);
820 /* No mask key, unwildcard everything except fields whose
821 * prerequisities are not met. */
822 memset(mask
, 0x0, sizeof *mask
);
824 for (id
= 0; id
< MFF_N_IDS
; ++id
) {
825 /* Skip registers and metadata. */
826 if (!(id
>= MFF_REG0
&& id
< MFF_REG0
+ FLOW_N_REGS
)
827 && id
!= MFF_METADATA
) {
828 const struct mf_field
*mf
= mf_from_id(id
);
829 if (mf_are_prereqs_ok(mf
, flow
)) {
830 mf_mask_field(mf
, mask
);
840 dpif_netdev_flow_from_nlattrs(const struct nlattr
*key
, uint32_t key_len
,
845 if (odp_flow_key_to_flow(key
, key_len
, flow
)) {
846 /* This should not happen: it indicates that odp_flow_key_from_flow()
847 * and odp_flow_key_to_flow() disagree on the acceptable form of a
848 * flow. Log the problem as an error, with enough details to enable
850 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(1, 5);
852 if (!VLOG_DROP_ERR(&rl
)) {
856 odp_flow_format(key
, key_len
, NULL
, 0, NULL
, &s
, true);
857 VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s
));
864 in_port
= flow
->in_port
.odp_port
;
865 if (!is_valid_port_number(in_port
) && in_port
!= ODPP_NONE
) {
873 dpif_netdev_flow_get(const struct dpif
*dpif
,
874 const struct nlattr
*nl_key
, size_t nl_key_len
,
875 struct ofpbuf
**actionsp
, struct dpif_flow_stats
*stats
)
877 struct dp_netdev
*dp
= get_dp_netdev(dpif
);
878 struct dp_netdev_flow
*netdev_flow
;
882 error
= dpif_netdev_flow_from_nlattrs(nl_key
, nl_key_len
, &key
);
887 ovs_mutex_lock(&dp_netdev_mutex
);
888 netdev_flow
= dp_netdev_find_flow(dp
, &key
);
891 get_dpif_flow_stats(netdev_flow
, stats
);
894 *actionsp
= ofpbuf_clone_data(netdev_flow
->actions
,
895 netdev_flow
->actions_len
);
900 ovs_mutex_unlock(&dp_netdev_mutex
);
906 set_flow_actions(struct dp_netdev_flow
*netdev_flow
,
907 const struct nlattr
*actions
, size_t actions_len
)
909 netdev_flow
->actions
= xrealloc(netdev_flow
->actions
, actions_len
);
910 netdev_flow
->actions_len
= actions_len
;
911 memcpy(netdev_flow
->actions
, actions
, actions_len
);
916 dp_netdev_flow_add(struct dp_netdev
*dp
, const struct flow
*flow
,
917 const struct flow_wildcards
*wc
,
918 const struct nlattr
*actions
,
921 struct dp_netdev_flow
*netdev_flow
;
925 netdev_flow
= xzalloc(sizeof *netdev_flow
);
926 netdev_flow
->flow
= *flow
;
928 match_init(&match
, flow
, wc
);
929 cls_rule_init(&netdev_flow
->cr
, &match
, NETDEV_RULE_PRIORITY
);
930 ovs_rwlock_wrlock(&dp
->cls
.rwlock
);
931 classifier_insert(&dp
->cls
, &netdev_flow
->cr
);
932 ovs_rwlock_unlock(&dp
->cls
.rwlock
);
934 error
= set_flow_actions(netdev_flow
, actions
, actions_len
);
936 ovs_rwlock_wrlock(&dp
->cls
.rwlock
);
937 classifier_remove(&dp
->cls
, &netdev_flow
->cr
);
938 ovs_rwlock_unlock(&dp
->cls
.rwlock
);
939 cls_rule_destroy(&netdev_flow
->cr
);
945 hmap_insert(&dp
->flow_table
, &netdev_flow
->node
, flow_hash(flow
, 0));
950 clear_stats(struct dp_netdev_flow
*netdev_flow
)
952 netdev_flow
->used
= 0;
953 netdev_flow
->packet_count
= 0;
954 netdev_flow
->byte_count
= 0;
955 netdev_flow
->tcp_flags
= 0;
959 dpif_netdev_flow_put(struct dpif
*dpif
, const struct dpif_flow_put
*put
)
961 struct dp_netdev
*dp
= get_dp_netdev(dpif
);
962 struct dp_netdev_flow
*netdev_flow
;
964 struct flow_wildcards wc
;
967 error
= dpif_netdev_flow_from_nlattrs(put
->key
, put
->key_len
, &flow
);
971 error
= dpif_netdev_mask_from_nlattrs(put
->key
, put
->key_len
,
972 put
->mask
, put
->mask_len
,
978 ovs_mutex_lock(&dp_netdev_mutex
);
979 netdev_flow
= dp_netdev_lookup_flow(dp
, &flow
);
981 if (put
->flags
& DPIF_FP_CREATE
) {
982 if (hmap_count(&dp
->flow_table
) < MAX_FLOWS
) {
984 memset(put
->stats
, 0, sizeof *put
->stats
);
986 error
= dp_netdev_flow_add(dp
, &flow
, &wc
, put
->actions
,
995 if (put
->flags
& DPIF_FP_MODIFY
996 && flow_equal(&flow
, &netdev_flow
->flow
)) {
997 error
= set_flow_actions(netdev_flow
, put
->actions
,
1001 get_dpif_flow_stats(netdev_flow
, put
->stats
);
1003 if (put
->flags
& DPIF_FP_ZERO_STATS
) {
1004 clear_stats(netdev_flow
);
1007 } else if (put
->flags
& DPIF_FP_CREATE
) {
1010 /* Overlapping flow. */
1014 ovs_mutex_unlock(&dp_netdev_mutex
);
1020 dpif_netdev_flow_del(struct dpif
*dpif
, const struct dpif_flow_del
*del
)
1022 struct dp_netdev
*dp
= get_dp_netdev(dpif
);
1023 struct dp_netdev_flow
*netdev_flow
;
1027 error
= dpif_netdev_flow_from_nlattrs(del
->key
, del
->key_len
, &key
);
1032 ovs_mutex_lock(&dp_netdev_mutex
);
1033 netdev_flow
= dp_netdev_find_flow(dp
, &key
);
1036 get_dpif_flow_stats(netdev_flow
, del
->stats
);
1038 dp_netdev_free_flow(dp
, netdev_flow
);
1042 ovs_mutex_unlock(&dp_netdev_mutex
);
1047 struct dp_netdev_flow_state
{
1050 struct nlattr
*actions
;
1051 struct odputil_keybuf keybuf
;
1052 struct odputil_keybuf maskbuf
;
1053 struct dpif_flow_stats stats
;
1057 dpif_netdev_flow_dump_start(const struct dpif
*dpif OVS_UNUSED
, void **statep
)
1059 struct dp_netdev_flow_state
*state
;
1061 *statep
= state
= xmalloc(sizeof *state
);
1064 state
->actions
= NULL
;
1069 dpif_netdev_flow_dump_next(const struct dpif
*dpif
, void *state_
,
1070 const struct nlattr
**key
, size_t *key_len
,
1071 const struct nlattr
**mask
, size_t *mask_len
,
1072 const struct nlattr
**actions
, size_t *actions_len
,
1073 const struct dpif_flow_stats
**stats
)
1075 struct dp_netdev_flow_state
*state
= state_
;
1076 struct dp_netdev
*dp
= get_dp_netdev(dpif
);
1077 struct dp_netdev_flow
*netdev_flow
;
1078 struct hmap_node
*node
;
1080 ovs_mutex_lock(&dp_netdev_mutex
);
1081 node
= hmap_at_position(&dp
->flow_table
, &state
->bucket
, &state
->offset
);
1083 ovs_mutex_unlock(&dp_netdev_mutex
);
1087 netdev_flow
= CONTAINER_OF(node
, struct dp_netdev_flow
, node
);
1092 ofpbuf_use_stack(&buf
, &state
->keybuf
, sizeof state
->keybuf
);
1093 odp_flow_key_from_flow(&buf
, &netdev_flow
->flow
,
1094 netdev_flow
->flow
.in_port
.odp_port
);
1097 *key_len
= buf
.size
;
1102 struct flow_wildcards wc
;
1104 ofpbuf_use_stack(&buf
, &state
->maskbuf
, sizeof state
->maskbuf
);
1105 minimask_expand(&netdev_flow
->cr
.match
.mask
, &wc
);
1106 odp_flow_key_from_mask(&buf
, &wc
.masks
, &netdev_flow
->flow
,
1107 odp_to_u32(wc
.masks
.in_port
.odp_port
));
1110 *mask_len
= buf
.size
;
1114 free(state
->actions
);
1115 state
->actions
= xmemdup(netdev_flow
->actions
,
1116 netdev_flow
->actions_len
);
1118 *actions
= state
->actions
;
1119 *actions_len
= netdev_flow
->actions_len
;
1123 get_dpif_flow_stats(netdev_flow
, &state
->stats
);
1124 *stats
= &state
->stats
;
1127 ovs_mutex_unlock(&dp_netdev_mutex
);
1132 dpif_netdev_flow_dump_done(const struct dpif
*dpif OVS_UNUSED
, void *state_
)
1134 struct dp_netdev_flow_state
*state
= state_
;
1136 free(state
->actions
);
1142 dpif_netdev_execute(struct dpif
*dpif
, struct dpif_execute
*execute
)
1144 struct dp_netdev
*dp
= get_dp_netdev(dpif
);
1145 struct pkt_metadata
*md
= &execute
->md
;
1148 if (execute
->packet
->size
< ETH_HEADER_LEN
||
1149 execute
->packet
->size
> UINT16_MAX
) {
1153 /* Extract flow key. */
1154 flow_extract(execute
->packet
, md
->skb_priority
, md
->pkt_mark
, &md
->tunnel
,
1155 (union flow_in_port
*)&md
->in_port
, &key
);
1156 ovs_mutex_lock(&dp_netdev_mutex
);
1157 dp_netdev_execute_actions(dp
, &key
, execute
->packet
, md
, execute
->actions
,
1158 execute
->actions_len
);
1159 ovs_mutex_unlock(&dp_netdev_mutex
);
1164 dpif_netdev_recv_set(struct dpif
*dpif OVS_UNUSED
, bool enable OVS_UNUSED
)
1170 dpif_netdev_queue_to_priority(const struct dpif
*dpif OVS_UNUSED
,
1171 uint32_t queue_id
, uint32_t *priority
)
1173 *priority
= queue_id
;
1177 static struct dp_netdev_queue
*
1178 find_nonempty_queue(struct dpif
*dpif
)
1180 struct dp_netdev
*dp
= get_dp_netdev(dpif
);
1183 for (i
= 0; i
< N_QUEUES
; i
++) {
1184 struct dp_netdev_queue
*q
= &dp
->queues
[i
];
1185 if (q
->head
!= q
->tail
) {
1193 dpif_netdev_recv(struct dpif
*dpif
, struct dpif_upcall
*upcall
,
1196 struct dp_netdev_queue
*q
;
1199 ovs_mutex_lock(&dp_netdev_mutex
);
1200 q
= find_nonempty_queue(dpif
);
1202 struct dp_netdev_upcall
*u
= &q
->upcalls
[q
->tail
++ & QUEUE_MASK
];
1204 *upcall
= u
->upcall
;
1213 ovs_mutex_unlock(&dp_netdev_mutex
);
1219 dpif_netdev_recv_wait(struct dpif
*dpif
)
1221 struct dp_netdev
*dp
= get_dp_netdev(dpif
);
1224 ovs_mutex_lock(&dp_netdev_mutex
);
1225 seq
= seq_read(dp
->queue_seq
);
1226 if (find_nonempty_queue(dpif
)) {
1227 poll_immediate_wake();
1229 seq_wait(dp
->queue_seq
, seq
);
1231 ovs_mutex_unlock(&dp_netdev_mutex
);
1235 dpif_netdev_recv_purge(struct dpif
*dpif
)
1237 struct dpif_netdev
*dpif_netdev
= dpif_netdev_cast(dpif
);
1238 ovs_mutex_lock(&dp_netdev_mutex
);
1239 dp_netdev_purge_queues(dpif_netdev
->dp
);
1240 ovs_mutex_unlock(&dp_netdev_mutex
);
1244 dp_netdev_flow_used(struct dp_netdev_flow
*netdev_flow
,
1245 const struct ofpbuf
*packet
)
1247 netdev_flow
->used
= time_msec();
1248 netdev_flow
->packet_count
++;
1249 netdev_flow
->byte_count
+= packet
->size
;
1250 netdev_flow
->tcp_flags
|= packet_get_tcp_flags(packet
, &netdev_flow
->flow
);
1254 dp_netdev_port_input(struct dp_netdev
*dp
, struct ofpbuf
*packet
,
1255 struct pkt_metadata
*md
)
1257 struct dp_netdev_flow
*netdev_flow
;
1260 if (packet
->size
< ETH_HEADER_LEN
) {
1263 flow_extract(packet
, md
->skb_priority
, md
->pkt_mark
, &md
->tunnel
,
1264 (union flow_in_port
*)&md
->in_port
, &key
);
1265 netdev_flow
= dp_netdev_lookup_flow(dp
, &key
);
1267 dp_netdev_flow_used(netdev_flow
, packet
);
1268 dp_netdev_execute_actions(dp
, &key
, packet
, md
,
1269 netdev_flow
->actions
,
1270 netdev_flow
->actions_len
);
1274 dp_netdev_output_userspace(dp
, packet
, DPIF_UC_MISS
, &key
, NULL
);
1279 dpif_netdev_run(struct dpif
*dpif
)
1281 struct dp_netdev_port
*port
;
1282 struct dp_netdev
*dp
;
1283 struct ofpbuf packet
;
1286 ovs_mutex_lock(&dp_netdev_mutex
);
1287 dp
= get_dp_netdev(dpif
);
1288 ofpbuf_init(&packet
, 0);
1290 buf_size
= DP_NETDEV_HEADROOM
+ VLAN_ETH_HEADER_LEN
+ dp
->max_mtu
;
1292 LIST_FOR_EACH (port
, node
, &dp
->port_list
) {
1295 /* Reset packet contents. Packet data may have been stolen. */
1296 ofpbuf_clear(&packet
);
1297 ofpbuf_reserve_with_tailroom(&packet
, DP_NETDEV_HEADROOM
, buf_size
);
1299 error
= port
->rx
? netdev_rx_recv(port
->rx
, &packet
) : EOPNOTSUPP
;
1301 struct pkt_metadata md
= PKT_METADATA_INITIALIZER(port
->port_no
);
1302 dp_netdev_port_input(dp
, &packet
, &md
);
1303 } else if (error
!= EAGAIN
&& error
!= EOPNOTSUPP
) {
1304 static struct vlog_rate_limit rl
= VLOG_RATE_LIMIT_INIT(1, 5);
1306 VLOG_ERR_RL(&rl
, "error receiving data from %s: %s",
1307 netdev_get_name(port
->netdev
), ovs_strerror(error
));
1310 ofpbuf_uninit(&packet
);
1311 ovs_mutex_unlock(&dp_netdev_mutex
);
1315 dpif_netdev_wait(struct dpif
*dpif
)
1317 struct dp_netdev_port
*port
;
1319 /* There is a race here, if thread A calls dpif_netdev_wait(dpif) and
1320 * thread B calls dpif_port_add(dpif) or dpif_port_remove(dpif) before
1321 * A makes it to poll_block().
1323 * But I think it doesn't matter:
1325 * - In the dpif_port_add() case, A will not wake up when a packet
1326 * arrives on the new port, but this would also happen if the
1327 * ordering were reversed.
1329 * - In the dpif_port_remove() case, A might wake up spuriously, but
1330 * that is harmless. */
1332 ovs_mutex_lock(&dp_netdev_mutex
);
1333 LIST_FOR_EACH (port
, node
, &get_dp_netdev(dpif
)->port_list
) {
1335 netdev_rx_wait(port
->rx
);
1338 ovs_mutex_unlock(&dp_netdev_mutex
);
1342 dp_netdev_output_port(struct dp_netdev
*dp
, struct ofpbuf
*packet
,
1343 odp_port_t out_port
)
1345 struct dp_netdev_port
*p
= dp
->ports
[odp_to_u32(out_port
)];
1347 netdev_send(p
->netdev
, packet
);
1352 dp_netdev_output_userspace(struct dp_netdev
*dp
, struct ofpbuf
*packet
,
1353 int queue_no
, const struct flow
*flow
,
1354 const struct nlattr
*userdata
)
1356 struct dp_netdev_queue
*q
= &dp
->queues
[queue_no
];
1357 if (q
->head
- q
->tail
< MAX_QUEUE_LEN
) {
1358 struct dp_netdev_upcall
*u
= &q
->upcalls
[q
->head
++ & QUEUE_MASK
];
1359 struct dpif_upcall
*upcall
= &u
->upcall
;
1360 struct ofpbuf
*buf
= &u
->buf
;
1363 upcall
->type
= queue_no
;
1365 /* Allocate buffer big enough for everything. */
1366 buf_size
= ODPUTIL_FLOW_KEY_BYTES
;
1368 buf_size
+= NLA_ALIGN(userdata
->nla_len
);
1370 ofpbuf_init(buf
, buf_size
);
1373 odp_flow_key_from_flow(buf
, flow
, flow
->in_port
.odp_port
);
1374 upcall
->key
= buf
->data
;
1375 upcall
->key_len
= buf
->size
;
1379 upcall
->userdata
= ofpbuf_put(buf
, userdata
,
1380 NLA_ALIGN(userdata
->nla_len
));
1383 /* Steal packet data. */
1384 ovs_assert(packet
->source
== OFPBUF_MALLOC
);
1385 upcall
->packet
= *packet
;
1386 ofpbuf_use(packet
, NULL
, 0);
1388 seq_change(dp
->queue_seq
);
1397 struct dp_netdev_execute_aux
{
1398 struct dp_netdev
*dp
;
1399 const struct flow
*key
;
1403 dp_execute_cb(void *aux_
, struct ofpbuf
*packet
,
1404 const struct pkt_metadata
*md OVS_UNUSED
,
1405 const struct nlattr
*a
, bool may_steal
)
1407 struct dp_netdev_execute_aux
*aux
= aux_
;
1408 int type
= nl_attr_type(a
);
1410 switch ((enum ovs_action_attr
)type
) {
1411 case OVS_ACTION_ATTR_OUTPUT
:
1412 dp_netdev_output_port(aux
->dp
, packet
, u32_to_odp(nl_attr_get_u32(a
)));
1415 case OVS_ACTION_ATTR_USERSPACE
: {
1416 const struct nlattr
*userdata
;
1418 userdata
= nl_attr_find_nested(a
, OVS_USERSPACE_ATTR_USERDATA
);
1420 /* Make a copy if we are not allowed to steal the packet's data. */
1422 packet
= ofpbuf_clone_with_headroom(packet
, DP_NETDEV_HEADROOM
);
1424 dp_netdev_output_userspace(aux
->dp
, packet
, DPIF_UC_ACTION
, aux
->key
,
1427 ofpbuf_uninit(packet
);
1431 case OVS_ACTION_ATTR_PUSH_VLAN
:
1432 case OVS_ACTION_ATTR_POP_VLAN
:
1433 case OVS_ACTION_ATTR_PUSH_MPLS
:
1434 case OVS_ACTION_ATTR_POP_MPLS
:
1435 case OVS_ACTION_ATTR_SET
:
1436 case OVS_ACTION_ATTR_SAMPLE
:
1437 case OVS_ACTION_ATTR_UNSPEC
:
1438 case __OVS_ACTION_ATTR_MAX
:
1444 dp_netdev_execute_actions(struct dp_netdev
*dp
, const struct flow
*key
,
1445 struct ofpbuf
*packet
, struct pkt_metadata
*md
,
1446 const struct nlattr
*actions
, size_t actions_len
)
1448 struct dp_netdev_execute_aux aux
= {dp
, key
};
1450 odp_execute_actions(&aux
, packet
, md
, actions
, actions_len
, dp_execute_cb
);
1453 const struct dpif_class dpif_netdev_class
= {
1455 dpif_netdev_enumerate
,
1456 dpif_netdev_port_open_type
,
1459 dpif_netdev_destroy
,
1462 dpif_netdev_get_stats
,
1463 dpif_netdev_port_add
,
1464 dpif_netdev_port_del
,
1465 dpif_netdev_port_query_by_number
,
1466 dpif_netdev_port_query_by_name
,
1467 dpif_netdev_get_max_ports
,
1468 NULL
, /* port_get_pid */
1469 dpif_netdev_port_dump_start
,
1470 dpif_netdev_port_dump_next
,
1471 dpif_netdev_port_dump_done
,
1472 dpif_netdev_port_poll
,
1473 dpif_netdev_port_poll_wait
,
1474 dpif_netdev_flow_get
,
1475 dpif_netdev_flow_put
,
1476 dpif_netdev_flow_del
,
1477 dpif_netdev_flow_flush
,
1478 dpif_netdev_flow_dump_start
,
1479 dpif_netdev_flow_dump_next
,
1480 dpif_netdev_flow_dump_done
,
1481 dpif_netdev_execute
,
1483 dpif_netdev_recv_set
,
1484 dpif_netdev_queue_to_priority
,
1486 dpif_netdev_recv_wait
,
1487 dpif_netdev_recv_purge
,
1491 dpif_dummy_change_port_number(struct unixctl_conn
*conn
, int argc OVS_UNUSED
,
1492 const char *argv
[], void *aux OVS_UNUSED
)
1494 struct dp_netdev_port
*port
;
1495 struct dp_netdev
*dp
;
1498 dp
= shash_find_data(&dp_netdevs
, argv
[1]);
1499 if (!dp
|| !dpif_netdev_class_is_dummy(dp
->class)) {
1500 unixctl_command_reply_error(conn
, "unknown datapath or not a dummy");
1504 if (get_port_by_name(dp
, argv
[2], &port
)) {
1505 unixctl_command_reply_error(conn
, "unknown port");
1509 port_no
= atoi(argv
[3]);
1510 if (port_no
<= 0 || port_no
>= MAX_PORTS
) {
1511 unixctl_command_reply_error(conn
, "bad port number");
1514 if (dp
->ports
[port_no
]) {
1515 unixctl_command_reply_error(conn
, "port number already in use");
1518 dp
->ports
[odp_to_u32(port
->port_no
)] = NULL
;
1519 dp
->ports
[port_no
] = port
;
1520 port
->port_no
= u32_to_odp(port_no
);
1521 seq_change(dp
->port_seq
);
1522 unixctl_command_reply(conn
, NULL
);
1526 dpif_dummy_register__(const char *type
)
1528 struct dpif_class
*class;
1530 class = xmalloc(sizeof *class);
1531 *class = dpif_netdev_class
;
1532 class->type
= xstrdup(type
);
1533 dp_register_provider(class);
1537 dpif_dummy_register(bool override
)
1544 dp_enumerate_types(&types
);
1545 SSET_FOR_EACH (type
, &types
) {
1546 if (!dp_unregister_provider(type
)) {
1547 dpif_dummy_register__(type
);
1550 sset_destroy(&types
);
1553 dpif_dummy_register__("dummy");
1555 unixctl_command_register("dpif-dummy/change-port-number",
1556 "DP PORT NEW-NUMBER",
1557 3, 3, dpif_dummy_change_port_number
, NULL
);