/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/ioctl.h>

#include "classifier.h"
#include "dpif-provider.h"
#include "dynamic-string.h"
#include "meta-flow.h"
#include "netdev-vport.h"
#include "odp-execute.h"
#include "ofp-print.h"
#include "poll-loop.h"
VLOG_DEFINE_THIS_MODULE(dpif_netdev);
/* By default, choose a priority in the middle. */
#define NETDEV_RULE_PRIORITY 0x8000
/* Configuration parameters. */
enum { MAX_PORTS = 256 };       /* Maximum number of ports. */
enum { MAX_FLOWS = 65536 };     /* Maximum number of flows in flow table. */

/* Enough headroom to add a vlan tag, plus an extra 2 bytes to allow IP
 * headers to be aligned on a 4-byte boundary. */
enum { DP_NETDEV_HEADROOM = 2 + VLAN_HEADER_LEN };

enum { N_QUEUES = 2 };          /* Number of queues for dpif_recv(). */
enum { MAX_QUEUE_LEN = 128 };   /* Maximum number of packets per queue. */
enum { QUEUE_MASK = MAX_QUEUE_LEN - 1 };
BUILD_ASSERT_DECL(IS_POW2(MAX_QUEUE_LEN));
struct dp_netdev_upcall {
    struct dpif_upcall upcall;  /* Queued upcall information. */
    struct ofpbuf buf;          /* ofpbuf instance for upcall.packet. */
};

struct dp_netdev_queue {
    struct dp_netdev_upcall upcalls[MAX_QUEUE_LEN];
    unsigned int head, tail;
};
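
/* 'head' and 'tail' are free-running counters: a slot is addressed as
 * upcalls[counter & QUEUE_MASK], so the ring holds at most
 * 'head - tail' == MAX_QUEUE_LEN packets.  This indexing trick only works
 * because MAX_QUEUE_LEN is a power of 2, which is what the
 * BUILD_ASSERT_DECL(IS_POW2(MAX_QUEUE_LEN)) above enforces. */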
/* Datapath based on the network device interface from netdev.h. */
struct dp_netdev {
    const struct dpif_class *class;
    char *name;
    int open_cnt;
    bool destroyed;
    int max_mtu;                /* Maximum MTU of any port added so far. */

    struct dp_netdev_queue queues[N_QUEUES];
    struct classifier cls;      /* Classifier. */
    struct hmap flow_table;     /* Flow table. */
    struct seq *queue_seq;      /* Incremented whenever a packet is queued. */

    long long int n_hit;        /* Number of flow table matches. */
    long long int n_missed;     /* Number of flow table misses. */
    long long int n_lost;       /* Number of misses not passed to client. */

    struct dp_netdev_port *ports[MAX_PORTS];
    struct list port_list;
    struct seq *port_seq;       /* Incremented whenever a port changes. */
};
/* A port in a netdev-based datapath. */
struct dp_netdev_port {
    odp_port_t port_no;         /* Index into dp_netdev's 'ports'. */
    struct list node;           /* Element in dp_netdev's 'port_list'. */
    struct netdev *netdev;
    struct netdev_saved_flags *sf;
    struct netdev_rx *rx;
    char *type;                 /* Port type as requested by user. */
};
/* A flow in dp_netdev's 'flow_table'. */
struct dp_netdev_flow {
    /* Packet classification. */
    struct cls_rule cr;         /* In owning dp_netdev's 'cls'. */

    /* Hash table index by unmasked flow. */
    struct hmap_node node;      /* In owning dp_netdev's 'flow_table'. */
    struct flow flow;           /* The flow that created this entry. */

    long long int used;         /* Last used time, in monotonic msecs. */
    long long int packet_count; /* Number of packets matched. */
    long long int byte_count;   /* Number of bytes matched. */
    uint16_t tcp_flags;         /* Bitwise-OR of seen tcp_flags values. */

    struct nlattr *actions;
    size_t actions_len;
};
/* Interface to netdev-based datapath. */
struct dpif_netdev {
    struct dpif dpif;
    struct dp_netdev *dp;
    uint64_t last_port_seq;
};
/* All netdev-based datapaths. */
static struct shash dp_netdevs = SHASH_INITIALIZER(&dp_netdevs);

/* Global lock for all data. */
static struct ovs_mutex dp_netdev_mutex = OVS_MUTEX_INITIALIZER;
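
/* Locking discipline, as far as it can be read off the functions below:
 * every dpif_netdev_* entry point acquires 'dp_netdev_mutex' before
 * touching any datapath state, so one coarse-grained lock serializes all
 * netdev datapaths; the classifier is additionally protected by its own
 * 'cls.rwlock' for flow lookup, insertion, and removal. */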
static int get_port_by_number(struct dp_netdev *, odp_port_t port_no,
                              struct dp_netdev_port **portp);
static int get_port_by_name(struct dp_netdev *, const char *devname,
                            struct dp_netdev_port **portp);
static void dp_netdev_free(struct dp_netdev *);
static void dp_netdev_flow_flush(struct dp_netdev *);
static int do_add_port(struct dp_netdev *, const char *devname,
                       const char *type, odp_port_t port_no);
static int do_del_port(struct dp_netdev *, odp_port_t port_no);
static int dpif_netdev_open(const struct dpif_class *, const char *name,
                            bool create, struct dpif **);
static int dp_netdev_output_userspace(struct dp_netdev *, const struct ofpbuf *,
                                      int queue_no, const struct flow *,
                                      const struct nlattr *userdata);
static void dp_netdev_execute_actions(struct dp_netdev *, const struct flow *,
                                      struct ofpbuf *,
                                      const struct nlattr *actions,
                                      size_t actions_len);
static void dp_netdev_port_input(struct dp_netdev *dp,
                                 struct dp_netdev_port *port,
                                 struct ofpbuf *packet, uint32_t skb_priority,
                                 uint32_t pkt_mark, const struct flow_tnl *tnl);
static struct dpif_netdev *
dpif_netdev_cast(const struct dpif *dpif)
{
    ovs_assert(dpif->dpif_class->open == dpif_netdev_open);
    return CONTAINER_OF(dpif, struct dpif_netdev, dpif);
}

static struct dp_netdev *
get_dp_netdev(const struct dpif *dpif)
{
    return dpif_netdev_cast(dpif)->dp;
}
static int
dpif_netdev_enumerate(struct sset *all_dps)
{
    struct shash_node *node;

    ovs_mutex_lock(&dp_netdev_mutex);
    SHASH_FOR_EACH(node, &dp_netdevs) {
        sset_add(all_dps, node->name);
    }
    ovs_mutex_unlock(&dp_netdev_mutex);

    return 0;
}
static bool
dpif_netdev_class_is_dummy(const struct dpif_class *class)
{
    return class != &dpif_netdev_class;
}
static const char *
dpif_netdev_port_open_type(const struct dpif_class *class, const char *type)
{
    return strcmp(type, "internal") ? type
           : dpif_netdev_class_is_dummy(class) ? "dummy"
           : "tap";
}
static struct dpif *
create_dpif_netdev(struct dp_netdev *dp)
{
    uint16_t netflow_id = hash_string(dp->name, 0);
    struct dpif_netdev *dpif;

    dp->open_cnt++;

    dpif = xmalloc(sizeof *dpif);
    dpif_init(&dpif->dpif, dp->class, dp->name, netflow_id >> 8, netflow_id);
    dpif->dp = dp;
    dpif->last_port_seq = seq_read(dp->port_seq);

    return &dpif->dpif;
}
/* Choose an unused, non-zero port number and return it on success.
 * Return ODPP_NONE on failure. */
static odp_port_t
choose_port(struct dp_netdev *dp, const char *name)
{
    uint32_t port_no;

    if (dp->class != &dpif_netdev_class) {
        const char *p;
        int start_no = 0;

        /* If the port name begins with "br", start the number search at
         * 100 to make writing tests easier. */
        if (!strncmp(name, "br", 2)) {
            start_no = 100;
        }

        /* If the port name contains a number, try to assign that port number.
         * This can make writing unit tests easier because port numbers are
         * predictable. */
        for (p = name; *p != '\0'; p++) {
            if (isdigit((unsigned char) *p)) {
                port_no = start_no + strtol(p, NULL, 10);
                if (port_no > 0 && port_no < MAX_PORTS
                    && !dp->ports[port_no]) {
                    return u32_to_odp(port_no);
                }
                break;
            }
        }
    }

    for (port_no = 1; port_no < MAX_PORTS; port_no++) {
        if (!dp->ports[port_no]) {
            return u32_to_odp(port_no);
        }
    }

    return ODPP_NONE;
}
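
/* For example (derived from the rules above): a port named "br0" would
 * first try number 100 (the "br" prefix starts the search at 100, plus the
 * trailing 0), "eth2" would try 2, and a name without digits simply takes
 * the lowest free number. */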
static int
create_dp_netdev(const char *name, const struct dpif_class *class,
                 struct dp_netdev **dpp)
{
    struct dp_netdev *dp;
    int error;
    int i;

    dp = xzalloc(sizeof *dp);
    dp->class = class;
    dp->name = xstrdup(name);
    dp->max_mtu = ETH_PAYLOAD_MAX;
    for (i = 0; i < N_QUEUES; i++) {
        dp->queues[i].head = dp->queues[i].tail = 0;
    }
    dp->queue_seq = seq_create();
    classifier_init(&dp->cls, NULL);
    hmap_init(&dp->flow_table);
    list_init(&dp->port_list);
    dp->port_seq = seq_create();

    error = do_add_port(dp, name, "internal", ODPP_LOCAL);
    if (error) {
        dp_netdev_free(dp);
        return error;
    }

    shash_add(&dp_netdevs, name, dp);

    *dpp = dp;
    return 0;
}
static int
dpif_netdev_open(const struct dpif_class *class, const char *name,
                 bool create, struct dpif **dpifp)
{
    struct dp_netdev *dp;
    int error;

    ovs_mutex_lock(&dp_netdev_mutex);
    dp = shash_find_data(&dp_netdevs, name);
    if (!dp) {
        error = create ? create_dp_netdev(name, class, &dp) : ENODEV;
    } else {
        error = (dp->class != class ? EINVAL
                 : create ? EEXIST
                 : 0);
    }
    if (!error) {
        *dpifp = create_dpif_netdev(dp);
    }
    ovs_mutex_unlock(&dp_netdev_mutex);

    return error;
}
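
/* Hypothetical usage sketch (not from this file): clients normally reach
 * dpif_netdev_open() through the generic dpif layer rather than calling it
 * directly, e.g.:
 *
 *     struct dpif *dpif;
 *     int error = dpif_open("dp0", "netdev", &dpif);
 *     if (!error) {
 *         ...use the datapath...
 *         dpif_close(dpif);
 *     }
 *
 * dpif_open() looks up the provider class by type and invokes its ->open
 * hook, which for "netdev" (and the "dummy" class used in tests) is the
 * function above. */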
static void
dp_netdev_purge_queues(struct dp_netdev *dp)
{
    int i;

    for (i = 0; i < N_QUEUES; i++) {
        struct dp_netdev_queue *q = &dp->queues[i];

        while (q->tail != q->head) {
            struct dp_netdev_upcall *u = &q->upcalls[q->tail++ & QUEUE_MASK];
            ofpbuf_uninit(&u->buf);
        }
    }
}
static void
dp_netdev_free(struct dp_netdev *dp)
{
    struct dp_netdev_port *port, *next;

    dp_netdev_flow_flush(dp);
    LIST_FOR_EACH_SAFE (port, next, node, &dp->port_list) {
        do_del_port(dp, port->port_no);
    }
    dp_netdev_purge_queues(dp);
    seq_destroy(dp->queue_seq);
    classifier_destroy(&dp->cls);
    hmap_destroy(&dp->flow_table);
    seq_destroy(dp->port_seq);
    free(dp->name);
    free(dp);
}
static void
dpif_netdev_close(struct dpif *dpif)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);

    ovs_mutex_lock(&dp_netdev_mutex);

    ovs_assert(dp->open_cnt > 0);
    if (--dp->open_cnt == 0 && dp->destroyed) {
        shash_find_and_delete(&dp_netdevs, dp->name);
        dp_netdev_free(dp);
    }
    free(dpif);

    ovs_mutex_unlock(&dp_netdev_mutex);
}
static int
dpif_netdev_destroy(struct dpif *dpif)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);

    ovs_mutex_lock(&dp_netdev_mutex);
    dp->destroyed = true;
    ovs_mutex_unlock(&dp_netdev_mutex);

    return 0;
}
static int
dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);

    ovs_mutex_lock(&dp_netdev_mutex);
    stats->n_flows = hmap_count(&dp->flow_table);
    stats->n_hit = dp->n_hit;
    stats->n_missed = dp->n_missed;
    stats->n_lost = dp->n_lost;
    stats->n_masks = UINT64_MAX;
    stats->n_mask_hit = UINT64_MAX;
    ovs_mutex_unlock(&dp_netdev_mutex);

    return 0;
}
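
/* 'n_masks' and 'n_mask_hit' are reported as UINT64_MAX because this
 * datapath keeps no megaflow-mask statistics; as far as one can tell from
 * this file, UINT64_MAX acts as a "statistic not supported" marker for
 * callers. */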
static int
do_add_port(struct dp_netdev *dp, const char *devname, const char *type,
            odp_port_t port_no)
{
    struct netdev_saved_flags *sf;
    struct dp_netdev_port *port;
    struct netdev *netdev;
    struct netdev_rx *rx;
    enum netdev_flags flags;
    const char *open_type;
    int mtu;
    int error;

    /* XXX reject devices already in some dp_netdev. */

    /* Open and validate network device. */
    open_type = dpif_netdev_port_open_type(dp->class, type);
    error = netdev_open(devname, open_type, &netdev);
    if (error) {
        return error;
    }
    /* XXX reject non-Ethernet devices */

    netdev_get_flags(netdev, &flags);
    if (flags & NETDEV_LOOPBACK) {
        VLOG_ERR("%s: cannot add a loopback device", devname);
        netdev_close(netdev);
        return EINVAL;
    }

    error = netdev_rx_open(netdev, &rx);
    if (error
        && !(error == EOPNOTSUPP && dpif_netdev_class_is_dummy(dp->class))) {
        VLOG_ERR("%s: cannot receive packets on this network device (%s)",
                 devname, ovs_strerror(error));
        netdev_close(netdev);
        return error;
    }

    error = netdev_turn_flags_on(netdev, NETDEV_PROMISC, &sf);
    if (error) {
        netdev_rx_close(rx);
        netdev_close(netdev);
        return error;
    }

    port = xmalloc(sizeof *port);
    port->port_no = port_no;
    port->netdev = netdev;
    port->sf = sf;
    port->rx = rx;
    port->type = xstrdup(type);

    error = netdev_get_mtu(netdev, &mtu);
    if (!error && mtu > dp->max_mtu) {
        dp->max_mtu = mtu;
    }

    list_push_back(&dp->port_list, &port->node);
    dp->ports[odp_to_u32(port_no)] = port;
    seq_change(dp->port_seq);

    return 0;
}
static int
dpif_netdev_port_add(struct dpif *dpif, struct netdev *netdev,
                     odp_port_t *port_nop)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
    const char *dpif_port;
    odp_port_t port_no;
    int error;

    ovs_mutex_lock(&dp_netdev_mutex);
    dpif_port = netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
    if (*port_nop != ODPP_NONE) {
        uint32_t port_idx = odp_to_u32(*port_nop);
        if (port_idx >= MAX_PORTS) {
            error = EFBIG;
        } else if (dp->ports[port_idx]) {
            error = EBUSY;
        } else {
            error = 0;
            port_no = *port_nop;
        }
    } else {
        port_no = choose_port(dp, dpif_port);
        error = port_no == ODPP_NONE ? EFBIG : 0;
    }
    if (!error) {
        *port_nop = port_no;
        error = do_add_port(dp, dpif_port, netdev_get_type(netdev), port_no);
    }
    ovs_mutex_unlock(&dp_netdev_mutex);

    return error;
}
static int
dpif_netdev_port_del(struct dpif *dpif, odp_port_t port_no)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    int error;

    ovs_mutex_lock(&dp_netdev_mutex);
    error = port_no == ODPP_LOCAL ? EINVAL : do_del_port(dp, port_no);
    ovs_mutex_unlock(&dp_netdev_mutex);

    return error;
}
static bool
is_valid_port_number(odp_port_t port_no)
{
    return odp_to_u32(port_no) < MAX_PORTS;
}
static int
get_port_by_number(struct dp_netdev *dp,
                   odp_port_t port_no, struct dp_netdev_port **portp)
{
    if (!is_valid_port_number(port_no)) {
        *portp = NULL;
        return EINVAL;
    } else {
        *portp = dp->ports[odp_to_u32(port_no)];
        return *portp ? 0 : ENOENT;
    }
}
static int
get_port_by_name(struct dp_netdev *dp,
                 const char *devname, struct dp_netdev_port **portp)
{
    struct dp_netdev_port *port;

    LIST_FOR_EACH (port, node, &dp->port_list) {
        if (!strcmp(netdev_get_name(port->netdev), devname)) {
            *portp = port;
            return 0;
        }
    }
    return ENODEV;
}
static int
do_del_port(struct dp_netdev *dp, odp_port_t port_no)
{
    struct dp_netdev_port *port;
    int error;

    error = get_port_by_number(dp, port_no, &port);
    if (error) {
        return error;
    }

    list_remove(&port->node);
    dp->ports[odp_to_u32(port_no)] = NULL;
    seq_change(dp->port_seq);

    netdev_close(port->netdev);
    netdev_restore_flags(port->sf);
    netdev_rx_close(port->rx);
    free(port->type);
    free(port);

    return 0;
}
static void
answer_port_query(const struct dp_netdev_port *port,
                  struct dpif_port *dpif_port)
{
    dpif_port->name = xstrdup(netdev_get_name(port->netdev));
    dpif_port->type = xstrdup(port->type);
    dpif_port->port_no = port->port_no;
}
static int
dpif_netdev_port_query_by_number(const struct dpif *dpif, odp_port_t port_no,
                                 struct dpif_port *dpif_port)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_port *port;
    int error;

    ovs_mutex_lock(&dp_netdev_mutex);
    error = get_port_by_number(dp, port_no, &port);
    if (!error && dpif_port) {
        answer_port_query(port, dpif_port);
    }
    ovs_mutex_unlock(&dp_netdev_mutex);

    return error;
}
static int
dpif_netdev_port_query_by_name(const struct dpif *dpif, const char *devname,
                               struct dpif_port *dpif_port)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_port *port;
    int error;

    ovs_mutex_lock(&dp_netdev_mutex);
    error = get_port_by_name(dp, devname, &port);
    if (!error && dpif_port) {
        answer_port_query(port, dpif_port);
    }
    ovs_mutex_unlock(&dp_netdev_mutex);

    return error;
}
static int
dpif_netdev_get_max_ports(const struct dpif *dpif OVS_UNUSED)
{
    return MAX_PORTS;
}
static void
dp_netdev_free_flow(struct dp_netdev *dp, struct dp_netdev_flow *netdev_flow)
{
    ovs_rwlock_wrlock(&dp->cls.rwlock);
    classifier_remove(&dp->cls, &netdev_flow->cr);
    ovs_rwlock_unlock(&dp->cls.rwlock);
    cls_rule_destroy(&netdev_flow->cr);

    hmap_remove(&dp->flow_table, &netdev_flow->node);
    free(netdev_flow->actions);
    free(netdev_flow);
}
static void
dp_netdev_flow_flush(struct dp_netdev *dp)
{
    struct dp_netdev_flow *netdev_flow, *next;

    HMAP_FOR_EACH_SAFE (netdev_flow, next, node, &dp->flow_table) {
        dp_netdev_free_flow(dp, netdev_flow);
    }
}
static int
dpif_netdev_flow_flush(struct dpif *dpif)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);

    ovs_mutex_lock(&dp_netdev_mutex);
    dp_netdev_flow_flush(dp);
    ovs_mutex_unlock(&dp_netdev_mutex);

    return 0;
}
struct dp_netdev_port_state {
    odp_port_t port_no;
    char *name;
};

static int
dpif_netdev_port_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
{
    *statep = xzalloc(sizeof(struct dp_netdev_port_state));
    return 0;
}
static int
dpif_netdev_port_dump_next(const struct dpif *dpif, void *state_,
                           struct dpif_port *dpif_port)
{
    struct dp_netdev_port_state *state = state_;
    struct dp_netdev *dp = get_dp_netdev(dpif);
    uint32_t port_idx;

    ovs_mutex_lock(&dp_netdev_mutex);
    for (port_idx = odp_to_u32(state->port_no);
         port_idx < MAX_PORTS; port_idx++) {
        struct dp_netdev_port *port = dp->ports[port_idx];
        if (port) {
            free(state->name);
            state->name = xstrdup(netdev_get_name(port->netdev));
            dpif_port->name = state->name;
            dpif_port->type = port->type;
            dpif_port->port_no = port->port_no;
            state->port_no = u32_to_odp(port_idx + 1);
            ovs_mutex_unlock(&dp_netdev_mutex);

            return 0;
        }
    }
    ovs_mutex_unlock(&dp_netdev_mutex);

    return EOF;
}
static int
dpif_netdev_port_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
{
    struct dp_netdev_port_state *state = state_;

    free(state->name);
    free(state);
    return 0;
}
static int
dpif_netdev_port_poll(const struct dpif *dpif_, char **devnamep OVS_UNUSED)
{
    struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);
    uint64_t new_port_seq;
    int error;

    ovs_mutex_lock(&dp_netdev_mutex);
    new_port_seq = seq_read(dpif->dp->port_seq);
    if (dpif->last_port_seq != new_port_seq) {
        dpif->last_port_seq = new_port_seq;
        error = ENOBUFS;
    } else {
        error = EAGAIN;
    }
    ovs_mutex_unlock(&dp_netdev_mutex);

    return error;
}
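
/* Return-value convention (the usual dpif provider contract): ENOBUFS
 * tells the caller that ports changed since it last polled, EAGAIN that
 * nothing has changed yet. */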
static void
dpif_netdev_port_poll_wait(const struct dpif *dpif_)
{
    struct dpif_netdev *dpif = dpif_netdev_cast(dpif_);

    ovs_mutex_lock(&dp_netdev_mutex);
    seq_wait(dpif->dp->port_seq, dpif->last_port_seq);
    ovs_mutex_unlock(&dp_netdev_mutex);
}
static struct dp_netdev_flow *
dp_netdev_lookup_flow(const struct dp_netdev *dp, const struct flow *flow)
{
    struct cls_rule *cr;

    ovs_rwlock_wrlock(&dp->cls.rwlock);
    cr = classifier_lookup(&dp->cls, flow, NULL);
    ovs_rwlock_unlock(&dp->cls.rwlock);

    return (cr
            ? CONTAINER_OF(cr, struct dp_netdev_flow, cr)
            : NULL);
}
static struct dp_netdev_flow *
dp_netdev_find_flow(const struct dp_netdev *dp, const struct flow *flow)
{
    struct dp_netdev_flow *netdev_flow;

    HMAP_FOR_EACH_WITH_HASH (netdev_flow, node, flow_hash(flow, 0),
                             &dp->flow_table) {
        if (flow_equal(&netdev_flow->flow, flow)) {
            return netdev_flow;
        }
    }
    return NULL;
}
static void
get_dpif_flow_stats(struct dp_netdev_flow *netdev_flow,
                    struct dpif_flow_stats *stats)
{
    stats->n_packets = netdev_flow->packet_count;
    stats->n_bytes = netdev_flow->byte_count;
    stats->used = netdev_flow->used;
    stats->tcp_flags = netdev_flow->tcp_flags;
}
static int
dpif_netdev_mask_from_nlattrs(const struct nlattr *key, uint32_t key_len,
                              const struct nlattr *mask_key,
                              uint32_t mask_key_len, const struct flow *flow,
                              struct flow *mask)
{
    if (mask_key_len) {
        if (odp_flow_key_to_mask(mask_key, mask_key_len, mask, flow)) {
            /* This should not happen: it indicates that
             * odp_flow_key_from_mask() and odp_flow_key_to_mask()
             * disagree on the acceptable form of a mask.  Log the problem
             * as an error, with enough details to enable debugging. */
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);

            if (!VLOG_DROP_ERR(&rl)) {
                struct ds s;

                ds_init(&s);
                odp_flow_format(key, key_len, mask_key, mask_key_len, NULL, &s,
                                true);
                VLOG_ERR("internal error parsing flow mask %s", ds_cstr(&s));
                ds_destroy(&s);
            }

            return EINVAL;
        }

        /* Force unwildcard the in_port. */
        mask->in_port.odp_port = u32_to_odp(UINT32_MAX);
    } else {
        enum mf_field_id id;
        /* No mask key, unwildcard everything except fields whose
         * prerequisites are not met. */
        memset(mask, 0x0, sizeof *mask);

        for (id = 0; id < MFF_N_IDS; ++id) {
            /* Skip registers and metadata. */
            if (!(id >= MFF_REG0 && id < MFF_REG0 + FLOW_N_REGS)
                && id != MFF_METADATA) {
                const struct mf_field *mf = mf_from_id(id);
                if (mf_are_prereqs_ok(mf, flow)) {
                    mf_mask_field(mf, mask);
                }
            }
        }
    }

    return 0;
}
static int
dpif_netdev_flow_from_nlattrs(const struct nlattr *key, uint32_t key_len,
                              struct flow *flow)
{
    odp_port_t in_port;

    if (odp_flow_key_to_flow(key, key_len, flow)) {
        /* This should not happen: it indicates that odp_flow_key_from_flow()
         * and odp_flow_key_to_flow() disagree on the acceptable form of a
         * flow.  Log the problem as an error, with enough details to enable
         * debugging. */
        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);

        if (!VLOG_DROP_ERR(&rl)) {
            struct ds s;

            ds_init(&s);
            odp_flow_format(key, key_len, NULL, 0, NULL, &s, true);
            VLOG_ERR("internal error parsing flow key %s", ds_cstr(&s));
            ds_destroy(&s);
        }

        return EINVAL;
    }

    in_port = flow->in_port.odp_port;
    if (!is_valid_port_number(in_port) && in_port != ODPP_NONE) {
        return EINVAL;
    }

    return 0;
}
static int
dpif_netdev_flow_get(const struct dpif *dpif,
                     const struct nlattr *nl_key, size_t nl_key_len,
                     struct ofpbuf **actionsp, struct dpif_flow_stats *stats)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_flow *netdev_flow;
    struct flow key;
    int error;

    error = dpif_netdev_flow_from_nlattrs(nl_key, nl_key_len, &key);
    if (error) {
        return error;
    }

    ovs_mutex_lock(&dp_netdev_mutex);
    netdev_flow = dp_netdev_find_flow(dp, &key);
    if (netdev_flow) {
        if (stats) {
            get_dpif_flow_stats(netdev_flow, stats);
        }
        if (actionsp) {
            *actionsp = ofpbuf_clone_data(netdev_flow->actions,
                                          netdev_flow->actions_len);
        }
    } else {
        error = ENOENT;
    }
    ovs_mutex_unlock(&dp_netdev_mutex);

    return error;
}
static int
set_flow_actions(struct dp_netdev_flow *netdev_flow,
                 const struct nlattr *actions, size_t actions_len)
{
    netdev_flow->actions = xrealloc(netdev_flow->actions, actions_len);
    netdev_flow->actions_len = actions_len;
    memcpy(netdev_flow->actions, actions, actions_len);
    return 0;
}
static int
dp_netdev_flow_add(struct dp_netdev *dp, const struct flow *flow,
                   const struct flow_wildcards *wc,
                   const struct nlattr *actions,
                   size_t actions_len)
{
    struct dp_netdev_flow *netdev_flow;
    struct match match;
    int error;

    netdev_flow = xzalloc(sizeof *netdev_flow);
    netdev_flow->flow = *flow;

    match_init(&match, flow, wc);
    cls_rule_init(&netdev_flow->cr, &match, NETDEV_RULE_PRIORITY);
    ovs_rwlock_wrlock(&dp->cls.rwlock);
    classifier_insert(&dp->cls, &netdev_flow->cr);
    ovs_rwlock_unlock(&dp->cls.rwlock);

    error = set_flow_actions(netdev_flow, actions, actions_len);
    if (error) {
        ovs_rwlock_wrlock(&dp->cls.rwlock);
        classifier_remove(&dp->cls, &netdev_flow->cr);
        ovs_rwlock_unlock(&dp->cls.rwlock);
        cls_rule_destroy(&netdev_flow->cr);
        free(netdev_flow);
        return error;
    }

    hmap_insert(&dp->flow_table, &netdev_flow->node, flow_hash(flow, 0));
    return 0;
}
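
/* Note the ordering above: the rule is inserted into the classifier first
 * and rolled back if copying the actions fails, so a flow only ever
 * becomes visible in both 'cls' and 'flow_table' with a valid actions
 * list attached. */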
static void
clear_stats(struct dp_netdev_flow *netdev_flow)
{
    netdev_flow->used = 0;
    netdev_flow->packet_count = 0;
    netdev_flow->byte_count = 0;
    netdev_flow->tcp_flags = 0;
}
static int
dpif_netdev_flow_put(struct dpif *dpif, const struct dpif_flow_put *put)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_flow *netdev_flow;
    struct flow flow;
    struct flow_wildcards wc;
    int error;

    error = dpif_netdev_flow_from_nlattrs(put->key, put->key_len, &flow);
    if (error) {
        return error;
    }
    error = dpif_netdev_mask_from_nlattrs(put->key, put->key_len,
                                          put->mask, put->mask_len,
                                          &flow, &wc.masks);
    if (error) {
        return error;
    }

    ovs_mutex_lock(&dp_netdev_mutex);
    netdev_flow = dp_netdev_lookup_flow(dp, &flow);
    if (!netdev_flow) {
        if (put->flags & DPIF_FP_CREATE) {
            if (hmap_count(&dp->flow_table) < MAX_FLOWS) {
                if (put->stats) {
                    memset(put->stats, 0, sizeof *put->stats);
                }
                error = dp_netdev_flow_add(dp, &flow, &wc, put->actions,
                                           put->actions_len);
            } else {
                error = EFBIG;
            }
        } else {
            error = ENOENT;
        }
    } else {
        if (put->flags & DPIF_FP_MODIFY
            && flow_equal(&flow, &netdev_flow->flow)) {
            error = set_flow_actions(netdev_flow, put->actions,
                                     put->actions_len);
            if (!error && put->stats) {
                get_dpif_flow_stats(netdev_flow, put->stats);
            }
            if (put->flags & DPIF_FP_ZERO_STATS) {
                clear_stats(netdev_flow);
            }
        } else if (put->flags & DPIF_FP_CREATE) {
            error = EEXIST;
        } else {
            /* Overlapping flow. */
            error = EINVAL;
        }
    }
    ovs_mutex_unlock(&dp_netdev_mutex);

    return error;
}
static int
dpif_netdev_flow_del(struct dpif *dpif, const struct dpif_flow_del *del)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_flow *netdev_flow;
    struct flow key;
    int error;

    error = dpif_netdev_flow_from_nlattrs(del->key, del->key_len, &key);
    if (error) {
        return error;
    }

    ovs_mutex_lock(&dp_netdev_mutex);
    netdev_flow = dp_netdev_find_flow(dp, &key);
    if (netdev_flow) {
        if (del->stats) {
            get_dpif_flow_stats(netdev_flow, del->stats);
        }
        dp_netdev_free_flow(dp, netdev_flow);
    } else {
        error = ENOENT;
    }
    ovs_mutex_unlock(&dp_netdev_mutex);

    return error;
}
struct dp_netdev_flow_state {
    uint32_t bucket;
    uint32_t offset;
    struct nlattr *actions;
    struct odputil_keybuf keybuf;
    struct odputil_keybuf maskbuf;
    struct dpif_flow_stats stats;
};
static int
dpif_netdev_flow_dump_start(const struct dpif *dpif OVS_UNUSED, void **statep)
{
    struct dp_netdev_flow_state *state;

    *statep = state = xmalloc(sizeof *state);
    state->bucket = 0;
    state->offset = 0;
    state->actions = NULL;
    return 0;
}
static int
dpif_netdev_flow_dump_next(const struct dpif *dpif, void *state_,
                           const struct nlattr **key, size_t *key_len,
                           const struct nlattr **mask, size_t *mask_len,
                           const struct nlattr **actions, size_t *actions_len,
                           const struct dpif_flow_stats **stats)
{
    struct dp_netdev_flow_state *state = state_;
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct dp_netdev_flow *netdev_flow;
    struct hmap_node *node;

    ovs_mutex_lock(&dp_netdev_mutex);
    node = hmap_at_position(&dp->flow_table, &state->bucket, &state->offset);
    if (!node) {
        ovs_mutex_unlock(&dp_netdev_mutex);
        return EOF;
    }

    netdev_flow = CONTAINER_OF(node, struct dp_netdev_flow, node);

    if (key) {
        struct ofpbuf buf;

        ofpbuf_use_stack(&buf, &state->keybuf, sizeof state->keybuf);
        odp_flow_key_from_flow(&buf, &netdev_flow->flow,
                               netdev_flow->flow.in_port.odp_port);

        *key = buf.data;
        *key_len = buf.size;
    }

    if (mask) {
        struct ofpbuf buf;
        struct flow_wildcards wc;

        ofpbuf_use_stack(&buf, &state->maskbuf, sizeof state->maskbuf);
        minimask_expand(&netdev_flow->cr.match.mask, &wc);
        odp_flow_key_from_mask(&buf, &wc.masks, &netdev_flow->flow,
                               odp_to_u32(wc.masks.in_port.odp_port));

        *mask = buf.data;
        *mask_len = buf.size;
    }

    if (actions) {
        free(state->actions);
        state->actions = xmemdup(netdev_flow->actions,
                                 netdev_flow->actions_len);

        *actions = state->actions;
        *actions_len = netdev_flow->actions_len;
    }

    if (stats) {
        get_dpif_flow_stats(netdev_flow, &state->stats);
        *stats = &state->stats;
    }

    ovs_mutex_unlock(&dp_netdev_mutex);
    return 0;
}
static int
dpif_netdev_flow_dump_done(const struct dpif *dpif OVS_UNUSED, void *state_)
{
    struct dp_netdev_flow_state *state = state_;

    free(state->actions);
    free(state);
    return 0;
}
static int
dpif_netdev_execute(struct dpif *dpif, const struct dpif_execute *execute)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    struct flow md;
    int error;

    if (execute->packet->size < ETH_HEADER_LEN ||
        execute->packet->size > UINT16_MAX) {
        return EINVAL;
    }

    /* Get packet metadata. */
    error = dpif_netdev_flow_from_nlattrs(execute->key, execute->key_len, &md);
    if (!error) {
        struct ofpbuf *copy;
        struct flow key;

        /* Make a deep copy of 'packet', because we might modify its data. */
        copy = ofpbuf_clone_with_headroom(execute->packet, DP_NETDEV_HEADROOM);

        /* Extract flow key. */
        flow_extract(copy, md.skb_priority, md.pkt_mark, &md.tunnel,
                     &md.in_port, &key);

        ovs_mutex_lock(&dp_netdev_mutex);
        dp_netdev_execute_actions(dp, &key, copy,
                                  execute->actions, execute->actions_len);
        ovs_mutex_unlock(&dp_netdev_mutex);

        ofpbuf_delete(copy);
    }
    return error;
}
static int
dpif_netdev_recv_set(struct dpif *dpif OVS_UNUSED, bool enable OVS_UNUSED)
{
    return 0;
}
static int
dpif_netdev_queue_to_priority(const struct dpif *dpif OVS_UNUSED,
                              uint32_t queue_id, uint32_t *priority)
{
    *priority = queue_id;
    return 0;
}
static struct dp_netdev_queue *
find_nonempty_queue(struct dpif *dpif)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    int i;

    for (i = 0; i < N_QUEUES; i++) {
        struct dp_netdev_queue *q = &dp->queues[i];
        if (q->head != q->tail) {
            return q;
        }
    }
    return NULL;
}
static int
dpif_netdev_recv(struct dpif *dpif, struct dpif_upcall *upcall,
                 struct ofpbuf *buf)
{
    struct dp_netdev_queue *q;
    int error;

    ovs_mutex_lock(&dp_netdev_mutex);
    q = find_nonempty_queue(dpif);
    if (q) {
        struct dp_netdev_upcall *u = &q->upcalls[q->tail++ & QUEUE_MASK];

        *upcall = u->upcall;
        upcall->packet = buf;

        ofpbuf_uninit(buf);
        *buf = u->buf;

        error = 0;
    } else {
        error = EAGAIN;
    }
    ovs_mutex_unlock(&dp_netdev_mutex);

    return error;
}
static void
dpif_netdev_recv_wait(struct dpif *dpif)
{
    struct dp_netdev *dp = get_dp_netdev(dpif);
    uint64_t seq;

    ovs_mutex_lock(&dp_netdev_mutex);
    seq = seq_read(dp->queue_seq);
    if (find_nonempty_queue(dpif)) {
        poll_immediate_wake();
    } else {
        seq_wait(dp->queue_seq, seq);
    }
    ovs_mutex_unlock(&dp_netdev_mutex);
}
static void
dpif_netdev_recv_purge(struct dpif *dpif)
{
    struct dpif_netdev *dpif_netdev = dpif_netdev_cast(dpif);

    ovs_mutex_lock(&dp_netdev_mutex);
    dp_netdev_purge_queues(dpif_netdev->dp);
    ovs_mutex_unlock(&dp_netdev_mutex);
}
static void
dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow,
                    const struct ofpbuf *packet)
{
    netdev_flow->used = time_msec();
    netdev_flow->packet_count++;
    netdev_flow->byte_count += packet->size;
    netdev_flow->tcp_flags |= packet_get_tcp_flags(packet, &netdev_flow->flow);
}
static void
dp_netdev_port_input(struct dp_netdev *dp, struct dp_netdev_port *port,
                     struct ofpbuf *packet, uint32_t skb_priority,
                     uint32_t pkt_mark, const struct flow_tnl *tnl)
{
    struct dp_netdev_flow *netdev_flow;
    struct flow key;
    union flow_in_port in_port_;

    if (packet->size < ETH_HEADER_LEN) {
        return;
    }
    in_port_.odp_port = port->port_no;
    flow_extract(packet, skb_priority, pkt_mark, tnl, &in_port_, &key);
    netdev_flow = dp_netdev_lookup_flow(dp, &key);
    if (netdev_flow) {
        dp_netdev_flow_used(netdev_flow, packet);
        dp_netdev_execute_actions(dp, &key, packet,
                                  netdev_flow->actions,
                                  netdev_flow->actions_len);
        dp->n_hit++;
    } else {
        dp->n_missed++;
        dp_netdev_output_userspace(dp, packet, DPIF_UC_MISS, &key, NULL);
    }
}
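
/* This is the datapath fast path: a classifier hit updates the flow's
 * statistics and executes its actions inline, while a miss hands the
 * packet to userspace as a DPIF_UC_MISS upcall (and, when the upcall
 * queue is full, dp_netdev_output_userspace() counts the packet in
 * 'n_lost' instead). */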
static void
dpif_netdev_run(struct dpif *dpif)
{
    struct dp_netdev_port *port;
    struct dp_netdev *dp;
    struct ofpbuf packet;

    ovs_mutex_lock(&dp_netdev_mutex);
    dp = get_dp_netdev(dpif);
    ofpbuf_init(&packet,
                DP_NETDEV_HEADROOM + VLAN_ETH_HEADER_LEN + dp->max_mtu);

    LIST_FOR_EACH (port, node, &dp->port_list) {
        int error;

        /* Reset packet contents. */
        ofpbuf_clear(&packet);
        ofpbuf_reserve(&packet, DP_NETDEV_HEADROOM);

        error = port->rx ? netdev_rx_recv(port->rx, &packet) : EOPNOTSUPP;
        if (!error) {
            dp_netdev_port_input(dp, port, &packet, 0, 0, NULL);
        } else if (error != EAGAIN && error != EOPNOTSUPP) {
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);

            VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
                        netdev_get_name(port->netdev), ovs_strerror(error));
        }
    }
    ofpbuf_uninit(&packet);
    ovs_mutex_unlock(&dp_netdev_mutex);
}
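
/* A single receive buffer is shared by all ports in each call: it is
 * allocated once, sized for the largest possible frame (headroom plus a
 * VLAN Ethernet header plus the datapath's maximum MTU), then cleared and
 * re-reserved before each netdev_rx_recv() instead of allocating per
 * packet. */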
static void
dpif_netdev_wait(struct dpif *dpif)
{
    struct dp_netdev_port *port;

    /* There is a race here, if thread A calls dpif_netdev_wait(dpif) and
     * thread B calls dpif_port_add(dpif) or dpif_port_remove(dpif) before
     * A makes it to poll_block().
     *
     * But I think it doesn't matter:
     *
     *     - In the dpif_port_add() case, A will not wake up when a packet
     *       arrives on the new port, but this would also happen if the
     *       ordering were reversed.
     *
     *     - In the dpif_port_remove() case, A might wake up spuriously, but
     *       that is harmless. */

    ovs_mutex_lock(&dp_netdev_mutex);
    LIST_FOR_EACH (port, node, &get_dp_netdev(dpif)->port_list) {
        if (port->rx) {
            netdev_rx_wait(port->rx);
        }
    }
    ovs_mutex_unlock(&dp_netdev_mutex);
}
static int
dp_netdev_output_userspace(struct dp_netdev *dp, const struct ofpbuf *packet,
                           int queue_no, const struct flow *flow,
                           const struct nlattr *userdata)
{
    struct dp_netdev_queue *q = &dp->queues[queue_no];
    if (q->head - q->tail < MAX_QUEUE_LEN) {
        struct dp_netdev_upcall *u = &q->upcalls[q->head++ & QUEUE_MASK];
        struct dpif_upcall *upcall = &u->upcall;
        struct ofpbuf *buf = &u->buf;
        size_t buf_size;

        upcall->type = queue_no;

        /* Allocate buffer big enough for everything. */
        buf_size = ODPUTIL_FLOW_KEY_BYTES + 2 + packet->size;
        if (userdata) {
            buf_size += NLA_ALIGN(userdata->nla_len);
        }
        ofpbuf_init(buf, buf_size);

        /* Put ODP flow. */
        odp_flow_key_from_flow(buf, flow, flow->in_port.odp_port);
        upcall->key = buf->data;
        upcall->key_len = buf->size;

        /* Put userdata. */
        if (userdata) {
            upcall->userdata = ofpbuf_put(buf, userdata,
                                          NLA_ALIGN(userdata->nla_len));
        }

        /* Put packet.
         *
         * We adjust 'data' and 'size' in 'buf' so that only the packet itself
         * is visible in 'upcall->packet'.  The ODP flow and (if present)
         * userdata become part of the headroom. */
        ofpbuf_put_zeros(buf, 2);
        buf->data = ofpbuf_put(buf, packet->data, packet->size);
        buf->size = packet->size;
        upcall->packet = buf;

        seq_change(dp->queue_seq);

        return 0;
    } else {
        dp->n_lost++;
        return ENOBUFS;
    }
}
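
/* Layout of 'buf' built above (a sketch):
 *
 *    +--------------+---------------------+---------+-------------+
 *    | ODP flow key | userdata (optional) | 2 zeros | packet data |
 *    +--------------+---------------------+---------+-------------+
 *                                                   ^
 *                                        buf->data--+  buf->size == packet->size
 *
 * Only the packet is visible through 'upcall->packet'; the key, the
 * userdata, and the two pad bytes sit in the headroom, with the pad
 * keeping the packet's IP header 4-byte aligned in the spirit of
 * DP_NETDEV_HEADROOM at the top of this file. */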
struct dp_netdev_execute_aux {
    struct dp_netdev *dp;
    const struct flow *key;
};
static void
dp_netdev_action_output(void *aux_, struct ofpbuf *packet,
                        const struct flow *flow OVS_UNUSED,
                        odp_port_t out_port)
{
    struct dp_netdev_execute_aux *aux = aux_;
    struct dp_netdev_port *p = aux->dp->ports[odp_to_u32(out_port)];

    if (p) {
        netdev_send(p->netdev, packet);
    }
}
static void
dp_netdev_action_userspace(void *aux_, struct ofpbuf *packet,
                           const struct flow *flow OVS_UNUSED,
                           const struct nlattr *a)
{
    struct dp_netdev_execute_aux *aux = aux_;
    const struct nlattr *userdata;

    userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
    dp_netdev_output_userspace(aux->dp, packet, DPIF_UC_ACTION, aux->key,
                               userdata);
}
static void
dp_netdev_execute_actions(struct dp_netdev *dp, const struct flow *key,
                          struct ofpbuf *packet,
                          const struct nlattr *actions, size_t actions_len)
{
    struct dp_netdev_execute_aux aux = {dp, key};
    struct flow md = *key;   /* Packet metadata, may be modified by actions. */

    odp_execute_actions(&aux, packet, &md, actions, actions_len,
                        dp_netdev_action_output, dp_netdev_action_userspace);
}
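
/* odp_execute_actions() interprets the action list generically; this file
 * supplies only the two callbacks that need datapath context: output
 * (resolve 'out_port' to a dp_netdev_port and hand the packet to its
 * netdev) and userspace (queue a DPIF_UC_ACTION upcall).  The local copy
 * 'md' lets actions rewrite packet metadata without mutating the caller's
 * flow key. */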
const struct dpif_class dpif_netdev_class = {
    "netdev",
    dpif_netdev_enumerate,
    dpif_netdev_port_open_type,
    dpif_netdev_open,
    dpif_netdev_close,
    dpif_netdev_destroy,
    dpif_netdev_run,
    dpif_netdev_wait,
    dpif_netdev_get_stats,
    dpif_netdev_port_add,
    dpif_netdev_port_del,
    dpif_netdev_port_query_by_number,
    dpif_netdev_port_query_by_name,
    dpif_netdev_get_max_ports,
    NULL,                       /* port_get_pid */
    dpif_netdev_port_dump_start,
    dpif_netdev_port_dump_next,
    dpif_netdev_port_dump_done,
    dpif_netdev_port_poll,
    dpif_netdev_port_poll_wait,
    dpif_netdev_flow_get,
    dpif_netdev_flow_put,
    dpif_netdev_flow_del,
    dpif_netdev_flow_flush,
    dpif_netdev_flow_dump_start,
    dpif_netdev_flow_dump_next,
    dpif_netdev_flow_dump_done,
    dpif_netdev_execute,
    NULL,                       /* operate */
    dpif_netdev_recv_set,
    dpif_netdev_queue_to_priority,
    dpif_netdev_recv,
    dpif_netdev_recv_wait,
    dpif_netdev_recv_purge,
};
static void
dpif_dummy_change_port_number(struct unixctl_conn *conn, int argc OVS_UNUSED,
                              const char *argv[], void *aux OVS_UNUSED)
{
    struct dp_netdev_port *port;
    struct dp_netdev *dp;
    int port_no;

    dp = shash_find_data(&dp_netdevs, argv[1]);
    if (!dp || !dpif_netdev_class_is_dummy(dp->class)) {
        unixctl_command_reply_error(conn, "unknown datapath or not a dummy");
        return;
    }

    if (get_port_by_name(dp, argv[2], &port)) {
        unixctl_command_reply_error(conn, "unknown port");
        return;
    }

    port_no = atoi(argv[3]);
    if (port_no <= 0 || port_no >= MAX_PORTS) {
        unixctl_command_reply_error(conn, "bad port number");
        return;
    }
    if (dp->ports[port_no]) {
        unixctl_command_reply_error(conn, "port number already in use");
        return;
    }
    dp->ports[odp_to_u32(port->port_no)] = NULL;
    dp->ports[port_no] = port;
    port->port_no = u32_to_odp(port_no);
    seq_change(dp->port_seq);
    unixctl_command_reply(conn, NULL);
}
static void
dpif_dummy_register__(const char *type)
{
    struct dpif_class *class;

    class = xmalloc(sizeof *class);
    *class = dpif_netdev_class;
    class->type = xstrdup(type);
    dp_register_provider(class);
}
void
dpif_dummy_register(bool override)
{
    if (override) {
        struct sset types;
        const char *type;

        sset_init(&types);
        dp_enumerate_types(&types);
        SSET_FOR_EACH (type, &types) {
            if (!dp_unregister_provider(type)) {
                dpif_dummy_register__(type);
            }
        }
        sset_destroy(&types);
    }

    dpif_dummy_register__("dummy");

    unixctl_command_register("dpif-dummy/change-port-number",
                             "DP PORT NEW-NUMBER",
                             3, 3, dpif_dummy_change_port_number, NULL);
}
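
/* Hypothetical example (not from this file): with a dummy datapath named
 * "dp0", a test could renumber port "p1" with:
 *
 *     ovs-appctl dpif-dummy/change-port-number dp0 p1 100
 */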