]>
git.proxmox.com Git - mirror_ovs.git/blob - datapath/actions.c
2 * Distributed under the terms of the GNU GPL version 2.
3 * Copyright (c) 2007, 2008, 2009, 2010 Nicira Networks.
5 * Significant portions of this file may be copied from parts of the Linux
6 * kernel, by Linus Torvalds and others.
9 /* Functions for executing flow actions. */
11 #include <linux/skbuff.h>
14 #include <linux/tcp.h>
15 #include <linux/udp.h>
16 #include <linux/in6.h>
17 #include <linux/if_vlan.h>
18 #include <net/inet_ecn.h>
20 #include <net/checksum.h>
24 #include "openvswitch/datapath-protocol.h"
26 static struct sk_buff
*
27 make_writable(struct sk_buff
*skb
, unsigned min_headroom
, gfp_t gfp
)
29 if (skb_shared(skb
) || skb_cloned(skb
)) {
31 unsigned headroom
= max(min_headroom
, skb_headroom(skb
));
33 nskb
= skb_copy_expand(skb
, headroom
, skb_tailroom(skb
), gfp
);
35 #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
36 /* Before 2.6.24 these fields were not copied when
37 * doing an skb_copy_expand. */
38 nskb
->ip_summed
= skb
->ip_summed
;
39 nskb
->csum
= skb
->csum
;
41 #if defined(CONFIG_XEN) && defined(HAVE_PROTO_DATA_VALID)
42 /* These fields are copied in skb_clone but not in
43 * skb_copy or related functions. We need to manually
44 * copy them over here. */
45 nskb
->proto_data_valid
= skb
->proto_data_valid
;
46 nskb
->proto_csum_blank
= skb
->proto_csum_blank
;
52 unsigned int hdr_len
= (skb_transport_offset(skb
)
53 + sizeof(struct tcphdr
));
54 if (pskb_may_pull(skb
, min(hdr_len
, skb
->len
)))
62 static struct sk_buff
*
63 vlan_pull_tag(struct sk_buff
*skb
)
65 struct vlan_ethhdr
*vh
= vlan_eth_hdr(skb
);
69 /* Verify we were given a vlan packet */
70 if (vh
->h_vlan_proto
!= htons(ETH_P_8021Q
))
73 memmove(skb
->data
+ VLAN_HLEN
, skb
->data
, 2 * VLAN_ETH_ALEN
);
75 eh
= (struct ethhdr
*)skb_pull(skb
, VLAN_HLEN
);
77 skb
->protocol
= eh
->h_proto
;
78 skb
->mac_header
+= VLAN_HLEN
;
84 static struct sk_buff
*
85 modify_vlan_tci(struct datapath
*dp
, struct sk_buff
*skb
,
86 struct odp_flow_key
*key
, const union odp_action
*a
,
87 int n_actions
, gfp_t gfp
)
91 if (a
->type
== ODPAT_SET_VLAN_VID
) {
92 tci
= ntohs(a
->vlan_vid
.vlan_vid
);
94 key
->dl_vlan
= htons(tci
& mask
);
96 tci
= a
->vlan_pcp
.vlan_pcp
<< VLAN_PCP_SHIFT
;
100 skb
= make_writable(skb
, VLAN_HLEN
, gfp
);
102 return ERR_PTR(-ENOMEM
);
104 if (skb
->protocol
== htons(ETH_P_8021Q
)) {
105 /* Modify vlan id, but maintain other TCI values */
106 struct vlan_ethhdr
*vh
= vlan_eth_hdr(skb
);
107 vh
->h_vlan_TCI
= htons((ntohs(vh
->h_vlan_TCI
) & ~mask
) | tci
);
109 /* Add vlan header */
111 /* Set up checksumming pointers for checksum-deferred packets
112 * on Xen. Otherwise, dev_queue_xmit() will try to do this
113 * when we send the packet out on the wire, and it will fail at
114 * that point because skb_checksum_setup() will not look inside
115 * an 802.1Q header. */
116 vswitch_skb_checksum_setup(skb
);
118 /* GSO is not implemented for packets with an 802.1Q header, so
119 * we have to do segmentation before we add that header.
121 * GSO does work with hardware-accelerated VLAN tagging, but we
122 * can't use hardware-accelerated VLAN tagging since it
123 * requires the device to have a VLAN group configured (with
124 * e.g. vconfig(8)) and we don't do that.
126 * Having to do this here may be a performance loss, since we
127 * can't take advantage of TSO hardware support, although it
128 * does not make a measurable network performance difference
129 * for 1G Ethernet. Fixing that would require patching the
130 * kernel (either to add GSO support to the VLAN protocol or to
131 * support hardware-accelerated VLAN tagging without VLAN
132 * groups configured). */
133 if (skb_is_gso(skb
)) {
134 struct sk_buff
*segs
;
136 segs
= skb_gso_segment(skb
, 0);
138 if (unlikely(IS_ERR(segs
)))
139 return ERR_CAST(segs
);
142 struct sk_buff
*nskb
= segs
->next
;
147 segs
= __vlan_put_tag(segs
, tci
);
150 struct odp_flow_key segkey
= *key
;
151 err
= execute_actions(dp
, segs
,
158 while ((segs
= nskb
)) {
167 } while (segs
->next
);
172 /* The hardware-accelerated version of vlan_put_tag() works
173 * only for a device that has a VLAN group configured (with
174 * e.g. vconfig(8)), so call the software-only version
175 * __vlan_put_tag() directly instead.
177 skb
= __vlan_put_tag(skb
, tci
);
179 return ERR_PTR(-ENOMEM
);
185 static struct sk_buff
*strip_vlan(struct sk_buff
*skb
,
186 struct odp_flow_key
*key
, gfp_t gfp
)
188 skb
= make_writable(skb
, 0, gfp
);
191 key
->dl_vlan
= htons(ODP_VLAN_NONE
);
196 static struct sk_buff
*set_dl_addr(struct sk_buff
*skb
,
197 const struct odp_action_dl_addr
*a
,
200 skb
= make_writable(skb
, 0, gfp
);
202 struct ethhdr
*eh
= eth_hdr(skb
);
203 memcpy(a
->type
== ODPAT_SET_DL_SRC
? eh
->h_source
: eh
->h_dest
,
204 a
->dl_addr
, ETH_ALEN
);
209 /* Updates 'sum', which is a field in 'skb''s data, given that a 4-byte field
210 * covered by the sum has been changed from 'from' to 'to'. If set,
211 * 'pseudohdr' indicates that the field is in the TCP or UDP pseudo-header.
212 * Based on nf_proto_csum_replace4. */
213 static void update_csum(__sum16
*sum
, struct sk_buff
*skb
,
214 __be32 from
, __be32 to
, int pseudohdr
)
216 __be32 diff
[] = { ~from
, to
};
218 /* On older kernels, CHECKSUM_PARTIAL and CHECKSUM_COMPLETE are both defined
219 * as CHECKSUM_HW. However, we can make some inferences so that we can update
220 * the checksums appropriately. */
222 CSUM_PARTIAL
, /* Partial checksum, skb->csum undefined. */
223 CSUM_PACKET
, /* In-packet checksum, skb->csum undefined. */
224 CSUM_COMPLETE
, /* In-packet checksum, skb->csum valid. */
227 csum_type
= CSUM_PACKET
;
229 /* Newer kernel, just map between kernel types and ours. */
230 if (skb
->ip_summed
== CHECKSUM_PARTIAL
)
231 csum_type
= CSUM_PARTIAL
;
232 else if (skb
->ip_summed
== CHECKSUM_COMPLETE
)
233 csum_type
= CSUM_COMPLETE
;
235 /* In theory this could be either CHECKSUM_PARTIAL or CHECKSUM_COMPLETE.
236 * However, we should only get CHECKSUM_PARTIAL packets from Xen, which
237 * uses some special fields to represent this (see below). Since we
238 * can only make one type work, pick the one that actually happens in
240 if (skb
->ip_summed
== CHECKSUM_HW
)
241 csum_type
= CSUM_COMPLETE
;
243 #if defined(CONFIG_XEN) && defined(HAVE_PROTO_DATA_VALID)
244 /* Xen has a special way of representing CHECKSUM_PARTIAL on older
246 if (skb
->proto_csum_blank
)
247 csum_type
= CSUM_PARTIAL
;
250 if (csum_type
!= CSUM_PARTIAL
) {
251 *sum
= csum_fold(csum_partial((char *)diff
, sizeof(diff
),
252 ~csum_unfold(*sum
)));
253 if (csum_type
== CSUM_COMPLETE
&& pseudohdr
)
254 skb
->csum
= ~csum_partial((char *)diff
, sizeof(diff
),
256 } else if (pseudohdr
)
257 *sum
= ~csum_fold(csum_partial((char *)diff
, sizeof(diff
),
261 static struct sk_buff
*set_nw_addr(struct sk_buff
*skb
,
262 struct odp_flow_key
*key
,
263 const struct odp_action_nw_addr
*a
,
266 if (key
->dl_type
!= htons(ETH_P_IP
))
269 skb
= make_writable(skb
, 0, gfp
);
271 struct iphdr
*nh
= ip_hdr(skb
);
272 u32
*f
= a
->type
== ODPAT_SET_NW_SRC
? &nh
->saddr
: &nh
->daddr
;
274 u32
new = a
->nw_addr
;
276 if (key
->nw_proto
== IPPROTO_TCP
) {
277 struct tcphdr
*th
= tcp_hdr(skb
);
278 update_csum(&th
->check
, skb
, old
, new, 1);
279 } else if (key
->nw_proto
== IPPROTO_UDP
) {
280 struct udphdr
*th
= udp_hdr(skb
);
281 update_csum(&th
->check
, skb
, old
, new, 1);
283 update_csum(&nh
->check
, skb
, old
, new, 0);
289 static struct sk_buff
*set_nw_tos(struct sk_buff
*skb
,
290 struct odp_flow_key
*key
,
291 const struct odp_action_nw_tos
*a
,
294 if (key
->dl_type
!= htons(ETH_P_IP
))
297 skb
= make_writable(skb
, 0, gfp
);
299 struct iphdr
*nh
= ip_hdr(skb
);
304 /* Set the DSCP bits and preserve the ECN bits. */
305 new = (a
->nw_tos
& ~INET_ECN_MASK
) | (nh
->tos
& INET_ECN_MASK
);
306 update_csum(&nh
->check
, skb
, htons((uint16_t)old
),
307 htons((uint16_t)new), 0);
313 static struct sk_buff
*
314 set_tp_port(struct sk_buff
*skb
, struct odp_flow_key
*key
,
315 const struct odp_action_tp_port
*a
,
320 if (key
->dl_type
!= htons(ETH_P_IP
))
323 if (key
->nw_proto
== IPPROTO_TCP
)
324 check_ofs
= offsetof(struct tcphdr
, check
);
325 else if (key
->nw_proto
== IPPROTO_UDP
)
326 check_ofs
= offsetof(struct udphdr
, check
);
330 skb
= make_writable(skb
, 0, gfp
);
332 struct udphdr
*th
= udp_hdr(skb
);
333 u16
*f
= a
->type
== ODPAT_SET_TP_SRC
? &th
->source
: &th
->dest
;
335 u16
new = a
->tp_port
;
336 update_csum((u16
*)(skb_transport_header(skb
) + check_ofs
),
343 static inline unsigned packet_length(const struct sk_buff
*skb
)
345 unsigned length
= skb
->len
- ETH_HLEN
;
346 if (skb
->protocol
== htons(ETH_P_8021Q
))
351 int dp_xmit_skb(struct sk_buff
*skb
)
353 struct datapath
*dp
= skb
->dev
->br_port
->dp
;
356 if (packet_length(skb
) > skb
->dev
->mtu
&& !skb_is_gso(skb
)) {
357 printk(KERN_WARNING
"%s: dropped over-mtu packet: %d > %d\n",
358 dp_name(dp
), packet_length(skb
), skb
->dev
->mtu
);
363 forward_ip_summed(skb
);
370 do_output(struct datapath
*dp
, struct sk_buff
*skb
, int out_port
)
372 struct net_bridge_port
*p
;
373 struct net_device
*dev
;
378 p
= dp
->ports
[out_port
];
382 dev
= skb
->dev
= p
->dev
;
384 dp_dev_recv(dev
, skb
);
393 /* Never consumes 'skb'. Returns a port that 'skb' should be sent to, -1 if
395 static int output_group(struct datapath
*dp
, __u16 group
,
396 struct sk_buff
*skb
, gfp_t gfp
)
398 struct dp_port_group
*g
= rcu_dereference(dp
->groups
[group
]);
404 for (i
= 0; i
< g
->n_ports
; i
++) {
405 struct net_bridge_port
*p
= dp
->ports
[g
->ports
[i
]];
406 if (!p
|| skb
->dev
== p
->dev
)
408 if (prev_port
!= -1) {
409 struct sk_buff
*clone
= skb_clone(skb
, gfp
);
412 do_output(dp
, clone
, prev_port
);
414 prev_port
= p
->port_no
;
420 output_control(struct datapath
*dp
, struct sk_buff
*skb
, u32 arg
, gfp_t gfp
)
422 skb
= skb_clone(skb
, gfp
);
425 return dp_output_control(dp
, skb
, _ODPL_ACTION_NR
, arg
);
428 /* Send a copy of this packet up to the sFlow agent, along with extra
429 * information about what happened to it. */
430 static void sflow_sample(struct datapath
*dp
, struct sk_buff
*skb
,
431 const union odp_action
*a
, int n_actions
,
432 gfp_t gfp
, struct net_bridge_port
*nbp
)
434 struct odp_sflow_sample_header
*hdr
;
435 unsigned int actlen
= n_actions
* sizeof(union odp_action
);
436 unsigned int hdrlen
= sizeof(struct odp_sflow_sample_header
);
437 struct sk_buff
*nskb
;
439 nskb
= skb_copy_expand(skb
, actlen
+ hdrlen
, 0, gfp
);
443 memcpy(__skb_push(nskb
, actlen
), a
, actlen
);
444 hdr
= (struct odp_sflow_sample_header
*)__skb_push(nskb
, hdrlen
);
445 hdr
->n_actions
= n_actions
;
446 hdr
->sample_pool
= atomic_read(&nbp
->sflow_pool
);
447 dp_output_control(dp
, nskb
, _ODPL_SFLOW_NR
, 0);
450 /* Execute a list of actions against 'skb'. */
451 int execute_actions(struct datapath
*dp
, struct sk_buff
*skb
,
452 struct odp_flow_key
*key
,
453 const union odp_action
*a
, int n_actions
,
456 /* Every output action needs a separate clone of 'skb', but the common
457 * case is just a single output action, so that doing a clone and
458 * then freeing the original skbuff is wasteful. So the following code
459 * is slightly obscure just to avoid that. */
463 if (dp
->sflow_probability
) {
464 struct net_bridge_port
*p
= skb
->dev
->br_port
;
466 atomic_inc(&p
->sflow_pool
);
467 if (dp
->sflow_probability
== UINT_MAX
||
468 net_random() < dp
->sflow_probability
)
469 sflow_sample(dp
, skb
, a
, n_actions
, gfp
, p
);
473 for (; n_actions
> 0; a
++, n_actions
--) {
474 WARN_ON_ONCE(skb_shared(skb
));
475 if (prev_port
!= -1) {
476 do_output(dp
, skb_clone(skb
, gfp
), prev_port
);
482 prev_port
= a
->output
.port
;
485 case ODPAT_OUTPUT_GROUP
:
486 prev_port
= output_group(dp
, a
->output_group
.group
,
490 case ODPAT_CONTROLLER
:
491 err
= output_control(dp
, skb
, a
->controller
.arg
, gfp
);
498 case ODPAT_SET_VLAN_VID
:
499 case ODPAT_SET_VLAN_PCP
:
500 skb
= modify_vlan_tci(dp
, skb
, key
, a
, n_actions
, gfp
);
505 case ODPAT_STRIP_VLAN
:
506 skb
= strip_vlan(skb
, key
, gfp
);
509 case ODPAT_SET_DL_SRC
:
510 case ODPAT_SET_DL_DST
:
511 skb
= set_dl_addr(skb
, &a
->dl_addr
, gfp
);
514 case ODPAT_SET_NW_SRC
:
515 case ODPAT_SET_NW_DST
:
516 skb
= set_nw_addr(skb
, key
, &a
->nw_addr
, gfp
);
519 case ODPAT_SET_NW_TOS
:
520 skb
= set_nw_tos(skb
, key
, &a
->nw_tos
, gfp
);
523 case ODPAT_SET_TP_SRC
:
524 case ODPAT_SET_TP_DST
:
525 skb
= set_tp_port(skb
, key
, &a
->tp_port
, gfp
);
532 do_output(dp
, skb
, prev_port
);