]>
Commit | Line | Data |
---|---|---|
064af421 BP |
1 | /* |
2 | * Distributed under the terms of the GNU GPL version 2. | |
834377ea | 3 | * Copyright (c) 2007, 2008, 2009, 2010 Nicira Networks. |
a14bc59f BP |
4 | * |
5 | * Significant portions of this file may be copied from parts of the Linux | |
6 | * kernel, by Linus Torvalds and others. | |
064af421 BP |
7 | */ |
8 | ||
9 | #include "flow.h" | |
f5e86186 | 10 | #include "datapath.h" |
064af421 BP |
11 | #include <linux/netdevice.h> |
12 | #include <linux/etherdevice.h> | |
13 | #include <linux/if_ether.h> | |
14 | #include <linux/if_vlan.h> | |
15 | #include <net/llc_pdu.h> | |
16 | #include <linux/kernel.h> | |
8d5ebd83 | 17 | #include <linux/jhash.h> |
064af421 BP |
18 | #include <linux/jiffies.h> |
19 | #include <linux/llc.h> | |
20 | #include <linux/module.h> | |
21 | #include <linux/in.h> | |
22 | #include <linux/rcupdate.h> | |
a26ef517 | 23 | #include <linux/if_arp.h> |
064af421 BP |
24 | #include <linux/if_ether.h> |
25 | #include <linux/ip.h> | |
26 | #include <linux/tcp.h> | |
27 | #include <linux/udp.h> | |
28 | #include <linux/icmp.h> | |
3c5f6de3 | 29 | #include <net/inet_ecn.h> |
064af421 BP |
30 | #include <net/ip.h> |
31 | ||
32 | #include "compat.h" | |
33 | ||
34 | struct kmem_cache *flow_cache; | |
8d5ebd83 | 35 | static unsigned int hash_seed; |
064af421 | 36 | |
e819fb47 | 37 | static inline bool arphdr_ok(struct sk_buff *skb) |
a26ef517 | 38 | { |
7d0ab001 | 39 | return skb->len >= skb_network_offset(skb) + sizeof(struct arp_eth_header); |
a26ef517 JP |
40 | } |
41 | ||
/* Validates the IPv4 header of 'skb' and sets skb's transport header to
 * just past it.
 *
 * Returns 0 on success, -EINVAL if the IP header is truncated or its
 * length field is out of range, or -ENOMEM if pulling the headers into
 * the linear data area fails. */
static inline int check_iphdr(struct sk_buff *skb)
{
	unsigned int nh_ofs = skb_network_offset(skb);
	unsigned int ip_len;

	/* Must have at least a minimal (20-byte) IPv4 header. */
	if (skb->len < nh_ofs + sizeof(struct iphdr))
		return -EINVAL;

	/* ip_hdrlen() trusts the header's own IHL field, so reject values
	 * below the minimum or extending past the end of the packet. */
	ip_len = ip_hdrlen(skb);
	if (ip_len < sizeof(struct iphdr) || skb->len < nh_ofs + ip_len)
		return -EINVAL;

	/*
	 * Pull enough header bytes to account for the IP header plus the
	 * longest transport header that we parse, currently 20 bytes for TCP.
	 */
	if (!pskb_may_pull(skb, min(nh_ofs + ip_len + 20, skb->len)))
		return -ENOMEM;

	skb_set_transport_header(skb, nh_ofs + ip_len);
	return 0;
}
64 | ||
e819fb47 | 65 | static inline bool tcphdr_ok(struct sk_buff *skb) |
064af421 BP |
66 | { |
67 | int th_ofs = skb_transport_offset(skb); | |
7d0ab001 | 68 | if (skb->len >= th_ofs + sizeof(struct tcphdr)) { |
064af421 BP |
69 | int tcp_len = tcp_hdrlen(skb); |
70 | return (tcp_len >= sizeof(struct tcphdr) | |
71 | && skb->len >= th_ofs + tcp_len); | |
72 | } | |
e819fb47 | 73 | return false; |
064af421 BP |
74 | } |
75 | ||
e819fb47 | 76 | static inline bool udphdr_ok(struct sk_buff *skb) |
064af421 | 77 | { |
7d0ab001 | 78 | return skb->len >= skb_transport_offset(skb) + sizeof(struct udphdr); |
064af421 BP |
79 | } |
80 | ||
e819fb47 | 81 | static inline bool icmphdr_ok(struct sk_buff *skb) |
064af421 | 82 | { |
7d0ab001 | 83 | return skb->len >= skb_transport_offset(skb) + sizeof(struct icmphdr); |
064af421 BP |
84 | } |
85 | ||
86 | #define TCP_FLAGS_OFFSET 13 | |
87 | #define TCP_FLAG_MASK 0x3f | |
88 | ||
064af421 BP |
/* Updates 'flow''s statistics to account for 'skb': refreshes the
 * last-used timestamp, bumps the packet and byte counters, and ORs in the
 * packet's TCP flags when the flow is TCP over IPv4.
 *
 * NOTE(review): for a TCP flow this reads tcp_hdr(skb), so it assumes the
 * transport header was already set and the TCP header pulled into the
 * linear area (e.g. by flow_extract()) -- confirm at the call sites. */
void flow_used(struct sw_flow *flow, struct sk_buff *skb)
{
	u8 tcp_flags = 0;

	if (flow->key.dl_type == htons(ETH_P_IP) &&
	    flow->key.nw_proto == IPPROTO_TCP) {
		u8 *tcp = (u8 *)tcp_hdr(skb);
		/* Byte 13 of the TCP header carries the flag bits; mask to
		 * the low six (FIN/SYN/RST/PSH/ACK/URG). */
		tcp_flags = *(tcp + TCP_FLAGS_OFFSET) & TCP_FLAG_MASK;
	}

	/* Counters are shared across CPUs, so update them under the flow's
	 * spinlock. */
	spin_lock_bh(&flow->lock);
	flow->used = jiffies;
	flow->packet_count++;
	flow->byte_count += skb->len;
	flow->tcp_flags |= tcp_flags;
	spin_unlock_bh(&flow->lock);
}
106 | ||
107 | struct sw_flow_actions *flow_actions_alloc(size_t n_actions) | |
108 | { | |
109 | struct sw_flow_actions *sfa; | |
110 | ||
722d19c5 BP |
111 | /* At least DP_MAX_PORTS actions are required to be able to flood a |
112 | * packet to every port. Factor of 2 allows for setting VLAN tags, | |
113 | * etc. */ | |
114 | if (n_actions > 2 * DP_MAX_PORTS) | |
064af421 BP |
115 | return ERR_PTR(-EINVAL); |
116 | ||
117 | sfa = kmalloc(sizeof *sfa + n_actions * sizeof(union odp_action), | |
118 | GFP_KERNEL); | |
119 | if (!sfa) | |
120 | return ERR_PTR(-ENOMEM); | |
121 | ||
122 | sfa->n_actions = n_actions; | |
123 | return sfa; | |
124 | } | |
125 | ||
560e8022 JG |
126 | struct sw_flow *flow_alloc(void) |
127 | { | |
128 | struct sw_flow *flow; | |
129 | ||
130 | flow = kmem_cache_alloc(flow_cache, GFP_KERNEL); | |
131 | if (!flow) | |
132 | return ERR_PTR(-ENOMEM); | |
133 | ||
134 | spin_lock_init(&flow->lock); | |
fb8c9347 JG |
135 | atomic_set(&flow->refcnt, 1); |
136 | flow->dead = false; | |
064af421 | 137 | |
560e8022 JG |
138 | return flow; |
139 | } | |
140 | ||
8d5ebd83 JG |
/* Flow-table destruction callback: marks the flow dead and drops the
 * reference that the table held.  The flow itself is freed only when the
 * last flow_put() releases its final reference. */
void flow_free_tbl(struct tbl_node *node)
{
	struct sw_flow *flow = flow_cast(node);

	flow->dead = true;
	flow_put(flow);
}
148 | ||
064af421 BP |
/* RCU callback used by flow_deferred_free. */
static void rcu_free_flow_callback(struct rcu_head *rcu)
{
	struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);

	/* Mark the flow dead and drop the reference deferred by
	 * flow_deferred_free(); the flow is freed when its refcount hits
	 * zero. */
	flow->dead = true;
	flow_put(flow);
}
157 | ||
/* Schedules 'flow' to be freed after the next RCU grace period.
 * The caller must hold rcu_read_lock for this to be sensible.
 *
 * The actual release happens in rcu_free_flow_callback(). */
void flow_deferred_free(struct sw_flow *flow)
{
	call_rcu(&flow->rcu, rcu_free_flow_callback);
}
164 | ||
fb8c9347 JG |
/* Takes a reference on 'flow'.  Each flow_hold() must eventually be
 * balanced by a flow_put(). */
void flow_hold(struct sw_flow *flow)
{
	atomic_inc(&flow->refcnt);
}
169 | ||
/* Releases a reference on 'flow'.  When the last reference is dropped,
 * frees the flow's actions and returns the flow to the flow cache.
 * Passing a null 'flow' is a no-op. */
void flow_put(struct sw_flow *flow)
{
	if (unlikely(!flow))
		return;

	if (atomic_dec_and_test(&flow->refcnt)) {
		kfree(flow->sf_acts);
		kmem_cache_free(flow_cache, flow);
	}
}
180 | ||
064af421 BP |
/* RCU callback used by flow_deferred_free_acts: frees the action list
 * once no RCU readers can still be referencing it. */
static void rcu_free_acts_callback(struct rcu_head *rcu)
{
	struct sw_flow_actions *sf_acts = container_of(rcu,
			struct sw_flow_actions, rcu);
	kfree(sf_acts);
}
188 | ||
/* Schedules 'sf_acts' to be freed after the next RCU grace period.
 * The caller must hold rcu_read_lock for this to be sensible.
 *
 * The actual kfree() happens in rcu_free_acts_callback(). */
void flow_deferred_free_acts(struct sw_flow_actions *sf_acts)
{
	call_rcu(&sf_acts->rcu, rcu_free_acts_callback);
}
195 | ||
50f06e16 | 196 | static void parse_vlan(struct sk_buff *skb, struct odp_flow_key *key) |
064af421 | 197 | { |
50f06e16 BP |
198 | struct qtag_prefix { |
199 | __be16 eth_type; /* ETH_P_8021Q */ | |
200 | __be16 tci; | |
201 | }; | |
202 | struct qtag_prefix *qp; | |
203 | ||
204 | if (skb->len < sizeof(struct qtag_prefix) + sizeof(__be16)) | |
205 | return; | |
206 | ||
207 | qp = (struct qtag_prefix *) skb->data; | |
208 | key->dl_vlan = qp->tci & htons(VLAN_VID_MASK); | |
209 | key->dl_vlan_pcp = (ntohs(qp->tci) & VLAN_PCP_MASK) >> VLAN_PCP_SHIFT; | |
210 | __skb_pull(skb, sizeof(struct qtag_prefix)); | |
211 | } | |
212 | ||
213 | static __be16 parse_ethertype(struct sk_buff *skb) | |
064af421 | 214 | { |
50f06e16 BP |
215 | struct llc_snap_hdr { |
216 | u8 dsap; /* Always 0xAA */ | |
217 | u8 ssap; /* Always 0xAA */ | |
218 | u8 ctrl; | |
219 | u8 oui[3]; | |
220 | u16 ethertype; | |
221 | }; | |
222 | struct llc_snap_hdr *llc; | |
223 | __be16 proto; | |
224 | ||
225 | proto = *(__be16 *) skb->data; | |
226 | __skb_pull(skb, sizeof(__be16)); | |
227 | ||
228 | if (ntohs(proto) >= ODP_DL_TYPE_ETH2_CUTOFF) | |
229 | return proto; | |
230 | ||
231 | if (unlikely(skb->len < sizeof(struct llc_snap_hdr))) | |
232 | return htons(ODP_DL_TYPE_NOT_ETH_TYPE); | |
233 | ||
234 | llc = (struct llc_snap_hdr *) skb->data; | |
235 | if (llc->dsap != LLC_SAP_SNAP || | |
236 | llc->ssap != LLC_SAP_SNAP || | |
237 | (llc->oui[0] | llc->oui[1] | llc->oui[2]) != 0) | |
238 | return htons(ODP_DL_TYPE_NOT_ETH_TYPE); | |
239 | ||
240 | __skb_pull(skb, sizeof(struct llc_snap_hdr)); | |
241 | return llc->ethertype; | |
064af421 BP |
242 | } |
243 | ||
a31e0e31 BP |
/**
 * flow_extract - extracts a flow key from an Ethernet frame.
 * @skb: sk_buff that contains the frame, with skb->data pointing to the
 * Ethernet header
 * @in_port: port number on which @skb was received.
 * @key: output flow key
 *
 * The caller must ensure that skb->len >= ETH_HLEN.
 *
 * Returns 0 if successful, otherwise a negative errno value.
 *
 * Initializes @skb header pointers as follows:
 *
 *    - skb->mac_header: the Ethernet header.
 *
 *    - skb->network_header: just past the Ethernet header, or just past the
 *      VLAN header, to the first byte of the Ethernet payload.
 *
 *    - skb->transport_header: If key->dl_type is ETH_P_IP on output, then just
 *      past the IPv4 header, if one is present and of a correct length,
 *      otherwise the same as skb->network_header.  For other key->dl_type
 *      values it is left untouched.
 *
 * Sets OVS_CB(skb)->is_frag to %true if @skb is an IPv4 fragment, otherwise to
 * %false.
 */
int flow_extract(struct sk_buff *skb, u16 in_port, struct odp_flow_key *key,
		 bool *is_frag)
{
	struct ethhdr *eth;

	/* Start from a zeroed key so unparsed fields compare equal. */
	memset(key, 0, sizeof *key);
	key->tun_id = OVS_CB(skb)->tun_id;
	key->in_port = in_port;
	key->dl_vlan = htons(ODP_VLAN_NONE);
	*is_frag = false;

	/*
	 * We would really like to pull as many bytes as we could possibly
	 * want to parse into the linear data area.  Currently that is:
	 *
	 *    14     Ethernet header
	 *     4     VLAN header
	 *    60     max IP header with options
	 *    20     max TCP/UDP/ICMP header (don't care about options)
	 *    --
	 *    98
	 *
	 * But Xen only allocates 64 or 72 bytes for the linear data area in
	 * netback, which means that we would reallocate and copy the skb's
	 * linear data on every packet if we did that.  So instead just pull 64
	 * bytes, which is always sufficient without IP options, and then check
	 * whether we need to pull more later when we look at the IP header.
	 */
	if (!pskb_may_pull(skb, min(skb->len, 64u)))
		return -ENOMEM;

	skb_reset_mac_header(skb);

	/* Link layer. */
	eth = eth_hdr(skb);
	memcpy(key->dl_src, eth->h_source, ETH_ALEN);
	memcpy(key->dl_dst, eth->h_dest, ETH_ALEN);

	/* dl_type, dl_vlan, dl_vlan_pcp.
	 * Temporarily advance skb->data past the MAC addresses so the VLAN
	 * and Ethertype parsers see the fields at skb->data; the push below
	 * restores skb->data to the Ethernet header. */
	__skb_pull(skb, 2 * ETH_ALEN);
	if (eth->h_proto == htons(ETH_P_8021Q))
		parse_vlan(skb, key);
	key->dl_type = parse_ethertype(skb);
	skb_reset_network_header(skb);
	__skb_push(skb, skb->data - (unsigned char *)eth);

	/* Network layer. */
	if (key->dl_type == htons(ETH_P_IP)) {
		struct iphdr *nh;
		int error;

		error = check_iphdr(skb);
		if (unlikely(error)) {
			if (error == -EINVAL) {
				/* Malformed IP header: report what we have
				 * as a valid (non-IP-parsed) key. */
				skb->transport_header = skb->network_header;
				return 0;
			}
			return error;
		}

		/* Fetch the header pointer only after check_iphdr(): its
		 * pskb_may_pull() may have reallocated the linear data. */
		nh = ip_hdr(skb);
		key->nw_src = nh->saddr;
		key->nw_dst = nh->daddr;
		key->nw_tos = nh->tos & ~INET_ECN_MASK;
		key->nw_proto = nh->protocol;

		/* Transport layer.  Skipped for fragments, since only the
		 * first fragment carries the transport header. */
		if (!(nh->frag_off & htons(IP_MF | IP_OFFSET))) {
			if (key->nw_proto == IPPROTO_TCP) {
				if (tcphdr_ok(skb)) {
					struct tcphdr *tcp = tcp_hdr(skb);
					key->tp_src = tcp->source;
					key->tp_dst = tcp->dest;
				}
			} else if (key->nw_proto == IPPROTO_UDP) {
				if (udphdr_ok(skb)) {
					struct udphdr *udp = udp_hdr(skb);
					key->tp_src = udp->source;
					key->tp_dst = udp->dest;
				}
			} else if (key->nw_proto == IPPROTO_ICMP) {
				if (icmphdr_ok(skb)) {
					struct icmphdr *icmp = icmp_hdr(skb);
					/* The ICMP type and code fields use the 16-bit
					 * transport port fields, so we need to store them
					 * in 16-bit network byte order. */
					key->tp_src = htons(icmp->type);
					key->tp_dst = htons(icmp->code);
				}
			}
		} else
			*is_frag = true;

	} else if (key->dl_type == htons(ETH_P_ARP) && arphdr_ok(skb)) {
		struct arp_eth_header *arp;

		arp = (struct arp_eth_header *)skb_network_header(skb);

		/* Only Ethernet/IPv4 ARP is extracted into the key. */
		if (arp->ar_hrd == htons(ARPHRD_ETHER)
				&& arp->ar_pro == htons(ETH_P_IP)
				&& arp->ar_hln == ETH_ALEN
				&& arp->ar_pln == 4) {

			/* We only match on the lower 8 bits of the opcode. */
			if (ntohs(arp->ar_op) <= 0xff)
				key->nw_proto = ntohs(arp->ar_op);

			if (key->nw_proto == ARPOP_REQUEST
					|| key->nw_proto == ARPOP_REPLY) {
				memcpy(&key->nw_src, arp->ar_sip, sizeof(key->nw_src));
				memcpy(&key->nw_dst, arp->ar_tip, sizeof(key->nw_dst));
			}
		}
	}
	return 0;
}
386 | ||
8d5ebd83 JG |
387 | u32 flow_hash(const struct odp_flow_key *key) |
388 | { | |
389 | return jhash2((u32*)key, sizeof *key / sizeof(u32), hash_seed); | |
390 | } | |
391 | ||
392 | int flow_cmp(const struct tbl_node *node, void *key2_) | |
393 | { | |
394 | const struct odp_flow_key *key1 = &flow_cast(node)->key; | |
395 | const struct odp_flow_key *key2 = key2_; | |
396 | ||
397 | return !memcmp(key1, key2, sizeof(struct odp_flow_key)); | |
398 | } | |
399 | ||
064af421 BP |
400 | /* Initializes the flow module. |
401 | * Returns zero if successful or a negative error code. */ | |
402 | int flow_init(void) | |
403 | { | |
404 | flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0, | |
405 | 0, NULL); | |
406 | if (flow_cache == NULL) | |
407 | return -ENOMEM; | |
408 | ||
8d5ebd83 JG |
409 | get_random_bytes(&hash_seed, sizeof hash_seed); |
410 | ||
064af421 BP |
411 | return 0; |
412 | } | |
413 | ||
/* Uninitializes the flow module.
 *
 * NOTE(review): presumably all flows must already have been freed before
 * this runs, since kmem_cache_destroy() requires an empty cache --
 * confirm against the caller's shutdown ordering. */
void flow_exit(void)
{
	kmem_cache_destroy(flow_cache);
}