]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | /* |
2 | * This file is open source software, licensed to you under the terms | |
3 | * of the Apache License, Version 2.0 (the "License"). See the NOTICE file | |
4 | * distributed with this work for additional information regarding copyright | |
5 | * ownership. You may not use this file except in compliance with the License. | |
6 | * | |
7 | * You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, | |
12 | * software distributed under the License is distributed on an | |
13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | * KIND, either express or implied. See the License for the | |
15 | * specific language governing permissions and limitations | |
16 | * under the License. | |
17 | */ | |
18 | /* | |
19 | * Copyright (C) 2014 Cloudius Systems, Ltd. | |
20 | * | |
21 | */ | |
22 | ||
23 | #include <seastar/net/ip.hh> | |
24 | #include <seastar/core/print.hh> | |
11fdf7f2 TL |
25 | #include <seastar/core/shared_ptr.hh> |
26 | #include <seastar/net/toeplitz.hh> | |
27 | #include <seastar/core/metrics.hh> | |
28 | ||
29 | namespace seastar { | |
30 | ||
31 | namespace net { | |
32 | ||
9f95a23c TL |
33 | ipv4_address::ipv4_address(const std::string& addr) { |
34 | boost::system::error_code ec; | |
35 | auto ipv4 = boost::asio::ip::address_v4::from_string(addr, ec); | |
36 | if (ec) { | |
37 | throw std::runtime_error( | |
38 | format("Wrong format for IPv4 address {}. Please ensure it's in dotted-decimal format", addr)); | |
39 | } | |
40 | ip = static_cast<uint32_t>(std::move(ipv4).to_ulong()); | |
11fdf7f2 TL |
41 | } |
42 | ||
43 | constexpr std::chrono::seconds ipv4::_frag_timeout; | |
44 | constexpr uint32_t ipv4::_frag_low_thresh; | |
45 | constexpr uint32_t ipv4::_frag_high_thresh; | |
46 | ||
47 | ipv4::ipv4(interface* netif) | |
48 | : _netif(netif) | |
49 | , _global_arp(netif) | |
50 | , _arp(_global_arp) | |
51 | , _host_address(0) | |
52 | , _gw_address(0) | |
53 | , _netmask(0) | |
54 | , _l3(netif, eth_protocol_num::ipv4, [this] { return get_packet(); }) | |
11fdf7f2 TL |
55 | , _tcp(*this) |
56 | , _icmp(*this) | |
57 | , _udp(*this) | |
58 | , _l4({ { uint8_t(ip_protocol_num::tcp), &_tcp }, { uint8_t(ip_protocol_num::icmp), &_icmp }, { uint8_t(ip_protocol_num::udp), &_udp }}) | |
59 | { | |
60 | namespace sm = seastar::metrics; | |
9f95a23c TL |
61 | // FIXME: ignored future |
62 | (void)_l3.receive( | |
63 | [this](packet p, ethernet_address ea) { | |
64 | return handle_received_packet(std::move(p), ea); | |
65 | }, | |
66 | [this](forward_hash& out_hash_data, packet& p, size_t off) { | |
67 | return forward(out_hash_data, p, off); | |
68 | }); | |
11fdf7f2 TL |
69 | |
70 | _metrics.add_group("ipv4", { | |
71 | // | |
72 | // Linearized events: DERIVE:0:u | |
73 | // | |
1e59de90 | 74 | sm::make_counter("linearizations", [] { return ipv4_packet_merger::linearizations(); }, |
11fdf7f2 TL |
75 | sm::description("Counts a number of times a buffer linearization was invoked during buffers merge process. " |
76 | "Divide it by a total IPv4 receive packet rate to get an average number of lineraizations per packet.")) | |
77 | }); | |
78 | _frag_timer.set_callback([this] { frag_timeout(); }); | |
79 | } | |
80 | ||
81 | bool ipv4::forward(forward_hash& out_hash_data, packet& p, size_t off) | |
82 | { | |
83 | auto iph = p.get_header<ip_hdr>(off); | |
84 | ||
85 | out_hash_data.push_back(iph->src_ip.ip); | |
86 | out_hash_data.push_back(iph->dst_ip.ip); | |
87 | ||
88 | auto h = ntoh(*iph); | |
89 | auto l4 = _l4[h.ip_proto]; | |
90 | if (l4) { | |
91 | if (h.mf() == false && h.offset() == 0) { | |
92 | // This IP datagram is atomic, forward according to tcp or udp connection hash | |
93 | l4->forward(out_hash_data, p, off + sizeof(ip_hdr)); | |
94 | } | |
95 | // else forward according to ip fields only | |
96 | } | |
97 | return true; | |
98 | } | |
99 | ||
100 | bool ipv4::in_my_netmask(ipv4_address a) const { | |
101 | return !((a.ip ^ _host_address.ip) & _netmask.ip); | |
102 | } | |
103 | ||
104 | bool ipv4::needs_frag(packet& p, ip_protocol_num prot_num, net::hw_features hw_features) { | |
105 | if (p.len() + ipv4_hdr_len_min <= hw_features.mtu) { | |
106 | return false; | |
107 | } | |
108 | ||
109 | if ((prot_num == ip_protocol_num::tcp && hw_features.tx_tso) || | |
110 | (prot_num == ip_protocol_num::udp && hw_features.tx_ufo)) { | |
111 | return false; | |
112 | } | |
113 | ||
114 | return true; | |
115 | } | |
116 | ||
117 | future<> | |
118 | ipv4::handle_received_packet(packet p, ethernet_address from) { | |
119 | auto iph = p.get_header<ip_hdr>(0); | |
120 | if (!iph) { | |
121 | return make_ready_future<>(); | |
122 | } | |
123 | ||
124 | // Skip checking csum of reassembled IP datagram | |
125 | if (!hw_features().rx_csum_offload && !p.offload_info_ref().reassembled) { | |
126 | checksummer csum; | |
127 | csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph)); | |
128 | if (csum.get() != 0) { | |
129 | return make_ready_future<>(); | |
130 | } | |
131 | } | |
132 | ||
133 | auto h = ntoh(*iph); | |
134 | unsigned ip_len = h.len; | |
135 | unsigned ip_hdr_len = h.ihl * 4; | |
136 | unsigned pkt_len = p.len(); | |
137 | auto offset = h.offset(); | |
138 | if (pkt_len > ip_len) { | |
139 | // Trim extra data in the packet beyond IP total length | |
140 | p.trim_back(pkt_len - ip_len); | |
141 | } else if (pkt_len < ip_len) { | |
142 | // Drop if it contains less than IP total length | |
143 | return make_ready_future<>(); | |
144 | } | |
145 | // Drop if the reassembled datagram will be larger than maximum IP size | |
146 | if (offset + p.len() > net::ip_packet_len_max) { | |
147 | return make_ready_future<>(); | |
148 | } | |
149 | ||
150 | // FIXME: process options | |
151 | if (in_my_netmask(h.src_ip) && h.src_ip != _host_address) { | |
152 | _arp.learn(from, h.src_ip); | |
153 | } | |
154 | ||
155 | if (_packet_filter) { | |
156 | bool handled = false; | |
157 | auto r = _packet_filter->handle(p, &h, from, handled); | |
158 | if (handled) { | |
159 | return r; | |
160 | } | |
161 | } | |
162 | ||
163 | if (h.dst_ip != _host_address) { | |
164 | // FIXME: forward | |
165 | return make_ready_future<>(); | |
166 | } | |
167 | ||
168 | // Does this IP datagram need reassembly | |
169 | auto mf = h.mf(); | |
170 | if (mf == true || offset != 0) { | |
171 | frag_limit_mem(); | |
172 | auto frag_id = ipv4_frag_id{h.src_ip, h.dst_ip, h.id, h.ip_proto}; | |
173 | auto& frag = _frags[frag_id]; | |
174 | if (mf == false) { | |
175 | frag.last_frag_received = true; | |
176 | } | |
177 | // This is a newly created frag_id | |
178 | if (frag.mem_size == 0) { | |
179 | _frags_age.push_back(frag_id); | |
180 | frag.rx_time = clock_type::now(); | |
181 | } | |
182 | auto added_size = frag.merge(h, offset, std::move(p)); | |
183 | _frag_mem += added_size; | |
184 | if (frag.is_complete()) { | |
185 | // All the fragments are received | |
186 | auto dropped_size = frag.mem_size; | |
187 | auto& ip_data = frag.data.map.begin()->second; | |
188 | // Choose a cpu to forward this packet | |
f67539c2 | 189 | auto cpu_id = this_shard_id(); |
11fdf7f2 TL |
190 | auto l4 = _l4[h.ip_proto]; |
191 | if (l4) { | |
192 | size_t l4_offset = 0; | |
193 | forward_hash hash_data; | |
194 | hash_data.push_back(hton(h.src_ip.ip)); | |
195 | hash_data.push_back(hton(h.dst_ip.ip)); | |
196 | auto forwarded = l4->forward(hash_data, ip_data, l4_offset); | |
197 | if (forwarded) { | |
198 | cpu_id = _netif->hash2cpu(toeplitz_hash(_netif->rss_key(), hash_data)); | |
199 | // No need to forward if the dst cpu is the current cpu | |
f67539c2 | 200 | if (cpu_id == this_shard_id()) { |
11fdf7f2 TL |
201 | l4->received(std::move(ip_data), h.src_ip, h.dst_ip); |
202 | } else { | |
203 | auto to = _netif->hw_address(); | |
204 | auto pkt = frag.get_assembled_packet(from, to); | |
205 | _netif->forward(cpu_id, std::move(pkt)); | |
206 | } | |
207 | } | |
208 | } | |
209 | ||
210 | // Delete this frag from _frags and _frags_age | |
211 | frag_drop(frag_id, dropped_size); | |
212 | _frags_age.remove(frag_id); | |
213 | } else { | |
214 | // Some of the fragments are missing | |
215 | if (!_frag_timer.armed()) { | |
216 | frag_arm(); | |
217 | } | |
218 | } | |
219 | return make_ready_future<>(); | |
220 | } | |
221 | ||
222 | auto l4 = _l4[h.ip_proto]; | |
223 | if (l4) { | |
224 | // Trim IP header and pass to upper layer | |
225 | p.trim_front(ip_hdr_len); | |
226 | l4->received(std::move(p), h.src_ip, h.dst_ip); | |
227 | } | |
228 | return make_ready_future<>(); | |
229 | } | |
230 | ||
231 | future<ethernet_address> ipv4::get_l2_dst_address(ipv4_address to) { | |
232 | // Figure out where to send the packet to. If it is a directly connected | |
233 | // host, send to it directly, otherwise send to the default gateway. | |
234 | ipv4_address dst; | |
235 | if (in_my_netmask(to)) { | |
236 | dst = to; | |
237 | } else { | |
238 | dst = _gw_address; | |
239 | } | |
240 | ||
241 | return _arp.lookup(dst); | |
242 | } | |
243 | ||
244 | void ipv4::send(ipv4_address to, ip_protocol_num proto_num, packet p, ethernet_address e_dst) { | |
245 | auto needs_frag = this->needs_frag(p, proto_num, hw_features()); | |
246 | ||
247 | auto send_pkt = [this, to, proto_num, needs_frag, e_dst] (packet& pkt, uint16_t remaining, uint16_t offset) mutable { | |
248 | auto iph = pkt.prepend_header<ip_hdr>(); | |
249 | iph->ihl = sizeof(*iph) / 4; | |
250 | iph->ver = 4; | |
251 | iph->dscp = 0; | |
252 | iph->ecn = 0; | |
253 | iph->len = pkt.len(); | |
254 | // FIXME: a proper id | |
255 | iph->id = 0; | |
256 | if (needs_frag) { | |
257 | uint16_t mf = remaining > 0; | |
258 | // The fragment offset is measured in units of 8 octets (64 bits) | |
259 | auto off = offset / 8; | |
260 | iph->frag = (mf << uint8_t(ip_hdr::frag_bits::mf)) | off; | |
261 | } else { | |
262 | iph->frag = 0; | |
263 | } | |
264 | iph->ttl = 64; | |
265 | iph->ip_proto = (uint8_t)proto_num; | |
266 | iph->csum = 0; | |
267 | iph->src_ip = _host_address; | |
268 | iph->dst_ip = to; | |
269 | *iph = hton(*iph); | |
270 | ||
271 | if (hw_features().tx_csum_ip_offload) { | |
272 | iph->csum = 0; | |
273 | pkt.offload_info_ref().needs_ip_csum = true; | |
274 | } else { | |
275 | checksummer csum; | |
276 | csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph)); | |
277 | iph->csum = csum.get(); | |
278 | } | |
279 | ||
280 | _packetq.push_back(l3_protocol::l3packet{eth_protocol_num::ipv4, e_dst, std::move(pkt)}); | |
281 | }; | |
282 | ||
283 | if (needs_frag) { | |
284 | uint16_t offset = 0; | |
285 | uint16_t remaining = p.len(); | |
286 | auto mtu = hw_features().mtu; | |
287 | ||
288 | while (remaining) { | |
289 | auto can_send = std::min(uint16_t(mtu - net::ipv4_hdr_len_min), remaining); | |
290 | remaining -= can_send; | |
291 | auto pkt = p.share(offset, can_send); | |
292 | send_pkt(pkt, remaining, offset); | |
293 | offset += can_send; | |
294 | } | |
295 | } else { | |
296 | // The whole packet can be send in one shot | |
297 | send_pkt(p, 0, 0); | |
298 | } | |
299 | } | |
300 | ||
f67539c2 | 301 | std::optional<l3_protocol::l3packet> ipv4::get_packet() { |
11fdf7f2 TL |
302 | // _packetq will be mostly empty here unless it hold remnants of previously |
303 | // fragmented packet | |
304 | if (_packetq.empty()) { | |
305 | for (size_t i = 0; i < _pkt_providers.size(); i++) { | |
306 | auto l4p = _pkt_providers[_pkt_provider_idx++](); | |
307 | if (_pkt_provider_idx == _pkt_providers.size()) { | |
308 | _pkt_provider_idx = 0; | |
309 | } | |
310 | if (l4p) { | |
311 | auto l4pv = std::move(l4p.value()); | |
312 | send(l4pv.to, l4pv.proto_num, std::move(l4pv.p), l4pv.e_dst); | |
313 | break; | |
314 | } | |
315 | } | |
316 | } | |
317 | ||
f67539c2 | 318 | std::optional<l3_protocol::l3packet> p; |
11fdf7f2 TL |
319 | if (!_packetq.empty()) { |
320 | p = std::move(_packetq.front()); | |
321 | _packetq.pop_front(); | |
322 | } | |
323 | return p; | |
324 | } | |
325 | ||
326 | void ipv4::set_host_address(ipv4_address ip) { | |
327 | _host_address = ip; | |
328 | _arp.set_self_addr(ip); | |
329 | } | |
330 | ||
9f95a23c | 331 | ipv4_address ipv4::host_address() const { |
11fdf7f2 TL |
332 | return _host_address; |
333 | } | |
334 | ||
335 | void ipv4::set_gw_address(ipv4_address ip) { | |
336 | _gw_address = ip; | |
337 | } | |
338 | ||
339 | ipv4_address ipv4::gw_address() const { | |
340 | return _gw_address; | |
341 | } | |
342 | ||
343 | void ipv4::set_netmask_address(ipv4_address ip) { | |
344 | _netmask = ip; | |
345 | } | |
346 | ||
347 | ipv4_address ipv4::netmask_address() const { | |
348 | return _netmask; | |
349 | } | |
350 | ||
351 | void ipv4::set_packet_filter(ip_packet_filter * f) { | |
352 | _packet_filter = f; | |
353 | } | |
354 | ||
355 | ip_packet_filter * ipv4::packet_filter() const { | |
356 | return _packet_filter; | |
357 | } | |
358 | ||
359 | void ipv4::frag_limit_mem() { | |
360 | if (_frag_mem <= _frag_high_thresh) { | |
361 | return; | |
362 | } | |
363 | auto drop = _frag_mem - _frag_low_thresh; | |
364 | while (drop) { | |
365 | if (_frags_age.empty()) { | |
366 | return; | |
367 | } | |
368 | // Drop the oldest frag (first element) from _frags_age | |
369 | auto frag_id = _frags_age.front(); | |
370 | _frags_age.pop_front(); | |
371 | ||
372 | // Drop from _frags as well | |
373 | auto& frag = _frags[frag_id]; | |
374 | auto dropped_size = frag.mem_size; | |
375 | frag_drop(frag_id, dropped_size); | |
376 | ||
377 | drop -= std::min(drop, dropped_size); | |
378 | } | |
379 | } | |
380 | ||
381 | void ipv4::frag_timeout() { | |
382 | if (_frags.empty()) { | |
383 | return; | |
384 | } | |
385 | auto now = clock_type::now(); | |
386 | for (auto it = _frags_age.begin(); it != _frags_age.end();) { | |
387 | auto frag_id = *it; | |
388 | auto& frag = _frags[frag_id]; | |
389 | if (now > frag.rx_time + _frag_timeout) { | |
390 | auto dropped_size = frag.mem_size; | |
391 | // Drop from _frags | |
392 | frag_drop(frag_id, dropped_size); | |
393 | // Drop from _frags_age | |
394 | it = _frags_age.erase(it); | |
395 | } else { | |
396 | // The further items can only be younger | |
397 | break; | |
398 | } | |
399 | } | |
400 | if (_frags.size() != 0) { | |
401 | frag_arm(now); | |
402 | } else { | |
403 | _frag_mem = 0; | |
404 | } | |
405 | } | |
406 | ||
407 | void ipv4::frag_drop(ipv4_frag_id frag_id, uint32_t dropped_size) { | |
408 | _frags.erase(frag_id); | |
409 | _frag_mem -= dropped_size; | |
410 | } | |
411 | ||
412 | int32_t ipv4::frag::merge(ip_hdr &h, uint16_t offset, packet p) { | |
413 | uint32_t old = mem_size; | |
414 | unsigned ip_hdr_len = h.ihl * 4; | |
415 | // Store IP header | |
416 | if (offset == 0) { | |
417 | header = p.share(0, ip_hdr_len); | |
418 | } | |
419 | // Sotre IP payload | |
420 | p.trim_front(ip_hdr_len); | |
421 | data.merge(offset, std::move(p)); | |
422 | // Update mem size | |
423 | mem_size = header.memory(); | |
424 | for (const auto& x : data.map) { | |
425 | mem_size += x.second.memory(); | |
426 | } | |
427 | auto added_size = mem_size - old; | |
428 | return added_size; | |
429 | } | |
430 | ||
431 | bool ipv4::frag::is_complete() { | |
432 | // If all the fragments are received, ipv4::frag::merge() should merge all | |
433 | // the fragments into a single packet | |
434 | auto offset = data.map.begin()->first; | |
435 | auto nr_packet = data.map.size(); | |
436 | return last_frag_received && nr_packet == 1 && offset == 0; | |
437 | } | |
438 | ||
439 | packet ipv4::frag::get_assembled_packet(ethernet_address from, ethernet_address to) { | |
440 | auto& ip_header = header; | |
441 | auto& ip_data = data.map.begin()->second; | |
442 | // Append a ethernet header, needed for forwarding | |
443 | auto eh = ip_header.prepend_header<eth_hdr>(); | |
444 | eh->src_mac = from; | |
445 | eh->dst_mac = to; | |
446 | eh->eth_proto = uint16_t(eth_protocol_num::ipv4); | |
447 | *eh = hton(*eh); | |
448 | // Prepare a packet contains both ethernet header, ip header and ip data | |
449 | ip_header.append(std::move(ip_data)); | |
450 | auto pkt = std::move(ip_header); | |
451 | auto iph = pkt.get_header<ip_hdr>(sizeof(eth_hdr)); | |
452 | // len is the sum of each fragment | |
453 | iph->len = hton(uint16_t(pkt.len() - sizeof(eth_hdr))); | |
454 | // No fragmentation for the assembled datagram | |
455 | iph->frag = 0; | |
456 | // Since each fragment's csum is checked, no need to csum | |
457 | // again for the assembled datagram | |
458 | offload_info oi; | |
459 | oi.reassembled = true; | |
460 | pkt.set_offload_info(oi); | |
461 | return pkt; | |
462 | } | |
463 | ||
464 | void icmp::received(packet p, ipaddr from, ipaddr to) { | |
465 | auto hdr = p.get_header<icmp_hdr>(0); | |
466 | if (!hdr || hdr->type != icmp_hdr::msg_type::echo_request) { | |
467 | return; | |
468 | } | |
469 | hdr->type = icmp_hdr::msg_type::echo_reply; | |
470 | hdr->code = 0; | |
471 | hdr->csum = 0; | |
472 | checksummer csum; | |
473 | csum.sum(reinterpret_cast<char*>(hdr), p.len()); | |
474 | hdr->csum = csum.get(); | |
475 | ||
476 | if (_queue_space.try_wait(p.len())) { // drop packets that do not fit the queue | |
9f95a23c TL |
477 | // FIXME: future is discarded |
478 | (void)_inet.get_l2_dst_address(from).then([this, from, p = std::move(p)] (ethernet_address e_dst) mutable { | |
11fdf7f2 TL |
479 | _packetq.emplace_back(ipv4_traits::l4packet{from, std::move(p), e_dst, ip_protocol_num::icmp}); |
480 | }); | |
481 | } | |
482 | } | |
483 | ||
484 | } | |
485 | ||
486 | } |