]> git.proxmox.com Git - ceph.git/blame - ceph/src/seastar/src/net/ip.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / seastar / src / net / ip.cc
CommitLineData
11fdf7f2
TL
1/*
2 * This file is open source software, licensed to you under the terms
3 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
4 * distributed with this work for additional information regarding copyright
5 * ownership. You may not use this file except in compliance with the License.
6 *
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing,
12 * software distributed under the License is distributed on an
13 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 * KIND, either express or implied. See the License for the
15 * specific language governing permissions and limitations
16 * under the License.
17 */
18/*
19 * Copyright (C) 2014 Cloudius Systems, Ltd.
20 *
21 */
22
23#include <seastar/net/ip.hh>
24#include <seastar/core/print.hh>
11fdf7f2
TL
25#include <seastar/core/shared_ptr.hh>
26#include <seastar/net/toeplitz.hh>
27#include <seastar/core/metrics.hh>
28
29namespace seastar {
30
31namespace net {
32
9f95a23c
TL
33ipv4_address::ipv4_address(const std::string& addr) {
34 boost::system::error_code ec;
35 auto ipv4 = boost::asio::ip::address_v4::from_string(addr, ec);
36 if (ec) {
37 throw std::runtime_error(
38 format("Wrong format for IPv4 address {}. Please ensure it's in dotted-decimal format", addr));
39 }
40 ip = static_cast<uint32_t>(std::move(ipv4).to_ulong());
11fdf7f2
TL
41}
42
43constexpr std::chrono::seconds ipv4::_frag_timeout;
44constexpr uint32_t ipv4::_frag_low_thresh;
45constexpr uint32_t ipv4::_frag_high_thresh;
46
47ipv4::ipv4(interface* netif)
48 : _netif(netif)
49 , _global_arp(netif)
50 , _arp(_global_arp)
51 , _host_address(0)
52 , _gw_address(0)
53 , _netmask(0)
54 , _l3(netif, eth_protocol_num::ipv4, [this] { return get_packet(); })
11fdf7f2
TL
55 , _tcp(*this)
56 , _icmp(*this)
57 , _udp(*this)
58 , _l4({ { uint8_t(ip_protocol_num::tcp), &_tcp }, { uint8_t(ip_protocol_num::icmp), &_icmp }, { uint8_t(ip_protocol_num::udp), &_udp }})
59{
60 namespace sm = seastar::metrics;
9f95a23c
TL
61 // FIXME: ignored future
62 (void)_l3.receive(
63 [this](packet p, ethernet_address ea) {
64 return handle_received_packet(std::move(p), ea);
65 },
66 [this](forward_hash& out_hash_data, packet& p, size_t off) {
67 return forward(out_hash_data, p, off);
68 });
11fdf7f2
TL
69
70 _metrics.add_group("ipv4", {
71 //
72 // Linearized events: DERIVE:0:u
73 //
1e59de90 74 sm::make_counter("linearizations", [] { return ipv4_packet_merger::linearizations(); },
11fdf7f2
TL
75 sm::description("Counts a number of times a buffer linearization was invoked during buffers merge process. "
76 "Divide it by a total IPv4 receive packet rate to get an average number of lineraizations per packet."))
77 });
78 _frag_timer.set_callback([this] { frag_timeout(); });
79}
80
81bool ipv4::forward(forward_hash& out_hash_data, packet& p, size_t off)
82{
83 auto iph = p.get_header<ip_hdr>(off);
84
85 out_hash_data.push_back(iph->src_ip.ip);
86 out_hash_data.push_back(iph->dst_ip.ip);
87
88 auto h = ntoh(*iph);
89 auto l4 = _l4[h.ip_proto];
90 if (l4) {
91 if (h.mf() == false && h.offset() == 0) {
92 // This IP datagram is atomic, forward according to tcp or udp connection hash
93 l4->forward(out_hash_data, p, off + sizeof(ip_hdr));
94 }
95 // else forward according to ip fields only
96 }
97 return true;
98}
99
100bool ipv4::in_my_netmask(ipv4_address a) const {
101 return !((a.ip ^ _host_address.ip) & _netmask.ip);
102}
103
104bool ipv4::needs_frag(packet& p, ip_protocol_num prot_num, net::hw_features hw_features) {
105 if (p.len() + ipv4_hdr_len_min <= hw_features.mtu) {
106 return false;
107 }
108
109 if ((prot_num == ip_protocol_num::tcp && hw_features.tx_tso) ||
110 (prot_num == ip_protocol_num::udp && hw_features.tx_ufo)) {
111 return false;
112 }
113
114 return true;
115}
116
117future<>
118ipv4::handle_received_packet(packet p, ethernet_address from) {
119 auto iph = p.get_header<ip_hdr>(0);
120 if (!iph) {
121 return make_ready_future<>();
122 }
123
124 // Skip checking csum of reassembled IP datagram
125 if (!hw_features().rx_csum_offload && !p.offload_info_ref().reassembled) {
126 checksummer csum;
127 csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph));
128 if (csum.get() != 0) {
129 return make_ready_future<>();
130 }
131 }
132
133 auto h = ntoh(*iph);
134 unsigned ip_len = h.len;
135 unsigned ip_hdr_len = h.ihl * 4;
136 unsigned pkt_len = p.len();
137 auto offset = h.offset();
138 if (pkt_len > ip_len) {
139 // Trim extra data in the packet beyond IP total length
140 p.trim_back(pkt_len - ip_len);
141 } else if (pkt_len < ip_len) {
142 // Drop if it contains less than IP total length
143 return make_ready_future<>();
144 }
145 // Drop if the reassembled datagram will be larger than maximum IP size
146 if (offset + p.len() > net::ip_packet_len_max) {
147 return make_ready_future<>();
148 }
149
150 // FIXME: process options
151 if (in_my_netmask(h.src_ip) && h.src_ip != _host_address) {
152 _arp.learn(from, h.src_ip);
153 }
154
155 if (_packet_filter) {
156 bool handled = false;
157 auto r = _packet_filter->handle(p, &h, from, handled);
158 if (handled) {
159 return r;
160 }
161 }
162
163 if (h.dst_ip != _host_address) {
164 // FIXME: forward
165 return make_ready_future<>();
166 }
167
168 // Does this IP datagram need reassembly
169 auto mf = h.mf();
170 if (mf == true || offset != 0) {
171 frag_limit_mem();
172 auto frag_id = ipv4_frag_id{h.src_ip, h.dst_ip, h.id, h.ip_proto};
173 auto& frag = _frags[frag_id];
174 if (mf == false) {
175 frag.last_frag_received = true;
176 }
177 // This is a newly created frag_id
178 if (frag.mem_size == 0) {
179 _frags_age.push_back(frag_id);
180 frag.rx_time = clock_type::now();
181 }
182 auto added_size = frag.merge(h, offset, std::move(p));
183 _frag_mem += added_size;
184 if (frag.is_complete()) {
185 // All the fragments are received
186 auto dropped_size = frag.mem_size;
187 auto& ip_data = frag.data.map.begin()->second;
188 // Choose a cpu to forward this packet
f67539c2 189 auto cpu_id = this_shard_id();
11fdf7f2
TL
190 auto l4 = _l4[h.ip_proto];
191 if (l4) {
192 size_t l4_offset = 0;
193 forward_hash hash_data;
194 hash_data.push_back(hton(h.src_ip.ip));
195 hash_data.push_back(hton(h.dst_ip.ip));
196 auto forwarded = l4->forward(hash_data, ip_data, l4_offset);
197 if (forwarded) {
198 cpu_id = _netif->hash2cpu(toeplitz_hash(_netif->rss_key(), hash_data));
199 // No need to forward if the dst cpu is the current cpu
f67539c2 200 if (cpu_id == this_shard_id()) {
11fdf7f2
TL
201 l4->received(std::move(ip_data), h.src_ip, h.dst_ip);
202 } else {
203 auto to = _netif->hw_address();
204 auto pkt = frag.get_assembled_packet(from, to);
205 _netif->forward(cpu_id, std::move(pkt));
206 }
207 }
208 }
209
210 // Delete this frag from _frags and _frags_age
211 frag_drop(frag_id, dropped_size);
212 _frags_age.remove(frag_id);
213 } else {
214 // Some of the fragments are missing
215 if (!_frag_timer.armed()) {
216 frag_arm();
217 }
218 }
219 return make_ready_future<>();
220 }
221
222 auto l4 = _l4[h.ip_proto];
223 if (l4) {
224 // Trim IP header and pass to upper layer
225 p.trim_front(ip_hdr_len);
226 l4->received(std::move(p), h.src_ip, h.dst_ip);
227 }
228 return make_ready_future<>();
229}
230
231future<ethernet_address> ipv4::get_l2_dst_address(ipv4_address to) {
232 // Figure out where to send the packet to. If it is a directly connected
233 // host, send to it directly, otherwise send to the default gateway.
234 ipv4_address dst;
235 if (in_my_netmask(to)) {
236 dst = to;
237 } else {
238 dst = _gw_address;
239 }
240
241 return _arp.lookup(dst);
242}
243
244void ipv4::send(ipv4_address to, ip_protocol_num proto_num, packet p, ethernet_address e_dst) {
245 auto needs_frag = this->needs_frag(p, proto_num, hw_features());
246
247 auto send_pkt = [this, to, proto_num, needs_frag, e_dst] (packet& pkt, uint16_t remaining, uint16_t offset) mutable {
248 auto iph = pkt.prepend_header<ip_hdr>();
249 iph->ihl = sizeof(*iph) / 4;
250 iph->ver = 4;
251 iph->dscp = 0;
252 iph->ecn = 0;
253 iph->len = pkt.len();
254 // FIXME: a proper id
255 iph->id = 0;
256 if (needs_frag) {
257 uint16_t mf = remaining > 0;
258 // The fragment offset is measured in units of 8 octets (64 bits)
259 auto off = offset / 8;
260 iph->frag = (mf << uint8_t(ip_hdr::frag_bits::mf)) | off;
261 } else {
262 iph->frag = 0;
263 }
264 iph->ttl = 64;
265 iph->ip_proto = (uint8_t)proto_num;
266 iph->csum = 0;
267 iph->src_ip = _host_address;
268 iph->dst_ip = to;
269 *iph = hton(*iph);
270
271 if (hw_features().tx_csum_ip_offload) {
272 iph->csum = 0;
273 pkt.offload_info_ref().needs_ip_csum = true;
274 } else {
275 checksummer csum;
276 csum.sum(reinterpret_cast<char*>(iph), sizeof(*iph));
277 iph->csum = csum.get();
278 }
279
280 _packetq.push_back(l3_protocol::l3packet{eth_protocol_num::ipv4, e_dst, std::move(pkt)});
281 };
282
283 if (needs_frag) {
284 uint16_t offset = 0;
285 uint16_t remaining = p.len();
286 auto mtu = hw_features().mtu;
287
288 while (remaining) {
289 auto can_send = std::min(uint16_t(mtu - net::ipv4_hdr_len_min), remaining);
290 remaining -= can_send;
291 auto pkt = p.share(offset, can_send);
292 send_pkt(pkt, remaining, offset);
293 offset += can_send;
294 }
295 } else {
296 // The whole packet can be send in one shot
297 send_pkt(p, 0, 0);
298 }
299}
300
f67539c2 301std::optional<l3_protocol::l3packet> ipv4::get_packet() {
11fdf7f2
TL
302 // _packetq will be mostly empty here unless it hold remnants of previously
303 // fragmented packet
304 if (_packetq.empty()) {
305 for (size_t i = 0; i < _pkt_providers.size(); i++) {
306 auto l4p = _pkt_providers[_pkt_provider_idx++]();
307 if (_pkt_provider_idx == _pkt_providers.size()) {
308 _pkt_provider_idx = 0;
309 }
310 if (l4p) {
311 auto l4pv = std::move(l4p.value());
312 send(l4pv.to, l4pv.proto_num, std::move(l4pv.p), l4pv.e_dst);
313 break;
314 }
315 }
316 }
317
f67539c2 318 std::optional<l3_protocol::l3packet> p;
11fdf7f2
TL
319 if (!_packetq.empty()) {
320 p = std::move(_packetq.front());
321 _packetq.pop_front();
322 }
323 return p;
324}
325
326void ipv4::set_host_address(ipv4_address ip) {
327 _host_address = ip;
328 _arp.set_self_addr(ip);
329}
330
9f95a23c 331ipv4_address ipv4::host_address() const {
11fdf7f2
TL
332 return _host_address;
333}
334
335void ipv4::set_gw_address(ipv4_address ip) {
336 _gw_address = ip;
337}
338
339ipv4_address ipv4::gw_address() const {
340 return _gw_address;
341}
342
343void ipv4::set_netmask_address(ipv4_address ip) {
344 _netmask = ip;
345}
346
347ipv4_address ipv4::netmask_address() const {
348 return _netmask;
349}
350
351void ipv4::set_packet_filter(ip_packet_filter * f) {
352 _packet_filter = f;
353}
354
355ip_packet_filter * ipv4::packet_filter() const {
356 return _packet_filter;
357}
358
359void ipv4::frag_limit_mem() {
360 if (_frag_mem <= _frag_high_thresh) {
361 return;
362 }
363 auto drop = _frag_mem - _frag_low_thresh;
364 while (drop) {
365 if (_frags_age.empty()) {
366 return;
367 }
368 // Drop the oldest frag (first element) from _frags_age
369 auto frag_id = _frags_age.front();
370 _frags_age.pop_front();
371
372 // Drop from _frags as well
373 auto& frag = _frags[frag_id];
374 auto dropped_size = frag.mem_size;
375 frag_drop(frag_id, dropped_size);
376
377 drop -= std::min(drop, dropped_size);
378 }
379}
380
381void ipv4::frag_timeout() {
382 if (_frags.empty()) {
383 return;
384 }
385 auto now = clock_type::now();
386 for (auto it = _frags_age.begin(); it != _frags_age.end();) {
387 auto frag_id = *it;
388 auto& frag = _frags[frag_id];
389 if (now > frag.rx_time + _frag_timeout) {
390 auto dropped_size = frag.mem_size;
391 // Drop from _frags
392 frag_drop(frag_id, dropped_size);
393 // Drop from _frags_age
394 it = _frags_age.erase(it);
395 } else {
396 // The further items can only be younger
397 break;
398 }
399 }
400 if (_frags.size() != 0) {
401 frag_arm(now);
402 } else {
403 _frag_mem = 0;
404 }
405}
406
407void ipv4::frag_drop(ipv4_frag_id frag_id, uint32_t dropped_size) {
408 _frags.erase(frag_id);
409 _frag_mem -= dropped_size;
410}
411
412int32_t ipv4::frag::merge(ip_hdr &h, uint16_t offset, packet p) {
413 uint32_t old = mem_size;
414 unsigned ip_hdr_len = h.ihl * 4;
415 // Store IP header
416 if (offset == 0) {
417 header = p.share(0, ip_hdr_len);
418 }
419 // Sotre IP payload
420 p.trim_front(ip_hdr_len);
421 data.merge(offset, std::move(p));
422 // Update mem size
423 mem_size = header.memory();
424 for (const auto& x : data.map) {
425 mem_size += x.second.memory();
426 }
427 auto added_size = mem_size - old;
428 return added_size;
429}
430
431bool ipv4::frag::is_complete() {
432 // If all the fragments are received, ipv4::frag::merge() should merge all
433 // the fragments into a single packet
434 auto offset = data.map.begin()->first;
435 auto nr_packet = data.map.size();
436 return last_frag_received && nr_packet == 1 && offset == 0;
437}
438
439packet ipv4::frag::get_assembled_packet(ethernet_address from, ethernet_address to) {
440 auto& ip_header = header;
441 auto& ip_data = data.map.begin()->second;
442 // Append a ethernet header, needed for forwarding
443 auto eh = ip_header.prepend_header<eth_hdr>();
444 eh->src_mac = from;
445 eh->dst_mac = to;
446 eh->eth_proto = uint16_t(eth_protocol_num::ipv4);
447 *eh = hton(*eh);
448 // Prepare a packet contains both ethernet header, ip header and ip data
449 ip_header.append(std::move(ip_data));
450 auto pkt = std::move(ip_header);
451 auto iph = pkt.get_header<ip_hdr>(sizeof(eth_hdr));
452 // len is the sum of each fragment
453 iph->len = hton(uint16_t(pkt.len() - sizeof(eth_hdr)));
454 // No fragmentation for the assembled datagram
455 iph->frag = 0;
456 // Since each fragment's csum is checked, no need to csum
457 // again for the assembled datagram
458 offload_info oi;
459 oi.reassembled = true;
460 pkt.set_offload_info(oi);
461 return pkt;
462}
463
464void icmp::received(packet p, ipaddr from, ipaddr to) {
465 auto hdr = p.get_header<icmp_hdr>(0);
466 if (!hdr || hdr->type != icmp_hdr::msg_type::echo_request) {
467 return;
468 }
469 hdr->type = icmp_hdr::msg_type::echo_reply;
470 hdr->code = 0;
471 hdr->csum = 0;
472 checksummer csum;
473 csum.sum(reinterpret_cast<char*>(hdr), p.len());
474 hdr->csum = csum.get();
475
476 if (_queue_space.try_wait(p.len())) { // drop packets that do not fit the queue
9f95a23c
TL
477 // FIXME: future is discarded
478 (void)_inet.get_l2_dst_address(from).then([this, from, p = std::move(p)] (ethernet_address e_dst) mutable {
11fdf7f2
TL
479 _packetq.emplace_back(ipv4_traits::l4packet{from, std::move(p), e_dst, ip_protocol_num::icmp});
480 });
481 }
482}
483
484}
485
486}