2 * This file is open source software, licensed to you under the terms
3 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
4 * distributed with this work for additional information regarding copyright
5 * ownership. You may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing,
12 * software distributed under the License is distributed on an
13 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 * KIND, either express or implied. See the License for the
15 * specific language governing permissions and limitations
19 * Copyright (C) 2014 Cloudius Systems, Ltd.
23 #include <seastar/net/ip.hh>
24 #include <seastar/core/print.hh>
25 #include <seastar/core/future-util.hh>
26 #include <seastar/core/shared_ptr.hh>
27 #include <seastar/net/toeplitz.hh>
28 #include <seastar/core/metrics.hh>
34 ipv4_address::ipv4_address(const std::string
& addr
) {
35 boost::system::error_code ec
;
36 auto ipv4
= boost::asio::ip::address_v4::from_string(addr
, ec
);
38 throw std::runtime_error(
39 format("Wrong format for IPv4 address {}. Please ensure it's in dotted-decimal format", addr
));
41 ip
= static_cast<uint32_t>(std::move(ipv4
).to_ulong());
44 constexpr std::chrono::seconds
ipv4::_frag_timeout
;
45 constexpr uint32_t ipv4::_frag_low_thresh
;
46 constexpr uint32_t ipv4::_frag_high_thresh
;
48 ipv4::ipv4(interface
* netif
)
55 , _l3(netif
, eth_protocol_num::ipv4
, [this] { return get_packet(); })
59 , _l4({ { uint8_t(ip_protocol_num::tcp
), &_tcp
}, { uint8_t(ip_protocol_num::icmp
), &_icmp
}, { uint8_t(ip_protocol_num::udp
), &_udp
}})
61 namespace sm
= seastar::metrics
;
62 // FIXME: ignored future
64 [this](packet p
, ethernet_address ea
) {
65 return handle_received_packet(std::move(p
), ea
);
67 [this](forward_hash
& out_hash_data
, packet
& p
, size_t off
) {
68 return forward(out_hash_data
, p
, off
);
71 _metrics
.add_group("ipv4", {
73 // Linearized events: DERIVE:0:u
75 sm::make_derive("linearizations", [] { return ipv4_packet_merger::linearizations(); },
76 sm::description("Counts a number of times a buffer linearization was invoked during buffers merge process. "
77 "Divide it by a total IPv4 receive packet rate to get an average number of lineraizations per packet."))
79 _frag_timer
.set_callback([this] { frag_timeout(); });
82 bool ipv4::forward(forward_hash
& out_hash_data
, packet
& p
, size_t off
)
84 auto iph
= p
.get_header
<ip_hdr
>(off
);
86 out_hash_data
.push_back(iph
->src_ip
.ip
);
87 out_hash_data
.push_back(iph
->dst_ip
.ip
);
90 auto l4
= _l4
[h
.ip_proto
];
92 if (h
.mf() == false && h
.offset() == 0) {
93 // This IP datagram is atomic, forward according to tcp or udp connection hash
94 l4
->forward(out_hash_data
, p
, off
+ sizeof(ip_hdr
));
96 // else forward according to ip fields only
101 bool ipv4::in_my_netmask(ipv4_address a
) const {
102 return !((a
.ip
^ _host_address
.ip
) & _netmask
.ip
);
105 bool ipv4::needs_frag(packet
& p
, ip_protocol_num prot_num
, net::hw_features hw_features
) {
106 if (p
.len() + ipv4_hdr_len_min
<= hw_features
.mtu
) {
110 if ((prot_num
== ip_protocol_num::tcp
&& hw_features
.tx_tso
) ||
111 (prot_num
== ip_protocol_num::udp
&& hw_features
.tx_ufo
)) {
119 ipv4::handle_received_packet(packet p
, ethernet_address from
) {
120 auto iph
= p
.get_header
<ip_hdr
>(0);
122 return make_ready_future
<>();
125 // Skip checking csum of reassembled IP datagram
126 if (!hw_features().rx_csum_offload
&& !p
.offload_info_ref().reassembled
) {
128 csum
.sum(reinterpret_cast<char*>(iph
), sizeof(*iph
));
129 if (csum
.get() != 0) {
130 return make_ready_future
<>();
135 unsigned ip_len
= h
.len
;
136 unsigned ip_hdr_len
= h
.ihl
* 4;
137 unsigned pkt_len
= p
.len();
138 auto offset
= h
.offset();
139 if (pkt_len
> ip_len
) {
140 // Trim extra data in the packet beyond IP total length
141 p
.trim_back(pkt_len
- ip_len
);
142 } else if (pkt_len
< ip_len
) {
143 // Drop if it contains less than IP total length
144 return make_ready_future
<>();
146 // Drop if the reassembled datagram will be larger than maximum IP size
147 if (offset
+ p
.len() > net::ip_packet_len_max
) {
148 return make_ready_future
<>();
151 // FIXME: process options
152 if (in_my_netmask(h
.src_ip
) && h
.src_ip
!= _host_address
) {
153 _arp
.learn(from
, h
.src_ip
);
156 if (_packet_filter
) {
157 bool handled
= false;
158 auto r
= _packet_filter
->handle(p
, &h
, from
, handled
);
164 if (h
.dst_ip
!= _host_address
) {
166 return make_ready_future
<>();
169 // Does this IP datagram need reassembly
171 if (mf
== true || offset
!= 0) {
173 auto frag_id
= ipv4_frag_id
{h
.src_ip
, h
.dst_ip
, h
.id
, h
.ip_proto
};
174 auto& frag
= _frags
[frag_id
];
176 frag
.last_frag_received
= true;
178 // This is a newly created frag_id
179 if (frag
.mem_size
== 0) {
180 _frags_age
.push_back(frag_id
);
181 frag
.rx_time
= clock_type::now();
183 auto added_size
= frag
.merge(h
, offset
, std::move(p
));
184 _frag_mem
+= added_size
;
185 if (frag
.is_complete()) {
186 // All the fragments are received
187 auto dropped_size
= frag
.mem_size
;
188 auto& ip_data
= frag
.data
.map
.begin()->second
;
189 // Choose a cpu to forward this packet
190 auto cpu_id
= engine().cpu_id();
191 auto l4
= _l4
[h
.ip_proto
];
193 size_t l4_offset
= 0;
194 forward_hash hash_data
;
195 hash_data
.push_back(hton(h
.src_ip
.ip
));
196 hash_data
.push_back(hton(h
.dst_ip
.ip
));
197 auto forwarded
= l4
->forward(hash_data
, ip_data
, l4_offset
);
199 cpu_id
= _netif
->hash2cpu(toeplitz_hash(_netif
->rss_key(), hash_data
));
200 // No need to forward if the dst cpu is the current cpu
201 if (cpu_id
== engine().cpu_id()) {
202 l4
->received(std::move(ip_data
), h
.src_ip
, h
.dst_ip
);
204 auto to
= _netif
->hw_address();
205 auto pkt
= frag
.get_assembled_packet(from
, to
);
206 _netif
->forward(cpu_id
, std::move(pkt
));
211 // Delete this frag from _frags and _frags_age
212 frag_drop(frag_id
, dropped_size
);
213 _frags_age
.remove(frag_id
);
215 // Some of the fragments are missing
216 if (!_frag_timer
.armed()) {
220 return make_ready_future
<>();
223 auto l4
= _l4
[h
.ip_proto
];
225 // Trim IP header and pass to upper layer
226 p
.trim_front(ip_hdr_len
);
227 l4
->received(std::move(p
), h
.src_ip
, h
.dst_ip
);
229 return make_ready_future
<>();
232 future
<ethernet_address
> ipv4::get_l2_dst_address(ipv4_address to
) {
233 // Figure out where to send the packet to. If it is a directly connected
234 // host, send to it directly, otherwise send to the default gateway.
236 if (in_my_netmask(to
)) {
242 return _arp
.lookup(dst
);
245 void ipv4::send(ipv4_address to
, ip_protocol_num proto_num
, packet p
, ethernet_address e_dst
) {
246 auto needs_frag
= this->needs_frag(p
, proto_num
, hw_features());
248 auto send_pkt
= [this, to
, proto_num
, needs_frag
, e_dst
] (packet
& pkt
, uint16_t remaining
, uint16_t offset
) mutable {
249 auto iph
= pkt
.prepend_header
<ip_hdr
>();
250 iph
->ihl
= sizeof(*iph
) / 4;
254 iph
->len
= pkt
.len();
255 // FIXME: a proper id
258 uint16_t mf
= remaining
> 0;
259 // The fragment offset is measured in units of 8 octets (64 bits)
260 auto off
= offset
/ 8;
261 iph
->frag
= (mf
<< uint8_t(ip_hdr::frag_bits::mf
)) | off
;
266 iph
->ip_proto
= (uint8_t)proto_num
;
268 iph
->src_ip
= _host_address
;
272 if (hw_features().tx_csum_ip_offload
) {
274 pkt
.offload_info_ref().needs_ip_csum
= true;
277 csum
.sum(reinterpret_cast<char*>(iph
), sizeof(*iph
));
278 iph
->csum
= csum
.get();
281 _packetq
.push_back(l3_protocol::l3packet
{eth_protocol_num::ipv4
, e_dst
, std::move(pkt
)});
286 uint16_t remaining
= p
.len();
287 auto mtu
= hw_features().mtu
;
290 auto can_send
= std::min(uint16_t(mtu
- net::ipv4_hdr_len_min
), remaining
);
291 remaining
-= can_send
;
292 auto pkt
= p
.share(offset
, can_send
);
293 send_pkt(pkt
, remaining
, offset
);
297 // The whole packet can be send in one shot
302 compat::optional
<l3_protocol::l3packet
> ipv4::get_packet() {
303 // _packetq will be mostly empty here unless it hold remnants of previously
305 if (_packetq
.empty()) {
306 for (size_t i
= 0; i
< _pkt_providers
.size(); i
++) {
307 auto l4p
= _pkt_providers
[_pkt_provider_idx
++]();
308 if (_pkt_provider_idx
== _pkt_providers
.size()) {
309 _pkt_provider_idx
= 0;
312 auto l4pv
= std::move(l4p
.value());
313 send(l4pv
.to
, l4pv
.proto_num
, std::move(l4pv
.p
), l4pv
.e_dst
);
319 compat::optional
<l3_protocol::l3packet
> p
;
320 if (!_packetq
.empty()) {
321 p
= std::move(_packetq
.front());
322 _packetq
.pop_front();
327 void ipv4::set_host_address(ipv4_address ip
) {
329 _arp
.set_self_addr(ip
);
332 ipv4_address
ipv4::host_address() const {
333 return _host_address
;
336 void ipv4::set_gw_address(ipv4_address ip
) {
340 ipv4_address
ipv4::gw_address() const {
344 void ipv4::set_netmask_address(ipv4_address ip
) {
348 ipv4_address
ipv4::netmask_address() const {
352 void ipv4::set_packet_filter(ip_packet_filter
* f
) {
356 ip_packet_filter
* ipv4::packet_filter() const {
357 return _packet_filter
;
360 void ipv4::frag_limit_mem() {
361 if (_frag_mem
<= _frag_high_thresh
) {
364 auto drop
= _frag_mem
- _frag_low_thresh
;
366 if (_frags_age
.empty()) {
369 // Drop the oldest frag (first element) from _frags_age
370 auto frag_id
= _frags_age
.front();
371 _frags_age
.pop_front();
373 // Drop from _frags as well
374 auto& frag
= _frags
[frag_id
];
375 auto dropped_size
= frag
.mem_size
;
376 frag_drop(frag_id
, dropped_size
);
378 drop
-= std::min(drop
, dropped_size
);
382 void ipv4::frag_timeout() {
383 if (_frags
.empty()) {
386 auto now
= clock_type::now();
387 for (auto it
= _frags_age
.begin(); it
!= _frags_age
.end();) {
389 auto& frag
= _frags
[frag_id
];
390 if (now
> frag
.rx_time
+ _frag_timeout
) {
391 auto dropped_size
= frag
.mem_size
;
393 frag_drop(frag_id
, dropped_size
);
394 // Drop from _frags_age
395 it
= _frags_age
.erase(it
);
397 // The further items can only be younger
401 if (_frags
.size() != 0) {
408 void ipv4::frag_drop(ipv4_frag_id frag_id
, uint32_t dropped_size
) {
409 _frags
.erase(frag_id
);
410 _frag_mem
-= dropped_size
;
413 int32_t ipv4::frag::merge(ip_hdr
&h
, uint16_t offset
, packet p
) {
414 uint32_t old
= mem_size
;
415 unsigned ip_hdr_len
= h
.ihl
* 4;
418 header
= p
.share(0, ip_hdr_len
);
421 p
.trim_front(ip_hdr_len
);
422 data
.merge(offset
, std::move(p
));
424 mem_size
= header
.memory();
425 for (const auto& x
: data
.map
) {
426 mem_size
+= x
.second
.memory();
428 auto added_size
= mem_size
- old
;
432 bool ipv4::frag::is_complete() {
433 // If all the fragments are received, ipv4::frag::merge() should merge all
434 // the fragments into a single packet
435 auto offset
= data
.map
.begin()->first
;
436 auto nr_packet
= data
.map
.size();
437 return last_frag_received
&& nr_packet
== 1 && offset
== 0;
440 packet
ipv4::frag::get_assembled_packet(ethernet_address from
, ethernet_address to
) {
441 auto& ip_header
= header
;
442 auto& ip_data
= data
.map
.begin()->second
;
443 // Append a ethernet header, needed for forwarding
444 auto eh
= ip_header
.prepend_header
<eth_hdr
>();
447 eh
->eth_proto
= uint16_t(eth_protocol_num::ipv4
);
449 // Prepare a packet contains both ethernet header, ip header and ip data
450 ip_header
.append(std::move(ip_data
));
451 auto pkt
= std::move(ip_header
);
452 auto iph
= pkt
.get_header
<ip_hdr
>(sizeof(eth_hdr
));
453 // len is the sum of each fragment
454 iph
->len
= hton(uint16_t(pkt
.len() - sizeof(eth_hdr
)));
455 // No fragmentation for the assembled datagram
457 // Since each fragment's csum is checked, no need to csum
458 // again for the assembled datagram
460 oi
.reassembled
= true;
461 pkt
.set_offload_info(oi
);
465 void icmp::received(packet p
, ipaddr from
, ipaddr to
) {
466 auto hdr
= p
.get_header
<icmp_hdr
>(0);
467 if (!hdr
|| hdr
->type
!= icmp_hdr::msg_type::echo_request
) {
470 hdr
->type
= icmp_hdr::msg_type::echo_reply
;
474 csum
.sum(reinterpret_cast<char*>(hdr
), p
.len());
475 hdr
->csum
= csum
.get();
477 if (_queue_space
.try_wait(p
.len())) { // drop packets that do not fit the queue
478 // FIXME: future is discarded
479 (void)_inet
.get_l2_dst_address(from
).then([this, from
, p
= std::move(p
)] (ethernet_address e_dst
) mutable {
480 _packetq
.emplace_back(ipv4_traits::l4packet
{from
, std::move(p
), e_dst
, ip_protocol_num::icmp
});