/*
 * This file is open source software, licensed to you under the terms
 * of the Apache License, Version 2.0 (the "License"). See the NOTICE file
 * distributed with this work for additional information regarding copyright
 * ownership. You may not use this file except in compliance with the License.
 *
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
/*
 * Copyright (C) 2014 Cloudius Systems, Ltd.
 */

#include <seastar/net/virtio.hh>
#include <seastar/core/posix.hh>
#include <seastar/core/internal/pollable_fd.hh>
#include "core/vla.hh"
#include <seastar/net/virtio-interface.hh>
#include <seastar/core/reactor.hh>
#include <seastar/core/stream.hh>
#include <seastar/core/circular_buffer.hh>
#include <seastar/core/align.hh>
#include <seastar/core/metrics.hh>
#include <seastar/util/function_input_iterator.hh>
#include <seastar/util/transform_iterator.hh>
#include <atomic>
#include <vector>
#include <queue>
#include <fcntl.h>
#include <linux/vhost.h>
#include <linux/if_tun.h>
#include <seastar/net/ip.hh>
#include <seastar/net/const.hh>
#include <seastar/net/native-stack.hh>

#ifdef HAVE_OSV
#include <osv/virtio-assign.hh>
#endif
namespace seastar {

using namespace net;

namespace virtio {

using phys = uint64_t;

#ifndef HAVE_OSV

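// Without OSv we run on vhost-net, which is handed an identity mapping of
// our entire virtual address space (see VHOST_SET_MEM_TABLE below), so a
// virtual address can be passed to the host directly as a "physical" one.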
phys virt_to_phys(void* p) {
    return reinterpret_cast<uintptr_t>(p);
}

#else

phys virt_to_phys(void* p) {
    return osv::assigned_virtio::virt_to_phys(p);
}

#endif

class device : public net::device {
private:
    boost::program_options::variables_map _opts;
    net::hw_features _hw_features;
    uint64_t _features;

private:
    uint64_t setup_features() {
        int64_t seastar_supported_features = VIRTIO_RING_F_INDIRECT_DESC | VIRTIO_NET_F_MRG_RXBUF;

        if (!(_opts.count("event-index") && _opts["event-index"].as<std::string>() == "off")) {
            seastar_supported_features |= VIRTIO_RING_F_EVENT_IDX;
        }
        if (!(_opts.count("csum-offload") && _opts["csum-offload"].as<std::string>() == "off")) {
            seastar_supported_features |= VIRTIO_NET_F_CSUM | VIRTIO_NET_F_GUEST_CSUM;
            _hw_features.tx_csum_l4_offload = true;
            _hw_features.rx_csum_offload = true;
        } else {
            _hw_features.tx_csum_l4_offload = false;
            _hw_features.rx_csum_offload = false;
        }
        if (!(_opts.count("tso") && _opts["tso"].as<std::string>() == "off")) {
            seastar_supported_features |= VIRTIO_NET_F_HOST_TSO4;
            _hw_features.tx_tso = true;
        } else {
            _hw_features.tx_tso = false;
        }

        if (!(_opts.count("lro") && _opts["lro"].as<std::string>() == "off")) {
            seastar_supported_features |= VIRTIO_NET_F_GUEST_TSO4;
            _hw_features.rx_lro = true;
        } else {
            _hw_features.rx_lro = false;
        }

        if (!(_opts.count("ufo") && _opts["ufo"].as<std::string>() == "off")) {
            seastar_supported_features |= VIRTIO_NET_F_HOST_UFO;
            seastar_supported_features |= VIRTIO_NET_F_GUEST_UFO;
            _hw_features.tx_ufo = true;
        } else {
            _hw_features.tx_ufo = false;
        }

        seastar_supported_features |= VIRTIO_NET_F_MAC;
        return seastar_supported_features;
    }

public:
    device(boost::program_options::variables_map opts)
        : _opts(opts), _features(setup_features())
    {}
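    // A fixed, arbitrary MAC address for the virtual NIC.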
    ethernet_address hw_address() override {
        return { 0x12, 0x23, 0x34, 0x56, 0x67, 0x78 };
    }

    net::hw_features hw_features() override {
        return _hw_features;
    }

    uint64_t features() {
        return _features;
    }

    virtual std::unique_ptr<net::qp> init_local_queue(boost::program_options::variables_map opts, uint16_t qid) override;
};

/* The notifier class determines how to do host-to-guest and guest-to-
 * host notifications. We have two different implementations - one for vhost
 * (where both notifications occur through eventfds) and one for an assigned
 * virtio device from OSv.
 */
class notifier {
public:
    // Notify the host
    virtual void notify() = 0;
    // Do whatever it takes to wake wait(). A notifier does not need to
    // implement this function if wait() waits for an external event which is
    // generated by an external process (e.g., notifier_vhost doesn't
    // need to implement this).
    virtual void wake_wait() {
        abort();
    }
    virtual ~notifier() {
    }
};

class notifier_vhost : public notifier {
private:
    writeable_eventfd _kick;
public:
    virtual void notify() override {
        _kick.signal(1);
    }
    notifier_vhost(writeable_eventfd&& kick)
        : _kick(std::move(kick)) {}
};

#ifdef HAVE_OSV
class notifier_osv : public notifier {
private:
    uint16_t _q_index;
    osv::assigned_virtio& _virtio;
public:
    virtual void notify() override {
        _virtio.kick(_q_index);
    }
    notifier_osv(osv::assigned_virtio& virtio, uint16_t q_index)
        : _q_index(q_index)
        , _virtio(virtio)
    {
    }
};
#endif

struct ring_config {
    char* descs;
    char* avail;
    char* used;
    unsigned size;
    bool event_index;
    bool indirect;
    bool mergable_buffers;
};

struct buffer {
    phys addr;
    uint32_t len;
    bool writeable;
};

// The 'buffer_chain' concept, used in vring, is a container of buffers, as in:
//
//   using buffer_chain = std::vector<buffer>;
//
// The 'Completion' concept is a functor with the signature:
//
//   void (buffer_chain&, size_t len);
//
template <typename BufferChain, typename Completion>
class vring {
private:
    class desc {
    public:
        struct flags {
            // This marks a buffer as continuing via the next field.
            uint16_t has_next : 1;
            // This marks a buffer as write-only (otherwise read-only).
            uint16_t writeable : 1;
            // This means the buffer contains a list of buffer descriptors.
            uint16_t indirect : 1;
        };

        phys get_paddr();
        uint32_t get_len() { return _len; }
        uint16_t next_idx() { return _next; }

        phys _paddr;
        uint32_t _len;
        flags _flags;
        uint16_t _next;
    };

    // Guest to host
    struct avail_layout {
        struct flags {
            // Mark that we do not need an interrupt for consuming a descriptor
            // from the ring. Unreliable, so it's simply an optimization.
            uint16_t no_interrupts : 1;
        };

        std::atomic<uint16_t> _flags;

        // Where we put the next descriptor
        std::atomic<uint16_t> _idx;
        // There may be no more entries than the queue size read from the device
        uint16_t _ring[];
        // The used event index is an optimization: the host interrupts us only
        // when the used index reaches this value. It is placed after the
        // variable-length ring array, so it cannot be declared within the
        // struct; we access it through a separate pointer (_used_event) instead.
        //std::atomic<uint16_t> used_event;
    };

    struct used_elem {
        // Index of start of used _desc chain. (uint32_t for padding reasons)
        uint32_t _id;
        // Total length of the descriptor chain which was used (written to)
        uint32_t _len;
    };

    // Host to guest
    struct used_layout {
        enum {
            // The host advises the guest: don't kick me when
            // you add a buffer. It's unreliable, so it's simply an
            // optimization. The guest will still kick if it's out of buffers.
            no_notify = 1
        };

        // Using std::atomic since it is changed by the host
        std::atomic<uint16_t> _flags;
        // Using std::atomic in order to have memory barriers for it
        std::atomic<uint16_t> _idx;
        used_elem _used_elements[];
        // The avail event index is an optimization: we kick the host only when
        // the avail index reaches this value. It is placed after the
        // variable-length ring array, so it cannot be declared within the
        // struct; we access it through a separate pointer (_avail_event) instead.
        //std::atomic<uint16_t> avail_event;
    };

    struct avail {
        explicit avail(ring_config conf);
        avail_layout* _shared;
        uint16_t _head = 0;
        uint16_t _avail_added_since_kick = 0;
    };
    struct used {
        explicit used(ring_config conf);
        used_layout* _shared;
        uint16_t _tail = 0;
    };
private:
    ring_config _config;
    Completion _complete;
    std::unique_ptr<notifier> _notifier;
    std::unique_ptr<BufferChain[]> _buffer_chains;
    desc* _descs;
    avail _avail;
    used _used;
    std::atomic<uint16_t>* _avail_event;
    std::atomic<uint16_t>* _used_event;
    semaphore _available_descriptors = { 0 };
    int _free_head = -1;
    int _free_last = -1;
    reactor::poller _poller;
public:

    explicit vring(ring_config conf, Completion complete);
    void set_notifier(std::unique_ptr<notifier> notifier) {
        _notifier = std::move(notifier);
    }
    const ring_config& getconfig() {
        return _config;
    }
    void wake_notifier_wait() {
        _notifier->wake_wait();
    }

    // start the queue
    void run();

    // wait for the used ring to have at least @nr buffers
    future<> on_used(size_t nr);

    // Total number of descriptors in ring
    int size() { return _config.size; }

    template <typename Iterator>
    void post(Iterator begin, Iterator end);

    semaphore& available_descriptors() { return _available_descriptors; }
private:
    bool notifications_disabled() {
        return (_used._shared->_flags.load(std::memory_order_relaxed) & VRING_USED_F_NO_NOTIFY) != 0;
    }

    void kick() {
        bool need_kick = true;
        // Make sure we see the fresh _idx value written before kick.
        std::atomic_thread_fence(std::memory_order_seq_cst);
        if (_config.event_index) {
            uint16_t avail_idx = _avail._shared->_idx.load(std::memory_order_relaxed);
            uint16_t avail_event = _avail_event->load(std::memory_order_relaxed);
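            // The spec's vring_need_event() test: kick only if avail_event
            // falls within the batch of descriptors added since the last
            // kick, computed with wrap-safe 16-bit arithmetic.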
            need_kick = (uint16_t)(avail_idx - avail_event - 1) < _avail._avail_added_since_kick;
        } else {
            if (notifications_disabled())
                return;
        }
        if (need_kick || (_avail._avail_added_since_kick >= (uint16_t)(~0) / 2)) {
            _notifier->notify();
            _avail._avail_added_since_kick = 0;
        }
    }

    bool do_complete();
    size_t mask() { return size() - 1; }
    size_t masked(size_t idx) { return idx & mask(); }
    size_t available();
    unsigned allocate_desc();
    void setup();
};

template <typename BufferChain, typename Completion>
vring<BufferChain, Completion>::avail::avail(ring_config conf)
    : _shared(reinterpret_cast<avail_layout*>(conf.avail)) {
}

template <typename BufferChain, typename Completion>
vring<BufferChain, Completion>::used::used(ring_config conf)
    : _shared(reinterpret_cast<used_layout*>(conf.used)) {
}

template <typename BufferChain, typename Completion>
inline
unsigned
vring<BufferChain, Completion>::allocate_desc() {
    assert(_free_head != -1);
    auto desc = _free_head;
    if (desc == _free_last) {
        _free_last = _free_head = -1;
    } else {
        _free_head = _descs[desc]._next;
    }
    return desc;
}

template <typename BufferChain, typename Completion>
vring<BufferChain, Completion>::vring(ring_config conf, Completion complete)
    : _config(conf)
    , _complete(complete)
    , _buffer_chains(new BufferChain[_config.size])
    , _descs(reinterpret_cast<desc*>(conf.descs))
    , _avail(conf)
    , _used(conf)
    , _avail_event(reinterpret_cast<std::atomic<uint16_t>*>(&_used._shared->_used_elements[conf.size]))
    , _used_event(reinterpret_cast<std::atomic<uint16_t>*>(&_avail._shared->_ring[conf.size]))
    , _poller(reactor::poller::simple([this] {
        return do_complete();
    }))
{
    setup();
}

template <typename BufferChain, typename Completion>
void vring<BufferChain, Completion>::setup() {
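    // Chain every descriptor into a single free list and make them all
    // available to post().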
    for (unsigned i = 0; i < _config.size; ++i) {
        _descs[i]._next = i + 1;
    }
    _free_head = 0;
    _free_last = _config.size - 1;
    _available_descriptors.signal(_config.size);
}

// Iterator: points at a buffer_chain
template <typename BufferChain, typename Completion>
template <typename Iterator>
void vring<BufferChain, Completion>::post(Iterator begin, Iterator end) {
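    // For each buffer chain, allocate descriptors from the free list and link
    // them via the 'next' field. A dummy pseudo_head descriptor lets the loop
    // treat the first buffer like any other; the real head index is whatever
    // pseudo_head ends up pointing at.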
    for (auto bci = begin; bci != end; ++bci) {
        auto&& bc = *bci;
        desc pseudo_head = {};
        desc* prev = &pseudo_head;
        for (auto i = bc.begin(); i != bc.end(); ++i) {
            unsigned desc_idx = allocate_desc();
            prev->_flags.has_next = true;
            prev->_next = desc_idx;
            desc& d = _descs[desc_idx];
            d._flags = {};
            auto&& b = *i;
            d._flags.writeable = b.writeable;
            d._paddr = b.addr;
            d._len = b.len;
            prev = &d;
        }
        auto desc_head = pseudo_head._next;
        _buffer_chains[desc_head] = std::move(bc);
        _avail._shared->_ring[masked(_avail._head++)] = desc_head;
        _avail._avail_added_since_kick++;
    }
    _avail._shared->_idx.store(_avail._head, std::memory_order_release);
    kick();
}

template <typename BufferChain, typename Completion>
bool vring<BufferChain, Completion>::do_complete() {
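    // Consume everything the host has marked used since the last poll: hand
    // each buffer chain to the Completion functor and return its descriptor
    // chain to the free list.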
    auto used_head = _used._shared->_idx.load(std::memory_order_acquire);
    auto count = uint16_t(used_head - _used._tail);
    _complete.bunch(count);
    while (used_head != _used._tail) {
        auto ue = _used._shared->_used_elements[masked(_used._tail++)];
        _complete(std::move(_buffer_chains[ue._id]), ue._len);
        auto id = ue._id;
        if (_free_last != -1) {
            _descs[_free_last]._next = id;
        } else {
            _free_head = id;
        }
        while (true) {
            auto& d = _descs[id];
            if (!d._flags.has_next) {
                break;
            }
            id = d._next;
        }
        _free_last = id;
    }
    return count;
}

class qp : public net::qp {
protected:
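    // Per-packet header, matching struct virtio_net_hdr from the virtio spec
    // (plus the mergeable-rx-buffers variant below).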
    struct net_hdr {
        uint8_t needs_csum : 1;
        uint8_t flags_reserved : 7;
        enum { gso_none = 0, gso_tcpv4 = 1, gso_udp = 3, gso_tcpv6 = 4, gso_ecn = 0x80 };
        uint8_t gso_type;
        uint16_t hdr_len;
        uint16_t gso_size;
        uint16_t csum_start;
        uint16_t csum_offset;
    };
    struct net_hdr_mrg : net_hdr {
        uint16_t num_buffers;
    };
    class txq {
        static buffer fragment_to_buffer(fragment f) {
            buffer b;
            b.addr = virt_to_phys(f.base);
            b.len = f.size;
            b.writeable = false;
            return b;
        }
        struct packet_as_buffer_chain {
            packet p;
            auto begin() {
                return make_transform_iterator(p.fragments().begin(), fragment_to_buffer);
            }
            auto end() {
                return make_transform_iterator(p.fragments().end(), fragment_to_buffer);
            }
        };
        struct complete {
            txq& q;
            void operator()(packet_as_buffer_chain&& bc, size_t len) {
                // move the packet here, to be destroyed on scope exit
                auto p = std::move(bc.p);
                q._ring.available_descriptors().signal(p.nr_frags());
            }
            void bunch(uint64_t c) {}
        };
        qp& _dev;
        vring<packet_as_buffer_chain, complete> _ring;
        std::vector<packet_as_buffer_chain> _packets;
    public:
        txq(qp& dev, ring_config config);
        void set_notifier(std::unique_ptr<notifier> notifier) {
            _ring.set_notifier(std::move(notifier));
        }
        const ring_config& getconfig() {
            return _ring.getconfig();
        }
        void wake_notifier_wait() {
            _ring.wake_notifier_wait();
        }
        uint32_t post(circular_buffer<packet>& p);
    };
    class rxq {
        struct buffer_and_virt : buffer {
            std::unique_ptr<char[], free_deleter> buf;
        };
        using single_buffer = std::array<buffer_and_virt, 1>;
        struct complete {
            rxq& q;
            void operator()(single_buffer&& bc, size_t len) {
                q.complete_buffer(std::move(bc), len);
            }
            void bunch(uint64_t c) {
                q.update_rx_count(c);
            }
        };
        qp& _dev;
        vring<single_buffer, complete> _ring;
        unsigned _remaining_buffers = 0;
        std::vector<fragment> _fragments;
        std::vector<std::unique_ptr<char[], free_deleter>> _buffers;
    public:
        rxq(qp& _if, ring_config config);
        void set_notifier(std::unique_ptr<notifier> notifier) {
            _ring.set_notifier(std::move(notifier));
        }
        const ring_config& getconfig() {
            return _ring.getconfig();
        }
        void run() {
            // FIXME: future is discarded
            // At least catch errors and warn about them.
            (void)keep_doing([this] { return prepare_buffers(); });
        }
        void wake_notifier_wait() {
            _ring.wake_notifier_wait();
        }
        void update_rx_count(uint64_t c) {
            _dev._stats.rx.good.update_pkts_bunch(c);
        }
    private:
        future<> prepare_buffers();
        void complete_buffer(single_buffer&& b, size_t len);
        void debug_mode_adjust_fragments();
    };
protected:
    device* _dev;
    size_t _header_len;
    std::unique_ptr<char[], free_deleter> _txq_storage;
    std::unique_ptr<char[], free_deleter> _rxq_storage;
    txq _txq;
    rxq _rxq;
protected:
    ring_config txq_config(size_t txq_ring_size);
    ring_config rxq_config(size_t rxq_ring_size);
    void common_config(ring_config& r);
    size_t vring_storage_size(size_t ring_size);
public:
    explicit qp(device* dev, size_t rx_ring_size, size_t tx_ring_size);
    virtual future<> send(packet p) override {
        abort();
    }
    virtual uint32_t send(circular_buffer<packet>& p) override;
    virtual void rx_start() override;
    friend class rxq;
};

qp::txq::txq(qp& dev, ring_config config)
    : _dev(dev), _ring(config, complete{*this}) {
}

uint32_t
qp::txq::post(circular_buffer<packet>& pb) {
    uint64_t bytes = 0, nr_frags = 0;

    _packets.clear();

    while (!pb.empty() && pb.front().nr_frags() + 1 <= _ring.available_descriptors().current()) {
        net_hdr_mrg vhdr = {};
        auto p = std::move(pb.front());

        bytes += p.len();
        nr_frags += p.nr_frags();

        pb.pop_front();
        // Handle TCP and UDP checksum offload
        auto oi = p.offload_info();
        if (_dev._dev->hw_features().tx_csum_l4_offload) {
            auto eth_hdr_len = sizeof(eth_hdr);
            auto ip_hdr_len = oi.ip_hdr_len;
            auto mtu = _dev._dev->hw_features().mtu;
            if (oi.protocol == ip_protocol_num::tcp) {
                auto tcp_hdr_len = oi.tcp_hdr_len;
                if (oi.needs_csum) {
                    vhdr.needs_csum = 1;
                    vhdr.csum_start = eth_hdr_len + ip_hdr_len;
                    // The TCP checksum field's offset within the TCP header is 16 bytes
                    vhdr.csum_offset = 16;
                }
                if (oi.tso_seg_size) {
                    // IPv4 TCP TSO
                    vhdr.gso_type = net_hdr::gso_tcpv4;
                    // Sum of Ethernet, IP and TCP header size
                    vhdr.hdr_len = eth_hdr_len + ip_hdr_len + tcp_hdr_len;
                    // Maximum segment size of packet after the offload
                    vhdr.gso_size = oi.tso_seg_size;
                }
            } else if (oi.protocol == ip_protocol_num::udp) {
                auto udp_hdr_len = oi.udp_hdr_len;
                if (oi.needs_csum) {
                    vhdr.needs_csum = 1;
                    vhdr.csum_start = eth_hdr_len + ip_hdr_len;
                    // The UDP checksum field's offset within the UDP header is 6 bytes
                    vhdr.csum_offset = 6;
                }
                if (_dev._dev->hw_features().tx_ufo && p.len() > mtu + eth_hdr_len) {
                    vhdr.gso_type = net_hdr::gso_udp;
                    vhdr.hdr_len = eth_hdr_len + ip_hdr_len + udp_hdr_len;
                    vhdr.gso_size = mtu - ip_hdr_len - udp_hdr_len;
                }
            }
        }
        // prepend virtio-net header
        packet q = packet(fragment{reinterpret_cast<char*>(&vhdr), _dev._header_len},
                std::move(p));
        auto fut = _ring.available_descriptors().wait(q.nr_frags());
        assert(fut.available()); // cannot fail: we checked current() above
        _packets.emplace_back(packet_as_buffer_chain{ std::move(q) });
    }
    _ring.post(_packets.begin(), _packets.end());

    _dev._stats.tx.good.update_frags_stats(nr_frags, bytes);

    return _packets.size();
}

qp::rxq::rxq(qp& dev, ring_config config)
    : _dev(dev), _ring(config, complete{*this}) {
}

future<>
qp::rxq::prepare_buffers() {
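    // Refill the rx ring in batches: block until at least one descriptor is
    // free, then opportunistically grab everything else that is already
    // available so a single post() covers as many 4K buffers as possible.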
    auto& available = _ring.available_descriptors();
    return available.wait(1).then([this, &available] {
        unsigned count = 1;
        auto opportunistic = available.current();
        if (available.try_wait(opportunistic)) {
            count += opportunistic;
        }
        auto make_buffer_chain = [] {
            single_buffer bc;
            std::unique_ptr<char[], free_deleter> buf(reinterpret_cast<char*>(malloc(4096)));
            buffer_and_virt& b = bc[0];
            b.addr = virt_to_phys(buf.get());
            b.len = 4096;
            b.writeable = true;
            b.buf = std::move(buf);
            return bc;
        };
        auto start = make_function_input_iterator(make_buffer_chain, 0U);
        auto finish = make_function_input_iterator(make_buffer_chain, count);
        _ring.post(start, finish);
    });
}

void
qp::rxq::debug_mode_adjust_fragments() {
#ifdef SEASTAR_DEBUG
    // For debug mode, reallocate last fragment to detect buffer overruns
    auto last = _fragments.back();
    auto sz = last.size;
    std::unique_ptr<char[], free_deleter> buf(reinterpret_cast<char*>(malloc(sz)));
    if (!buf) {
        throw std::bad_alloc();
    }
    std::copy_n(last.base, sz, buf.get());
    _fragments.back() = { buf.get(), sz };
    _buffers.back() = std::move(buf);
#endif
}

void
qp::rxq::complete_buffer(single_buffer&& bc, size_t len) {
    auto&& sb = bc[0];
    auto&& buf = sb.buf;
    auto frag_buf = buf.get();
    auto frag_len = len;
    // First buffer
    if (_remaining_buffers == 0) {
        auto hdr = reinterpret_cast<net_hdr_mrg*>(frag_buf);
        assert(hdr->num_buffers >= 1);
        _remaining_buffers = hdr->num_buffers;
        frag_buf += _dev._header_len;
        frag_len -= _dev._header_len;
        _fragments.clear();
        _buffers.clear();
    }

    // Append current buffer
    _fragments.emplace_back(fragment{frag_buf, frag_len});
    _buffers.push_back(std::move(buf));
    _remaining_buffers--;

    // Last buffer
    if (_remaining_buffers == 0) {
        debug_mode_adjust_fragments();
        deleter del;
        if (_buffers.size() == 1) {
            del = make_free_deleter(_buffers[0].release());
            _buffers.clear();
        } else {
            del = make_object_deleter(std::move(_buffers));
        }
        packet p(_fragments.begin(), _fragments.end(), std::move(del));

        _dev._stats.rx.good.update_frags_stats(p.nr_frags(), p.len());

        _dev._dev->l2receive(std::move(p));

        _ring.available_descriptors().signal(_fragments.size());
    }
}

// Allocate and zero-initialize a buffer which is page-aligned and can be
// used for virt_to_phys (i.e., physically contiguous).
static std::unique_ptr<char[], free_deleter> virtio_buffer(size_t size) {
    void* ret;
    auto r = posix_memalign(&ret, 4096, size);
    assert(r == 0);
    bzero(ret, size);
    return std::unique_ptr<char[], free_deleter>(reinterpret_cast<char*>(ret));
}

qp::qp(device* dev, size_t rx_ring_size, size_t tx_ring_size)
    : _dev(dev)
    , _txq_storage(virtio_buffer(vring_storage_size(tx_ring_size)))
    , _rxq_storage(virtio_buffer(vring_storage_size(rx_ring_size)))
    , _txq(*this, txq_config(tx_ring_size))
    , _rxq(*this, rxq_config(rx_ring_size)) {
}

size_t qp::vring_storage_size(size_t ring_size) {
    // overestimate, but not by much.
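    // Per ring slot: a 16-byte descriptor, a 2-byte avail-ring entry and an
    // 8-byte used_elem; the 3 * 4096 covers page-alignment slack for the
    // three areas (see common_config()).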
    return 3 * 4096 + ring_size * (16 + 2 + 8);
}

void qp::common_config(ring_config& r) {
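    // Carve the three vring areas out of one allocation: descriptors at the
    // start, the avail ring right after them, and the used ring on the next
    // page boundary (the +6 skips the avail ring's flags, idx and used_event
    // fields).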
    r.avail = r.descs + 16 * r.size;
    r.used = align_up(r.avail + 2 * r.size + 6, 4096);
    r.event_index = (_dev->features() & VIRTIO_RING_F_EVENT_IDX) != 0;
    r.indirect = false;
}

ring_config qp::txq_config(size_t tx_ring_size) {
    ring_config r;
    r.size = tx_ring_size;
    r.descs = _txq_storage.get();
    r.mergable_buffers = false;
    common_config(r);
    return r;
}

ring_config qp::rxq_config(size_t rx_ring_size) {
    ring_config r;
    r.size = rx_ring_size;
    r.descs = _rxq_storage.get();
    r.mergable_buffers = true;
    common_config(r);
    return r;
}

void
qp::rx_start() {
    _rxq.run();
}

uint32_t
qp::send(circular_buffer<packet>& p) {
    return _txq.post(p);
}

class qp_vhost : public qp {
private:
    // The vhost file descriptor needs to remain open throughout the life of
    // this driver, because as soon as we close it, vhost stops servicing us.
    file_desc _vhost_fd;
public:
    qp_vhost(device* dev, boost::program_options::variables_map opts);
};

static size_t config_ring_size(boost::program_options::variables_map& opts) {
    if (opts.count("virtio-ring-size")) {
        return opts["virtio-ring-size"].as<unsigned>();
    } else {
        return 256;
    }
}

qp_vhost::qp_vhost(device* dev, boost::program_options::variables_map opts)
    : qp(dev, config_ring_size(opts), config_ring_size(opts))
    , _vhost_fd(file_desc::open("/dev/vhost-net", O_RDWR))
{
    auto tap_device = opts["tap-device"].as<std::string>();
    int64_t vhost_supported_features;
    _vhost_fd.ioctl(VHOST_GET_FEATURES, vhost_supported_features);
    vhost_supported_features &= _dev->features();
    _vhost_fd.ioctl(VHOST_SET_FEATURES, vhost_supported_features);
    if (vhost_supported_features & VIRTIO_NET_F_MRG_RXBUF) {
        _header_len = sizeof(net_hdr_mrg);
    } else {
        _header_len = sizeof(net_hdr);
    }

    // Open and set up the tap device, which we'll tell vhost to use.
    // Note that the tap_fd we open here will be closed at the end of
    // this function. It appears that this is fine - i.e., after we pass
    // this fd to VHOST_NET_SET_BACKEND, the Linux kernel keeps the reference
    // to it and it's fine to close the file descriptor.
    file_desc tap_fd(file_desc::open("/dev/net/tun", O_RDWR | O_NONBLOCK));
    assert(tap_device.size() + 1 <= IFNAMSIZ);
    ifreq ifr = {};
    ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR;
    strcpy(ifr.ifr_ifrn.ifrn_name, tap_device.c_str());
    tap_fd.ioctl(TUNSETIFF, ifr);
    unsigned int offload = 0;
    auto hw_features = _dev->hw_features();
    if (hw_features.tx_csum_l4_offload && hw_features.rx_csum_offload) {
        offload = TUN_F_CSUM;
        if (hw_features.tx_tso) {
            offload |= TUN_F_TSO4;
        }
        if (hw_features.tx_ufo) {
            offload |= TUN_F_UFO;
        }
    }
    tap_fd.ioctl(TUNSETOFFLOAD, offload);
    tap_fd.ioctl(TUNSETVNETHDRSZ, _header_len);

    // Additional vhost setup:
    _vhost_fd.ioctl(VHOST_SET_OWNER);
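    // Describe "guest" memory to vhost as a single identity-mapped region
    // covering the whole 47-bit userspace virtual address range (minus the
    // last page), so the ring's "physical" addresses are simply our pointers.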
    auto mem_table = make_struct_with_vla(&vhost_memory::regions, 1);
    mem_table->nregions = 1;
    auto& region = mem_table->regions[0];
    region.guest_phys_addr = 0;
    region.memory_size = (size_t(1) << 47) - 4096;
    region.userspace_addr = 0;
    region.flags_padding = 0;
    _vhost_fd.ioctl(VHOST_SET_MEM_TABLE, *mem_table);
    vhost_vring_state vvs0 = { 0, _rxq.getconfig().size };
    _vhost_fd.ioctl(VHOST_SET_VRING_NUM, vvs0);
    vhost_vring_state vvs1 = { 1, _txq.getconfig().size };
    _vhost_fd.ioctl(VHOST_SET_VRING_NUM, vvs1);
    auto tov = [](char* x) { return reinterpret_cast<uintptr_t>(x); };

    _vhost_fd.ioctl(VHOST_SET_VRING_ADDR, vhost_vring_addr{
        0, 0, tov(_rxq.getconfig().descs), tov(_rxq.getconfig().used),
        tov(_rxq.getconfig().avail), 0
    });
    _vhost_fd.ioctl(VHOST_SET_VRING_ADDR, vhost_vring_addr{
        1, 0, tov(_txq.getconfig().descs), tov(_txq.getconfig().used),
        tov(_txq.getconfig().avail), 0
    });

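    // Wire up the kick (guest->host) and call (host->guest) eventfds. The
    // kick fds are handed to the notifiers; the call fds are local and get
    // closed when this constructor returns - seastar polls the used rings
    // (see vring::_poller) rather than sleeping on host interrupts, and the
    // kernel keeps its own reference to the eventfds it was given.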
    readable_eventfd _txq_notify;
    writeable_eventfd _txq_kick;
    readable_eventfd _rxq_notify;
    writeable_eventfd _rxq_kick;
    _vhost_fd.ioctl(VHOST_SET_VRING_KICK, vhost_vring_file{0, _rxq_kick.get_read_fd()});
    _vhost_fd.ioctl(VHOST_SET_VRING_CALL, vhost_vring_file{0, _rxq_notify.get_write_fd()});
    _vhost_fd.ioctl(VHOST_SET_VRING_KICK, vhost_vring_file{1, _txq_kick.get_read_fd()});
    _vhost_fd.ioctl(VHOST_SET_VRING_CALL, vhost_vring_file{1, _txq_notify.get_write_fd()});
    _rxq.set_notifier(std::make_unique<notifier_vhost>(std::move(_rxq_kick)));
    _txq.set_notifier(std::make_unique<notifier_vhost>(std::move(_txq_kick)));

    _vhost_fd.ioctl(VHOST_NET_SET_BACKEND, vhost_vring_file{0, tap_fd.get()});
    _vhost_fd.ioctl(VHOST_NET_SET_BACKEND, vhost_vring_file{1, tap_fd.get()});
}

#ifdef HAVE_OSV
class qp_osv : public qp {
private:
    ethernet_address _mac;
    osv::assigned_virtio& _virtio;
public:
    qp_osv(device* dev, osv::assigned_virtio& virtio,
           boost::program_options::variables_map opts);
};

qp_osv::qp_osv(device* dev, osv::assigned_virtio& virtio,
               boost::program_options::variables_map opts)
    : qp(dev, virtio.queue_size(0), virtio.queue_size(1))
    , _virtio(virtio)
{
    // Read the host's supported virtio feature bitmask, AND it with the
    // features we want to use, and tell the host of the result:
    uint32_t subset = _virtio.init_features(_dev->features());
    if (subset & VIRTIO_NET_F_MRG_RXBUF) {
        _header_len = sizeof(net_hdr_mrg);
    } else {
        _header_len = sizeof(net_hdr);
    }

    // TODO: save bits from "subset" in _hw_features?
//    bool _mergeable_bufs = subset & VIRTIO_NET_F_MRG_RXBUF;
//    bool _status = subset & VIRTIO_NET_F_STATUS;
//    bool _tso_ecn = subset & VIRTIO_NET_F_GUEST_ECN;
//    bool _host_tso_ecn = subset & VIRTIO_NET_F_HOST_ECN;
//    bool _csum = subset & VIRTIO_NET_F_CSUM;
//    bool _guest_csum = subset & VIRTIO_NET_F_GUEST_CSUM;
//    bool _guest_tso4 = subset & VIRTIO_NET_F_GUEST_TSO4;
//    bool _host_tso4 = subset & VIRTIO_NET_F_HOST_TSO4;
//    bool _guest_ufo = subset & VIRTIO_NET_F_GUEST_UFO;

    // Get the MAC address set by the host
    assert(subset & VIRTIO_NET_F_MAC);
    struct net_config {
        /* The config defining mac address (if VIRTIO_NET_F_MAC) */
        uint8_t mac[6];
        /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* */
        uint16_t status;
        /* Maximum number of each of transmit and receive queues;
         * see VIRTIO_NET_F_MQ and VIRTIO_NET_CTRL_MQ.
         * Legal values are between 1 and 0x8000
         */
        uint16_t max_virtqueue_pairs;
    } __attribute__((packed)) host_config;
    _virtio.conf_read(&host_config, sizeof(host_config));
    _mac = { host_config.mac[0], host_config.mac[1], host_config.mac[2],
             host_config.mac[3], host_config.mac[4], host_config.mac[5] };

    // Set up notifiers
    _rxq.set_notifier(std::make_unique<notifier_osv>(_virtio, 0));
    _txq.set_notifier(std::make_unique<notifier_osv>(_virtio, 1));

    // Tell the host where we put the rings (we already allocated them earlier)
    _virtio.set_queue_pfn(
            0, virt_to_phys(_rxq.getconfig().descs));
    _virtio.set_queue_pfn(
            1, virt_to_phys(_txq.getconfig().descs));

    // Set up interrupts
    // FIXME: in OSv, the first thing we do in the handler is to call
    // _rxq.disable_interrupts(). Here in seastar, we only do it much later
    // in the main engine(). We should probably do it as in OSv - at the
    // beginning of the handler.
    _virtio.enable_interrupt(
            0, [&] { _rxq.wake_notifier_wait(); });
    _virtio.enable_interrupt(
            1, [&] { _txq.wake_notifier_wait(); });

    _virtio.set_driver_ok();
}
#endif

std::unique_ptr<net::qp> device::init_local_queue(boost::program_options::variables_map opts, uint16_t qid) {
    static bool called = false;
    assert(!qid);
    assert(!called);
    called = true;

#ifdef HAVE_OSV
    if (osv::assigned_virtio::get && osv::assigned_virtio::get()) {
        std::cout << "In OSv and assigned host's virtio device\n";
        return std::make_unique<qp_osv>(this, *osv::assigned_virtio::get(), opts);
    }
#endif
    return std::make_unique<qp_vhost>(this, opts);
}

}

boost::program_options::options_description
get_virtio_net_options_description()
{
    boost::program_options::options_description opts(
            "Virtio net options");
    opts.add_options()
        ("event-index",
                boost::program_options::value<std::string>()->default_value("on"),
                "Enable event-index feature (on / off)")
        ("csum-offload",
                boost::program_options::value<std::string>()->default_value("on"),
                "Enable checksum offload feature (on / off)")
        ("tso",
                boost::program_options::value<std::string>()->default_value("on"),
                "Enable TCP segment offload feature (on / off)")
        ("ufo",
                boost::program_options::value<std::string>()->default_value("on"),
                "Enable UDP fragmentation offload feature (on / off)")
        ("virtio-ring-size",
                boost::program_options::value<unsigned>()->default_value(256),
                "Virtio ring size (must be power-of-two)")
        ;
    return opts;
}

std::unique_ptr<net::device> create_virtio_net_device(boost::program_options::variables_map opts) {
    return std::make_unique<virtio::device>(opts);
}

}

// Locks the shared object in memory and forces on-load function resolution.
// Needed if the function passed to enable_interrupt() is run at interrupt
// time.
// TODO: Instead of doing this, _virtio.enable_interrupt() could take a
// pollable to wake instead of a function, then this won't be needed.
asm(".pushsection .note.osv-mlock, \"a\"; .long 0, 0, 0; .popsection");