]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | /* |
2 | * This file is open source software, licensed to you under the terms | |
3 | * of the Apache License, Version 2.0 (the "License"). See the NOTICE file | |
4 | * distributed with this work for additional information regarding copyright | |
5 | * ownership. You may not use this file except in compliance with the License. | |
6 | * | |
7 | * You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, | |
12 | * software distributed under the License is distributed on an | |
13 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
14 | * KIND, either express or implied. See the License for the | |
15 | * specific language governing permissions and limitations | |
16 | * under the License. | |
17 | */ | |
18 | /* | |
19 | * Copyright (C) 2014 Cloudius Systems, Ltd. | |
20 | */ | |
21 | ||
22 | #include <seastar/net/virtio.hh> | |
23 | #include <seastar/core/posix.hh> | |
f67539c2 | 24 | #include <seastar/core/internal/pollable_fd.hh> |
11fdf7f2 TL |
25 | #include "core/vla.hh" |
26 | #include <seastar/net/virtio-interface.hh> | |
27 | #include <seastar/core/reactor.hh> | |
28 | #include <seastar/core/stream.hh> | |
29 | #include <seastar/core/circular_buffer.hh> | |
30 | #include <seastar/core/align.hh> | |
31 | #include <seastar/core/metrics.hh> | |
32 | #include <seastar/util/function_input_iterator.hh> | |
33 | #include <seastar/util/transform_iterator.hh> | |
34 | #include <atomic> | |
35 | #include <vector> | |
36 | #include <queue> | |
37 | #include <fcntl.h> | |
38 | #include <linux/vhost.h> | |
39 | #include <linux/if_tun.h> | |
40 | #include <seastar/net/ip.hh> | |
41 | #include <seastar/net/const.hh> | |
42 | #include <seastar/net/native-stack.hh> | |
43 | ||
44 | #ifdef HAVE_OSV | |
45 | #include <osv/virtio-assign.hh> | |
46 | #endif | |
47 | ||
48 | namespace seastar { | |
49 | ||
50 | using namespace net; | |
51 | ||
52 | namespace virtio { | |
53 | ||
// "Guest physical" address type as handed to vhost.
using phys = uint64_t;

#ifndef HAVE_OSV

// On plain Linux we configure vhost with an identity memory table (see
// qp_vhost), so a virtual address doubles as its own physical address.
phys virt_to_phys(void* p) {
    return static_cast<phys>(reinterpret_cast<uintptr_t>(p));
}

#else

// Under OSv with an assigned virtio device, translate through OSv's real
// virtual-to-physical mapping.
phys virt_to_phys(void* p) {
    return osv::assigned_virtio::virt_to_phys(p);
}

#endif
69 | ||
70 | class device : public net::device { | |
71 | private: | |
72 | boost::program_options::variables_map _opts; | |
73 | net::hw_features _hw_features; | |
74 | uint64_t _features; | |
75 | ||
76 | private: | |
77 | uint64_t setup_features() { | |
78 | int64_t seastar_supported_features = VIRTIO_RING_F_INDIRECT_DESC | VIRTIO_NET_F_MRG_RXBUF; | |
79 | ||
80 | if (!(_opts.count("event-index") && _opts["event-index"].as<std::string>() == "off")) { | |
81 | seastar_supported_features |= VIRTIO_RING_F_EVENT_IDX; | |
82 | } | |
83 | if (!(_opts.count("csum-offload") && _opts["csum-offload"].as<std::string>() == "off")) { | |
84 | seastar_supported_features |= VIRTIO_NET_F_CSUM | VIRTIO_NET_F_GUEST_CSUM; | |
85 | _hw_features.tx_csum_l4_offload = true; | |
86 | _hw_features.rx_csum_offload = true; | |
87 | } else { | |
88 | _hw_features.tx_csum_l4_offload = false; | |
89 | _hw_features.rx_csum_offload = false; | |
90 | } | |
91 | if (!(_opts.count("tso") && _opts["tso"].as<std::string>() == "off")) { | |
92 | seastar_supported_features |= VIRTIO_NET_F_HOST_TSO4; | |
93 | _hw_features.tx_tso = true; | |
94 | } else { | |
95 | _hw_features.tx_tso = false; | |
96 | } | |
97 | ||
98 | if (!(_opts.count("lro") && _opts["lro"].as<std::string>() == "off")) { | |
99 | seastar_supported_features |= VIRTIO_NET_F_GUEST_TSO4; | |
100 | _hw_features.rx_lro = true; | |
101 | } else { | |
102 | _hw_features.rx_lro = false; | |
103 | } | |
104 | ||
105 | if (!(_opts.count("ufo") && _opts["ufo"].as<std::string>() == "off")) { | |
106 | seastar_supported_features |= VIRTIO_NET_F_HOST_UFO; | |
107 | seastar_supported_features |= VIRTIO_NET_F_GUEST_UFO; | |
108 | _hw_features.tx_ufo = true; | |
109 | } else { | |
110 | _hw_features.tx_ufo = false; | |
111 | } | |
112 | ||
113 | seastar_supported_features |= VIRTIO_NET_F_MAC; | |
114 | return seastar_supported_features; | |
115 | } | |
116 | ||
117 | public: | |
118 | device(boost::program_options::variables_map opts) | |
119 | : _opts(opts), _features(setup_features()) | |
120 | {} | |
121 | ethernet_address hw_address() override { | |
122 | return { 0x12, 0x23, 0x34, 0x56, 0x67, 0x78 }; | |
123 | } | |
124 | ||
125 | net::hw_features hw_features() override { | |
126 | return _hw_features; | |
127 | } | |
128 | ||
129 | uint64_t features() { | |
130 | return _features; | |
131 | } | |
132 | ||
133 | virtual std::unique_ptr<net::qp> init_local_queue(boost::program_options::variables_map opts, uint16_t qid) override; | |
134 | }; | |
135 | ||
/* The notifier interface determines how to do host-to-guest and guest-to-
 * host notifications. There are two concrete implementations - one for vhost
 * (where both notifications occur through eventfds) and one for an assigned
 * virtio device from OSv.
 */
class notifier {
public:
    // Kick the host: new buffers are available.
    virtual void notify() = 0;
    // Do whatever it takes to wake wait(). A notifier does not need to
    // implement this function if wait() waits for an external event which is
    // generated by an external process (e.g., notifier_vhost doesn't
    // need to implement this).
    virtual void wake_wait() { abort(); }
    virtual ~notifier() = default;
};
155 | ||
156 | class notifier_vhost : public notifier { | |
157 | private: | |
158 | writeable_eventfd _kick; | |
159 | public: | |
160 | virtual void notify() override { | |
161 | _kick.signal(1); | |
162 | } | |
163 | notifier_vhost(writeable_eventfd &&kick) | |
164 | : _kick(std::move(kick)) {} | |
165 | }; | |
166 | ||
#ifdef HAVE_OSV
// OSv flavour: kick the assigned virtio device's queue directly through
// OSv's API.
class notifier_osv : public notifier {
private:
    uint16_t _q_index;
    osv::assigned_virtio& _virtio;
public:
    notifier_osv(osv::assigned_virtio& virtio, uint16_t q_index)
        : _q_index(q_index), _virtio(virtio) {}
    virtual void notify() override {
        _virtio.kick(_q_index);
    }
};
#endif
183 | ||
// Layout and negotiated capabilities of one virtqueue, as handed to vring
// and to the vhost VRING_* ioctls.
struct ring_config {
    char* descs;            // descriptor table
    char* avail;            // guest->host available ring
    char* used;             // host->guest used ring
    unsigned size;          // number of descriptors (assumed power of two — see vring::mask())
    bool event_index;       // VIRTIO_RING_F_EVENT_IDX negotiated
    bool indirect;          // indirect descriptors enabled (always false here)
    bool mergable_buffers;  // VIRTIO_NET_F_MRG_RXBUF negotiated (rx only)
};
193 | ||
// One physically-addressed buffer of a descriptor chain.
struct buffer {
    phys addr;       // guest-physical address (see virt_to_phys())
    uint32_t len;    // length in bytes
    bool writeable;  // true if the host may write into it (rx buffers)
};
199 | ||
200 | // The 'buffer_chain' concept, used in vring, is a container of buffers, as in: | |
201 | // | |
202 | // using buffer_chain = std::vector<buffer>; | |
203 | // | |
204 | // The 'Completion' concept is a functor with the signature: | |
205 | // | |
206 | // void (buffer_chain&, size_t len); | |
207 | // | |
// A generic virtio split virtqueue ("vring") shared with the host.
//
// BufferChain is a container of 'buffer' (see the concept comment above);
// one chain becomes one descriptor chain. Completion is invoked as
// complete(BufferChain&&, len) per consumed chain and complete.bunch(count)
// once per poll with the batch size.
template <typename BufferChain, typename Completion>
class vring {
private:
    // One entry of the descriptor table, exactly as laid out in shared memory.
    class desc {
    public:
        struct flags {
            // This marks a buffer as continuing via the next field.
            uint16_t has_next : 1;
            // This marks a buffer as write-only (otherwise read-only).
            uint16_t writeable : 1;
            // This means the buffer contains a list of buffer descriptors.
            uint16_t indirect : 1;
        };

        phys get_paddr();
        uint32_t get_len() { return _len; }
        uint16_t next_idx() { return _next; }

        phys _paddr;
        uint32_t _len;
        flags _flags;
        uint16_t _next;
    };

    // Guest to host
    struct avail_layout {
        struct flags {
            // Mark that we do not need an interrupt for consuming a descriptor
            // from the ring. Unreliable so it's simply an optimization
            uint16_t no_interrupts : 1;
        };

        std::atomic<uint16_t> _flags;

        // Where we put the next descriptor
        std::atomic<uint16_t> _idx;
        // There may be no more entries than the queue size read from device
        uint16_t _ring[];
        // used event index is an optimization in order to get an interrupt from the host
        // only when the value reaches this number
        // The location of this field is places after the variable length ring array,
        // that's why we cannot fully define it within the struct and use a function accessor
        //std::atomic<uint16_t> used_event;
    };

    struct used_elem {
        // Index of start of used _desc chain. (uint32_t for padding reasons)
        uint32_t _id;
        // Total length of the descriptor chain which was used (written to)
        uint32_t _len;
    };

    // Host to guest
    struct used_layout {
        enum {
            // The Host advise the Guest: don't kick me when
            // you add a buffer.  It's unreliable, so it's simply an
            // optimization. Guest will still kick if it's out of buffers.
            no_notify = 1
        };

        // Using std::atomic since it being changed by the host
        std::atomic<uint16_t> _flags;
        // Using std::atomic in order to have memory barriers for it
        std::atomic<uint16_t> _idx;
        used_elem _used_elements[];
        // avail event index is an optimization kick the host only when the value reaches this number
        // The location of this field is places after the variable length ring array,
        // that's why we cannot fully define it within the struct and use a function accessor
        //std::atomic<uint16_t> avail_event;
    };

    // Guest-side bookkeeping for the available (guest->host) ring.
    struct avail {
        explicit avail(ring_config conf);
        avail_layout* _shared;
        // Next free slot (free-running counter, masked on use).
        uint16_t _head = 0;
        // Entries added since the last kick, for the event-index heuristic.
        uint16_t _avail_added_since_kick = 0;
    };
    // Guest-side bookkeeping for the used (host->guest) ring.
    struct used {
        explicit used(ring_config conf);
        used_layout* _shared;
        // Next shared used-ring entry we have not consumed yet.
        uint16_t _tail = 0;
    };
private:
    ring_config _config;
    Completion _complete;
    std::unique_ptr<notifier> _notifier;
    // Keeps each posted chain alive, indexed by its head descriptor.
    std::unique_ptr<BufferChain[]> _buffer_chains;
    desc* _descs;
    avail _avail;
    used _used;
    // Event-index fields; they live just past the variable-length arrays
    // of the used and avail rings respectively (see the vring constructor).
    std::atomic<uint16_t>* _avail_event;
    std::atomic<uint16_t>* _used_event;
    // Counts free descriptors; producers wait on it before post().
    semaphore _available_descriptors = { 0 };
    // Free-descriptor list threaded through desc::_next; -1 means empty.
    int _free_head = -1;
    int _free_last = -1;
    reactor::poller _poller;
public:

    explicit vring(ring_config conf, Completion complete);
    void set_notifier(std::unique_ptr<notifier> notifier) {
        _notifier = std::move(notifier);
    }
    const ring_config& getconfig() {
        return _config;
    }
    void wake_notifier_wait() {
        _notifier->wake_wait();
    }

    // start the queue
    void run();

    // wait for the used ring to have at least @nr buffers
    future<> on_used(size_t nr);

    // Total number of descriptors in ring
    int size() { return _config.size; }

    template <typename Iterator>
    void post(Iterator begin, Iterator end);

    semaphore& available_descriptors() { return _available_descriptors; }
private:
    // True when the host asked (classic flags scheme) not to be kicked.
    bool notifications_disabled() {
        return (_used._shared->_flags.load(std::memory_order_relaxed) & VRING_USED_F_NO_NOTIFY) != 0;
    }

    // Kick the host if required: under the event-index scheme only when the
    // host's avail_event fell inside the window of entries added since the
    // last kick; otherwise unless the host disabled notifications.
    void kick() {
        bool need_kick = true;
        // Make sure we see the fresh _idx value writen before kick.
        std::atomic_thread_fence(std::memory_order_seq_cst);
        if (_config.event_index) {
            uint16_t avail_idx = _avail._shared->_idx.load(std::memory_order_relaxed);
            uint16_t avail_event = _avail_event->load(std::memory_order_relaxed);
            // Wrap-safe 16-bit window test.
            need_kick = (uint16_t)(avail_idx - avail_event - 1) < _avail._avail_added_since_kick;
        } else {
            if (notifications_disabled())
                return;
        }
        // Also kick unconditionally before the 16-bit counter could wrap.
        if (need_kick || (_avail._avail_added_since_kick >= (uint16_t)(~0) / 2)) {
            _notifier->notify();
            _avail._avail_added_since_kick = 0;
        }
    }

    bool do_complete();
    size_t mask() { return size() - 1; }
    size_t masked(size_t idx) { return idx & mask(); }
    size_t available();
    unsigned allocate_desc();
    void setup();
};
361 | ||
// Overlay the shared avail-ring layout on the caller-provided memory.
template <typename BufferChain, typename Completion>
vring<BufferChain, Completion>::avail::avail(ring_config conf)
    : _shared(reinterpret_cast<avail_layout*>(conf.avail)) {
}
366 | ||
// Overlay the shared used-ring layout on the caller-provided memory.
template <typename BufferChain, typename Completion>
vring<BufferChain, Completion>::used::used(ring_config conf)
    : _shared(reinterpret_cast<used_layout*>(conf.used)) {
}
371 | ||
372 | template <typename BufferChain, typename Completion> | |
373 | inline | |
374 | unsigned | |
375 | vring<BufferChain, Completion>::allocate_desc() { | |
376 | assert(_free_head != -1); | |
377 | auto desc = _free_head; | |
378 | if (desc == _free_last) { | |
379 | _free_last = _free_head = -1; | |
380 | } else { | |
381 | _free_head = _descs[desc]._next; | |
382 | } | |
383 | return desc; | |
384 | } | |
385 | ||
// Wire the vring up over the shared memory described by conf and start
// reaping completions from the reactor's poll loop.
template <typename BufferChain, typename Completion>
vring<BufferChain, Completion>::vring(ring_config conf, Completion complete)
    : _config(conf)
    , _complete(complete)
    , _buffer_chains(new BufferChain[_config.size])
    , _descs(reinterpret_cast<desc*>(conf.descs))
    , _avail(conf)
    , _used(conf)
    // The event-index fields live immediately past the variable-length
    // arrays of the used and avail rings, respectively.
    , _avail_event(reinterpret_cast<std::atomic<uint16_t>*>(&_used._shared->_used_elements[conf.size]))
    , _used_event(reinterpret_cast<std::atomic<uint16_t>*>(&_avail._shared->_ring[conf.size]))
    , _poller(reactor::poller::simple([this] {
        return do_complete();
    }))
{
    setup();
}
402 | ||
403 | template <typename BufferChain, typename Completion> | |
404 | void vring<BufferChain, Completion>::setup() { | |
405 | for (unsigned i = 0; i < _config.size; ++i) { | |
406 | _descs[i]._next = i + 1; | |
407 | } | |
408 | _free_head = 0; | |
409 | _free_last = _config.size - 1; | |
410 | _available_descriptors.signal(_config.size); | |
411 | } | |
412 | ||
// Iterator: points at a buffer_chain
// Turn each chain into a linked descriptor chain, publish all the chain
// heads on the avail ring with one release store, and kick the host.
template <typename BufferChain, typename Completion>
template <typename Iterator>
void vring<BufferChain, Completion>::post(Iterator begin, Iterator end) {
    for (auto bci = begin; bci!= end; ++bci) {
        auto&& bc = *bci;
        // pseudo_head lets the loop treat the first descriptor like any
        // other: its _next ends up holding the real chain head index.
        desc pseudo_head = {};
        desc* prev = &pseudo_head;
        for (auto i = bc.begin(); i != bc.end(); ++i) {
            unsigned desc_idx = allocate_desc();
            prev->_flags.has_next = true;
            prev->_next = desc_idx;
            desc &d = _descs[desc_idx];
            d._flags = {};
            auto&& b = *i;
            d._flags.writeable = b.writeable;
            d._paddr = b.addr;
            d._len = b.len;
            prev = &d;
        }
        auto desc_head = pseudo_head._next;
        // Keep the chain alive (indexed by head descriptor) until the host
        // reports it back on the used ring.
        _buffer_chains[desc_head] = std::move(bc);
        _avail._shared->_ring[masked(_avail._head++)] = desc_head;
        _avail._avail_added_since_kick++;
    }
    // Release store pairs with the host's acquire of _idx: all descriptor
    // writes above become visible before the new index does.
    _avail._shared->_idx.store(_avail._head, std::memory_order_release);
    kick();
}
441 | ||
442 | template <typename BufferChain, typename Completion> | |
443 | bool vring<BufferChain, Completion>::do_complete() { | |
444 | auto used_head = _used._shared->_idx.load(std::memory_order_acquire); | |
445 | auto count = _used._tail - used_head; | |
446 | _complete.bunch(count); | |
447 | while (used_head != _used._tail) { | |
448 | auto ue = _used._shared->_used_elements[masked(_used._tail++)]; | |
449 | _complete(std::move(_buffer_chains[ue._id]), ue._len); | |
450 | auto id = ue._id; | |
451 | if (_free_last != -1) { | |
452 | _descs[_free_last]._next = id; | |
453 | } else { | |
454 | _free_head = id; | |
455 | } | |
456 | while (true) { | |
457 | auto& d = _descs[id]; | |
458 | if (!d._flags.has_next) { | |
459 | break; | |
460 | } | |
461 | id = d._next; | |
462 | } | |
463 | _free_last = id; | |
464 | } | |
465 | return count; | |
466 | } | |
467 | ||
// One virtio queue pair (TX ring + RX ring) of the device.
class qp : public net::qp {
protected:
    // The virtio-net header prepended to every packet on the wire.
    struct net_hdr {
        uint8_t needs_csum : 1;
        uint8_t flags_reserved : 7;
        enum { gso_none = 0, gso_tcpv4 = 1, gso_udp = 3, gso_tcpv6 = 4, gso_ecn = 0x80 };
        uint8_t gso_type;
        uint16_t hdr_len;
        uint16_t gso_size;
        uint16_t csum_start;
        uint16_t csum_offset;
    };
    // Extended header used when VIRTIO_NET_F_MRG_RXBUF was negotiated.
    struct net_hdr_mrg : net_hdr {
        uint16_t num_buffers;
    };
    // Transmit side: turns outgoing packets into descriptor chains.
    class txq {
        // Map one packet fragment to a read-only physically-addressed buffer.
        static buffer fragment_to_buffer(fragment f) {
            buffer b;
            b.addr = virt_to_phys(f.base);
            b.len = f.size;
            b.writeable = false;
            return b;
        };
        // Adapts a packet to the vring's buffer-chain concept: iterating it
        // yields one 'buffer' per fragment.
        struct packet_as_buffer_chain {
            packet p;
            auto begin() {
                return make_transform_iterator(p.fragments().begin(), fragment_to_buffer);
            }
            auto end() {
                return make_transform_iterator(p.fragments().end(), fragment_to_buffer);
            }
        };
        // Completion: destroys the sent packet and returns its descriptors.
        struct complete {
            txq& q;
            void operator()(packet_as_buffer_chain&& bc, size_t len) {
                // move the packet here, to be destroyed on scope exit
                auto p = std::move(bc.p);
                q._ring.available_descriptors().signal(p.nr_frags());
            }
            void bunch(uint64_t c) {}
        };
        qp& _dev;
        vring<packet_as_buffer_chain, complete> _ring;
        // Staging area reused by post() for the chains of one batch.
        std::vector<packet_as_buffer_chain> _packets;
    public:
        txq(qp& dev, ring_config config);
        void set_notifier(std::unique_ptr<notifier> notifier) {
            _ring.set_notifier(std::move(notifier));
        }
        const ring_config& getconfig() {
            return _ring.getconfig();
        }
        void wake_notifier_wait() {
            _ring.wake_notifier_wait();
        }
        uint32_t post(circular_buffer<packet>& p);
    };
    // Receive side: keeps the ring stocked with 4K buffers and reassembles
    // (possibly merged) host buffers into packets.
    class rxq  {
        // A ring buffer together with ownership of its heap allocation.
        struct buffer_and_virt : buffer {
            std::unique_ptr<char[], free_deleter> buf;
        };
        using single_buffer = std::array<buffer_and_virt, 1>;
        struct complete {
            rxq& q;
            void operator()(single_buffer&& bc, size_t len) {
                q.complete_buffer(std::move(bc), len);
            }
            void bunch(uint64_t c) {
                q.update_rx_count(c);
            }
        };
        qp& _dev;
        vring<single_buffer, complete> _ring;
        // Buffers still expected for the packet currently being reassembled.
        unsigned _remaining_buffers = 0;
        std::vector<fragment> _fragments;
        std::vector<std::unique_ptr<char[], free_deleter>> _buffers;
    public:
        rxq(qp& _if, ring_config config);
        void set_notifier(std::unique_ptr<notifier> notifier) {
            _ring.set_notifier(std::move(notifier));
        }
        const ring_config& getconfig() {
            return _ring.getconfig();
        }
        // Start the endless buffer-refill loop.
        void run() {
            // FIXME: future is discarded
            // At least catch errors and warn about them.
            (void)keep_doing([this] { return prepare_buffers(); });
        }
        void wake_notifier_wait() {
            _ring.wake_notifier_wait();
        }
        void update_rx_count(uint64_t c) {
            _dev._stats.rx.good.update_pkts_bunch(c);
        }
    private:
        future<> prepare_buffers();
        void complete_buffer(single_buffer&& b, size_t len);
        void debug_mode_adjust_fragments();
    };
protected:
    device* _dev;
    // Length of the virtio-net header actually in use: sizeof(net_hdr_mrg)
    // when mergeable RX buffers were negotiated, sizeof(net_hdr) otherwise.
    size_t _header_len;
    std::unique_ptr<char[], free_deleter> _txq_storage;
    std::unique_ptr<char[], free_deleter> _rxq_storage;
    txq _txq;
    rxq _rxq;
protected:
    ring_config txq_config(size_t txq_ring_size);
    ring_config rxq_config(size_t rxq_ring_size);
    void common_config(ring_config& r);
    size_t vring_storage_size(size_t ring_size);
public:
    explicit qp(device* dev, size_t rx_ring_size, size_t tx_ring_size);
    // Single-packet send is unsupported; the batched overload below is used.
    virtual future<> send(packet p) override {
        abort();
    }
    virtual uint32_t send(circular_buffer<packet>& p) override;
    virtual void rx_start() override;
    friend class rxq;
};
589 | ||
// Bind the TX ring to its queue pair; completions release descriptors back
// to the ring's semaphore.
qp::txq::txq(qp& dev, ring_config config)
    : _dev(dev), _ring(config, complete{*this}) {
}
593 | ||
// Drain as many packets from pb as the TX ring has room for, prepending a
// virtio-net header (with checksum/TSO/UFO offload hints) to each, and post
// them all in one batch. Returns the number of packets queued.
uint32_t
qp::txq::post(circular_buffer<packet>& pb) {
    uint64_t bytes = 0, nr_frags = 0;

    _packets.clear();

    // +1 accounts for the extra fragment the prepended header adds.
    while (!pb.empty() && pb.front().nr_frags() + 1 <= _ring.available_descriptors().current()) {
        net_hdr_mrg vhdr = {};
        auto p = std::move(pb.front());

        bytes += p.len();
        nr_frags += p.nr_frags();

        pb.pop_front();
        // Handle TCP checksum offload
        auto oi = p.offload_info();
        if (_dev._dev->hw_features().tx_csum_l4_offload) {
            auto eth_hdr_len = sizeof(eth_hdr);
            auto ip_hdr_len = oi.ip_hdr_len;
            auto mtu = _dev._dev->hw_features().mtu;
            if (oi.protocol == ip_protocol_num::tcp) {
                auto tcp_hdr_len = oi.tcp_hdr_len;
                if (oi.needs_csum) {
                    vhdr.needs_csum = 1;
                    vhdr.csum_start = eth_hdr_len + ip_hdr_len;
                    // TCP checksum filed's offset within the TCP header is 16 bytes
                    vhdr.csum_offset = 16;
                }
                if (oi.tso_seg_size) {
                    // IPv4 TCP TSO
                    vhdr.gso_type = net_hdr::gso_tcpv4;
                    // Sum of Ethernet, IP and TCP header size
                    vhdr.hdr_len = eth_hdr_len + ip_hdr_len + tcp_hdr_len;
                    // Maximum segment size of packet after the offload
                    vhdr.gso_size = oi.tso_seg_size;
                }
            } else if (oi.protocol == ip_protocol_num::udp) {
                auto udp_hdr_len = oi.udp_hdr_len;
                if (oi.needs_csum) {
                    vhdr.needs_csum = 1;
                    vhdr.csum_start = eth_hdr_len + ip_hdr_len;
                    // UDP checksum filed's offset within the UDP header is 6 bytes
                    vhdr.csum_offset = 6;
                }
                if (_dev._dev->hw_features().tx_ufo && p.len() > mtu + eth_hdr_len) {
                    vhdr.gso_type = net_hdr::gso_udp;
                    vhdr.hdr_len = eth_hdr_len + ip_hdr_len + udp_hdr_len;
                    vhdr.gso_size = mtu - ip_hdr_len - udp_hdr_len;
                }
            }
        }
        // prepend virtio-net header
        // NOTE(review): vhdr is a stack temporary; this relies on the
        // packet(fragment, packet&&) constructor copying the fragment's
        // bytes into storage owned by the packet — confirm.
        packet q = packet(fragment{reinterpret_cast<char*>(&vhdr), _dev._header_len},
                std::move(p));
        // The loop condition guaranteed enough free descriptors, so this
        // wait is already satisfied.
        auto fut = _ring.available_descriptors().wait(q.nr_frags());
        assert(fut.available()); // how it cannot?
        _packets.emplace_back(packet_as_buffer_chain{ std::move(q) });
    }
    _ring.post(_packets.begin(), _packets.end());

    _dev._stats.tx.good.update_frags_stats(nr_frags, bytes);

    return _packets.size();
}
658 | ||
// Bind the RX ring to its queue pair; completions feed complete_buffer().
qp::rxq::rxq(qp& dev, ring_config config)
    : _dev(dev), _ring(config, complete{*this}) {
}
662 | ||
// Refill the RX ring: wait for at least one free descriptor, then
// opportunistically grab everything else that is currently free so the
// refill happens in one batch.
future<>
qp::rxq::prepare_buffers() {
    auto& available = _ring.available_descriptors();
    return available.wait(1).then([this, &available] {
        unsigned count = 1;
        auto opportunistic = available.current();
        if (available.try_wait(opportunistic)) {
            count += opportunistic;
        }
        // Each chain is one writeable 4K buffer; ownership of the heap
        // allocation travels with the chain (buffer_and_virt::buf).
        // NOTE(review): the malloc() result is not null-checked here.
        auto make_buffer_chain = [] {
            single_buffer bc;
            std::unique_ptr<char[], free_deleter> buf(reinterpret_cast<char*>(malloc(4096)));
            buffer_and_virt& b = bc[0];
            b.addr = virt_to_phys(buf.get());
            b.len = 4096;
            b.writeable = true;
            b.buf = std::move(buf);
            return bc;
        };
        // Generate 'count' fresh chains on the fly while posting.
        auto start = make_function_input_iterator(make_buffer_chain, 0U);
        auto finish = make_function_input_iterator(make_buffer_chain, count);
        _ring.post(start, finish);
    });
}
687 | ||
// Debug-build aid: copy the final fragment into an exactly-sized
// allocation so reads past the fragment's end trip allocator checks.
// No-op in release builds.
void
qp::rxq::debug_mode_adjust_fragments() {
#ifdef SEASTAR_DEBUG
    // For debug mode, reallocate last fragment to detect buffer overruns
    auto last = _fragments.back();
    auto sz = last.size;
    std::unique_ptr<char[], free_deleter> buf(reinterpret_cast<char*>(malloc(sz)));
    if (!buf) {
        throw std::bad_alloc();
    }
    std::copy_n(last.base, sz, buf.get());
    _fragments.back() = { buf.get(), sz };
    _buffers.back() = std::move(buf);
#endif
}
703 | ||
// Handle one used RX buffer. With mergeable buffers a packet may span
// several ring buffers; the first buffer's virtio-net header says how many.
// Fragments are accumulated until the packet is complete, then handed to
// the stack and the descriptors are released.
void
qp::rxq::complete_buffer(single_buffer&& bc, size_t len) {
    auto&& sb = bc[0];
    auto&& buf = sb.buf;
    auto frag_buf = buf.get();
    auto frag_len = len;
    // First buffer
    if (_remaining_buffers == 0) {
        // Start of a new packet: read the merged-buffer count, then strip
        // the virtio-net header from this first fragment.
        auto hdr = reinterpret_cast<net_hdr_mrg*>(frag_buf);
        assert(hdr->num_buffers >= 1);
        _remaining_buffers = hdr->num_buffers;
        frag_buf += _dev._header_len;
        frag_len -= _dev._header_len;
        _fragments.clear();
        _buffers.clear();
    };

    // Append current buffer
    _fragments.emplace_back(fragment{frag_buf, frag_len});
    _buffers.push_back(std::move(buf));
    _remaining_buffers--;

    // Last buffer
    if (_remaining_buffers == 0) {
        debug_mode_adjust_fragments();
        deleter del;
        if (_buffers.size() == 1) {
            // Single-fragment fast path: hand the raw allocation to the
            // packet without the vector-of-unique_ptr wrapper.
            del = make_free_deleter(_buffers[0].release());
            _buffers.clear();
        } else {
            del = make_object_deleter(std::move(_buffers));
        }
        packet p(_fragments.begin(), _fragments.end(), std::move(del));

        _dev._stats.rx.good.update_frags_stats(p.nr_frags(), p.len());

        _dev._dev->l2receive(std::move(p));

        // The whole chain's descriptors are free again.
        _ring.available_descriptors().signal(_fragments.size());
    }
}
746 | ||
747 | // Allocate and zero-initialize a buffer which is page-aligned and can be | |
748 | // used for virt_to_phys (i.e., physically contiguous). | |
749 | static std::unique_ptr<char[], free_deleter> virtio_buffer(size_t size) { | |
750 | void* ret; | |
751 | auto r = posix_memalign(&ret, 4096, size); | |
752 | assert(r == 0); | |
753 | bzero(ret, size); | |
754 | return std::unique_ptr<char[], free_deleter>(reinterpret_cast<char*>(ret)); | |
755 | } | |
756 | ||
// Construct the queue pair. Ring storage must be allocated before the rings
// themselves: txq/rxq construction consumes the *_config() layouts built
// over that storage (member order guarantees this).
qp::qp(device* dev, size_t rx_ring_size, size_t tx_ring_size)
    : _dev(dev)
    , _txq_storage(virtio_buffer(vring_storage_size(tx_ring_size)))
    , _rxq_storage(virtio_buffer(vring_storage_size(rx_ring_size)))
    , _txq(*this, txq_config(tx_ring_size))
    , _rxq(*this, rxq_config(rx_ring_size)) {
}
764 | ||
// Bytes of backing storage needed for one ring of 'ring_size' entries.
size_t qp::vring_storage_size(size_t ring_size) {
    // overestimate, but not by much.
    // Per entry: 16 bytes (descriptor) + 2 (avail ring slot) + 8
    // (used_elem); plus up to 3 pages of slack for the page alignment
    // done in common_config().
    return 3 * 4096 + ring_size * (16 + 2 + 8);
}
769 | ||
// Lay out the avail and used rings behind the descriptor table and fill in
// the negotiated ring options shared by rx and tx.
void qp::common_config(ring_config& r) {
    // Avail ring follows the 16-byte-per-entry descriptor table; the used
    // ring starts on the next page boundary after the avail ring (2 bytes
    // per slot plus 6 for the flags/idx/used_event fields).
    r.avail = r.descs + 16 * r.size;
    r.used = align_up(r.avail + 2 * r.size + 6, 4096);
    r.event_index = (_dev->features() & VIRTIO_RING_F_EVENT_IDX) != 0;
    r.indirect = false;
}
776 | ||
777 | ring_config qp::txq_config(size_t tx_ring_size) { | |
778 | ring_config r; | |
779 | r.size = tx_ring_size; | |
780 | r.descs = _txq_storage.get(); | |
781 | r.mergable_buffers = false; | |
782 | common_config(r); | |
783 | return r; | |
784 | } | |
785 | ||
786 | ring_config qp::rxq_config(size_t rx_ring_size) { | |
787 | ring_config r; | |
788 | r.size = rx_ring_size; | |
789 | r.descs = _rxq_storage.get(); | |
790 | r.mergable_buffers = true; | |
791 | common_config(r); | |
792 | return r; | |
793 | } | |
794 | ||
// Start receiving: kicks off the RX ring's endless refill loop.
void
qp::rx_start() {
    _rxq.run();
}
799 | ||
// Batched send; returns the number of packets actually queued on the ring.
uint32_t
qp::send(circular_buffer<packet>& p) {
    return _txq.post(p);
}
804 | ||
// qp implementation backed by the Linux vhost-net kernel module attached to
// a tap device.
class qp_vhost : public qp {
private:
    // The vhost file descriptor needs to remain open throughout the life of
    // this driver, as as soon as we close it, vhost stops servicing us.
    file_desc _vhost_fd;
public:
    qp_vhost(device* dev, boost::program_options::variables_map opts);
};
813 | ||
814 | static size_t config_ring_size(boost::program_options::variables_map &opts) { | |
815 | if (opts.count("event-index")) { | |
816 | return opts["virtio-ring-size"].as<unsigned>(); | |
817 | } else { | |
818 | return 256; | |
819 | } | |
820 | } | |
821 | ||
// Set up the whole vhost-net datapath: negotiate features, configure the
// tap backend with matching offloads, describe our memory and both rings to
// vhost, wire up the kick eventfds, and finally attach the backend.
qp_vhost::qp_vhost(device *dev, boost::program_options::variables_map opts)
    : qp(dev, config_ring_size(opts), config_ring_size(opts))
    , _vhost_fd(file_desc::open("/dev/vhost-net", O_RDWR))
{
    auto tap_device = opts["tap-device"].as<std::string>();
    int64_t vhost_supported_features;
    _vhost_fd.ioctl(VHOST_GET_FEATURES, vhost_supported_features);
    // Negotiate: keep only the features both vhost and we support.
    vhost_supported_features &= _dev->features();
    _vhost_fd.ioctl(VHOST_SET_FEATURES, vhost_supported_features);
    if (vhost_supported_features & VIRTIO_NET_F_MRG_RXBUF) {
        _header_len = sizeof(net_hdr_mrg);
    } else {
        _header_len = sizeof(net_hdr);
    }

    // Open and set up the tap device, which we'll tell vhost to use.
    // Note that the tap_fd we open here will be closed at the end of
    // this function. It appears that this is fine - i.e., after we pass
    // this fd to VHOST_NET_SET_BACKEND, the Linux kernel keeps the reference
    // to it and it's fine to close the file descriptor.
    file_desc tap_fd(file_desc::open("/dev/net/tun", O_RDWR | O_NONBLOCK));
    assert(tap_device.size() + 1 <= IFNAMSIZ);
    ifreq ifr = {};
    ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR;
    strcpy(ifr.ifr_ifrn.ifrn_name, tap_device.c_str());
    tap_fd.ioctl(TUNSETIFF, ifr);
    // Mirror our negotiated offloads onto the tap device.
    unsigned int offload = 0;
    auto hw_features = _dev->hw_features();
    if (hw_features.tx_csum_l4_offload && hw_features.rx_csum_offload) {
        offload = TUN_F_CSUM;
        if (hw_features.tx_tso) {
            offload |= TUN_F_TSO4;
        }
        if (hw_features.tx_ufo) {
            offload |= TUN_F_UFO;
        }
    }
    tap_fd.ioctl(TUNSETOFFLOAD, offload);
    tap_fd.ioctl(TUNSETVNETHDRSZ, _header_len);

    // Additional vhost setup:
    _vhost_fd.ioctl(VHOST_SET_OWNER);
    // Declare an identity "guest physical" mapping covering the lower
    // 47-bit user address space, so virt_to_phys() can simply return the
    // virtual address.
    auto mem_table = make_struct_with_vla(&vhost_memory::regions, 1);
    mem_table->nregions = 1;
    auto& region = mem_table->regions[0];
    region.guest_phys_addr = 0;
    region.memory_size = (size_t(1) << 47) - 4096;
    region.userspace_addr = 0;
    region.flags_padding = 0;
    _vhost_fd.ioctl(VHOST_SET_MEM_TABLE, *mem_table);
    // Ring sizes and addresses: queue 0 is rx, queue 1 is tx.
    vhost_vring_state vvs0 = { 0, _rxq.getconfig().size };
    _vhost_fd.ioctl(VHOST_SET_VRING_NUM, vvs0);
    vhost_vring_state vvs1 = { 1, _txq.getconfig().size };
    _vhost_fd.ioctl(VHOST_SET_VRING_NUM, vvs1);
    auto tov = [](char* x) { return reinterpret_cast<uintptr_t>(x); };

    _vhost_fd.ioctl(VHOST_SET_VRING_ADDR, vhost_vring_addr{
        0, 0, tov(_rxq.getconfig().descs), tov(_rxq.getconfig().used),
        tov(_rxq.getconfig().avail), 0
    });
    _vhost_fd.ioctl(VHOST_SET_VRING_ADDR, vhost_vring_addr{
        1, 0, tov(_txq.getconfig().descs), tov(_txq.getconfig().used),
        tov(_txq.getconfig().avail), 0
    });

    // Eventfds for kicks (guest->host) and notifications (host->guest).
    // NOTE(review): the *_notify eventfds are function-local, so their fds
    // are closed when this constructor returns (the kernel keeps its own
    // reference from VHOST_SET_VRING_CALL); nothing here polls them —
    // confirm that host->guest interrupts are intentionally unused.
    readable_eventfd _txq_notify;
    writeable_eventfd _txq_kick;
    readable_eventfd _rxq_notify;
    writeable_eventfd _rxq_kick;
    _vhost_fd.ioctl(VHOST_SET_VRING_KICK, vhost_vring_file{0, _rxq_kick.get_read_fd()});
    _vhost_fd.ioctl(VHOST_SET_VRING_CALL, vhost_vring_file{0, _rxq_notify.get_write_fd()});
    _vhost_fd.ioctl(VHOST_SET_VRING_KICK, vhost_vring_file{1, _txq_kick.get_read_fd()});
    _vhost_fd.ioctl(VHOST_SET_VRING_CALL, vhost_vring_file{1, _txq_notify.get_write_fd()});
    _rxq.set_notifier(std::make_unique<notifier_vhost>(std::move(_rxq_kick)));
    _txq.set_notifier(std::make_unique<notifier_vhost>(std::move(_txq_kick)));

    // Finally attach both queues to the tap backend.
    _vhost_fd.ioctl(VHOST_NET_SET_BACKEND, vhost_vring_file{0, tap_fd.get()});
    _vhost_fd.ioctl(VHOST_NET_SET_BACKEND, vhost_vring_file{1, tap_fd.get()});
}
901 | ||
902 | #ifdef HAVE_OSV | |
// Queue-pair implementation used when running under OSv with a host virtio
// network device assigned directly to this application (bypassing vhost).
// Construction performs the full virtio bring-up; see the constructor below.
class qp_osv : public qp {
private:
    // MAC address read from the device's config space during construction
    // (requires VIRTIO_NET_F_MAC — asserted in the constructor).
    ethernet_address _mac;
    // The assigned host device. Held by reference: it must outlive this qp.
    osv::assigned_virtio &_virtio;
public:
    qp_osv(device *dev, osv::assigned_virtio &virtio,
            boost::program_options::variables_map opts);
};
911 | ||
// Bring up an assigned virtio net device under OSv.
//
// The steps below follow the virtio initialization protocol and are
// order-sensitive: negotiate features, read the device config, install
// notifiers, publish the ring addresses, enable interrupts, and only then
// signal DRIVER_OK. Do not reorder.
//
// @param dev     owning device (also reachable as _dev via the qp base)
// @param virtio  the host device OSv assigned to us; must outlive this qp
// @param opts    command-line options (currently unused here)
qp_osv::qp_osv(device *dev, osv::assigned_virtio &virtio,
        boost::program_options::variables_map opts)
    : qp(dev, virtio.queue_size(0), virtio.queue_size(1))
    , _virtio(virtio)
{
    // Read the host's virtio supported feature bitmask, AND it with the
    // features we want to use, and tell the host of the result:
    uint32_t subset = _virtio.init_features(_dev->features());
    // Mergeable RX buffers change the per-packet header layout, so the
    // header length depends on the negotiated feature set.
    if (subset & VIRTIO_NET_F_MRG_RXBUF) {
        _header_len = sizeof(net_hdr_mrg);
    } else {
        _header_len = sizeof(net_hdr);
    }

    // TODO: save bits from "subset" in _hw_features?
    // bool _mergeable_bufs = subset & VIRTIO_NET_F_MRG_RXBUF;
    // bool _status = subset & VIRTIO_NET_F_STATUS;
    // bool _tso_ecn = subset & VIRTIO_NET_F_GUEST_ECN;
    // bool _host_tso_ecn = subset & VIRTIO_NET_F_HOST_ECN;
    // bool _csum = subset & VIRTIO_NET_F_CSUM;
    // bool _guest_csum = subset & VIRTIO_NET_F_GUEST_CSUM;
    // bool _guest_tso4 = subset & VIRTIO_NET_F_GUEST_TSO4;
    // bool _host_tso4 = subset & VIRTIO_NET_F_HOST_TSO4;
    // bool _guest_ufo = subset & VIRTIO_NET_F_GUEST_UFO;

    // Get the MAC address set by the host. We rely on the host providing
    // one; without VIRTIO_NET_F_MAC the config layout below would be wrong.
    assert(subset & VIRTIO_NET_F_MAC);
    // Local mirror of the device's config space layout (packed to match
    // the wire/device layout exactly).
    struct net_config {
        /* The config field defining the mac address (if VIRTIO_NET_F_MAC) */
        uint8_t mac[6];
        /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* */
        uint16_t status;
        /* Maximum number of each of transmit and receive queues;
         * see VIRTIO_NET_F_MQ and VIRTIO_NET_CTRL_MQ.
         * Legal values are between 1 and 0x8000
         */
        uint16_t max_virtqueue_pairs;
    } __attribute__((packed)) host_config;
    _virtio.conf_read(&host_config, sizeof(host_config));
    _mac = { host_config.mac[0], host_config.mac[1], host_config.mac[2],
            host_config.mac[3], host_config.mac[4], host_config.mac[5] };

    // Setup notifiers (queue 0 = RX, queue 1 = TX)
    _rxq.set_notifier(std::make_unique<notifier_osv>(_virtio, 0));
    _txq.set_notifier(std::make_unique<notifier_osv>(_virtio, 1));


    // Tell the host where we put the rings (we already allocated them earlier)
    _virtio.set_queue_pfn(
            0, virt_to_phys(_rxq.getconfig().descs));
    _virtio.set_queue_pfn(
            1, virt_to_phys(_txq.getconfig().descs));

    // Set up interrupts. The [&] captures reach only `this` (member access),
    // which outlives the handlers, so no dangling captures here.
    // FIXME: in OSv, the first thing we do in the handler is to call
    // _rqx.disable_interrupts(). Here in seastar, we only do it much later
    // in the main engine(). Probably needs to do it like in osv - in the beginning of the handler.
    _virtio.enable_interrupt(
            0, [&] { _rxq.wake_notifier_wait(); } );
    _virtio.enable_interrupt(
            1, [&] { _txq.wake_notifier_wait(); } );

    // Final handshake step: tell the host the driver is fully initialized.
    _virtio.set_driver_ok();
}
976 | #endif | |
977 | ||
978 | std::unique_ptr<net::qp> device::init_local_queue(boost::program_options::variables_map opts, uint16_t qid) { | |
979 | static bool called = false; | |
980 | assert(!qid); | |
981 | assert(!called); | |
982 | called = true; | |
983 | ||
984 | #ifdef HAVE_OSV | |
985 | if (osv::assigned_virtio::get && osv::assigned_virtio::get()) { | |
986 | std::cout << "In OSv and assigned host's virtio device\n"; | |
987 | return std::make_unique<qp_osv>(this, *osv::assigned_virtio::get(), opts); | |
988 | } | |
989 | #endif | |
990 | return std::make_unique<qp_vhost>(this, opts); | |
991 | } | |
992 | ||
993 | } | |
994 | ||
995 | boost::program_options::options_description | |
996 | get_virtio_net_options_description() | |
997 | { | |
998 | boost::program_options::options_description opts( | |
999 | "Virtio net options"); | |
1000 | opts.add_options() | |
1001 | ("event-index", | |
1002 | boost::program_options::value<std::string>()->default_value("on"), | |
1003 | "Enable event-index feature (on / off)") | |
1004 | ("csum-offload", | |
1005 | boost::program_options::value<std::string>()->default_value("on"), | |
1006 | "Enable checksum offload feature (on / off)") | |
1007 | ("tso", | |
1008 | boost::program_options::value<std::string>()->default_value("on"), | |
1009 | "Enable TCP segment offload feature (on / off)") | |
1010 | ("ufo", | |
1011 | boost::program_options::value<std::string>()->default_value("on"), | |
1012 | "Enable UDP fragmentation offload feature (on / off)") | |
1013 | ("virtio-ring-size", | |
1014 | boost::program_options::value<unsigned>()->default_value(256), | |
1015 | "Virtio ring size (must be power-of-two)") | |
1016 | ; | |
1017 | return opts; | |
1018 | } | |
1019 | ||
1020 | std::unique_ptr<net::device> create_virtio_net_device(boost::program_options::variables_map opts) { | |
1021 | return std::make_unique<virtio::device>(opts); | |
1022 | } | |
1023 | ||
1024 | } | |
1025 | ||
// Emit an ELF note (.note.osv-mlock) that locks the shared object in memory
// and forces on-load function resolution. Needed if the function passed to
// enable_interrupt() is run at interrupt time, where a page fault or lazy
// PLT resolution would be fatal.
// NOTE(review): the note section is OSv-specific — on other hosts it is
// presumably ignored; confirm against OSv's loader.
// TODO: Instead of doing this, _virtio.enable_interrupt() could take a
// pollable to wake instead of a function, then this won't be needed.
asm(".pushsection .note.osv-mlock, \"a\"; .long 0, 0, 0; .popsection");