/* BGP I/O.
 * Implements packet I/O in a pthread.
 * Copyright (C) 2017 Cumulus Networks
 * Quentin Young
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; see the file COPYING; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
 * MA 02110-1301 USA
 */

/* clang-format off */
#include <zebra.h>
#include <pthread.h> // for pthread_mutex_unlock, pthread_mutex_lock
#include <sys/uio.h> // for writev

#include "frr_pthread.h"
#include "linklist.h" // for list_delete, list_delete_all_node, lis...
#include "log.h" // for zlog_debug, safe_strerror, zlog_err
#include "memory.h" // for MTYPE_TMP, XCALLOC, XFREE
#include "network.h" // for ERRNO_IO_RETRY
#include "stream.h" // for stream_get_endp, stream_getw_from, str...
#include "ringbuf.h" // for ringbuf_remain, ringbuf_peek, ringbuf_...
#include "thread.h" // for THREAD_OFF, THREAD_ARG, thread, thread...
#include "zassert.h" // for assert

#include "bgpd/bgp_io.h"
#include "bgpd/bgp_debug.h" // for bgp_debug_neighbor_events, bgp_type_str
#include "bgpd/bgp_errors.h" // for expanded error reference information
#include "bgpd/bgp_fsm.h" // for BGP_EVENT_ADD, bgp_event
#include "bgpd/bgp_packet.h" // for bgp_notify_send_with_data, bgp_notify...
#include "bgpd/bgpd.h" // for peer, BGP_MARKER_SIZE, bgp_master, bm
/* clang-format on */

/* forward declarations */
static uint16_t bgp_write(struct peer *);
static uint16_t bgp_read(struct peer *);
static int bgp_process_writes(struct thread *);
static int bgp_process_reads(struct thread *);
static bool validate_header(struct peer *);

/* generic i/o status codes */
#define BGP_IO_TRANS_ERR (1 << 0) // EAGAIN or similar occurred
#define BGP_IO_FATAL_ERR (1 << 1) // some kind of fatal TCP error

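/*
 * Minimal usage sketch (illustrative only, mirroring the callers below):
 * bgp_read() and bgp_write() OR these bits into a uint16_t status word,
 * which the bgp_process_* functions then test with CHECK_FLAG:
 *
 *     uint16_t status = bgp_write(peer);
 *
 *     if (CHECK_FLAG(status, BGP_IO_FATAL_ERR))
 *             ... tear down the session ...
 *     else if (CHECK_FLAG(status, BGP_IO_TRANS_ERR))
 *             ... transient; retry when the fd is ready again ...
 */
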
/* Thread external API ----------------------------------------------------- */

void bgp_writes_on(struct peer *peer)
{
	struct frr_pthread *fpt = bgp_pth_io;
	assert(fpt->running);

	assert(peer->status != Deleted);
	assert(peer->obuf);
	assert(peer->ibuf);
	assert(peer->ibuf_work);
	assert(!peer->t_connect_check_r);
	assert(!peer->t_connect_check_w);
	assert(peer->fd);

	thread_add_write(fpt->master, bgp_process_writes, peer, peer->fd,
			 &peer->t_write);
	SET_FLAG(peer->thread_flags, PEER_THREAD_WRITES_ON);
}

void bgp_writes_off(struct peer *peer)
{
	struct frr_pthread *fpt = bgp_pth_io;
	assert(fpt->running);

	thread_cancel_async(fpt->master, &peer->t_write, NULL);
	THREAD_OFF(peer->t_generate_updgrp_packets);

	UNSET_FLAG(peer->thread_flags, PEER_THREAD_WRITES_ON);
}

void bgp_reads_on(struct peer *peer)
{
	struct frr_pthread *fpt = bgp_pth_io;
	assert(fpt->running);

	assert(peer->status != Deleted);
	assert(peer->ibuf);
	assert(peer->fd);
	assert(peer->ibuf_work);
	assert(peer->obuf);
	assert(!peer->t_connect_check_r);
	assert(!peer->t_connect_check_w);

	thread_add_read(fpt->master, bgp_process_reads, peer, peer->fd,
			&peer->t_read);

	SET_FLAG(peer->thread_flags, PEER_THREAD_READS_ON);
}

void bgp_reads_off(struct peer *peer)
{
	struct frr_pthread *fpt = bgp_pth_io;
	assert(fpt->running);

	thread_cancel_async(fpt->master, &peer->t_read, NULL);
	THREAD_OFF(peer->t_process_packet);

	UNSET_FLAG(peer->thread_flags, PEER_THREAD_READS_ON);
}

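/*
 * Usage sketch (illustrative; the actual callers live in the FSM code):
 * both directions are typically switched on once the TCP connection is
 * established and switched off again on session teardown:
 *
 *     bgp_reads_on(peer);
 *     bgp_writes_on(peer);
 *     ...
 *     bgp_writes_off(peer);
 *     bgp_reads_off(peer);
 *
 * The *_off functions cancel the I/O pthread's tasks with
 * thread_cancel_async() and clear the corresponding PEER_THREAD_* flag.
 */
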
/* Thread internal functions ----------------------------------------------- */

/*
 * Called from I/O pthread when a file descriptor has become ready for writing.
 */
static int bgp_process_writes(struct thread *thread)
{
	struct peer *peer = THREAD_ARG(thread);
	uint16_t status;
	bool reschedule;
	bool fatal = false;

	if (peer->fd < 0)
		return -1;

	struct frr_pthread *fpt = bgp_pth_io;

	frr_with_mutex(&peer->io_mtx) {
		status = bgp_write(peer);
		reschedule = (stream_fifo_head(peer->obuf) != NULL);
	}

	/* no problem; the remaining data is simply written on a later pass */
	if (CHECK_FLAG(status, BGP_IO_TRANS_ERR)) {
	}

	/* problem */
	if (CHECK_FLAG(status, BGP_IO_FATAL_ERR)) {
		reschedule = false;
		fatal = true;
	}

	if (reschedule) {
		thread_add_write(fpt->master, bgp_process_writes, peer,
				 peer->fd, &peer->t_write);
	} else if (!fatal) {
		BGP_TIMER_ON(peer->t_generate_updgrp_packets,
			     bgp_generate_updgrp_packets, 0);
	}

	return 0;
}


/*
 * Called from I/O pthread when a file descriptor has become ready for reading,
 * or has hung up.
 *
 * We read as much data as possible, process as many packets as we can and
 * place them on peer->ibuf for secondary processing by the main thread.
 */
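/*
 * For reference, the BGP message header this loop slices on (RFC 4271,
 * section 4.1) is BGP_HEADER_SIZE (19) octets:
 *
 *     octets  0..15   marker, all 0xff (checked by validate_header())
 *     octets 16..17   length in network byte order, header included
 *     octet  18       message type
 */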
static int bgp_process_reads(struct thread *thread)
{
	/* clang-format off */
	struct peer *peer;      // peer to read from
	uint16_t status;        // bgp_read status code
	bool more = true;       // whether we got more data
	bool fatal = false;     // whether fatal error occurred
	bool added_pkt = false; // whether we pushed onto ->ibuf
	/* clang-format on */

	peer = THREAD_ARG(thread);

	if (peer->fd < 0 || bm->terminating)
		return -1;

	struct frr_pthread *fpt = bgp_pth_io;

	frr_with_mutex(&peer->io_mtx) {
		status = bgp_read(peer);
	}

	/* error checking phase */
	if (CHECK_FLAG(status, BGP_IO_TRANS_ERR)) {
		/* no problem; just don't process packets */
		more = false;
	}

	if (CHECK_FLAG(status, BGP_IO_FATAL_ERR)) {
		/* problem; tear down session */
		more = false;
		fatal = true;
	}

	while (more) {
		/* static buffer for transferring packets */
		static unsigned char pktbuf[BGP_MAX_PACKET_SIZE];
		/* shorter alias to peer's input buffer */
		struct ringbuf *ibw = peer->ibuf_work;
		/* packet size as given by header */
		uint16_t pktsize = 0;

		/* check that we have enough data for a header */
		if (ringbuf_remain(ibw) < BGP_HEADER_SIZE)
			break;

		/* check that header is valid */
		if (!validate_header(peer)) {
			fatal = true;
			break;
		}

		/* header is valid; retrieve packet size */
		ringbuf_peek(ibw, BGP_MARKER_SIZE, &pktsize, sizeof(pktsize));

		pktsize = ntohs(pktsize);

		/* validate_header() bounds the length field, so if this fails
		 * we are seriously screwed */
		assert(pktsize <= BGP_MAX_PACKET_SIZE);

		/*
		 * If we have that much data, chuck it into its own
		 * stream and append to input queue for processing.
		 */
		if (ringbuf_remain(ibw) >= pktsize) {
			struct stream *pkt = stream_new(pktsize);
			assert(ringbuf_get(ibw, pktbuf, pktsize) == pktsize);
			stream_put(pkt, pktbuf, pktsize);

			frr_with_mutex(&peer->io_mtx) {
				stream_fifo_push(peer->ibuf, pkt);
			}

			added_pkt = true;
		} else
			break;
	}

	/* handle invalid header */
	if (fatal) {
		/* wipe buffer just in case someone screwed up */
		ringbuf_wipe(peer->ibuf_work);
	} else {
		assert(ringbuf_space(peer->ibuf_work) >= BGP_MAX_PACKET_SIZE);

		thread_add_read(fpt->master, bgp_process_reads, peer, peer->fd,
				&peer->t_read);
		if (added_pkt)
			thread_add_timer_msec(bm->master, bgp_process_packet,
					      peer, 0, &peer->t_process_packet);
	}

	return 0;
}


/*
 * Flush peer output buffer.
 *
 * This function pops packets off of peer->obuf and writes them to peer->fd.
 * The number of packets written is equal to the minimum of peer->wpkt_quanta
 * and the number of packets on the output buffer, unless an error occurs.
 *
 * If write() returns an error, the appropriate FSM event is generated.
 *
 * The return value is a status bitfield (see the i/o status codes at the
 * top of this file), not a packet count.
 */
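/*
 * Worked example of the partial-write bookkeeping below (illustrative
 * numbers): suppose three packets of 100, 200 and 300 bytes are gathered
 * into iov[] (writenum = 600) and writev() returns 250. The first iovec
 * (100 bytes) was written completely, so it is dropped from the front of
 * iov[] and streams[]; the remaining 150 bytes fall inside the second
 * packet, so that stream's getp is advanced by 150 and iov[0] is rebuilt
 * to point at its unwritten tail. The loop then calls writev() again with
 * writenum = 350.
 */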
static uint16_t bgp_write(struct peer *peer)
{
	uint8_t type;
	struct stream *s;
	int update_last_write = 0;
	unsigned int count;
	uint32_t uo = 0;
	uint16_t status = 0;
	uint32_t wpkt_quanta_old;

	int writenum = 0;
	int num;
	unsigned int iovsz;
	unsigned int strmsz;
	unsigned int total_written;

	wpkt_quanta_old = atomic_load_explicit(&peer->bgp->wpkt_quanta,
					       memory_order_relaxed);
	struct stream *ostreams[wpkt_quanta_old];
	struct stream **streams = ostreams;
	struct iovec iov[wpkt_quanta_old];

	s = stream_fifo_head(peer->obuf);

	if (!s)
		goto done;

	count = iovsz = 0;
	while (count < wpkt_quanta_old && iovsz < array_size(iov) && s) {
		ostreams[iovsz] = s;
		iov[iovsz].iov_base = stream_pnt(s);
		iov[iovsz].iov_len = STREAM_READABLE(s);
		writenum += STREAM_READABLE(s);
		s = s->next;
		++iovsz;
		++count;
	}

	strmsz = iovsz;
	total_written = 0;

	do {
		num = writev(peer->fd, iov, iovsz);

		if (num < 0) {
			if (!ERRNO_IO_RETRY(errno)) {
				BGP_EVENT_ADD(peer, TCP_fatal_error);
				SET_FLAG(status, BGP_IO_FATAL_ERR);
			} else {
				SET_FLAG(status, BGP_IO_TRANS_ERR);
			}

			break;
		} else if (num != writenum) {
			/* partial write: drop fully-written iovecs from the
			 * front, then advance into the first partial one */
			unsigned int msg_written = 0;
			unsigned int ic = iovsz;

			for (unsigned int i = 0; i < ic; i++) {
				size_t ss = iov[i].iov_len;

				if (ss > (unsigned int) num)
					break;

				msg_written++;
				iovsz--;
				writenum -= ss;
				num -= ss;
			}

			total_written += msg_written;

			assert(total_written < count);

			memmove(&iov, &iov[msg_written],
				sizeof(iov[0]) * iovsz);
			streams = &streams[msg_written];
			stream_forward_getp(streams[0], num);
			iov[0].iov_base = stream_pnt(streams[0]);
			iov[0].iov_len = STREAM_READABLE(streams[0]);

			writenum -= num;
			num = 0;
			assert(writenum > 0);
		} else {
			total_written = strmsz;
		}

	} while (num != writenum);

	/* Handle statistics */
	for (unsigned int i = 0; i < total_written; i++) {
		s = stream_fifo_pop(peer->obuf);

		assert(s == ostreams[i]);

		/* Retrieve BGP packet type. */
		stream_set_getp(s, BGP_MARKER_SIZE + 2);
		type = stream_getc(s);

		switch (type) {
		case BGP_MSG_OPEN:
			atomic_fetch_add_explicit(&peer->open_out, 1,
						  memory_order_relaxed);
			break;
		case BGP_MSG_UPDATE:
			atomic_fetch_add_explicit(&peer->update_out, 1,
						  memory_order_relaxed);
			uo++;
			break;
		case BGP_MSG_NOTIFY:
			atomic_fetch_add_explicit(&peer->notify_out, 1,
						  memory_order_relaxed);
			/* Double start timer. */
			peer->v_start *= 2;

			/* Cap the start timer at two minutes. */
			if (peer->v_start >= (60 * 2))
				peer->v_start = (60 * 2);

			/*
			 * Handle Graceful Restart case where the state changes
			 * to Connect instead of Idle.
			 */
			BGP_EVENT_ADD(peer, BGP_Stop);
			goto done;

		case BGP_MSG_KEEPALIVE:
			atomic_fetch_add_explicit(&peer->keepalive_out, 1,
						  memory_order_relaxed);
			break;
		case BGP_MSG_ROUTE_REFRESH_NEW:
		case BGP_MSG_ROUTE_REFRESH_OLD:
			atomic_fetch_add_explicit(&peer->refresh_out, 1,
						  memory_order_relaxed);
			break;
		case BGP_MSG_CAPABILITY:
			atomic_fetch_add_explicit(&peer->dynamic_cap_out, 1,
						  memory_order_relaxed);
			break;
		}

		stream_free(s);
		ostreams[i] = NULL;
		update_last_write = 1;
	}

done : {
	/*
	 * Update last_update if UPDATEs were written.
	 * Note that these are only updated at the end, not per message
	 * (i.e., per loop iteration).
	 */
	if (uo)
		atomic_store_explicit(&peer->last_update, bgp_clock(),
				      memory_order_relaxed);

	/* If we TXed any flavor of packet */
	if (update_last_write)
		atomic_store_explicit(&peer->last_write, bgp_clock(),
				      memory_order_relaxed);
}

	return status;
}


/*
 * Reads a chunk of data from peer->fd into peer->ibuf_work.
 *
 * @return status flag (see top-of-file)
 */
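/*
 * Note on sizing (descriptive, based on the constants used below): the
 * intermediate buffer ibw holds up to BGP_READ_PACKET_MAX maximum-size
 * packets, and each call reads at most the smaller of that and the free
 * space left in peer->ibuf_work, so a single read can never overflow the
 * ring buffer.
 */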
static uint16_t bgp_read(struct peer *peer)
{
	size_t readsize; // how many bytes we want to read
	ssize_t nbytes;  // how many bytes we actually read
	uint16_t status = 0;
	static uint8_t ibw[BGP_MAX_PACKET_SIZE * BGP_READ_PACKET_MAX];

	readsize = MIN(ringbuf_space(peer->ibuf_work), sizeof(ibw));
	nbytes = read(peer->fd, ibw, readsize);

	/* EAGAIN or EWOULDBLOCK; come back later */
	if (nbytes < 0 && ERRNO_IO_RETRY(errno)) {
		SET_FLAG(status, BGP_IO_TRANS_ERR);
		/* Fatal error; tear down session */
	} else if (nbytes < 0) {
		flog_err(EC_BGP_UPDATE_RCV,
			 "%s [Error] bgp_read_packet error: %s", peer->host,
			 safe_strerror(errno));

		if (peer->status == Established) {
			if ((CHECK_FLAG(peer->flags,
					PEER_FLAG_GRACEFUL_RESTART) ||
			     CHECK_FLAG(peer->flags,
					PEER_FLAG_GRACEFUL_RESTART_HELPER)) &&
			    CHECK_FLAG(peer->sflags, PEER_STATUS_NSF_MODE)) {
				peer->last_reset = PEER_DOWN_NSF_CLOSE_SESSION;
				SET_FLAG(peer->sflags, PEER_STATUS_NSF_WAIT);
			} else
				peer->last_reset = PEER_DOWN_CLOSE_SESSION;
		}

		BGP_EVENT_ADD(peer, TCP_fatal_error);
		SET_FLAG(status, BGP_IO_FATAL_ERR);
		/* Received EOF / TCP session closed */
	} else if (nbytes == 0) {
		if (bgp_debug_neighbor_events(peer))
			zlog_debug("%s [Event] BGP connection closed fd %d",
				   peer->host, peer->fd);

		if (peer->status == Established) {
			if ((CHECK_FLAG(peer->flags,
					PEER_FLAG_GRACEFUL_RESTART) ||
			     CHECK_FLAG(peer->flags,
					PEER_FLAG_GRACEFUL_RESTART_HELPER)) &&
			    CHECK_FLAG(peer->sflags, PEER_STATUS_NSF_MODE)) {
				peer->last_reset = PEER_DOWN_NSF_CLOSE_SESSION;
				SET_FLAG(peer->sflags, PEER_STATUS_NSF_WAIT);
			} else
				peer->last_reset = PEER_DOWN_CLOSE_SESSION;
		}

		BGP_EVENT_ADD(peer, TCP_connection_closed);
		SET_FLAG(status, BGP_IO_FATAL_ERR);
	} else {
		assert(ringbuf_put(peer->ibuf_work, ibw, nbytes)
		       == (size_t)nbytes);
	}

	return status;
}


/*
 * Called after we have read a BGP packet header. Validates marker, message
 * type and packet length. If any of these aren't correct, sends a notify.
 *
 * Assumes that there are at least BGP_HEADER_SIZE readable bytes in the input
 * buffer.
 */
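/*
 * Example of a header that passes all checks below: a KEEPALIVE is exactly
 * 19 octets, so its header on the wire is
 *
 *     ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff   marker
 *     00 13                                             length (19)
 *     04                                                type (KEEPALIVE)
 */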
static bool validate_header(struct peer *peer)
{
	uint16_t size;
	uint8_t type;
	struct ringbuf *pkt = peer->ibuf_work;

	static const uint8_t m_correct[BGP_MARKER_SIZE] = {
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
	uint8_t m_rx[BGP_MARKER_SIZE] = {0x00};

	if (ringbuf_peek(pkt, 0, m_rx, BGP_MARKER_SIZE) != BGP_MARKER_SIZE)
		return false;

	if (memcmp(m_correct, m_rx, BGP_MARKER_SIZE) != 0) {
		bgp_notify_send(peer, BGP_NOTIFY_HEADER_ERR,
				BGP_NOTIFY_HEADER_NOT_SYNC);
		return false;
	}

	/* Get size and type in network byte order. */
	ringbuf_peek(pkt, BGP_MARKER_SIZE, &size, sizeof(size));
	ringbuf_peek(pkt, BGP_MARKER_SIZE + 2, &type, sizeof(type));

	size = ntohs(size);

	/* BGP type check. */
	if (type != BGP_MSG_OPEN && type != BGP_MSG_UPDATE
	    && type != BGP_MSG_NOTIFY && type != BGP_MSG_KEEPALIVE
	    && type != BGP_MSG_ROUTE_REFRESH_NEW
	    && type != BGP_MSG_ROUTE_REFRESH_OLD
	    && type != BGP_MSG_CAPABILITY) {
		if (bgp_debug_neighbor_events(peer))
			zlog_debug("%s unknown message type 0x%02x", peer->host,
				   type);

		bgp_notify_send_with_data(peer, BGP_NOTIFY_HEADER_ERR,
					  BGP_NOTIFY_HEADER_BAD_MESTYPE, &type,
					  1);
		return false;
	}

	/* Minimum packet length check. */
	if ((size < BGP_HEADER_SIZE) || (size > BGP_MAX_PACKET_SIZE)
	    || (type == BGP_MSG_OPEN && size < BGP_MSG_OPEN_MIN_SIZE)
	    || (type == BGP_MSG_UPDATE && size < BGP_MSG_UPDATE_MIN_SIZE)
	    || (type == BGP_MSG_NOTIFY && size < BGP_MSG_NOTIFY_MIN_SIZE)
	    || (type == BGP_MSG_KEEPALIVE && size != BGP_MSG_KEEPALIVE_MIN_SIZE)
	    || (type == BGP_MSG_ROUTE_REFRESH_NEW
		&& size < BGP_MSG_ROUTE_REFRESH_MIN_SIZE)
	    || (type == BGP_MSG_ROUTE_REFRESH_OLD
		&& size < BGP_MSG_ROUTE_REFRESH_MIN_SIZE)
	    || (type == BGP_MSG_CAPABILITY
		&& size < BGP_MSG_CAPABILITY_MIN_SIZE)) {
		if (bgp_debug_neighbor_events(peer)) {
			/* type 128 is the old-style ROUTE-REFRESH code, which
			 * falls outside the range of bgp_type_str[] */
			zlog_debug("%s bad message length - %d for %s",
				   peer->host, size,
				   type == 128 ? "ROUTE-REFRESH"
					       : bgp_type_str[(int)type]);
		}

		uint16_t nsize = htons(size);

		bgp_notify_send_with_data(peer, BGP_NOTIFY_HEADER_ERR,
					  BGP_NOTIFY_HEADER_BAD_MESLEN,
					  (unsigned char *)&nsize, 2);
		return false;
	}

	return true;
}