/* BGP I/O.
 * Implements packet I/O in a pthread.
 * Copyright (C) 2017 Cumulus Networks
 * Quentin Young
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; see the file COPYING; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
 * MA 02110-1301 USA
 */

/* clang-format off */
#include <zebra.h>
#include <pthread.h>		// for pthread_mutex_unlock, pthread_mutex_lock

#include "frr_pthread.h"	// for frr_pthread_get, frr_pthread
#include "linklist.h"		// for list_delete, list_delete_all_node, lis...
#include "log.h"		// for zlog_debug, safe_strerror, zlog_err
#include "memory.h"		// for MTYPE_TMP, XCALLOC, XFREE
#include "network.h"		// for ERRNO_IO_RETRY
#include "stream.h"		// for stream_get_endp, stream_getw_from, str...
#include "ringbuf.h"		// for ringbuf_remain, ringbuf_peek, ringbuf_...
#include "thread.h"		// for THREAD_OFF, THREAD_ARG, thread, thread...
#include "zassert.h"		// for assert

#include "bgpd/bgp_io.h"
#include "bgpd/bgp_debug.h"	// for bgp_debug_neighbor_events, bgp_type_str
#include "bgpd/bgp_fsm.h"	// for BGP_EVENT_ADD, bgp_event
#include "bgpd/bgp_packet.h"	// for bgp_notify_send_with_data, bgp_notify...
#include "bgpd/bgpd.h"		// for peer, BGP_MARKER_SIZE, bgp_master, bm
/* clang-format on */

/* forward declarations */
static uint16_t bgp_write(struct peer *);
static uint16_t bgp_read(struct peer *);
static int bgp_process_writes(struct thread *);
static int bgp_process_reads(struct thread *);
static bool validate_header(struct peer *);

/* generic i/o status codes */
#define BGP_IO_TRANS_ERR (1 << 0) // EAGAIN or similar occurred
#define BGP_IO_FATAL_ERR (1 << 1) // some kind of fatal TCP error

/* Plumbing & control variables for thread lifecycle
 * ------------------------------------------------------------------------ */
bool bgp_io_thread_run;
pthread_mutex_t *running_cond_mtx;
pthread_cond_t *running_cond;

/* Unused callback for thread_add_read() */
static int bgp_io_dummy(struct thread *thread) { return 0; }

/* Poison pill task */
static int bgp_io_finish(struct thread *thread)
{
	bgp_io_thread_run = false;
	return 0;
}
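
/*
 * How the poison pill is used: bgp_io_stop() queues bgp_io_finish() as an
 * event on the I/O thread's own master, so the fetch/call loop in
 * bgp_io_start() executes it in-thread, observes bgp_io_thread_run == false,
 * and exits cleanly after finishing the current task.
 */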

/* Extern lifecycle control functions. init -> start -> stop
 * ------------------------------------------------------------------------ */
void bgp_io_init(void)
{
	bgp_io_thread_run = false;

	running_cond_mtx = XCALLOC(MTYPE_PTHREAD_PRIM, sizeof(pthread_mutex_t));
	running_cond = XCALLOC(MTYPE_PTHREAD_PRIM, sizeof(pthread_cond_t));

	pthread_mutex_init(running_cond_mtx, NULL);
	pthread_cond_init(running_cond, NULL);

	/* unlocked in bgp_io_wait_running() */
	pthread_mutex_lock(running_cond_mtx);
}

void *bgp_io_start(void *arg)
{
	struct frr_pthread *fpt = frr_pthread_get(PTHREAD_IO);
	fpt->master->owner = pthread_self();

	// pipe whose read end gives poll() an fd to block on, so the loop
	// can sleep when no work is pending
	int sleeper[2] = {-1, -1};
	if (pipe(sleeper) < 0)
		zlog_err("%s: could not create sleeper pipe: %s", __func__,
			 safe_strerror(errno));
	else
		thread_add_read(fpt->master, &bgp_io_dummy, NULL, sleeper[0],
				NULL);

	// we definitely don't want to handle signals
	fpt->master->handle_signals = false;

	struct thread task;

	pthread_mutex_lock(running_cond_mtx);
	{
		bgp_io_thread_run = true;
		pthread_cond_signal(running_cond);
	}
	pthread_mutex_unlock(running_cond_mtx);

	while (bgp_io_thread_run) {
		if (thread_fetch(fpt->master, &task)) {
			thread_call(&task);
		}
	}

	close(sleeper[1]);
	close(sleeper[0]);

	return NULL;
}

void bgp_io_wait_running(void)
{
	while (!bgp_io_thread_run)
		pthread_cond_wait(running_cond, running_cond_mtx);

	/* locked in bgp_io_init() */
	pthread_mutex_unlock(running_cond_mtx);
}

int bgp_io_stop(void **result, struct frr_pthread *fpt)
{
	thread_add_event(fpt->master, &bgp_io_finish, NULL, 0, NULL);
	pthread_join(fpt->thread, result);

	pthread_mutex_destroy(running_cond_mtx);
	pthread_cond_destroy(running_cond);

	XFREE(MTYPE_PTHREAD_PRIM, running_cond_mtx);
	XFREE(MTYPE_PTHREAD_PRIM, running_cond);

	return 0;
}
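
/*
 * Typical lifecycle as driven from the main pthread (a sketch; the call that
 * actually spawns bgp_io_start() lives in the frr_pthread wrapper and is
 * only illustrated here, not quoted from this file):
 *
 *     bgp_io_init();             // allocate lock/condvar, take the lock
 *     ...spawn the PTHREAD_IO pthread running bgp_io_start()...
 *     bgp_io_wait_running();     // block until the I/O loop is up
 *     ...steady state: bgp_reads_on(), bgp_writes_on(), etc...
 *     bgp_io_stop(&result, fpt); // poison pill, then pthread_join()
 */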

/* Extern API -------------------------------------------------------------- */

void bgp_writes_on(struct peer *peer)
{
	assert(bgp_io_thread_run);

	assert(peer->status != Deleted);
	assert(peer->obuf);
	assert(peer->ibuf);
	assert(peer->ibuf_work);
	assert(!peer->t_connect_check_r);
	assert(!peer->t_connect_check_w);
	assert(peer->fd);

	struct frr_pthread *fpt = frr_pthread_get(PTHREAD_IO);

	thread_add_write(fpt->master, bgp_process_writes, peer, peer->fd,
			 &peer->t_write);
	SET_FLAG(peer->thread_flags, PEER_THREAD_WRITES_ON);
}

void bgp_writes_off(struct peer *peer)
{
	assert(bgp_io_thread_run);

	struct frr_pthread *fpt = frr_pthread_get(PTHREAD_IO);

	thread_cancel_async(fpt->master, &peer->t_write, NULL);
	THREAD_OFF(peer->t_generate_updgrp_packets);

	UNSET_FLAG(peer->thread_flags, PEER_THREAD_WRITES_ON);
}

void bgp_reads_on(struct peer *peer)
{
	assert(bgp_io_thread_run);

	assert(peer->status != Deleted);
	assert(peer->ibuf);
	assert(peer->ibuf_work);
	assert(peer->obuf);
	assert(!peer->t_connect_check_r);
	assert(!peer->t_connect_check_w);
	assert(peer->fd);

	struct frr_pthread *fpt = frr_pthread_get(PTHREAD_IO);

	thread_add_read(fpt->master, bgp_process_reads, peer, peer->fd,
			&peer->t_read);

	SET_FLAG(peer->thread_flags, PEER_THREAD_READS_ON);
}

void bgp_reads_off(struct peer *peer)
{
	assert(bgp_io_thread_run);

	struct frr_pthread *fpt = frr_pthread_get(PTHREAD_IO);

	thread_cancel_async(fpt->master, &peer->t_read, NULL);
	THREAD_OFF(peer->t_process_packet);

	UNSET_FLAG(peer->thread_flags, PEER_THREAD_READS_ON);
}
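
/*
 * The asymmetry in the _off functions above is deliberate: t_read and
 * t_write live on the I/O pthread's master and so must be torn down with
 * thread_cancel_async(), while t_generate_updgrp_packets and
 * t_process_packet are scheduled on the main thread's master (bm->master)
 * and can be cleared in place with THREAD_OFF.
 */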

/* Internal functions ------------------------------------------------------- */

/**
 * Called from I/O pthread when a file descriptor has become ready for writing.
 */
static int bgp_process_writes(struct thread *thread)
{
	struct peer *peer = THREAD_ARG(thread);
	uint16_t status;
	bool reschedule;
	bool fatal = false;

	if (peer->fd < 0)
		return -1;

	struct frr_pthread *fpt = frr_pthread_get(PTHREAD_IO);

	pthread_mutex_lock(&peer->io_mtx);
	{
		status = bgp_write(peer);
		reschedule = (stream_fifo_head(peer->obuf) != NULL);
	}
	pthread_mutex_unlock(&peer->io_mtx);

	if (CHECK_FLAG(status, BGP_IO_TRANS_ERR)) { /* no problem */
	}

	if (CHECK_FLAG(status, BGP_IO_FATAL_ERR)) {
		reschedule = false; /* problem */
		fatal = true;
	}

	if (reschedule) {
		thread_add_write(fpt->master, bgp_process_writes, peer,
				 peer->fd, &peer->t_write);
	} else if (!fatal) {
		BGP_TIMER_ON(peer->t_generate_updgrp_packets,
			     bgp_generate_updgrp_packets, 0);
	}

	return 0;
}
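
/*
 * Design note on the write path above: generation of new update-group
 * packets (the t_generate_updgrp_packets timer) is only rescheduled once
 * peer->obuf has fully drained and no fatal error occurred. This is a simple
 * form of flow control; the main thread never refills the output queue
 * faster than the socket accepts it.
 */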

/**
 * Called from I/O pthread when a file descriptor has become ready for reading,
 * or has hung up.
 *
 * We read as much data as possible, process as many packets as we can and
 * place them on peer->ibuf for secondary processing by the main thread.
 */
static int bgp_process_reads(struct thread *thread)
{
	/* clang-format off */
	struct peer *peer;		// peer to read from
	uint16_t status;		// bgp_read status code
	bool more = true;		// whether to continue packet processing
	bool fatal = false;		// whether fatal error occurred
	bool added_pkt = false;		// whether we pushed onto ->ibuf
	bool header_valid = true;	// whether header is valid
	/* clang-format on */

	peer = THREAD_ARG(thread);

	if (peer->fd < 0)
		return -1;

	struct frr_pthread *fpt = frr_pthread_get(PTHREAD_IO);

	pthread_mutex_lock(&peer->io_mtx);
	{
		status = bgp_read(peer);
	}
	pthread_mutex_unlock(&peer->io_mtx);

	/* error checking phase */
	if (CHECK_FLAG(status, BGP_IO_TRANS_ERR)) {
		/* no problem; just don't process packets */
		more = false;
	}

	if (CHECK_FLAG(status, BGP_IO_FATAL_ERR)) {
		/* problem; tear down session */
		more = false;
		fatal = true;
	}

	while (more) {
		/* static buffer for transferring packets */
		static unsigned char pktbuf[BGP_MAX_PACKET_SIZE];
		/* shorter alias to peer's input buffer */
		struct ringbuf *ibw = peer->ibuf_work;
		/* packet size as given by header */
		uint16_t pktsize = 0;

		/* check that we have enough data for a header */
		if (ringbuf_remain(ibw) < BGP_HEADER_SIZE)
			break;

		/* validate header */
		header_valid = validate_header(peer);

		if (!header_valid) {
			fatal = true;
			break;
		}

		/* header is valid; retrieve packet size */
		ringbuf_peek(ibw, BGP_MARKER_SIZE, &pktsize, sizeof(pktsize));

		pktsize = ntohs(pktsize);

		/* if this fails we are seriously screwed */
		assert(pktsize <= BGP_MAX_PACKET_SIZE);

		/* If we have that much data, chuck it into its own
		 * stream and append to input queue for processing. */
		if (ringbuf_remain(ibw) >= pktsize) {
			struct stream *pkt = stream_new(pktsize);
			assert(ringbuf_get(ibw, pktbuf, pktsize) == pktsize);
			stream_put(pkt, pktbuf, pktsize);

			pthread_mutex_lock(&peer->io_mtx);
			{
				stream_fifo_push(peer->ibuf, pkt);
			}
			pthread_mutex_unlock(&peer->io_mtx);

			added_pkt = true;
		} else
			break;
	}

	/* after draining every complete packet, the ringbuffer must have
	 * room for at least one more max-size packet */
	assert(ringbuf_space(peer->ibuf_work) >= BGP_MAX_PACKET_SIZE);

	/* handle invalid header */
	if (fatal) {
		/* wipe buffer just in case someone screwed up */
		ringbuf_wipe(peer->ibuf_work);
	} else {
		thread_add_read(fpt->master, bgp_process_reads, peer, peer->fd,
				&peer->t_read);
		if (added_pkt)
			thread_add_timer_msec(bm->master, bgp_process_packet,
					      peer, 0, &peer->t_process_packet);
	}

	return 0;
}
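
/*
 * Summary of the read path: bgp_read() copies raw bytes from the socket into
 * the peer->ibuf_work ringbuffer; the loop above slices that byte stream
 * into whole BGP messages and pushes them onto peer->ibuf; bgp_process_packet
 * is then scheduled on the main thread (bm->master) to consume them.
 * peer->io_mtx is held only around the ringbuffer fill and the fifo push.
 */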

/**
 * Flush peer output buffer.
 *
 * This function pops packets off of peer->obuf and writes them to peer->fd.
 * The number of packets written is equal to the minimum of
 * peer->bgp->wpkt_quanta and the number of packets on the output buffer,
 * unless an error occurs.
 *
 * If write() returns an error, the appropriate FSM event is generated.
 *
 * Assumes the caller holds peer->io_mtx (see bgp_process_writes()).
 *
 * @return status flag (see top-of-file)
 */
static uint16_t bgp_write(struct peer *peer)
{
	u_char type;
	struct stream *s;
	int num;
	int update_last_write = 0;
	unsigned int count = 0;
	uint32_t uo = 0;
	uint16_t status = 0;
	uint32_t wpkt_quanta_old;

	// cache current write quanta
	wpkt_quanta_old = atomic_load_explicit(&peer->bgp->wpkt_quanta,
					       memory_order_relaxed);

	while (count < wpkt_quanta_old && (s = stream_fifo_head(peer->obuf))) {
		int writenum;
		do {
			writenum = stream_get_endp(s) - stream_get_getp(s);
			num = write(peer->fd, STREAM_PNT(s), writenum);

			if (num < 0) {
				if (!ERRNO_IO_RETRY(errno)) {
					BGP_EVENT_ADD(peer, TCP_fatal_error);
					SET_FLAG(status, BGP_IO_FATAL_ERR);
				} else {
					SET_FLAG(status, BGP_IO_TRANS_ERR);
				}

				goto done;
			} else if (num != writenum) // incomplete write
				stream_forward_getp(s, num);

		} while (num != writenum);

		/* Retrieve BGP packet type. */
		stream_set_getp(s, BGP_MARKER_SIZE + 2);
		type = stream_getc(s);

		switch (type) {
		case BGP_MSG_OPEN:
			atomic_fetch_add_explicit(&peer->open_out, 1,
						  memory_order_relaxed);
			break;
		case BGP_MSG_UPDATE:
			atomic_fetch_add_explicit(&peer->update_out, 1,
						  memory_order_relaxed);
			uo++;
			break;
		case BGP_MSG_NOTIFY:
			atomic_fetch_add_explicit(&peer->notify_out, 1,
						  memory_order_relaxed);
			/* Double start timer. */
			peer->v_start *= 2;

			/* Overflow check. */
			if (peer->v_start >= (60 * 2))
				peer->v_start = (60 * 2);

			/* Handle Graceful Restart case where the state changes
			 * to Connect instead of Idle */
			BGP_EVENT_ADD(peer, BGP_Stop);
			goto done;

		case BGP_MSG_KEEPALIVE:
			atomic_fetch_add_explicit(&peer->keepalive_out, 1,
						  memory_order_relaxed);
			break;
		case BGP_MSG_ROUTE_REFRESH_NEW:
		case BGP_MSG_ROUTE_REFRESH_OLD:
			atomic_fetch_add_explicit(&peer->refresh_out, 1,
						  memory_order_relaxed);
			break;
		case BGP_MSG_CAPABILITY:
			atomic_fetch_add_explicit(&peer->dynamic_cap_out, 1,
						  memory_order_relaxed);
			break;
		}

		count++;

		stream_free(stream_fifo_pop(peer->obuf));
		update_last_write = 1;
	}

done: {
	/*
	 * Update last_update if UPDATEs were written.
	 * Note that these are only updated at the end,
	 * not per message (i.e., per loop iteration).
	 */
	if (uo)
		atomic_store_explicit(&peer->last_update, bgp_clock(),
				      memory_order_relaxed);

	/* If we TXed any flavor of packet */
	if (update_last_write)
		atomic_store_explicit(&peer->last_write, bgp_clock(),
				      memory_order_relaxed);
}

	return status;
}

/**
 * Reads a chunk of data from peer->fd into peer->ibuf_work.
 *
 * Assumes the caller holds peer->io_mtx (see bgp_process_reads()).
 *
 * @return status flag (see top-of-file)
 */
static uint16_t bgp_read(struct peer *peer)
{
	size_t readsize;	// how many bytes we want to read
	ssize_t nbytes;		// how many bytes we actually read
	uint16_t status = 0;
	static uint8_t ibw[BGP_MAX_PACKET_SIZE * BGP_READ_PACKET_MAX];

	readsize = MIN(ringbuf_space(peer->ibuf_work), sizeof(ibw));
	nbytes = read(peer->fd, ibw, readsize);

	if (nbytes < 0 && ERRNO_IO_RETRY(errno)) {
		/* EAGAIN or EWOULDBLOCK; come back later */
		SET_FLAG(status, BGP_IO_TRANS_ERR);
	} else if (nbytes < 0) {
		/* Fatal error; tear down session */
		zlog_err("%s [Error] bgp_read_packet error: %s", peer->host,
			 safe_strerror(errno));

		if (peer->status == Established) {
			if (CHECK_FLAG(peer->sflags, PEER_STATUS_NSF_MODE)) {
				peer->last_reset = PEER_DOWN_NSF_CLOSE_SESSION;
				SET_FLAG(peer->sflags, PEER_STATUS_NSF_WAIT);
			} else
				peer->last_reset = PEER_DOWN_CLOSE_SESSION;
		}

		BGP_EVENT_ADD(peer, TCP_fatal_error);
		SET_FLAG(status, BGP_IO_FATAL_ERR);
	} else if (nbytes == 0) {
		/* Received EOF / TCP session closed */
		if (bgp_debug_neighbor_events(peer))
			zlog_debug("%s [Event] BGP connection closed fd %d",
				   peer->host, peer->fd);

		if (peer->status == Established) {
			if (CHECK_FLAG(peer->sflags, PEER_STATUS_NSF_MODE)) {
				peer->last_reset = PEER_DOWN_NSF_CLOSE_SESSION;
				SET_FLAG(peer->sflags, PEER_STATUS_NSF_WAIT);
			} else
				peer->last_reset = PEER_DOWN_CLOSE_SESSION;
		}

		BGP_EVENT_ADD(peer, TCP_connection_closed);
		SET_FLAG(status, BGP_IO_FATAL_ERR);
	} else {
		assert(ringbuf_put(peer->ibuf_work, ibw, nbytes)
		       == (size_t)nbytes);
	}

	return status;
}
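
/*
 * Note: the static ibw[] staging buffer above is sized on the assumption
 * that peer->ibuf_work is created with a capacity of at least
 * BGP_MAX_PACKET_SIZE * BGP_READ_PACKET_MAX (the allocation happens outside
 * this file). Since readsize is clamped to ringbuf_space(), a smaller
 * ringbuffer would still be safe, just read in smaller chunks.
 */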
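/*
 * For reference, the header layout being validated (RFC 4271, section 4.1):
 *
 *     octets  0..15   marker, all bits set (16 bytes of 0xff)
 *     octets 16..17   total message length, network byte order
 *     octet  18       message type
 *
 * so BGP_HEADER_SIZE = BGP_MARKER_SIZE (16) + 2 + 1 = 19 octets.
 */
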
/*
 * Called after we have read a BGP packet header. Validates marker, message
 * type and packet length. If any of these aren't correct, sends a notify.
 *
 * Assumes that there are at least BGP_HEADER_SIZE readable bytes in the input
 * buffer.
 */
static bool validate_header(struct peer *peer)
{
	uint16_t size;
	uint8_t type;
	struct ringbuf *pkt = peer->ibuf_work;

	static uint8_t m_correct[BGP_MARKER_SIZE] = {
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
	uint8_t m_rx[BGP_MARKER_SIZE] = {0x00};

	if (ringbuf_peek(pkt, 0, m_rx, BGP_MARKER_SIZE) != BGP_MARKER_SIZE)
		return false;

	if (memcmp(m_correct, m_rx, BGP_MARKER_SIZE) != 0) {
		bgp_notify_send(peer, BGP_NOTIFY_HEADER_ERR,
				BGP_NOTIFY_HEADER_NOT_SYNC);
		return false;
	}

	/* Get size and type in network byte order. */
	ringbuf_peek(pkt, BGP_MARKER_SIZE, &size, sizeof(size));
	ringbuf_peek(pkt, BGP_MARKER_SIZE + 2, &type, sizeof(type));

	size = ntohs(size);

	/* BGP type check. */
	if (type != BGP_MSG_OPEN && type != BGP_MSG_UPDATE
	    && type != BGP_MSG_NOTIFY && type != BGP_MSG_KEEPALIVE
	    && type != BGP_MSG_ROUTE_REFRESH_NEW
	    && type != BGP_MSG_ROUTE_REFRESH_OLD
	    && type != BGP_MSG_CAPABILITY) {
		if (bgp_debug_neighbor_events(peer))
			zlog_debug("%s unknown message type 0x%02x", peer->host,
				   type);

		bgp_notify_send_with_data(peer, BGP_NOTIFY_HEADER_ERR,
					  BGP_NOTIFY_HEADER_BAD_MESTYPE,
					  &type, 1);
		return false;
	}

	/* Minimum packet length check. */
	if ((size < BGP_HEADER_SIZE) || (size > BGP_MAX_PACKET_SIZE)
	    || (type == BGP_MSG_OPEN && size < BGP_MSG_OPEN_MIN_SIZE)
	    || (type == BGP_MSG_UPDATE && size < BGP_MSG_UPDATE_MIN_SIZE)
	    || (type == BGP_MSG_NOTIFY && size < BGP_MSG_NOTIFY_MIN_SIZE)
	    || (type == BGP_MSG_KEEPALIVE && size != BGP_MSG_KEEPALIVE_MIN_SIZE)
	    || (type == BGP_MSG_ROUTE_REFRESH_NEW
		&& size < BGP_MSG_ROUTE_REFRESH_MIN_SIZE)
	    || (type == BGP_MSG_ROUTE_REFRESH_OLD
		&& size < BGP_MSG_ROUTE_REFRESH_MIN_SIZE)
	    || (type == BGP_MSG_CAPABILITY
		&& size < BGP_MSG_CAPABILITY_MIN_SIZE)) {
		if (bgp_debug_neighbor_events(peer)) {
			zlog_debug("%s bad message length - %d for %s",
				   peer->host, size,
				   type == BGP_MSG_ROUTE_REFRESH_OLD
					   ? "ROUTE-REFRESH"
					   : bgp_type_str[(int)type]);
		}

		uint16_t nsize = htons(size);

		bgp_notify_send_with_data(peer, BGP_NOTIFY_HEADER_ERR,
					  BGP_NOTIFY_HEADER_BAD_MESLEN,
					  (unsigned char *)&nsize, 2);
		return false;
	}

	return true;
}