/*
 * Implements packet I/O in a pthread.
 * Copyright (C) 2017 Cumulus Networks
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; see the file COPYING; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
 * MA 02110-1301 USA
 */
22 /* clang-format off */
24 #include <pthread.h> // for pthread_mutex_unlock, pthread_mutex_lock
26 #include "frr_pthread.h" // for frr_pthread_get, frr_pthread
27 #include "linklist.h" // for list_delete, list_delete_all_node, lis...
28 #include "log.h" // for zlog_debug, safe_strerror, zlog_err
29 #include "memory.h" // for MTYPE_TMP, XCALLOC, XFREE
30 #include "network.h" // for ERRNO_IO_RETRY
31 #include "stream.h" // for stream_get_endp, stream_getw_from, str...
32 #include "ringbuf.h" // for ringbuf_remain, ringbuf_peek, ringbuf_...
33 #include "thread.h" // for THREAD_OFF, THREAD_ARG, thread, thread...
34 #include "zassert.h" // for assert
36 #include "bgpd/bgp_io.h"
37 #include "bgpd/bgp_debug.h" // for bgp_debug_neighbor_events, bgp_type_str
38 #include "bgpd/bgp_fsm.h" // for BGP_EVENT_ADD, bgp_event
39 #include "bgpd/bgp_packet.h" // for bgp_notify_send_with_data, bgp_notify...
40 #include "bgpd/bgpd.h" // for peer, BGP_MARKER_SIZE, bgp_master, bm
/*
 * Forward declarations of the file-local helpers.
 *
 * bgp_write() / bgp_read() move bytes between a peer's buffers and its
 * socket; bgp_process_writes() / bgp_process_reads() are the task callbacks
 * scheduled on the I/O pthread; validate_header() checks a received header.
 */
static uint16_t bgp_write(struct peer *peer);
static uint16_t bgp_read(struct peer *peer);
static int bgp_process_writes(struct thread *thread);
static int bgp_process_reads(struct thread *thread);
static bool validate_header(struct peer *peer);
/*
 * Generic i/o status codes.
 *
 * These are single-bit flags; bgp_read() and bgp_write() set them on their
 * status value with SET_FLAG() and callers test them with CHECK_FLAG().
 */
#define BGP_IO_TRANS_ERR 0x1 /* EAGAIN or similar occurred */
#define BGP_IO_FATAL_ERR 0x2 /* some kind of fatal TCP error */
/* Plumbing & control variables for thread lifecycle
 * ------------------------------------------------------------------------ */

/* Run flag for the I/O pthread: set to true in bgp_io_start(), cleared by
 * the bgp_io_finish() task; the fetch loop spins while it is true. */
bool bgp_io_thread_run;

/* Mutex paired with running_cond; heap-allocated with MTYPE_PTHREAD_PRIM and
 * released again in bgp_io_stop(). */
pthread_mutex_t *running_cond_mtx;

/* Condition signalled by bgp_io_start() once bgp_io_thread_run is true;
 * bgp_io_wait_running() waits on it. */
pthread_cond_t *running_cond;
/*
 * Unused callback for thread_add_read().
 *
 * bgp_io_start() registers this as the handler for its "sleeper" descriptor;
 * it performs no work.
 *
 * @param thread the task being run; ignored
 * @return always 0
 */
static int bgp_io_dummy(struct thread *thread)
{
	(void)thread;

	return 0;
}
63 /* Poison pill task */
64 static int bgp_io_finish(struct thread
*thread
)
66 bgp_io_thread_run
= false;
70 /* Extern lifecycle control functions. init -> start -> stop
71 * ------------------------------------------------------------------------ */
74 bgp_io_thread_run
= false;
76 running_cond_mtx
= XCALLOC(MTYPE_PTHREAD_PRIM
, sizeof(pthread_mutex_t
));
77 running_cond
= XCALLOC(MTYPE_PTHREAD_PRIM
, sizeof(pthread_cond_t
));
79 pthread_mutex_init(running_cond_mtx
, NULL
);
80 pthread_cond_init(running_cond
, NULL
);
82 /* unlocked in bgp_io_wait_running() */
83 pthread_mutex_lock(running_cond_mtx
);
/*
 * Entry point of the BGP I/O pthread.
 *
 * Looks up the PTHREAD_IO frr_pthread, records the calling pthread as owner
 * of its thread master, registers bgp_io_dummy() as a read task on
 * sleeper[0], disables signal handling on the master, then sets
 * bgp_io_thread_run and signals running_cond while holding running_cond_mtx.
 * After that it fetches tasks from the master for as long as
 * bgp_io_thread_run remains true.
 *
 * NOTE(review): the declarations of `sleeper` and `task`, the body of the
 * fetch loop and the return statement are not visible in this view; `arg` is
 * not referenced in the visible lines. Confirm against the full file.
 */
86 void *bgp_io_start(void *arg
)
88 struct frr_pthread
*fpt
= frr_pthread_get(PTHREAD_IO
);
/* the thread master now belongs to this (the I/O) pthread */
89 fpt
->master
->owner
= pthread_self();
91 // fd so we can sleep in poll()
94 thread_add_read(fpt
->master
, &bgp_io_dummy
, NULL
, sleeper
[0], NULL
);
96 // we definitely don't want to handle signals
97 fpt
->master
->handle_signals
= false;
/* announce "running" under the mutex that bgp_io_wait_running() waits with */
101 pthread_mutex_lock(running_cond_mtx
);
103 bgp_io_thread_run
= true;
104 pthread_cond_signal(running_cond
);
106 pthread_mutex_unlock(running_cond_mtx
);
/* main loop: ends once a task (bgp_io_finish) clears bgp_io_thread_run */
108 while (bgp_io_thread_run
) {
109 if (thread_fetch(fpt
->master
, &task
)) {
120 void bgp_io_wait_running()
122 while (!bgp_io_thread_run
)
123 pthread_cond_wait(running_cond
, running_cond_mtx
);
125 /* locked in bgp_io_init() */
126 pthread_mutex_unlock(running_cond_mtx
);
129 int bgp_io_stop(void **result
, struct frr_pthread
*fpt
)
131 thread_add_event(fpt
->master
, &bgp_io_finish
, NULL
, 0, NULL
);
132 pthread_join(fpt
->thread
, result
);
134 pthread_mutex_destroy(running_cond_mtx
);
135 pthread_cond_destroy(running_cond
);
137 XFREE(MTYPE_PTHREAD_PRIM
, running_cond_mtx
);
138 XFREE(MTYPE_PTHREAD_PRIM
, running_cond
);
143 /* Extern API -------------------------------------------------------------- */
/*
 * Start write processing for a peer on the I/O pthread.
 *
 * After asserting that the I/O pthread is running, that the peer is not in
 * Deleted state, that peer->ibuf_work exists and that no connect-check tasks
 * are pending, schedules bgp_process_writes() for peer->fd on the I/O
 * pthread's master and sets PEER_THREAD_WRITES_ON in peer->thread_flags.
 *
 * NOTE(review): the final argument of thread_add_write() is not visible in
 * this view; presumably &peer->t_write, which is what bgp_writes_off()
 * cancels. Other lines of this function also appear to be missing. Confirm
 * against the full file.
 */
145 void bgp_writes_on(struct peer
*peer
)
147 assert(bgp_io_thread_run
);
149 assert(peer
->status
!= Deleted
);
152 assert(peer
->ibuf_work
);
153 assert(!peer
->t_connect_check_r
);
154 assert(!peer
->t_connect_check_w
);
157 struct frr_pthread
*fpt
= frr_pthread_get(PTHREAD_IO
);
/* arm the write task on the I/O pthread's master */
159 thread_add_write(fpt
->master
, bgp_process_writes
, peer
, peer
->fd
,
161 SET_FLAG(peer
->thread_flags
, PEER_THREAD_WRITES_ON
);
164 void bgp_writes_off(struct peer
*peer
)
166 assert(bgp_io_thread_run
);
168 struct frr_pthread
*fpt
= frr_pthread_get(PTHREAD_IO
);
170 thread_cancel_async(fpt
->master
, &peer
->t_write
, NULL
);
171 THREAD_OFF(peer
->t_generate_updgrp_packets
);
173 UNSET_FLAG(peer
->thread_flags
, PEER_THREAD_WRITES_ON
);
/*
 * Start read processing for a peer on the I/O pthread.
 *
 * After asserting that the I/O pthread is running, that the peer is not in
 * Deleted state, that peer->ibuf_work exists and that no connect-check tasks
 * are pending, schedules bgp_process_reads() for peer->fd on the I/O
 * pthread's master and sets PEER_THREAD_READS_ON in peer->thread_flags.
 *
 * NOTE(review): the final argument of thread_add_read() is not visible in
 * this view; presumably &peer->t_read, which is what bgp_reads_off()
 * cancels. Other lines of this function also appear to be missing. Confirm
 * against the full file.
 */
176 void bgp_reads_on(struct peer
*peer
)
178 assert(bgp_io_thread_run
);
180 assert(peer
->status
!= Deleted
);
183 assert(peer
->ibuf_work
);
185 assert(!peer
->t_connect_check_r
);
186 assert(!peer
->t_connect_check_w
);
189 struct frr_pthread
*fpt
= frr_pthread_get(PTHREAD_IO
);
/* arm the read task on the I/O pthread's master */
191 thread_add_read(fpt
->master
, bgp_process_reads
, peer
, peer
->fd
,
194 SET_FLAG(peer
->thread_flags
, PEER_THREAD_READS_ON
);
197 void bgp_reads_off(struct peer
*peer
)
199 assert(bgp_io_thread_run
);
201 struct frr_pthread
*fpt
= frr_pthread_get(PTHREAD_IO
);
203 thread_cancel_async(fpt
->master
, &peer
->t_read
, NULL
);
204 THREAD_OFF(peer
->t_process_packet
);
206 UNSET_FLAG(peer
->thread_flags
, PEER_THREAD_READS_ON
);
209 /* Internal functions ------------------------------------------------------- */
/*
 * Write-ready task callback.
 *
 * Takes the peer from the task argument, calls bgp_write() under
 * peer->io_mtx and, while still holding the lock, notes whether peer->obuf
 * still has a packet queued. A transient error is ignored; a fatal error
 * clears the reschedule flag. The visible tail of the function re-arms this
 * callback with thread_add_write() and starts t_generate_updgrp_packets with
 * BGP_TIMER_ON().
 *
 * NOTE(review): the declarations of `status` and `reschedule`, the
 * conditions guarding the last two calls and the return statement are not
 * visible in this view. Confirm against the full file.
 */
212 * Called from I/O pthread when a file descriptor has become ready for writing.
214 static int bgp_process_writes(struct thread
*thread
)
/* NOTE(review): `peer` has static storage, so it is shared by every
 * invocation; presumably fine because only the I/O pthread runs this
 * callback -- confirm. */
216 static struct peer
*peer
;
217 peer
= THREAD_ARG(thread
);
225 struct frr_pthread
*fpt
= frr_pthread_get(PTHREAD_IO
);
/* obuf is only touched while holding io_mtx */
227 pthread_mutex_lock(&peer
->io_mtx
);
229 status
= bgp_write(peer
);
230 reschedule
= (stream_fifo_head(peer
->obuf
) != NULL
);
232 pthread_mutex_unlock(&peer
->io_mtx
);
234 if (CHECK_FLAG(status
, BGP_IO_TRANS_ERR
)) { /* no problem */
237 if (CHECK_FLAG(status
, BGP_IO_FATAL_ERR
)) {
238 reschedule
= false; /* problem */
/* re-arm ourselves for the same fd */
243 thread_add_write(fpt
->master
, bgp_process_writes
, peer
,
244 peer
->fd
, &peer
->t_write
);
/* kick update-group packet generation with a zero timer */
246 BGP_TIMER_ON(peer
->t_generate_updgrp_packets
,
247 bgp_generate_updgrp_packets
, 0);
/*
 * Read-ready task callback.
 *
 * Takes the peer from the task argument and calls bgp_read() under
 * peer->io_mtx to pull bytes into peer->ibuf_work. The visible lines then
 * check that at least a header's worth of data is buffered, validate the
 * header, read the packet size from the header, and, once a whole packet is
 * available, copy it through pktbuf into a new stream that is pushed onto
 * peer->ibuf under peer->io_mtx. The tail wipes ibuf_work, re-arms this
 * callback with thread_add_read() and schedules bgp_process_packet() on
 * bm->master with a zero-millisecond timer.
 *
 * NOTE(review): the loop around the packet extraction, the branches taken on
 * errors or an invalid header, the conditions guarding the final calls and
 * the return statement are not visible in this view. Confirm against the
 * full file.
 */
254 * Called from I/O pthread when a file descriptor has become ready for reading,
257 * We read as much data as possible, process as many packets as we can and
258 * place them on peer->ibuf for secondary processing by the main thread.
260 static int bgp_process_reads(struct thread
*thread
)
262 /* clang-format off */
/* NOTE(review): `peer` has static storage, so it is shared by every
 * invocation; presumably fine because only the I/O pthread runs this
 * callback -- confirm. */
263 static struct peer
*peer
; // peer to read from
264 uint16_t status
; // bgp_read status code
265 bool more
= true; // whether we got more data
266 bool fatal
= false; // whether fatal error occurred
267 bool added_pkt
= false; // whether we pushed onto ->ibuf
268 bool header_valid
= true; // whether header is valid
269 /* clang-format on */
271 peer
= THREAD_ARG(thread
);
276 struct frr_pthread
*fpt
= frr_pthread_get(PTHREAD_IO
);
/* fill ibuf_work from the socket while holding io_mtx */
278 pthread_mutex_lock(&peer
->io_mtx
);
280 status
= bgp_read(peer
);
282 pthread_mutex_unlock(&peer
->io_mtx
);
284 /* error checking phase */
285 if (CHECK_FLAG(status
, BGP_IO_TRANS_ERR
)) {
286 /* no problem; just don't process packets */
290 if (CHECK_FLAG(status
, BGP_IO_FATAL_ERR
)) {
291 /* problem; tear down session */
297 /* static buffer for transferring packets */
298 static unsigned char pktbuf
[BGP_MAX_PACKET_SIZE
];
299 /* shorter alias to peer's input buffer */
300 struct ringbuf
*ibw
= peer
->ibuf_work
;
301 /* packet size as given by header */
302 uint16_t pktsize
= 0;
304 /* check that we have enough data for a header */
305 if (ringbuf_remain(ibw
) < BGP_HEADER_SIZE
)
308 /* validate header */
309 header_valid
= validate_header(peer
);
316 /* header is valid; retrieve packet size */
317 ringbuf_peek(ibw
, BGP_MARKER_SIZE
, &pktsize
, sizeof(pktsize
));
/* the length field was copied out in network byte order */
319 pktsize
= ntohs(pktsize
);
321 /* if this fails we are seriously screwed */
322 assert(pktsize
<= BGP_MAX_PACKET_SIZE
);
324 /* If we have that much data, chuck it into its own
325 * stream and append to input queue for processing. */
326 if (ringbuf_remain(ibw
) >= pktsize
) {
327 struct stream
*pkt
= stream_new(pktsize
);
/* NOTE(review): ringbuf_get() is called inside assert(); the dequeue would
 * disappear if assert were ever compiled out -- confirm that the assert
 * from zassert.h is always active. */
328 assert(ringbuf_get(ibw
, pktbuf
, pktsize
) == pktsize
);
329 stream_put(pkt
, pktbuf
, pktsize
);
/* ibuf is only touched while holding io_mtx */
331 pthread_mutex_lock(&peer
->io_mtx
);
333 stream_fifo_push(peer
->ibuf
, pkt
);
335 pthread_mutex_unlock(&peer
->io_mtx
);
342 assert(ringbuf_space(peer
->ibuf_work
) >= BGP_MAX_PACKET_SIZE
);
344 /* handle invalid header */
346 /* wipe buffer just in case someone screwed up */
347 ringbuf_wipe(peer
->ibuf_work
);
/* NOTE(review): the last argument of this call is not visible here */
349 thread_add_read(fpt
->master
, bgp_process_reads
, peer
, peer
->fd
,
/* hand the queued packets to bgp_process_packet() on bm->master */
352 thread_add_timer_msec(bm
->master
, bgp_process_packet
,
353 peer
, 0, &peer
->t_process_packet
);
/*
 * Flush peer output buffer.
 *
 * This function pops packets off of peer->obuf and writes them to peer->fd.
 * The number of packets written is equal to the minimum of
 * peer->bgp->wpkt_quanta and the number of packets on the output buffer,
 * unless an error occurs.
 *
 * If write() returns an error, the appropriate FSM event is generated.
 *
 * The return value is a bitmask of the generic i/o status codes
 * (BGP_IO_TRANS_ERR, BGP_IO_FATAL_ERR), which callers test with
 * CHECK_FLAG(); it is not a count of packets written.
 */
/*
 * Write queued packets from peer->obuf to peer->fd.
 *
 * Loads peer->bgp->wpkt_quanta with a relaxed atomic load and loops while
 * fewer than that many packets have been handled and peer->obuf has a head.
 * Each packet's unread bytes (endp - getp) are passed to write(); a short
 * write advances the stream's read pointer and retries until the whole
 * packet has gone out. On a write error, a non-retryable errno raises
 * TCP_fatal_error and sets BGP_IO_FATAL_ERR, and BGP_IO_TRANS_ERR is set for
 * the retryable case. After a packet is written its type byte is read back
 * and the matching per-peer counter is incremented atomically, then the
 * packet is popped from obuf and freed. last_update and last_write are
 * stored from bgp_clock() at the end.
 *
 * NOTE(review): the declarations of `s`, `num`, `writenum`, `status` and
 * `type`, the start of the do/while and switch statements, several branch
 * lines and the return statement are not visible in this view. Confirm
 * against the full file.
 */
371 static uint16_t bgp_write(struct peer
*peer
)
376 int update_last_write
= 0;
377 unsigned int count
= 0;
380 uint32_t wpkt_quanta_old
;
382 // cache current write quanta
384 atomic_load_explicit(&peer
->bgp
->wpkt_quanta
, memory_order_relaxed
);
/* at most wpkt_quanta_old packets are taken from obuf per call */
386 while (count
< wpkt_quanta_old
&& (s
= stream_fifo_head(peer
->obuf
))) {
/* bytes of this packet that have not been written yet */
389 writenum
= stream_get_endp(s
) - stream_get_getp(s
);
390 num
= write(peer
->fd
, STREAM_PNT(s
), writenum
);
/* errno decides between a fatal and a transient (retry later) failure */
393 if (!ERRNO_IO_RETRY(errno
)) {
394 BGP_EVENT_ADD(peer
, TCP_fatal_error
);
395 SET_FLAG(status
, BGP_IO_FATAL_ERR
);
397 SET_FLAG(status
, BGP_IO_TRANS_ERR
);
401 } else if (num
!= writenum
) // incomplete write
402 stream_forward_getp(s
, num
);
404 } while (num
!= writenum
);
406 /* Retrieve BGP packet type. */
/* the type byte follows the marker and the 2-byte length field */
407 stream_set_getp(s
, BGP_MARKER_SIZE
+ 2);
408 type
= stream_getc(s
);
/* per-message-type transmit counters */
412 atomic_fetch_add_explicit(&peer
->open_out
, 1,
413 memory_order_relaxed
);
416 atomic_fetch_add_explicit(&peer
->update_out
, 1,
417 memory_order_relaxed
);
421 atomic_fetch_add_explicit(&peer
->notify_out
, 1,
422 memory_order_relaxed
);
423 /* Double start timer. */
426 /* Overflow check. */
/* v_start is capped at 120 */
427 if (peer
->v_start
>= (60 * 2))
428 peer
->v_start
= (60 * 2);
430 /* Handle Graceful Restart case where the state changes
431 * to Connect instead of Idle */
432 BGP_EVENT_ADD(peer
, BGP_Stop
);
435 case BGP_MSG_KEEPALIVE
:
436 atomic_fetch_add_explicit(&peer
->keepalive_out
, 1,
437 memory_order_relaxed
);
439 case BGP_MSG_ROUTE_REFRESH_NEW
:
440 case BGP_MSG_ROUTE_REFRESH_OLD
:
441 atomic_fetch_add_explicit(&peer
->refresh_out
, 1,
442 memory_order_relaxed
);
444 case BGP_MSG_CAPABILITY
:
445 atomic_fetch_add_explicit(&peer
->dynamic_cap_out
, 1,
446 memory_order_relaxed
);
/* packet fully sent: remove it from the queue and release it */
452 stream_free(stream_fifo_pop(peer
->obuf
));
453 update_last_write
= 1;
458 * Update last_update if UPDATEs were written.
459 * Note: that these are only updated at end,
460 * not per message (i.e., per loop)
463 atomic_store_explicit(&peer
->last_update
, bgp_clock(),
464 memory_order_relaxed
);
466 /* If we TXed any flavor of packet */
467 if (update_last_write
)
468 atomic_store_explicit(&peer
->last_write
, bgp_clock(),
469 memory_order_relaxed
);
/*
 * Read one chunk from peer->fd into peer->ibuf_work.
 *
 * Reads at most min(free space in ibuf_work, sizeof(ibw)) bytes into the
 * static staging buffer `ibw`. A retryable error sets BGP_IO_TRANS_ERR. Any
 * other error is logged, raises TCP_fatal_error and sets BGP_IO_FATAL_ERR.
 * A zero-byte read (EOF) raises TCP_connection_closed and also sets
 * BGP_IO_FATAL_ERR. In both failure cases an Established peer gets
 * last_reset recorded, and in NSF mode PEER_STATUS_NSF_WAIT is set.
 * Otherwise the bytes are appended to ibuf_work with ringbuf_put().
 *
 * NOTE(review): the declaration and initialization of `status`, some `else`
 * lines and closing braces, the right-hand side of the final assert and the
 * return statement are not visible in this view. Confirm against the full
 * file.
 */
476 * Reads a chunk of data from peer->fd into peer->ibuf_work.
478 * @return status flag (see top-of-file)
480 static uint16_t bgp_read(struct peer
*peer
)
482 size_t readsize
; // how many bytes we want to read
483 ssize_t nbytes
; // how many bytes we actually read
/* NOTE(review): static staging buffer shared by every call; presumably fine
 * because only the I/O pthread calls this function -- confirm. */
485 static uint8_t ibw
[BGP_MAX_PACKET_SIZE
* BGP_READ_PACKET_MAX
];
/* never read more than ibuf_work can accept */
487 readsize
= MIN(ringbuf_space(peer
->ibuf_work
), sizeof(ibw
));
488 nbytes
= read(peer
->fd
, ibw
, readsize
);
490 /* EAGAIN or EWOULDBLOCK; come back later */
491 if (nbytes
< 0 && ERRNO_IO_RETRY(errno
)) {
492 SET_FLAG(status
, BGP_IO_TRANS_ERR
);
493 /* Fatal error; tear down session */
494 } else if (nbytes
< 0) {
495 zlog_err("%s [Error] bgp_read_packet error: %s", peer
->host
,
496 safe_strerror(errno
));
/* record why an Established session went down */
498 if (peer
->status
== Established
) {
499 if (CHECK_FLAG(peer
->sflags
, PEER_STATUS_NSF_MODE
)) {
500 peer
->last_reset
= PEER_DOWN_NSF_CLOSE_SESSION
;
501 SET_FLAG(peer
->sflags
, PEER_STATUS_NSF_WAIT
);
503 peer
->last_reset
= PEER_DOWN_CLOSE_SESSION
;
506 BGP_EVENT_ADD(peer
, TCP_fatal_error
);
507 SET_FLAG(status
, BGP_IO_FATAL_ERR
);
508 /* Received EOF / TCP session closed */
509 } else if (nbytes
== 0) {
510 if (bgp_debug_neighbor_events(peer
))
511 zlog_debug("%s [Event] BGP connection closed fd %d",
512 peer
->host
, peer
->fd
);
/* record why an Established session went down */
514 if (peer
->status
== Established
) {
515 if (CHECK_FLAG(peer
->sflags
, PEER_STATUS_NSF_MODE
)) {
516 peer
->last_reset
= PEER_DOWN_NSF_CLOSE_SESSION
;
517 SET_FLAG(peer
->sflags
, PEER_STATUS_NSF_WAIT
);
519 peer
->last_reset
= PEER_DOWN_CLOSE_SESSION
;
522 BGP_EVENT_ADD(peer
, TCP_connection_closed
);
523 SET_FLAG(status
, BGP_IO_FATAL_ERR
);
/* NOTE(review): ringbuf_put() is called inside assert(); the copy into
 * ibuf_work would disappear if assert were ever compiled out -- confirm
 * that the assert from zassert.h is always active. */
525 assert(ringbuf_put(peer
->ibuf_work
, ibw
, nbytes
)
533 * Called after we have read a BGP packet header. Validates marker, message
534 * type and packet length. If any of these aren't correct, sends a notify.
536 * Assumes that there are at least BGP_HEADER_SIZE readable bytes in the input
539 static bool validate_header(struct peer
*peer
)
543 struct ringbuf
*pkt
= peer
->ibuf_work
;
545 static uint8_t m_correct
[BGP_MARKER_SIZE
] = {
546 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
547 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
548 uint8_t m_rx
[BGP_MARKER_SIZE
] = {0x00};
550 if (ringbuf_peek(pkt
, 0, m_rx
, BGP_MARKER_SIZE
) != BGP_MARKER_SIZE
)
553 if (memcmp(m_correct
, m_rx
, BGP_MARKER_SIZE
) != 0) {
554 bgp_notify_send(peer
, BGP_NOTIFY_HEADER_ERR
,
555 BGP_NOTIFY_HEADER_NOT_SYNC
);
559 /* Get size and type in network byte order. */
560 ringbuf_peek(pkt
, BGP_MARKER_SIZE
, &size
, sizeof(size
));
561 ringbuf_peek(pkt
, BGP_MARKER_SIZE
+ 2, &type
, sizeof(type
));
565 /* BGP type check. */
566 if (type
!= BGP_MSG_OPEN
&& type
!= BGP_MSG_UPDATE
567 && type
!= BGP_MSG_NOTIFY
&& type
!= BGP_MSG_KEEPALIVE
568 && type
!= BGP_MSG_ROUTE_REFRESH_NEW
569 && type
!= BGP_MSG_ROUTE_REFRESH_OLD
570 && type
!= BGP_MSG_CAPABILITY
) {
571 if (bgp_debug_neighbor_events(peer
))
572 zlog_debug("%s unknown message type 0x%02x", peer
->host
,
575 bgp_notify_send_with_data(peer
, BGP_NOTIFY_HEADER_ERR
,
576 BGP_NOTIFY_HEADER_BAD_MESTYPE
,
581 /* Minimum packet length check. */
582 if ((size
< BGP_HEADER_SIZE
) || (size
> BGP_MAX_PACKET_SIZE
)
583 || (type
== BGP_MSG_OPEN
&& size
< BGP_MSG_OPEN_MIN_SIZE
)
584 || (type
== BGP_MSG_UPDATE
&& size
< BGP_MSG_UPDATE_MIN_SIZE
)
585 || (type
== BGP_MSG_NOTIFY
&& size
< BGP_MSG_NOTIFY_MIN_SIZE
)
586 || (type
== BGP_MSG_KEEPALIVE
&& size
!= BGP_MSG_KEEPALIVE_MIN_SIZE
)
587 || (type
== BGP_MSG_ROUTE_REFRESH_NEW
588 && size
< BGP_MSG_ROUTE_REFRESH_MIN_SIZE
)
589 || (type
== BGP_MSG_ROUTE_REFRESH_OLD
590 && size
< BGP_MSG_ROUTE_REFRESH_MIN_SIZE
)
591 || (type
== BGP_MSG_CAPABILITY
592 && size
< BGP_MSG_CAPABILITY_MIN_SIZE
)) {
593 if (bgp_debug_neighbor_events(peer
)) {
594 zlog_debug("%s bad message length - %d for %s",
596 type
== 128 ? "ROUTE-REFRESH"
597 : bgp_type_str
[(int) type
]);
600 uint16_t nsize
= htons(size
);
602 bgp_notify_send_with_data(peer
, BGP_NOTIFY_HEADER_ERR
,
603 BGP_NOTIFY_HEADER_BAD_MESLEN
,
604 (unsigned char *) &nsize
, 2);