2 * Implements packet I/O in a pthread.
3 * Copyright (C) 2017 Cumulus Networks
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; see the file COPYING; if not, write to the
18 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
22 /* clang-format off */
24 #include <pthread.h> // for pthread_mutex_unlock, pthread_mutex_lock
25 #include <sys/uio.h> // for writev
27 #include "frr_pthread.h"
28 #include "linklist.h" // for list_delete, list_delete_all_node, lis...
29 #include "log.h" // for zlog_debug, safe_strerror, zlog_err
30 #include "memory.h" // for MTYPE_TMP, XCALLOC, XFREE
31 #include "network.h" // for ERRNO_IO_RETRY
32 #include "stream.h" // for stream_get_endp, stream_getw_from, str...
33 #include "ringbuf.h" // for ringbuf_remain, ringbuf_peek, ringbuf_...
34 #include "thread.h" // for THREAD_OFF, THREAD_ARG, thread...
36 #include "bgpd/bgp_io.h"
37 #include "bgpd/bgp_debug.h" // for bgp_debug_neighbor_events, bgp_type_str
38 #include "bgpd/bgp_errors.h" // for expanded error reference information
39 #include "bgpd/bgp_fsm.h" // for BGP_EVENT_ADD, bgp_event
40 #include "bgpd/bgp_packet.h" // for bgp_notify_send_with_data, bgp_notify...
41 #include "bgpd/bgp_trace.h" // for frrtraces
42 #include "bgpd/bgpd.h" // for peer, BGP_MARKER_SIZE, bgp_master, bm
45 /* forward declarations */
/*
 * Prototypes for the file-local (static) helpers defined further down.
 * bgp_write() and bgp_read() return a uint16_t status word;
 * bgp_process_writes() and bgp_process_reads() take a struct thread pointer;
 * validate_header() reports its verdict as a bool.
 */
46 static uint16_t bgp_write(struct peer
*);
47 static uint16_t bgp_read(struct peer
*peer
, int *code_p
);
48 static void bgp_process_writes(struct thread
*);
49 static void bgp_process_reads(struct thread
*);
50 static bool validate_header(struct peer
*);
52 /* generic i/o status codes */
/*
 * Bit flags that bgp_read() and bgp_write() set in their status word with
 * SET_FLAG and that bgp_process_reads() / bgp_process_writes() test with
 * CHECK_FLAG.  Each flag occupies its own bit.
 */
53 #define BGP_IO_TRANS_ERR (1 << 0) // EAGAIN or similar occurred
54 #define BGP_IO_FATAL_ERR (1 << 1) // some kind of fatal TCP error
56 /* Thread external API ----------------------------------------------------- */
/*
 * Enable write processing for a peer.
 *
 * Asserts that the peer is not in Deleted status, that peer->ibuf_work is
 * set and that neither t_connect_check_r nor t_connect_check_w is set.  It
 * then registers bgp_process_writes for peer->fd on the thread master of
 * bgp_pth_io and sets PEER_THREAD_WRITES_ON in peer->thread_flags.
 *
 * NOTE(review): in this copy the function has no braces and the
 * thread_add_write() argument list stops after peer->fd.  Lines appear to
 * have been lost; compare with the upstream file before changing code.
 */
58 void bgp_writes_on(struct peer
*peer
)
60 struct frr_pthread
*fpt
= bgp_pth_io
;
63 assert(peer
->status
!= Deleted
);
66 assert(peer
->ibuf_work
);
67 assert(!peer
->t_connect_check_r
);
68 assert(!peer
->t_connect_check_w
);
71 thread_add_write(fpt
->master
, bgp_process_writes
, peer
, peer
->fd
,
73 SET_FLAG(peer
->thread_flags
, PEER_THREAD_WRITES_ON
);
/*
 * Disable write processing for a peer.
 *
 * Requests cancellation of peer->t_write on the thread master of bgp_pth_io
 * through thread_cancel_async(), turns off
 * peer->t_generate_updgrp_packets with THREAD_OFF and clears
 * PEER_THREAD_WRITES_ON in peer->thread_flags.
 *
 * NOTE(review): the braces of this function are absent in this copy, so
 * other statements may be missing as well; compare with upstream before
 * changing code.
 */
76 void bgp_writes_off(struct peer
*peer
)
78 struct frr_pthread
*fpt
= bgp_pth_io
;
81 thread_cancel_async(fpt
->master
, &peer
->t_write
, NULL
);
82 THREAD_OFF(peer
->t_generate_updgrp_packets
);
84 UNSET_FLAG(peer
->thread_flags
, PEER_THREAD_WRITES_ON
);
/*
 * Enable read processing for a peer.
 *
 * Asserts that the peer is not in Deleted status, that peer->ibuf_work is
 * set and that neither t_connect_check_r nor t_connect_check_w is set.  It
 * then registers bgp_process_reads for peer->fd on the thread master of
 * bgp_pth_io and sets PEER_THREAD_READS_ON in peer->thread_flags.
 *
 * NOTE(review): in this copy the function has no braces and the
 * thread_add_read() argument list stops after peer->fd.  Lines appear to
 * have been lost; compare with the upstream file before changing code.
 */
87 void bgp_reads_on(struct peer
*peer
)
89 struct frr_pthread
*fpt
= bgp_pth_io
;
92 assert(peer
->status
!= Deleted
);
95 assert(peer
->ibuf_work
);
97 assert(!peer
->t_connect_check_r
);
98 assert(!peer
->t_connect_check_w
);
101 thread_add_read(fpt
->master
, bgp_process_reads
, peer
, peer
->fd
,
104 SET_FLAG(peer
->thread_flags
, PEER_THREAD_READS_ON
);
/*
 * Disable read processing for a peer.
 *
 * Asserts that bgp_pth_io is marked running, requests cancellation of
 * peer->t_read on its thread master through thread_cancel_async(), turns
 * off peer->t_process_packet and peer->t_process_packet_error with
 * THREAD_OFF and clears PEER_THREAD_READS_ON in peer->thread_flags.
 *
 * NOTE(review): the braces of this function are absent in this copy, so
 * other statements may be missing as well; compare with upstream before
 * changing code.
 */
107 void bgp_reads_off(struct peer
*peer
)
109 struct frr_pthread
*fpt
= bgp_pth_io
;
110 assert(fpt
->running
);
112 thread_cancel_async(fpt
->master
, &peer
->t_read
, NULL
);
113 THREAD_OFF(peer
->t_process_packet
);
114 THREAD_OFF(peer
->t_process_packet_error
);
116 UNSET_FLAG(peer
->thread_flags
, PEER_THREAD_READS_ON
);
119 /* Thread internal functions ----------------------------------------------- */
122 * Called from I/O pthread when a file descriptor has become ready for writing.
/*
 * Write-readiness handler; bgp_writes_on() registers it for peer->fd.
 *
 * Takes the peer from THREAD_ARG(thread).  While holding peer->io_mtx it
 * calls bgp_write() and records in `reschedule` whether peer->obuf still
 * has a stream at its head.  The returned status is then tested for
 * BGP_IO_TRANS_ERR and BGP_IO_FATAL_ERR.  The tail of the function
 * registers bgp_process_writes again for peer->fd (peer->t_write) and arms
 * bgp_generate_updgrp_packets through BGP_UPDATE_GROUP_TIMER_ON.
 *
 * NOTE(review): `peer` is declared static, so the pointer has static
 * storage duration rather than living on this call's stack -- confirm that
 * this is intended.
 * NOTE(review): the declarations of `status` and `reschedule`, the bodies
 * of both flag checks and whatever conditions guard the re-registration
 * are not present in this copy; consult upstream before changing code.
 */
124 static void bgp_process_writes(struct thread
*thread
)
126 static struct peer
*peer
;
127 peer
= THREAD_ARG(thread
);
135 struct frr_pthread
*fpt
= bgp_pth_io
;
/* bgp_write() and the obuf inspection both run under the peer's I/O mutex. */
137 frr_with_mutex(&peer
->io_mtx
) {
138 status
= bgp_write(peer
);
139 reschedule
= (stream_fifo_head(peer
->obuf
) != NULL
);
143 if (CHECK_FLAG(status
, BGP_IO_TRANS_ERR
)) {
147 if (CHECK_FLAG(status
, BGP_IO_FATAL_ERR
)) {
152 /* If suppress fib pending is enabled, route is advertised to peers when
153 * the status is received from the FIB. The delay is added
154 * to update group packet generate which will allow more routes to be
155 * sent in the update message
158 thread_add_write(fpt
->master
, bgp_process_writes
, peer
,
159 peer
->fd
, &peer
->t_write
);
161 BGP_UPDATE_GROUP_TIMER_ON(&peer
->t_generate_updgrp_packets
,
162 bgp_generate_updgrp_packets
);
167 * Called from I/O pthread when a file descriptor has become ready for reading,
170 * We read as much data as possible, process as many packets as we can and
171 * place them on peer->ibuf for secondary processing by the main thread.
// Read-readiness handler; bgp_reads_on() registers it for peer->fd.
//
// Visible flow:
//  - take the peer from THREAD_ARG(thread) and test peer->fd < 0 or
//    bm->terminating (the statement guarded by that test is not present in
//    this copy);
//  - call bgp_read(peer, &code) while holding peer->io_mtx;
//  - test the status for BGP_IO_TRANS_ERR and BGP_IO_FATAL_ERR; an event for
//    bgp_packet_process_error is queued on bm->master with `code` as its
//    value (peer->t_process_packet_error);
//  - when peer->ibuf_work holds at least BGP_HEADER_SIZE bytes and
//    validate_header() accepts them, peek the 16-bit length, convert it with
//    ntohs() and, if that many bytes are buffered, move them into a new
//    stream that is pushed onto peer->ibuf under peer->io_mtx;
//  - wipe peer->ibuf_work (the nearby comment ties this to an invalid
//    header), assert it has room for max_packet_size bytes, register
//    bgp_process_reads again for peer->fd and queue bgp_process_packet on
//    bm->master (peer->t_process_packet).
//
// NOTE(review): `peer` is declared static, so the pointer has static storage
// duration rather than living on this call's stack -- confirm that this is
// intended.
// NOTE(review): loop constructs, branch bodies, closing braces and the last
// argument of the final thread_add_read() call are not present in this copy;
// the conditions under which the steps above run cannot be determined here.
173 static void bgp_process_reads(struct thread
*thread
)
175 /* clang-format off */
176 static struct peer
*peer
; // peer to read from
177 uint16_t status
; // bgp_read status code
178 bool more
= true; // whether we got more data
179 bool fatal
= false; // whether fatal error occurred
180 bool added_pkt
= false; // whether we pushed onto ->ibuf
181 int code
= 0; // FSM code if error occurred
182 /* clang-format on */
184 peer
= THREAD_ARG(thread
);
186 if (peer
->fd
< 0 || bm
->terminating
)
189 struct frr_pthread
*fpt
= bgp_pth_io
;
// bgp_read() runs with the peer's I/O mutex held.
191 frr_with_mutex(&peer
->io_mtx
) {
192 status
= bgp_read(peer
, &code
);
195 /* error checking phase */
196 if (CHECK_FLAG(status
, BGP_IO_TRANS_ERR
)) {
197 /* no problem; just don't process packets */
201 if (CHECK_FLAG(status
, BGP_IO_FATAL_ERR
)) {
202 /* problem; tear down session */
206 /* Handle the error in the main pthread, include the
207 * specific state change from 'bgp_read'.
209 thread_add_event(bm
->master
, bgp_packet_process_error
,
210 peer
, code
, &peer
->t_process_packet_error
);
214 /* static buffer for transferring packets */
215 /* shorter alias to peer's input buffer */
216 struct ringbuf
*ibw
= peer
->ibuf_work
;
217 /* packet size as given by header */
218 uint16_t pktsize
= 0;
220 /* check that we have enough data for a header */
221 if (ringbuf_remain(ibw
) < BGP_HEADER_SIZE
)
224 /* check that header is valid */
225 if (!validate_header(peer
)) {
230 /* header is valid; retrieve packet size */
// The length field is read from offset BGP_MARKER_SIZE in the ring buffer.
231 ringbuf_peek(ibw
, BGP_MARKER_SIZE
, &pktsize
, sizeof(pktsize
));
233 pktsize
= ntohs(pktsize
);
235 /* if this fails we are seriously screwed */
236 assert(pktsize
<= peer
->max_packet_size
);
239 * If we have that much data, chuck it into its own
240 * stream and append to input queue for processing.
242 if (ringbuf_remain(ibw
) >= pktsize
) {
243 struct stream
*pkt
= stream_new(pktsize
);
245 assert(STREAM_WRITEABLE(pkt
) == pktsize
);
// NOTE(review): ringbuf_get() is evaluated inside assert(); if assert() can be
// compiled out in this build the copy would not happen -- confirm that the
// project's assert is always active.
246 assert(ringbuf_get(ibw
, pkt
->data
, pktsize
) == pktsize
);
247 stream_set_endp(pkt
, pktsize
);
249 frrtrace(2, frr_bgp
, packet_read
, peer
, pkt
);
// The completed packet is pushed onto peer->ibuf under the I/O mutex.
250 frr_with_mutex(&peer
->io_mtx
) {
251 stream_fifo_push(peer
->ibuf
, pkt
);
259 /* handle invalid header */
261 /* wipe buffer just in case someone screwed up */
262 ringbuf_wipe(peer
->ibuf_work
);
264 assert(ringbuf_space(peer
->ibuf_work
) >= peer
->max_packet_size
);
266 thread_add_read(fpt
->master
, bgp_process_reads
, peer
, peer
->fd
,
269 thread_add_event(bm
->master
, bgp_process_packet
,
270 peer
, 0, &peer
->t_process_packet
);
275 * Flush peer output buffer.
277 * This function pops packets off of peer->obuf and writes them to peer->fd.
278 * The amount of packets written is equal to the minimum of peer->wpkt_quanta
279 * and the number of packets on the output buffer, unless an error occurs.
281 * If write() returns an error, the appropriate FSM event is generated.
283 * The return value is equal to the number of packets written
284 * (which may be zero).
/*
 * Write queued packets from peer->obuf to peer->fd with writev().
 *
 * What the visible code does:
 *  - loads peer->bgp->wpkt_quanta with a relaxed atomic load and uses the
 *    value to size two variable-length arrays, `ostreams` and `iov`;
 *  - starting from stream_fifo_head(peer->obuf), fills one iovec per stream
 *    (stream_pnt / STREAM_READABLE) and sums the byte count in `writenum`,
 *    for as long as `count` is below the quanta, `iovsz` is below
 *    array_size(iov) and a stream is available;
 *  - calls writev().  On the error path a non-retryable errno raises
 *    TCP_fatal_error through BGP_EVENT_ADD and sets BGP_IO_FATAL_ERR, and
 *    BGP_IO_TRANS_ERR is set as well; the branch structure separating the
 *    two is not present in this copy;
 *  - when writev() reports a byte count different from `writenum`, scans the
 *    iovecs against that count, adds `msg_written` to `total_written`,
 *    shifts `iov` and `streams` past the first `msg_written` entries,
 *    advances the read pointer of the new first stream by `num` and rebuilds
 *    iov[0]; the enclosing loop repeats while num != writenum;
 *  - pops `total_written` streams from peer->obuf, asserting that each one
 *    matches ostreams[i], reads the type byte at offset BGP_MARKER_SIZE + 2
 *    and bumps the matching per-peer counter with a relaxed atomic add;
 *    after the notify_out increment it caps peer->v_start at 120 and raises
 *    BGP_Stop;
 *  - stores `now` into peer->last_update and, when update_last_write is
 *    set, into peer->last_write and peer->last_sendq_ok.
 *
 * NOTE(review): the caller tests the returned value with CHECK_FLAG against
 * the BGP_IO_* flags, and this function sets those flags in `status`; the
 * older header text saying the return value is the number of packets
 * written looks stale.  The return statement itself is not present in this
 * copy, so confirm against upstream.
 * NOTE(review): both VLAs are sized by wpkt_quanta; a value of zero or a
 * very large value would be a problem -- confirm the range enforced where
 * the value is configured.
 * NOTE(review): many lines (declarations, braces, case labels, the inner
 * loop bodies) are missing here; do not edit code from this copy.
 */
286 static uint16_t bgp_write(struct peer
*peer
)
290 int update_last_write
= 0;
294 uint32_t wpkt_quanta_old
;
300 unsigned int total_written
;
303 wpkt_quanta_old
= atomic_load_explicit(&peer
->bgp
->wpkt_quanta
,
304 memory_order_relaxed
);
/* Both arrays below are VLAs sized by the quanta value loaded above. */
305 struct stream
*ostreams
[wpkt_quanta_old
];
306 struct stream
**streams
= ostreams
;
307 struct iovec iov
[wpkt_quanta_old
];
309 s
= stream_fifo_head(peer
->obuf
);
315 while (count
< wpkt_quanta_old
&& iovsz
< array_size(iov
) && s
) {
317 iov
[iovsz
].iov_base
= stream_pnt(s
);
318 iov
[iovsz
].iov_len
= STREAM_READABLE(s
);
319 writenum
+= STREAM_READABLE(s
);
329 num
= writev(peer
->fd
, iov
, iovsz
);
332 if (!ERRNO_IO_RETRY(errno
)) {
333 BGP_EVENT_ADD(peer
, TCP_fatal_error
);
334 SET_FLAG(status
, BGP_IO_FATAL_ERR
);
336 SET_FLAG(status
, BGP_IO_TRANS_ERR
);
340 } else if (num
!= writenum
) {
341 unsigned int msg_written
= 0;
342 unsigned int ic
= iovsz
;
344 for (unsigned int i
= 0; i
< ic
; i
++) {
345 size_t ss
= iov
[i
].iov_len
;
347 if (ss
> (unsigned int) num
)
356 total_written
+= msg_written
;
358 assert(total_written
< count
);
360 memmove(&iov
, &iov
[msg_written
],
361 sizeof(iov
[0]) * iovsz
);
362 streams
= &streams
[msg_written
];
363 stream_forward_getp(streams
[0], num
);
364 iov
[0].iov_base
= stream_pnt(streams
[0]);
365 iov
[0].iov_len
= STREAM_READABLE(streams
[0]);
369 assert(writenum
> 0);
371 total_written
= strmsz
;
374 } while (num
!= writenum
);
376 /* Handle statistics */
377 for (unsigned int i
= 0; i
< total_written
; i
++) {
378 s
= stream_fifo_pop(peer
->obuf
);
/* Streams are expected to come off obuf in the order they were gathered. */
380 assert(s
== ostreams
[i
]);
382 /* Retrieve BGP packet type. */
383 stream_set_getp(s
, BGP_MARKER_SIZE
+ 2);
384 type
= stream_getc(s
);
388 atomic_fetch_add_explicit(&peer
->open_out
, 1,
389 memory_order_relaxed
);
392 atomic_fetch_add_explicit(&peer
->update_out
, 1,
393 memory_order_relaxed
);
397 atomic_fetch_add_explicit(&peer
->notify_out
, 1,
398 memory_order_relaxed
);
399 /* Double start timer. */
402 /* Overflow check. */
403 if (peer
->v_start
>= (60 * 2))
404 peer
->v_start
= (60 * 2);
407 * Handle Graceful Restart case where the state changes
408 * to Connect instead of Idle.
410 BGP_EVENT_ADD(peer
, BGP_Stop
);
413 case BGP_MSG_KEEPALIVE
:
414 atomic_fetch_add_explicit(&peer
->keepalive_out
, 1,
415 memory_order_relaxed
);
417 case BGP_MSG_ROUTE_REFRESH_NEW
:
418 case BGP_MSG_ROUTE_REFRESH_OLD
:
419 atomic_fetch_add_explicit(&peer
->refresh_out
, 1,
420 memory_order_relaxed
);
422 case BGP_MSG_CAPABILITY
:
423 atomic_fetch_add_explicit(&peer
->dynamic_cap_out
, 1,
424 memory_order_relaxed
);
430 update_last_write
= 1;
436 * Update last_update if UPDATEs were written.
437 * Note: that these are only updated at end,
438 * not per message (i.e., per loop)
441 atomic_store_explicit(&peer
->last_update
, now
,
442 memory_order_relaxed
);
444 /* If we TXed any flavor of packet */
445 if (update_last_write
) {
446 atomic_store_explicit(&peer
->last_write
, now
,
447 memory_order_relaxed
);
448 peer
->last_sendq_ok
= now
;
456 * Reads a chunk of data from peer->fd into peer->ibuf_work.
459 * Pointer to location to store FSM event code in case of fatal error.
461 * @return status flag (see top-of-file)
/*
 * Read once from peer->fd into peer->ibuf_scratch and hand the bytes to
 * peer->ibuf_work.
 *
 * read() is called with `readsize` as its length; the visible MIN()
 * expression (free space in ibuf_work versus sizeof(peer->ibuf_scratch)) is
 * presumably its initializer, but the assignment line is not present in
 * this copy.  Outcomes visible here:
 *  - read() < 0 with a retryable errno (ERRNO_IO_RETRY): sets
 *    BGP_IO_TRANS_ERR;
 *  - read() < 0 otherwise: logs through flog_err, stores TCP_fatal_error in
 *    *code_p and sets BGP_IO_FATAL_ERR;
 *  - read() == 0: logs at debug level when bgp_debug_neighbor_events(peer)
 *    is true, stores TCP_connection_closed in *code_p and sets
 *    BGP_IO_FATAL_ERR;
 *  - otherwise the bytes are passed to ringbuf_put().
 *
 * peer    peer whose fd is read and whose buffers are filled
 * code_p  receives the FSM event code on the two fatal paths
 *
 * Returns a uint16_t built from the BGP_IO_* flags.
 *
 * NOTE(review): the declaration of `status`, the return statement and the
 * tail of the final assert() are not present in this copy; compare with
 * upstream before changing code.
 */
463 static uint16_t bgp_read(struct peer
*peer
, int *code_p
)
465 size_t readsize
; // how many bytes we want to read
466 ssize_t nbytes
; // how many bytes we actually read
470 MIN(ringbuf_space(peer
->ibuf_work
), sizeof(peer
->ibuf_scratch
));
471 nbytes
= read(peer
->fd
, peer
->ibuf_scratch
, readsize
);
473 /* EAGAIN or EWOULDBLOCK; come back later */
474 if (nbytes
< 0 && ERRNO_IO_RETRY(errno
)) {
475 SET_FLAG(status
, BGP_IO_TRANS_ERR
);
476 } else if (nbytes
< 0) {
477 /* Fatal error; tear down session */
478 flog_err(EC_BGP_UPDATE_RCV
,
479 "%s [Error] bgp_read_packet error: %s", peer
->host
,
480 safe_strerror(errno
));
482 /* Handle the error in the main pthread. */
484 *code_p
= TCP_fatal_error
;
486 SET_FLAG(status
, BGP_IO_FATAL_ERR
);
488 } else if (nbytes
== 0) {
489 /* Received EOF / TCP session closed */
490 if (bgp_debug_neighbor_events(peer
))
491 zlog_debug("%s [Event] BGP connection closed fd %d",
492 peer
->host
, peer
->fd
);
494 /* Handle the error in the main pthread. */
496 *code_p
= TCP_connection_closed
;
498 SET_FLAG(status
, BGP_IO_FATAL_ERR
);
/*
 * NOTE(review): ringbuf_put() is evaluated inside assert(); if assert() can
 * be compiled out in this build the copy would not happen -- confirm that
 * the project's assert is always active.
 */
500 assert(ringbuf_put(peer
->ibuf_work
, peer
->ibuf_scratch
, nbytes
)
508 * Called after we have read a BGP packet header. Validates marker, message
509 * type and packet length. If any of these aren't correct, sends a notify.
511 * Assumes that there are at least BGP_HEADER_SIZE readable bytes in the input
/*
 * Check the BGP header at the front of peer->ibuf_work.
 *
 * Steps visible in this copy:
 *  1. peek BGP_MARKER_SIZE bytes at offset 0 and compare them with the
 *     all-0xff pattern in m_correct; on mismatch call bgp_notify_send() with
 *     BGP_NOTIFY_HEADER_ERR / BGP_NOTIFY_HEADER_NOT_SYNC;
 *  2. peek `size` at offset BGP_MARKER_SIZE and `type` at offset
 *     BGP_MARKER_SIZE + 2;
 *  3. if the type is none of OPEN, UPDATE, NOTIFY, KEEPALIVE,
 *     ROUTE_REFRESH_NEW, ROUTE_REFRESH_OLD or CAPABILITY, optionally log it
 *     and call bgp_notify_send_with_data() with
 *     BGP_NOTIFY_HEADER_BAD_MESTYPE and the type byte;
 *  4. if the size is below BGP_HEADER_SIZE, above peer->max_packet_size or
 *     below the per-type minimum (KEEPALIVE must equal
 *     BGP_MSG_KEEPALIVE_MIN_SIZE exactly), optionally log it and call
 *     bgp_notify_send_with_data() with BGP_NOTIFY_HEADER_BAD_MESLEN and the
 *     size converted by htons().
 *
 * Returns bool.  The caller treats a false result as an invalid header; the
 * return statements themselves are not present in this copy.
 *
 * NOTE(review): the declarations of `size` and `type` and any conversion of
 * `size` from network byte order between steps 2 and 4 are not visible
 * here; the htons() in step 4 suggests `size` is in host order by then --
 * confirm against upstream.
 */
514 static bool validate_header(struct peer
*peer
)
518 struct ringbuf
*pkt
= peer
->ibuf_work
;
520 static const uint8_t m_correct
[BGP_MARKER_SIZE
] = {
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
523 uint8_t m_rx
[BGP_MARKER_SIZE
] = {0x00};
525 if (ringbuf_peek(pkt
, 0, m_rx
, BGP_MARKER_SIZE
) != BGP_MARKER_SIZE
)
528 if (memcmp(m_correct
, m_rx
, BGP_MARKER_SIZE
) != 0) {
529 bgp_notify_send(peer
, BGP_NOTIFY_HEADER_ERR
,
530 BGP_NOTIFY_HEADER_NOT_SYNC
);
534 /* Get size and type in network byte order. */
535 ringbuf_peek(pkt
, BGP_MARKER_SIZE
, &size
, sizeof(size
));
536 ringbuf_peek(pkt
, BGP_MARKER_SIZE
+ 2, &type
, sizeof(type
));
540 /* BGP type check. */
541 if (type
!= BGP_MSG_OPEN
&& type
!= BGP_MSG_UPDATE
542 && type
!= BGP_MSG_NOTIFY
&& type
!= BGP_MSG_KEEPALIVE
543 && type
!= BGP_MSG_ROUTE_REFRESH_NEW
544 && type
!= BGP_MSG_ROUTE_REFRESH_OLD
545 && type
!= BGP_MSG_CAPABILITY
) {
546 if (bgp_debug_neighbor_events(peer
))
547 zlog_debug("%s unknown message type 0x%02x", peer
->host
,
550 bgp_notify_send_with_data(peer
, BGP_NOTIFY_HEADER_ERR
,
551 BGP_NOTIFY_HEADER_BAD_MESTYPE
, &type
,
556 /* Minimum packet length check. */
557 if ((size
< BGP_HEADER_SIZE
) || (size
> peer
->max_packet_size
)
558 || (type
== BGP_MSG_OPEN
&& size
< BGP_MSG_OPEN_MIN_SIZE
)
559 || (type
== BGP_MSG_UPDATE
&& size
< BGP_MSG_UPDATE_MIN_SIZE
)
560 || (type
== BGP_MSG_NOTIFY
&& size
< BGP_MSG_NOTIFY_MIN_SIZE
)
561 || (type
== BGP_MSG_KEEPALIVE
&& size
!= BGP_MSG_KEEPALIVE_MIN_SIZE
)
562 || (type
== BGP_MSG_ROUTE_REFRESH_NEW
563 && size
< BGP_MSG_ROUTE_REFRESH_MIN_SIZE
)
564 || (type
== BGP_MSG_ROUTE_REFRESH_OLD
565 && size
< BGP_MSG_ROUTE_REFRESH_MIN_SIZE
)
566 || (type
== BGP_MSG_CAPABILITY
567 && size
< BGP_MSG_CAPABILITY_MIN_SIZE
)) {
568 if (bgp_debug_neighbor_events(peer
)) {
569 zlog_debug("%s bad message length - %d for %s",
571 type
== 128 ? "ROUTE-REFRESH"
572 : bgp_type_str
[(int)type
]);
575 uint16_t nsize
= htons(size
);
577 bgp_notify_send_with_data(peer
, BGP_NOTIFY_HEADER_ERR
,
578 BGP_NOTIFY_HEADER_BAD_MESLEN
,
579 (unsigned char *)&nsize
, 2);