]> git.proxmox.com Git - mirror_frr.git/blame - bgpd/bgp_io.c
Merge pull request #3394 from karamalla0406/frr3360
[mirror_frr.git] / bgpd / bgp_io.c
CommitLineData
958b450c 1/* BGP I/O.
51abb4b4 2 * Implements packet I/O in a pthread.
958b450c 3 * Copyright (C) 2017 Cumulus Networks
51abb4b4 4 * Quentin Young
958b450c
QY
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; see the file COPYING; if not, write to the
18 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
19 * MA 02110-1301 USA
56257a44
QY
20 */
21
95158b0c 22/* clang-format off */
42cf651e 23#include <zebra.h>
95158b0c 24#include <pthread.h> // for pthread_mutex_unlock, pthread_mutex_lock
56257a44 25
1ac267a2 26#include "frr_pthread.h"
95158b0c
QY
27#include "linklist.h" // for list_delete, list_delete_all_node, lis...
28#include "log.h" // for zlog_debug, safe_strerror, zlog_err
29#include "memory.h" // for MTYPE_TMP, XCALLOC, XFREE
30#include "network.h" // for ERRNO_IO_RETRY
31#include "stream.h" // for stream_get_endp, stream_getw_from, str...
74ffbfe6 32#include "ringbuf.h" // for ringbuf_remain, ringbuf_peek, ringbuf_...
95158b0c
QY
33#include "thread.h" // for THREAD_OFF, THREAD_ARG, thread, thread...
34#include "zassert.h" // for assert
56257a44 35
42cf651e 36#include "bgpd/bgp_io.h"
95158b0c 37#include "bgpd/bgp_debug.h" // for bgp_debug_neighbor_events, bgp_type_str
14454c9f 38#include "bgpd/bgp_errors.h" // for expanded error reference information
95158b0c
QY
39#include "bgpd/bgp_fsm.h" // for BGP_EVENT_ADD, bgp_event
40#include "bgpd/bgp_packet.h" // for bgp_notify_send_with_data, bgp_notify...
41#include "bgpd/bgpd.h" // for peer, BGP_MARKER_SIZE, bgp_master, bm
42/* clang-format on */
56257a44 43
/*
 * Forward declarations for the file-local helpers defined further down.
 */
static uint16_t bgp_write(struct peer *peer);
static uint16_t bgp_read(struct peer *peer);
static int bgp_process_writes(struct thread *thread);
static int bgp_process_reads(struct thread *thread);
static bool validate_header(struct peer *peer);

/*
 * Generic I/O status codes. These are bit flags OR'ed into the uint16_t
 * status returned by bgp_read() and bgp_write().
 */
#define BGP_IO_TRANS_ERR (1 << 0) // EAGAIN or similar occurred
#define BGP_IO_FATAL_ERR (1 << 1) // some kind of fatal TCP error
56257a44 54
a715eab3 55/* Thread external API ----------------------------------------------------- */
56257a44 56
424ab01d 57void bgp_writes_on(struct peer *peer)
56257a44 58{
1ac267a2 59 struct frr_pthread *fpt = bgp_pth_io;
a715eab3 60 assert(fpt->running);
f09a656d 61
424ab01d
QY
62 assert(peer->status != Deleted);
63 assert(peer->obuf);
64 assert(peer->ibuf);
65 assert(peer->ibuf_work);
387f984e
QY
66 assert(!peer->t_connect_check_r);
67 assert(!peer->t_connect_check_w);
424ab01d 68 assert(peer->fd);
56257a44 69
b750b0ba
QY
70 thread_add_write(fpt->master, bgp_process_writes, peer, peer->fd,
71 &peer->t_write);
72 SET_FLAG(peer->thread_flags, PEER_THREAD_WRITES_ON);
424ab01d 73}
56257a44 74
424ab01d
QY
75void bgp_writes_off(struct peer *peer)
76{
1ac267a2 77 struct frr_pthread *fpt = bgp_pth_io;
a715eab3 78 assert(fpt->running);
151044ce 79
b750b0ba
QY
80 thread_cancel_async(fpt->master, &peer->t_write, NULL);
81 THREAD_OFF(peer->t_generate_updgrp_packets);
56257a44 82
b750b0ba 83 UNSET_FLAG(peer->thread_flags, PEER_THREAD_WRITES_ON);
56257a44
QY
84}
85
424ab01d 86void bgp_reads_on(struct peer *peer)
56257a44 87{
1ac267a2 88 struct frr_pthread *fpt = bgp_pth_io;
a715eab3 89 assert(fpt->running);
f09a656d 90
424ab01d
QY
91 assert(peer->status != Deleted);
92 assert(peer->ibuf);
93 assert(peer->fd);
94 assert(peer->ibuf_work);
424ab01d 95 assert(peer->obuf);
387f984e
QY
96 assert(!peer->t_connect_check_r);
97 assert(!peer->t_connect_check_w);
424ab01d
QY
98 assert(peer->fd);
99
b750b0ba
QY
100 thread_add_read(fpt->master, bgp_process_reads, peer, peer->fd,
101 &peer->t_read);
102
103 SET_FLAG(peer->thread_flags, PEER_THREAD_READS_ON);
56257a44
QY
104}
105
424ab01d 106void bgp_reads_off(struct peer *peer)
56257a44 107{
1ac267a2 108 struct frr_pthread *fpt = bgp_pth_io;
a715eab3 109 assert(fpt->running);
151044ce 110
b750b0ba
QY
111 thread_cancel_async(fpt->master, &peer->t_read, NULL);
112 THREAD_OFF(peer->t_process_packet);
56257a44 113
b750b0ba 114 UNSET_FLAG(peer->thread_flags, PEER_THREAD_READS_ON);
56257a44
QY
115}
116
a715eab3 117/* Thread internal functions ----------------------------------------------- */
51abb4b4 118
a715eab3 119/*
51abb4b4 120 * Called from I/O pthread when a file descriptor has become ready for writing.
424ab01d
QY
121 */
122static int bgp_process_writes(struct thread *thread)
56257a44 123{
424ab01d
QY
124 static struct peer *peer;
125 peer = THREAD_ARG(thread);
126 uint16_t status;
b750b0ba 127 bool reschedule;
bbac44ac 128 bool fatal = false;
424ab01d
QY
129
130 if (peer->fd < 0)
131 return -1;
132
1ac267a2 133 struct frr_pthread *fpt = bgp_pth_io;
424ab01d 134
424ab01d 135 pthread_mutex_lock(&peer->io_mtx);
56257a44 136 {
424ab01d
QY
137 status = bgp_write(peer);
138 reschedule = (stream_fifo_head(peer->obuf) != NULL);
139 }
140 pthread_mutex_unlock(&peer->io_mtx);
56257a44 141
a715eab3
QY
142 /* no problem */
143 if (CHECK_FLAG(status, BGP_IO_TRANS_ERR)) {
56257a44 144 }
56257a44 145
a715eab3 146 /* problem */
bbac44ac 147 if (CHECK_FLAG(status, BGP_IO_FATAL_ERR)) {
a715eab3 148 reschedule = false;
bbac44ac
QY
149 fatal = true;
150 }
424ab01d
QY
151
152 if (reschedule) {
153 thread_add_write(fpt->master, bgp_process_writes, peer,
154 peer->fd, &peer->t_write);
b785b7ad
QY
155 } else if (!fatal) {
156 BGP_TIMER_ON(peer->t_generate_updgrp_packets,
157 bgp_generate_updgrp_packets, 0);
424ab01d
QY
158 }
159
160 return 0;
56257a44
QY
161}
162
a715eab3 163/*
51abb4b4
QY
164 * Called from I/O pthread when a file descriptor has become ready for reading,
165 * or has hung up.
9eb217ff
QY
166 *
167 * We read as much data as possible, process as many packets as we can and
168 * place them on peer->ibuf for secondary processing by the main thread.
56257a44 169 */
424ab01d 170static int bgp_process_reads(struct thread *thread)
56257a44 171{
e11eeb8c
QY
172 /* clang-format off */
173 static struct peer *peer; // peer to read from
174 uint16_t status; // bgp_read status code
175 bool more = true; // whether we got more data
176 bool fatal = false; // whether fatal error occurred
177 bool added_pkt = false; // whether we pushed onto ->ibuf
e11eeb8c 178 /* clang-format on */
9eb217ff 179
424ab01d 180 peer = THREAD_ARG(thread);
424ab01d 181
97b4a0ec 182 if (peer->fd < 0 || bm->terminating)
424ab01d
QY
183 return -1;
184
1ac267a2 185 struct frr_pthread *fpt = bgp_pth_io;
424ab01d 186
424ab01d 187 pthread_mutex_lock(&peer->io_mtx);
56257a44 188 {
424ab01d
QY
189 status = bgp_read(peer);
190 }
191 pthread_mutex_unlock(&peer->io_mtx);
192
9eb217ff
QY
193 /* error checking phase */
194 if (CHECK_FLAG(status, BGP_IO_TRANS_ERR)) {
195 /* no problem; just don't process packets */
196 more = false;
197 }
424ab01d 198
9eb217ff
QY
199 if (CHECK_FLAG(status, BGP_IO_FATAL_ERR)) {
200 /* problem; tear down session */
201 more = false;
202 fatal = true;
56257a44 203 }
56257a44 204
9eb217ff
QY
205 while (more) {
206 /* static buffer for transferring packets */
207 static unsigned char pktbuf[BGP_MAX_PACKET_SIZE];
208 /* shorter alias to peer's input buffer */
74ffbfe6 209 struct ringbuf *ibw = peer->ibuf_work;
9eb217ff 210 /* packet size as given by header */
74ffbfe6 211 uint16_t pktsize = 0;
9eb217ff
QY
212
213 /* check that we have enough data for a header */
74ffbfe6 214 if (ringbuf_remain(ibw) < BGP_HEADER_SIZE)
9eb217ff 215 break;
424ab01d 216
a2b6e694 217 /* check that header is valid */
218 if (!validate_header(peer)) {
9eb217ff
QY
219 fatal = true;
220 break;
424ab01d 221 }
424ab01d 222
9eb217ff 223 /* header is valid; retrieve packet size */
74ffbfe6
QY
224 ringbuf_peek(ibw, BGP_MARKER_SIZE, &pktsize, sizeof(pktsize));
225
226 pktsize = ntohs(pktsize);
424ab01d 227
9eb217ff
QY
228 /* if this fails we are seriously screwed */
229 assert(pktsize <= BGP_MAX_PACKET_SIZE);
230
a715eab3
QY
231 /*
232 * If we have that much data, chuck it into its own
233 * stream and append to input queue for processing.
234 */
74ffbfe6 235 if (ringbuf_remain(ibw) >= pktsize) {
9eb217ff 236 struct stream *pkt = stream_new(pktsize);
74ffbfe6 237 assert(ringbuf_get(ibw, pktbuf, pktsize) == pktsize);
9eb217ff
QY
238 stream_put(pkt, pktbuf, pktsize);
239
240 pthread_mutex_lock(&peer->io_mtx);
241 {
242 stream_fifo_push(peer->ibuf, pkt);
243 }
244 pthread_mutex_unlock(&peer->io_mtx);
245
246 added_pkt = true;
247 } else
248 break;
249 }
250
74ffbfe6 251 assert(ringbuf_space(peer->ibuf_work) >= BGP_MAX_PACKET_SIZE);
9eb217ff
QY
252
253 /* handle invalid header */
254 if (fatal) {
9eb217ff 255 /* wipe buffer just in case someone screwed up */
74ffbfe6 256 ringbuf_wipe(peer->ibuf_work);
9eb217ff 257 } else {
424ab01d
QY
258 thread_add_read(fpt->master, bgp_process_reads, peer, peer->fd,
259 &peer->t_read);
9eb217ff 260 if (added_pkt)
7a86aa5a
QY
261 thread_add_timer_msec(bm->master, bgp_process_packet,
262 peer, 0, &peer->t_process_packet);
9eb217ff 263 }
424ab01d
QY
264
265 return 0;
56257a44
QY
266}
267
a715eab3 268/*
56257a44
QY
269 * Flush peer output buffer.
270 *
271 * This function pops packets off of peer->obuf and writes them to peer->fd.
272 * The amount of packets written is equal to the minimum of peer->wpkt_quanta
424ab01d 273 * and the number of packets on the output buffer, unless an error occurs.
56257a44
QY
274 *
275 * If write() returns an error, the appropriate FSM event is generated.
276 *
277 * The return value is equal to the number of packets written
278 * (which may be zero).
279 */
424ab01d 280static uint16_t bgp_write(struct peer *peer)
56257a44 281{
d7c0a89a 282 uint8_t type;
56257a44
QY
283 struct stream *s;
284 int num;
285 int update_last_write = 0;
286 unsigned int count = 0;
eb2277cf 287 uint32_t uo = 0;
424ab01d 288 uint16_t status = 0;
555e09d4 289 uint32_t wpkt_quanta_old;
56257a44 290
996c9314
LB
291 wpkt_quanta_old = atomic_load_explicit(&peer->bgp->wpkt_quanta,
292 memory_order_relaxed);
555e09d4
QY
293
294 while (count < wpkt_quanta_old && (s = stream_fifo_head(peer->obuf))) {
56257a44
QY
295 int writenum;
296 do {
297 writenum = stream_get_endp(s) - stream_get_getp(s);
26c08e95 298 num = write(peer->fd, stream_pnt(s), writenum);
56257a44
QY
299
300 if (num < 0) {
424ab01d 301 if (!ERRNO_IO_RETRY(errno)) {
56257a44 302 BGP_EVENT_ADD(peer, TCP_fatal_error);
424ab01d
QY
303 SET_FLAG(status, BGP_IO_FATAL_ERR);
304 } else {
305 SET_FLAG(status, BGP_IO_TRANS_ERR);
306 }
56257a44
QY
307
308 goto done;
a715eab3 309 } else if (num != writenum)
56257a44
QY
310 stream_forward_getp(s, num);
311
312 } while (num != writenum);
313
314 /* Retrieve BGP packet type. */
315 stream_set_getp(s, BGP_MARKER_SIZE + 2);
316 type = stream_getc(s);
317
318 switch (type) {
319 case BGP_MSG_OPEN:
1588f6f4
QY
320 atomic_fetch_add_explicit(&peer->open_out, 1,
321 memory_order_relaxed);
56257a44
QY
322 break;
323 case BGP_MSG_UPDATE:
1588f6f4
QY
324 atomic_fetch_add_explicit(&peer->update_out, 1,
325 memory_order_relaxed);
eb2277cf 326 uo++;
56257a44
QY
327 break;
328 case BGP_MSG_NOTIFY:
1588f6f4
QY
329 atomic_fetch_add_explicit(&peer->notify_out, 1,
330 memory_order_relaxed);
56257a44
QY
331 /* Double start timer. */
332 peer->v_start *= 2;
333
334 /* Overflow check. */
335 if (peer->v_start >= (60 * 2))
336 peer->v_start = (60 * 2);
337
a715eab3
QY
338 /*
339 * Handle Graceful Restart case where the state changes
340 * to Connect instead of Idle.
341 */
56257a44
QY
342 BGP_EVENT_ADD(peer, BGP_Stop);
343 goto done;
344
345 case BGP_MSG_KEEPALIVE:
1588f6f4
QY
346 atomic_fetch_add_explicit(&peer->keepalive_out, 1,
347 memory_order_relaxed);
56257a44
QY
348 break;
349 case BGP_MSG_ROUTE_REFRESH_NEW:
350 case BGP_MSG_ROUTE_REFRESH_OLD:
1588f6f4
QY
351 atomic_fetch_add_explicit(&peer->refresh_out, 1,
352 memory_order_relaxed);
56257a44
QY
353 break;
354 case BGP_MSG_CAPABILITY:
1588f6f4
QY
355 atomic_fetch_add_explicit(&peer->dynamic_cap_out, 1,
356 memory_order_relaxed);
56257a44
QY
357 break;
358 }
359
360 count++;
424ab01d 361
56257a44
QY
362 stream_free(stream_fifo_pop(peer->obuf));
363 update_last_write = 1;
364 }
365
366done : {
eb2277cf
LB
367 /*
368 * Update last_update if UPDATEs were written.
369 * Note: that these are only updated at end,
370 * not per message (i.e., per loop)
371 */
372 if (uo)
1588f6f4
QY
373 atomic_store_explicit(&peer->last_update, bgp_clock(),
374 memory_order_relaxed);
56257a44 375
5c075a90 376 /* If we TXed any flavor of packet */
56257a44 377 if (update_last_write)
1588f6f4
QY
378 atomic_store_explicit(&peer->last_write, bgp_clock(),
379 memory_order_relaxed);
56257a44
QY
380}
381
424ab01d
QY
382 return status;
383}
384
a715eab3 385/*
51abb4b4 386 * Reads a chunk of data from peer->fd into peer->ibuf_work.
424ab01d 387 *
51abb4b4 388 * @return status flag (see top-of-file)
424ab01d
QY
389 */
390static uint16_t bgp_read(struct peer *peer)
391{
b750b0ba
QY
392 size_t readsize; // how many bytes we want to read
393 ssize_t nbytes; // how many bytes we actually read
424ab01d 394 uint16_t status = 0;
74ffbfe6 395 static uint8_t ibw[BGP_MAX_PACKET_SIZE * BGP_READ_PACKET_MAX];
424ab01d 396
74ffbfe6
QY
397 readsize = MIN(ringbuf_space(peer->ibuf_work), sizeof(ibw));
398 nbytes = read(peer->fd, ibw, readsize);
424ab01d 399
74ffbfe6
QY
400 /* EAGAIN or EWOULDBLOCK; come back later */
401 if (nbytes < 0 && ERRNO_IO_RETRY(errno)) {
402 SET_FLAG(status, BGP_IO_TRANS_ERR);
996c9314 403 /* Fatal error; tear down session */
74ffbfe6 404 } else if (nbytes < 0) {
e50f7cfd 405 flog_err(EC_BGP_UPDATE_RCV,
1c50c1c0
QY
406 "%s [Error] bgp_read_packet error: %s", peer->host,
407 safe_strerror(errno));
85145b62
QY
408
409 if (peer->status == Established) {
410 if (CHECK_FLAG(peer->sflags, PEER_STATUS_NSF_MODE)) {
411 peer->last_reset = PEER_DOWN_NSF_CLOSE_SESSION;
412 SET_FLAG(peer->sflags, PEER_STATUS_NSF_WAIT);
413 } else
414 peer->last_reset = PEER_DOWN_CLOSE_SESSION;
415 }
424ab01d 416
85145b62
QY
417 BGP_EVENT_ADD(peer, TCP_fatal_error);
418 SET_FLAG(status, BGP_IO_FATAL_ERR);
996c9314 419 /* Received EOF / TCP session closed */
74ffbfe6 420 } else if (nbytes == 0) {
85145b62
QY
421 if (bgp_debug_neighbor_events(peer))
422 zlog_debug("%s [Event] BGP connection closed fd %d",
423 peer->host, peer->fd);
424
425 if (peer->status == Established) {
426 if (CHECK_FLAG(peer->sflags, PEER_STATUS_NSF_MODE)) {
427 peer->last_reset = PEER_DOWN_NSF_CLOSE_SESSION;
428 SET_FLAG(peer->sflags, PEER_STATUS_NSF_WAIT);
429 } else
430 peer->last_reset = PEER_DOWN_CLOSE_SESSION;
424ab01d
QY
431 }
432
85145b62
QY
433 BGP_EVENT_ADD(peer, TCP_connection_closed);
434 SET_FLAG(status, BGP_IO_FATAL_ERR);
74ffbfe6
QY
435 } else {
436 assert(ringbuf_put(peer->ibuf_work, ibw, nbytes)
437 == (size_t)nbytes);
424ab01d
QY
438 }
439
424ab01d
QY
440 return status;
441}
442
443/*
444 * Called after we have read a BGP packet header. Validates marker, message
445 * type and packet length. If any of these aren't correct, sends a notify.
74ffbfe6
QY
446 *
447 * Assumes that there are at least BGP_HEADER_SIZE readable bytes in the input
448 * buffer.
424ab01d
QY
449 */
450static bool validate_header(struct peer *peer)
451{
3fe63c29
QY
452 uint16_t size;
453 uint8_t type;
74ffbfe6 454 struct ringbuf *pkt = peer->ibuf_work;
424ab01d 455
74ffbfe6
QY
456 static uint8_t m_correct[BGP_MARKER_SIZE] = {
457 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
458 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
459 uint8_t m_rx[BGP_MARKER_SIZE] = {0x00};
442c9afb 460
74ffbfe6
QY
461 if (ringbuf_peek(pkt, 0, m_rx, BGP_MARKER_SIZE) != BGP_MARKER_SIZE)
462 return false;
463
464 if (memcmp(m_correct, m_rx, BGP_MARKER_SIZE) != 0) {
442c9afb
QY
465 bgp_notify_send(peer, BGP_NOTIFY_HEADER_ERR,
466 BGP_NOTIFY_HEADER_NOT_SYNC);
467 return false;
468 }
424ab01d 469
74ffbfe6
QY
470 /* Get size and type in network byte order. */
471 ringbuf_peek(pkt, BGP_MARKER_SIZE, &size, sizeof(size));
472 ringbuf_peek(pkt, BGP_MARKER_SIZE + 2, &type, sizeof(type));
473
474 size = ntohs(size);
424ab01d
QY
475
476 /* BGP type check. */
477 if (type != BGP_MSG_OPEN && type != BGP_MSG_UPDATE
478 && type != BGP_MSG_NOTIFY && type != BGP_MSG_KEEPALIVE
479 && type != BGP_MSG_ROUTE_REFRESH_NEW
480 && type != BGP_MSG_ROUTE_REFRESH_OLD
481 && type != BGP_MSG_CAPABILITY) {
3fe63c29 482 if (bgp_debug_neighbor_events(peer))
424ab01d
QY
483 zlog_debug("%s unknown message type 0x%02x", peer->host,
484 type);
485
486 bgp_notify_send_with_data(peer, BGP_NOTIFY_HEADER_ERR,
996c9314
LB
487 BGP_NOTIFY_HEADER_BAD_MESTYPE, &type,
488 1);
424ab01d
QY
489 return false;
490 }
491
3fe63c29 492 /* Minimum packet length check. */
424ab01d
QY
493 if ((size < BGP_HEADER_SIZE) || (size > BGP_MAX_PACKET_SIZE)
494 || (type == BGP_MSG_OPEN && size < BGP_MSG_OPEN_MIN_SIZE)
495 || (type == BGP_MSG_UPDATE && size < BGP_MSG_UPDATE_MIN_SIZE)
496 || (type == BGP_MSG_NOTIFY && size < BGP_MSG_NOTIFY_MIN_SIZE)
497 || (type == BGP_MSG_KEEPALIVE && size != BGP_MSG_KEEPALIVE_MIN_SIZE)
498 || (type == BGP_MSG_ROUTE_REFRESH_NEW
499 && size < BGP_MSG_ROUTE_REFRESH_MIN_SIZE)
500 || (type == BGP_MSG_ROUTE_REFRESH_OLD
501 && size < BGP_MSG_ROUTE_REFRESH_MIN_SIZE)
502 || (type == BGP_MSG_CAPABILITY
503 && size < BGP_MSG_CAPABILITY_MIN_SIZE)) {
1588f6f4 504 if (bgp_debug_neighbor_events(peer)) {
424ab01d
QY
505 zlog_debug("%s bad message length - %d for %s",
506 peer->host, size,
507 type == 128 ? "ROUTE-REFRESH"
996c9314 508 : bgp_type_str[(int)type]);
1588f6f4 509 }
424ab01d 510
3fe63c29
QY
511 uint16_t nsize = htons(size);
512
424ab01d
QY
513 bgp_notify_send_with_data(peer, BGP_NOTIFY_HEADER_ERR,
514 BGP_NOTIFY_HEADER_BAD_MESLEN,
996c9314 515 (unsigned char *)&nsize, 2);
424ab01d
QY
516 return false;
517 }
518
519 return true;
56257a44 520}