]> git.proxmox.com Git - mirror_frr.git/blob - bgpd/bgp_io.c
Merge pull request #11597 from opensourcerouting/pimd-packed-member
[mirror_frr.git] / bgpd / bgp_io.c
1 /* BGP I/O.
2 * Implements packet I/O in a pthread.
3 * Copyright (C) 2017 Cumulus Networks
4 * Quentin Young
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; see the file COPYING; if not, write to the
18 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
19 * MA 02110-1301 USA
20 */
21
22 /* clang-format off */
23 #include <zebra.h>
24 #include <pthread.h> // for pthread_mutex_unlock, pthread_mutex_lock
25 #include <sys/uio.h> // for writev
26
27 #include "frr_pthread.h"
28 #include "linklist.h" // for list_delete, list_delete_all_node, lis...
29 #include "log.h" // for zlog_debug, safe_strerror, zlog_err
30 #include "memory.h" // for MTYPE_TMP, XCALLOC, XFREE
31 #include "network.h" // for ERRNO_IO_RETRY
32 #include "stream.h" // for stream_get_endp, stream_getw_from, str...
33 #include "ringbuf.h" // for ringbuf_remain, ringbuf_peek, ringbuf_...
34 #include "thread.h" // for THREAD_OFF, THREAD_ARG, thread...
35
36 #include "bgpd/bgp_io.h"
37 #include "bgpd/bgp_debug.h" // for bgp_debug_neighbor_events, bgp_type_str
38 #include "bgpd/bgp_errors.h" // for expanded error reference information
39 #include "bgpd/bgp_fsm.h" // for BGP_EVENT_ADD, bgp_event
40 #include "bgpd/bgp_packet.h" // for bgp_notify_send_with_data, bgp_notify...
41 #include "bgpd/bgp_trace.h" // for frrtraces
42 #include "bgpd/bgpd.h" // for peer, BGP_MARKER_SIZE, bgp_master, bm
43 /* clang-format on */
44
/*
 * Forward declarations for the file-local helpers defined further down.
 */
static uint16_t bgp_write(struct peer *peer);
static uint16_t bgp_read(struct peer *peer, int *code_p);
static void bgp_process_writes(struct thread *thread);
static void bgp_process_reads(struct thread *thread);
static bool validate_header(struct peer *peer);

/*
 * Generic I/O status codes.
 *
 * These are independent bit flags; bgp_write() and bgp_read() return them
 * OR'ed into a uint16_t, with 0 meaning that no error occurred.
 */
#define BGP_IO_TRANS_ERR (1 << 0) /* EAGAIN or similar occurred */
#define BGP_IO_FATAL_ERR (1 << 1) /* some kind of fatal TCP error */
55
56 /* Thread external API ----------------------------------------------------- */
57
58 void bgp_writes_on(struct peer *peer)
59 {
60 struct frr_pthread *fpt = bgp_pth_io;
61 assert(fpt->running);
62
63 assert(peer->status != Deleted);
64 assert(peer->obuf);
65 assert(peer->ibuf);
66 assert(peer->ibuf_work);
67 assert(!peer->t_connect_check_r);
68 assert(!peer->t_connect_check_w);
69 assert(peer->fd);
70
71 thread_add_write(fpt->master, bgp_process_writes, peer, peer->fd,
72 &peer->t_write);
73 SET_FLAG(peer->thread_flags, PEER_THREAD_WRITES_ON);
74 }
75
76 void bgp_writes_off(struct peer *peer)
77 {
78 struct frr_pthread *fpt = bgp_pth_io;
79 assert(fpt->running);
80
81 thread_cancel_async(fpt->master, &peer->t_write, NULL);
82 THREAD_OFF(peer->t_generate_updgrp_packets);
83
84 UNSET_FLAG(peer->thread_flags, PEER_THREAD_WRITES_ON);
85 }
86
87 void bgp_reads_on(struct peer *peer)
88 {
89 struct frr_pthread *fpt = bgp_pth_io;
90 assert(fpt->running);
91
92 assert(peer->status != Deleted);
93 assert(peer->ibuf);
94 assert(peer->fd);
95 assert(peer->ibuf_work);
96 assert(peer->obuf);
97 assert(!peer->t_connect_check_r);
98 assert(!peer->t_connect_check_w);
99 assert(peer->fd);
100
101 thread_add_read(fpt->master, bgp_process_reads, peer, peer->fd,
102 &peer->t_read);
103
104 SET_FLAG(peer->thread_flags, PEER_THREAD_READS_ON);
105 }
106
107 void bgp_reads_off(struct peer *peer)
108 {
109 struct frr_pthread *fpt = bgp_pth_io;
110 assert(fpt->running);
111
112 thread_cancel_async(fpt->master, &peer->t_read, NULL);
113 THREAD_OFF(peer->t_process_packet);
114 THREAD_OFF(peer->t_process_packet_error);
115
116 UNSET_FLAG(peer->thread_flags, PEER_THREAD_READS_ON);
117 }
118
119 /* Thread internal functions ----------------------------------------------- */
120
121 /*
122 * Called from I/O pthread when a file descriptor has become ready for writing.
123 */
124 static void bgp_process_writes(struct thread *thread)
125 {
126 static struct peer *peer;
127 peer = THREAD_ARG(thread);
128 uint16_t status;
129 bool reschedule;
130 bool fatal = false;
131
132 if (peer->fd < 0)
133 return;
134
135 struct frr_pthread *fpt = bgp_pth_io;
136
137 frr_with_mutex(&peer->io_mtx) {
138 status = bgp_write(peer);
139 reschedule = (stream_fifo_head(peer->obuf) != NULL);
140 }
141
142 /* no problem */
143 if (CHECK_FLAG(status, BGP_IO_TRANS_ERR)) {
144 }
145
146 /* problem */
147 if (CHECK_FLAG(status, BGP_IO_FATAL_ERR)) {
148 reschedule = false;
149 fatal = true;
150 }
151
152 /* If suppress fib pending is enabled, route is advertised to peers when
153 * the status is received from the FIB. The delay is added
154 * to update group packet generate which will allow more routes to be
155 * sent in the update message
156 */
157 if (reschedule) {
158 thread_add_write(fpt->master, bgp_process_writes, peer,
159 peer->fd, &peer->t_write);
160 } else if (!fatal) {
161 BGP_UPDATE_GROUP_TIMER_ON(&peer->t_generate_updgrp_packets,
162 bgp_generate_updgrp_packets);
163 }
164 }
165
166 /*
167 * Called from I/O pthread when a file descriptor has become ready for reading,
168 * or has hung up.
169 *
170 * We read as much data as possible, process as many packets as we can and
171 * place them on peer->ibuf for secondary processing by the main thread.
172 */
173 static void bgp_process_reads(struct thread *thread)
174 {
175 /* clang-format off */
176 static struct peer *peer; // peer to read from
177 uint16_t status; // bgp_read status code
178 bool more = true; // whether we got more data
179 bool fatal = false; // whether fatal error occurred
180 bool added_pkt = false; // whether we pushed onto ->ibuf
181 int code = 0; // FSM code if error occurred
182 /* clang-format on */
183
184 peer = THREAD_ARG(thread);
185
186 if (peer->fd < 0 || bm->terminating)
187 return;
188
189 struct frr_pthread *fpt = bgp_pth_io;
190
191 frr_with_mutex(&peer->io_mtx) {
192 status = bgp_read(peer, &code);
193 }
194
195 /* error checking phase */
196 if (CHECK_FLAG(status, BGP_IO_TRANS_ERR)) {
197 /* no problem; just don't process packets */
198 more = false;
199 }
200
201 if (CHECK_FLAG(status, BGP_IO_FATAL_ERR)) {
202 /* problem; tear down session */
203 more = false;
204 fatal = true;
205
206 /* Handle the error in the main pthread, include the
207 * specific state change from 'bgp_read'.
208 */
209 thread_add_event(bm->master, bgp_packet_process_error,
210 peer, code, &peer->t_process_packet_error);
211 }
212
213 while (more) {
214 /* static buffer for transferring packets */
215 /* shorter alias to peer's input buffer */
216 struct ringbuf *ibw = peer->ibuf_work;
217 /* packet size as given by header */
218 uint16_t pktsize = 0;
219
220 /* check that we have enough data for a header */
221 if (ringbuf_remain(ibw) < BGP_HEADER_SIZE)
222 break;
223
224 /* check that header is valid */
225 if (!validate_header(peer)) {
226 fatal = true;
227 break;
228 }
229
230 /* header is valid; retrieve packet size */
231 ringbuf_peek(ibw, BGP_MARKER_SIZE, &pktsize, sizeof(pktsize));
232
233 pktsize = ntohs(pktsize);
234
235 /* if this fails we are seriously screwed */
236 assert(pktsize <= peer->max_packet_size);
237
238 /*
239 * If we have that much data, chuck it into its own
240 * stream and append to input queue for processing.
241 */
242 if (ringbuf_remain(ibw) >= pktsize) {
243 struct stream *pkt = stream_new(pktsize);
244
245 assert(STREAM_WRITEABLE(pkt) == pktsize);
246 assert(ringbuf_get(ibw, pkt->data, pktsize) == pktsize);
247 stream_set_endp(pkt, pktsize);
248
249 frrtrace(2, frr_bgp, packet_read, peer, pkt);
250 frr_with_mutex(&peer->io_mtx) {
251 stream_fifo_push(peer->ibuf, pkt);
252 }
253
254 added_pkt = true;
255 } else
256 break;
257 }
258
259 /* handle invalid header */
260 if (fatal) {
261 /* wipe buffer just in case someone screwed up */
262 ringbuf_wipe(peer->ibuf_work);
263 } else {
264 assert(ringbuf_space(peer->ibuf_work) >= peer->max_packet_size);
265
266 thread_add_read(fpt->master, bgp_process_reads, peer, peer->fd,
267 &peer->t_read);
268 if (added_pkt)
269 thread_add_event(bm->master, bgp_process_packet,
270 peer, 0, &peer->t_process_packet);
271 }
272 }
273
274 /*
275 * Flush peer output buffer.
276 *
277 * This function pops packets off of peer->obuf and writes them to peer->fd.
278 * The amount of packets written is equal to the minimum of peer->wpkt_quanta
279 * and the number of packets on the output buffer, unless an error occurs.
280 *
281 * If write() returns an error, the appropriate FSM event is generated.
282 *
283 * The return value is equal to the number of packets written
284 * (which may be zero).
285 */
286 static uint16_t bgp_write(struct peer *peer)
287 {
288 uint8_t type;
289 struct stream *s;
290 int update_last_write = 0;
291 unsigned int count;
292 uint32_t uo = 0;
293 uint16_t status = 0;
294 uint32_t wpkt_quanta_old;
295
296 int writenum = 0;
297 int num;
298 unsigned int iovsz;
299 unsigned int strmsz;
300 unsigned int total_written;
301 time_t now;
302
303 wpkt_quanta_old = atomic_load_explicit(&peer->bgp->wpkt_quanta,
304 memory_order_relaxed);
305 struct stream *ostreams[wpkt_quanta_old];
306 struct stream **streams = ostreams;
307 struct iovec iov[wpkt_quanta_old];
308
309 s = stream_fifo_head(peer->obuf);
310
311 if (!s)
312 goto done;
313
314 count = iovsz = 0;
315 while (count < wpkt_quanta_old && iovsz < array_size(iov) && s) {
316 ostreams[iovsz] = s;
317 iov[iovsz].iov_base = stream_pnt(s);
318 iov[iovsz].iov_len = STREAM_READABLE(s);
319 writenum += STREAM_READABLE(s);
320 s = s->next;
321 ++iovsz;
322 ++count;
323 }
324
325 strmsz = iovsz;
326 total_written = 0;
327
328 do {
329 num = writev(peer->fd, iov, iovsz);
330
331 if (num < 0) {
332 if (!ERRNO_IO_RETRY(errno)) {
333 BGP_EVENT_ADD(peer, TCP_fatal_error);
334 SET_FLAG(status, BGP_IO_FATAL_ERR);
335 } else {
336 SET_FLAG(status, BGP_IO_TRANS_ERR);
337 }
338
339 break;
340 } else if (num != writenum) {
341 unsigned int msg_written = 0;
342 unsigned int ic = iovsz;
343
344 for (unsigned int i = 0; i < ic; i++) {
345 size_t ss = iov[i].iov_len;
346
347 if (ss > (unsigned int) num)
348 break;
349
350 msg_written++;
351 iovsz--;
352 writenum -= ss;
353 num -= ss;
354 }
355
356 total_written += msg_written;
357
358 assert(total_written < count);
359
360 memmove(&iov, &iov[msg_written],
361 sizeof(iov[0]) * iovsz);
362 streams = &streams[msg_written];
363 stream_forward_getp(streams[0], num);
364 iov[0].iov_base = stream_pnt(streams[0]);
365 iov[0].iov_len = STREAM_READABLE(streams[0]);
366
367 writenum -= num;
368 num = 0;
369 assert(writenum > 0);
370 } else {
371 total_written = strmsz;
372 }
373
374 } while (num != writenum);
375
376 /* Handle statistics */
377 for (unsigned int i = 0; i < total_written; i++) {
378 s = stream_fifo_pop(peer->obuf);
379
380 assert(s == ostreams[i]);
381
382 /* Retrieve BGP packet type. */
383 stream_set_getp(s, BGP_MARKER_SIZE + 2);
384 type = stream_getc(s);
385
386 switch (type) {
387 case BGP_MSG_OPEN:
388 atomic_fetch_add_explicit(&peer->open_out, 1,
389 memory_order_relaxed);
390 break;
391 case BGP_MSG_UPDATE:
392 atomic_fetch_add_explicit(&peer->update_out, 1,
393 memory_order_relaxed);
394 uo++;
395 break;
396 case BGP_MSG_NOTIFY:
397 atomic_fetch_add_explicit(&peer->notify_out, 1,
398 memory_order_relaxed);
399 /* Double start timer. */
400 peer->v_start *= 2;
401
402 /* Overflow check. */
403 if (peer->v_start >= (60 * 2))
404 peer->v_start = (60 * 2);
405
406 /*
407 * Handle Graceful Restart case where the state changes
408 * to Connect instead of Idle.
409 */
410 BGP_EVENT_ADD(peer, BGP_Stop);
411 goto done;
412
413 case BGP_MSG_KEEPALIVE:
414 atomic_fetch_add_explicit(&peer->keepalive_out, 1,
415 memory_order_relaxed);
416 break;
417 case BGP_MSG_ROUTE_REFRESH_NEW:
418 case BGP_MSG_ROUTE_REFRESH_OLD:
419 atomic_fetch_add_explicit(&peer->refresh_out, 1,
420 memory_order_relaxed);
421 break;
422 case BGP_MSG_CAPABILITY:
423 atomic_fetch_add_explicit(&peer->dynamic_cap_out, 1,
424 memory_order_relaxed);
425 break;
426 }
427
428 stream_free(s);
429 ostreams[i] = NULL;
430 update_last_write = 1;
431 }
432
433 done : {
434 now = bgp_clock();
435 /*
436 * Update last_update if UPDATEs were written.
437 * Note: that these are only updated at end,
438 * not per message (i.e., per loop)
439 */
440 if (uo)
441 atomic_store_explicit(&peer->last_update, now,
442 memory_order_relaxed);
443
444 /* If we TXed any flavor of packet */
445 if (update_last_write) {
446 atomic_store_explicit(&peer->last_write, now,
447 memory_order_relaxed);
448 peer->last_sendq_ok = now;
449 }
450 }
451
452 return status;
453 }
454
455 /*
456 * Reads a chunk of data from peer->fd into peer->ibuf_work.
457 *
458 * code_p
459 * Pointer to location to store FSM event code in case of fatal error.
460 *
461 * @return status flag (see top-of-file)
462 */
463 static uint16_t bgp_read(struct peer *peer, int *code_p)
464 {
465 size_t readsize; // how many bytes we want to read
466 ssize_t nbytes; // how many bytes we actually read
467 uint16_t status = 0;
468
469 readsize =
470 MIN(ringbuf_space(peer->ibuf_work), sizeof(peer->ibuf_scratch));
471 nbytes = read(peer->fd, peer->ibuf_scratch, readsize);
472
473 /* EAGAIN or EWOULDBLOCK; come back later */
474 if (nbytes < 0 && ERRNO_IO_RETRY(errno)) {
475 SET_FLAG(status, BGP_IO_TRANS_ERR);
476 } else if (nbytes < 0) {
477 /* Fatal error; tear down session */
478 flog_err(EC_BGP_UPDATE_RCV,
479 "%s [Error] bgp_read_packet error: %s", peer->host,
480 safe_strerror(errno));
481
482 /* Handle the error in the main pthread. */
483 if (code_p)
484 *code_p = TCP_fatal_error;
485
486 SET_FLAG(status, BGP_IO_FATAL_ERR);
487
488 } else if (nbytes == 0) {
489 /* Received EOF / TCP session closed */
490 if (bgp_debug_neighbor_events(peer))
491 zlog_debug("%s [Event] BGP connection closed fd %d",
492 peer->host, peer->fd);
493
494 /* Handle the error in the main pthread. */
495 if (code_p)
496 *code_p = TCP_connection_closed;
497
498 SET_FLAG(status, BGP_IO_FATAL_ERR);
499 } else {
500 assert(ringbuf_put(peer->ibuf_work, peer->ibuf_scratch, nbytes)
501 == (size_t)nbytes);
502 }
503
504 return status;
505 }
506
507 /*
508 * Called after we have read a BGP packet header. Validates marker, message
509 * type and packet length. If any of these aren't correct, sends a notify.
510 *
511 * Assumes that there are at least BGP_HEADER_SIZE readable bytes in the input
512 * buffer.
513 */
514 static bool validate_header(struct peer *peer)
515 {
516 uint16_t size;
517 uint8_t type;
518 struct ringbuf *pkt = peer->ibuf_work;
519
520 static const uint8_t m_correct[BGP_MARKER_SIZE] = {
521 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
523 uint8_t m_rx[BGP_MARKER_SIZE] = {0x00};
524
525 if (ringbuf_peek(pkt, 0, m_rx, BGP_MARKER_SIZE) != BGP_MARKER_SIZE)
526 return false;
527
528 if (memcmp(m_correct, m_rx, BGP_MARKER_SIZE) != 0) {
529 bgp_notify_send(peer, BGP_NOTIFY_HEADER_ERR,
530 BGP_NOTIFY_HEADER_NOT_SYNC);
531 return false;
532 }
533
534 /* Get size and type in network byte order. */
535 ringbuf_peek(pkt, BGP_MARKER_SIZE, &size, sizeof(size));
536 ringbuf_peek(pkt, BGP_MARKER_SIZE + 2, &type, sizeof(type));
537
538 size = ntohs(size);
539
540 /* BGP type check. */
541 if (type != BGP_MSG_OPEN && type != BGP_MSG_UPDATE
542 && type != BGP_MSG_NOTIFY && type != BGP_MSG_KEEPALIVE
543 && type != BGP_MSG_ROUTE_REFRESH_NEW
544 && type != BGP_MSG_ROUTE_REFRESH_OLD
545 && type != BGP_MSG_CAPABILITY) {
546 if (bgp_debug_neighbor_events(peer))
547 zlog_debug("%s unknown message type 0x%02x", peer->host,
548 type);
549
550 bgp_notify_send_with_data(peer, BGP_NOTIFY_HEADER_ERR,
551 BGP_NOTIFY_HEADER_BAD_MESTYPE, &type,
552 1);
553 return false;
554 }
555
556 /* Minimum packet length check. */
557 if ((size < BGP_HEADER_SIZE) || (size > peer->max_packet_size)
558 || (type == BGP_MSG_OPEN && size < BGP_MSG_OPEN_MIN_SIZE)
559 || (type == BGP_MSG_UPDATE && size < BGP_MSG_UPDATE_MIN_SIZE)
560 || (type == BGP_MSG_NOTIFY && size < BGP_MSG_NOTIFY_MIN_SIZE)
561 || (type == BGP_MSG_KEEPALIVE && size != BGP_MSG_KEEPALIVE_MIN_SIZE)
562 || (type == BGP_MSG_ROUTE_REFRESH_NEW
563 && size < BGP_MSG_ROUTE_REFRESH_MIN_SIZE)
564 || (type == BGP_MSG_ROUTE_REFRESH_OLD
565 && size < BGP_MSG_ROUTE_REFRESH_MIN_SIZE)
566 || (type == BGP_MSG_CAPABILITY
567 && size < BGP_MSG_CAPABILITY_MIN_SIZE)) {
568 if (bgp_debug_neighbor_events(peer)) {
569 zlog_debug("%s bad message length - %d for %s",
570 peer->host, size,
571 type == 128 ? "ROUTE-REFRESH"
572 : bgp_type_str[(int)type]);
573 }
574
575 uint16_t nsize = htons(size);
576
577 bgp_notify_send_with_data(peer, BGP_NOTIFY_HEADER_ERR,
578 BGP_NOTIFY_HEADER_BAD_MESLEN,
579 (unsigned char *)&nsize, 2);
580 return false;
581 }
582
583 return true;
584 }