From aba5acdfdb347d2c21fc67d613d83d4430ca3937 Mon Sep 17 00:00:00 2001 From: "osdl.org!shemminger" Date: Thu, 15 Apr 2004 20:56:59 +0000 Subject: [PATCH] (Logical change 1.3) --- Config | 2 + Makefile | 77 + Modules/Catalogue | 7 + Modules/tcp_diag.c | 623 +++++ Patches/Catalogue | 46 + Patches/af_unix.dif | 401 ++++ Patches/pidentd-3.0.12.dif | 270 +++ Patches/rt_cache_stat.dif | 230 ++ Patches/symbol_exports.dif | 56 + README | 65 + README.decnet | 41 + README.iproute2+tc | 119 + RELNOTES | 168 ++ doc/Makefile | 57 + doc/Plan | 16 + doc/SNAPSHOT.tex | 1 + doc/api-ip6-flowlabels.tex | 429 ++++ doc/arpd.sgml | 130 + doc/do-psnup | 16 + doc/ip-cref.tex | 3316 ++++++++++++++++++++++++++ doc/ip-tunnels.tex | 469 ++++ doc/nstat.sgml | 110 + doc/preamble.tex | 26 + doc/rtstat.sgml | 52 + doc/ss.sgml | 525 ++++ etc/iproute2/rt_dsfield | 13 + etc/iproute2/rt_protos | 25 + etc/iproute2/rt_realms | 13 + etc/iproute2/rt_scopes | 11 + etc/iproute2/rt_tables | 11 + examples/SYN-DoS.rate.limit | 49 + examples/cbqinit.eth1 | 76 + examples/dhcp-client-script | 446 ++++ examples/diffserv/Edge1 | 68 + examples/diffserv/Edge2 | 87 + examples/diffserv/Edge31-ca-u32 | 170 ++ examples/diffserv/Edge31-cb-chains | 132 + examples/diffserv/Edge32-ca-u32 | 198 ++ examples/diffserv/Edge32-cb-chains | 144 ++ examples/diffserv/Edge32-cb-u32 | 145 ++ examples/diffserv/README | 98 + examples/diffserv/afcbq | 105 + examples/diffserv/ef-prio | 25 + examples/diffserv/efcbq | 31 + examples/diffserv/regression-testing | 125 + include-glibc/bits/sockunion.h | 25 + include-glibc/db.h | 10 + include-glibc/glibc-bugs.h | 20 + include-glibc/netinet/in.h | 11 + include-glibc/netinet/ip.h | 9 + include-glibc/socketbits.h | 270 +++ include/SNAPSHOT.h | 1 + include/libnetlink.h | 46 + include/ll_map.h | 12 + include/rt_names.h | 28 + include/rtm_map.h | 10 + include/tcp_diag.h | 119 + include/utils.h | 104 + ip/Makefile | 22 + ip/ifcfg | 145 ++ ip/ip.c | 167 ++ ip/ip_common.h | 20 + ip/ipaddress.c | 898 
+++++++ ip/iplink.c | 397 +++ ip/ipmaddr.c | 342 +++ ip/ipmonitor.c | 152 ++ ip/ipmroute.c | 204 ++ ip/ipneigh.c | 484 ++++ ip/iproute.c | 1410 +++++++++++ ip/iprule.c | 323 +++ ip/iptunnel.c | 581 +++++ ip/routef | 3 + ip/routel | 60 + ip/rtm_map.c | 116 + ip/rtmon.c | 177 ++ ip/rtpr | 4 + lib/Makefile | 18 + lib/dnet_ntop.c | 98 + lib/dnet_pton.c | 71 + lib/inet_ntop.c | 199 ++ lib/inet_proto.c | 70 + lib/inet_pton.c | 217 ++ lib/ipx_ntop.c | 71 + lib/ipx_pton.c | 107 + lib/libnetlink.c | 521 ++++ lib/ll_addr.c | 91 + lib/ll_map.c | 169 ++ lib/ll_proto.c | 127 + lib/ll_types.c | 128 + lib/rt_names.c | 388 +++ lib/utils.c | 528 ++++ misc/Makefile | 37 + misc/arpd.c | 846 +++++++ misc/ifstat.c | 729 ++++++ misc/netbug | 53 + misc/nstat.c | 614 +++++ misc/rtacct.c | 625 +++++ misc/rtstat.c | 172 ++ misc/ss.c | 2672 +++++++++++++++++++++ misc/ssfilter.h | 21 + misc/ssfilter.y | 274 +++ tc/Makefile | 54 + tc/README.last | 47 + tc/f_fw.c | 116 + tc/f_route.c | 175 ++ tc/f_rsvp.c | 408 ++++ tc/f_tcindex.c | 186 ++ tc/f_u32.c | 977 ++++++++ tc/m_estimator.c | 64 + tc/m_police.c | 328 +++ tc/q_atm.c | 268 +++ tc/q_cbq.c | 555 +++++ tc/q_csz.c | 61 + tc/q_dsmark.c | 186 ++ tc/q_fifo.c | 101 + tc/q_gred.c | 345 +++ tc/q_hfsc.c | 61 + tc/q_hpfq.c | 61 + tc/q_ingress.c | 76 + tc/q_prio.c | 127 + tc/q_red.c | 222 ++ tc/q_sfq.c | 115 + tc/q_tbf.c | 272 +++ tc/tc.c | 306 +++ tc/tc_cbq.c | 57 + tc/tc_cbq.h | 9 + tc/tc_class.c | 361 +++ tc/tc_common.h | 5 + tc/tc_core.c | 85 + tc/tc_core.h | 16 + tc/tc_estimator.c | 44 + tc/tc_filter.c | 388 +++ tc/tc_qdisc.c | 353 +++ tc/tc_red.c | 97 + tc/tc_red.h | 8 + tc/tc_util.c | 313 +++ tc/tc_util.h | 57 + 137 files changed, 31144 insertions(+) diff --git a/Config b/Config index e69de29b..ca6cdcea 100644 --- a/Config +++ b/Config @@ -0,0 +1,2 @@ +TC_CONFIG_DIFFSERV=n +TC_CONFIG_ATM=n diff --git a/Makefile b/Makefile index e69de29b..05063e77 100644 --- a/Makefile +++ b/Makefile @@ -0,0 +1,77 @@ +# Path to parent kernel include files 
directory +DESTDIR= +SBINDIR=/sbin +CONFDIR=/etc/iproute2 +DOCDIR=/usr/doc/iproute2 + +KERNEL_INCLUDE=/usr/src/linux/include +LIBC_INCLUDE=/usr/include + +DEFINES= -DRESOLVE_HOSTNAMES + +#options if you have a bind>=4.9.4 libresolv (or, maybe, glibc) +LDLIBS=-lresolv +ADDLIB= + +#options if you compile with libc5, and without a bind>=4.9.4 libresolv +#LDLIBS= +#ADDLIB=inet_ntop.o inet_pton.o + +#options for decnet +ADDLIB+=dnet_ntop.o dnet_pton.o + +#options for ipx +ADDLIB+=ipx_ntop.o ipx_pton.o + +ifeq ($(LIBC_INCLUDE)/socketbits.h,$(wildcard $(LIBC_INCLUDE)/socketbits.h)) + ifeq ($(LIBC_INCLUDE)/net/if_packet.h,$(wildcard $(LIBC_INCLUDE)/net/if_packet.h)) + GLIBCFIX=-I../include-glibc -include ../include-glibc/glibc-bugs.h + endif +endif +ifeq ($(LIBC_INCLUDE)/bits/socket.h,$(wildcard $(LIBC_INCLUDE)/bits/socket.h)) + GLIBCFIX=-I../include-glibc -I/usr/include/db3 -include ../include-glibc/glibc-bugs.h +endif + + +CC = gcc +CCOPTS = -D_GNU_SOURCE -O2 -Wstrict-prototypes -Wall -g +CFLAGS = $(CCOPTS) $(GLIBCFIX) -I$(KERNEL_INCLUDE) -I../include $(DEFINES) + +LDLIBS += -L../lib -lnetlink -lutil + +SUBDIRS=lib ip tc misc + +LIBNETLINK=../lib/libnetlink.a ../lib/libutil.a + +all: check-kernel + @set -e; \ + for i in $(SUBDIRS); \ + do $(MAKE) -C $$i; done + +check-kernel: +ifeq ($(KERNEL_INCLUDE),) + @echo "Please, set correct KERNEL_INCLUDE"; false +else + @set -e; \ + if [ ! 
-r $(KERNEL_INCLUDE)/linux/autoconf.h ]; then \ + echo "Please, compile the kernel first"; false; fi +endif + +install: all + install -m 0755 -d $(DESTDIR)$(SBINDIR) + install -m 0755 -d $(DESTDIR)$(CONFDIR) + install -m 0755 -d $(DESTDIR)$(DOCDIR)/examples + install -m 0755 -d $(DESTDIR)$(DOCDIR)/examples/diffserv + install -m 0644 README.iproute2+tc $(shell find examples -type f -maxdepth 1) $(DESTDIR)$(DOCDIR)/examples + install -m 0644 $(shell echo examples/diffserv/*) $(DESTDIR)$(DOCDIR)/examples/diffserv + @for i in $(SUBDIRS) doc; do $(MAKE) -C $$i install; done + @cd etc/iproute2; for i in *; do \ + if [ ! -e $(DESTDIR)$(CONFDIR)/$$i ]; then \ + echo install -m 0644 $$i $(DESTDIR)$(CONFDIR); \ + install -m 0644 $$i $(DESTDIR)$(CONFDIR); fi; done + +clean: + for i in $(SUBDIRS) doc; \ + do $(MAKE) -C $$i clean; done + +.EXPORT_ALL_VARIABLES: diff --git a/Modules/Catalogue b/Modules/Catalogue index e69de29b..e5d2d0f2 100644 --- a/Modules/Catalogue +++ b/Modules/Catalogue @@ -0,0 +1,7 @@ +File: tcp_diag.c +Status: desired for kernels < 2.4.17 + not needed for kernels >= 2.4.17 +Description: adds tcpdiag facility to kernel to accelerate ss utility + and pidentd +Side effects: none + \ No newline at end of file diff --git a/Modules/tcp_diag.c b/Modules/tcp_diag.c index e69de29b..e11e221d 100644 --- a/Modules/tcp_diag.c +++ b/Modules/tcp_diag.c @@ -0,0 +1,623 @@ +/* + * tcp_diag.c Module for monitoring TCP sockets. + * + * Version: $ + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include "tcp_diag.h" + +static struct sock *tcpnl; + + +#define TCPDIAG_PUT(skb, attrtype, attrlen) \ +({ int rtalen = RTA_LENGTH(attrlen); \ + struct rtattr *rta; \ + if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \ + rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \ + rta->rta_type = attrtype; \ + rta->rta_len = rtalen; \ + RTA_DATA(rta); }) + +static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, + int ext, u32 pid, u32 seq) +{ + struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; + struct tcpdiagmsg *r; + struct nlmsghdr *nlh; + struct tcp_info *info = NULL; + struct tcpdiag_meminfo *minfo = NULL; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); + r = NLMSG_DATA(nlh); + if (sk->state != TCP_TIME_WAIT) { + if (ext & (1<<(TCPDIAG_MEMINFO-1))) + minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo)); + if (ext & (1<<(TCPDIAG_INFO-1))) + info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); + } + r->tcpdiag_family = sk->family; + r->tcpdiag_state = sk->state; + r->tcpdiag_timer = 0; + r->tcpdiag_retrans = 0; + + r->id.tcpdiag_sport = sk->sport; + r->id.tcpdiag_dport = sk->dport; + r->id.tcpdiag_src[0] = sk->rcv_saddr; + r->id.tcpdiag_dst[0] = sk->daddr; + r->id.tcpdiag_if = sk->bound_dev_if; + *((struct sock **)&r->id.tcpdiag_cookie) = sk; + + if (r->tcpdiag_state == TCP_TIME_WAIT) { + struct tcp_tw_bucket *tw = (struct tcp_tw_bucket*)sk; + long tmo = tw->ttd - jiffies; + if (tmo < 0) + tmo = 0; + + r->tcpdiag_state = tw->substate; + r->tcpdiag_timer = 3; + r->tcpdiag_expires = (tmo*1000+HZ-1)/HZ; + r->tcpdiag_rqueue = 0; + r->tcpdiag_wqueue = 0; + r->tcpdiag_uid = 0; + r->tcpdiag_inode = 0; +#ifdef CONFIG_IPV6 + if (r->tcpdiag_family == AF_INET6) { + memcpy(r->id.tcpdiag_src, &tw->v6_rcv_saddr, 16); + memcpy(r->id.tcpdiag_dst, &tw->v6_daddr, 16); + } 
+#endif + nlh->nlmsg_len = skb->tail - b; + return skb->len; + } + +#ifdef CONFIG_IPV6 + if (r->tcpdiag_family == AF_INET6) { + memcpy(r->id.tcpdiag_src, &sk->net_pinfo.af_inet6.rcv_saddr, 16); + memcpy(r->id.tcpdiag_dst, &sk->net_pinfo.af_inet6.daddr, 16); + } +#endif + +#define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ + + if (tp->pending == TCP_TIME_RETRANS) { + r->tcpdiag_timer = 1; + r->tcpdiag_retrans = tp->retransmits; + r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout); + } else if (tp->pending == TCP_TIME_PROBE0) { + r->tcpdiag_timer = 4; + r->tcpdiag_retrans = tp->probes_out; + r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout); + } else if (timer_pending(&sk->timer)) { + r->tcpdiag_timer = 2; + r->tcpdiag_retrans = tp->probes_out; + r->tcpdiag_expires = EXPIRES_IN_MS(sk->timer.expires); + } else { + r->tcpdiag_timer = 0; + r->tcpdiag_expires = 0; + } +#undef EXPIRES_IN_MS + + r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq; + r->tcpdiag_wqueue = tp->write_seq - tp->snd_una; + r->tcpdiag_uid = sock_i_uid(sk); + r->tcpdiag_inode = sock_i_ino(sk); + + if (minfo) { + minfo->tcpdiag_rmem = atomic_read(&sk->rmem_alloc); + minfo->tcpdiag_wmem = sk->wmem_queued; + minfo->tcpdiag_fmem = sk->forward_alloc; + minfo->tcpdiag_tmem = atomic_read(&sk->wmem_alloc); + } + + if (info) { + u32 now = tcp_time_stamp; + + info->tcpi_state = sk->state; + info->tcpi_ca_state = tp->ca_state; + info->tcpi_retransmits = tp->retransmits; + info->tcpi_probes = tp->probes_out; + info->tcpi_backoff = tp->backoff; + info->tcpi_options = 0; + if (tp->tstamp_ok) + info->tcpi_options |= TCPI_OPT_TIMESTAMPS; + if (tp->sack_ok) + info->tcpi_options |= TCPI_OPT_SACK; + if (tp->wscale_ok) { + info->tcpi_options |= TCPI_OPT_WSCALE; + info->tcpi_snd_wscale = tp->snd_wscale; + info->tcpi_rcv_wscale = tp->rcv_wscale; + } else { + info->tcpi_snd_wscale = 0; + info->tcpi_rcv_wscale = 0; + } +#ifdef CONFIG_INET_ECN + if (tp->ecn_flags&TCP_ECN_OK) + info->tcpi_options |= TCPI_OPT_ECN; +#endif + + 
info->tcpi_rto = (1000000*tp->rto)/HZ; + info->tcpi_ato = (1000000*tp->ack.ato)/HZ; + info->tcpi_snd_mss = tp->mss_cache; + info->tcpi_rcv_mss = tp->ack.rcv_mss; + + info->tcpi_unacked = tp->packets_out; + info->tcpi_sacked = tp->sacked_out; + info->tcpi_lost = tp->lost_out; + info->tcpi_retrans = tp->retrans_out; + info->tcpi_fackets = tp->fackets_out; + + info->tcpi_last_data_sent = ((now - tp->lsndtime)*1000)/HZ; + info->tcpi_last_ack_sent = 0; + info->tcpi_last_data_recv = ((now - tp->ack.lrcvtime)*1000)/HZ; + info->tcpi_last_ack_recv = ((now - tp->rcv_tstamp)*1000)/HZ; + + info->tcpi_pmtu = tp->pmtu_cookie; + info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; + info->tcpi_rtt = ((1000000*tp->srtt)/HZ)>>3; + info->tcpi_rttvar = ((1000000*tp->mdev)/HZ)>>2; + info->tcpi_snd_ssthresh = tp->snd_ssthresh; + info->tcpi_snd_cwnd = tp->snd_cwnd; + info->tcpi_advmss = tp->advmss; + info->tcpi_reordering = tp->reordering; + } + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif); +#ifdef CONFIG_IPV6 +extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, + struct in6_addr *daddr, u16 dport, + int dif); +#endif + +static int tcpdiag_get_exact(struct sk_buff *in_skb, struct nlmsghdr *nlh) +{ + int err; + struct sock *sk; + struct tcpdiagreq *req = NLMSG_DATA(nlh); + struct sk_buff *rep; + + if (req->tcpdiag_family == AF_INET) { + sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport, + req->id.tcpdiag_src[0], req->id.tcpdiag_sport, + req->id.tcpdiag_if); + } +#ifdef CONFIG_IPV6 + else if (req->tcpdiag_family == AF_INET6) { + sk = tcp_v6_lookup((struct in6_addr*)req->id.tcpdiag_dst, req->id.tcpdiag_dport, + (struct in6_addr*)req->id.tcpdiag_src, req->id.tcpdiag_sport, + req->id.tcpdiag_if); + } +#endif + else { + return -EINVAL; + } + + if (sk == NULL) + return -ENOENT; + + err = -ESTALE; + if 
((req->id.tcpdiag_cookie[0] != TCPDIAG_NOCOOKIE || + req->id.tcpdiag_cookie[1] != TCPDIAG_NOCOOKIE) && + sk != *((struct sock **)&req->id.tcpdiag_cookie[0])) + goto out; + + err = -ENOMEM; + rep = alloc_skb(NLMSG_SPACE(sizeof(struct tcpdiagmsg)+ + sizeof(struct tcpdiag_meminfo)+ + sizeof(struct tcp_info)+64), GFP_KERNEL); + if (!rep) + goto out; + + if (tcpdiag_fill(rep, sk, req->tcpdiag_ext, + NETLINK_CB(in_skb).pid, + nlh->nlmsg_seq) <= 0) + BUG(); + + err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); + if (err > 0) + err = 0; + +out: + if (sk) { + if (sk->state == TCP_TIME_WAIT) + tcp_tw_put((struct tcp_tw_bucket*)sk); + else + sock_put(sk); + } + return err; +} + +int bitstring_match(u32 *a1, u32 *a2, int bits) +{ + int words = bits >> 5; + + bits &= 0x1f; + + if (words) { + if (memcmp(a1, a2, words << 2)) + return 0; + } + if (bits) { + __u32 w1, w2; + __u32 mask; + + w1 = a1[words]; + w2 = a2[words]; + + mask = htonl((0xffffffff) << (32 - bits)); + + if ((w1 ^ w2) & mask) + return 0; + } + + return 1; +} + + +int tcpdiag_bc_run(char *bc, int len, struct sock *sk) +{ + while (len > 0) { + int yes = 1; + struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc; + + switch (op->code) { + case TCPDIAG_BC_NOP: + break; + case TCPDIAG_BC_JMP: + yes = 0; + break; + case TCPDIAG_BC_S_GE: + yes = (sk->num >= op[1].no); + break; + case TCPDIAG_BC_S_LE: + yes = (sk->num <= op[1].no); + break; + case TCPDIAG_BC_D_GE: + yes = (ntohs(sk->dport) >= op[1].no); + break; + case TCPDIAG_BC_D_LE: + yes = (ntohs(sk->dport) <= op[1].no); + break; + case TCPDIAG_BC_AUTO: + yes = !(sk->userlocks&SOCK_BINDPORT_LOCK); + break; + case TCPDIAG_BC_S_COND: + case TCPDIAG_BC_D_COND: + { + struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(op+1); + u32 *addr; + + if (cond->port != -1 && + cond->port != (op->code == TCPDIAG_BC_S_COND ? 
sk->num : ntohs(sk->dport))) { + yes = 0; + break; + } + + if (cond->prefix_len == 0) + break; + + if (sk->family == AF_INET6) { + if (op->code == TCPDIAG_BC_S_COND) + addr = (u32*)&sk->net_pinfo.af_inet6.rcv_saddr; + else + addr = (u32*)&sk->net_pinfo.af_inet6.daddr; + } else { + if (op->code == TCPDIAG_BC_S_COND) + addr = &sk->rcv_saddr; + else + addr = &sk->daddr; + } + + if (bitstring_match(addr, cond->addr, cond->prefix_len)) + break; + if (sk->family == AF_INET6 && cond->family == AF_INET) { + if (addr[0] == 0 && addr[1] == 0 && + addr[2] == __constant_htonl(0xffff) && + bitstring_match(addr+3, cond->addr, cond->prefix_len)) + break; + } + yes = 0; + break; + } + } + + if (yes) { + len -= op->yes; + bc += op->yes; + } else { + len -= op->no; + bc += op->no; + } + } + return (len == 0); +} + +int valid_cc(char *bc, int len, int cc) +{ + while (len >= 0) { + struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc; + + if (cc > len) + return 0; + if (cc == len) + return 1; + if (op->yes < 4) + return 0; + len -= op->yes; + bc += op->yes; + } + return 0; +} + +int tcpdiag_bc_audit(char *bytecode, int bytecode_len) +{ + char *bc = bytecode; + int len = bytecode_len; + + while (len > 0) { + struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc; + +//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); + switch (op->code) { + case TCPDIAG_BC_AUTO: + case TCPDIAG_BC_S_COND: + case TCPDIAG_BC_D_COND: + case TCPDIAG_BC_S_GE: + case TCPDIAG_BC_S_LE: + case TCPDIAG_BC_D_GE: + case TCPDIAG_BC_D_LE: + if (op->yes < 4 || op->yes > len+4) + return -EINVAL; + case TCPDIAG_BC_JMP: + if (op->no < 4 || op->no > len+4) + return -EINVAL; + if (op->no < len && + !valid_cc(bytecode, bytecode_len, len-op->no)) + return -EINVAL; + break; + case TCPDIAG_BC_NOP: + if (op->yes < 4 || op->yes > len+4) + return -EINVAL; + break; + default: + return -EINVAL; + } + bc += op->yes; + len -= op->yes; + } + return len == 0 ? 
0 : -EINVAL; +} + + +int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int i, num; + int s_i, s_num; + struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); + struct rtattr *bc = NULL; + + if (cb->nlh->nlmsg_len > 4+NLMSG_SPACE(sizeof(struct tcpdiagreq))) + bc = (struct rtattr*)(r+1); + + s_i = cb->args[1]; + s_num = num = cb->args[2]; + + if (cb->args[0] == 0) { + if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV))) + goto skip_listen_ht; + tcp_listen_lock(); + for (i = s_i; i < TCP_LHTABLE_SIZE; i++) { + struct sock *sk = tcp_listening_hash[i]; + + if (i > s_i) + s_num = 0; + + for (sk = tcp_listening_hash[i], num = 0; + sk != NULL; + sk = sk->next, num++) { + if (num < s_num) + continue; + if (!(r->tcpdiag_states&TCPF_LISTEN) || + r->id.tcpdiag_dport) + continue; + if (r->id.tcpdiag_sport != sk->sport && r->id.tcpdiag_sport) + continue; + if (bc && !tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), sk)) + continue; + if (tcpdiag_fill(skb, sk, r->tcpdiag_ext, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq) <= 0) { + tcp_listen_unlock(); + goto done; + } + } + } + tcp_listen_unlock(); +skip_listen_ht: + cb->args[0] = 1; + s_i = num = s_num = 0; + } + + if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV))) + return skb->len; + + for (i = s_i; i < tcp_ehash_size; i++) { + struct tcp_ehash_bucket *head = &tcp_ehash[i]; + struct sock *sk; + + if (i > s_i) + s_num = 0; + + read_lock_bh(&head->lock); + + for (sk = head->chain, num = 0; + sk != NULL; + sk = sk->next, num++) { + if (num < s_num) + continue; + if (!(r->tcpdiag_states&(1<state))) + continue; + if (r->id.tcpdiag_sport != sk->sport && r->id.tcpdiag_sport) + continue; + if (r->id.tcpdiag_dport != sk->dport && r->id.tcpdiag_dport) + continue; + if (bc && !tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), sk)) + continue; + if (tcpdiag_fill(skb, sk, r->tcpdiag_ext, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq) <= 0) { + read_unlock_bh(&head->lock); + goto done; + } + } + + if 
(r->tcpdiag_states&TCPF_TIME_WAIT) { + for (sk = tcp_ehash[i+tcp_ehash_size].chain; + sk != NULL; + sk = sk->next, num++) { + if (num < s_num) + continue; + if (!(r->tcpdiag_states&(1<zapped))) + continue; + if (r->id.tcpdiag_sport != sk->sport && r->id.tcpdiag_sport) + continue; + if (r->id.tcpdiag_dport != sk->dport && r->id.tcpdiag_dport) + continue; + if (bc && !tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), sk)) + continue; + if (tcpdiag_fill(skb, sk, r->tcpdiag_ext, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq) <= 0) { + read_unlock_bh(&head->lock); + goto done; + } + } + } + read_unlock_bh(&head->lock); + } + +done: + cb->args[1] = i; + cb->args[2] = num; + return skb->len; +} + +static int tcpdiag_dump_done(struct netlink_callback *cb) +{ + return 0; +} + + +static __inline__ int +tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) + return 0; + + if (nlh->nlmsg_type != TCPDIAG_GETSOCK) + goto err_inval; + + if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len) + goto err_inval; + + if (nlh->nlmsg_flags&NLM_F_DUMP) { + if (nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(struct tcpdiagreq))) { + struct rtattr *rta = (struct rtattr*)(NLMSG_DATA(nlh) + sizeof(struct tcpdiagreq)); + if (rta->rta_type != TCPDIAG_REQ_BYTECODE || + rta->rta_len < 8 || + rta->rta_len > nlh->nlmsg_len - NLMSG_SPACE(sizeof(struct tcpdiagreq))) + goto err_inval; + if (tcpdiag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta))) + goto err_inval; + } + return netlink_dump_start(tcpnl, skb, nlh, + tcpdiag_dump, + tcpdiag_dump_done); + } else { + return tcpdiag_get_exact(skb, nlh); + } + +err_inval: + return -EINVAL; +} + + +extern __inline__ void tcpdiag_rcv_skb(struct sk_buff *skb) +{ + int err; + struct nlmsghdr * nlh; + + if (skb->len >= NLMSG_SPACE(0)) { + nlh = (struct nlmsghdr *)skb->data; + if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) + return; + err = tcpdiag_rcv_msg(skb, nlh); + if (err) + netlink_ack(skb, nlh, err); + } 
+} + +static void tcpdiag_rcv(struct sock *sk, int len) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&sk->receive_queue)) != NULL) { + tcpdiag_rcv_skb(skb); + kfree_skb(skb); + } +} + +static int __init tcpdiag_init(void) +{ + tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv); + if (tcpnl == NULL) + return -EBUSY; + return 0; +} + +static void __exit tcpdiag_exit(void) +{ + printk(KERN_INFO "Caution: unloading tcp_diag is not very well supported. Nothing to worry, but yet.\n"); + if (tcpnl) + sock_release(tcpnl->socket); +} + +module_init(tcpdiag_init); +module_exit(tcpdiag_exit); + +/* + * Local variables: + * compile-command: "gcc -DMOPS -DMODULE -D__KERNEL__ -I../include -I/usr/src/linux/include -Wall -Wstrict-prototypes -O2 -c tcp_diag.c" + * End: + */ diff --git a/Patches/Catalogue b/Patches/Catalogue index e69de29b..8e192791 100644 --- a/Patches/Catalogue +++ b/Patches/Catalogue @@ -0,0 +1,46 @@ +File: rt_cache_stat.dif +Apply to: kernel < 2.4.7 +Status: recommended for kernels < 2.4.7. + already present in >= 2.4.7 +Description: tracing efficiency of routing cache +Side effects: none + +File: pidentd-3.0.12.dif +Apply to: pident-3.0.12 tree f.e. from am redhat rpm +Status: highly recommended +Description: Patch to pidentd allowing to use tcpdiag facility and fixing + some bugs in original pident. +Side effects: none. Does not break anything not depending on kernel version, + even if tcpdiag is absent. +Advice: not related to this patch but should be said yet. + Do NOT configure pidentd to use threads! Use option + "--without-threads" when doing "configure". + pidentd is typical example of application where + threading results in nothing but collapse of performance. + Apparently author learned thread programming and decided + to apply new knowledge to the first victim. 
+ +File: symbol_exports.dif +Apply to: kernel < 2.4.17 +Status: desired for kernels < 2.4.17 + not needed for kernels >= 2.4.17 +Description: exports symbols required to load tcpdiag module + tcpdiag is builtin since 2.4.17, hence the exports + are redundant. +Side effects: none + +File: af_unix.dif +Apply to: kernel +Status: recommended +Description: implements fragmented skb for unix sockets reducing + vm pressure for datagram sockets and adds to /proc/net/unix + columns allowing to monitor recv/send memory and identify + peer of connected sockets. +Side effects: "lsof" blames something about unix sockets. + Not a big loss, lsof is not able to tell anything more + clever than "can't identify protocol" for sockets anyway. +Note: the patch affects area where one or two lines changed + several times during 2.4. It does not depend on this, + but unfortunately may reject. It applies cleanly to + 2.4.17. + diff --git a/Patches/af_unix.dif b/Patches/af_unix.dif index e69de29b..0e48a172 100644 --- a/Patches/af_unix.dif +++ b/Patches/af_unix.dif @@ -0,0 +1,401 @@ +diff -ur ../vger3-011229/linux/net/unix/af_unix.c linux/net/unix/af_unix.c +--- ../vger3-011229/linux/net/unix/af_unix.c Mon Dec 3 20:24:03 2001 ++++ linux/net/unix/af_unix.c Sat Jan 5 04:30:19 2002 +@@ -112,6 +112,7 @@ + #include + + int sysctl_unix_max_dgram_qlen = 10; ++int sysctl_unix_stream_pages = MAX_SKB_FRAGS; + + unix_socket *unix_socket_table[UNIX_HASH_SIZE+1]; + rwlock_t unix_table_lock = RW_LOCK_UNLOCKED; +@@ -1123,9 +1124,6 @@ + struct scm_cookie scm; + memset(&scm, 0, sizeof(scm)); + unix_detach_fds(&scm, skb); +- +- /* Alas, it calls VFS */ +- /* So fscking what?
fput() had been SMP-safe since the last Summer */ + scm_destroy(&scm); + sock_wfree(skb); + } +@@ -1140,6 +1138,67 @@ + scm->fp = NULL; + } + ++int datagram_copy_fromiovec(struct iovec *iov, struct sk_buff *skb, int size) ++{ ++ struct sock *sk; ++ struct sk_buff **tail, *skb1; ++ int copy = min_t(int, size, skb_tailroom(skb)); ++ ++ if (memcpy_fromiovec(skb_put(skb, copy), iov, copy)) ++ goto do_fault; ++ ++ if ((size -= copy) == 0) ++ return 0; ++ ++ sk = skb->sk; ++ skb1 = skb; ++ tail = &skb_shinfo(skb)->frag_list; ++ ++ do { ++ struct page *page; ++ int i = skb_shinfo(skb1)->nr_frags; ++ ++ if (i == MAX_SKB_FRAGS) { ++ skb1 = alloc_skb(0, sk->allocation); ++ if (skb1 == NULL) ++ goto do_oom; ++ *tail = skb1; ++ tail = &skb1->next; ++ i = 0; ++ skb->truesize += skb1->truesize; ++ atomic_add(skb1->truesize, &sk->wmem_alloc); ++ } ++ ++ page = alloc_pages(sk->allocation, 0); ++ if (page == NULL) ++ goto do_oom; ++ ++ copy = min_t(int, size, PAGE_SIZE); ++ skb_shinfo(skb1)->nr_frags=i+1; ++ skb_shinfo(skb1)->frags[i].page = page; ++ skb_shinfo(skb1)->frags[i].page_offset = 0; ++ skb_shinfo(skb1)->frags[i].size = copy; ++ ++ skb1->len += copy; ++ skb1->data_len += copy; ++ if (skb != skb1) { ++ skb->len += copy; ++ skb->data_len += copy; ++ } ++ skb->truesize += PAGE_SIZE; ++ atomic_add(PAGE_SIZE, &sk->wmem_alloc); ++ if (memcpy_fromiovec(page_address(page), iov, copy)) ++ goto do_fault; ++ } while ((size -= copy) > 0); ++ return 0; ++ ++do_oom: ++ return -ENOMEM; ++ ++do_fault: ++ return -EFAULT; ++} ++ + /* + * Send AF_UNIX data. 
+ */ +@@ -1155,6 +1214,7 @@ + unsigned hash; + struct sk_buff *skb; + long timeo; ++ int alloc; + + err = -EOPNOTSUPP; + if (msg->msg_flags&MSG_OOB) +@@ -1178,10 +1238,14 @@ + goto out; + + err = -EMSGSIZE; +- if ((unsigned)len > sk->sndbuf - 32) ++ if ((unsigned)len > sk->sndbuf) + goto out; + +- skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err); ++ alloc = len; ++ if (alloc > SKB_MAX_HEAD(0)) ++ alloc = SKB_MAX_HEAD(0); ++ ++ skb = sock_alloc_send_skb(sk, alloc, msg->msg_flags&MSG_DONTWAIT, &err); + if (skb==NULL) + goto out; + +@@ -1190,7 +1254,7 @@ + unix_attach_fds(scm, skb); + + skb->h.raw = skb->data; +- err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); ++ err = datagram_copy_fromiovec(msg->msg_iov, skb, len); + if (err) + goto out_free; + +@@ -1275,74 +1339,57 @@ + return err; + } + +- + static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len, + struct scm_cookie *scm) + { + struct sock *sk = sock->sk; + unix_socket *other = NULL; +- struct sockaddr_un *sunaddr=msg->msg_name; +- int err,size; + struct sk_buff *skb; ++ int err; + int sent=0; + + err = -EOPNOTSUPP; + if (msg->msg_flags&MSG_OOB) + goto out_err; + +- if (msg->msg_namelen) { +- err = (sk->state==TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP); ++ err = -ENOTCONN; ++ other = unix_peer_get(sk); ++ if (!other) + goto out_err; +- } else { +- sunaddr = NULL; +- err = -ENOTCONN; +- other = unix_peer_get(sk); +- if (!other) +- goto out_err; +- } + + if (sk->shutdown&SEND_SHUTDOWN) + goto pipe_err; + +- while(sent < len) +- { +- /* +- * Optimisation for the fact that under 0.01% of X messages typically +- * need breaking up. 
+- */ ++ while(sent < len) { ++ int size, alloc; + +- size=len-sent; ++ size = len-sent; + + /* Keep two messages in the pipe so it schedules better */ +- if (size > sk->sndbuf/2 - 64) +- size = sk->sndbuf/2 - 64; ++ if (size > sk->sndbuf/2) ++ size = sk->sndbuf/2; + +- if (size > SKB_MAX_ALLOC) +- size = SKB_MAX_ALLOC; +- + /* + * Grab a buffer + */ +- +- skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err); ++ alloc = size; ++ ++ if (size > SKB_MAX_HEAD(0)) { ++ alloc = SKB_MAX_HEAD(0); ++ if (size > alloc + sysctl_unix_stream_pages*PAGE_SIZE) ++ size = alloc + sysctl_unix_stream_pages*PAGE_SIZE; ++ } ++ ++ skb=sock_alloc_send_skb(sk,alloc,msg->msg_flags&MSG_DONTWAIT, &err); + + if (skb==NULL) + goto out_err; + +- /* +- * If you pass two values to the sock_alloc_send_skb +- * it tries to grab the large buffer with GFP_NOFS +- * (which can fail easily), and if it fails grab the +- * fallback size buffer which is under a page and will +- * succeed. [Alan] +- */ +- size = min_t(int, size, skb_tailroom(skb)); +- + memcpy(UNIXCREDS(skb), &scm->creds, sizeof(struct ucred)); + if (scm->fp) + unix_attach_fds(scm, skb); + +- if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) { ++ if ((err = datagram_copy_fromiovec(msg->msg_iov, skb, size)) != 0) { + kfree_skb(skb); + goto out_err; + } +@@ -1418,13 +1465,10 @@ + + scm->creds = *UNIXCREDS(skb); + +- if (!(flags & MSG_PEEK)) +- { ++ if (!(flags & MSG_PEEK)) { + if (UNIXCB(skb).fp) + unix_detach_fds(scm, skb); +- } +- else +- { ++ } else { + /* It is questionable: on PEEK we could: + - do not return fds - good, but too simple 8) + - return fds, and do not return them on read (old strategy, +@@ -1483,13 +1527,10 @@ + return timeo; + } + +- +- + static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size, + int flags, struct scm_cookie *scm) + { + struct sock *sk = sock->sk; +- struct sockaddr_un *sunaddr=msg->msg_name; + int copied = 0; + int check_creds = 0; + int 
target; +@@ -1515,21 +1556,18 @@ + + down(&sk->protinfo.af_unix.readsem); + +- do +- { ++ do { + int chunk; + struct sk_buff *skb; + + skb=skb_dequeue(&sk->receive_queue); +- if (skb==NULL) +- { ++ if (skb==NULL) { + if (copied >= target) + break; + + /* + * POSIX 1003.1g mandates this order. + */ +- + if ((err = sock_error(sk)) != 0) + break; + if (sk->shutdown & RCV_SHUTDOWN) +@@ -1551,60 +1589,44 @@ + + if (check_creds) { + /* Never glue messages from different writers */ +- if (memcmp(UNIXCREDS(skb), &scm->creds, sizeof(scm->creds)) != 0) { +- skb_queue_head(&sk->receive_queue, skb); +- break; +- } ++ if (memcmp(UNIXCREDS(skb), &scm->creds, sizeof(scm->creds)) != 0) ++ goto out_put_back; + } else { + /* Copy credentials */ + scm->creds = *UNIXCREDS(skb); + check_creds = 1; + } + +- /* Copy address just once */ +- if (sunaddr) +- { +- unix_copy_addr(msg, skb->sk); +- sunaddr = NULL; +- } ++ chunk = min_t(int, skb->len - sk->protinfo.af_unix.copied, size); ++ err = skb_copy_datagram_iovec(skb, sk->protinfo.af_unix.copied, msg->msg_iov, chunk); ++ if (err) ++ goto out_put_back; + +- chunk = min_t(unsigned int, skb->len, size); +- if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { +- skb_queue_head(&sk->receive_queue, skb); +- if (copied == 0) +- copied = -EFAULT; +- break; +- } + copied += chunk; + size -= chunk; + + /* Mark read part of skb as used */ +- if (!(flags & MSG_PEEK)) +- { +- skb_pull(skb, chunk); +- ++ if (!(flags & MSG_PEEK)) { + if (UNIXCB(skb).fp) + unix_detach_fds(scm, skb); + + /* put the skb back if we didn't use it up.. */ +- if (skb->len) +- { +- skb_queue_head(&sk->receive_queue, skb); +- break; +- } ++ if ((sk->protinfo.af_unix.copied += chunk) < skb->len) ++ goto out_put_back; ++ ++ sk->protinfo.af_unix.copied = 0; + + kfree_skb(skb); + + if (scm->fp) + break; +- } +- else +- { ++ } else { + /* It is questionable, see note in unix_dgram_recvmsg. 
+ */ + if (UNIXCB(skb).fp) + scm->fp = scm_fp_dup(UNIXCB(skb).fp); + ++out_put_back: + /* put message back and return */ + skb_queue_head(&sk->receive_queue, skb); + break; +@@ -1676,10 +1698,12 @@ + break; + } + ++ down(&sk->protinfo.af_unix.readsem); + spin_lock(&sk->receive_queue.lock); + if((skb=skb_peek(&sk->receive_queue))!=NULL) +- amount=skb->len; ++ amount=skb->len - sk->protinfo.af_unix.copied; + spin_unlock(&sk->receive_queue.lock); ++ up(&sk->protinfo.af_unix.readsem); + err = put_user(amount, (int *)arg); + break; + } +@@ -1734,7 +1758,7 @@ + int i; + unix_socket *s; + +- len+= sprintf(buffer,"Num RefCount Protocol Flags Type St " ++ len+= sprintf(buffer,"Peer RcvQueue WMem Flags Type St " + "Inode Path\n"); + + read_lock(&unix_table_lock); +@@ -1742,10 +1766,10 @@ + { + unix_state_rlock(s); + +- len+=sprintf(buffer+len,"%p: %08X %08X %08X %04X %02X %5ld", +- s, +- atomic_read(&s->refcnt), +- 0, ++ len+=sprintf(buffer+len,"%08lX: %08X %08X %08X %04X %02X %5ld", ++ unix_peer(s) ? sock_i_ino(unix_peer(s)) : 0, ++ skb_queue_len(&s->receive_queue), ++ atomic_read(&s->wmem_alloc), + s->state == TCP_LISTEN ? __SO_ACCEPTCON : 0, + s->type, + s->socket ? 
+diff -ur ../vger3-011229/linux/net/unix/sysctl_net_unix.c linux/net/unix/sysctl_net_unix.c +--- ../vger3-011229/linux/net/unix/sysctl_net_unix.c Tue Jan 30 21:20:16 2001 ++++ linux/net/unix/sysctl_net_unix.c Sat Jan 5 04:10:58 2002 +@@ -13,10 +13,14 @@ + #include + + extern int sysctl_unix_max_dgram_qlen; ++extern int sysctl_unix_stream_pages; + + ctl_table unix_table[] = { + {NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen", + &sysctl_unix_max_dgram_qlen, sizeof(int), 0600, NULL, ++ &proc_dointvec }, ++ {NET_UNIX_STREAM_PAGES, "stream_pages", ++ &sysctl_unix_stream_pages, sizeof(int), 0600, NULL, + &proc_dointvec }, + {0} + }; diff --git a/Patches/pidentd-3.0.12.dif b/Patches/pidentd-3.0.12.dif index e69de29b..6e54e936 100644 --- a/Patches/pidentd-3.0.12.dif +++ b/Patches/pidentd-3.0.12.dif @@ -0,0 +1,270 @@ +diff -ur ../pidentd-3.0.12-orig/src/k_linux.c ./src/k_linux.c +--- ../pidentd-3.0.12-orig/src/k_linux.c Sat Jan 12 00:44:05 2002 ++++ ./src/k_linux.c Sat Nov 3 07:51:28 2001 +@@ -26,12 +26,65 @@ + + #include "pidentd.h" + ++#define NETLINK_TCPDIAG 4 ++#define TCPDIAG_GETSOCK 18 ++ ++#include ++#include ++ ++/* Socket identity */ ++struct tcpdiag_sockid ++{ ++ __u16 tcpdiag_sport; ++ __u16 tcpdiag_dport; ++ __u32 tcpdiag_src[4]; ++ __u32 tcpdiag_dst[4]; ++ __u32 tcpdiag_if; ++ __u32 tcpdiag_cookie[2]; ++#define TCPDIAG_NOCOOKIE (~0U) ++}; ++ ++/* Request structure */ ++ ++struct tcpdiagreq ++{ ++ __u8 tcpdiag_family; /* Family of addresses. 
*/ ++ __u8 tcpdiag_src_len; ++ __u8 tcpdiag_dst_len; ++ __u8 tcpdiag_ext; /* Query extended information */ ++ ++ struct tcpdiag_sockid id; ++ ++ __u32 tcpdiag_states; /* States to dump */ ++ __u32 tcpdiag_dbs; /* Tables to dump (NI) */ ++}; ++ ++struct tcpdiagmsg ++{ ++ __u8 tcpdiag_family; ++ __u8 tcpdiag_state; ++ __u8 tcpdiag_timer; ++ __u8 tcpdiag_retrans; ++ ++ struct tcpdiag_sockid id; ++ ++ __u32 tcpdiag_expires; ++ __u32 tcpdiag_rqueue; ++ __u32 tcpdiag_wqueue; ++ __u32 tcpdiag_uid; ++ __u32 tcpdiag_inode; ++}; ++ ++ ++int tcpdiag_fd = -1; ++ + /* + ** Make sure we are running on a supported OS version + */ + int + ka_init(void) + { ++ tcpdiag_fd = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_TCPDIAG); + return 0; /* We always succeed */ + } + +@@ -56,6 +109,144 @@ + } + + ++ ++int k_lookup_tcpdiag(struct kernel *kp) ++{ ++ struct sockaddr_nl nladdr; ++ struct { ++ struct nlmsghdr nlh; ++ struct tcpdiagreq r; ++ } req; ++ struct msghdr msg; ++ char buf[8192]; ++ struct iovec iov[1]; ++ struct tcpdiagmsg *r; ++ static unsigned seqno = 123456; ++ ++ memset(&nladdr, 0, sizeof(nladdr)); ++ nladdr.nl_family = AF_NETLINK; ++ ++ req.nlh.nlmsg_len = sizeof(req); ++ req.nlh.nlmsg_type = TCPDIAG_GETSOCK; ++ req.nlh.nlmsg_flags = NLM_F_REQUEST; ++ req.nlh.nlmsg_pid = 0; ++ req.nlh.nlmsg_seq = ++seqno; ++ memset(&req.r, 0, sizeof(req.r)); ++ req.r.tcpdiag_family = AF_INET; ++ req.r.tcpdiag_states = ~0; ++ ++ req.r.id.tcpdiag_dport = kp->remote.sin_port; ++ req.r.id.tcpdiag_sport = kp->local.sin_port; ++ req.r.id.tcpdiag_dst[0] = kp->remote.sin_addr.s_addr; ++ req.r.id.tcpdiag_src[0] = kp->local.sin_addr.s_addr; ++ req.r.id.tcpdiag_cookie[0] = TCPDIAG_NOCOOKIE; ++ req.r.id.tcpdiag_cookie[1] = TCPDIAG_NOCOOKIE; ++ kp->ruid = NO_UID; ++ ++ iov[0] = (struct iovec){ &req, sizeof(req) }; ++ ++ msg = (struct msghdr) { ++ (void*)&nladdr, sizeof(nladdr), ++ iov, 1, ++ NULL, 0, ++ 0 ++ }; ++ ++ if (sendmsg(tcpdiag_fd, &msg, 0) < 0) { ++ if (errno == ECONNREFUSED) { ++ 
close(tcpdiag_fd); ++ tcpdiag_fd = -1; ++ return 0; ++ } ++ syslog(LOG_ERR, "system error on tcpdiag sendmsg: %m"); ++ return -1; ++ } ++ ++ iov[0] = (struct iovec){ buf, sizeof(buf) }; ++ ++ while (1) { ++ int status; ++ struct nlmsghdr *h; ++ ++ msg = (struct msghdr) { ++ (void*)&nladdr, sizeof(nladdr), ++ iov, 1, ++ NULL, 0, ++ 0 ++ }; ++ ++ status = recvmsg(tcpdiag_fd, &msg, 0); ++ ++ if (status < 0) { ++ if (errno == EINTR || errno == EAGAIN) ++ continue; ++ return -1; ++ } ++ if (status == 0) { ++ return -1; ++ } ++ ++ h = (struct nlmsghdr*)buf; ++ while (NLMSG_OK(h, status)) { ++ int err; ++ ++ if (/*h->nlmsg_pid != rth->local.nl_pid ||*/ ++ h->nlmsg_seq != seqno) ++ goto skip_it; ++ ++ if (h->nlmsg_type == NLMSG_DONE) ++ return -1; ++ if (h->nlmsg_type == NLMSG_ERROR) { ++ struct nlmsgerr *err = (struct nlmsgerr*)NLMSG_DATA(h); ++ if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr))) { ++ return -1; ++ } else { ++ errno = -err->error; ++ if (errno == ECONNREFUSED) { ++ close(tcpdiag_fd); ++ tcpdiag_fd = -1; ++ return 0; ++ } ++ if (errno != ENOENT) ++ syslog(LOG_ERR, "tcpdiag answers: %m"); ++ } ++ return -1; ++ } ++ ++ r = NLMSG_DATA(h); ++ ++ /* Lookup _may_ return listening socket, if no ++ * better matches are found. */ ++ if (r->id.tcpdiag_dport == kp->remote.sin_port && ++ r->id.tcpdiag_dst[0] == kp->remote.sin_addr.s_addr) { ++ kp->ruid = r->tcpdiag_uid; ++ if (!r->tcpdiag_inode && !r->tcpdiag_uid) { ++ /* _NEVER_ return "root" for closed ++ * sockets. Otherwise people think ++ * that it is sysadmin who abuses their ++ * poor ircd. 
:-) */ ++ syslog(LOG_NOTICE, ++ "Req for stale socket(%d) %d from %x/%d", ++ r->tcpdiag_state, ntohs(r->id.tcpdiag_sport), ++ r->id.tcpdiag_dst[0], ntohs(r->id.tcpdiag_dport)); ++ return -1; ++ } ++ return 1; ++ } ++ ++ return -1; ++ ++skip_it: ++ h = NLMSG_NEXT(h, status); ++ } ++ if ((msg.msg_flags & MSG_TRUNC) || status) { ++ syslog(LOG_ERR, "truncated tcp_diag message"); ++ return -1; ++ } ++ } ++} ++ ++ + int + ka_lookup(void *vp, struct kernel *kp) + { +@@ -64,16 +255,23 @@ + long r_laddr, r_raddr, myladdr, myraddr; + int r_lport, r_rport, mylport, myrport; + int euid; +- +- ++ ++ if (tcpdiag_fd >= 0) { ++ int res; ++ if ((res = k_lookup_tcpdiag(kp)) != 0) ++ return res; ++ syslog(LOG_ERR, "tcp_diag is not loaded, fallback to proc"); ++ } ++ ++ + r_rport = ntohs(kp->remote.sin_port); + r_lport = ntohs(kp->local.sin_port); + r_raddr = kp->remote.sin_addr.s_addr; + r_laddr = kp->local.sin_addr.s_addr; ++ kp->ruid = NO_UID; + + fp = (FILE *) vp; + +- kp->ruid = NO_UID; + rewind(fp); + + /* eat header */ +@@ -82,13 +280,26 @@ + + while (fgets(buf, sizeof(buf)-1, fp) != NULL) + { +- if (sscanf(buf, "%*d: %lx:%x %lx:%x %*x %*x:%*x %*x:%*x %*x %d %*d %*d", +- &myladdr, &mylport, &myraddr, &myrport, &euid) == 5) ++ int state, ino; ++ if (sscanf(buf, "%*d: %x:%x %x:%x %x %*x:%*x %*x:%*x %*x %d %*d %u", ++ &myladdr, &mylport, &myraddr, &myrport, ++ &state, &euid, &ino) == 7) + { + if (myladdr == r_laddr && mylport == r_lport && + myraddr == r_raddr && myrport == r_rport) + { + kp->euid = euid; ++ if (ino == 0 && euid == 0) ++ { ++ /* _NEVER_ return "root" for closed ++ * sockets. Otherwise people think ++ * that it is sysadmin who abuses their ++ * poor ircd. 
:-) */ ++ syslog(LOG_NOTICE, ++ "Req for stale socket(%d) %d from %x/%d", ++ state, r_rport, r_raddr, r_lport); ++ return -1; ++ } + return 1; + } + } diff --git a/Patches/rt_cache_stat.dif b/Patches/rt_cache_stat.dif index e69de29b..a03ddf22 100644 --- a/Patches/rt_cache_stat.dif +++ b/Patches/rt_cache_stat.dif @@ -0,0 +1,230 @@ +--- linux/include/net/route.h.orig Tue Apr 17 07:25:48 2001 ++++ linux/include/net/route.h Tue Jul 10 23:35:18 2001 +@@ -14,6 +14,7 @@ + * Alan Cox : Support for TCP parameters. + * Alexey Kuznetsov: Major changes for new routing code. + * Mike McLagan : Routing by source ++ * Robert Olsson : Added rt_cache statistics + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License +@@ -90,6 +91,20 @@ + __u32 o_packets; + __u32 i_bytes; + __u32 i_packets; ++}; ++ ++struct rt_cache_stat ++{ ++ unsigned in_hit; ++ unsigned in_slow_tot; ++ unsigned in_slow_mc; ++ unsigned in_no_route; ++ unsigned in_brd; ++ unsigned in_martian_dst; ++ unsigned in_martian_src; ++ unsigned out_hit; ++ unsigned out_slow_tot; ++ unsigned out_slow_mc; + }; + + extern struct ip_rt_acct *ip_rt_acct; +--- linux/net/ipv4/route.c.orig Wed Mar 28 22:01:15 2001 ++++ linux/net/ipv4/route.c Tue Jul 10 23:27:51 2001 +@@ -52,6 +52,7 @@ + * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow. + * Vladimir V. Ivanov : IP rule info (flowid) is really useful. 
+ * Marc Boucher : routing by fwmark ++ * Robert Olsson : Added rt_cache statistics + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License +@@ -201,6 +202,8 @@ + static unsigned rt_hash_mask; + static int rt_hash_log; + ++struct rt_cache_stat rt_cache_stat[NR_CPUS]; ++ + static int rt_intern_hash(unsigned hash, struct rtable * rth, struct rtable ** res); + + static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos) +@@ -270,6 +273,44 @@ + len = length; + return len; + } ++ ++ ++#ifdef CONFIG_PROC_FS ++static int rt_cache_stat_get_info(char *buffer, char **start, off_t offset, int length) ++{ ++ int i, lcpu; ++ int len=0; ++ unsigned int dst_entries = atomic_read(&ipv4_dst_ops.entries); ++ ++ for (lcpu=0; lcpu length) ++ len = length; ++ if (len < 0) ++ len = 0; ++ ++ *start = buffer + offset; ++ return len; ++} ++#endif + + static __inline__ void rt_free(struct rtable *rt) + { +@@ -1163,6 +1204,8 @@ + u32 spec_dst; + struct in_device *in_dev = in_dev_get(dev); + u32 itag = 0; ++ int cpu = smp_processor_id(); ++ + + /* Primary sanity checks. */ + +@@ -1221,6 +1264,7 @@ + if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) + rth->u.dst.input = ip_mr_input; + #endif ++ rt_cache_stat[cpu].in_slow_mc++; + + in_dev_put(in_dev); + hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos); +@@ -1259,6 +1303,7 @@ + u32 spec_dst; + int err = -EINVAL; + int free_res = 0; ++ int cpu = smp_processor_id(); + + /* + * IP on this device is disabled. +@@ -1308,6 +1353,8 @@ + } + free_res = 1; + ++ rt_cache_stat[cpu].in_slow_tot++; ++ + #ifdef CONFIG_IP_ROUTE_NAT + /* Policy is applied before mapping destination, + but rerouting after map should be made with old source. 
+@@ -1455,6 +1502,7 @@ + } + flags |= RTCF_BROADCAST; + res.type = RTN_BROADCAST; ++ rt_cache_stat[cpu].in_brd++; + + local_input: + rth = dst_alloc(&ipv4_dst_ops); +@@ -1498,6 +1546,7 @@ + goto intern; + + no_route: ++ rt_cache_stat[cpu].in_no_route++; + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + res.type = RTN_UNREACHABLE; + goto local_input; +@@ -1506,6 +1555,7 @@ + * Do not cache martian addresses: they should be logged (RFC1812) + */ + martian_destination: ++ rt_cache_stat[cpu].in_martian_dst++; + #ifdef CONFIG_IP_ROUTE_VERBOSE + if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) + printk(KERN_WARNING "martian destination %u.%u.%u.%u from %u.%u.%u.%u, dev %s\n", +@@ -1520,6 +1570,8 @@ + goto done; + + martian_source: ++ ++ rt_cache_stat[cpu].in_martian_src++; + #ifdef CONFIG_IP_ROUTE_VERBOSE + if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) { + /* +@@ -1550,6 +1602,7 @@ + struct rtable * rth; + unsigned hash; + int iif = dev->ifindex; ++ int cpu = smp_processor_id(); + + tos &= IPTOS_RT_MASK; + hash = rt_hash_code(daddr, saddr^(iif<<5), tos); +@@ -1567,6 +1620,7 @@ + rth->u.dst.lastuse = jiffies; + dst_hold(&rth->u.dst); + rth->u.dst.__use++; ++ rt_cache_stat[cpu].in_hit++; + read_unlock(&rt_hash_table[hash].lock); + skb->dst = (struct dst_entry*)rth; + return 0; +@@ -1621,6 +1675,7 @@ + int free_res = 0; + int err; + u32 tos; ++ int cpu = smp_processor_id(); + + tos = oldkey->tos & (IPTOS_RT_MASK|RTO_ONLINK); + key.dst = oldkey->dst; +@@ -1847,14 +1902,18 @@ + + rth->u.dst.output=ip_output; + ++ rt_cache_stat[cpu].out_slow_tot++; ++ + if (flags&RTCF_LOCAL) { + rth->u.dst.input = ip_local_deliver; + rth->rt_spec_dst = key.dst; + } + if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) { + rth->rt_spec_dst = key.src; +- if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK)) ++ if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK)) { + rth->u.dst.output = ip_mc_output; ++ rt_cache_stat[cpu].out_slow_mc++; ++ } + #ifdef CONFIG_IP_MROUTE + if (res.type 
== RTN_MULTICAST) { + struct in_device *in_dev = in_dev_get(dev_out); +@@ -1894,6 +1953,7 @@ + { + unsigned hash; + struct rtable *rth; ++ int cpu = smp_processor_id(); + + hash = rt_hash_code(key->dst, key->src^(key->oif<<5), key->tos); + +@@ -1912,6 +1972,7 @@ + rth->u.dst.lastuse = jiffies; + dst_hold(&rth->u.dst); + rth->u.dst.__use++; ++ rt_cache_stat[cpu].out_hit++; + read_unlock_bh(&rt_hash_table[hash].lock); + *rp = rth; + return 0; +@@ -2339,6 +2400,7 @@ + add_timer(&rt_periodic_timer); + + proc_net_create ("rt_cache", 0, rt_cache_get_info); ++ proc_net_create ("rt_cache_stat", 0, rt_cache_stat_get_info); + #ifdef CONFIG_NET_CLS_ROUTE + create_proc_read_entry("net/rt_acct", 0, 0, ip_rt_acct_read, NULL); + #endif diff --git a/Patches/symbol_exports.dif b/Patches/symbol_exports.dif index e69de29b..519ea7cd 100644 --- a/Patches/symbol_exports.dif +++ b/Patches/symbol_exports.dif @@ -0,0 +1,56 @@ +diff -ur ../vger3-010830/linux/net/ipv6/tcp_ipv6.c linux/net/ipv6/tcp_ipv6.c +--- ../vger3-010830/linux/net/ipv6/tcp_ipv6.c Wed Jun 13 21:14:05 2001 ++++ linux/net/ipv6/tcp_ipv6.c Fri Oct 12 06:59:07 2001 +@@ -339,13 +339,18 @@ + return tcp_v6_lookup_listener(daddr, hnum, dif); + } + +-#define tcp_v6_lookup(sa, sp, da, dp, dif) \ +-({ struct sock *___sk; \ +- local_bh_disable(); \ +- ___sk = __tcp_v6_lookup((sa),(sp),(da),ntohs(dp),(dif)); \ +- local_bh_enable(); \ +- ___sk; \ +-}) ++__inline__ struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, ++ struct in6_addr *daddr, u16 dport, ++ int dif) ++{ ++ struct sock *sk; ++ ++ local_bh_disable(); ++ sk = __tcp_v6_lookup(saddr, sport, daddr, ntohs(dport), dif); ++ local_bh_enable(); ++ ++ return sk; ++} + + + /* +diff -ur ../vger3-010830/linux/net/netsyms.c linux/net/netsyms.c +--- ../vger3-010830/linux/net/netsyms.c Sun Aug 19 22:01:45 2001 ++++ linux/net/netsyms.c Fri Oct 12 07:59:17 2001 +@@ -72,6 +72,11 @@ + + extern int netdev_finish_unregister(struct net_device *dev); + ++extern struct sock 
*tcp_v6_lookup(struct in6_addr *saddr, u16 sport, ++ struct in6_addr *daddr, u16 dport, ++ int dif); ++extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif); ++ + #include + + #ifdef CONFIG_IPX_MODULE +@@ -284,7 +289,11 @@ + EXPORT_SYMBOL(ndisc_mc_map); + EXPORT_SYMBOL(register_inet6addr_notifier); + EXPORT_SYMBOL(unregister_inet6addr_notifier); ++EXPORT_SYMBOL(tcp_v6_lookup); + #endif ++EXPORT_SYMBOL(tcp_v4_lookup); ++EXPORT_SYMBOL(tcp_timewait_cachep); ++EXPORT_SYMBOL(tcp_hashinfo); + #if defined (CONFIG_IPV6_MODULE) || defined (CONFIG_KHTTPD) || defined (CONFIG_KHTTPD_MODULE) + /* inet functions common to v4 and v6 */ + EXPORT_SYMBOL(inet_release); diff --git a/README b/README index e69de29b..53a45c2e 100644 --- a/README +++ b/README @@ -0,0 +1,65 @@ +Primary FTP site is: + + ftp://ftp.inr.ac.ru/ip-routing/ + +Mirrors are: + + ftp://linux.wauug.org/pub/net + ftp://ftp.nc.ras.ru/pub/mirrors/ftp.inr.ac.ru/ip-routing/ + ftp://ftp.gts.cz/MIRRORS/ftp.inr.ac.ru/ + ftp://ftp.funet.fi/pub/mirrors/ftp.inr.ac.ru/ip-routing/ (STM1 to USA) + ftp://sunsite.icm.edu.pl/pub/Linux/iproute/ + ftp://ftp.sunet.se/pub/Linux/ip-routing/ + ftp://ftp.nvg.ntnu.no/pub/linux/ip-routing/ + ftp://ftp.crc.ca/pub/systems/linux/ip-routing/ + ftp://ftp.proxad.net/mirrors/ftp.inr.ac.ru/ip-routing/ + ftp://donlug.dn.ua/pub/mirrors/ip-routing/ + ftp://omni.rk.tusur.ru/mirrors/ftp.inr.ac.ru/ip-routing/ + ftp://ftp.src.uchicago.edu/pub/linux/ip-routing/ + http://www.asit.ro/ip-routing/ + ftp://ftp.infoscience.co.jp/pub/linux/ip-routing/ (Japan) + ftp://ftp.sucs.swan.ac.uk/pub/mirrors/ftp.inr.ac.ru/ip-routing + http://mirror.schell.de/ftp.inr.ac.ru/ip-routing/ (Germany) + ftp://ftp.gin.cz/MIRRORS/ftp.inr.ac.ru/ip-routing + ftp://mirror.aarnet.edu.au/pub/ip-routing/ (Australia) + http://mirror.aarnet.edu.au/pub/ip-routing/ (Australia) + +RPMs are available at: + ftp://omni.rk.tusur.ru/Tango/ + ftp://ftp4.dgtu.donetsk.ua/pub/BlackCat/6.0/contrib/SRPMS/i[35]86/ + + + 
+How to compile this. +-------------------- + + +1. Look at start of Makefile and set correct values for: + +KERNEL_INCLUDE should point to correct linux kernel include directory. +Default (/usr/src/linux/include) is right as a rule. + +ADDLIB should contain inet_* functions, if your libc contains +obsolete resolver library (<4.9.4) and you have no correct libresolv. +ADDLIB should also contain dnet_* functions if you don't have a +libdnet with support for them. If your libdnet does have support, +then comment out that line and uncomment the line to add -ldnet to +LDLIBS. + +LDLIBS should be empty, if you have no libresolv. + + +2. make + +Utilities "ip" and "rtmon" are in ip/ directory now, +"tc" is in tc/. That's all. + +3. To make documentation, cd to doc/ directory , then + look at start of Makefile and set correct values for + PAGESIZE=a4 , ie: a4 , letter ... (string) + PAGESPERPAGE=2 , ie: 1 , 2 ... (numeric) + and make there. It assumes, that latex, dvips and psnup + are in your path. + +Alexey Kuznetsov +kuznet@ms2.inr.ac.ru diff --git a/README.decnet b/README.decnet index e69de29b..4d7453aa 100644 --- a/README.decnet +++ b/README.decnet @@ -0,0 +1,41 @@ + +Here are a few quick points about DECnet support... + + o No name resolution is available as yet, all addresses must be + entered numerically. + + o The neighbour cache may well list every entry as having the address + 0.170. This is due to a problem that I need to sort out kernel side. + It is harmless (but don't try and use neigh add yet) just look in + /proc/net/decnet_neigh to see the real addresses for now. + + o The rtnetlink support in the kernel is rather experimental, expect a + few odd things to happen for the next few DECnet kernel releases. + + o Whilst you can use ip addr add to add more than one DECnet address to an + interface, don't expect addresses which are not the same as the + kernel's node address to work properly. i.e. 
You will break the DECnet + protocol if you do add anything other than the automatically generated + interface addresses to ethernet cards. This option is there for future + link layer support, where the device will have to be configed for + DECnet explicitly. + + o The DECnet support is currently self contained. You do not need the + libdnet library to use it. In fact until I've sent the dnet_pton and + dnet_ntop functions to Patrick to add, you can't use libdnet. + + o If you are not using the very latest 2.3.xx series kernels, don't + try and list DECnet routes if you've got IPv6 compiled into the + kernel. It will oops. + + o My main reason for writing the DECnet support for iproute2 was to + check out the DECnet routing code, so the route get and + route show cache commands are likely to be the most debugged out of + all of them. + + o If you find bugs in the DECnet support, please send them to me in the + first instance, and then I'll send Alexey a patch to fix it. IPv4/6 + bugs should be sent to Alexey as before. + +Steve Whitehouse + diff --git a/README.iproute2+tc b/README.iproute2+tc index e69de29b..edd79c0e 100644 --- a/README.iproute2+tc +++ b/README.iproute2+tc @@ -0,0 +1,119 @@ +iproute2+tc* + +It's the first release of Linux traffic control engine. + + +NOTES. +* csz scheduler is inoperational at the moment, and probably + never will be repaired but replaced with h-pfq scheduler. +* To use "fw" classifier you will need ipfwchains patch. +* No manual available. Ask me, if you have problems (only try to guess + answer yourself at first 8)). + + +Micro-manual how to start it the first time +------------------------------------------- + +A. Attach CBQ to eth1: + +tc qdisc add dev eth1 root handle 1: cbq bandwidth 10Mbit allot 1514 cell 8 \ +avpkt 1000 mpu 64 + +B. Add root class: + +tc class add dev eth1 parent 1:0 classid 1:1 cbq bandwidth 10Mbit rate 10Mbit \ +allot 1514 cell 8 weight 1Mbit prio 8 maxburst 20 avpkt 1000 + +C. 
Add default interactive class: + +tc class add dev eth1 parent 1:1 classid 1:2 cbq bandwidth 10Mbit rate 1Mbit \ +allot 1514 cell 8 weight 100Kbit prio 3 maxburst 20 avpkt 1000 split 1:0 \ +defmap c0 + +D. Add default class: + +tc class add dev eth1 parent 1:1 classid 1:3 cbq bandwidth 10Mbit rate 8Mbit \ +allot 1514 cell 8 weight 800Kbit prio 7 maxburst 20 avpkt 1000 split 1:0 \ +defmap 3f + +etc. etc. etc. Well, it is enough to start 8) The rest can be guessed 8) +Look also at more elaborated example, ready to start rsvpd, +in rsvp/cbqinit.eth1. + + +Terminology and advice about setting CBQ parameters may be found in Sally Floyd +papers. + + +Pairs X:Y are class handles, X:0 are qdisc handles. +weight should be proportional to rate for leaf classes +(I chose it ten times less, but it is not necessary) + +defmap is bitmap of logical priorities served by this class. + +E. Another qdiscs are simpler. F.e. let's join TBF on class 1:2 + +tc qdisc add dev eth1 parent 1:2 tbf rate 64Kbit buffer 5Kb/8 limit 10Kb + +F. Look at all that we created: + +tc qdisc ls dev eth1 +tc class ls dev eth1 + +G. Install "route" classifier on root of cbq and map destination from realm +1 to class 1:2 + +tc filter add dev eth1 parent 1:0 protocol ip prio 100 route to 1 classid 1:2 + +H. Assign routes to 10.11.12.0/24 to realm 1 + +ip route add 10.11.12.0/24 dev eth1 via whatever realm 1 + +etc. The same thing can be made with rules. +I still did not test ipchains, but they should work too. + +Setup of rsvp and u32 classifiers is more hairy. +If you read RSVP specs, you will understand how rsvp classifier +works easily. What's about u32... That's example: + + + +#! 
/bin/sh + +TC=/home/root/tc + +# Setup classifier root on eth1 root (it is cbq) +$TC filter add dev eth1 parent 1:0 prio 5 protocol ip u32 + +# Create hash table of 256 slots with ID 1: +$TC filter add dev eth1 parent 1:0 prio 5 handle 1: u32 divisor 256 + +# Add to 6th slot of hash table rule to select tcp/telnet to 193.233.7.75 +# direct it to class 1:4 and prescribe to fall to best effort, +# if traffic violate TBF (32kbit,5K) +$TC filter add dev eth1 parent 1:0 prio 5 u32 ht 1:6: \ + match ip dst 193.233.7.75 \ + match tcp dst 0x17 0xffff \ + flowid 1:4 \ + police rate 32kbit buffer 5kb/8 mpu 64 mtu 1514 index 1 + +# Add to 1th slot of hash table rule to select icmp to 193.233.7.75 +# direct it to class 1:4 and prescribe to fall to best effort, +# if traffic violate TBF (10kbit,5K) +$TC filter add dev eth1 parent 1:0 prio 5 u32 ht 1:: \ + sample ip protocol 1 0xff \ + match ip dst 193.233.7.75 \ + flowid 1:4 \ + police rate 10kbit buffer 5kb/8 mpu 64 mtu 1514 index 2 + +# Lookup hash table, if it is not fragmented frame +# Use protocol as hash key +$TC filter add dev eth1 parent 1:0 prio 5 handle ::1 u32 ht 800:: \ + match ip nofrag \ + offset mask 0x0F00 shift 6 \ + hashkey mask 0x00ff0000 at 8 \ + link 1: + + +Alexey Kuznetsov +kuznet@ms2.inr.ac.ru diff --git a/RELNOTES b/RELNOTES index e69de29b..17f00111 100644 --- a/RELNOTES +++ b/RELNOTES @@ -0,0 +1,168 @@ +[020116] +! 1. Compile with rh-7.2 +! 2. What the hell some people blame on socklen_t defined in unistd.h? Check. + * Kim Woelders , various useful fixups: compilation + with old kernels, cross-compiling, "all" == "any" in prefix spec. + * Collected from my disk, cleaned and packed to directory iproute2/misc/ + several utilities: ss, nstat, ifstat, rtacct, arpd and module tcp_diag. + Writing some docs. me. + * prepared patchlet for pidentd to use tcp_diag. + * David Miller: 64bit (and even worse 64bit kernel/32 bit user :-) fixes + to above. tcp_diag is merged to main tree. + * Alexandr D. 
Kanevskiy : various flaws in ss + * Alexandr D. Kanevskiy : oops, more aggressive caching + of names opened old bugs: ip started to print garbage in some places. + * Robert Olsson, rt_cache_stat. Renamed to rtstat. + * An old bug in "ip maddr ls": redundant empty lines in output. + Seeing this crap for ages but lucky match of desire/ability to repair + and a huff about this happened only today. :-) + * "Mr. James W. Laferriere" + doc: option to produce ps output for non-a4 and not only 2 pages/sheet. + * Jamal's patch for ingress qdisc. + * Bernd Eckenfels : deleted orphaned bogus #include + in include/utils.h. + * Julian Anastasov : uninitialized fields in nexthop + producing funny "dead" nexthops in multipath routes. + Stupid me, look at the first line in [010803]... Was it difficult to guess + this that time? People blame for several months. :-) + Special thanks to bert hubert who raised the issue in netdev. + Thanks and apologies to Terry Schmidt , + Ruben Puettmann , + Mark Ivens . + * willy tarreau : "make install" target. + * Tunable limit for sch_sfq. Patch to kernel activating this + is about to be submitted. Reminded by Adi Nugroho . + +[010824] + * ip address add sets scope of loopback addresses to "host". + Advised by David Miller. + * ZIP! and David Ford + Some strcpy's changed to strncpy's. + * David Ford , test for compilation with gcc3. + * David Ford . Damn, I broke rtnl_talk in previous + snapshot. + +[010803] + * If "dev" is not specified in multipath route, ifindex remained + uninitialized. Grr. Thanks to Kunihiro Ishiguro . + * Rafal Maszkowski , batch mode tc. The most old patch. + * Updates list of data protocol ids. + Lots of reporters. I bring my apologies. + * Jan Rekorajski . Updated list of datalink types. + * Christina Chen . Bug in parsing IPv6 address match in u32. + * Pekka Savola . ip -6 route flush dev lo stuck + on deleting root of the table. + * Werner. dsmark fixes. + * Alexander Demenshin . Old miraculous bug + in ip monitor. 
It was puzzle, people permanently blame that + it prints some crap. + * Rui Prior . f_route failed to resolve fromif. + Werner also noticed this and sent patch. Bad place... [RETHINK] + * Kim Woelders . + - changes in Makefile for cross-compile + - understand "all" as alias for "any" + - bug in iprule.c +! [ NB. Also he sent patch for kernel. Do not forget! ] + * Werner. Fix to tc core files: wrong exits etc. + * Bernd Jendrissek . Some sanitizations of tc.c +!* Marian Jancar . He say q_tbf prints wrong latency! +! Seems, he is wrong. + * Werner (and Nikolai Vladychevski ) check ->print_copts + to avoid segfault. + +[001007] + * Compiles under rh-7.0 + +[000928] + * Sorry. I have lost all the CVS with changes made since 000305. + If someone sent me a patch after this date, please, resubmit. + Restored from the last backup and mailboxes: + + * Edit ip-cref.tex by raf . + * RTAX_REORDERING support. + * IFLA_MASTER support. + * Bug in rtnl_talk(), libnetlink.c. Reported by David P. Olshfski + + +[000305] + * Bugs in RESOLVE_HOSTNAMES. Bratislav Ilich + * ARPHRD_IEEE802_TR + +[000225] + * ECN in q_red.c. + +[000221] + * diffserv update from Jamal Hadi Salim + * Some bits of IPX from Steve Whitehouse. + * ATM qdisc from Werner Almesberger + * Support for new attributes on routes in linux-2.3. + +[991023] + No news, only several bugs are fixed. + * Since ss990630 "ip rule list" printed wrong prefix length. + Vladimir V. Ivanov + * "ip rule" parsed >INT_MAX values of metric incorrectly. + Matthew G. Marsh + * Some improvements in doc/Makefile advised by + Andi Kleen and Werner Almesberger. + +[990824] + * new attributes in "ip route": rtt, rttvar, cwnd, ssthresh and advmss. + * some updates in documentaion to reflect new status. + +[990630] + * DiffServ support. + Werner Almesberger + Jamal Hadi Salim + * DECnet support. + Steve Whitehouse + * Some minor tweaks in docs and code. + +[990530] + * routel script. Stephen R. 
van den Berg + * Bug in tc/q_prio.c resetting priomap. Reported by + Ole Husgaard and + Jan Kasprzak + * IP command reference manual is published (ip-cref.tex). + I am sorry, but tc-cref.tex is still not ready, to be more + exact the draft does not describe current tc 8-) + * ip, rtmon, rtacct utilities are updated according to manual 8-) + Lots of changes: + - (MAIN) "flush" command for addr, neigh and route. + - error messages are sanitized; now it does not print + usage() page on each error. + - output format is improved. + - "oneline" mode is added. + - etc. + * Name databases; resolution ascii <-> numeric is split out to lib/* + * scripts ifcfg, ifone and rtpr. + * examples/dhcp-client-script is copied from my patch to ISC dhcp. + * Makefile in doc/ directory. + +[990417] + * "pmtudisc" flag to "ip tunnel". Phil Karn + * bug in tc/q_tbf.c preventing setting peak_rate, Martin Mares + * doc/flowlabels.tex + +[990329] + + * This snapshot fixes some compatibility problems, which I introduced + occasionally to previous snapshots. + * Namely, "allot" to "tc qdisc add ... cbq" is accepted but ignored. + * Another changes are supposed to be shown in the next snapshot, but + because of troubles with "allot" I am forced to release premature + version. Namely, "cell", "prio", "weight" etc. are optional now. + * doc/ip-tunnels.tex + +[990327] + * History was not recorded. + +[981002] + * Rani Assaf contributed resolving + addresses to names. + BEWARE! DO NOT USE THIS OPTION, WHEN REPORTING BUGS IN + IPROUTE OR IN KERNEL. ALL THE BUG REPORTS MUST CONTAIN + ONLY NUMERIC ADDRESSES. + +[981101] + * now it should compile for any libc. 
diff --git a/doc/Makefile b/doc/Makefile index e69de29b..636b3288 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -0,0 +1,57 @@ +PSFILES=ip-cref.ps ip-tunnels.ps api-ip6-flowlabels.ps ss.ps nstat.ps arpd.ps rtstat.ps +# tc-cref.ps +# api-rtnl.tex api-pmtudisc.tex api-news.tex +# iki-netdev.ps iki-neighdst.ps + + +LATEX=latex +DVIPS=dvips +SGML2DVI=sgml2latex --output=dvi +SGML2HTML=sgml2html -s 0 +LPR=lpr -Zsduplex +SHELL=bash +PAGESIZE=a4 +PAGESPERPAGE=2 + +HTMLFILES=$(subst .sgml,.html,$(shell echo *.sgml)) +DVIFILES=$(subst .ps,.dvi,$(PSFILES)) + + +all: pstwocol + +pstwocol: $(PSFILES) + +html: $(HTMLFILES) + +dvi: $(DVIFILES) + +print: $(PSFILES) + $(LPR) $(PSFILES) + +%.dvi: %.sgml + $(SGML2DVI) $< + +%.dvi: %.tex + @set -e; pass=2; echo "Running LaTeX $<"; \ + while [ `$(LATEX) $< &1 | \ + grep -c '^\(LaTeX Warning: Label(s) may\|No file \|! Emergency stop\)'` -ge 1 ]; do \ + if [ $$pass -gt 3 ]; then \ + echo "Seems, something is wrong. Try by hands." ; exit 1 ; \ + fi; \ + echo "Re-running LaTeX $<, $${pass}d pass"; pass=$$[$$pass + 1]; \ + done + +%.ps: %.dvi + $(DVIPS) $< -o $@.tmp + ./do-psnup $@.tmp $@ $(PAGESIZE) $(PAGESPERPAGE) + rm -f $@.tmp + +%.html: %.sgml + $(SGML2HTML) $< + +install: + install -m 0644 $(shell echo *.tex) $(DESTDIR)$(DOCDIR) + install -m 0644 $(shell echo *.sgml) $(DESTDIR)$(DOCDIR) + +clean: + rm -f *.aux *.log *.toc $(PSFILES) $(DVIFILES) *.html diff --git a/doc/Plan b/doc/Plan index e69de29b..55f478ea 100644 --- a/doc/Plan +++ b/doc/Plan @@ -0,0 +1,16 @@ +Partially finished work. + +1. User Reference manuals. +1.1 IP Command reference (ip-cref.tex, published) +1.2 TC Command reference (tc-cref.tex) +1.3 IP tunnels (ip-tunnels.tex, published) + +2. Linux-2.2 Networking API +2.1 RTNETLINK (api-rtnl.tex) +2.2 Path MTU Discovery (api-pmtudisc.tex) +2.3 IPv6 Flow Labels (api-ip6-flowlabels.tex, published) +2.4 Miscellaneous extensions (api-misc.tex) + +3. 
Linux-2.2 Networking Intra-Kernel Interfaces +3.1 NetDev --- Networking Devices and netdev... (iki-netdev.tex) +3.2 Neighbour cache and destination cache. (iki-neighdst.tex) diff --git a/doc/SNAPSHOT.tex b/doc/SNAPSHOT.tex index e69de29b..7ed02984 100644 --- a/doc/SNAPSHOT.tex +++ b/doc/SNAPSHOT.tex @@ -0,0 +1 @@ +\def\Draft{020116} diff --git a/doc/api-ip6-flowlabels.tex b/doc/api-ip6-flowlabels.tex index e69de29b..aa34e947 100644 --- a/doc/api-ip6-flowlabels.tex +++ b/doc/api-ip6-flowlabels.tex @@ -0,0 +1,429 @@ +\documentstyle[12pt,twoside]{article} +\def\TITLE{IPv6 Flow Labels} +\input preamble +\begin{center} +\Large\bf IPv6 Flow Labels in Linux-2.2. +\end{center} + + +\begin{center} +{ \large Alexey~N.~Kuznetsov } \\ +\em Institute for Nuclear Research, Moscow \\ +\verb|kuznet@ms2.inr.ac.ru| \\ +\rm April 11, 1999 +\end{center} + +\vspace{5mm} + +\tableofcontents + +\section{Introduction.} + +Every IPv6 packet carries 28 bits of flow information. RFC2460 splits +these bits to two fields: 8 bits of traffic class (or DS field, if you +prefer this term) and 20 bits of flow label. Currently there exist +no well-defined API to manage IPv6 flow information. In this document +I describe an attempt to design the API for Linux-2.2 IPv6 stack. + +\vskip 1mm + +The API must solve the following tasks: + +\begin{enumerate} + +\item To allow user to set traffic class bits. + +\item To allow user to read traffic class bits of received packets. +This feature is not so useful as the first one, however it will be +necessary f.e.\ to implement ECN [RFC2481] for datagram oriented services +or to implement receiver side of SRP or another end-to-end protocol +using traffic class bits. + +\item To assign flow labels to packets sent by user. + +\item To get flow labels of received packets. I do not know +any applications of this feature, but it is possible that receiver will +want to use flow labels to distinguish sub-flows. 
+ +\item To allocate flow labels in the way, compliant to RFC2460. Namely: + +\begin{itemize} +\item +Flow labels must be uniformly distributed (pseudo-)random numbers, +so that any subset of 20 bits can be used as hash key. + +\item +Flows with coinciding source address and flow label must have identical +destination address and not-fragmentable extensions headers (i.e.\ +hop by hop options and all the headers up to and including routing header, +if it is present.) + +\begin{NB} +There is a hole in specs: some hop-by-hop options can be +defined only on per-packet base (f.e.\ jumbo payload option). +Essentially, it means that such options cannot present in packets +with flow labels. +\end{NB} +\begin{NB} +NB notes here and below reflect only my personal opinion, +they should be read with smile or should not be read at all :-). +\end{NB} + + +\item +Flow labels have finite lifetime and source is not allowed to reuse +flow label for another flow within the maximal lifetime has expired, +so that intermediate nodes will be able to invalidate flow state before +the label is taken over by another flow. +Flow state, including lifetime, is propagated along datagram path +by some application specific methods +(f.e.\ in RSVP PATH messages or in some hop-by-hop option). + + +\end{itemize} + +\end{enumerate} + +\section{Sending/receiving flow information.} + +\paragraph{Discussion.} +\addcontentsline{toc}{subsection}{Discussion} +It was proposed (Where? I do not remember any explicit statement) +to solve the first four tasks using +\verb|sin6_flowinfo| field added to \verb|struct| \verb|sockaddr_in6| +(see RFC2553). + +\begin{NB} + This method is difficult to consider as reasonable, because it + puts additional overhead to all the services, despite of only + very small subset of them (none, to be more exact) really use it. + It contradicts both to IETF spirit and the letter. 
Before RFC2553 + one justification existed, IPv6 address alignment left 4 byte + hole in \verb|sockaddr_in6| in any case. Now it has no justification. +\end{NB} + +We have two problems with this method. The first one is common for all OSes: +if \verb|recvmsg()| initializes \verb|sin6_flowinfo| to flow info +of received packet, we lose one very important property of BSD socket API, +namely, we are not allowed to use received address for reply directly +and have to mangle it, even if we are not interested in flowinfo subtleties. + +\begin{NB} + RFC2553 adds new requirement: to clear \verb|sin6_flowinfo|. + Certainly, it is not solution but rather attempt to force applications + to make unnecessary work. Well, as usually, one mistake in design + is followed by attempts to patch the hole and more mistakes... +\end{NB} + +Another problem is Linux specific. Historically Linux IPv6 did not +initialize \verb|sin6_flowinfo| at all, so that, if kernel does not +support flow labels, this field is not zero, but a random number. +Some applications also did not take care about it. + +\begin{NB} +Following RFC2553 such applications can be considered as broken, +but I still think that they are right: clearing all the address +before filling known fields is robust but stupid solution. +Useless wasting CPU cycles and +memory bandwidth is not a good idea. Such patches are acceptable +as temporary hacks, but not as standard of the future. +\end{NB} + + +\paragraph{Implementation.} +\addcontentsline{toc}{subsection}{Implementation} +By default Linux IPv6 does not read \verb|sin6_flowinfo| field +assuming that common applications are not obliged to initialize it +and are permitted to consider it as pure alignment padding. +In order to tell kernel that application +is aware of this field, it is necessary to set socket option +\verb|IPV6_FLOWINFO_SEND|.
+ +\begin{verbatim} + int on = 1; + setsockopt(sock, SOL_IPV6, IPV6_FLOWINFO_SEND, + (void*)&on, sizeof(on)); +\end{verbatim} + +Linux kernel never fills \verb|sin6_flowinfo| field, when passing +message to user space, though the kernels which support flow labels +initialize it to zero. If user wants to get received flowinfo, he +will set option \verb|IPV6_FLOWINFO| and after this he will receive +flowinfo as ancillary data object of type \verb|IPV6_FLOWINFO| +(cf.\ RFC2292). + +\begin{verbatim} + int on = 1; + setsockopt(sock, SOL_IPV6, IPV6_FLOWINFO, (void*)&on, sizeof(on)); +\end{verbatim} + +Flowinfo received and latched by a connected TCP socket also may be fetched +with \verb|getsockopt()| \verb|IPV6_PKTOPTIONS| together with +another optional information. + +Besides that, in the spirit of RFC2292 the option \verb|IPV6_FLOWINFO| +may be used as alternative way to send flowinfo with \verb|sendmsg()| or +to latch it with \verb|IPV6_PKTOPTIONS|. + +\paragraph{Note about IPv6 options and destination address.} +\addcontentsline{toc}{subsection}{IPv6 options and destination address} +If \verb|sin6_flowinfo| does contain not zero flow label, +destination address in \verb|sin6_addr| and non-fragmentable +extension headers are ignored. Instead, kernel uses the values +cached at flow setup (see below). However, for connected sockets +kernel prefers the values set at connection time. + +\paragraph{Example.} +\addcontentsline{toc}{subsection}{Example} +After setting socket option \verb|IPV6_FLOWINFO| +flowlabel and DS field are received as ancillary data object +of type \verb|IPV6_FLOWINFO| and level \verb|SOL_IPV6|. 
+In the cases when it is convenient to use \verb|recvfrom(2)|, +it is possible to replace library variant with your own one, +sort of: + +\begin{verbatim} +#include <sys/socket.h> +#include <netinet/in6.h> + +size_t recvfrom(int fd, char *buf, size_t len, int flags, + struct sockaddr *addr, int *addrlen) +{ + size_t cc; + char cbuf[128]; + struct cmsghdr *c; + struct iovec iov = { buf, len }; + struct msghdr msg = { addr, *addrlen, + &iov, 1, + cbuf, sizeof(cbuf), + 0 }; + + cc = recvmsg(fd, &msg, flags); + if (cc < 0) + return cc; + ((struct sockaddr_in6*)addr)->sin6_flowinfo = 0; + *addrlen = msg.msg_namelen; + for (c=CMSG_FIRSTHDR(&msg); c; c = CMSG_NEXTHDR(&msg, c)) { + if (c->cmsg_level != SOL_IPV6 || + c->cmsg_type != IPV6_FLOWINFO) + continue; + ((struct sockaddr_in6*)addr)->sin6_flowinfo = *(__u32*)CMSG_DATA(c); + } + return cc; +} +\end{verbatim} + + + +\section{Flow label management.} + +\paragraph{Discussion.} +\addcontentsline{toc}{subsection}{Discussion} +Requirements of RFC2460 are pretty tough. Particularly, lifetimes +longer than boot time require to store allocated labels at stable +storage, so that the full implementation necessarily includes user space flow +label manager. There are at least three different approaches: + +\begin{enumerate} +\item {\bf ``Cooperative''. } We could leave flow label allocation wholly +to user space. When user needs label he requests manager directly. The approach +is valid, but as any ``cooperative'' approach it suffers of security problems. + +\begin{NB} +One idea is to disallow not privileged user to allocate flow +labels, but instead to pass the socket to manager via \verb|SCM_RIGHTS| +control message, so that it will allocate label and assign it to socket +itself. Hmm... the idea is interesting. +\end{NB} + +\item {\bf ``Indirect''.} Kernel redirects requests to user level daemon +and does not install label until the daemon acknowledged the request.
+The approach is the most promising, it is especially pleasant to recognize +parallel with IPsec API [RFC2367,Craig]. Actually, it may share API with +IPsec. + +\item {\bf ``Stupid''.} To allocate labels in kernel space. It is the simplest +method, but it suffers of two serious flaws: the first, +we cannot lease labels with lifetimes longer than boot time, the second, +it is sensitive to DoS attacks. Kernel have to remember all the obsolete +labels until their expiration and malicious user may fastly eat all the +flow label space. + +\end{enumerate} + +Certainly, I choose the most ``stupid'' method. It is the cheapest one +for implementor (i.e.\ me), and taking into account that flow labels +still have no serious applications it is not useful to work on more +advanced API, especially, taking into account that eventually we +will get it for no fee together with IPsec. + + +\paragraph{Implementation.} +\addcontentsline{toc}{subsection}{Implementation} +Socket option \verb|IPV6_FLOWLABEL_MGR| allows to +request flow label manager to allocate new flow label, to reuse +already allocated one or to delete old flow label. +Its argument is \verb|struct| \verb|in6_flowlabel_req|: + +\begin{verbatim} +struct in6_flowlabel_req +{ + struct in6_addr flr_dst; + __u32 flr_label; + __u8 flr_action; + __u8 flr_share; + __u16 flr_flags; + __u16 flr_expires; + __u16 flr_linger; + __u32 __flr_reserved; + /* Options in format of IPV6_PKTOPTIONS */ +}; +\end{verbatim} + +\begin{itemize} + +\item \verb|dst| is IPv6 destination address associated with the label. + +\item \verb|label| is flow label value in network byte order. If it is zero, +kernel will allocate new pseudo-random number. Otherwise, kernel will try +to lease flow label ordered by user. In this case, it is user task to provide +necessary flow label randomness. + +\item \verb|action| is requested operation. 
Currently, only three operations +are defined: + +\begin{verbatim} +#define IPV6_FL_A_GET 0 /* Get flow label */ +#define IPV6_FL_A_PUT 1 /* Release flow label */ +#define IPV6_FL_A_RENEW 2 /* Update expire time */ +\end{verbatim} + +\item \verb|flags| are optional modifiers. Currently +only \verb|IPV6_FL_A_GET| has modifiers: + +\begin{verbatim} +#define IPV6_FL_F_CREATE 1 /* Allowed to create new label */ +#define IPV6_FL_F_EXCL 2 /* Do not create new label */ +\end{verbatim} + + +\item \verb|share| defines who is allowed to reuse the same flow label. + +\begin{verbatim} +#define IPV6_FL_S_NONE 0 /* Not defined */ +#define IPV6_FL_S_EXCL 1 /* Label is private */ +#define IPV6_FL_S_PROCESS 2 /* May be reused by this process */ +#define IPV6_FL_S_USER 3 /* May be reused by this user */ +#define IPV6_FL_S_ANY 255 /* Anyone may reuse it */ +\end{verbatim} + +\item \verb|linger| is time in seconds. After the last user releases flow +label, it will not be reused with different destination and options at least +during this time. If \verb|share| is not \verb|IPV6_FL_S_EXCL| the label +still can be shared by another sockets. Current implementation does not allow +unprivileged user to set linger longer than 60 sec. + +\item \verb|expires| is time in seconds. Flow label will be kept at least +for this time, but it will not be destroyed before user released it explicitly +or closed all the sockets using it. Current implementation does not allow +unprivileged user to set timeout longer than 60 sec. Privileged applications +MAY set longer lifetimes, but in this case they MUST save allocated +labels at stable storage and restore them back after reboot before the first +application allocates new flow. + +\end{itemize} + +This structure is followed by optional extension headers associated +with this flow label in format of \verb|IPV6_PKTOPTIONS|. Only +\verb|IPV6_HOPOPTS|, \verb|IPV6_RTHDR| and, if \verb|IPV6_RTHDR| is present, +\verb|IPV6_DSTOPTS| are allowed.
+ +\paragraph{Example.} +\addcontentsline{toc}{subsection}{Example} + The function \verb|get_flow_label| allocates +private flow label. + +\begin{verbatim} +int get_flow_label(int fd, struct sockaddr_in6 *dst, __u32 fl) +{ + int on = 1; + struct in6_flowlabel_req freq; + + memset(&freq, 0, sizeof(freq)); + freq.flr_label = htonl(fl); + freq.flr_action = IPV6_FL_A_GET; + freq.flr_flags = IPV6_FL_F_CREATE | IPV6_FL_F_EXCL; + freq.flr_share = IPV6_FL_S_EXCL; + memcpy(&freq.flr_dst, &dst->sin6_addr, 16); + if (setsockopt(fd, SOL_IPV6, IPV6_FLOWLABEL_MGR, + &freq, sizeof(freq)) == -1) { + perror ("can't lease flowlabel"); + return -1; + } + dst->sin6_flowinfo |= freq.flr_label; + + if (setsockopt(fd, SOL_IPV6, IPV6_FLOWINFO_SEND, + &on, sizeof(on)) == -1) { + perror ("can't send flowinfo"); + + freq.flr_action = IPV6_FL_A_PUT; + setsockopt(fd, SOL_IPV6, IPV6_FLOWLABEL_MGR, + &freq, sizeof(freq)); + return -1; + } + return 0; +} +\end{verbatim} + +A bit more complicated example using routing header can be found +in \verb|ping6| utility (\verb|iputils| package). Linux rsvpd backend +contains an example of using operation \verb|IPV6_FL_A_RENEW|. + +\paragraph{Listing flow labels.} +\addcontentsline{toc}{subsection}{Listing flow labels} +List of currently allocated +flow labels may be read from \verb|/proc/net/ip6_flowlabel|. + +\begin{verbatim} +Label S Owner Users Linger Expires Dst Opt +A1BE5 1 0 0 6 3 3ffe2400000000010a0020fffe71fb30 0 +\end{verbatim} + +\begin{itemize} +\item \verb|Label| is hexadecimal flow label value. +\item \verb|S| is sharing style. +\item \verb|Owner| is ID of creator, it is zero, pid or uid, depending on + sharing style. +\item \verb|Users| is number of applications using the label now. +\item \verb|Linger| is \verb|linger| of this label in seconds. +\item \verb|Expires| is time until expiration of the label in seconds. It may + be negative, if the label is in use. +\item \verb|Dst| is IPv6 destination address. 
+\item \verb|Opt| is length of options, associated with the label. Option + data are not accessible. +\end{itemize} + + +\paragraph{Flow labels and RSVP.} +\addcontentsline{toc}{subsection}{Flow labels and RSVP} +RSVP daemon supports IPv6 flow labels +without any modifications to standard ISI RAPI. Sender must allocate +flow label, fill corresponding sender template and submit it to local rsvp +daemon. rsvpd will check the label and start to announce it in PATH +messages. Rsvpd on sender node will renew the flow label, so that it will not +be reused before path state expires and all the intermediate +routers and receiver purge flow state. + +\verb|rtap| utility is modified to parse flow labels. F.e.\ if user allocated +flow label \verb|0xA1234|, he may write: + +\begin{verbatim} +RTAP> sender 3ffe:2400::1/FL0xA1234 +\end{verbatim} + +Receiver makes reservation with command: +\begin{verbatim} +RTAP> reserve ff 3ffe:2400::1/FL0xA1234 +\end{verbatim} + +\end{document} diff --git a/doc/arpd.sgml b/doc/arpd.sgml index e69de29b..0ab79c60 100644 --- a/doc/arpd.sgml +++ b/doc/arpd.sgml @@ -0,0 +1,130 @@ + + +
+ +ARPD Daemon +<author>Alexey Kuznetsov, <tt/kuznet@ms2.inr.ac.ru/ +<date>some_negative_number, 20 Sep 2001 +<abstract> +<tt/arpd/ is daemon collecting gratuitous ARP information, saving +it on local disk and feeding it to kernel on demand to avoid +redundant broadcasting due to limited size of kernel ARP cache. +</abstract> + + +<p><bf/Description/ + +<p>The format of the command is: + +<tscreen><verb> + arpd OPTIONS [ INTERFACE [ INTERFACE ... ] ] +</verb></tscreen> + +<p> <tt/OPTIONS/ are: + +<itemize> + +<item><tt/-l/ - dump <tt/arpd/ database to stdout and exit. Output consists +of three columns: interface index, IP address and MAC address. +Negative entries for dead hosts are also shown, in this case MAC address +is replaced by word <tt/FAILED/ followed by colon and time when the fact +that host is dead was proven the last time. + +<item><tt/-f FILE/ - read and load <tt/arpd/ database from <tt/FILE/ +in text format similar dumped by option <tt/-l/. Exit after load, +probably listing resulting database, if option <tt/-l/ is also given. +If <tt/FILE/ is <tt/-/, <tt/stdin/ is read to get ARP table. + +<item><tt/-b DATABASE/ - location of database file. Default location is +<tt>/var/lib/arpd/arpd.db</tt>. + +<item><tt/-a NUMBER/ - <tt/arpd/ not only passively listens ARP on wire, but +also sends broadcast queries itself. <tt/NUMBER/ is number of such queries +to make before destination is considered as dead. When <tt/arpd/ is started +as kernel helper (i.e. with <tt/app_solicit/ enabled in <tt/sysctl/ +or even with option <tt/-k/) without this option and still did not learn enough +information, you can observe 1 second gaps in service. Not fatal, but +not good. + +<item><tt/-k/ - suppress sending broadcast queries by kernel. It makes +sense together with option <tt/-a/. + +<item><tt/-n TIME/ - timeout of negative cache. When resolution fails <tt/arpd/ +suppresses further attempts to resolve for this period. It makes sense +only together with option <tt/-k/.
This timeout should not be too much +longer than boot time of a typical host not supporting gratuitous ARP. +Default value is 60 seconds. + +<item><tt/-R RATE/ - maximal steady rate of broadcasts sent by <tt/arpd/ +in packets per second. Default value is 1. + +<item><tt/-B NUMBER/ - number of broadcasts sent by <tt/arpd/ back to back. +Default value is 3. Together with option <tt/-R/ this option allows +to police broadcasting not to exceed <tt/B+R*T/ over any interval +of time <tt/T/. + +</itemize> + +<p><tt/INTERFACE/ is name of networking interface to watch. +If no interfaces given, <tt/arpd/ monitors all the interfaces. +In this case <tt/arpd/ does not adjust <tt/sysctl/ parameters, +it is supposed user does this himself after <tt/arpd/ is started. + + +<p> Signals + +<p> <tt/arpd/ exits gracefully syncing database and restoring adjusted +<tt/sysctl/ parameters, when receives <tt/SIGINT/ or <tt/SIGTERM/. +<tt/SIGHUP/ syncs database to disk. <tt/SIGUSR1/ sends some statistics +to <tt/syslog/. Effect of another signals is undefined, they may corrupt +database and leave <tt/sysctl/ parameters in an unpredictable state. + +<p> Note + +<p> In order to <tt/arpd/ be able to serve as ARP resolver, kernel must be +compiled with the option <tt/CONFIG_ARPD/ and, in the case when interface list +is not given on command line, variable <tt/app_solicit/ +on interfaces of interest should be set in <tt>/proc/sys/net/ipv4/neigh/*</tt>. +If this is not made <tt/arpd/ still collects gratuitous ARP information +in its database.
+ +<p> Examples + +<enum> +<item> Start <tt/arpd/ to collect gratuitous ARP, but not messing +with kernel functionality: + +<tscreen><verb> + arpd -b /var/tmp/arpd.db +</verb></tscreen> + +<item> Look at result after some time: + +<tscreen><verb> + killall arpd + arpd -l -b /var/tmp/arpd.db +</verb></tscreen> + +<item> To enable kernel helper, leaving leading role to kernel: + +<tscreen><verb> + arpd -b /var/tmp/arpd.db -a 1 eth0 eth1 +</verb></tscreen> + +<item> Completely replace kernel resolution on interfaces <tt/eth0/ +and <tt/eth1/. In this case kernel still does unicast probing to +validate entries, but all the broadcast activity is suppressed +and made under authority of <tt/arpd/: + +<tscreen><verb> + arpd -b /var/tmp/arpd.db -a 3 -k eth0 eth1 +</verb></tscreen> + +This is mode which <tt/arpd/ is supposed to work normally. +It is not default just to prevent occasional enabling of too aggressive +mode occasionally. + +</enum> + +</article> + diff --git a/doc/do-psnup b/doc/do-psnup index e69de29b..2dce848e 100644 --- a/doc/do-psnup +++ b/doc/do-psnup @@ -0,0 +1,16 @@ +#! /bin/bash +# $1 = Temporary file . "string" +# $2 = File to process . "string" +# $3 = Page size . ie: a4 , letter ... "string" +# $4 = Number of pages to fit on a single sheet . "numeric" + +if type psnup >&/dev/null; then + echo "psnup -$4 -p$3 $1 $2" + psnup -$4 -p$3 $1 $2 +elif type psmulti >&/dev/null; then + echo "psmulti $1 > $2" + psmulti $1 > $2 +else + echo "cp $1 $2" + cp $1 $2 +fi diff --git a/doc/ip-cref.tex b/doc/ip-cref.tex index e69de29b..5eaa4a89 100644 --- a/doc/ip-cref.tex +++ b/doc/ip-cref.tex @@ -0,0 +1,3316 @@ +\documentstyle[12pt,twoside]{article} +\def\TITLE{IP Command Reference} +\input preamble +\begin{center} +\Large\bf IP Command Reference. 
+\end{center} + + +\begin{center} +{ \large Alexey~N.~Kuznetsov } \\ +\em Institute for Nuclear Research, Moscow \\ +\verb|kuznet@ms2.inr.ac.ru| \\ +\rm April 14, 1999 +\end{center} + +\vspace{5mm} + +\tableofcontents + +\newpage + +\section{About this document} + +This document presents a comprehensive description of the \verb|ip| utility +from the \verb|iproute2| package. It is not a tutorial or user's guide. +It is a {\em dictionary\/}, not explaining terms, +but translating them into other terms, which may also be unknown to the reader. +However, the document is self-contained and the reader, provided they have a +basic networking background, will find enough information +and examples to understand and configure Linux-2.2 IP and IPv6 +networking. + +This document is split into sections explaining \verb|ip| commands +and options, decrypting \verb|ip| output and containing a few examples. +More voluminous examples and some topics, which require more elaborate +discussion, are in the appendix. + +The paragraphs beginning with NB contain side notes, warnings about +bugs and design drawbacks. They may be skipped at the first reading. + +\section{{\tt ip} --- command syntax} + +The generic form of an \verb|ip| command is: +\begin{verbatim} +ip [ OPTIONS ] OBJECT [ COMMAND [ ARGUMENTS ]] +\end{verbatim} +where \verb|OPTIONS| is a set of optional modifiers affecting the +general behaviour of the \verb|ip| utility or changing its output. All options +begin with the character \verb|'-'| and may be used in either long or abbreviated +forms. Currently, the following options are available: + +\begin{itemize} +\item \verb|-V|, \verb|-Version| + +--- print the version of the \verb|ip| utility and exit. + + +\item \verb|-s|, \verb|-stats|, \verb|-statistics| + +--- output more information. If the option +appears twice or more, the amount of information increases. +As a rule, the information is statistics or some time values. 
+ + +\item \verb|-f|, \verb|-family| followed by a protocol family +identifier: \verb|inet|, \verb|inet6| or \verb|link|. + +--- enforce the protocol family to use. If the option is not present, +the protocol family is guessed from other arguments. If the rest of the command +line does not give enough information to guess the family, \verb|ip| falls back to the default +one, usually \verb|inet| or \verb|any|. \verb|link| is a special family +identifier meaning that no networking protocol is involved. + +\item \verb|-4| + +--- shortcut for \verb|-family inet|. + +\item \verb|-6| + +--- shortcut for \verb|-family inet6|. + +\item \verb|-0| + +--- shortcut for \verb|-family link|. + + +\item \verb|-o|, \verb|-oneline| + +--- output each record on a single line, replacing line feeds +with the \verb|'\'| character. This is convenient when you want to +count records with \verb|wc| or to \verb|grep| the output. The trivial +script \verb|rtpr| converts the output back into readable form. + +\item \verb|-r|, \verb|-resolve| + +--- use the system's name resolver to print DNS names instead of +host addresses. + +\begin{NB} + Do not use this option when reporting bugs or asking for advice. +\end{NB} +\begin{NB} + \verb|ip| never uses DNS to resolve names to addresses. +\end{NB} + +\end{itemize} + +\verb|OBJECT| is the object to manage or to get information about. 
+The object types currently understood by \verb|ip| are: + +\begin{itemize} +\item \verb|link| --- network device +\item \verb|address| --- protocol (IP or IPv6) address on a device +\item \verb|neighbour| --- ARP or NDISC cache entry +\item \verb|route| --- routing table entry +\item \verb|rule| --- rule in routing policy database +\item \verb|maddress| --- multicast address +\item \verb|mroute| --- multicast routing cache entry +\item \verb|tunnel| --- tunnel over IP +\end{itemize} + +Again, the names of all objects may be written in full or +abbreviated form, f.e.\ \verb|address| is abbreviated as \verb|addr| +or just \verb|a|. + +\verb|COMMAND| specifies the action to perform on the object. +The set of possible actions depends on the object type. +As a rule, it is possible to \verb|add|, \verb|delete| and +\verb|show| (or \verb|list|) objects, but some objects +do not allow all of these operations or have some additional commands. +The \verb|help| command is available for all objects. It prints +out a list of available commands and argument syntax conventions. + +If no command is given, some default command is assumed. +Usually it is \verb|list| or, if the objects of this class +cannot be listed, \verb|help|. + +\verb|ARGUMENTS| is a list of arguments to the command. +The arguments depend on the command and object. There are two types of arguments: +{\em flags\/}, consisting of a single keyword, and {\em parameters\/}, +consisting of a keyword followed by a value. For convenience, +each command has some {\em default parameter\/} +which may be omitted. F.e.\ parameter \verb|dev| is the default +for the {\tt ip link} command, so {\tt ip link ls eth0} is equivalent +to {\tt ip link ls dev eth0}. +In the command descriptions below such parameters +are distinguished with the marker: ``(default)''. + +Almost all keywords may be abbreviated with several first (or even single) +letters. 
The shortcuts are convenient when \verb|ip| is used interactively, +but they are not recommended in scripts or when reporting bugs +or asking for advice. ``Officially'' allowed abbreviations are listed +in the document body. + + + +\section{{\tt ip} --- error messages} + +\verb|ip| may fail for one of the following reasons: + +\begin{itemize} +\item +A syntax error on the command line: an unknown keyword, incorrectly formatted +IP address {\em et al\/}. In this case \verb|ip| prints an error message +and exits. As a rule, the error message will contain information +about the reason for the failure. Sometimes it also prints a help page. + +\item +The arguments did not pass verification for self-consistency. + +\item +\verb|ip| failed to compile a kernel request from the arguments +because the user didn't give enough information. + +\item +The kernel returned an error to some syscall. In this case \verb|ip| +prints the error message, as it is output with \verb|perror(3)|, +prefixed with a comment and a syscall identifier. + +\item +The kernel returned an error to some RTNETLINK request. +In this case \verb|ip| prints the error message, as it is output +with \verb|perror(3)| prefixed with ``RTNETLINK answers:''. + +\end{itemize} + +All the operations are atomic, i.e.\ +if the \verb|ip| utility fails, it does not change anything +in the system. One harmful exception is \verb|ip link| command +(Sec.\ref{IP-LINK}, p.\pageref{IP-LINK}), +which may change only some of the device parameters given +on command line. + +It is difficult to list all the error messages (especially +syntax errors). However, as a rule, their meaning is clear +from the context of the command. + +The most common mistakes are: + +\begin{enumerate} +\item Netlink is not configured in the kernel. The message is: +\begin{verbatim} +Cannot open netlink socket: Invalid value +\end{verbatim} + +\item RTNETLINK is not configured in the kernel. 
In this case +one of the following messages may be printed, depending on the command: +\begin{verbatim} +Cannot talk to rtnetlink: Connection refused +Cannot send dump request: Connection refused +\end{verbatim} + +\item The \verb|CONFIG_IP_MULTIPLE_TABLES| option was not selected +when configuring the kernel. In this case any attempt to use the +\verb|ip| \verb|rule| command will fail, f.e. +\begin{verbatim} +kuznet@kaiser $ ip rule list +RTNETLINK error: Invalid argument +dump terminated +\end{verbatim} + +\end{enumerate} + + +\section{{\tt ip link} --- network device configuration} +\label{IP-LINK} + +\paragraph{Object:} A \verb|link| is a network device and the corresponding +commands display and change the state of devices. + +\paragraph{Commands:} \verb|set| and \verb|show| (or \verb|list|). + +\subsection{{\tt ip link set} --- change device attributes} + +\paragraph{Abbreviations:} \verb|set|, \verb|s|. + +\paragraph{Arguments:} + +\begin{itemize} +\item \verb|dev NAME| (default) + +--- \verb|NAME| specifies the network device on which to operate. + +\item \verb|up| and \verb|down| + +--- change the state of the device to \verb|UP| or \verb|DOWN|. + +\item \verb|arp on| or \verb|arp off| + +--- change the \verb|NOARP| flag on the device. + +\begin{NB} +This operation is {\em not allowed\/} if the device is in state \verb|UP|. +Though neither the \verb|ip| utility nor the kernel check for this condition. +You can get unpredictable results changing this flag while the +device is running. +\end{NB} + +\item \verb|multicast on| or \verb|multicast off| + +--- change the \verb|MULTICAST| flag on the device. + +\item \verb|dynamic on| or \verb|dynamic off| + +--- change the \verb|DYNAMIC| flag on the device. + +\item \verb|name NAME| + +--- change the name of the device. This operation is not +recommended if the device is running or has some addresses +already configured. 
+ +\item \verb|txqueuelen NUMBER| or \verb|txqlen NUMBER| + +--- change the transmit queue length of the device. + +\item \verb|mtu NUMBER| + +--- change the MTU of the device. + +\item \verb|address LLADDRESS| + +--- change the station address of the interface. + +\item \verb|broadcast LLADDRESS|, \verb|brd LLADDRESS| or \verb|peer LLADDRESS| + +--- change the link layer broadcast address or the peer address when +the interface is \verb|POINTOPOINT|. + +\vskip 1mm +\begin{NB} +For most devices (f.e.\ for Ethernet) changing the link layer +broadcast address will break networking. +Do not use it, if you do not understand what this operation really does. +\end{NB} + +\end{itemize} + +\vskip 1mm +\begin{NB} +The {\tt ip} utility does not change the \verb|PROMISC| +or \verb|ALLMULTI| flags. These flags are considered +obsolete and should not be changed administratively. +\end{NB} + +\paragraph{Warning:} If multiple parameter changes are requested, +\verb|ip| aborts immediately after any of the changes have failed. +This is the only case when \verb|ip| can move the system to +an unpredictable state. The solution is to avoid changing +several parameters with one {\tt ip link set} call. + +\paragraph{Examples:} +\begin{itemize} +\item \verb|ip link set dummy address 00:00:00:00:00:01| + +--- change the station address of the interface \verb|dummy|. + +\item \verb|ip link set dummy up| + +--- start the interface \verb|dummy|. + +\end{itemize} + + +\subsection{{\tt ip link show} --- display device attributes} +\label{IP-LINK-SHOW} + +\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|lst|, \verb|sh|, \verb|ls|, +\verb|l|. + +\paragraph{Arguments:} +\begin{itemize} +\item \verb|dev NAME| (default) + +--- \verb|NAME| specifies the network device to show. +If this argument is omitted all devices are listed. + +\item \verb|up| + +--- only display running interfaces. 
+ +\end{itemize} + + +\paragraph{Output format:} + +\begin{verbatim} +kuznet@alisa:~ $ ip link ls eth0 +3: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc cbq qlen 100 + link/ether 00:a0:cc:66:18:78 brd ff:ff:ff:ff:ff:ff +kuznet@alisa:~ $ ip link ls sit0 +5: sit0@NONE: <NOARP,UP> mtu 1480 qdisc noqueue + link/sit 0.0.0.0 brd 0.0.0.0 +kuznet@alisa:~ $ ip link ls dummy +2: dummy: <BROADCAST,NOARP> mtu 1500 qdisc noop + link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff +kuznet@alisa:~ $ +\end{verbatim} + + +The number before each colon is an {\em interface index\/} or {\em ifindex\/}. +This number uniquely identifies the interface. This is followed by the {\em interface name\/} +(\verb|eth0|, \verb|sit0| etc.). The interface name is also +unique at every given moment. However, the interface may disappear from the +list (f.e.\ when the corresponding driver module is unloaded) and another +one with the same name may be created later. Besides that, +the administrator may change the name of any device with +\verb|ip| \verb|link| \verb|set| \verb|name| +to make it more intelligible. + +The interface name may have another name or \verb|NONE| appended +after the \verb|@| sign. This means that this device is bound to some other +device, +i.e.\ packets sent through it are encapsulated and sent via the ``master'' +device. If the name is \verb|NONE|, the master is unknown. + +Then we see the interface {\em mtu\/} (``maximal transfer unit''). This determines +the maximal size of data which can be sent as a single packet over this interface. + +{\em qdisc\/} (``queuing discipline'') shows the queuing algorithm used +on the interface. Particularly, \verb|noqueue| means that this interface +does not queue anything and \verb|noop| means that the interface is in blackhole +mode i.e.\ all packets sent to it are immediately discarded. +{\em qlen\/} is the default transmit queue length of the device measured +in packets. + +The interface flags are summarized in the angle brackets.
+ +\begin{itemize} +\item \verb|UP| --- the device is turned on. It is ready to accept +packets for transmission and it may inject into the kernel packets received +from other nodes on the network. + +\item \verb|LOOPBACK| --- the interface does not communicate with other +hosts. All packets sent through it will be returned +and nothing but bounced packets can be received. + +\item \verb|BROADCAST| --- the device has the facility to send packets +to all hosts sharing the same link. A typical example is an Ethernet link. + +\item \verb|POINTOPOINT| --- the link has only two ends with one node +attached to each end. All packets sent to this link will reach the peer +and all packets received by us came from this single peer. + +If neither \verb|LOOPBACK| nor \verb|BROADCAST| nor \verb|POINTOPOINT| +are set, the interface is assumed to be NBMA (Non-Broadcast Multi-Access). +This is the most generic type of device and the most complicated one, because +the host attached to an NBMA link has no means to send to anyone +without additionally configured information. + +\item \verb|MULTICAST| --- is an advisory flag indicating that the interface +is aware of multicasting i.e.\ sending packets to some subset of neighbouring +nodes. Broadcasting is a particular case of multicasting, where the multicast +group consists of all nodes on the link. It is important to emphasize +that software {\em must not\/} interpret the absence of this flag as the inability +to use multicasting on this interface. Any \verb|POINTOPOINT| and +\verb|BROADCAST| link is multicasting by definition, because we have +direct access to all the neighbours and, hence, to any part of them. +Certainly, the use of high bandwidth multicast transfers is not recommended +on broadcast-only links because of high expense, but it is not strictly +prohibited.
+ +\item \verb|PROMISC| --- the device listens to and feeds to the kernel all +traffic on the link even if it is not destined for us, not broadcasted +and not destined for a multicast group of which we are a member. Usually +this mode exists only on broadcast links and is used by bridges and for network +monitoring. + +\item \verb|ALLMULTI| --- the device receives all multicast packets +wandering on the link. This mode is used by multicast routers. + +\item \verb|NOARP| --- this flag is different from the other ones. It has +no invariant value and its interpretation depends on the network protocols +involved. As a rule, it indicates that the device needs no address +resolution and that the software or hardware knows how to deliver packets +without any help from the protocol stacks. + +\item \verb|DYNAMIC| --- is an advisory flag indicating that the interface is +dynamically created and destroyed. + +\item \verb|SLAVE| --- this interface is bonded to some other interfaces +to share link capacities. + +\end{itemize} + +\vskip 1mm +\begin{NB} +There are other flags but they are either obsolete (\verb|NOTRAILERS|) +or not implemented (\verb|DEBUG|) or specific to some devices +(\verb|MASTER|, \verb|AUTOMEDIA| and \verb|PORTSEL|). We do not discuss +them here. +\end{NB} +\begin{NB} +The values of \verb|PROMISC| and \verb|ALLMULTI| flags +shown by the \verb|ifconfig| utility and by the \verb|ip| utility +are {\em different\/}. \verb|ip link ls| shows the true device state, +while \verb|ifconfig| shows the virtual state which was set with +\verb|ifconfig| itself. +\end{NB} + + +The second line contains information on the link layer addresses +associated with the device. The first word (\verb|ether|, \verb|sit|) +defines the interface hardware type. This type determines the format and semantics +of the addresses and is logically part of the address.
+The default format of the station address and the broadcast address +(or the peer address for pointopoint links) is a +sequence of hexadecimal bytes separated by colons, but some link +types may have their natural address format, f.e.\ addresses +of tunnels over IP are printed as dotted-quad IP addresses. + +\vskip 1mm +\begin{NB} + NBMA links have no well-defined broadcast or peer address, + however this field may contain useful information, f.e.\ + about the address of broadcast relay or about the address of the ARP server. +\end{NB} +\begin{NB} +Multicast addresses are not shown by this command, see +\verb|ip maddr ls| in~Sec.\ref{IP-MADDR} (p.\pageref{IP-MADDR} of this +document). +\end{NB} + + +\paragraph{Statistics:} With the \verb|-statistics| option, \verb|ip| also +prints interface statistics: + +\begin{verbatim} +kuznet@alisa:~ $ ip -s link ls eth0 +3: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc cbq qlen 100 + link/ether 00:a0:cc:66:18:78 brd ff:ff:ff:ff:ff:ff + RX: bytes packets errors dropped overrun mcast + 2449949362 2786187 0 0 0 0 + TX: bytes packets errors dropped carrier collsns + 178558497 1783945 332 0 332 35172 +kuznet@alisa:~ $ +\end{verbatim} +\verb|RX:| and \verb|TX:| lines summarize receiver and transmitter +statistics. They contain: +\begin{itemize} +\item \verb|bytes| --- the total number of bytes received or transmitted +on the interface. This number wraps when the maximal length of the data type +natural for the architecture is exceeded, so continuous monitoring requires +a user level daemon snapping it periodically. +\item \verb|packets| --- the total number of packets received or transmitted +on the interface. +\item \verb|errors| --- the total number of receiver or transmitter errors. +\item \verb|dropped| --- the total number of packets dropped due to lack +of resources. +\item \verb|overrun| --- the total number of receiver overruns resulting +in dropped packets. 
As a rule, if the interface is overrun, it means +serious problems in the kernel or that your machine is too slow +for this interface. +\item \verb|mcast| --- the total number of received multicast packets. This option +is only supported by a few devices. +\item \verb|carrier| --- total number of link media failures f.e.\ because +of lost carrier. +\item \verb|collsns| --- the total number of collision events +on Ethernet-like media. This number may have a different sense on other +link types. +\item \verb|compressed| --- the total number of compressed packets. This is +available only for links using VJ header compression. +\end{itemize} + + +If the \verb|-s| option is entered twice or more, +\verb|ip| prints more detailed statistics on receiver +and transmitter errors. + +\begin{verbatim} +kuznet@alisa:~ $ ip -s -s link ls eth0 +3: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc cbq qlen 100 + link/ether 00:a0:cc:66:18:78 brd ff:ff:ff:ff:ff:ff + RX: bytes packets errors dropped overrun mcast + 2449949362 2786187 0 0 0 0 + RX errors: length crc frame fifo missed + 0 0 0 0 0 + TX: bytes packets errors dropped carrier collsns + 178558497 1783945 332 0 332 35172 + TX errors: aborted fifo window heartbeat + 0 0 0 332 +kuznet@alisa:~ $ +\end{verbatim} +These error names are pure Ethernetisms. Other devices +may have non zero values in these fields but they may be +interpreted differently. + + +\section{{\tt ip address} --- protocol address management} + +\paragraph{Abbreviations:} \verb|address|, \verb|addr|, \verb|a|. + +\paragraph{Object:} The \verb|address| is a protocol (IP or IPv6) address attached +to a network device. Each device must have at least one address +to use the corresponding protocol. It is possible to have several +different addresses attached to one device. These addresses are not +discriminated, so that the term {\em alias\/} is not quite appropriate +for them and we do not use it in this document. 
+ +The \verb|ip addr| command displays addresses and their properties, +adds new addresses and deletes old ones. + +\paragraph{Commands:} \verb|add|, \verb|delete|, \verb|flush| and \verb|show| +(or \verb|list|). + + +\subsection{{\tt ip address add} --- add a new protocol address} +\label{IP-ADDR-ADD} + +\paragraph{Abbreviations:} \verb|add|, \verb|a|. + +\paragraph{Arguments:} + +\begin{itemize} +\item \verb|dev NAME| + +\noindent--- the name of the device to add the address to. + +\item \verb|local ADDRESS| (default) + +--- the address of the interface. The format of the address depends +on the protocol. It is a dotted quad for IP and a sequence of hexadecimal halfwords +separated by colons for IPv6. The \verb|ADDRESS| may be followed by +a slash and a decimal number which encodes the network prefix length. + + +\item \verb|peer ADDRESS| + +--- the address of the remote endpoint for pointopoint interfaces. +Again, the \verb|ADDRESS| may be followed by a slash and a decimal number, +encoding the network prefix length. If a peer address is specified, +the local address {\em cannot\/} have a prefix length. The network prefix is associated +with the peer rather than with the local address. + + +\item \verb|broadcast ADDRESS| + +--- the broadcast address on the interface. + +It is possible to use the special symbols \verb|'+'| and \verb|'-'| +instead of the broadcast address. In this case, the broadcast address +is derived by setting/resetting the host bits of the interface prefix. + +\vskip 1mm +\begin{NB} +Unlike \verb|ifconfig|, the \verb|ip| utility {\em does not\/} set any broadcast +address unless explicitly requested. +\end{NB} + + +\item \verb|label NAME| + +--- Each address may be tagged with a label string. +In order to preserve compatibility with Linux-2.0 net aliases, +this string must coincide with the name of the device or must be prefixed +with the device name followed by colon. 
+ + +\item \verb|scope SCOPE_VALUE| + +--- the scope of the area where this address is valid. +The available scopes are listed in file \verb|/etc/iproute2/rt_scopes|. +Predefined scope values are: + + \begin{itemize} + \item \verb|global| --- the address is globally valid. + \item \verb|site| --- (IPv6 only) the address is site local, + i.e.\ it is valid inside this site. + \item \verb|link| --- the address is link local, i.e.\ + it is valid only on this device. + \item \verb|host| --- the address is valid only inside this host. + \end{itemize} + +Appendix~\ref{ADDR-SEL} (p.\pageref{ADDR-SEL} of this document) +contains more details on address scopes. + +\end{itemize} + +\paragraph{Examples:} +\begin{itemize} +\item \verb|ip addr add 127.0.0.1/8 dev lo brd + scope host| + +--- add the usual loopback address to the loopback device. + +\item \verb|ip addr add 10.0.0.1/24 brd + dev eth0 label eth0:Alias| + +--- add the address 10.0.0.1 with prefix length 24 (i.e.\ netmask +\verb|255.255.255.0|), standard broadcast and label \verb|eth0:Alias| +to the interface \verb|eth0|. +\end{itemize} + + +\subsection{{\tt ip address delete} --- delete a protocol address} + +\paragraph{Abbreviations:} \verb|delete|, \verb|del|, \verb|d|. + +\paragraph{Arguments:} coincide with the arguments of \verb|ip addr add|. +The device name is a required argument. The rest are optional. +If no arguments are given, the first address is deleted. + +\paragraph{Examples:} +\begin{itemize} +\item \verb|ip addr del 127.0.0.1/8 dev lo| + +--- deletes the loopback address from the loopback device. +It would be best not to repeat this experiment. + +\item Disable IP on the interface \verb|eth0|: +\begin{verbatim} + while ip -f inet addr del dev eth0; do + : nothing + done +\end{verbatim} +Another method to disable IP on an interface using {\tt ip addr flush} +may be found in sec.\ref{IP-ADDR-FLUSH}, p.\pageref{IP-ADDR-FLUSH}. 
+ +\end{itemize} + + +\subsection{{\tt ip address show} --- display protocol addresses} + +\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|lst|, \verb|sh|, \verb|ls|, +\verb|l|. + +\paragraph{Arguments:} + +\begin{itemize} +\item \verb|dev NAME| (default) + +--- the name of the device. + +\item \verb|scope SCOPE_VAL| + +--- only list addresses with this scope. + +\item \verb|to PREFIX| + +--- only list addresses matching this prefix. + +\item \verb|label PATTERN| + +--- only list addresses with labels matching the \verb|PATTERN|. +\verb|PATTERN| is a usual shell style pattern. + + +\item \verb|dynamic| and \verb|permanent| + +--- (IPv6 only) only list addresses installed due to stateless +address configuration or only list permanent (not dynamic) addresses. + +\item \verb|tentative| + +--- (IPv6 only) only list addresses which did not pass duplicate +address detection. + +\item \verb|deprecated| + +--- (IPv6 only) only list deprecated addresses. + + +\item \verb|primary| and \verb|secondary| + +--- only list primary (or secondary) addresses. + +\end{itemize} + + +\paragraph{Output format:} + +\begin{verbatim} +kuznet@alisa:~ $ ip addr ls eth0 +3: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc cbq qlen 100 + link/ether 00:a0:cc:66:18:78 brd ff:ff:ff:ff:ff:ff + inet 193.233.7.90/24 brd 193.233.7.255 scope global eth0 + inet6 3ffe:2400:0:1:2a0:ccff:fe66:1878/64 scope global dynamic + valid_lft forever preferred_lft 604746sec + inet6 fe80::2a0:ccff:fe66:1878/10 scope link +kuznet@alisa:~ $ +\end{verbatim} + +The first two lines coincide with the output of \verb|ip link ls|. +It is natural to interpret link layer addresses +as addresses of the protocol family \verb|AF_PACKET|. + +Then the list of IP and IPv6 addresses follows, accompanied by +additional address attributes: scope value (see Sec.\ref{IP-ADDR-ADD}, +p.\pageref{IP-ADDR-ADD} above), flags and the address label. + +Address flags are set by the kernel and cannot be changed +administratively. 
Currently, the following flags are defined: + +\begin{enumerate} +\item \verb|secondary| + +--- the address is not used when selecting the default source address +of outgoing packets (Cf.\ Appendix~\ref{ADDR-SEL}, p.\pageref{ADDR-SEL}.). +An IP address becomes secondary if another address with the same +prefix bits already exists. The first address is primary. +It is the leader of the group of all secondary addresses. When the leader +is deleted, all secondaries are purged too. + + +\item \verb|dynamic| + +--- the address was created due to stateless autoconfiguration~\cite{RFC-ADDRCONF}. +In this case the output also contains information on times, when +the address is still valid. After \verb|preferred_lft| expires the address is +moved to the deprecated state. After \verb|valid_lft| expires the address +is finally invalidated. + +\item \verb|deprecated| + +--- the address is deprecated, i.e.\ it is still valid, but cannot +be used by newly created connections. + +\item \verb|tentative| + +--- the address is not used because duplicate address detection~\cite{RFC-ADDRCONF} +is still not complete or failed. + +\end{enumerate} + + +\subsection{{\tt ip address flush} --- flush protocol addresses} +\label{IP-ADDR-FLUSH} + +\paragraph{Abbreviations:} \verb|flush|, \verb|f|. + +\paragraph{Description:}This command flushes the protocol addresses +selected by some criteria. + +\paragraph{Arguments:} This command has the same arguments as \verb|show|. +The difference is that it does not run when no arguments are given. + +\paragraph{Warning:} This command (and other \verb|flush| commands +described below) is pretty dangerous. If you make a mistake, it will +not forgive it, but will cruelly purge all the addresses. + +\paragraph{Statistics:} With the \verb|-statistics| option, the command +becomes verbose. It prints out the number of deleted addresses and the number +of rounds made to flush the address list. 
If this option is given +twice, \verb|ip addr flush| also dumps all the deleted addresses +in the format described in the previous subsection. + +\paragraph{Example:} Delete all the addresses from the private network +10.0.0.0/8: +\begin{verbatim} +netadm@amber:~ # ip -s -s a f to 10/8 +2: dummy inet 10.7.7.7/16 brd 10.7.255.255 scope global dummy +3: eth0 inet 10.10.7.7/16 brd 10.10.255.255 scope global eth0 +4: eth1 inet 10.8.7.7/16 brd 10.8.255.255 scope global eth1 + +*** Round 1, deleting 3 addresses *** +*** Flush is complete after 1 round *** +netadm@amber:~ # +\end{verbatim} +Another instructive example is disabling IP on all the Ethernets: +\begin{verbatim} +netadm@amber:~ # ip -4 addr flush label "eth*" +\end{verbatim} +And the last example shows how to flush all the IPv6 addresses +acquired by the host from stateless address autoconfiguration +after you enabled forwarding or disabled autoconfiguration. +\begin{verbatim} +netadm@amber:~ # ip -6 addr flush dynamic +\end{verbatim} + + + +\section{{\tt ip neighbour} --- neighbour/arp tables management} + +\paragraph{Abbreviations:} \verb|neighbour|, \verb|neighbor|, \verb|neigh|, +\verb|n|. + +\paragraph{Object:} \verb|neighbour| objects establish bindings between protocol +addresses and link layer addresses for hosts sharing the same link. +Neighbour entries are organized into tables. The IPv4 neighbour table +is known by another name --- the ARP table. + +The corresponding commands display neighbour bindings +and their properties, add new neighbour entries and delete old ones. + +\paragraph{Commands:} \verb|add|, \verb|change|, \verb|replace|, +\verb|delete|, \verb|flush| and \verb|show| (or \verb|list|). + +\paragraph{See also:} Appendix~\ref{PROXY-NEIGH}, p.\pageref{PROXY-NEIGH} +describes how to manage proxy ARP/NDISC with the \verb|ip| utility. 
+ + +\subsection{{\tt ip neighbour add} --- add a new neighbour entry\\ + {\tt ip neighbour change} --- change an existing entry\\ + {\tt ip neighbour replace} --- add a new entry or change an existing one} + +\paragraph{Abbreviations:} \verb|add|, \verb|a|; \verb|change|, \verb|chg|; +\verb|replace|, \verb|repl|. + +\paragraph{Description:} These commands create new neighbour records +or update existing ones. + +\paragraph{Arguments:} + +\begin{itemize} +\item \verb|to ADDRESS| (default) + +--- the protocol address of the neighbour. It is either an IPv4 or IPv6 address. + +\item \verb|dev NAME| + +--- the interface to which this neighbour is attached. + + +\item \verb|lladdr LLADDRESS| + +--- the link layer address of the neighbour. \verb|LLADDRESS| can also be +\verb|null|. + +\item \verb|nud NUD_STATE| + +--- the state of the neighbour entry. \verb|nud| is an abbreviation for ``Neighbour +Unreachability Detection''. The state can take one of the following values: + +\begin{enumerate} +\item \verb|permanent| --- the neighbour entry is valid forever and can only be removed +administratively. +\item \verb|noarp| --- the neighbour entry is valid. No attempts to validate +this entry will be made but it can be removed when its lifetime expires. +\item \verb|reachable| --- the neighbour entry is valid until the reachability +timeout expires. +\item \verb|stale| --- the neighbour entry is valid but suspicious. +This option to \verb|ip neigh| does not change the neighbour state if +it was valid and the address is not changed by this command. +\end{enumerate} + +\end{itemize} + +\paragraph{Examples:} +\begin{itemize} +\item \verb|ip neigh add 10.0.0.3 lladdr 0:0:0:0:0:1 dev eth0 nud perm| + +--- add a permanent ARP entry for the neighbour 10.0.0.3 on the device \verb|eth0|. + +\item \verb|ip neigh chg 10.0.0.3 dev eth0 nud reachable| + +--- change its state to \verb|reachable|.
+\end{itemize} + + +\subsection{{\tt ip neighbour delete} --- delete a neighbour entry} + +\paragraph{Abbreviations:} \verb|delete|, \verb|del|, \verb|d|. + +\paragraph{Description:} This command invalidates a neighbour entry. + +\paragraph{Arguments:} The arguments are the same as with \verb|ip neigh add|, +except that \verb|lladdr| and \verb|nud| are ignored. + + +\paragraph{Example:} +\begin{itemize} +\item \verb|ip neigh del 10.0.0.3 dev eth0| + +--- invalidate an ARP entry for the neighbour 10.0.0.3 on the device \verb|eth0|. + +\end{itemize} + +\begin{NB} + The deleted neighbour entry will not disappear from the tables + immediately. If it is in use it cannot be deleted until the last + client releases it. Otherwise it will be destroyed during + the next garbage collection. +\end{NB} + + +\paragraph{Warning:} Attempts to delete or manually change +a \verb|noarp| entry created by the kernel may result in unpredictable behaviour. +Particularly, the kernel may try to resolve this address even +on a \verb|NOARP| interface or if the address is multicast or broadcast. + + +\subsection{{\tt ip neighbour show} --- list neighbour entries} + +\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|. + +\paragraph{Description:}This command displays neighbour tables. + +\paragraph{Arguments:} + +\begin{itemize} + +\item \verb|to ADDRESS| (default) + +--- the prefix selecting the neighbours to list. + +\item \verb|dev NAME| + +--- only list the neighbours attached to this device. + +\item \verb|unused| + +--- only list neighbours which are not currently in use. + +\item \verb|nud NUD_STATE| + +--- only list neighbour entries in this state. \verb|NUD_STATE| takes +values listed below or the special value \verb|all| which means all states. +This option may occur more than once. If this option is absent, \verb|ip| +lists all entries except for \verb|none| and \verb|noarp|.
+ +\end{itemize} + + +\paragraph{Output format:} + +\begin{verbatim} +kuznet@alisa:~ $ ip neigh ls +:: dev lo lladdr 00:00:00:00:00:00 nud noarp +fe80::200:cff:fe76:3f85 dev eth0 lladdr 00:00:0c:76:3f:85 router \ + nud stale +0.0.0.0 dev lo lladdr 00:00:00:00:00:00 nud noarp +193.233.7.254 dev eth0 lladdr 00:00:0c:76:3f:85 nud reachable +193.233.7.85 dev eth0 lladdr 00:e0:1e:63:39:00 nud stale +kuznet@alisa:~ $ +\end{verbatim} + +The first word of each line is the protocol address of the neighbour. +Then the device name follows. The rest of the line describes the contents of +the neighbour entry identified by the pair (device, address). + +\verb|lladdr| is the link layer address of the neighbour. + +\verb|nud| is the state of the ``neighbour unreachability detection'' machine +for this entry. The detailed description of the neighbour +state machine can be found in~\cite{RFC-NDISC}. Here is the full list +of the states with short descriptions: + +\begin{enumerate} +\item\verb|none| --- the state of the neighbour is void. +\item\verb|incomplete| --- the neighbour is in the process of resolution. +\item\verb|reachable| --- the neighbour is valid and apparently reachable. +\item\verb|stale| --- the neighbour is valid, but is probably already +unreachable, so the kernel will try to check it at the first transmission. +\item\verb|delay| --- a packet has been sent to the stale neighbour and the kernel is waiting +for confirmation. +\item\verb|probe| --- the delay timer expired but no confirmation was received. +The kernel has started to probe the neighbour with ARP/NDISC messages. +\item\verb|failed| --- resolution has failed. +\item\verb|noarp| --- the neighbour is valid. No attempts to check the entry +will be made. +\item\verb|permanent| --- it is a \verb|noarp| entry, but only the administrator +may remove the entry from the neighbour table. +\end{enumerate} + +The link layer address is valid in all states except for \verb|none|, +\verb|failed| and \verb|incomplete|. 
+ +IPv6 neighbours can be marked with the additional flag \verb|router| +which means that the neighbour introduced itself as an IPv6 router~\cite{RFC-NDISC}. + +\paragraph{Statistics:} The \verb|-statistics| option displays some usage +statistics, f.e.\ + +\begin{verbatim} +kuznet@alisa:~ $ ip -s n ls 193.233.7.254 +193.233.7.254 dev eth0 lladdr 00:00:0c:76:3f:85 ref 5 used 12/13/20 \ + nud reachable +kuznet@alisa:~ $ +\end{verbatim} + +Here \verb|ref| is the number of users of this entry +and \verb|used| is a triplet of time intervals in seconds +separated by slashes. In this case they show that: + +\begin{enumerate} +\item the entry was used 12 seconds ago. +\item the entry was confirmed 13 seconds ago. +\item the entry was updated 20 seconds ago. +\end{enumerate} + +\subsection{{\tt ip neighbour flush} --- flush neighbour entries} + +\paragraph{Abbreviations:} \verb|flush|, \verb|f|. + +\paragraph{Description:}This command flushes neighbour tables, selecting +entries to flush by some criteria. + +\paragraph{Arguments:} This command has the same arguments as \verb|show|. +The differences are that it does not run when no arguments are given, +and that the default neighbour states to be flushed do not include +\verb|permanent| and \verb|noarp|. + + +\paragraph{Statistics:} With the \verb|-statistics| option, the command +becomes verbose. It prints out the number of deleted neighbours and the number +of rounds made to flush the neighbour table. If the option is given +twice, \verb|ip neigh flush| also dumps all the deleted neighbours +in the format described in the previous subsection. 
+ +\paragraph{Example:} +\begin{verbatim} +netadm@alisa:~ # ip -s -s n f 193.233.7.254 +193.233.7.254 dev eth0 lladdr 00:00:0c:76:3f:85 ref 5 used 12/13/20 \ + nud reachable + +*** Round 1, deleting 1 entries *** +*** Flush is complete after 1 round *** +netadm@alisa:~ # +\end{verbatim} + + +\section{{\tt ip route} --- routing table management} +\label{IP-ROUTE} + +\paragraph{Abbreviations:} \verb|route|, \verb|ro|, \verb|r|. + +\paragraph{Object:} \verb|route| entries in the kernel routing tables keep +information about paths to other networked nodes. + +Each route entry has a {\em key\/} consisting of a {\em prefix\/} +(i.e.\ a pair containing a network address and the length of its mask) and, +optionally, the TOS value. An IP packet matches the route if the highest +bits of its destination address are equal to the route prefix at least +up to the prefix length and if the TOS of the route is zero or equal to +the TOS of the packet. + +If several routes match the packet, the following pruning rules +are used to select the best one (see~\cite{RFC1812}): +\begin{enumerate} +\item The longest matching prefix is selected. All shorter ones +are dropped. + +\item If the TOS of some route with the longest prefix is equal to the TOS +of the packet, the routes with different TOS are dropped. + +If no exact TOS match was found and routes with TOS=0 exist, +the rest of routes are pruned. + +Otherwise, the route lookup fails. + +\item If several routes remain after the previous steps, then +the routes with the best preference values are selected. + +\item If we still have several routes, then the {\em first\/} of them +is selected. + +\begin{NB} + Note the ambiguity of the last step. Unfortunately, Linux + historically allows such a bizarre situation. The sense of the +word ``first'' depends on the order of route additions and it is practically +impossible to maintain a bundle of such routes in this order. 
+\end{NB} + +For simplicity we will limit ourselves to the case where such a situation +is impossible and routes are uniquely identified by the triplet +\{prefix, tos, preference\}. Actually, it is impossible to create +non-unique routes with \verb|ip| commands described in this section. + +One useful exception to this rule is the default route on non-forwarding +hosts. It is ``officially'' allowed to have several fallback routes +when several routers are present on directly connected networks. +In this case, Linux-2.2 makes ``dead gateway detection''~\cite{RFC1122} +controlled by neighbour unreachability detection and by advice +from transport protocols to select a working router, so the order +of the routes is not essential. However, in this case, +fiddling with default routes manually is not recommended. Use the Router Discovery +protocol (see Appendix~\ref{EXAMPLE-SETUP}, p.\pageref{EXAMPLE-SETUP}) +instead. Actually, Linux-2.2 IPv6 does not give user level applications +any access to default routes. +\end{enumerate} + +Certainly, the steps above are not performed exactly +in this sequence. Instead, the routing table in the kernel is kept +in some data structure to achieve the final result +with minimal cost. However, not depending on a particular +routing algorithm implemented in the kernel, we can summarize +the statements above as: a route is identified by the triplet +\{prefix, tos, preference\}. This {\em key\/} lets us locate +the route in the routing table. + +\paragraph{Route attributes:} Each route key refers to a routing +information record containing +the data required to deliver IP packets (f.e.\ output device and +next hop router) and some optional attributes (f.e. the path MTU or +the preferred source address when communicating with this destination). +These attributes are described in the following subsection. 
+ +\paragraph{Route types:} \label{IP-ROUTE-TYPES} +It is important that the set +of required and optional attributes depends on the route {\em type\/}. +The most important route type +is \verb|unicast|. It describes real paths to other hosts. +As a rule, common routing tables contain only such routes. However, +there are other types of routes with different semantics. The +full list of types understood by Linux-2.2 is: +\begin{itemize} +\item \verb|unicast| --- the route entry describes real paths to the +destinations covered by the route prefix. +\item \verb|unreachable| --- these destinations are unreachable. Packets +are discarded and the ICMP message {\em host unreachable\/} is generated. +The local senders get an \verb|EHOSTUNREACH| error. +\item \verb|blackhole| --- these destinations are unreachable. Packets +are discarded silently. The local senders get an \verb|EINVAL| error. +\item \verb|prohibit| --- these destinations are unreachable. Packets +are discarded and the ICMP message {\em communication administratively +prohibited\/} is generated. The local senders get an \verb|EACCES| error. +\item \verb|local| --- the destinations are assigned to this +host. The packets are looped back and delivered locally. +\item \verb|broadcast| --- the destinations are broadcast addresses. +The packets are sent as link broadcasts. +\item \verb|throw| --- a special control route used together with policy +rules (see sec.\ref{IP-RULE}, p.\pageref{IP-RULE}). If such a route is selected, lookup +in this table is terminated pretending that no route was found. +Without policy routing it is equivalent to the absence of the route in the routing +table. The packets are dropped and the ICMP message {\em net unreachable\/} +is generated. The local senders get an \verb|ENETUNREACH| error. +\item \verb|nat| --- a special NAT route.
Destinations covered by the prefix +are considered to be dummy (or external) addresses which require translation +to real (or internal) ones before forwarding. The addresses to translate to +are selected with the attribute \verb|via|. More about NAT is +in Appendix~\ref{ROUTE-NAT}, p.\pageref{ROUTE-NAT}. +\item \verb|anycast| --- ({\em not implemented\/}) the destinations are +{\em anycast\/} addresses assigned to this host. They are mainly equivalent +to \verb|local| with one difference: such addresses are invalid when used +as the source address of any packet. +\item \verb|multicast| --- a special type used for multicast routing. +It is not present in normal routing tables. +\end{itemize} + +\paragraph{Route tables:} Linux-2.2 can pack routes into several routing +tables identified by a number in the range from 1 to 255 or by +name from the file \verb|/etc/iproute2/rt_tables|. By default all normal +routes are inserted into the \verb|main| table (ID 254) and the kernel only uses +this table when calculating routes. + +Actually, one other table always exists, which is invisible but +even more important. It is the \verb|local| table (ID 255). This table +consists of routes for local and broadcast addresses. The kernel maintains +this table automatically and the administrator usually need not modify it +or even look at it. + +The multiple routing tables enter the game when {\em policy routing\/} +is used. See sec.\ref{IP-RULE}, p.\pageref{IP-RULE}. +In this case, the table identifier effectively becomes +one more parameter, which should be added to the triplet +\{prefix, tos, preference\} to uniquely identify the route. + + +\subsection{{\tt ip route add} --- add a new route\\ + {\tt ip route change} --- change a route\\ + {\tt ip route replace} --- change a route or add a new one} +\label{IP-ROUTE-ADD} + +\paragraph{Abbreviations:} \verb|add|, \verb|a|; \verb|change|, \verb|chg|; + \verb|replace|, \verb|repl|. 
+ + +\paragraph{Arguments:} +\begin{itemize} +\item \verb|to PREFIX| or \verb|to TYPE PREFIX| (default) + +--- the destination prefix of the route. If \verb|TYPE| is omitted, +\verb|ip| assumes type \verb|unicast|. Other values of \verb|TYPE| +are listed above. \verb|PREFIX| is an IP or IPv6 address optionally followed +by a slash and the prefix length. If the length of the prefix is missing, +\verb|ip| assumes a full-length host route. There is also a special +\verb|PREFIX| --- \verb|default| --- which is equivalent to IP \verb|0/0| or +to IPv6 \verb|::/0|. + +\item \verb|tos TOS| or \verb|dsfield TOS| + +--- the Type Of Service (TOS) key. This key has no associated mask and +the longest match is understood as: First, compare the TOS +of the route and of the packet. If they are not equal, then the packet +may still match a route with a zero TOS. \verb|TOS| is either an 8 bit hexadecimal +number or an identifier from {\tt /etc/iproute2/rt\_dsfield}. + + +\item \verb|metric NUMBER| or \verb|preference NUMBER| + +--- the preference value of the route. \verb|NUMBER| is an arbitrary 32bit number. + +\item \verb|table TABLEID| + +--- the table to add this route to. +\verb|TABLEID| may be a number or a string from the file +\verb|/etc/iproute2/rt_tables|. If this parameter is omitted, +\verb|ip| assumes the \verb|main| table, with the exception of +\verb|local|, \verb|broadcast| and \verb|nat| routes, which are +put into the \verb|local| table by default. + +\item \verb|dev NAME| + +--- the output device name. + +\item \verb|via ADDRESS| + +--- the address of the nexthop router. Actually, the sense of this field depends +on the route type. For normal \verb|unicast| routes it is either the true nexthop +router or, if it is a direct route installed in BSD compatibility mode, +it can be a local address of the interface. +For NAT routes it is the first address of the block of translated IP destinations. 
+ +\item \verb|src ADDRESS| + +--- the source address to prefer when sending to the destinations +covered by the route prefix. + +\item \verb|realm REALMID| + +--- the realm to which this route is assigned. +\verb|REALMID| may be a number or a string from the file +\verb|/etc/iproute2/rt_realms|. Sec.\ref{RT-REALMS} (p.\pageref{RT-REALMS}) +contains more information on realms. + +\item \verb|mtu MTU| or \verb|mtu lock MTU| + +--- the MTU along the path to the destination. If the modifier \verb|lock| is +not used, the MTU may be updated by the kernel due to Path MTU Discovery. +If the modifier \verb|lock| is used, no path MTU discovery will be tried; +all packets will be sent without the DF bit in the IPv4 case +or fragmented to MTU for IPv6. + +\item \verb|window NUMBER| + +--- the maximal window for TCP to advertise to these destinations, +measured in bytes. It limits maximal data bursts that our TCP +peers are allowed to send to us. + +\item \verb|rtt NUMBER| + +--- the initial RTT (``Round Trip Time'') estimate. + + +\item \verb|rttvar NUMBER| + +--- \threeonly the initial RTT variance estimate. + + +\item \verb|ssthresh NUMBER| + +--- \threeonly an estimate for the initial slow start threshold. + + +\item \verb|cwnd NUMBER| + +--- \threeonly the clamp for the congestion window. It is ignored if the \verb|lock| + flag is not used. + + +\item \verb|advmss NUMBER| + +--- \threeonly the MSS (``Maximal Segment Size'') to advertise to these + destinations when establishing TCP connections. If it is not given, + Linux uses a default value calculated from the first hop device MTU. + +\begin{NB} + If the path to these destinations is asymmetric, this guess may be wrong. +\end{NB} + +\item \verb|reordering NUMBER| + +--- \threeonly Maximal reordering on the path to this destination. + If it is not given, Linux uses the value selected with \verb|sysctl| + variable \verb|net/ipv4/tcp_reordering|. + + + +\item \verb|nexthop NEXTHOP| + +--- the nexthop of a multipath route. 
\verb|NEXTHOP| is a complex value +with its own syntax similar to the top level argument lists: +\begin{itemize} +\item \verb|via ADDRESS| is the nexthop router. +\item \verb|dev NAME| is the output device. +\item \verb|weight NUMBER| is a weight for this element of a multipath +route reflecting its relative bandwidth or quality. +\end{itemize} + +\item \verb|scope SCOPE_VAL| + +--- the scope of the destinations covered by the route prefix. +\verb|SCOPE_VAL| may be a number or a string from the file +\verb|/etc/iproute2/rt_scopes|. +If this parameter is omitted, +\verb|ip| assumes scope \verb|global| for all gatewayed \verb|unicast| +routes, scope \verb|link| for direct \verb|unicast| and \verb|broadcast| routes +and scope \verb|host| for \verb|local| routes. + +\item \verb|protocol RTPROTO| + +--- the routing protocol identifier of this route. +\verb|RTPROTO| may be a number or a string from the file +\verb|/etc/iproute2/rt_protos|. If the routing protocol ID is +not given, \verb|ip| assumes protocol \verb|boot| (i.e.\ +it assumes the route was added by someone who doesn't +understand what they are doing). Several protocol values have a fixed interpretation. +Namely: +\begin{itemize} +\item \verb|redirect| --- the route was installed due to an ICMP redirect. +\item \verb|kernel| --- the route was installed by the kernel during +autoconfiguration. +\item \verb|boot| --- the route was installed during the bootup sequence. +If a routing daemon starts, it will purge all of them. +\item \verb|static| --- the route was installed by the administrator +to override dynamic routing. Routing daemon will respect them +and, probably, even advertise them to its peers. +\item \verb|ra| --- the route was installed by Router Discovery protocol. +\end{itemize} +The rest of the values are not reserved and the administrator is free +to assign (or not to assign) protocol tags. 
At least, routing +daemons should take care of setting some unique protocol values, +f.e.\ as they are assigned in \verb|rtnetlink.h| or in the \verb|rt_protos| +database. + + +\item \verb|onlink| + +--- pretend that the nexthop is directly attached to this link, +even if it does not match any interface prefix. One application of this +option may be found in~\cite{IP-TUNNELS}. + +\item \verb|equalize| + +--- allow packet by packet randomization on multipath routes. +Without this modifier, the route will be frozen to one selected +nexthop, so that load splitting will only occur on a per-flow basis. +\verb|equalize| only works if the kernel is patched. + + +\end{itemize} + + +\begin{NB} + Actually there are more commands: \verb|prepend| does the same + thing as classic \verb|route add|, i.e.\ adds a route, even if another + route to the same destination exists. Its opposite case is \verb|append|, + which adds the route to the end of the list. Avoid these + features. +\end{NB} +\begin{NB} + More sad news: IPv6 only understands the \verb|append| command correctly. + All the others are translated into \verb|append| commands. Certainly, + this will change in the future. +\end{NB} + +\paragraph{Examples:} +\begin{itemize} +\item add a plain route to network 10.0.0/24 via gateway 193.233.7.65 +\begin{verbatim} + ip route add 10.0.0/24 via 193.233.7.65 +\end{verbatim} +\item change it to a direct route via the \verb|dummy| device +\begin{verbatim} + ip ro chg 10.0.0/24 dev dummy +\end{verbatim} +\item add a default multipath route splitting the load between \verb|ppp0| +and \verb|ppp1| +\begin{verbatim} + ip route add default scope global nexthop dev ppp0 \ + nexthop dev ppp1 +\end{verbatim} +Note the scope value. It is not necessary but it informs the kernel +that this route is gatewayed rather than direct. Actually, if you +know the addresses of remote endpoints it would be better to use the +\verb|via| parameter. 
+\item announce that the address 192.203.80.144 is not a real one, but +should be translated to 193.233.7.83 before forwarding +\begin{verbatim} + ip route add nat 192.203.80.144 via 193.233.7.83 +\end{verbatim} +Backward translation is set up with policy rules described +in the following section (sec.\ref{IP-RULE}, p.\pageref{IP-RULE}). +\end{itemize} + +\subsection{{\tt ip route delete} --- delete a route} + +\paragraph{Abbreviations:} \verb|delete|, \verb|del|, \verb|d|. + +\paragraph{Arguments:} \verb|ip route del| has the same arguments as +\verb|ip route add|, but their semantics are a bit different. + +Key values (\verb|to|, \verb|tos|, \verb|preference| and \verb|table|) +select the route to delete. If optional attributes are present, \verb|ip| +verifies that they coincide with the attributes of the route to delete. +If no route with the given key and attributes is found, \verb|ip route del| +fails. +\begin{NB} +Linux-2.0 had the option to delete a route selected only by prefix address, +ignoring its length (i.e.\ netmask). This option no longer exists +because it was ambiguous. However, look at {\tt ip route flush} +(sec.\ref{IP-ROUTE-FLUSH}, p.\pageref{IP-ROUTE-FLUSH}) which +provides similar and even richer functionality. +\end{NB} + +\paragraph{Example:} +\begin{itemize} +\item delete the multipath route created by the command in the previous subsection +\begin{verbatim} + ip route del default scope global nexthop dev ppp0 \ + nexthop dev ppp1 +\end{verbatim} +\end{itemize} + + + +\subsection{{\tt ip route show} --- list routes} + +\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|, \verb|l|. + +\paragraph{Description:} the command displays the contents of the routing tables +or the route(s) selected by some criteria. + + +\paragraph{Arguments:} +\begin{itemize} +\item \verb|to SELECTOR| (default) + +--- only select routes from the given range of destinations. 
\verb|SELECTOR| +consists of an optional modifier (\verb|root|, \verb|match| or \verb|exact|) +and a prefix. \verb|root PREFIX| selects routes with prefixes not shorter +than \verb|PREFIX|. F.e.\ \verb|root 0/0| selects the entire routing table. +\verb|match PREFIX| selects routes with prefixes not longer than +\verb|PREFIX|. F.e.\ \verb|match 10.0/16| selects \verb|10.0/16|, +\verb|10/8| and \verb|0/0|, but it does not select \verb|10.1/16| and +\verb|10.0.0/24|. And \verb|exact PREFIX| (or just \verb|PREFIX|) +selects routes with this exact prefix. If neither of these options +are present, \verb|ip| assumes \verb|root 0/0| i.e.\ it lists the entire table. + + +\item \verb|tos TOS| or \verb|dsfield TOS| + + --- only select routes with the given TOS. + + +\item \verb|table TABLEID| + + --- show the routes from this table(s). The default setting is to show +\verb|table| \verb|main|. \verb|TABLEID| may either be the ID of a real table +or one of the special values: + \begin{itemize} + \item \verb|all| --- list all of the tables. + \item \verb|cache| --- dump the routing cache. + \end{itemize} +\begin{NB} + IPv6 has a single table. However, splitting it into \verb|main|, \verb|local| + and \verb|cache| is emulated by the \verb|ip| utility. +\end{NB} + +\item \verb|cloned| or \verb|cached| + +--- list cloned routes i.e.\ routes which were dynamically forked from +other routes because some route attribute (f.e.\ MTU) was updated. +Actually, it is equivalent to \verb|table cache|. + +\item \verb|from SELECTOR| + +--- the same syntax as for \verb|to|, but it binds the source address range +rather than destinations. Note that the \verb|from| option only works with +cloned routes. + +\item \verb|protocol RTPROTO| + +--- only list routes of this protocol. + + +\item \verb|scope SCOPE_VAL| + +--- only list routes with this scope. + +\item \verb|type TYPE| + +--- only list routes of this type. + +\item \verb|dev NAME| + +--- only list routes going via this device. 
+ +\item \verb|via PREFIX| + +--- only list routes going via the nexthop routers selected by \verb|PREFIX|. + +\item \verb|src PREFIX| + +--- only list routes with preferred source addresses selected +by \verb|PREFIX|. + +\item \verb|realm REALMID| or \verb|realms FROMREALM/TOREALM| + +--- only list routes with these realms. + +\end{itemize} + +\paragraph{Examples:} Let us count routes of protocol \verb|gated/bgp| +on a router: +\begin{verbatim} +kuznet@amber:~ $ ip ro ls proto gated/bgp | wc + 1413 9891 79010 +kuznet@amber:~ $ +\end{verbatim} +To count the size of the routing cache, we have to use the \verb|-o| option +because cached attributes can take more than one line of output: +\begin{verbatim} +kuznet@amber:~ $ ip -o ro ls cloned | wc + 159 2543 18707 +kuznet@amber:~ $ +\end{verbatim} + + +\paragraph{Output format:} The output of this command consists +of per route records separated by line feeds. +However, some records may consist +of more than one line: particularly, this is the case when the route +is cloned or you requested additional statistics. If the +\verb|-o| option was given, then line feeds separating lines inside +records are replaced with the backslash sign. + +The output has the same syntax as arguments given to {\tt ip route add}, +so that it can be understood easily. F.e.\ +\begin{verbatim} +kuznet@amber:~ $ ip ro ls 193.233.7/24 +193.233.7.0/24 dev eth0 proto gated/conn scope link \ + src 193.233.7.65 realms inr.ac +kuznet@amber:~ $ +\end{verbatim} + +If you list cloned entries, the output contains other attributes which +are evaluated during route calculation and updated during route +lifetime. 
An example of the output is: +\begin{verbatim} +kuznet@amber:~ $ ip ro ls 193.233.7.82 tab cache +193.233.7.82 from 193.233.7.82 dev eth0 src 193.233.7.65 \ + realms inr.ac/inr.ac + cache <src-direct,redirect> mtu 1500 rtt 300 iif eth0 +193.233.7.82 dev eth0 src 193.233.7.65 realms inr.ac + cache mtu 1500 rtt 300 +kuznet@amber:~ $ +\end{verbatim} +\begin{NB} + \label{NB-strange-route} + The route looks a bit strange, doesn't it? Did you notice that + it is a path from 193.233.7.82 back to 193.233.7.82? Well, you will + see in the section on \verb|ip route get| (p.\pageref{NB-nature-of-strangeness}) + how it appeared. +\end{NB} +The second line, starting with the word \verb|cache|, shows +additional attributes which normal routes do not possess. +Cached flags are summarized in angle brackets: +\begin{itemize} +\item \verb|local| --- packets are delivered locally. +It stands for loopback unicast routes, for broadcast routes +and for multicast routes, if this host is a member of the corresponding +group. + +\item \verb|reject| --- the path is bad. Any attempt to use it results +in an error. See attribute \verb|error| below (p.\pageref{IP-ROUTE-GET-error}). + +\item \verb|mc| --- the destination is multicast. + +\item \verb|brd| --- the destination is broadcast. + +\item \verb|src-direct| --- the source is on a directly connected +interface. + +\item \verb|redirected| --- the route was created by an ICMP Redirect. + +\item \verb|redirect| --- packets going via this route will +trigger an ICMP redirect. + +\item \verb|fastroute| --- the route is eligible to be used for fastroute. + +\item \verb|equalize| --- make packet by packet randomization +along this path. + +\item \verb|dst-nat| --- the destination address requires translation. + +\item \verb|src-nat| --- the source address requires translation. + +\item \verb|masq| --- the source address requires masquerading. +This feature disappeared in linux-2.4. 
+ +\item \verb|notify| --- ({\em not implemented}) change/deletion +of this route will trigger an RTNETLINK notification. +\end{itemize} + +Then some optional attributes follow: +\begin{itemize} +\item \verb|error| --- on \verb|reject| routes it is the error code +returned to local senders when they try to use this route. +These error codes are translated into ICMP error codes, sent to remote +senders, according to the rules described above in the subsection +devoted to route types (p.\pageref{IP-ROUTE-TYPES}). +\label{IP-ROUTE-GET-error} + +\item \verb|expires| --- this entry will expire after this timeout. + +\item \verb|iif| --- the packets for this path are expected to arrive +on this interface. +\end{itemize} + +\paragraph{Statistics:} With the \verb|-statistics| option, more +information about this route is shown: +\begin{itemize} +\item \verb|users| --- the number of users of this entry. +\item \verb|age| --- shows when this route was last used. +\item \verb|used| --- the number of lookups of this route since its creation. +\end{itemize} + + +\subsection{{\tt ip route flush} --- flush routing tables} +\label{IP-ROUTE-FLUSH} + +\paragraph{Abbreviations:} \verb|flush|, \verb|f|. + +\paragraph{Description:} this command flushes routes selected +by some criteria. + +\paragraph{Arguments:} the arguments have the same syntax and semantics +as the arguments of \verb|ip route show|, except that routing tables are +purged rather than listed. The only difference is the default action: \verb|show| +dumps the entire IP main routing table but \verb|flush| prints the help page. +The reason for this difference does not require any explanation, does it? + + +\paragraph{Statistics:} With the \verb|-statistics| option, the command +becomes verbose. It prints out the number of deleted routes and the number +of rounds made to flush the routing table. If the option is given +twice, \verb|ip route flush| also dumps all the deleted routes +in the format described in the previous subsection. 
+ +\paragraph{Examples:} The first example flushes all the +gatewayed routes from the main table (f.e.\ after a routing daemon crash). +\begin{verbatim} +netadm@amber:~ # ip -4 ro flush scope global type unicast +\end{verbatim} +This option deserves to be put into a scriptlet \verb|routef|. +\begin{NB} +This option was described in the \verb|route(8)| man page borrowed +from BSD, but was never implemented in Linux. +\end{NB} + +The second example flushes all IPv6 cloned routes: +\begin{verbatim} +netadm@amber:~ # ip -6 -s -s ro flush cache +3ffe:2400::220:afff:fef4:c5d1 via 3ffe:2400::220:afff:fef4:c5d1 \ + dev eth0 metric 0 + cache used 2 age 12sec mtu 1500 rtt 300 +3ffe:2400::280:adff:feb7:8034 via 3ffe:2400::280:adff:feb7:8034 \ + dev eth0 metric 0 + cache used 2 age 15sec mtu 1500 rtt 300 +3ffe:2400::280:c8ff:fe59:5bcc via 3ffe:2400::280:c8ff:fe59:5bcc \ + dev eth0 metric 0 + cache users 1 used 1 age 23sec mtu 1500 rtt 300 +3ffe:2400:0:1:2a0:ccff:fe66:1878 via 3ffe:2400:0:1:2a0:ccff:fe66:1878 \ + dev eth1 metric 0 + cache used 2 age 20sec mtu 1500 rtt 300 +3ffe:2400:0:1:a00:20ff:fe71:fb30 via 3ffe:2400:0:1:a00:20ff:fe71:fb30 \ + dev eth1 metric 0 + cache used 2 age 33sec mtu 1500 rtt 300 +ff02::1 via ff02::1 dev eth1 metric 0 + cache users 1 used 1 age 45sec mtu 1500 rtt 300 + +*** Round 1, deleting 6 entries *** +*** Flush is complete after 1 round *** +netadm@amber:~ # ip -6 -s -s ro flush cache +Nothing to flush. +netadm@amber:~ # +\end{verbatim} + +The third example flushes BGP routing tables after a \verb|gated| +death. +\begin{verbatim} +netadm@amber:~ # ip ro ls proto gated/bgp | wc + 1408 9856 78730 +netadm@amber:~ # ip -s ro f proto gated/bgp + +*** Round 1, deleting 1408 entries *** +*** Flush is complete after 1 round *** +netadm@amber:~ # ip ro f proto gated/bgp +Nothing to flush. 
+netadm@amber:~ # ip ro ls proto gated/bgp +netadm@amber:~ # +\end{verbatim} + + +\subsection{{\tt ip route get} --- get a single route} +\label{IP-ROUTE-GET} + +\paragraph{Abbreviations:} \verb|get|, \verb|g|. + +\paragraph{Description:} this command gets a single route to a destination +and prints its contents exactly as the kernel sees it. + +\paragraph{Arguments:} +\begin{itemize} +\item \verb|to ADDRESS| (default) + +--- the destination address. + +\item \verb|from ADDRESS| + +--- the source address. + +\item \verb|tos TOS| or \verb|dsfield TOS| + +--- the Type Of Service. + +\item \verb|iif NAME| + +--- the device from which this packet is expected to arrive. + +\item \verb|oif NAME| + +--- force the output device on which this packet will be routed. + +\item \verb|connected| + +--- if no source address (option \verb|from|) was given, relookup +the route with the source set to the preferred address received from the first lookup. +If policy routing is used, it may be a different route. + +\end{itemize} + +Note that this operation is not equivalent to \verb|ip route show|. +\verb|show| shows existing routes. \verb|get| resolves them and +creates new clones if necessary. Essentially, \verb|get| +is equivalent to sending a packet along this path. +If the \verb|iif| argument is not given, the kernel creates a route +to output packets towards the requested destination. +This is equivalent to pinging the destination +with a subsequent {\tt ip route ls cache}, however, no packets are +actually sent. With the \verb|iif| argument, the kernel pretends +that a packet arrived from this interface and searches for +a path to forward the packet. + +\paragraph{Output format:} This command outputs routes in the same +format as \verb|ip route ls|. 
+ +\paragraph{Examples:} +\begin{itemize} +\item Find a route to output packets to 193.233.7.82: +\begin{verbatim} +kuznet@amber:~ $ ip route get 193.233.7.82 +193.233.7.82 dev eth0 src 193.233.7.65 realms inr.ac + cache mtu 1500 rtt 300 +kuznet@amber:~ $ +\end{verbatim} + +\item Find a route to forward packets arriving on \verb|eth0| +from 193.233.7.82 and destined for 193.233.7.82: +\begin{verbatim} +kuznet@amber:~ $ ip r g 193.233.7.82 from 193.233.7.82 iif eth0 +193.233.7.82 from 193.233.7.82 dev eth0 src 193.233.7.65 \ + realms inr.ac/inr.ac + cache <src-direct,redirect> mtu 1500 rtt 300 iif eth0 +kuznet@amber:~ $ +\end{verbatim} +\begin{NB} + \label{NB-nature-of-strangeness} + This is the command that created the funny route from 193.233.7.82 + looped back to 193.233.7.82 (cf.\ NB on~p.\pageref{NB-strange-route}). + Note the \verb|redirect| flag on it. +\end{NB} + +\item Find a multicast route for packets arriving on \verb|eth0| +from host 193.233.7.82 and destined for multicast group 224.2.127.254 +(it is assumed that a multicast routing daemon is running. +In this case, it is \verb|pimd|) +\begin{verbatim} +kuznet@amber:~ $ ip r g 224.2.127.254 from 193.233.7.82 iif eth0 +multicast 224.2.127.254 from 193.233.7.82 dev lo \ + src 193.233.7.65 realms inr.ac/cosmos + cache <mc> iif eth0 Oifs: eth1 pimreg +kuznet@amber:~ $ +\end{verbatim} +This route differs from the ones seen before. It contains a ``normal'' part +and a ``multicast'' part. The normal part is used to deliver (or not to +deliver) the packet to local IP listeners. In this case the router +is not a member +of this group, so that route has no \verb|local| flag and only +forwards packets. The output device for such entries is always loopback. +The multicast part consists of an additional \verb|Oifs:| list showing +the output interfaces. +\end{itemize} + + +It is time for a more complicated example. 
Let us add an invalid +gatewayed route for a destination which is really directly connected: +\begin{verbatim} +netadm@alisa:~ # ip route add 193.233.7.98 via 193.233.7.254 +netadm@alisa:~ # ip route get 193.233.7.98 +193.233.7.98 via 193.233.7.254 dev eth0 src 193.233.7.90 + cache mtu 1500 rtt 3072 +netadm@alisa:~ # +\end{verbatim} +and probe it with ping: +\begin{verbatim} +netadm@alisa:~ # ping -n 193.233.7.98 +PING 193.233.7.98 (193.233.7.98) from 193.233.7.90 : 56 data bytes +From 193.233.7.254: Redirect Host(New nexthop: 193.233.7.98) +64 bytes from 193.233.7.98: icmp_seq=0 ttl=255 time=3.5 ms +From 193.233.7.254: Redirect Host(New nexthop: 193.233.7.98) +64 bytes from 193.233.7.98: icmp_seq=1 ttl=255 time=2.2 ms +64 bytes from 193.233.7.98: icmp_seq=2 ttl=255 time=0.4 ms +64 bytes from 193.233.7.98: icmp_seq=3 ttl=255 time=0.4 ms +64 bytes from 193.233.7.98: icmp_seq=4 ttl=255 time=0.4 ms +^C +--- 193.233.7.98 ping statistics --- +5 packets transmitted, 5 packets received, 0% packet loss +round-trip min/avg/max = 0.4/1.3/3.5 ms +netadm@alisa:~ # +\end{verbatim} +What happened? Router 193.233.7.254 understood that we have a much +better path to the destination and sent us an ICMP redirect message. +We may retry \verb|ip route get| to see what we have in the routing +tables now: +\begin{verbatim} +netadm@alisa:~ # ip route get 193.233.7.98 +193.233.7.98 dev eth0 src 193.233.7.90 + cache <redirected> mtu 1500 rtt 3072 +netadm@alisa:~ # +\end{verbatim} + + + +\section{{\tt ip rule} --- routing policy database management} +\label{IP-RULE} + +\paragraph{Abbreviations:} \verb|rule|, \verb|ru|. + +\paragraph{Object:} \verb|rule|s in the routing policy database control +the route selection algorithm. + +Classic routing algorithms used in the Internet make routing decisions +based only on the destination address of packets (and in theory, +but not in practice, on the TOS field). 
The seminal review of classic +routing algorithms and their modifications can be found in~\cite{RFC1812}. + +In some circumstances we want to route packets differently depending not only +on destination addresses, but also on other packet fields: source address, +IP protocol, transport protocol ports or even packet payload. +This task is called ``policy routing''. + +\begin{NB} + ``policy routing'' $\neq$ ``routing policy''. + +\noindent ``policy routing'' $=$ ``cunning routing''. + +\noindent ``routing policy'' $=$ ``routing tactics'' or ``routing plan''. +\end{NB} + +To solve this task, the conventional destination based routing table, ordered +according to the longest match rule, is replaced with a ``routing policy +database'' (or RPDB), which selects routes +by executing some set of rules. The rules may have lots of keys of different +natures and therefore they have no natural ordering, but one imposed +by the administrator. Linux-2.2 RPDB is a linear list of rules +ordered by numeric priority value. +RPDB explicitly allows matching a few packet fields: + +\begin{itemize} +\item packet source address. +\item packet destination address. +\item TOS. +\item incoming interface (which is packet metadata, rather than a packet field). +\end{itemize} + +Matching IP protocols and transport ports is also possible, +indirectly, via \verb|ipchains|, by exploiting their ability +to mark some classes of packets with \verb|fwmark|. Therefore, +\verb|fwmark| is also included in the set of keys checked by rules. + +Each policy routing rule consists of a {\em selector\/} and an {\em action\/} +predicate. The RPDB is scanned in the order of increasing priority. The selector +of each rule is applied to \{source address, destination address, incoming +interface, tos, fwmark\} and, if the selector matches the packet, +the action is performed. The action predicate may return with success. 
+In this case, it will either give a route or failure indication +and the RPDB lookup is terminated. Otherwise, the RPDB program +continues on the next rule. + +What is the action, semantically? The natural action is to select the +nexthop and the output device. This is what +Cisco IOS~\cite{IOS} does. Let us call it ``match \& set''. +The Linux-2.2 approach is more flexible. The action includes +lookups in destination-based routing tables and selecting +a route from these tables according to the classic longest match algorithm. +The ``match \& set'' approach is the simplest case of the Linux one. It is realized +when a second level routing table contains a single default route. +Recall that Linux-2.2 supports multiple tables +managed with the \verb|ip route| command, described in the previous section. + +At startup time the kernel configures the default RPDB consisting of three +rules: + +\begin{enumerate} +\item Priority: 0, Selector: match anything, Action: lookup routing +table \verb|local| (ID 255). +The \verb|local| table is a special routing table containing +high priority control routes for local and broadcast addresses. + +Rule 0 is special. It cannot be deleted or overridden. + + +\item Priority: 32766, Selector: match anything, Action: lookup routing +table \verb|main| (ID 254). +The \verb|main| table is the normal routing table containing all non-policy +routes. This rule may be deleted and/or overridden with other +ones by the administrator. + +\item Priority: 32767, Selector: match anything, Action: lookup routing +table \verb|default| (ID 253). +The \verb|default| table is empty. It is reserved for some +post-processing if no previous default rules selected the packet. +This rule may also be deleted. + +\end{enumerate} + +Do not confuse routing tables with rules: rules point to routing tables, +several rules may refer to one routing table and some routing tables +may have no rules pointing to them. 
If the administrator deletes all the rules +referring to a table, the table is not used, but it still exists +and will disappear only after all the routes contained in it are deleted. + + +\paragraph{Rule attributes:} Each RPDB entry has additional +attributes. F.e.\ each rule has a pointer to some routing +table. NAT and masquerading rules have an attribute to select new IP +address to translate/masquerade. Besides that, rules have some +optional attributes, which routes have, namely \verb|realms|. +These values do not override those contained in the routing tables. They +are only used if the route did not select any attributes. + + +\paragraph{Rule types:} The RPDB may contain rules of the following +types: +\begin{itemize} +\item \verb|unicast| --- the rule prescribes to return the route found +in the routing table referenced by the rule. +\item \verb|blackhole| --- the rule prescribes to silently drop the packet. +\item \verb|unreachable| --- the rule prescribes to generate a ``Network +is unreachable'' error. +\item \verb|prohibit| --- the rule prescribes to generate +``Communication is administratively prohibited'' error. +\item \verb|nat| --- the rule prescribes to translate the source address +of the IP packet into some other value. More about NAT is +in Appendix~\ref{ROUTE-NAT}, p.\pageref{ROUTE-NAT}. +\end{itemize} + + +\paragraph{Commands:} \verb|add|, \verb|delete| and \verb|show| +(or \verb|list|). + +\subsection{{\tt ip rule add} --- insert a new rule\\ + {\tt ip rule delete} --- delete a rule} +\label{IP-RULE-ADD} + +\paragraph{Abbreviations:} \verb|add|, \verb|a|; \verb|delete|, \verb|del|, + \verb|d|. + +\paragraph{Arguments:} + +\begin{itemize} +\item \verb|type TYPE| (default) + +--- the type of this rule. The list of valid types was given in the previous +subsection. + +\item \verb|from PREFIX| + +--- select the source prefix to match. + +\item \verb|to PREFIX| + +--- select the destination prefix to match. 
+ +\item \verb|iif NAME| + +--- select the incoming device to match. If the interface is loopback, +the rule only matches packets originating from this host. This means that you +may create separate routing tables for forwarded and local packets and, +hence, completely segregate them. + +\item \verb|tos TOS| or \verb|dsfield TOS| + +--- select the TOS value to match. + +\item \verb|fwmark MARK| + +--- select the \verb|fwmark| value to match. + +\item \verb|priority PREFERENCE| + +--- the priority of this rule. Each rule should have an explicitly +set {\em unique\/} priority value. +\begin{NB} + Really, for historical reasons \verb|ip rule add| does not require a + priority value and allows priorities to be non-unique. + If the user does not supply a priority, it is selected by the kernel. + If the user creates a rule with a priority value that + already exists, the kernel does not reject the request. It adds + the new rule before all old rules of the same priority. + + It is a mistake in design, no more. And it will be fixed one day, + so do not rely on this feature. Use explicit priorities. +\end{NB} + + +\item \verb|table TABLEID| + +--- the routing table identifier to look up if the rule selector matches. + +\item \verb|realms FROM/TO| + +--- Realms to select if the rule matched and the routing table lookup +succeeded. Realm \verb|TO| is only used if the route did not select +any realm. + +\item \verb|nat ADDRESS| + +--- The base of the IP address block to translate (for source addresses). +The \verb|ADDRESS| may be either the start of the block of NAT addresses +(selected by NAT routes) or in linux-2.2 a local host address (or even zero). +In the last case the router does not translate the packets, +but masquerades them to this address; this feature disappeared in 2.4. +More about NAT is in Appendix~\ref{ROUTE-NAT}, +p.\pageref{ROUTE-NAT}. + +\end{itemize} + +\paragraph{Warning:} Changes to the RPDB made with these commands +do not become active immediately. 
It is assumed that after +a script finishes a batch of updates, it flushes the routing cache +with \verb|ip route flush cache|. + +\paragraph{Examples:} +\begin{itemize} +\item Route packets with source addresses from 192.203.80/24 +according to routing table \verb|inr.ruhep|: +\begin{verbatim} +ip ru add from 192.203.80.0/24 table inr.ruhep prio 220 +\end{verbatim} + +\item Translate packet source address 193.233.7.83 into 192.203.80.144 +and route it according to table \#1 (actually, it is \verb|inr.ruhep|): +\begin{verbatim} +ip ru add from 193.233.7.83 nat 192.203.80.144 table 1 prio 320 +\end{verbatim} + +\item Delete the unused default rule: +\begin{verbatim} +ip ru del prio 32767 +\end{verbatim} + +\end{itemize} + + + +\subsection{{\tt ip rule show} --- list rules} +\label{IP-RULE-SHOW} + +\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|, \verb|l|. + + +\paragraph{Arguments:} Good news, this is one command that has no arguments. + +\paragraph{Output format:} + +\begin{verbatim} +kuznet@amber:~ $ ip ru ls +0: from all lookup local +200: from 192.203.80.0/24 to 193.233.7.0/24 lookup main +210: from 192.203.80.0/24 to 192.203.80.0/24 lookup main +220: from 192.203.80.0/24 lookup inr.ruhep realms inr.ruhep/radio-msu +300: from 193.233.7.83 to 193.233.7.0/24 lookup main +310: from 193.233.7.83 to 192.203.80.0/24 lookup main +320: from 193.233.7.83 lookup inr.ruhep map-to 192.203.80.144 +32766: from all lookup main +kuznet@amber:~ $ +\end{verbatim} + +In the first column is the rule priority value followed +by a colon. Then the selectors follow. Each key is prefixed +with the same keyword that was used to create the rule. + +The keyword \verb|lookup| is followed by a routing table identifier, +as it is recorded in the file \verb|/etc/iproute2/rt_tables|. + +If the rule does NAT (f.e.\ rule \#320), it is shown by the keyword +\verb|map-to| followed by the start of the block of addresses to map. + +The sense of this example is pretty simple. 
The prefixes +192.203.80.0/24 and 193.233.7.0/24 form the internal network, but +they are routed differently when the packets leave it. +Besides that, the host 193.233.7.83 is translated into +another prefix to look like 192.203.80.144 when talking +to the outer world. + + + +\section{{\tt ip maddress} --- multicast address management} +\label{IP-MADDR} + +\paragraph{Object:} \verb|maddress| objects are multicast addresses. + +\paragraph{Commands:} \verb|add|, \verb|delete|, \verb|show| (or \verb|list|). + +\subsection{{\tt ip maddress show} --- list multicast addresses} + +\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|, \verb|l|. + +\paragraph{Arguments:} + +\begin{itemize} + +\item \verb|dev NAME| (default) + +--- the device name. + +\end{itemize} + +\paragraph{Output format:} + +\begin{verbatim} +kuznet@alisa:~ $ ip maddr ls dummy +2: dummy + link 33:33:00:00:00:01 + link 01:00:5e:00:00:01 + inet 224.0.0.1 users 2 + inet6 ff02::1 +kuznet@alisa:~ $ +\end{verbatim} + +The first line of the output shows the interface index and its name. +Then the multicast address list follows. Each line starts with the +protocol identifier. The word \verb|link| denotes a link layer +multicast address. + +If a multicast address has more than one user, the number +of users is shown after the \verb|users| keyword. + +One additional feature not present in the example above +is the \verb|static| flag, which indicates that the address was joined +with \verb|ip maddr add|. See the following subsection. + + + +\subsection{{\tt ip maddress add} --- add a multicast address\\ + {\tt ip maddress delete} --- delete a multicast address} + +\paragraph{Abbreviations:} \verb|add|, \verb|a|; \verb|delete|, \verb|del|, \verb|d|. + +\paragraph{Description:} these commands attach/detach +a static link layer multicast address to listen on the interface. +Note that it is impossible to join protocol multicast groups +statically. 
This command only manages link layer addresses. + + +\paragraph{Arguments:} + +\begin{itemize} +\item \verb|address LLADDRESS| (default) + +--- the link layer multicast address. + +\item \verb|dev NAME| + +--- the device to join/leave this multicast address. + +\end{itemize} + + +\paragraph{Example:} Let us continue with the example from the previous subsection. + +\begin{verbatim} +netadm@alisa:~ # ip maddr add 33:33:00:00:00:01 dev dummy +netadm@alisa:~ # ip -0 maddr ls dummy +2: dummy + link 33:33:00:00:00:01 users 2 static + link 01:00:5e:00:00:01 +netadm@alisa:~ # ip maddr del 33:33:00:00:00:01 dev dummy +\end{verbatim} + +\begin{NB} + Neither \verb|ip| nor the kernel check for multicast address validity. + Particularly, this means that you can try to load a unicast address + instead of a multicast address. Most drivers will ignore such addresses, + but several (f.e.\ Tulip) will intern it to their on-board filter. + The effects may be strange. Namely, the addresses become additional + local link addresses and, if you loaded the address of another host + to the router, wait for duplicated packets on the wire. + It is not a bug, but rather a hole in the API and intra-kernel interfaces. + This feature is really more useful for traffic monitoring, but using it + with Linux-2.2 you {\em have to\/} be sure that the host is not + a router and, especially, that it is not a transparent proxy or masquerading + agent. +\end{NB} + + + +\section{{\tt ip mroute} --- multicast routing cache management} +\label{IP-MROUTE} + +\paragraph{Abbreviations:} \verb|mroute|, \verb|mr|. + +\paragraph{Object:} \verb|mroute| objects are multicast routing cache +entries created by a user level mrouting daemon +(f.e.\ \verb|pimd| or \verb|mrouted|). + +Due to the limitations of the current interface to the multicast routing +engine, it is impossible to change \verb|mroute| objects administratively, +so we may only display them. This limitation will be removed +in the future. 
+ +\paragraph{Commands:} \verb|show| (or \verb|list|). + + +\subsection{{\tt ip mroute show} --- list mroute cache entries} + +\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|, \verb|l|. + +\paragraph{Arguments:} + +\begin{itemize} +\item \verb|to PREFIX| (default) + +--- the prefix selecting the destination multicast addresses to list. + + +\item \verb|iif NAME| + +--- the interface on which multicast packets are received. + + +\item \verb|from PREFIX| + +--- the prefix selecting the IP source addresses of the multicast route. + + +\end{itemize} + +\paragraph{Output format:} + +\begin{verbatim} +kuznet@amber:~ $ ip mroute ls +(193.232.127.6, 224.0.1.39) Iif: unresolved +(193.232.244.34, 224.0.1.40) Iif: unresolved +(193.233.7.65, 224.66.66.66) Iif: eth0 Oifs: pimreg +kuznet@amber:~ $ +\end{verbatim} + +Each line shows one (S,G) entry in the multicast routing cache, +where S is the source address and G is the multicast group. \verb|Iif| is +the interface on which multicast packets are expected to arrive. +If the word \verb|unresolved| is there instead of the interface name, +it means that the routing daemon still hasn't resolved this entry. +The keyword \verb|oifs| is followed by a list of output interfaces, separated +by spaces. If a multicast routing entry is created with non-trivial +TTL scope, administrative distances are appended to the device names +in the \verb|oifs| list. + +\paragraph{Statistics:} The \verb|-statistics| option also prints the +number of packets and bytes forwarded along this route and +the number of packets that arrived on the wrong interface, if this number is not zero. + +\begin{verbatim} +kuznet@amber:~ $ ip -s mr ls 224.66/16 +(193.233.7.65, 224.66.66.66) Iif: eth0 Oifs: pimreg + 9383 packets, 300256 bytes +kuznet@amber:~ $ +\end{verbatim} + + +\section{{\tt ip tunnel} --- tunnel configuration} +\label{IP-TUNNEL} + +\paragraph{Abbreviations:} \verb|tunnel|, \verb|tunl|. 
+ +\paragraph{Object:} \verb|tunnel| objects are tunnels, encapsulating +packets in IPv4 packets and then sending them over the IP infrastructure. + +\paragraph{Commands:} \verb|add|, \verb|delete|, \verb|change|, \verb|show| +(or \verb|list|). + +\paragraph{See also:} A more informal discussion of tunneling +over IP and the \verb|ip tunnel| command can be found in~\cite{IP-TUNNELS}. + +\subsection{{\tt ip tunnel add} --- add a new tunnel\\ + {\tt ip tunnel change} --- change an existing tunnel\\ + {\tt ip tunnel delete} --- destroy a tunnel} + +\paragraph{Abbreviations:} \verb|add|, \verb|a|; \verb|change|, \verb|chg|; +\verb|delete|, \verb|del|, \verb|d|. + + +\paragraph{Arguments:} + +\begin{itemize} + +\item \verb|name NAME| (default) + +--- select the tunnel device name. + +\item \verb|mode MODE| + +--- set the tunnel mode. Three modes are currently available: + \verb|ipip|, \verb|sit| and \verb|gre|. + +\item \verb|remote ADDRESS| + +--- set the remote endpoint of the tunnel. + +\item \verb|local ADDRESS| + +--- set the fixed local address for tunneled packets. +It must be an address on another interface of this host. + +\item \verb|ttl N| + +--- set a fixed TTL \verb|N| on tunneled packets. + \verb|N| is a number in the range 1--255. 0 is a special value + meaning that packets inherit the TTL value. + The default value is: \verb|inherit|. + +\item \verb|tos T| or \verb|dsfield T| + +--- set a fixed TOS \verb|T| on tunneled packets. + The default value is: \verb|inherit|. + + + +\item \verb|dev NAME| + +--- bind the tunnel to the device \verb|NAME| so that + tunneled packets will only be routed via this device and will + not be able to escape to another device when the route to endpoint changes. + +\item \verb|nopmtudisc| + +--- disable Path MTU Discovery on this tunnel. + It is enabled by default. Note that a fixed ttl is incompatible + with this option: tunnelling with a fixed ttl always makes pmtu discovery. 
+ +\item \verb|key K|, \verb|ikey K|, \verb|okey K| + +--- (only GRE tunnels) use keyed GRE with key \verb|K|. \verb|K| is + either a number or an IP address-like dotted quad. + The \verb|key| parameter sets the key to use in both directions. + The \verb|ikey| and \verb|okey| parameters set different keys for input and output. + + +\item \verb|csum|, \verb|icsum|, \verb|ocsum| + +--- (only GRE tunnels) generate/require checksums for tunneled packets. + The \verb|ocsum| flag calculates checksums for outgoing packets. + The \verb|icsum| flag requires that all input packets have the correct + checksum. The \verb|csum| flag is equivalent to the combination + ``\verb|icsum| \verb|ocsum|''. + +\item \verb|seq|, \verb|iseq|, \verb|oseq| + +--- (only GRE tunnels) serialize packets. + The \verb|oseq| flag enables sequencing of outgoing packets. + The \verb|iseq| flag requires that all input packets are serialized. + The \verb|seq| flag is equivalent to the combination ``\verb|iseq| \verb|oseq|''. + +\begin{NB} + I think this option does not + work. At least, I did not test it, did not debug it and + do not even understand how it is supposed to work or for what + purpose Cisco planned to use it. Do not use it. +\end{NB} + + +\end{itemize} + +\paragraph{Example:} Create a pointopoint IPv6 tunnel with maximal TTL of 32. +\begin{verbatim} +netadm@amber:~ # ip tunl add Cisco mode sit remote 192.31.7.104 \ + local 192.203.80.142 ttl 32 +\end{verbatim} + +\subsection{{\tt ip tunnel show} --- list tunnels} + +\paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|, \verb|l|. + + +\paragraph{Arguments:} None. + +\paragraph{Output format:} +\begin{verbatim} +kuznet@amber:~ $ ip tunl ls Cisco +Cisco: ipv6/ip remote 192.31.7.104 local 192.203.80.142 ttl 32 +kuznet@amber:~ $ +\end{verbatim} +The line starts with the tunnel device name followed by a colon. +Then the tunnel mode follows. 
The parameters of the tunnel are listed +with the same keywords that were used when creating the tunnel. + +\paragraph{Statistics:} + +\begin{verbatim} +kuznet@amber:~ $ ip -s tunl ls Cisco +Cisco: ipv6/ip remote 192.31.7.104 local 192.203.80.142 ttl 32 +RX: Packets Bytes Errors CsumErrs OutOfSeq Mcasts + 12566 1707516 0 0 0 0 +TX: Packets Bytes Errors DeadLoop NoRoute NoBufs + 13445 1879677 0 0 0 0 +kuznet@amber:~ $ +\end{verbatim} +Essentially, these numbers are the same as the numbers +printed with {\tt ip -s link show} +(sec.\ref{IP-LINK-SHOW}, p.\pageref{IP-LINK-SHOW}) but the tags are different +to reflect that they are tunnel specific. +\begin{itemize} +\item \verb|CsumErrs| --- the total number of packets dropped +because of checksum failures for a GRE tunnel with checksumming enabled. +\item \verb|OutOfSeq| --- the total number of packets dropped +because they arrived out of sequence for a GRE tunnel with +serialization enabled. +\item \verb|Mcasts| --- the total number of multicast packets +received on a broadcast GRE tunnel. +\item \verb|DeadLoop| --- the total number of packets which were not +transmitted because the tunnel is looped back to itself. +\item \verb|NoRoute| --- the total number of packets which were not +transmitted because there is no IP route to the remote endpoint. +\item \verb|NoBufs| --- the total number of packets which were not +transmitted because the kernel failed to allocate a buffer. +\end{itemize} + + +\section{{\tt ip monitor} and {\tt rtmon} --- state monitoring} +\label{IP-MONITOR} + +The \verb|ip| utility can monitor the state of devices, addresses +and routes continuously. This option has a slightly different format. +Namely, +the \verb|monitor| command is the first in the command line and then +the object list follows: +\begin{verbatim} + ip monitor [ file FILE ] [ all | OBJECT-LIST ] +\end{verbatim} +\verb|OBJECT-LIST| is the list of object types that we want to monitor. 
+It may contain \verb|link|, \verb|address| and \verb|route|. +If no \verb|file| argument is given, \verb|ip| opens RTNETLINK, +listens on it and dumps state changes in the format described +in previous sections. + +If a file name is given, it does not listen on RTNETLINK, +but opens the file containing RTNETLINK messages saved in binary format +and dumps them. Such a history file can be generated with the +\verb|rtmon| utility. This utility has a command line syntax similar to +\verb|ip monitor|. +Ideally, \verb|rtmon| should be started before +the first network configuration command is issued. F.e.\ if +you insert: +\begin{verbatim} + rtmon file /var/log/rtmon.log +\end{verbatim} +in a startup script, you will be able to view the full history +later. + +Certainly, it is possible to start \verb|rtmon| at any time. +It prepends the history with the state snapshot dumped at the moment +of starting. + + +\section{Route realms and policy propagation, {\tt rtacct}} +\label{RT-REALMS} + +On routers using OSPF ASE or, especially, the BGP protocol, routing +tables may be huge. If we want to classify or to account for the packets +per route, we will have to keep lots of information. Even worse, if we +want to distinguish the packets not only by their destination, but +also by their source, the task gets quadratic complexity and its solution +is physically impossible. + +One approach to propagating the policy from routing protocols +to the forwarding engine has been proposed in~\cite{IOS-BGP-PP}. +Essentially, Cisco Policy Propagation via BGP is based on the fact +that dedicated routers all have the RIB (Routing Information Base) +close to the forwarding engine, so policy routing rules can +check all the route attributes, including ASPATH information +and community strings. + +The Linux architecture, splitting the RIB (maintained by a user level +daemon) and the kernel based FIB (Forwarding Information Base), +does not allow such a simple approach. 
+ +This is fortunate, because there is another solution +which allows even more flexible policy and richer semantics. + +Namely, routes can be clustered together in user space, based on their +attributes. F.e.\ a BGP router knows the route's ASPATH and its community; +an OSPF router knows the route tag or its area. The administrator, when adding +routes manually, also knows their nature. Provided that the number of such +aggregates (we call them {\em realms\/}) is low, the task of full +classification both by source and destination becomes quite manageable. + +So each route may be assigned to a realm. It is assumed that +this identification is made by a routing daemon, but static routes +can also be handled manually with \verb|ip route| (see sec.\ref{IP-ROUTE}, +p.\pageref{IP-ROUTE}). +\begin{NB} + There is a patch to \verb|gated|, allowing classification of routes + to realms with the full set of policy rules implemented in \verb|gated|: + by prefix, by ASPATH, by origin, by tag etc. +\end{NB} + +To facilitate the construction (f.e.\ in case the routing +daemon is not aware of realms), missing realms may be completed +with routing policy rules, see sec.~\ref{IP-RULE}, p.\pageref{IP-RULE}. + +For each packet the kernel calculates a tuple of realms: source realm +and destination realm, using the following algorithm: + +\begin{enumerate} +\item If the route has a realm, the destination realm of the packet is set to it. +\item If the rule has a source realm, the source realm of the packet is set to it. +If the destination realm was not inherited from the route and the rule has a destination realm, +it is also set. +\item If at least one of the realms is still unknown, the kernel finds +the reversed route to the source of the packet. +\item If the source realm is still unknown, get it from the reversed route. +\item If one of the realms is still unknown, swap the realms of reversed +routes and apply step 2 again. 
+\end{enumerate} + +After this procedure is completed we know what realm the packet +arrived from and the realm where it is going to propagate to. +If some of the realms are unknown, they are initialized to zero +(or realm \verb|unknown|). + +The main application of realms is the TC \verb|route| classifier~\cite{TC-CREF}, +where they are used to help assign packets to traffic classes, +to account, police and schedule them according to this +classification. + +A much simpler but still very useful application is incoming packet +accounting by realms. The kernel gathers a packet statistics summary +which can be viewed with the \verb|rtacct| utility. +\begin{verbatim} +kuznet@amber:~ $ rtacct russia +Realm BytesTo PktsTo BytesFrom PktsFrom +russia 20576778 169176 47080168 153805 +kuznet@amber:~ $ +\end{verbatim} +This shows that this router received 153805 packets from +the realm \verb|russia| and forwarded 169176 packets to \verb|russia|. +The realm \verb|russia| consists of routes with ASPATHs not leaving +Russia. + +Note that locally originating packets are not accounted here, +\verb|rtacct| shows incoming packets only. Using the \verb|route| +classifier (see~\cite{TC-CREF}) you can get even more detailed +accounting information about outgoing packets, optionally +summarizing traffic not only by source or destination, but +by any pair of source and destination realms. + + +\begin{thebibliography}{99} +\addcontentsline{toc}{section}{References} +\bibitem{RFC-NDISC} T.~Narten, E.~Nordmark, W.~Simpson. +``Neighbor Discovery for IP Version 6 (IPv6)'', RFC-2461. + +\bibitem{RFC-ADDRCONF} S.~Thomson, T.~Narten. +``IPv6 Stateless Address Autoconfiguration'', RFC-2462. + +\bibitem{RFC1812} F.~Baker. +``Requirements for IP Version 4 Routers'', RFC-1812. + +\bibitem{RFC1122} R.~T.~Braden. +``Requirements for Internet hosts --- communication layers'', RFC-1122. 
+ +\bibitem{IOS} ``Cisco IOS Release 12.0 Network Protocols +Command Reference, Part 1'' and +``Cisco IOS Release 12.0 Quality of Service Solutions +Configuration Guide: Configuring Policy-Based Routing'',\\ +http://www.cisco.com/univercd/cc/td/doc/product/software/ios120. + +\bibitem{IP-TUNNELS} A.~N.~Kuznetsov. +``Tunnels over IP in Linux-2.2'', \\ +In: {\tt ftp://ftp.inr.ac.ru/ip-routing/iproute2-current.tar.gz}. + +\bibitem{TC-CREF} A.~N.~Kuznetsov. ``TC Command Reference'',\\ +In: {\tt ftp://ftp.inr.ac.ru/ip-routing/iproute2-current.tar.gz}. + +\bibitem{IOS-BGP-PP} ``Cisco IOS Release 12.0 Quality of Service Solutions +Configuration Guide: Configuring QoS Policy Propagation via +Border Gateway Protocol'',\\ +http://www.cisco.com/univercd/cc/td/doc/product/software/ios120. + +\bibitem{RFC-DHCP} R.~Droms. +``Dynamic Host Configuration Protocol.'', RFC-2131 + +\end{thebibliography} + + + + +\appendix +\addcontentsline{toc}{section}{Appendix} + +\section{Source address selection} +\label{ADDR-SEL} + +When a host creates an IP packet, it must select some source +address. Correct source address selection is a critical procedure, +because it gives the receiver the information needed to deliver a +reply. If the source is selected incorrectly, in the best case, +the backward path may appear different to the forward one which +is harmful for performance. In the worst case, when the addresses +are administratively scoped, the reply may be lost entirely. + +Linux-2.2 selects source addresses using the following algorithm: + +\begin{itemize} +\item +The application may select a source address explicitly with \verb|bind(2)| +syscall or supplying it to \verb|sendmsg(2)| via the ancillary data object +\verb|IP_PKTINFO|. In this case the kernel only checks the validity +of the address and never tries to ``improve'' an incorrect user choice, +generating an error instead. +\begin{NB} + Never say ``Never''. The sysctl option \verb|ip_dynaddr| breaks + this axiom. 
It has been made deliberately with the purpose + of automatically reselecting the address on hosts with dynamic dial-out interfaces. + However, this hack {\em must not\/} be used on multihomed hosts + and especially on routers: it would break them. +\end{NB} + + +\item Otherwise, IP routing tables can contain an explicit source +address hint for this destination. The hint is set with the \verb|src| parameter +to the \verb|ip route| command, sec.\ref{IP-ROUTE}, p.\pageref{IP-ROUTE}. + + +\item Otherwise, the kernel searches through the list of addresses +attached to the interface through which the packets will be routed. +The search strategies are different for IP and IPv6. Namely: + +\begin{itemize} +\item IPv6 searches for the first valid, not deprecated address +with the same scope as the destination. + +\item IP searches for the first valid address with a scope wider +than the scope of the destination but it prefers addresses +which fall to the same subnet as the nexthop of the route +to the destination. Unlike IPv6, the scopes of IPv4 destinations +are not encoded in their addresses but are supplied +in routing tables instead (the \verb|scope| parameter to the \verb|ip route| command, +sec.\ref{IP-ROUTE}, p.\pageref{IP-ROUTE}). + +\end{itemize} + + +\item Otherwise, if the scope of the destination is \verb|link| or \verb|host|, +the algorithm fails and returns a zero source address. + +\item Otherwise, all interfaces are scanned to search for an address +with an appropriate scope. The loopback device \verb|lo| is always the first +in the search list, so that if an address with global scope (not 127.0.0.1!) +is configured on loopback, it is always preferred. + +\end{itemize} + + +\section{Proxy ARP/NDISC} +\label{PROXY-NEIGH} + +Routers may answer ARP/NDISC solicitations on behalf of other hosts. +In Linux-2.2 proxy ARP on an interface may be enabled +by setting the kernel \verb|sysctl| variable +\verb|/proc/sys/net/ipv4/conf/<dev>/proxy_arp| to 1. 
After this, the router +starts to answer ARP requests on the interface \verb|<dev>|, provided +the route to the requested destination does {\em not\/} go back via the same +device. + +The variable \verb|/proc/sys/net/ipv4/conf/all/proxy_arp| enables proxy +ARP on all the IP devices. + +However, this approach fails in the case of IPv6 because the router +must join the solicited node multicast address to listen for the corresponding +NDISC queries. It means that proxy NDISC is possible only on a per destination +basis. + +Logically, proxy ARP/NDISC is not a kernel task. It can easily be implemented +in user space. However, similar functionality was present in BSD kernels +and in Linux-2.0, so we have to preserve it at least to the extent that +is standardized in BSD. +\begin{NB} + Linux-2.0 ARP had a feature called {\em subnet\/} proxy ARP. + It is replaced with the sysctl flag in Linux-2.2. +\end{NB} + + +The \verb|ip| utility provides a way to manage proxy ARP/NDISC +with the \verb|ip neigh| command, namely: +\begin{verbatim} + ip neigh add proxy ADDRESS [ dev NAME ] +\end{verbatim} +adds a new proxy ARP/NDISC record and +\begin{verbatim} + ip neigh del proxy ADDRESS [ dev NAME ] +\end{verbatim} +deletes it. + +If the name of the device is not given, the router will answer solicitations +for address \verb|ADDRESS| on all devices, otherwise it will only serve +the device \verb|NAME|. Even if the proxy entry is created with +\verb|ip neigh|, the router {\em will not\/} answer a query if the route +to the destination goes back via the interface from which the solicitation +was received. + +It is important to emphasize that proxy entries have {\em no\/} +parameters other than these (IP/IPv6 address and optional device). +Particularly, the entry does not store any link layer address. +It always advertises the station address of the interface +on which it sends advertisements (i.e.\ its own station address). 
+ +\section{Route NAT status} +\label{ROUTE-NAT} + +NAT (or ``Network Address Translation'') remaps some parts +of the IP address space into other ones. Linux-2.2 route NAT is supposed +to be used to facilitate policy routing by rewriting addresses +to other routing domains or to help while renumbering sites +to another prefix. + +\paragraph{What it is not:} +It is necessary to emphasize that {\em it is not supposed\/} +to be used to compress address space or to split load. +This is not missing functionality but a design principle. +Route NAT is {\em stateless\/}. It does not hold any state +about translated sessions. This means that it handles any number +of sessions flawlessly. But it also means that it is {\em static\/}. +It cannot detect the moment when the last TCP client stops +using an address. For the same reason, it will not help to split +load between several servers. +\begin{NB} +It is a pretty commonly held belief that it is useful to split load between +several servers with NAT. This is a mistake. All you get from this +is the requirement that the router keep the state of all the TCP connections +going via it. Well, if the router is so powerful, run apache on it. 8) +\end{NB} + +The second feature: it does not touch packet payload, +does not try to ``improve'' broken protocols by looking +through its data and mangling it. It mangles IP addresses, +only IP addresses and nothing but IP addresses. +This, too, is not missing functionality. + +To summarize: if you need to compress address space or keep +active FTP clients happy, your choice is not route NAT but masquerading, +port forwarding, NAPT etc. +\begin{NB} +By the way, you may also want to look at +http://www.suse.com/\~{}mha/HyperNews/get/linux-ip-nat.html +\end{NB} + + +\paragraph{How it works.} +Some part of the address space is reserved for dummy addresses +which will look for all the world like some host addresses +inside your network. 
No other hosts may use these addresses, +however other routers may also be configured to translate them. +\begin{NB} +A great advantage of route NAT is that it may be used not +only in stub networks but in environments with arbitrarily complicated +structure. It does not firewall, it {\em forwards.} +\end{NB} +These addresses are selected by the \verb|ip route| command +(sec.\ref{IP-ROUTE-ADD}, p.\pageref{IP-ROUTE-ADD}). F.e.\ +\begin{verbatim} + ip route add nat 192.203.80.144 via 193.233.7.83 +\end{verbatim} +states that the single address 192.203.80.144 is a dummy NAT address. +For all the world it looks like a host address inside our network. +For neighbouring hosts and routers it looks like the local address +of the translating router. The router answers ARP for it, advertises +this address as routed via it, {\em et al\/}. When the router +receives a packet destined for 192.203.80.144, it replaces +this address with 193.233.7.83 which is the address of some real +host and forwards the packet. If you need to remap +blocks of addresses, you may use a command like: +\begin{verbatim} + ip route add nat 192.203.80.192/26 via 193.233.7.64 +\end{verbatim} +This command will map a block of 64 addresses 192.203.80.192-255 to +193.233.7.64-127. + +When an internal host (193.233.7.83 in the example above) +sends something to the outer world and these packets are forwarded +by our router, it should translate the source address 193.233.7.83 +into 192.203.80.144. This task is solved by setting a special +policy rule (sec.\ref{IP-RULE-ADD}, p.\pageref{IP-RULE-ADD}): +\begin{verbatim} + ip rule add prio 320 from 193.233.7.83 nat 192.203.80.144 +\end{verbatim} +This rule says that the source address 193.233.7.83 +should be translated into 192.203.80.144 before forwarding. +It is important that the address after the \verb|nat| keyword +is some NAT address, declared by {\tt ip route add nat}. +If it is just a random address the router will not map to it. 
+\begin{NB} +The exception is when the address is a local address of this +router (or 0.0.0.0) and masquerading is configured in the linux-2.2 +kernel. In this case the router will masquerade the packets as this address. +If 0.0.0.0 is selected, the result is equivalent to the one +obtained with firewalling rules. Otherwise, you have a way +to order Linux to masquerade to this fixed address. +The NAT mechanism used in linux-2.4 is more flexible than +masquerading, so this feature has lost its meaning and has been disabled. +\end{NB} + +If the network has non-trivial internal structure, it is +useful and even necessary to add rules disabling translation +when a packet does not leave this network. Let us return to the +example from sec.\ref{IP-RULE-SHOW} (p.\pageref{IP-RULE-SHOW}). +\begin{verbatim} +300: from 193.233.7.83 to 193.233.7.0/24 lookup main +310: from 193.233.7.83 to 192.203.80.0/24 lookup main +320: from 193.233.7.83 lookup inr.ruhep map-to 192.203.80.144 +\end{verbatim} +This block of rules causes normal forwarding when +packets from 193.233.7.83 do not leave networks 193.233.7/24 +and 192.203.80/24. Also, if the \verb|inr.ruhep| table does not +contain a route to the destination (which means that the routing +domain owning addresses from 192.203.80/24 is dead), no translation +will occur. Otherwise, the packets are translated. + +\paragraph{How to only translate selected ports:} +If you only want to translate selected ports (f.e.\ http) +and leave the rest intact, you may use \verb|ipchains| +to \verb|fwmark| a class of packets. +Suppose you did and all the packets from 193.233.7.83 +destined for port 80 are marked with marker 0x1234 in input fwchain. +In this case you may replace rule \#320 with: +\begin{verbatim} +320: from 193.233.7.83 fwmark 1234 lookup main map-to 192.203.80.144 +\end{verbatim} +and translation will only be enabled for outgoing http requests. 
+ +\section{Example: minimal host setup} +\label{EXAMPLE-SETUP} + +The following script gives an example of a fail-safe +setup of IP (and IPv6, if it is compiled into the kernel) +in the common case of a node attached to a single broadcast +network. A more advanced script, which may be used both on multihomed +hosts and on routers, is described in the following +section. + +The utilities used in the script may be found in the +directory ftp://ftp.inr.ac.ru/ip-routing/: +\begin{enumerate} +\item \verb|ip| --- package \verb|iproute2|. +\item \verb|arping| --- package \verb|iputils|. +\item \verb|rdisc| --- package \verb|iputils|. +\end{enumerate} +\begin{NB} +It also refers to a DHCP client, \verb|dhcpcd|. I should refrain from +recommending a good DHCP client to use. All that I can +say is that ISC \verb|dhcp-2.0b1pl6| patched with the patch that +can be found in the \verb|dhcp.bootp.rarp| subdirectory of +the same ftp site {\em does\/} work, +at least on Ethernet and Token Ring. +\end{NB} + +\begin{verbatim} +#! /bin/bash +\end{verbatim} +\begin{flushleft} +\# {\bf Usage: \verb|ifone ADDRESS[/PREFIX-LENGTH] [DEVICE]|}\\ +\# {\bf Parameters:}\\ +\# \$1 --- Static IP address, optionally followed by prefix length.\\ +\# \$2 --- Device name. If it is missing, \verb|eth0| is assumed.\\ +\# F.e.\ \verb|ifone 193.233.7.90| +\end{flushleft} +\begin{verbatim} +dev=$2 +: ${dev:=eth0} +ipaddr= +\end{verbatim} +\# Parse IP address, splitting prefix length. 
+\begin{verbatim} +if [ "$1" != "" ]; then + ipaddr=${1%/*} + if [ "$1" != "$ipaddr" ]; then + pfxlen=${1#*/} + fi + : ${pfxlen:=24} +fi +pfx="${ipaddr}/${pfxlen}" +\end{verbatim} + +\begin{flushleft} +\# {\bf Step 0} --- enable loopback.\\ +\#\\ +\# This step is necessary on any networked box before attempt\\ +\# to configure any other device.\\ +\end{flushleft} +\begin{verbatim} +ip link set up dev lo +ip addr add 127.0.0.1/8 dev lo brd + scope host +\end{verbatim} +\begin{flushleft} +\# IPv6 autoconfigure themself on loopback.\\ +\#\\ +\# If user gave loopback as device, we add the address as alias and exit. +\end{flushleft} +\begin{verbatim} +if [ "$dev" = "lo" ]; then + if [ "$ipaddr" != "" -a "$ipaddr" != "127.0.0.1" ]; then + ip address add $ipaddr dev $dev + exit $? + fi + exit 0 +fi +\end{verbatim} + +\noindent\# {\bf Step 1} --- enable device \verb|$dev| + +\begin{verbatim} +if ! ip link set up dev $dev ; then + echo "Cannot enable interface $dev. Aborting." 1>&2 + exit 1 +fi +\end{verbatim} +\begin{flushleft} +\# The interface is \verb|UP|. IPv6 started stateless autoconfiguration itself,\\ +\# and its configuration finishes here. However,\\ +\# IP still needs some static preconfigured address. +\end{flushleft} +\begin{verbatim} +if [ "$ipaddr" = "" ]; then + echo "No address for $dev is configured, trying DHCP..." 1>&2 + dhcpcd + exit $? +fi +\end{verbatim} + +\begin{flushleft} +\# {\bf Step 2} --- IP Duplicate Address Detection~\cite{RFC-DHCP}.\\ +\# Send two probes and wait for result for 3 seconds.\\ +\# If the interface opens slower f.e.\ due to long media detection,\\ +\# you want to increase the timeout.\\ +\end{flushleft} +\begin{verbatim} +if ! arping -q -c 2 -w 3 -D -I $dev $ipaddr ; then + echo "Address $ipaddr is busy, trying DHCP..." 1>&2 + dhcpcd + exit $? +fi +\end{verbatim} +\begin{flushleft} +\# OK, the address is unique, we may add it on the interface.\\ +\#\\ +\# {\bf Step 3} --- Configure the address on the interface. 
+\end{flushleft} + +\begin{verbatim} +if ! ip address add $pfx brd + dev $dev; then + echo "Failed to add $pfx on $dev, trying DHCP..." 1>&2 + dhcpcd + exit $? +fi +\end{verbatim} + +\noindent\# {\bf Step 4} --- Announce our presence on the link. +\begin{verbatim} +arping -A -c 1 -I $dev $ipaddr +noarp=$? +( sleep 2; + arping -U -c 1 -I $dev $ipaddr ) >& /dev/null </dev/null & +\end{verbatim} + +\begin{flushleft} +\# {\bf Step 5} (optional) --- Add some control routes.\\ +\#\\ +\# 1. Prohibit link local multicast addresses.\\ +\# 2. Prohibit link local (alias, limited) broadcast.\\ +\# 3. Add default multicast route. +\end{flushleft} +\begin{verbatim} +ip route add unreachable 224.0.0.0/24 +ip route add unreachable 255.255.255.255 +if [ `ip link ls $dev | grep -c MULTICAST` -ge 1 ]; then + ip route add 224.0.0.0/4 dev $dev scope global +fi +\end{verbatim} + +\begin{flushleft} +\# {\bf Step 6} --- Add fallback default route with huge metric.\\ +\# If a proxy ARP server is present on the interface, we will be\\ +\# able to talk to all the Internet without further configuration.\\ +\# It is not so cheap though and we still hope that this route\\ +\# will be overridden by more correct one by rdisc.\\ +\# Do not make this step if the device is not ARPable,\\ +\# because dead nexthop detection does not work on them. +\end{flushleft} +\begin{verbatim} +if [ "$noarp" = "0" ]; then + ip ro add default dev $dev metric 30000 scope global +fi +\end{verbatim} + +\begin{flushleft} +\# {\bf Step 7} --- Restart router discovery and exit. +\end{flushleft} +\begin{verbatim} +killall -HUP rdisc || rdisc -fs +exit 0 +\end{verbatim} + + +\section{Example: {\protect\tt ifcfg} --- interface address management} +\label{EXAMPLE-IFCFG} + +This is a simplistic script replacing one option of \verb|ifconfig|, +namely, IP address management. 
It not only adds +addresses, but also carries out Duplicate Address Detection~\cite{RFC-DHCP}, +sends unsolicited ARP to update the caches of other hosts sharing +the interface, adds some control routes and restarts Router Discovery +when it is necessary. + +I strongly recommend using it {\em instead\/} of \verb|ifconfig| both +on hosts and on routers. + +\begin{verbatim} +#! /bin/bash +\end{verbatim} +\begin{flushleft} +\# {\bf Usage: \verb?ifcfg DEVICE[:ALIAS] [add|del] ADDRESS[/LENGTH] [PEER]?}\\ +\# {\bf Parameters:}\\ +\# ---Device name. It may have alias suffix, separated by colon.\\ +\# ---Command: add, delete or stop.\\ +\# ---IP address, optionally followed by prefix length.\\ +\# ---Optional peer address for pointopoint interfaces.\\ +\# F.e. \verb|ifcfg eth0 193.233.7.90/24| + +\noindent\# This function determines, whether it is router or host.\\ +\# It returns 0, if the host is apparently not router. +\end{flushleft} +\begin{verbatim} +CheckForwarding () { + local sbase fwd + sbase=/proc/sys/net/ipv4/conf + fwd=0 + if [ -d $sbase ]; then + for dir in $sbase/*/forwarding; do + fwd=$[$fwd + `cat $dir`] + done + else + fwd=2 + fi + return $fwd +} +\end{verbatim} +\begin{flushleft} +\# This function restarts Router Discovery.\\ +\end{flushleft} +\begin{verbatim} +RestartRDISC () { + killall -HUP rdisc || rdisc -fs +} +\end{verbatim} +\begin{flushleft} +\# Calculate ABC "natural" mask length\\ +\# Arg: \$1 = dotquad address +\end{flushleft} +\begin{verbatim} +ABCMaskLen () { + local class; + class=${1%%.*} + if [ $class -eq 0 -o $class -ge 224 ]; then return 0 + elif [ $class -ge 192 ]; then return 24 + elif [ $class -ge 128 ]; then return 16 + else return 8 ; fi +} +\end{verbatim} + + +\begin{flushleft} +\# {\bf MAIN()}\\ +\#\\ +\# Strip alias suffix separated by colon. 
+\end{flushleft} +\begin{verbatim} +label="label $1" +ldev=$1 +dev=${1%:*} +if [ "$dev" = "" -o "$1" = "help" ]; then + echo "Usage: ifcfg DEV [[add|del [ADDR[/LEN]] [PEER] | stop]" 1>&2 + echo " add - add new address" 1>&2 + echo " del - delete address" 1>&2 + echo " stop - completely disable IP" 1>&2 + exit 1 +fi +shift + +CheckForwarding +fwd=$? +\end{verbatim} +\begin{flushleft} +\# Parse command. If it is ``stop'', flush and exit. +\end{flushleft} +\begin{verbatim} +deleting=0 +case "$1" in +add) shift ;; +stop) + if [ "$ldev" != "$dev" ]; then + echo "Cannot stop alias $ldev" 1>&2 + exit 1; + fi + ip -4 addr flush dev $dev $label || exit 1 + if [ $fwd -eq 0 ]; then RestartRDISC; fi + exit 0 ;; +del*) + deleting=1; shift ;; +*) +esac +\end{verbatim} +\begin{flushleft} +\# Parse prefix, split prefix length, separated by slash. +\end{flushleft} +\begin{verbatim} +ipaddr= +pfxlen= +if [ "$1" != "" ]; then + ipaddr=${1%/*} + if [ "$1" != "$ipaddr" ]; then + pfxlen=${1#*/} + fi + if [ "$ipaddr" = "" ]; then + echo "$1 is bad IP address." 1>&2 + exit 1 + fi +fi +shift +\end{verbatim} +\begin{flushleft} +\# If peer address is present, prefix length is 32.\\ +\# Otherwise, if prefix length was not given, guess it. +\end{flushleft} +\begin{verbatim} +peer=$1 +if [ "$peer" != "" ]; then + if [ "$pfxlen" != "" -a "$pfxlen" != "32" ]; then + echo "Peer address with non-trivial netmask." 1>&2 + exit 1 + fi + pfx="$ipaddr peer $peer" +else + if [ "$pfxlen" = "" ]; then + ABCMaskLen $ipaddr + pfxlen=$? 
+ fi + pfx="$ipaddr/$pfxlen" +fi +if [ "$ldev" = "$dev" -a "$ipaddr" != "" ]; then + label= +fi +\end{verbatim} +\begin{flushleft} +\# If deletion was requested, delete the address and restart RDISC +\end{flushleft} +\begin{verbatim} +if [ $deleting -ne 0 ]; then + ip addr del $pfx dev $dev $label || exit 1 + if [ $fwd -eq 0 ]; then RestartRDISC; fi + exit 0 +fi +\end{verbatim} +\begin{flushleft} +\# Start interface initialization.\\ +\#\\ +\# {\bf Step 0} --- enable device \verb|$dev| +\end{flushleft} +\begin{verbatim} +if ! ip link set up dev $dev ; then + echo "Error: cannot enable interface $dev." 1>&2 + exit 1 +fi +if [ "$ipaddr" = "" ]; then exit 0; fi +\end{verbatim} +\begin{flushleft} +\# {\bf Step 1} --- IP Duplicate Address Detection~\cite{RFC-DHCP}.\\ +\# Send two probes and wait for result for 3 seconds.\\ +\# If the interface opens slower f.e.\ due to long media detection,\\ +\# you want to increase the timeout.\\ +\end{flushleft} +\begin{verbatim} +if ! arping -q -c 2 -w 3 -D -I $dev $ipaddr ; then + echo "Error: some host already uses address $ipaddr on $dev." 1>&2 + exit 1 +fi +\end{verbatim} +\begin{flushleft} +\# OK, the address is unique. We may add it to the interface.\\ +\#\\ +\# {\bf Step 2} --- Configure the address on the interface. +\end{flushleft} +\begin{verbatim} +if ! ip address add $pfx brd + dev $dev $label; then + echo "Error: failed to add $pfx on $dev." 1>&2 + exit 1 +fi +\end{verbatim} +\noindent\# {\bf Step 3} --- Announce our presence on the link +\begin{verbatim} +arping -q -A -c 1 -I $dev $ipaddr +noarp=$? +( sleep 2 ; + arping -q -U -c 1 -I $dev $ipaddr ) >& /dev/null </dev/null & +\end{verbatim} +\begin{flushleft} +\# {\bf Step 4} (optional) --- Add some control routes.\\ +\#\\ +\# 1. Prohibit link local multicast addresses.\\ +\# 2. Prohibit link local (alias, limited) broadcast.\\ +\# 3. Add default multicast route. 
+\end{flushleft} +\begin{verbatim} +ip route add unreachable 224.0.0.0/24 >& /dev/null +ip route add unreachable 255.255.255.255 >& /dev/null +if [ `ip link ls $dev | grep -c MULTICAST` -ge 1 ]; then + ip route add 224.0.0.0/4 dev $dev scope global >& /dev/null +fi +\end{verbatim} +\begin{flushleft} +\# {\bf Step 5} --- Add fallback default route with huge metric.\\ +\# If a proxy ARP server is present on the interface, we will be\\ +\# able to talk to all the Internet without further configuration.\\ +\# Do not make this step on router or if the device is not ARPable.\\ +\# because dead nexthop detection does not work on them. +\end{flushleft} +\begin{verbatim} +if [ $fwd -eq 0 ]; then + if [ $noarp -eq 0 ]; then + ip ro append default dev $dev metric 30000 scope global + elif [ "$peer" != "" ]; then + if ping -q -c 2 -w 4 $peer ; then + ip ro append default via $peer dev $dev metric 30001 + fi + fi + RestartRDISC +fi + +exit 0 +\end{verbatim} +\begin{flushleft} +\# End of {\bf MAIN()} +\end{flushleft} + + +\end{document} diff --git a/doc/ip-tunnels.tex b/doc/ip-tunnels.tex index e69de29b..0a8c930c 100644 --- a/doc/ip-tunnels.tex +++ b/doc/ip-tunnels.tex @@ -0,0 +1,469 @@ +\documentstyle[12pt,twoside]{article} +\def\TITLE{Tunnels over IP} +\input preamble +\begin{center} +\Large\bf Tunnels over IP in Linux-2.2 +\end{center} + + +\begin{center} +{ \large Alexey~N.~Kuznetsov } \\ +\em Institute for Nuclear Research, Moscow \\ +\verb|kuznet@ms2.inr.ac.ru| \\ +\rm March 17, 1999 +\end{center} + +\vspace{5mm} + +\tableofcontents + + +\section{Instead of introduction: micro-FAQ.} + +\begin{itemize} + +\item +Q: In linux-2.0.36 I used: +\begin{verbatim} + ifconfig tunl1 10.0.0.1 pointopoint 193.233.7.65 +\end{verbatim} +to create tunnel. It does not work in 2.2.0! + +A: You are right, it does not work. The command written above is split to two commands. 
+\begin{verbatim} + ip tunnel add MY-TUNNEL mode ipip remote 193.233.7.65 +\end{verbatim} +will create tunnel device with name \verb|MY-TUNNEL|. Now you may configure +it with: +\begin{verbatim} + ifconfig MY-TUNNEL 10.0.0.1 +\end{verbatim} +Certainly, if you prefer name \verb|tunl1| to \verb|MY-TUNNEL|, +you still may use it. + +\item +Q: In linux-2.0.36 I used: +\begin{verbatim} + ifconfig tunl0 10.0.0.1 + route add -net 10.0.0.0 gw 193.233.7.65 dev tunl0 +\end{verbatim} +to tunnel net 10.0.0.0 via router 193.233.7.65. It does not +work in 2.2.0! Moreover, \verb|route| prints a funny error sort of +``network unreachable'' and after this I found a strange direct route +to 10.0.0.0 via \verb|tunl0| in routing table. + +A: Yes, in 2.2 the rule that {\em normal} gateway must reside on directly +connected network has not any exceptions. You may tell kernel, that +this particular route is {\em abnormal}: +\begin{verbatim} + ifconfig tunl0 10.0.0.1 netmask 255.255.255.255 + ip route add 10.0.0.0/8 via 193.233.7.65 dev tunl0 onlink +\end{verbatim} +Note keyword \verb|onlink|, it is the magic key that orders kernel +not to check for consistency of gateway address. +Probably, after this explanation you have already guessed another method +to cheat kernel: +\begin{verbatim} + ifconfig tunl0 10.0.0.1 netmask 255.255.255.255 + route add -host 193.233.7.65 dev tunl0 + route add -net 10.0.0.0 netmask 255.0.0.0 gw 193.233.7.65 + route del -host 193.233.7.65 dev tunl0 +\end{verbatim} +Well, if you like such tricks, nobody may prohibit you to use them. +Only do not forget +that between \verb|route add| and \verb|route del| host 193.233.7.65 is +unreachable. + +\item +Q: In 2.0.36 I used to load \verb|tunnel| device module and \verb|ipip| module. +I cannot find any \verb|tunnel| in 2.2! + +A: Linux-2.2 has single module \verb|ipip| for both directions of tunneling +and for all IPIP tunnel devices. + +\item +Q: \verb|traceroute| does not work over tunnel! Well, stop... 
It works, + only skips some number of hops. + +A: Yes. By default tunnel driver copies \verb|ttl| value from +inner packet to outer one. It means that path traversed by tunneled +packets to another endpoint is not hidden. If you dislike this, or if you +are going to use some routing protocol expecting that packets +with ttl 1 will reach peering host (f.e.\ RIP, OSPF or EBGP) +and you are not afraid of +tunnel loops, you may append option \verb|ttl 64|, when creating tunnel +with \verb|ip tunnel add|. + +\item +Q: ... Well, list of things, which 2.0 was able to do finishes. + +\end{itemize} + +\paragraph{Summary of differences between 2.2 and 2.0.} + +\begin{itemize} + +\item {\bf In 2.0} you could compile tunnel device into kernel + and got set of 4 devices \verb|tunl0| ... \verb|tunl3| or, + alternatively, compile it as module and load new module + for each new tunnel. Also, module \verb|ipip| was necessary + to receive tunneled packets. + + {\bf 2.2} has {\em one\/} module \verb|ipip|. Loading it you get base + tunnel device \verb|tunl0| and another tunnels may be created with command + \verb|ip tunnel add|. These new devices may have arbitrary names. + + +\item {\bf In 2.0} you set remote tunnel endpoint address with + the command \verb|ifconfig| ... \verb|pointopoint A|. + + {\bf In 2.2} this command has the same semantics on all + the interfaces, namely it sets not tunnel endpoint, + but address of peering host, which is directly reachable + via this tunnel, + rather than via Internet. Actual tunnel endpoint address \verb|A| + should be set with \verb|ip tunnel add ... remote A|. + +\item {\bf In 2.0} you create tunnel routes with the command: +\begin{verbatim} + route add -net 10.0.0.0 gw A dev tunl0 +\end{verbatim} + + {\bf 2.2} interprets this command equally for all device + kinds and gateway is required to be directly reachable via this tunnel, + rather than via Internet. You still may use \verb|ip route add ... onlink| + to override this behaviour. 
+ +\end{itemize} + + +\section{Tunnel setup: basics} + +Standard Linux-2.2 kernel supports three flavor of tunnels, +listed in the following table: +\vspace{2mm} + +\begin{tabular}{lll} +\vrule depth 0.8ex width 0pt\relax +Mode & Description & Base device \\ +ipip & IP over IP & tunl0 \\ +sit & IPv6 over IP & sit0 \\ +gre & ANY over GRE over IP & gre0 +\end{tabular} + +\vspace{2mm} + +\noindent All the kinds of tunnels are created with one command: +\begin{verbatim} + ip tunnel add <NAME> mode <MODE> [ local <S> ] [ remote <D> ] +\end{verbatim} + +This command creates new tunnel device with name \verb|<NAME>|. +The \verb|<NAME>| is an arbitrary string. Particularly, +it may be even \verb|eth0|. The rest of parameters set +different tunnel characteristics. + +\begin{itemize} + +\item +\verb|mode <MODE>| sets tunnel mode. Three modes are available now + \verb|ipip|, \verb|sit| and \verb|gre|. + +\item +\verb|remote <D>| sets remote endpoint of the tunnel to IP + address \verb|<D>|. +\item +\verb|local <S>| sets fixed local address for tunneled + packets. It must be an address on another interface of this host. + +\end{itemize} + +\let\thefootnote\oldthefootnote + +Both \verb|remote| and \verb|local| may be omitted. In this case we +say that they are zero or wildcard. Two tunnels of one mode cannot +have the same \verb|remote| and \verb|local|. Particularly it means +that base device or fallback tunnel cannot be replicated.\footnote{ +This restriction is relaxed for keyed GRE tunnels.} + +Tunnels are divided to two classes: {\bf pointopoint} tunnels, which +have some not wildcard \verb|remote| address and deliver all the packets +to this destination, and {\bf NBMA} (i.e. Non-Broadcast Multi-Access) tunnels, +which have no \verb|remote|. Particularly, base devices (f.e.\ \verb|tunl0|) +are NBMA, because they have neither \verb|remote| nor +\verb|local| addresses. + + +After tunnel device is created you should configure it as you did +it with another devices. 
Certainly, the configuration of tunnels has +some features related to the fact that they work over existing Internet +routing infrastructure and simultaneously create new virtual links, +which changes this infrastructure. The danger that not enough careful +tunnel setup will result in formation of tunnel loops, +collapse of routing or flooding network with exponentially +growing number of tunneled fragments is very real. + + +Protocol setup on pointopoint tunnels does not differ of configuration +of another devices. You should set a protocol address with \verb|ifconfig| +and add routes with \verb|route| utility. + +NBMA tunnels are different. To route something via NBMA tunnel +you have to explain to driver, where it should deliver packets to. +The only way to make it is to create special routes with gateway +address pointing to desired endpoint. F.e.\ +\begin{verbatim} + ip route add 10.0.0.0/24 via <A> dev tunl0 onlink +\end{verbatim} +It is important to use option \verb|onlink|, otherwise +kernel will refuse request to create route via gateway not directly +reachable over device \verb|tunl0|. With IPv6 the situation is much simpler: +when you start device \verb|sit0|, it automatically configures itself +with all IPv4 addresses mapped to IPv6 space, so that all IPv4 +Internet is {\em really reachable} via \verb|sit0|! Excellent, the command +\begin{verbatim} + ip route add 3FFE::/16 via ::193.233.7.65 dev sit0 +\end{verbatim} +will route \verb|3FFE::/16| via \verb|sit0|, sending all the packets +destined to this prefix to 193.233.7.65. + +\section{Tunnel setup: options} + +Command \verb|ip tunnel add| has several additional options. +\begin{itemize} + +\item \verb|ttl N| --- set fixed TTL \verb|N| on tunneled packets. + \verb|N| is number in the range 1--255. 0 is special value, + meaning that packets inherit TTL value. + Default value is: \verb|inherit|. + +\item \verb|tos T| --- set fixed tos \verb|T| on tunneled packets. + Default value is: \verb|inherit|. 
+ +\item \verb|dev DEV| --- bind tunnel to device \verb|DEV|, so that + tunneled packets will be routed only via this device and will + not be able to escape to another device, when route to endpoint changes. + +\item \verb|nopmtudisc| --- disable Path MTU Discovery on this tunnel. + It is enabled by default. Note that fixed ttl is incompatible + with this option: tunnels with fixed ttl always make pmtu discovery. + +\end{itemize} + +\verb|ipip| and \verb|sit| tunnels have no more options. \verb|gre| +tunnels are more complicated: + +\begin{itemize} + +\item \verb|key K| --- use keyed GRE with key \verb|K|. \verb|K| is + either number or IP address-like dotted quad. + +\item \verb|csum| --- checksum tunneled packets. + +\item \verb|seq| --- serialize packets. +\begin{NB} + I think this option does not + work. At least, I did not test it, did not debug it and + even do not understand, how it is supposed to work and for what + purpose Cisco planned to use it. +\end{NB} + +\end{itemize} + + +Actually, these GRE options can be set separately for input and +output directions by prefixing corresponding keywords with letter +\verb|i| or \verb|o|. F.e.\ \verb|icsum| orders to accept only +packets with correct checksum and \verb|ocsum| means, that +our host will calculate and send checksum. + +Command \verb|ip tunnel add| is not the only operation, +which can be made with tunnels. Certainly, you may get short help page +with: +\begin{verbatim} + ip tunnel help +\end{verbatim} + +Besides that, you may view list of installed tunnels with the help of command: +\begin{verbatim} + ip tunnel ls +\end{verbatim} +Also you may look at statistics: +\begin{verbatim} + ip -s tunnel ls Cisco +\end{verbatim} +where \verb|Cisco| is name of tunnel device. Command +\begin{verbatim} + ip tunnel del Cisco +\end{verbatim} +destroys tunnel \verb|Cisco|. And, finally, +\begin{verbatim} + ip tunnel change Cisco mode sit local ME remote HE ttl 32 +\end{verbatim} +changes its parameters. 
+ +\section{Differences 2.2 and 2.0 tunnels revisited.} + +Now we can discuss more subtle differences between tunneling in 2.0 +and 2.2. + +\begin{itemize} + +\item In 2.0 all tunneled packets were received promiscuously +as soon as you loaded module \verb|ipip|. 2.2 tries to select the best +tunnel device and packet looks as received on this. F.e.\ if host +received \verb|ipip| packet from host \verb|D| destined to our +local address \verb|S|, kernel searches for matching tunnels +in order: + +\begin{tabular}{ll} +1 & \verb|remote| is \verb|D| and \verb|local| is \verb|S| \\ +2 & \verb|remote| is \verb|D| and \verb|local| is wildcard \\ +3 & \verb|remote| is wildcard and \verb|local| is \verb|S| \\ +4 & \verb|tunl0| +\end{tabular} + +If tunnel exists, but it is not in \verb|UP| state, the tunnel is ignored. +Note, that if \verb|tunl0| is \verb|UP| it receives all the IPIP packets, +not acknowledged by more specific tunnels. +Be careful, it means that without carefully installed firewall rules +anyone on the Internet may inject to your network any packets with +source addresses indistinguishable from local ones. It is not so bad idea +to design tunnels in the way enforcing maximal route symmetry +and to enable reversed path filter (\verb|rp_filter| sysctl option) on +tunnel devices. + +\item In 2.2 you can monitor and debug tunnels with \verb|tcpdump|. +F.e.\ \verb|tcpdump| \verb|-i Cisco| \verb|-nvv| will dump packets, +which kernel output, via tunnel \verb|Cisco| and the packets received on it +from kernel viewpoint. + +\end{itemize} + + +\section{Linux and Cisco IOS tunnels.} + +Among another tunnels Cisco IOS supports IPIP and GRE. +Essentially, Cisco setup is subset of options, available for Linux. 
+Let us consider the simplest example: + +\begin{verbatim} +interface Tunnel0 + tunnel mode gre ip + tunnel source 10.10.14.1 + tunnel destination 10.10.13.2 +\end{verbatim} + + +This command set translates to: + +\begin{verbatim} + ip tunnel add Tunnel0 \ + mode gre \ + local 10.10.14.1 \ + remote 10.10.13.2 +\end{verbatim} + +Any questions? No questions. + +\section{Interaction IPIP tunnels and DVMRP.} + +DVMRP exploits IPIP tunnels to route multicasts via Internet. +\verb|mrouted| creates +IPIP tunnels listed in its configuration file automatically. +From kernel and user viewpoints there are no differences between +tunnels, created in this way, and tunnels created by \verb|ip tunnel|. +I.e.\ if \verb|mrouted| created some tunnel, it may be used to +route unicast packets, provided appropriate routes are added. +And vice versa, if administrator has already created a tunnel, +it will be reused by \verb|mrouted|, if it requests DVMRP +tunnel with the same local and remote addresses. + +Do not wonder, if your manually configured tunnel is +destroyed, when mrouted exits. + + +\section{Broadcast GRE ``tunnels''.} + +It is possible to set \verb|remote| for GRE tunnel to a multicast +address. Such tunnel becomes {\bf broadcast} tunnel (though word +tunnel is not quite appropriate in this case, it is rather virtual network). +\begin{verbatim} + ip tunnel add Universe local 193.233.7.65 \ + remote 224.66.66.66 ttl 16 + ip addr add 10.0.0.1/16 dev Universe + ip link set Universe up +\end{verbatim} +This tunnel is true broadcast network and broadcast packets are +sent to multicast group 224.66.66.66. By default such tunnel starts +to resolve both IP and IPv6 addresses via ARP/NDISC, so that +if multicast routing is supported in surrounding network, all GRE nodes +will find one another automatically and will form virtual Ethernet-like +broadcast network. If multicast routing does not work, it is unpleasant +but not fatal flaw. 
The tunnel becomes NBMA rather than broadcast network. +You may disable dynamic ARPing by: +\begin{verbatim} + echo 0 > /proc/sys/net/ipv4/neigh/Universe/mcast_solicit +\end{verbatim} +and to add required information to ARP tables manually: +\begin{verbatim} + ip neigh add 10.0.0.2 lladdr 128.6.190.2 dev Universe nud permanent +\end{verbatim} +In this case packets sent to 10.0.0.2 will be encapsulated in GRE +and sent to 128.6.190.2. It is possible to facilitate address resolution +using methods typical for another NBMA networks f.e.\ to start user +level \verb|arpd| daemon, which will maintain database of hosts attached +to GRE virtual network or ask for information +dedicated ARP or NHRP server. + + +Actually, such setup is the most natural for tunneling, +it is really flexible, scalable and easily manageable, so that +it is strongly recommended to be used with GRE tunnels instead of ugly +hack with NBMA mode and \verb|onlink| modifier. Unfortunately, +for historical reasons broadcast mode is not supported by IPIP tunnels, +but this probably will change in future. + + + +\section{Traffic control issues.} + +Tunnels are devices, hence all the power of Linux traffic control +applies to them. The simplest (and the most useful in practice) +example is limiting tunnel bandwidth. The following command: +\begin{verbatim} + tc qdisc add dev tunl0 root tbf \ + rate 128Kbit burst 4K limit 10K +\end{verbatim} +will limit tunneled traffic to 128Kbit with maximal burst size of 4K +and queuing not more than 10K. + +However, you should remember, that tunnels are {\em virtual} devices +implemented in software and true queue management is impossible for them +just because they have no queues. Instead, it is better to create classes +on real physical interfaces and to map tunneled packets to them. +In general case of dynamic routing you should create such classes +on all outgoing interfaces, or, alternatively, +to use option \verb|dev DEV| to bind tunnel to a fixed physical device.
+In the last case packets will be routed only via specified device +and you need to setup corresponding classes only on it. +Though you have to pay for this convenience, +if routing will change, your tunnel will fail. + +Suppose that CBQ class \verb|1:ABC| has been created on device \verb|eth0| +specially for tunnel \verb|Cisco| with endpoints \verb|S| and \verb|D|. +Now you can select IPIP packets with addresses \verb|S| and \verb|D| +with some classifier and map them to class \verb|1:ABC|. F.e.\ +it is easy to make with \verb|rsvp| classifier: +\begin{verbatim} + tc filter add dev eth0 pref 100 proto ip rsvp \ + session D ipproto ipip filter S \ + classid 1:ABC +\end{verbatim} + +If you want to make more detailed classification of sub-flows +transmitted via tunnel, you can build CBQ subtree, +rooted at \verb|1:ABC| and attach to subroot set of rules parsing +IPIP packets more deeply. + +\end{document} diff --git a/doc/nstat.sgml b/doc/nstat.sgml index e69de29b..be9d8bcc 100644 --- a/doc/nstat.sgml +++ b/doc/nstat.sgml @@ -0,0 +1,110 @@ +<!doctype linuxdoc system> + +<article> + +<title>NSTAT, IFSTAT and RTACCT Utilities +<author>Alexey Kuznetsov, <tt/kuznet@ms2.inr.ac.ru/ +<date>some_negative_number, 20 Sep 2001 +<abstract> +<tt/nstat/, <tt/ifstat/ and <tt/rtacct/ are simple tools helping +to monitor kernel snmp counters and network interface statistics. +</abstract> + +<p> These utilities are very similar, so that I describe +them simultaneously, using name <tt/Xstat/ in the places which apply +to all of them. + +<p>The format of the command is: + +<tscreen><verb> + Xstat [ OPTIONS ] [ PATTERN [ PATTERN ... ] ] +</verb></tscreen> + +<p> +<tt/PATTERN/ is shell style pattern, selecting identifier +of SNMP variables or interfaces to show. Variable is displayed +if one of patterns matches its name. If no patterns are given, +<tt/Xstat/ assumes that user wants to see all the variables.
+ +<p> <tt/OPTIONS/ is list of single letter options, using common unix +conventions. + +<itemize> +<item><tt/-h/ - show help page +<item><tt/-?/ - the same, of course +<item><tt/-v/, <tt/-V/ - print version of <tt/Xstat/ and exit +<item><tt/-z/ - dump zero counters too. By default they are not shown. +<item><tt/-a/ - dump absolute values of counters. By default <tt/Xstat/ + calculates increments since the previous use. +<item><tt/-s/ - do not update history, so that the next time you will + see counters including values accumulated to the moment + of this measurement too. +<item><tt/-n/ - do not display anything, only update history. +<item><tt/-r/ - reset history. +<item><tt/-d INTERVAL/ - <tt/Xstat/ is run in daemon mode collecting + statistics. <tt/INTERVAL/ is interval between measurements + in seconds. +<item><tt/-t INTERVAL/ - time interval to average rates. Default value + is 60 seconds. +<item><tt/-e/ - display extended information about errors (<tt/ifstat/ only). +</itemize> + +<p> +History is just dump saved in file <tt>/tmp/.Xstat.uUID</tt> +or in file given by environment variables <tt/NSTAT_HISTORY/, +<tt/IFSTAT_HISTORY/ and <tt/RTACCT_HISTORY/. +Each time when you use <tt/Xstat/ values there are updated. +If you use patterns, only the values which you _really_ see +are updated. If you want to skip an uninteresting period, +use option <tt/-n/, or just output to <tt>/dev/null</tt>. + +<p> +<tt/Xstat/ understands when history is invalidated by system reboot +or source of information switched between different instances +of daemonic <tt/Xstat/ and kernel SNMP tables and does not +use invalid history. + +<p> Beware, <tt/Xstat/ will not produce sane output, +when many processes use it simultaneously. If several processes +under single user need this utility they should use environment +variables to put their history in safe places +or to use it with options <tt/-a -s/. + +<p> +Well, that's all. The utility is very simple, but nevertheless +very handy.
+ +<p> <bf/Output of XSTAT/ +<p> The first line of output is <tt/#/ followed by identifier +of source of information, it may be word <tt/kernel/, when <tt/Xstat/ +gets information from kernel or some dotted decimal number followed +by parameters, when it obtains information from running <tt/Xstat/ daemon. + +<p>In the case of <tt/nstat/ the rest of output consists of three columns: +SNMP MIB identifier, +its value (or increment since previous measurement) and average +rate of increase of the counter per second. <tt/ifstat/ outputs +interface name followed by pairs of counter and rate of its change. + +<p> <bf/Daemonic Xstat/ +<p> <tt/Xstat/ may be started as daemon by any user. This makes sense +to avoid wrapped counters and to obtain reasonable long counters +for large time. Also <tt/Xstat/ daemon calculates average rates. +For the first goal sampling interval (option <tt/-d/) may be large enough, +f.e. for gigabit rates byte counters overflow not more frequently than +each 40 seconds and you may select interval of 20 seconds. +From the other hand, when <tt/Xstat/ is used for estimating rates +interval should be less than averaging period (option <tt/-t/), otherwise +estimation loses in quality. + +Client <tt/Xstat/, before trying to get information from the kernel, +contacts daemon started by this user, then it tries system wide +daemon, which is supposed to be started by superuser. And only if +none of them replied it gets information from kernel. + +<p> <bf/Environment/ +<p> <tt/NSTAT_HISTORY/ - name of history file for <tt/nstat/. +<p> <tt/IFSTAT_HISTORY/ - name of history file for <tt/ifstat/. +<p> <tt/RTACCT_HISTORY/ - name of history file for <tt/rtacct/. 
+ +</article> diff --git a/doc/preamble.tex b/doc/preamble.tex index e69de29b..80ca5087 100644 --- a/doc/preamble.tex +++ b/doc/preamble.tex @@ -0,0 +1,26 @@ +\textwidth 6.0in +\textheight 8.5in + +\input SNAPSHOT + +\pagestyle{myheadings} +\markboth{\protect\TITLE}{} +\markright{{\protect\sc iproute2-ss\Draft}} + +% To print it in compact form: both sides on one sheet (psnup -2) +\evensidemargin=\oddsidemargin + +\newenvironment{NB}{\bgroup \vskip 1mm\leftskip 1cm \footnotesize \noindent NB. +}{\par\egroup \vskip 1mm} + +\def\threeonly{[2.3.15+ only] } + +\begin{document} + +\makeatletter +\renewcommand{\@oddhead}{{\protect\sc iproute2-ss\Draft} \hfill \protect\arabic{page}} +\makeatother +\let\oldthefootnote\thefootnote +\def\thefootnote{} +\footnotetext{Copyright \copyright~1999 A.N.Kuznetsov} + diff --git a/doc/rtstat.sgml b/doc/rtstat.sgml index e69de29b..07391c39 100644 --- a/doc/rtstat.sgml +++ b/doc/rtstat.sgml @@ -0,0 +1,52 @@ +<!doctype linuxdoc system> + +<article> + +<title>RTACCT Utility +<author>Robert Olsson +<date>some_negative_number, 20 Dec 2001 + +<p> +Here is some code for monitoring the route cache. For systems handling high +network load, servers, routers, firewalls etc the route cache and its garbage +collection is crucial. Linux has a solid implementation. + +<p> +The kernel patch (not required since linux-2.4.7) adds statistics counters +from route cache process into +/proc/net/rt_cache_stat. A companion user mode program presents the statistics +in a vmstat or iostat manner. The ratio between cache hits and misses gives +the flow length. + +<p> +Hopefully it can help understanding performance and DoS and other related +issues. 
+ +<p> A URL where newer versions of this utility can be (probably) found +is ftp://robur.slu.se/pub/Linux/net-development/rt_cache_stat/ + + +<p><bf/Description/ + +<p>The format of the command is: + +<tscreen><verb> + rtstat [ OPTIONS ] +</verb></tscreen> + +<p> <tt/OPTIONS/ are: + +<itemize> + +<item><tt/-h/, <tt/-help/ - show help page and version of the utility. + +<item><tt/-i INTERVAL/ - interval between snapshots, default value is +2 seconds. + +<item><tt/-s NUMBER/ - whether to print header line. 0 inhibits header line, +1 prescribes to print it once and 2 (this is default setting) forces header +line each 20 lines. + +</itemize> + +</article> diff --git a/doc/ss.sgml b/doc/ss.sgml index e69de29b..0b1b5335 100644 --- a/doc/ss.sgml +++ b/doc/ss.sgml @@ -0,0 +1,525 @@ +<!doctype linuxdoc system> + +<article> + +<title>SS Utility: Quick Intro +<author>Alexey Kuznetsov, <tt/kuznet@ms2.inr.ac.ru/ +<date>some_negative_number, 20 Sep 2001 +<abstract> +<tt/ss/ is one another utility to investigate sockets. +Functionally it is NOT better than <tt/netstat/ combined +with some perl/awk scripts and though it is surely faster +it is not enough to make it much better. :-) +So, stop reading this now and do not waste your time. +Well, certainly, it proposes some functionality, which current +netstat is still not able to do, but surely will soon. +</abstract> + +<sect>Why? + +<p> <tt>/proc</tt> interface is inadequate, unfortunately. +When amount of sockets is enough large, <tt/netstat/ or even +plain <tt>cat /proc/net/tcp/</tt> cause nothing but pains and curses. +In linux-2.4 the disease became worse: even if amount +of sockets is small reading <tt>/proc/net/tcp/</tt> is slow enough. + +This utility presents a new approach, which is supposed to scale +well. I am not going to describe technical details here and +will concentrate on description of the command.
+The only important thing to say is that it is not so bad idea +to load module <tt/tcp_diag/, which can be found in directory +<tt/Modules/ of <tt/iproute2/. If you do not make this <tt/ss/ +will work, but it falls back to <tt>/proc</tt> and becomes slow +like <tt/netstat/, well, a bit faster yet (see section "Some numbers"). + +<sect>Old news + +<p> +In the simplest form <tt/ss/ is equivalent to netstat +with some small deviations. + +<itemize> +<item><tt/ss -t -a/ dumps all TCP sockets +<item><tt/ss -u -a/ dumps all UDP sockets +<item><tt/ss -w -a/ dumps all RAW sockets +<item><tt/ss -x -a/ dumps all UNIX sockets +</itemize> + +<p> +Option <tt/-o/ shows TCP timers state. +Option <tt/-e/ shows some extended information. +Etc. etc. etc. Seems, all the options of netstat related to sockets +are supported. Though not AX.25 and other bizarres. :-) +If someone wants, he can make support for decnet and ipx. +Some rudimentary support for them is already present in iproute2 libutils, +and I will be glad to see these new members. + +<p> +However, standard functionality is a bit different: + +<p> +The first: without option <tt/-a/ sockets in states +<tt/TIME-WAIT/ and <tt/SYN-RECV/ are skipped too. +It is more reasonable default, I think. + +<p> +The second: format of UNIX sockets is different. It coincides +with tcp/udp. Though standard kernel still does not allow to +see write/read queues and peer address of connected UNIX sockets, +the patch doing this exists. + +<p> +The third: default is to dump only TCP sockets, rather than all of the types. + +<p> +The next: by default it does not resolve numeric host addresses (like <tt/ip/)! +Resolving is enabled with option <tt/-r/. Service names, usually stored +in local files, are resolved by default. Also, if service database +does not contain references to a port, <tt/ss/ queries system +<tt/rpcbind/. RPC services are prefixed with <tt/rpc./ +Resolution of services may be suppressed with option <tt/-n/. 
+ +<p> +It does not accept "long" options (I dislike them, sorry). +So, address family is given with family identifier following +option <tt/-f/ to be aligned to iproute2 conventions. +Mostly, it is to allow option parser to parse +addresses correctly, but as side effect it really limits dumping +to sockets supporting only given family. Option <tt/-A/ followed +by list of socket tables to dump is also supported. +Logically, id of socket table is different from _address_ family, which is +another point of incompatibility. So, id is one of +<tt/all/, <tt/tcp/, <tt/udp/, +<tt/raw/, <tt/inet/, <tt/unix/, <tt/packet/, <tt/netlink/. See? +Well, <tt/inet/ is just abbreviation for <tt/tcp|udp|raw/ +and it is not difficult to guess that <tt/packet/ allows +to look at packet sockets. Actually, there are also some other abbreviations, +f.e. <tt/unix_dgram/ selects only datagram UNIX sockets. + +<p> +The next: well, I still do not know. :-) + + + + +<sect>Time to talk about new functionality. + +<p>It is builtin filtering of socket lists. + +<sect1> Filtering by state. + +<p> +<tt/ss/ allows to filter socket states, using keywords +<tt/state/ and <tt/exclude/, followed by some state +identifier. + +<p> +State identifiers are standard TCP state names (not listed, +they are useless for you if you already do not know them) +or abbreviations: + +<itemize> +<item><tt/all/ - for all the states +<item><tt/bucket/ - for TCP minisockets (<tt/TIME-WAIT|SYN-RECV/) +<item><tt/big/ - all except for minisockets +<item><tt/connected/ - not closed and not listening +<item><tt/synchronized/ - connected and not <tt/SYN-SENT/ +</itemize> + +<p> + F.e. to dump all tcp sockets except <tt/SYN-RECV/: + +<tscreen><verb> + ss exclude SYN-RECV +</verb></tscreen> + +<p> + If neither <tt/state/ nor <tt/exclude/ directives + are present, + state filter defaults to <tt/all/ with option <tt/-a/ + or to <tt/all/, + excluding listening, syn-recv, time-wait and closed sockets.
+ +<sect1> Filtering by addresses and ports. + +<p> +Option list may contain address/port filter. +It is boolean expression which consists of boolean operation +<tt/or/, <tt/and/, <tt/not/ and predicates. +Actually, all the flavors of names for boolean operations are eaten: +<tt/&/, <tt/&&/, <tt/|/, <tt/||/, <tt/!/, but do not forget +about special sense given to these symbols by unix shells and escape +them correctly, when used from command line. + +<p> +Predicates may be of the following kinds: + +<itemize> +<item>A. Address/port match, where address is checked against mask + and port is either wildcard or exact. It is one of: + +<tscreen><verb> + dst prefix:port + src prefix:port + src unix:STRING + src link:protocol:ifindex + src nl:channel:pid +</verb></tscreen> + + Both prefix and port may be absent or replaced with <tt/*/, + which means wildcard. UNIX socket use more powerful scheme + matching to socket names by shell wildcards. Also, prefixes + unix: and link: may be omitted, if address family is evident + from context (with option <tt/-x/ or with <tt/-f unix/ + or with <tt/unix/ keyword) + +<p> + F.e. + +<tscreen><verb> + dst 10.0.0.1 + dst 10.0.0.1: + dst 10.0.0.1/32: + dst 10.0.0.1:* +</verb></tscreen> + are equivalent and mean socket connected to + any port on host 10.0.0.1 + +<tscreen><verb> + dst 10.0.0.0/24:22 +</verb></tscreen> + sockets connected to port 22 on network + 10.0.0.0...255. + +<p> + Note that port is separated from address with colon, which creates + troubles with IPv6 addresses. Generally, we interpret the last + colon as splitting port. To allow to give IPv6 addresses, + trick like used in IPv6 HTTP URLs may be used: + +<tscreen><verb> + dst [::1] +</verb></tscreen> + are sockets connected to ::1 on any port + +<p> + Another way is <tt/dst ::1/128/. / helps to understand that + colon is part of IPv6 address. + +<p> + Now we can add another alias for <tt/dst 10.0.0.1/: + <tt/dst [10.0.0.1]/. :-) + +<p> Address may be a DNS name.
In this case all the addresses are looked + up (in all the address families, if it is not limited by option <tt/-f/ + or special address prefix <tt/inet:/, <tt/inet6/) and resulting + expression is <tt/or/ over all of them. + +<item> B. Port expressions: +<tscreen><verb> + dport >= :1024 + dport != :22 + sport < :32000 +</verb></tscreen> + etc. + + All the relations: <tt/</, <tt/>/, <tt/=/, <tt/>=/, <tt/=/, <tt/==/, + <tt/!=/, <tt/eq/, <tt/ge/, <tt/lt/, <tt/ne/... + Use variant which you like more, but do not forget to escape special + characters when typing them in command line. :-) + + Note that port number syntactically coincides to the case A! + You may even add an IP address, but it will not participate + in comparison, except for <tt/==/ and <tt/!=/, which are equivalent + to corresponding predicates of type A. F.e. +<p> +<tt/dst 10.0.0.1:22/ + is equivalent to <tt/dport eq 10.0.0.1:22/ + and + <tt/not dst 10.0.0.1:22/ is equivalent to + <tt/dport neq 10.0.0.1:22/ + +<item>C. Keyword <tt/autobound/. It matches to sockets bound automatically + on local system. + +</itemize> + + +<sect> Examples + +<p> +<itemize> +<item>1. List all the tcp sockets in state <tt/FIN-WAIT-1/ for our apache + to network 193.233.7/24 and look at their timers: + +<tscreen><verb> + ss -o state fin-wait-1 \( sport = :http or sport = :https \) \ + dst 193.233.7/24 +</verb></tscreen> + + Oops, forgot to say that missing logical operation is + equivalent to <tt/and/. + +<item> 2. Well, now look at the rest... + +<tscreen><verb> + ss -o excl fin-wait-1 + ss state fin-wait-1 \( sport neq :http and sport neq :https \) \ + or not dst 193.233.7/24 +</verb></tscreen> + + Note that we have to do _two_ calls of ss to do this. + State match is always anded to address/port match.
+ The reason for this is purely technical: ss does fast skip of + not matching states before parsing addresses and I consider the + ability to skip fastly gobs of time-wait and syn-recv sockets + as more important than logical generality. + +<item> 3. So, let's look at all our sockets using autobound ports: + +<tscreen><verb> + ss -a -A all autobound +</verb></tscreen> + + +<item> 4. And eventually find all the local processes connected + to local X servers: + +<tscreen><verb> + ss -xp dst "/tmp/.X11-unix/*" +</verb></tscreen> + + Pardon, this does not work with current kernel, patching is required. + But we still can look at server side: + +<tscreen><verb> + ss -x src "/tmp/.X11-unix/*" +</verb></tscreen> + +</itemize> + + +<sect> Returning to ground: real manual + +<p> +<sect1> Command arguments + +<p> General format of arguments to <tt/ss/ is: + +<tscreen><verb> + ss [ OPTIONS ] [ STATE-FILTER ] [ ADDRESS-FILTER ] +</verb></tscreen> + +<sect2><tt/OPTIONS/ +<p> <tt/OPTIONS/ is list of single letter options, using common unix +conventions. + +<itemize> +<item><tt/-h/ - show help page +<item><tt/-?/ - the same, of course +<item><tt/-v/, <tt/-V/ - print version of <tt/ss/ and exit +<item><tt/-s/ - print summary statistics. This option does not parse +socket lists obtaining summary from various sources. It is useful +when amount of sockets is so huge that parsing <tt>/proc/net/tcp</tt> +is painful. +<item><tt/-D FILE/ - do not display anything, just dump raw information +about TCP sockets to <tt/FILE/ after applying filters. If <tt/FILE/ is <tt/-/ +<tt/stdout/ is used. +<item><tt/-F FILE/ - read continuation of filter from <tt/FILE/. +Each line of <tt/FILE/ is interpreted like single command line option. +If <tt/FILE/ is <tt/-/ <tt/stdin/ is used. +<item><tt/-r/ - try to resolve numeric address/ports +<item><tt/-n/ - do not try to resolve ports +<item><tt/-o/ - show some optional information, f.e. 
TCP timers +<item><tt/-i/ - show some information specific to TCP (RTO, congestion +window, slow start threshold etc.) +<item><tt/-e/ - show even more optional information +<item><tt/-m/ - show extended information on memory used by the socket. +It is available only with <tt/tcp_diag/ enabled. +<item><tt/-p/ - show list of processes owning the socket +<item><tt/-f FAMILY/ - default address family used for parsing addresses. + Also this option limits listing to sockets supporting + given address family. Currently the following families + are supported: <tt/unix/, <tt/inet/, <tt/inet6/, <tt/link/, + <tt/netlink/. +<item><tt/-4/ - alias for <tt/-f inet/ +<item><tt/-6/ - alias for <tt/-f inet6/ +<item><tt/-0/ - alias for <tt/-f link/ +<item><tt/-A LIST-OF-TABLES/ - list of socket tables to dump, separated + by commas. The following identifiers are understood: + <tt/all/, <tt/inet/, <tt/tcp/, <tt/udp/, <tt/raw/, + <tt/unix/, <tt/packet/, <tt/netlink/, <tt/unix_dgram/, + <tt/unix_stream/, <tt/packet_raw/, <tt/packet_dgram/. +<item><tt/-x/ - alias for <tt/-A unix/ +<item><tt/-t/ - alias for <tt/-A tcp/ +<item><tt/-u/ - alias for <tt/-A udp/ +<item><tt/-w/ - alias for <tt/-A raw/ +<item><tt/-a/ - show sockets of all the states. By default sockets + in states <tt/LISTEN/, <tt/TIME-WAIT/, <tt/SYN_RECV/ + and <tt/CLOSE/ are skipped. +<item><tt/-l/ - show only sockets in state <tt/LISTEN/ +</itemize> + +<sect2><tt/STATE-FILTER/ + +<p><tt/STATE-FILTER/ allows to construct arbitrary set of +states to match. Its syntax is sequence of keywords <tt/state/ +and <tt/exclude/ followed by identifier of state. +Available identifiers are: + +<p> +<itemize> +<item> All standard TCP states: <tt/established/, <tt/syn-sent/, +<tt/syn-recv/, <tt/fin-wait-1/, <tt/fin-wait-2/, <tt/time-wait/, +<tt/closed/, <tt/close-wait/, <tt/last-ack/, <tt/listen/ and <tt/closing/.
+ +<item><tt/all/ - for all the states +<item><tt/connected/ - all the states except for <tt/listen/ and <tt/closed/ +<item><tt/synchronized/ - all the <tt/connected/ states except for +<tt/syn-sent/ +<item><tt/bucket/ - states, which are maintained as minisockets, i.e. +<tt/time-wait/ and <tt/syn-recv/. +<item><tt/big/ - opposite to <tt/bucket/ +</itemize> + +<sect2><tt/ADDRESS_FILTER/ + +<p><tt/ADDRESS_FILTER/ is boolean expression with operations <tt/and/, <tt/or/ +and <tt/not/, which can be abbreviated in C style f.e. as <tt/&/, +<tt/&&/. + +<p> +Predicates check socket addresses, both local and remote. +There are the following kinds of predicates: + +<itemize> +<item> <tt/dst ADDRESS_PATTERN/ - matches remote address and port +<item> <tt/src ADDRESS_PATTERN/ - matches local address and port +<item> <tt/dport RELOP PORT/ - compares remote port to a number +<item> <tt/sport RELOP PORT/ - compares local port to a number +<item> <tt/autobound/ - checks that socket is bound to an ephemeral + port +</itemize> + +<p><tt/RELOP/ is some of <tt/<=/, <tt/>=/, <tt/==/ etc. +To make this more convenient for use in unix shell, alphabetic +FORTRAN-like notations <tt/le/, <tt/gt/ etc. are accepted as well. + +<p>The format and semantics of <tt/ADDRESS_PATTERN/ depends on address +family. + +<itemize> +<item><tt/inet/ - <tt/ADDRESS_PATTERN/ consists of IP prefix, optionally +followed by colon and port. If prefix or port part is absent or replaced +with <tt/*/, this means wildcard match. +<item><tt/inet6/ - The same as <tt/inet/, only prefix refers to an IPv6 +address. Unlike <tt/inet/ colon becomes ambiguous, so that <tt/ss/ allows +to use scheme, like used in URLs, where address is surrounded with +<tt/[/ ... <tt/]/. +<item><tt/unix/ - <tt/ADDRESS_PATTERN/ is shell-style wildcard. +<item><tt/packet/ - format looks like <tt/inet/, only interface index +stays instead of port and link layer protocol id instead of address.
+<item><tt/netlink/ - format looks like <tt/inet/, only socket pid +stays instead of port and netlink channel instead of address. +</itemize> + +<p><tt/PORT/ is syntactically <tt/ADDRESS_PATTERN/ with wildcard +address part. Certainly, it is undefined for UNIX sockets. + +<sect1> Environment variables + +<p> +<tt/ss/ allows to change source of information using various +environment variables: + +<p> +<itemize> +<item> <tt/PROC_SLABINFO/ to override <tt>/proc/slabinfo</tt> +<item> <tt/PROC_NET_TCP/ to override <tt>/proc/net/tcp</tt> +<item> <tt/PROC_NET_UDP/ to override <tt>/proc/net/udp</tt> +<item> etc. +</itemize> + +<p> +Variable <tt/PROC_ROOT/ allows to change root of all the <tt>/proc/</tt> +hierarchy. + +<p> +Variable <tt/TCPDIAG_FILE/ prescribes to open a file instead of +requesting kernel to dump information about TCP sockets. + + +<p> This option is used mainly to investigate bug reports, +when dumps of files usually found in <tt>/proc/</tt> are received +by e-mail. + +<sect1> Output format + +<p>Six columns. The first is <tt/Netid/, it denotes socket type and +transport protocol, when it is ambiguous: <tt/tcp/, <tt/udp/, <tt/raw/, +<tt/u_str/ is abbreviation for <tt/unix_stream/, <tt/u_dgr/ for UNIX +datagram sockets, <tt/nl/ for netlink, <tt/p_raw/ and <tt/p_dgr/ for +raw and datagram packet sockets. This column is optional, it will +be hidden, if filter selects a unique netid. + +<p> +The second column is <tt/State/. Socket state is displayed here. +The names are standard TCP names, except for <tt/UNCONN/, which +cannot happen for TCP, but normal for not connected sockets +of other types. Again, this column can be hidden. + +<p> +Then two columns (<tt/Recv-Q/ and <tt/Send-Q/) showing amount of data +queued for receive and transmit. + +<p> +And the last two columns display local address and port of the socket +and its peer address, if the socket is connected.
+ +<p> +If options <tt/-o/, <tt/-e/ or <tt/-p/ were given, options are +displayed not in fixed positions but separated by spaces pairs: +<tt/option:value/. If value is not a single number, it is presented +as list of values, enclosed to <tt/(/ ... <tt/)/ and separated with +commas. F.e. + +<tscreen><verb> + timer:(keepalive,111min,0) +</verb></tscreen> +is typical format for TCP timer (option <tt/-o/). + +<tscreen><verb> + users:((X,113,3)) +</verb></tscreen> +is typical for list of users (option <tt/-p/). + + +<sect>Some numbers + +<p> +Well, let us use <tt/pidentd/ and a tool <tt/ibench/ to measure +its performance. It is 30 requests per second here. Nothing to test, +it is too slow. OK, let us patch pidentd with patch from directory +Patches. After this it handles about 4300 requests per second +and becomes handy tool to pollute socket tables with lots of timewait +buckets. + +<p> +So, each test starts from pollution tables with 30000 sockets +and then doing full dump of the table piped to wc and measuring +timings with time: + +<p>Results: + +<itemize> +<item> <tt/netstat -at/ - 15.6 seconds +<item> <tt/ss -atr/, but without <tt/tcp_diag/ - 5.4 seconds +<item> <tt/ss -atr/ with <tt/tcp_diag/ - 0.47 seconds +</itemize> + +No comments. Though one comment is necessary, most of time +without <tt/tcp_diag/ is wasted inside kernel with completely +blocked networking. More than 10 seconds, yes. <tt/tcp_diag/ +does the same work for 100 milliseconds of system time. + +</article> diff --git a/etc/iproute2/rt_dsfield b/etc/iproute2/rt_dsfield index e69de29b..110061a8 100644 --- a/etc/iproute2/rt_dsfield +++ b/etc/iproute2/rt_dsfield @@ -0,0 +1,13 @@ +0x10 lowdelay +0x08 throughput +0x04 reliability +# This value overlap with ECT, do not use it! +0x02 mincost +# These values seems do not want to die, Cisco likes them by a strange reason. 
+0x20 priority +0x40 immediate +0x60 flash +0x80 flash-override +0xa0 critical +0xc0 internet +0xe0 network diff --git a/etc/iproute2/rt_protos b/etc/iproute2/rt_protos index e69de29b..8c985d79 100644 --- a/etc/iproute2/rt_protos +++ b/etc/iproute2/rt_protos @@ -0,0 +1,25 @@ +# +# Reserved protocols. +# +0 unspec +1 redirect +2 kernel +3 boot +4 static +8 gated +9 ra +10 mrt +11 zebra +12 bird +# +# Used by me for gated +# +254 gated/aggr +253 gated/bgp +252 gated/ospf +251 gated/ospfase +250 gated/rip +249 gated/static +248 gated/conn +247 gated/inet +246 gated/default diff --git a/etc/iproute2/rt_realms b/etc/iproute2/rt_realms index e69de29b..eedd76d2 100644 --- a/etc/iproute2/rt_realms +++ b/etc/iproute2/rt_realms @@ -0,0 +1,13 @@ +# +# reserved values +# +0 cosmos +# +# local +# +#1 inr.ac +#2 inr.ruhep +#3 freenet +#4 radio-msu +#5 russia +#6 internet diff --git a/etc/iproute2/rt_scopes b/etc/iproute2/rt_scopes index e69de29b..8514bc11 100644 --- a/etc/iproute2/rt_scopes +++ b/etc/iproute2/rt_scopes @@ -0,0 +1,11 @@ +# +# reserved values +# +0 global +255 nowhere +254 host +253 link +# +# pseudo-reserved +# +200 site diff --git a/etc/iproute2/rt_tables b/etc/iproute2/rt_tables index e69de29b..541abfd2 100644 --- a/etc/iproute2/rt_tables +++ b/etc/iproute2/rt_tables @@ -0,0 +1,11 @@ +# +# reserved values +# +255 local +254 main +253 default +0 unspec +# +# local +# +#1 inr.ruhep diff --git a/examples/SYN-DoS.rate.limit b/examples/SYN-DoS.rate.limit index e69de29b..8766b679 100644 --- a/examples/SYN-DoS.rate.limit +++ b/examples/SYN-DoS.rate.limit @@ -0,0 +1,49 @@ +#! /bin/sh -x +# +# sample script on using the ingress capabilities +# this script shows how one can rate limit incoming SYNs +# Useful for TCP-SYN attack protection. You can use +# IPchains to have more powerful additions to the SYN (eg +# in addition the subnet) +# +#path to various utilities; +#change to reflect yours. 
+# +IPROUTE=/root/DS-6-beta/iproute2-990530-dsing +TC=$IPROUTE/tc/tc +IP=$IPROUTE/ip/ip +IPCHAINS=/root/DS-6-beta/ipchains-1.3.9/ipchains +INDEV=eth2 +# +# tag all incoming SYN packets through $INDEV as mark value 1 +############################################################ +$IPCHAINS -A input -i $INDEV -y -m 1 +############################################################ +# +# install the ingress qdisc on the ingress interface +############################################################ +$TC qdisc add dev $INDEV handle ffff: ingress +############################################################ + +# +# +# SYN packets are 40 bytes (320 bits) so three SYNs equals +# 960 bits (approximately 1kbit); so we rate limit below +# the incoming SYNs to 3/sec (not very useful really; but +#serves to show the point - JHS +############################################################ +$TC filter add dev $INDEV parent ffff: protocol ip prio 50 handle 1 fw \ +police rate 1kbit burst 40 mtu 9k drop flowid :1 +############################################################ + + +# +echo "---- qdisc parameters Ingress ----------" +$TC qdisc ls dev $INDEV +echo "---- Class parameters Ingress ----------" +$TC class ls dev $INDEV +echo "---- filter parameters Ingress ----------" +$TC filter ls dev $INDEV parent ffff: + +#deleting the ingress qdisc +#$TC qdisc del dev $INDEV ingress diff --git a/examples/cbqinit.eth1 b/examples/cbqinit.eth1 index e69de29b..226ec1c5 100644 --- a/examples/cbqinit.eth1 +++ b/examples/cbqinit.eth1 @@ -0,0 +1,76 @@ +#! /bin/sh + +TC=/home/root/tc +IP=/home/root/ip +DEVICE=eth1 +BANDWIDTH="bandwidth 10Mbit" + +# Attach CBQ on $DEVICE. It will have handle 1:. +# $BANDWIDTH is real $DEVICE bandwidth (10Mbit). +# avpkt is average packet size. +# mpu is minimal packet size. + +$TC qdisc add dev $DEVICE root handle 1: cbq \ +$BANDWIDTH avpkt 1000 mpu 64 + +# Create root class with classid 1:1. This step is not necessary. +# bandwidth is the same as on CBQ itself.
+# rate == all the bandwidth +# allot is MTU + MAC header +# maxburst measures allowed class burstiness (please, read S.Floyd and VJ papers) +# est 1sec 8sec means, that kernel will evaluate average rate +# on this class with period 1sec and time constant 8sec. +# This rate is viewed with "tc -s class ls dev $DEVICE" + +$TC class add dev $DEVICE parent 1:0 classid :1 est 1sec 8sec cbq \ +$BANDWIDTH rate 10Mbit allot 1514 maxburst 50 avpkt 1000 + +# Bulk. +# New parameters are: +# weight, which is set to be proportional to +# "rate". It is not necessary, weight=1 will work as well. +# defmap and split say that best effort traffic, not classified +# by another means will fall to this class. + +$TC class add dev $DEVICE parent 1:1 classid :2 est 1sec 8sec cbq \ +$BANDWIDTH rate 4Mbit allot 1514 weight 500Kbit \ +prio 6 maxburst 50 avpkt 1000 split 1:0 defmap ff3d + +# OPTIONAL. +# Attach "sfq" qdisc to this class, quantum is MTU, perturb +# gives period of hash function perturbation in seconds. +# +$TC qdisc add dev $DEVICE parent 1:2 sfq quantum 1514b perturb 15 + +# Interactive-burst class + +$TC class add dev $DEVICE parent 1:1 classid :3 est 2sec 16sec cbq \ +$BANDWIDTH rate 1Mbit allot 1514 weight 100Kbit \ +prio 2 maxburst 100 avpkt 1000 split 1:0 defmap c0 + +$TC qdisc add dev $DEVICE parent 1:3 sfq quantum 1514b perturb 15 + +# Background. + +$TC class add dev $DEVICE parent 1:1 classid :4 est 1sec 8sec cbq \ + $BANDWIDTH rate 100Kbit allot 1514 weight 10Mbit \ + prio 7 maxburst 10 avpkt 1000 split 1:0 defmap 2 + +$TC qdisc add dev $DEVICE parent 1:4 sfq quantum 1514b perturb 15 + +# Realtime class for RSVP + +$TC class add dev $DEVICE parent 1:1 classid 1:7FFE cbq \ +rate 5Mbit $BANDWIDTH allot 1514b avpkt 1000 \ +maxburst 20 + +# Reclassified realtime traffic +# +# New element: split is not 1:0, but 1:7FFE. It means, +# that only real-time packets, which violated policing filters +# or exceeded reshaping buffers will fall to it.
+ +$TC class add dev $DEVICE parent 1:7FFE classid 1:7FFF est 4sec 32sec cbq \ +rate 1Mbit $BANDWIDTH allot 1514b avpkt 1000 weight 10Kbit \ +prio 6 maxburst 10 split 1:7FFE defmap ffff + diff --git a/examples/dhcp-client-script b/examples/dhcp-client-script index e69de29b..7207b57d 100644 --- a/examples/dhcp-client-script +++ b/examples/dhcp-client-script @@ -0,0 +1,446 @@ +#!/bin/bash +# +# dhclient-script for Linux. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version +# 2 of the License, or (at your option) any later version. +# +# Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> +# +# Probably, I did not understand, what this funny feature as "alias" +# means exactly. For now I suppose, that it is a static address, which +# we should install and preserve. +# + +exec >> /tmp/DHS.log 2>&1 + +echo dhc-script $* reason=$reason +set | grep "^\(old_\|new_\|check_\)" + +LOG () { + echo LOG $* ; +} + +# convert 8bit mask to length +# arg: $1 = mask +# +Mask8ToLen() { + local l=0; + + while [ $l -le 7 ]; do + if [ $[ ( 1 << $l ) + $1 ] -eq 256 ]; then + return $[ 8 - $l ] + fi + l=$[ $l + 1 ] + done + return 0; +} + +# convert inet dotted quad mask to length +# arg: $1 = dotquad mask +# +MaskToLen() { + local masklen=0 + local mask8=$1 + + case $1 in + 0.0.0.0) + return 0; + ;; + 255.*.0.0) + masklen=8 + mask8=${mask8#255.} + mask8=${mask8%.0.0} + ;; + 255.255.*.0) + masklen=16 + mask8=${mask8#255.255.} + mask8=${mask8%.0} + ;; + 255.255.255.*) + masklen=24 + mask8=${mask8#255.255.255.} + ;; + *) + return 255 + ;; + esac + Mask8ToLen $mask8 + return $[ $? 
+ $masklen ] +} + +# calculate ABC "natural" mask +# arg: $1 = dotquad address +# +ABCMask () { + local class; + + class=${1%%.*} + + if [ "$1" = "255.255.255.255" ]; then + echo $1 + elif [ "$1" = "0.0.0.0" ]; then + echo $1 + elif [ $class -ge 224 ]; then + echo 240.0.0.0 + elif [ $class -ge 192 ]; then + echo 255.255.255.0 + elif [ $class -ge 128 ]; then + echo 255.255.0.0 + else + echo 255.0.0.0 + fi +} + +# calculate ABC "natural" mask length +# arg: $1 = dotquad address +# +ABCMaskLen () { + local class; + + class=${1%%.*} + + if [ "$1" = "255.255.255.255" ]; then + return 32 + elif [ "$1" = "0.0.0.0" ]; then + return 0 + elif [ $class -ge 224 ]; then + return 4; + elif [ $class -ge 192 ]; then + return 24; + elif [ $class -ge 128 ]; then + return 16; + else + return 8; + fi +} + +# Delete IP address +# args: $1 = interface +# $2 = address +# $3 = mask +# $4 = broadcast +# $5 = label +# +DelINETAddr () { + local masklen=32 + local addrid=$1 + + LOG DelINETAddr $* + + if [ "$5" ]; then + addrid=$addrid:$5 + fi + LOG ifconfig $addrid down + ifconfig $addrid down +} + +# Add IP address +# args: $1 = interface +# $2 = address +# $3 = mask +# $4 = broadcast +# $5 = label +# +AddINETAddr () { + local mask_arg + local brd_arg + local addrid=$1 + + LOG AddINETAddr $* + + if [ "$5" ]; then + addrid=$addrid:$5 + fi + if [ "$3" ]; then + mask_arg="netmask $3" + fi + if [ "$4" ]; then + brd_arg="broadcast $4" + fi + + LOG ifconfig $addrid $2 $mask_arg $brd_arg up + ifconfig $addrid $2 $mask_arg $brd_arg up +} + +# Add default routes +# args: $1 = routers list +# +AddDefaultRoutes() { + local router + + if [ "$1" ]; then + LOG AddDefaultRoutes $* + for router in $1; do + LOG route add default gw $router + route add default gw $router + done ; + fi +} + +# Delete default routes +# args: $1 = routers list +# +DelDefaultRoutes() { + local router + + if [ "$1" ]; then + LOG DelDefaultRoutes $* + + for router in $1; do + LOG route del default gw $router + route del default gw 
$router + done + fi +} + +# ping a host +# args: $1 = dotquad address of the host +# +PingNode() { + LOG PingNode $* + if ping -q -c 1 -w 2 $1 ; then + return 0; + fi + return 1; +} + +# Check (and add route, if alive) default routers +# args: $1 = routers list +# returns: 0 if at least one router is alive. +# +CheckRouterList() { + local router + local succeed=1 + + LOG CheckRouterList $* + + for router in $1; do + if PingNode $router ; then + succeed=0 + route add default gw $router + fi + done + return $succeed +} + +# Delete/create static routes. +# args: $1 = operation (del/add) +# $2 = routes list in format "dst1 nexthop1 dst2 ..." +# +# BEWARE: this feature of DHCP is obsolete, because does not +# support subnetting. +# +X-StaticRouteList() { + local op=$1 + local lst="$2" + local masklen + + LOG X-StaticRouteList $* + + if [ "$lst" ]; then + set $lst + while [ $# -gt 1 ]; do + route $op -net $1 netmask `ABCMask "$1"` gw $2 + shift; shift; + done + fi +} + +# Create static routes. +# arg: $1 = routes list in format "dst1 nexthop1 dst2 ..." +# +AddStaticRouteList() { + LOG AddStaticRouteList $* + X-StaticRouteList add "$1" +} + +# Delete static routes. +# arg: $1 = routes list in format "dst1 nexthop1 dst2 ..." +# +DelStaticRouteList() { + LOG DelStaticRouteList $* + X-StaticRouteList del "$1" +} + +# Broadcast unsolicited ARP to update neighbours' caches. +# args: $1 = interface +# $2 = address +# +UnsolicitedARP() { + if [ -f /sbin/arping ]; then + /sbin/arping -A -c 1 -I "$1" "$2" & + (sleep 2 ; /sbin/arping -U -c 1 -I "$1" "$2" ) & + fi +} + +# Duplicate address detection. +# args: $1 = interface +# $2 = test address +# returns: 0, if DAD succeeded. +DAD() { + if [ -f /sbin/arping ]; then + /sbin/arping -c 2 -w 3 -D -I "$1" "$2" + return $? + fi + return 0 +} + + +# Setup resolver. +# args: NO +# domain and nameserver list are passed in global variables. +# +# NOTE: we try to be careful and not to break user supplied resolv.conf. 
+# The script mangles it, only if it has dhcp magic signature. +# +UpdateDNS() { + local nameserver + local idstring="#### Generated by DHCPCD" + + LOG UpdateDNS $* + + if [ "$new_domain_name" = "" -a "$new_domain_name_servers" = "" ]; then + return 0; + fi + + echo $idstring > /etc/resolv.conf.dhcp + if [ "$new_domain_name" ]; then + echo search $new_domain_name >> /etc/resolv.conf.dhcp + fi + echo options ndots:1 >> /etc/resolv.conf.dhcp + + if [ "$new_domain_name_servers" ]; then + for nameserver in $new_domain_name_servers; do + echo nameserver $nameserver >> /etc/resolv.conf.dhcp + done + else + echo nameserver 127.0.0.1 >> /etc/resolv.conf.dhcp + fi + + if [ -f /etc/resolv.conf ]; then + if [ "`head -1 /etc/resolv.conf`" != "$idstring" ]; then + return 0 + fi + if [ "$old_domain_name" = "$new_domain_name" -a \ + "$new_domain_name_servers" = "$old_domain_name_servers" ]; then + return 0 + fi + fi + mv /etc/resolv.conf.dhcp /etc/resolv.conf +} + +case $reason in +NBI) + exit 1 + ;; + +MEDIUM) + exit 0 + ;; + +PREINIT) + ifconfig $interface:dhcp down + ifconfig $interface:dhcp1 down + if [ -d /proc/sys/net/ipv4/conf/$interface ]; then + ifconfig $interface:dhcp 10.10.10.10 netmask 255.255.255.255 + ifconfig $interface:dhcp down + if [ -d /proc/sys/net/ipv4/conf/$interface ]; then + LOG The interface $interface already configured.
+ fi + fi + ifconfig $interface:dhcp up + exit 0 + ;; + +ARPSEND) + exit 0 + ;; + +ARPCHECK) + if DAD "$interface" "$check_ip_address" ; then + exit 0 + fi + exit 1 + ;; + +BOUND|RENEW|REBIND|REBOOT) + if [ "$old_ip_address" -a "$alias_ip_address" -a \ + "$alias_ip_address" != "$old_ip_address" ]; then + DelINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1 + fi + if [ "$old_ip_address" -a "$old_ip_address" != "$new_ip_address" ]; then + DelINETAddr "$interface" "$old_ip_address" "$old_subnet_mask" "$old_broadcast_address" dhcp + DelDefaultRoutes "$old_routers" + DelStaticRouteList "$old_static_routes" + fi + if [ "$old_ip_address" = "" -o "$old_ip_address" != "$new_ip_address" -o \ + "$reason" = "BOUND" -o "$reason" = "REBOOT" ]; then + AddINETAddr "$interface" "$new_ip_address" "$new_subnet_mask" "$new_broadcast_address" dhcp + AddStaticRouteList "$new_static_routes" + AddDefaultRoutes "$new_routers" + UnsolicitedARP "$interface" "$new_ip_address" + fi + if [ "$new_ip_address" != "$alias_ip_address" -a "$alias_ip_address" ]; then + AddINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1 + fi + UpdateDNS + exit 0 + ;; + +EXPIRE|FAIL) + if [ "$alias_ip_address" ]; then + DelINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1 + fi + if [ "$old_ip_address" ]; then + DelINETAddr "$interface" "$old_ip_address" "$old_subnet_mask" "$old_broadcast_address" dhcp + DelDefaultRoutes "$old_routers" + DelStaticRouteList "$old_static_routes" + fi + if [ "$alias_ip_address" ]; then + AddINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1 + fi + exit 0 + ;; + +TIMEOUT) + if [ "$alias_ip_address" ]; then + DelINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1 + fi +# Seems, <null address> means, that no more old leases found. +# Or does it mean bug in dhcpcd? 
8) Fail for now. + if [ "$new_ip_address" = "<null address>" ]; then + if [ "$old_ip_address" ]; then + DelINETAddr "$interface" "$old_ip_address" "$old_subnet_mask" "$old_broadcast_address" dhcp + fi + if [ "$alias_ip_address" ]; then + AddINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1 + fi + exit 1 + fi + if DAD "$interface" "$new_ip_address" ; then + AddINETAddr "$interface" "$new_ip_address" "$new_subnet_mask" "$new_broadcast_address" dhcp + UnsolicitedARP "$interface" "$new_ip_address" + if [ "$alias_ip_address" -a "$alias_ip_address" != "$new_ip_address" ]; then + AddINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1 + UnsolicitedARP "$interface" "$alias_ip_address" + fi + if CheckRouterList "$new_routers" ; then + AddStaticRouteList "$new_static_routes" + UpdateDNS + exit 0 + fi + fi + DelINETAddr "$interface" "$new_ip_address" "$new_subnet_mask" "$new_broadcast_address" dhcp + DelDefaultRoutes "$old_routers" + DelStaticRouteList "$old_static_routes" + if [ "$alias_ip_address" ]; then + AddINETAddr "$interface" "$alias_ip_address" "$alias_subnet_mask" "$alias_broadcast_address" dhcp1 + fi + exit 1 + ;; +esac + +exit 0 diff --git a/examples/diffserv/Edge1 b/examples/diffserv/Edge1 index e69de29b..4ddffdd1 100644 --- a/examples/diffserv/Edge1 +++ b/examples/diffserv/Edge1 @@ -0,0 +1,68 @@ +#! /bin/sh -x +# +# sample script on using the ingress capabilities +# This script just tags on the ingress interfac using Ipchains +# the result is used for fast classification and re-marking +# on the egress interface +# +#path to various utilities; +#change to reflect yours. 
+# +IPROUTE=/root/DS-6-beta/iproute2-990530-dsing +TC=$IPROUTE/tc/tc +IP=$IPROUTE/ip/ip +IPCHAINS=/root/DS-6-beta/ipchains-1.3.9/ipchains +INDEV=eth2 +EGDEV="dev eth1" +# +# tag all incoming packets from host 10.2.0.24 to value 1 +# tag all incoming packets from host 10.2.0.3 to value 2 +# tag the rest of incoming packets from subnet 10.2.0.0/24 to value 3 +#These values are used in the egress +# +############################################################ +$IPCHAINS -A input -s 10.2.0.0/24 -m 3 +$IPCHAINS -A input -i $INDEV -s 10.2.0.24 -m 1 +$IPCHAINS -A input -i $INDEV -s 10.2.0.3 -m 2 + +######################## Egress side ######################## + + +# attach a dsmarker +# +$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64 set_tc_index +# +# values of the DSCP to change depending on the class +# +#becomes EF +$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \ + value 0xb8 +#becomes AF11 +$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \ + value 0x28 +#becomes AF21 +$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \ + value 0x48 +# +# +# The class mapping +# +$TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle 1 fw classid 1:1 +$TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle 2 fw classid 1:2 +$TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle 3 fw classid 1:3 +# + +# +echo "---- qdisc parameters Ingress ----------" +$TC qdisc ls dev $INDEV +echo "---- Class parameters Ingress ----------" +$TC class ls dev $INDEV +echo "---- filter parameters Ingress ----------" +$TC filter ls dev $INDEV parent 1:0 + +echo "---- qdisc parameters Egress ----------" +$TC qdisc ls $EGDEV +echo "---- Class parameters Egress ----------" +$TC class ls $EGDEV +echo "---- filter parameters Egress ----------" +$TC filter ls $EGDEV parent 1:0 diff --git a/examples/diffserv/Edge2 b/examples/diffserv/Edge2 index e69de29b..2f78da24 100644 --- a/examples/diffserv/Edge2 +++ b/examples/diffserv/Edge2 @@ -0,0 +1,87 @@ +#!
/bin/sh -x +# +# sample script on using the ingress capabilities +# This script tags the fwmark on the ingress interface using IPchains +# the result is used first for policing on the Ingress interface then +# for fast classification and re-marking +# on the egress interface +# +#path to various utilities; +#change to reflect yours. +# +IPROUTE=/root/DS-6-beta/iproute2-990530-dsing +TC=$IPROUTE/tc/tc +IP=$IPROUTE/ip/ip +IPCHAINS=/root/DS-6-beta/ipchains-1.3.9/ipchains +INDEV=eth2 +EGDEV="dev eth1" +# +# tag all incoming packets from host 10.2.0.24 to value 1 +# tag all incoming packets from host 10.2.0.3 to value 2 +# tag the rest of incoming packets from subnet 10.2.0.0/24 to value 3 +#These values are used in the egress +############################################################ +$IPCHAINS -A input -s 10.2.0.0/24 -m 3 +$IPCHAINS -A input -i $INDEV -s 10.2.0.24 -m 1 +$IPCHAINS -A input -i $INDEV -s 10.2.0.3 -m 2 +############################################################ +# +# install the ingress qdisc on the ingress interface +############################################################ +$TC qdisc add dev $INDEV handle ffff: ingress +############################################################ + +# +# attach a fw classifier to the ingress which polices anything marked +# by ipchains to tag value 3 (The rest of the subnet packets -- not +# tag 1 or 2) to not go beyond 1.5Mbps +# Allow up to at least 60 packets to burst (assuming maximum packet +# size of 1.5 KB) in the long run and up to about 6 packets in the +# short run + +############################################################ +$TC filter add dev $INDEV parent ffff: protocol ip prio 50 handle 3 fw \ +police rate 1500kbit burst 90k mtu 9k drop flowid :1 +############################################################ + +######################## Egress side ######################## + + +# attach a dsmarker +# +$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64 +# +# values of the DSCP to change
depending on the class +# +$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \ + value 0xb8 +$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \ + value 0x28 +$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \ + value 0x48 +# +# +# The class mapping +# +$TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle 1 fw classid 1:1 +$TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle 2 fw classid 1:2 +$TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle 3 fw classid 1:3 +# + +# +echo "---- qdisc parameters Ingress ----------" +$TC qdisc ls dev $INDEV +echo "---- Class parameters Ingress ----------" +$TC class ls dev $INDEV +echo "---- filter parameters Ingress ----------" +$TC filter ls dev $INDEV parent ffff: + +echo "---- qdisc parameters Egress ----------" +$TC qdisc ls $EGDEV +echo "---- Class parameters Egress ----------" +$TC class ls $EGDEV +echo "---- filter parameters Egress ----------" +$TC filter ls $EGDEV parent 1:0 +# +#deleting the ingress qdisc +#$TC qdisc del $DEV ingress diff --git a/examples/diffserv/Edge31-ca-u32 b/examples/diffserv/Edge31-ca-u32 index e69de29b..25e6c0b1 100644 --- a/examples/diffserv/Edge31-ca-u32 +++ b/examples/diffserv/Edge31-ca-u32 @@ -0,0 +1,170 @@ +#! /bin/sh -x +# +# sample script on using the ingress capabilities using u32 classifier +# This script tags tcindex based on metering on the ingress +# interface the result is used for fast classification and re-marking +# on the egress interface +# This is an example of a color aware mode marker with PIR configured +# based on draft-wahjak-mcm-00.txt (section 3.1) +# +# The colors are defined using the Diffserv Fields +#path to various utilities; +#change to reflect yours. 
+# +IPROUTE=/usr/src/iproute2-current +TC=$IPROUTE/tc/tc +IP=$IPROUTE/ip/ip +INDEV=eth0 +EGDEV="dev eth1" +CIR1=1500kbit +CIR2=1000kbit + +#The CBS is about 60 MTU sized packets +CBS1=90k +CBS2=90k + +############################################################ +# +# install the ingress qdisc on the ingress interface +$TC qdisc add dev $INDEV handle ffff: ingress +############################################################ +# +# Create u32 filters +$TC filter add dev $INDEV parent ffff: protocol ip prio 4 handle 1: u32 \ +divisor 1 +############################################################ + +# The meters: Note that we have shared meters in this case as identified +# by the index parameter +meter1=" police index 1 rate $CIR1 burst $CBS1 " +meter2=" police index 2 rate $CIR2 burst $CBS1 " +meter3=" police index 3 rate $CIR2 burst $CBS2 " +meter4=" police index 4 rate $CIR1 burst $CBS2 " +meter5=" police index 5 rate $CIR1 burst $CBS2 " + +# All packets are marked with a tcindex value which is used on the egress +# tcindex 1 maps to AF41, 2->AF42, 3->AF43, 4->BE + +# *********************** AF41 *************************** +#AF41 (DSCP 0x22) is passed on with a tcindex value 1 +#if it doesnt exceed its CIR/CBS +#policer 1 is used. 
+# +$TC filter add dev $INDEV parent ffff: protocol ip prio 4 u32 \ +match ip tos 0x88 0xfc \ +$meter1 \ +continue flowid :1 +# +# if it exceeds the above but not the extra rate/burst below, it gets a +# tcindex value of 2 +# policer 2 is used +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 5 u32 \ +match ip tos 0x88 0xfc \ +$meter2 \ +continue flowid :2 +# +# if it exceeds the above but not the rule below, it gets a tcindex value +# of 3 (policer 3) +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 6 u32 \ +match ip tos 0x88 0xfc \ +$meter3 \ +drop flowid :3 +# + +# *********************** AF42 *************************** +#AF42 (DSCP 0x24) from is passed on with a tcindex value 2 +#if it doesnt exceed its CIR/CBS +#policer 2 is used. Note that this is shared with the AF41 +# +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 5 u32 \ +match ip tos 0x90 0xfc \ +$meter2 \ +continue flowid :2 +# +# if it exceeds the above but not the rule below, it gets a tcindex value +# of 3 (policer 3) +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 6 u32 \ +match ip tos 0x90 0xfc \ +$meter3 \ +drop flowid :3 +# +# *********************** AF43 *************************** +# +#AF43 (DSCP 0x26) from is passed on with a tcindex value 3 +#if it doesnt exceed its CIR/CBS +#policer 3 is used. 
Note that this is shared with the AF41 and AF42 +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 6 u32 \ +match ip tos 0x98 0xfc \ +$meter3 \ +drop flowid :3 +# +# *********************** BE *************************** +# +# Anything else (not from the AF4*) gets discarded if it +# exceeds 1Mbps and by default goes to BE if it doesnt +# Note that the BE class is also used by the AF4* in the worst +# case +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 7 u32 \ +match ip src 0/0 \ +$meter4 \ +drop flowid :4 + +######################## Egress side ######################## + +# attach a dsmarker +# +$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64 +# +# values of the DSCP to change depending on the class +#note that the ECN bits are masked out +# +#AF41 (0x88 is 0x22 shifted to the left by two bits) +# +$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \ + value 0x88 +#AF42 +$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \ + value 0x90 +#AF43 +$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \ + value 0x98 +#BE +$TC class change $EGDEV classid 1:4 dsmark mask 0x3 \ + value 0x0 +# +# +# The class mapping +# +$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \ + handle 1 tcindex classid 1:1 +$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \ + handle 2 tcindex classid 1:2 +$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \ + handle 3 tcindex classid 1:3 +$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \ + handle 4 tcindex classid 1:4 +# + +# +echo "---- qdisc parameters Ingress ----------" +$TC qdisc ls dev $INDEV +echo "---- Class parameters Ingress ----------" +$TC class ls dev $INDEV +echo "---- filter parameters Ingress ----------" +$TC filter ls dev $INDEV parent ffff: + +echo "---- qdisc parameters Egress ----------" +$TC qdisc ls $EGDEV +echo "---- Class parameters Egress ----------" +$TC class ls $EGDEV +echo "---- filter parameters Egress ----------" +$TC filter ls $EGDEV parent 1:0 +# +#deleting the ingress
qdisc +#$TC qdisc del $INDEV ingress diff --git a/examples/diffserv/Edge31-cb-chains b/examples/diffserv/Edge31-cb-chains index e69de29b..d7faae98 100644 --- a/examples/diffserv/Edge31-cb-chains +++ b/examples/diffserv/Edge31-cb-chains @@ -0,0 +1,132 @@ +#! /bin/sh -x +# +# sample script on using the ingress capabilities +# This script fwmark tags(IPchains) based on metering on the ingress +# interface the result is used for fast classification and re-marking +# on the egress interface +# This is an example of a color blind mode marker with no PIR configured +# based on draft-wahjak-mcm-00.txt (section 3.1) +# +#path to various utilities; +#change to reflect yours. +# +IPROUTE=/root/DS-6-beta/iproute2-990530-dsing +TC=$IPROUTE/tc/tc +IP=$IPROUTE/ip/ip +IPCHAINS=/root/DS-6-beta/ipchains-1.3.9/ipchains +INDEV=eth2 +EGDEV="dev eth1" +CIR1=1500kbit +CIR2=1000kbit + +#The CBS is about 60 MTU sized packets +CBS1=90k +CBS2=90k + +meter1="police rate $CIR1 burst $CBS1 " +meter2="police rate $CIR1 burst $CBS2 " +meter3="police rate $CIR2 burst $CBS1 " +meter4="police rate $CIR2 burst $CBS2 " +meter5="police rate $CIR2 burst $CBS2 " +# +# tag the rest of incoming packets from subnet 10.2.0.0/24 to fw value 1 +# tag all incoming packets from any other subnet to fw tag 2 +############################################################ +$IPCHAINS -A input -i $INDEV -s 0/0 -m 2 +$IPCHAINS -A input -i $INDEV -s 10.2.0.0/24 -m 1 +# +############################################################ +# install the ingress qdisc on the ingress interface +$TC qdisc add dev $INDEV handle ffff: ingress +# +############################################################ + +# All packets are marked with a tcindex value which is used on the egress +# tcindex 1 maps to AF41, 2->AF42, 3->AF43, 4->BE +# +############################################################ +# +# anything with fw tag of 1 is passed on with a tcindex value 1 +#if it doesnt exceed its allocated rate (CIR/CBS) +# +$TC filter add dev 
$INDEV parent ffff: protocol ip prio 4 handle 1 fw \ +$meter1 \ +continue flowid 4:1 +# +# if it exceeds the above but not the extra rate/burst below, it gets a +#tcindex value of 2 +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 5 handle 1 fw \ +$meter2 \ +continue flowid 4:2 +# +# if it exceeds the above but not the rule below, it gets a tcindex value +# of 3 +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 6 handle 1 fw \ +$meter3 \ +drop flowid 4:3 +# +# Anything else (not from the subnet 10.2.0.24/24) gets discarded if it +# exceeds 1Mbps and by default goes to BE if it doesnt +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 6 handle 2 fw \ +$meter5 \ +drop flowid 4:4 + + +######################## Egress side ######################## + + +# attach a dsmarker +# +$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64 +# +# values of the DSCP to change depending on the class +#note that the ECN bits are masked out +# +#AF41 (0x88 is 0x22 shifted to the right by two bits) +# +$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \ + value 0x88 +#AF42 +$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \ + value 0x90 +#AF43 +$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \ + value 0x98 +#BE +$TC class change $EGDEV classid 1:4 dsmark mask 0x3 \ + value 0x0 +# +# +# The class mapping (using tcindex; could easily have +# replaced it with the fw classifier instead) +# +$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \ + handle 1 tcindex classid 1:1 +$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \ + handle 2 tcindex classid 1:2 +$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \ + handle 3 tcindex classid 1:3 +$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \ + handle 4 tcindex classid 1:4 +# + +# +echo "---- qdisc parameters Ingress ----------" +$TC qdisc ls dev $INDEV +echo "---- Class parameters Ingress ----------" +$TC class ls dev $INDEV +echo "---- filter parameters Ingress ----------" +$TC filter ls dev 
$INDEV parent ffff: + +echo "---- qdisc parameters Egress ----------" +$TC qdisc ls $EGDEV +echo "---- Class parameters Egress ----------" +$TC class ls $EGDEV +echo "---- filter parameters Egress ----------" +$TC filter ls $EGDEV parent 1:0 +# +#deleting the ingress qdisc +#$TC qdisc del $INDEV ingress diff --git a/examples/diffserv/Edge32-ca-u32 b/examples/diffserv/Edge32-ca-u32 index e69de29b..edf21e43 100644 --- a/examples/diffserv/Edge32-ca-u32 +++ b/examples/diffserv/Edge32-ca-u32 @@ -0,0 +1,198 @@ +#! /bin/sh -x +# +# sample script on using the ingress capabilities using u32 classifier +# This script tags tcindex based on metering on the ingress +# interface the result is used for fast classification and re-marking +# on the egress interface +# This is an example of a color aware mode marker with PIR configured +# based on draft-wahjak-mcm-00.txt (section 3.2) +# +# The colors are defined using the Diffserv Fields +#path to various utilities; +#change to reflect yours. +# +IPROUTE=/root/DS-6-beta/iproute2-990530-dsing +TC=$IPROUTE/tc/tc +IP=$IPROUTE/ip/ip +IPCHAINS=/root/DS-6-beta/ipchains-1.3.9/ipchains +INDEV=eth2 +EGDEV="dev eth1" +CIR1=1000kbit +CIR2=500kbit +# the PIR is what is in excess of the CIR +PIR1=1000kbit +PIR2=500kbit + +#The CBS is about 60 MTU sized packets +CBS1=90k +CBS2=90k +#the EBS is about 20 max sized packets +EBS1=30k +EBS2=30k + +# The meters: Note that we have shared meters in this case as identified +# by the index parameter +meter1=" police index 1 rate $CIR1 burst $CBS1 " +meter1a=" police index 2 rate $PIR1 burst $EBS1 " +meter2=" police index 3 rate $CIR2 burst $CBS1 " +meter2a=" police index 4 rate $PIR2 burst $EBS1 " +meter3=" police index 5 rate $CIR2 burst $CBS2 " +meter3a=" police index 6 rate $PIR2 burst $EBS2 " +meter4=" police index 7 rate $CIR1 burst $CBS2 " + +############################################################ +# +# install the ingress qdisc on the ingress interface +$TC qdisc add dev $INDEV handle ffff: 
ingress +############################################################ +# +# All packets are marked with a tcindex value which is used on the egress +# tcindex 1 maps to AF41, 2->AF42, 3->AF43, 4->BE +# +# *********************** AF41 *************************** +#AF41 (DSCP 0x22) from is passed on with a tcindex value 1 +#if it doesnt exceed its CIR/CBS + PIR/EBS +#policer 1 is used. +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 1 u32 \ +match ip tos 0x88 0xfc \ +$meter1 \ +continue flowid :1 +$TC filter add dev $INDEV parent ffff: protocol ip prio 2 u32 \ +match ip tos 0x88 0xfc \ +$meter1a \ +continue flowid :1 +# +# if it exceeds the above but not the extra rate/burst below, it gets a +# tcindex value of 2 +# policer 2 is used +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 3 u32 \ +match ip tos 0x88 0xfc \ +$meter2 \ +continue flowid :2 +$TC filter add dev $INDEV parent ffff: protocol ip prio 4 u32 \ +match ip tos 0x88 0xfc \ +$meter2a \ +continue flowid :2 +# +# if it exceeds the above but not the rule below, it gets a tcindex value +# of 3 (policer 3) +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 5 u32 \ +match ip tos 0x88 0xfc \ +$meter3 \ +continue flowid :3 +$TC filter add dev $INDEV parent ffff: protocol ip prio 6 u32 \ +match ip tos 0x88 0xfc \ +$meter3a \ +drop flowid :3 +# +# *********************** AF42 *************************** +#AF42 (DSCP 0x24) from is passed on with a tcindex value 2 +#if it doesnt exceed its CIR/CBS + PIR/EBS +#policer 2 is used. 
Note that this is shared with the AF41 +# +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 8 u32 \ +match ip tos 0x90 0xfc \ +$meter2 \ +continue flowid :2 +$TC filter add dev $INDEV parent ffff: protocol ip prio 9 u32 \ +match ip tos 0x90 0xfc \ +$meter2a \ +continue flowid :2 +# +# if it exceeds the above but not the rule below, it gets a tcindex value +# of 3 (policer 3) +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 10 u32 \ +match ip tos 0x90 0xfc \ +$meter3 \ +continue flowid :3 +$TC filter add dev $INDEV parent ffff: protocol ip prio 11 u32 \ +match ip tos 0x90 0xfc \ +$meter3a \ +drop flowid :3 + +# +# *********************** AF43 *************************** +# +#AF43 (DSCP 0x26) from is passed on with a tcindex value 3 +#if it doesnt exceed its CIR/CBS + PIR/EBS +#policer 3 is used. Note that this is shared with the AF41 and AF42 +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 13 u32 \ +match ip tos 0x98 0xfc \ +$meter3 \ +continue flowid :3 +$TC filter add dev $INDEV parent ffff: protocol ip prio 14 u32 \ +match ip tos 0x98 0xfc \ +$meter3a \ +drop flowid :3 +# +## *********************** BE *************************** +## +## Anything else (not from the AF4*) gets discarded if it +## exceeds 1Mbps and by default goes to BE if it doesnt +## Note that the BE class is also used by the AF4* in the worst +## case +## +$TC filter add dev $INDEV parent ffff: protocol ip prio 16 u32 \ +match ip src 0/0\ +$meter4 \ +drop flowid :4 + +######################## Egress side ######################## + +# attach a dsmarker +# +$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64 +# +# values of the DSCP to change depending on the class +#note that the ECN bits are masked out +# +#AF41 (0x88 is 0x22 shifted to the right by two bits) +# +$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \ + value 0x88 +#AF42 +$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \ + value 0x90 +#AF43 +$TC class change $EGDEV classid 1:3 dsmark mask 
0x3 \ + value 0x98 +#BE +$TC class change $EGDEV classid 1:4 dsmark mask 0x3 \ + value 0x0 +# +# +# The class mapping +# +$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \ + handle 1 tcindex classid 1:1 +$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \ + handle 2 tcindex classid 1:2 +$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \ + handle 3 tcindex classid 1:3 +$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \ + handle 4 tcindex classid 1:4 +# + +# +echo "---- qdisc parameters Ingress ----------" +$TC qdisc ls dev $INDEV +echo "---- Class parameters Ingress ----------" +$TC class ls dev $INDEV +echo "---- filter parameters Ingress ----------" +$TC filter ls dev $INDEV parent ffff: + +echo "---- qdisc parameters Egress ----------" +$TC qdisc ls $EGDEV +echo "---- Class parameters Egress ----------" +$TC class ls $EGDEV +echo "---- filter parameters Egress ----------" +$TC filter ls $EGDEV parent 1:0 +# +#deleting the ingress qdisc +#$TC qdisc del dev $INDEV ingress diff --git a/examples/diffserv/Edge32-cb-chains b/examples/diffserv/Edge32-cb-chains index e69de29b..804fad19 100644 --- a/examples/diffserv/Edge32-cb-chains +++ b/examples/diffserv/Edge32-cb-chains @@ -0,0 +1,144 @@ +#! /bin/sh -x +# +# sample script on using the ingress capabilities +# This script fwmark tags(IPchains) based on metering on the ingress +# interface the result is used for fast classification and re-marking +# on the egress interface +# This is an example of a color blind mode marker with PIR configured +# based on draft-wahjak-mcm-00.txt (section 3.2) +# +#path to various utilities; +#change to reflect yours.
+# +IPROUTE=/root/DS-6-beta/iproute2-990530-dsing +TC=$IPROUTE/tc/tc +IP=$IPROUTE/ip/ip +IPCHAINS=/root/DS-6-beta/ipchains-1.3.9/ipchains +INDEV=eth2 +EGDEV="dev eth1" +CIR1=1500kbit +CIR2=500kbit + +#The CBS is about 60 MTU sized packets +CBS1=90k +CBS2=90k + +meter1="police rate $CIR1 burst $CBS1 " +meter1a="police rate $CIR2 burst $CBS1 " +meter2="police rate $CIR1 burst $CBS2 " +meter2a="police rate $CIR2 burst $CBS2 " +meter3="police rate $CIR2 burst $CBS1 " +meter3a="police rate $CIR2 burst $CBS1 " +meter4="police rate $CIR2 burst $CBS2 " +meter5="police rate $CIR1 burst $CBS2 " +# +# tag the rest of incoming packets from subnet 10.2.0.0/24 to fw value 1 +# tag all incoming packets from any other subnet to fw tag 2 +############################################################ +$IPCHAINS -A input -i $INDEV -s 0/0 -m 2 +$IPCHAINS -A input -i $INDEV -s 10.2.0.0/24 -m 1 +# +############################################################ +# install the ingress qdisc on the ingress interface +$TC qdisc add dev $INDEV handle ffff: ingress +# +############################################################ + +# All packets are marked with a tcindex value which is used on the egress +# tcindex 1 maps to AF41, 2->AF42, 3->AF43, 4->BE +# +############################################################ +# +# anything with fw tag of 1 is passed on with a tcindex value 1 +#if it doesnt exceed its allocated rate (CIR/CBS) +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 1 handle 1 fw \ +$meter1 \ +continue flowid 4:1 +$TC filter add dev $INDEV parent ffff: protocol ip prio 2 handle 1 fw \ +$meter1a \ +continue flowid 4:1 +# +# if it exceeds the above but not the extra rate/burst below, it gets a +#tcindex value of 2 +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 3 handle 1 fw \ +$meter2 \ +continue flowid 4:2 +$TC filter add dev $INDEV parent ffff: protocol ip prio 4 handle 1 fw \ +$meter2a \ +continue flowid 4:2 +# +# if it exceeds the above but not the rule 
below, it gets a tcindex value +# of 3 +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 5 handle 1 fw \ +$meter3 \ +continue flowid 4:3 +$TC filter add dev $INDEV parent ffff: protocol ip prio 6 handle 1 fw \ +$meter3a \ +drop flowid 4:3 +# +# Anything else (not from the subnet 10.2.0.24/24) gets discarded if it +# exceeds 1Mbps and by default goes to BE if it doesnt +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 7 handle 2 fw \ +$meter5 \ +drop flowid 4:4 + + +######################## Egress side ######################## + + +# attach a dsmarker +# +$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64 +# +# values of the DSCP to change depending on the class +#note that the ECN bits are masked out +# +#AF41 (0x88 is 0x22 shifted to the right by two bits) +# +$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \ + value 0x88 +#AF42 +$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \ + value 0x90 +#AF43 +$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \ + value 0x98 +#BE +$TC class change $EGDEV classid 1:4 dsmark mask 0x3 \ + value 0x0 +# +# +# The class mapping (using tcindex; could easily have +# replaced it with the fw classifier instead) +# +$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \ + handle 1 tcindex classid 1:1 +$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \ + handle 2 tcindex classid 1:2 +$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \ + handle 3 tcindex classid 1:3 +$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \ + handle 4 tcindex classid 1:4 +# + +# +echo "---- qdisc parameters Ingress ----------" +$TC qdisc ls dev $INDEV +echo "---- Class parameters Ingress ----------" +$TC class ls dev $INDEV +echo "---- filter parameters Ingress ----------" +$TC filter ls dev $INDEV parent ffff: + +echo "---- qdisc parameters Egress ----------" +$TC qdisc ls $EGDEV +echo "---- Class parameters Egress ----------" +$TC class ls $EGDEV +echo "---- filter parameters Egress ----------" +$TC filter ls $EGDEV 
parent 1:0 +# +#deleting the ingress qdisc +#$TC qdisc del $INDEV ingress diff --git a/examples/diffserv/Edge32-cb-u32 b/examples/diffserv/Edge32-cb-u32 index e69de29b..cc2ebb40 100644 --- a/examples/diffserv/Edge32-cb-u32 +++ b/examples/diffserv/Edge32-cb-u32 @@ -0,0 +1,145 @@ +#! /bin/sh +# +# sample script on using the ingress capabilities using u32 classifier +# This script tags tcindex based on metering on the ingress +# interface the result is used for fast classification and re-marking +# on the egress interface +# This is an example of a color blind mode marker with PIR configured +# based on draft-wahjak-mcm-00.txt (section 3.2) +# +#path to various utilities; +#change to reflect yours. +# +IPROUTE=/root/DS-6-beta/iproute2-990530-dsing +TC=$IPROUTE/tc/tc +IP=$IPROUTE/ip/ip +INDEV=eth2 +EGDEV="dev eth1" +CIR1=1000kbit +CIR2=1000kbit +# The PIR is the excess (in addition to the CIR i.e if always +# going to the PIR --> average rate is CIR+PIR) +PIR1=1000kbit +PIR2=500kbit + +#The CBS is about 60 MTU sized packets +CBS1=90k +CBS2=90k +#the EBS is about 10 max sized packets +EBS1=15k +EBS2=15k +# The meters +meter1=" police rate $CIR1 burst $CBS1 " +meter1a=" police rate $PIR1 burst $EBS1 " +meter2=" police rate $CIR2 burst $CBS1 " +meter2a="police rate $PIR2 burst $CBS1 " +meter3=" police rate $CIR2 burst $CBS2 " +meter3a=" police rate $PIR2 burst $EBS2 " +meter4=" police rate $CIR1 burst $CBS2 " +meter5=" police rate $CIR1 burst $CBS2 " + + +# install the ingress qdisc on the ingress interface +############################################################ +$TC qdisc add dev $INDEV handle ffff: ingress +############################################################ +# +############################################################ + +# All packets are marked with a tcindex value which is used on the egress +# NOTE: tcindex 1 maps to AF41, 2->AF42, 3->AF43, 4->BE +# +#anything from subnet 10.2.0.2/24 is passed on with a tcindex value 1 +#if it doesnt exceed its 
CIR/CBS + PIR/EBS +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 1 u32 \ +match ip src 10.2.0.0/24 $meter1 \ +continue flowid :1 +$TC filter add dev $INDEV parent ffff: protocol ip prio 2 u32 \ +match ip src 10.2.0.0/24 $meter1a \ +continue flowid :1 + +# +# if it exceeds the above but not the extra rate/burst below, it gets a +#tcindex value of 2 +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 3 u32 \ +match ip src 10.2.0.0/24 $meter2 \ +continue flowid :2 +$TC filter add dev $INDEV parent ffff: protocol ip prio 4 u32 \ +match ip src 10.2.0.0/24 $meter2a \ +continue flowid :2 +# +# if it exceeds the above but not the rule below, it gets a tcindex value +# of 3 +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 5 u32 \ +match ip src 10.2.0.0/24 $meter3 \ +continue flowid :3 +$TC filter add dev $INDEV parent ffff: protocol ip prio 6 u32 \ +match ip src 10.2.0.0/24 $meter3a \ +drop flowid :3 +# +# +# Anything else (not from the subnet 10.2.0.0/24) gets discarded if it +# exceeds 1Mbps and by default goes to BE if it doesnt +# +$TC filter add dev $INDEV parent ffff: protocol ip prio 7 u32 \ +match ip src 0/0 $meter5 \ +drop flowid :4 + + +######################## Egress side ######################## + + +# attach a dsmarker +# +$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64 +# +# values of the DSCP to change depending on the class +#note that the ECN bits are masked out +# +#AF41 (0x88 is 0x22 shifted to the left by two bits) +# +$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \ + value 0x88 +#AF42 +$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \ + value 0x90 +#AF43 +$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \ + value 0x98 +#BE (tcindex 4 maps to class 1:4; must not overwrite the AF43 class 1:3) +$TC class change $EGDEV classid 1:4 dsmark mask 0x3 \ + value 0x0 +# +# +# The class mapping +# +$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \ + handle 1 tcindex classid 1:1 +$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \ + handle 2 tcindex classid 1:2 +$TC filter
add $EGDEV parent 1:0 protocol ip prio 1 \ + handle 3 tcindex classid 1:3 +$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \ + handle 4 tcindex classid 1:4 +# + +# +echo "---- qdisc parameters Ingress ----------" +$TC qdisc ls dev $INDEV +echo "---- Class parameters Ingress ----------" +$TC class ls dev $INDEV +echo "---- filter parameters Ingress ----------" +$TC filter ls dev $INDEV parent ffff: + +echo "---- qdisc parameters Egress ----------" +$TC qdisc ls $EGDEV +echo "---- Class parameters Egress ----------" +$TC class ls $EGDEV +echo "---- filter parameters Egress ----------" +$TC filter ls $EGDEV parent 1:0 +# +#deleting the ingress qdisc +#$TC qdisc del $INDEV ingress diff --git a/examples/diffserv/README b/examples/diffserv/README index e69de29b..ec91d632 100644 --- a/examples/diffserv/README +++ b/examples/diffserv/README @@ -0,0 +1,98 @@ + +Note all these are mere examples which can be customized to your needs + +AFCBQ +----- +AF PHB built using CBQ, DSMARK, GRED (default in GRIO mode), RED for BE +and the tcindex classifier with some algorithmic mapping + +EFCBQ +----- +EF PHB built using CBQ (for rate control and prioritization), +DSMARK (to remark DSCPs), tcindex classifier and RED for the BE +traffic. + +EFPRIO +------ +EF PHB using the PRIO scheduler, Token Bucket to rate control EF, +tcindex classifier, DSMARK to remark, and RED for the BE traffic + +EDGE scripts +============== + +CB-3(1|2)-(u32/chains) +====================== + + +The major differences are that the classifier is u32 on -u32 extension +and IPchains on the chains extension. CB stands for color Blind +and 31 is for the mode where only a CIR and CBS are defined whereas +32 stands for a mode where a CIR/CBS + PIR/EBS are defined. + +Color Blind (CB) +================ +We look at one special subnet that we are interested in for simplicity +reasons to demonstrate the capability. We send the packets from that +subnet to AF4*, BE or end up dropping depending on the metering results.
+ + +The algorithm overview is as follows: + +*classify: + +**case: subnet X +---------------- + if !exceed meter1 tag as AF41 + else + if !exceed meter2 tag as AF42 + else + if !exceed meter3 tag as AF43 + else + drop + +default case: Any other subnet +------------------------------- + if !exceed meter5 tag as BE + else + drop + + +On the Egress side, change the DSCPs of the packets to reflect AF4* and BE +based on the tags from the ingress. + +------------------------------------------------------------- + +Color Aware +=========== + +Define some meters with + policing and give them IDs eg + +meter1=police index 1 rate $CIR1 burst $CBS1 +meter2=police index 2 rate $CIR2 burst $CBS2 etc + +General overview: +classify based on the DSCPs and use the policer ids to decide tagging + + +*classify on ingress: + +switch (dscp) { + case AF41: /* tos&0xfc == 0x88 */ + if (!exceed meter1) break; + case AF42: /* tos&0xfc == 0x90 */ + if (!exceed meter2) { + tag as AF42; + break; + } + case AF43: /* tos&0xfc == 0x98 */ + if (!exceed meter3) { + tag as AF43; + break; + } else + drop; + default: + if (!exceed meter4) tag as BE; + else drop; +} + +On the Egress side mark the proper AF tags diff --git a/examples/diffserv/afcbq b/examples/diffserv/afcbq index e69de29b..10d6d934 100644 --- a/examples/diffserv/afcbq +++ b/examples/diffserv/afcbq @@ -0,0 +1,105 @@ +#!/usr/bin/perl +# +# +# AF using CBQ for a single interface eth0 +# 4 AF classes using GRED and one BE using RED +# Things you might want to change: +# - the device bandwidth (set at 10Mbits) +# - the bandwidth allocated for each AF class and the BE class +# - the drop probability associated with each AF virtual queue +# +# AF DSCP values used (based on AF draft 04) +# ----------------------------------------- +# AF DSCP values +# AF1 1. 0x0a 2. 0x0c 3. 0x0e +# AF2 1. 0x12 2. 0x14 3. 0x16 +# AF3 1. 0x1a 2. 0x1c 3. 0x1e +# AF4 1. 0x22 2. 0x24 3.
0x26 + +# +# +# A simple DSCP-class relationship formula used to generate +# values in the for loop of this script; $drop stands for the +# DP +# $dscp = ($class*8+$drop*2) +# +# if you use GRIO buffer sharing, then GRED priority is set as follows: +# $gprio=$drop+1; +# + +$TC = "/usr/src/iproute2-current/tc/tc"; +$DEV = "dev lo"; +$DEV = "dev eth1"; +$DEV = "dev eth0"; +# the BE-class number +$beclass = "5"; + +#GRIO buffer sharing on or off? +$GRIO = ""; +$GRIO = "grio"; +# The bandwidth of your device +$linerate="10Mbit"; +# The BE and AF rates +%rate_table=(); +$berate="1500Kbit"; +$rate_table{"AF1rate"}="1500Kbit"; +$rate_table{"AF2rate"}="1500Kbit"; +$rate_table{"AF3rate"}="1500Kbit"; +$rate_table{"AF4rate"}="1500Kbit"; +# +# +# +print "\n# --- General setup ---\n"; +print "$TC qdisc add $DEV handle 1:0 root dsmark indices 64 set_tc_index\n"; +print "$TC filter add $DEV parent 1:0 protocol ip prio 1 tcindex mask 0xfc " . + "shift 2 pass_on\n"; + #"shift 2\n"; +print "$TC qdisc add $DEV parent 1:0 handle 2:0 cbq bandwidth $linerate ". + "cell 8 avpkt 1000 mpu 64\n"; +print "$TC filter add $DEV parent 2:0 protocol ip prio 1 tcindex ". + "mask 0xf0 shift 4 pass_on\n"; +for $class (1..4) { + print "\n# --- AF Class $class specific setup---\n"; + $AFrate=sprintf("AF%drate",$class); + print "$TC class add $DEV parent 2:0 classid 2:$class cbq ". + "bandwidth $linerate rate $rate_table{$AFrate} avpkt 1000 prio ". + (6-$class)." bounded allot 1514 weight 1 maxburst 21\n"; + print "$TC filter add $DEV parent 2:0 protocol ip prio 1 handle $class ". + "tcindex classid 2:$class\n"; + print "$TC qdisc add $DEV parent 2:$class gred setup DPs 3 default 2 ". + "$GRIO\n"; +# +# per DP setup +# + for $drop (1..3) { + print "\n# --- AF Class $class DP $drop---\n"; + $dscp = $class*8+$drop*2; + $tcindex = sprintf("1%x%x",$class,$drop); + print "$TC filter add $DEV parent 1:0 protocol ip prio 1 ". 
+ "handle $dscp tcindex classid 1:$tcindex\n"; + $prob = $drop*0.02; + if ($GRIO) { + $gprio = $drop+1; + print "$TC qdisc change $DEV parent 2:$class gred limit 60KB min 15KB ". + "max 45KB burst 20 avpkt 1000 bandwidth $linerate DP $drop ". + "probability $prob ". + "prio $gprio\n"; + } else { + print "$TC qdisc change $DEV parent 2:$class gred limit 60KB min 15KB ". + "max 45KB burst 20 avpkt 1000 bandwidth $linerate DP $drop ". + "probability $prob \n"; + } + } +} +# +# +print "\n#------BE Queue setup------\n"; +print "$TC filter add $DEV parent 1:0 protocol ip prio 2 ". + "handle 0 tcindex mask 0 classid 1:1\n"; +print "$TC class add $DEV parent 2:0 classid 2:$beclass cbq ". + "bandwidth $linerate rate $berate avpkt 1000 prio 6 " . + "bounded allot 1514 weight 1 maxburst 21 \n"; +print "$TC filter add $DEV parent 2:0 protocol ip prio 1 handle 0 tcindex ". + "classid 2:5\n"; +print "$TC qdisc add $DEV parent 2:5 red limit 60KB min 15KB max 45KB ". + "burst 20 avpkt 1000 bandwidth $linerate probability 0.4\n"; diff --git a/examples/diffserv/ef-prio b/examples/diffserv/ef-prio index e69de29b..48611bdd 100644 --- a/examples/diffserv/ef-prio +++ b/examples/diffserv/ef-prio @@ -0,0 +1,25 @@ +#!/usr/bin/perl +$TC = "/root/DS-6-beta/iproute2-990530-dsing/tc/tc"; +$DEV = "dev eth1"; +$efrate="1.5Mbit"; +$MTU="1.5kB"; +print "$TC qdisc add $DEV handle 1:0 root dsmark indices 64 set_tc_index\n"; +print "$TC filter add $DEV parent 1:0 protocol ip prio 1 tcindex ". + "mask 0xfc shift 2\n"; +print "$TC qdisc add $DEV parent 1:0 handle 2:0 prio\n"; +# +# EF class: Maximum about one MTU sized packet allowed on the queue +# +print "$TC qdisc add $DEV parent 2:1 tbf rate $efrate burst $MTU limit 1.6kB\n"; +print "$TC filter add $DEV parent 2:0 protocol ip prio 1 ". + "handle 0x2e tcindex classid 2:1 pass_on\n"; +# +# BE class +# +print "#BE class(2:2) \n"; +print "$TC qdisc add $DEV parent 2:2 red limit 60KB ". + "min 15KB max 45KB burst 20 avpkt 1000 bandwidth 10Mbit ". 
+ "probability 0.4\n"; +# +print "$TC filter add $DEV parent 2:0 protocol ip prio 2 ". + "handle 0 tcindex mask 0 classid 2:2 pass_on\n"; diff --git a/examples/diffserv/efcbq b/examples/diffserv/efcbq index e69de29b..bcc437b3 100644 --- a/examples/diffserv/efcbq +++ b/examples/diffserv/efcbq @@ -0,0 +1,31 @@ +#!/usr/bin/perl +# +$TC = "/root/DS-6-beta/iproute2-990530-dsing/tc/tc"; +$DEV = "dev eth1"; +print "$TC qdisc add $DEV handle 1:0 root dsmark indices 64 set_tc_index\n"; +print "$TC filter add $DEV parent 1:0 protocol ip prio 1 tcindex ". + "mask 0xfc shift 2\n"; +print "$TC qdisc add $DEV parent 1:0 handle 2:0 cbq bandwidth ". + "10Mbit cell 8 avpkt 1000 mpu 64\n"; +# +# EF class +# +print "$TC class add $DEV parent 2:0 classid 2:1 cbq bandwidth ". + "10Mbit rate 1500Kbit avpkt 1000 prio 1 bounded isolated ". + "allot 1514 weight 1 maxburst 10 \n"; +# packet fifo for EF? +print "$TC qdisc add $DEV parent 2:1 pfifo limit 5\n"; +print "$TC filter add $DEV parent 2:0 protocol ip prio 1 ". + "handle 0x2e tcindex classid 2:1 pass_on\n"; +# +# BE class +# +print "#BE class(2:2) \n"; +print "$TC class add $DEV parent 2:0 classid 2:2 cbq bandwidth ". + "10Mbit rate 5Mbit avpkt 1000 prio 7 allot 1514 weight 1 ". + "maxburst 21 borrow split 2:0 defmap 0xffff \n"; +print "$TC qdisc add $DEV parent 2:2 red limit 60KB ". + "min 15KB max 45KB burst 20 avpkt 1000 bandwidth 10Mbit ". + "probability 0.4\n"; +print "$TC filter add $DEV parent 2:0 protocol ip prio 2 ". + "handle 0 tcindex mask 0 classid 2:2 pass_on\n"; diff --git a/examples/diffserv/regression-testing b/examples/diffserv/regression-testing index e69de29b..0ec705c0 100644 --- a/examples/diffserv/regression-testing +++ b/examples/diffserv/regression-testing @@ -0,0 +1,125 @@ + +These were the tests done to validate the Diffserv scripts. +This document will be updated continously. If you do more +thorough validation testing please post the details to the +diffserv mailing list. 
+Nevertheless, these tests should serve for basic validation. + +AFCBQ, EFCBQ, EFPRIO +---------------------- + +generate all possible DSCPs and observe that they +get sent to the proper classes. In the case of AF also +to the correct Virtual Queues. + +Edge1 +----- +generate TOS values 0x0,0x10,0xbb each with IP addresses +10.2.0.24 (mark 1), 10.2.0.3 (mark 2) and 10.2.0.30 (mark 3) +and observe that they get marked as expected. + +Edge2 +----- + +-Repeat the tests in Edge1 +-ftp with data direction from 10.2.0.2 + *observe that the metering/policing works correctly (and the marking + as well). In this case the mark used will be 3 + +Edge31-cb-chains +---------------- + +-ftp with data direction from 10.2.0.2 + + *observe that the metering/policing works correctly (and the marking + as well). In this case the mark used will be 1. + + Metering: The data throughput should not exceed 2*CIR1 + 2*CIR2 + which is roughly: 5mbps + + Marking: there should be a variation of marked packets: + AF41(TOS=0x88) AF42(0x90) AF43(0x98) and BE (0x0) + +More tests required to see the interaction of several sources (other +than subnet 10.2.0.0/24).
+ +Edge31-ca-u32 +-------------- + +Generate data using modified tcpblast from 10.2.0.2 (behind eth2) to the +discard port of 10.1.0.2 (behind eth1) + +1) generate with src tos = 0x88 + Metering: Allocated throughput should not exceed 2*CIR1 + 2*CIR2 + approximately 5mbps + Marking: Should vary between 0x88,0x90,0x98 and 0x0 + +2) generate with src tos = 0x90 + Metering: Allocated throughput should not exceed CIR1 + 2*CIR2 + approximately 3.5mbps + Marking: Should vary between 0x90,0x98 and 0x0 + +3) generate with src tos = 0x98 + Metering: Allocated throughput should not exceed CIR1 + CIR2 + approximately 2.5mbps + Marking: Should vary between 0x98 and 0x0 + +4) generate with src tos any other than the above + Metering: Allocated throughput should not exceed CIR1 + approximately 1.5mbps + Marking: Should be consistent at 0x0 + +TODO: Testing on how each color shares when all 4 types of packets +are going through the edge device + +Edge32-cb-u32, Edge32-cb-chains +------------------------------- + +-ftp with data direction from 10.2.0.2 + + *observe that the metering/policing works correctly (and the marking + as well). + + Metering: + The data throughput should not exceed 2*CIR1 + 2*CIR2 + + 2*PIR2 + PIR1 for u32 which is roughly: 6mbps + The data throughput should not exceed 2*CIR1 + 5*CIR2 + for chains which is roughly: 6mbps + + Marking: there should be a variation of marked packets: + AF41(TOS=0x88) AF42(0x90) AF43(0x98) and BE (0x0) + +TODO: +-More tests required to see the interaction of several sources (other +than subnet 10.2.0.0/24). +-More tests needed to capture stats on how many times the CIR was exceeded +but the data was not remarked etc.
+ +Edge32-ca-u32 +-------------- + +Generate data using modified tcpblast from 10.2.0.2 (behind eth2) to the +discard port of 10.1.0.2 (behind eth1) + +1) generate with src tos = 0x88 + Metering: Allocated throughput should not exceed 2*CIR1 + 2*CIR2 + +PIR1 -- approximately 4mbps + Marking: Should vary between 0x88,0x90,0x98 and 0x0 + +2) generate with src tos = 0x90 + Metering: Allocated throughput should not exceed CIR1 + 2*CIR2 + + 2* PIR2 approximately 3mbps + Marking: Should vary between 0x90,0x98 and 0x0 + +3) generate with src tos = 0x98 + Metering: Allocated throughput should not exceed PIR1+ CIR1 + CIR2 + approximately 2.5mbps + Marking: Should vary between 0x98 and 0x0 + +4) generate with src tos any other than the above + Metering: Allocated throughput should not exceed CIR1 + approximately 1mbps + Marking: Should be consistent at 0x0 + +TODO: Testing on how each color shares when all 4 types of packets +are going through the edge device diff --git a/include-glibc/bits/sockunion.h b/include-glibc/bits/sockunion.h index e69de29b..b83add82 100644 --- a/include-glibc/bits/sockunion.h +++ b/include-glibc/bits/sockunion.h @@ -0,0 +1,25 @@ + +/* I cannot describe, how I laughed, when saw, that now sys/socket.h + includes ALL OF networking include files. 8)8)8) + + Bravo! Aah, they forgot sockaddr_ll, sockaddr_pkt and sockaddr_nl... + Not a big problem, we only start the way to single UNIVERSAL include file: + + #include <GNU-Gnu_is_Not_Unix.h>. + + Jokes apart, it is full crap. Removed. + --ANK + + */ + +/* Union of all sockaddr types (required by IPv6 Basic API). This is + somewhat evil. */ +/* 8)8) Well, ipngwg really does strange things sometimes, but + not in such extent! 
It is removed long ago --ANK + */ + +union sockaddr_union + { + struct sockaddr sa; + char __maxsize[128]; + }; diff --git a/include-glibc/db.h b/include-glibc/db.h index e69de29b..296584c2 100644 --- a/include-glibc/db.h +++ b/include-glibc/db.h @@ -0,0 +1,10 @@ +/* Mess with various libdb in various glibcs is something... + * Crooked hands of hackers can result in amazing results making + * incompatibility at all the levels without any reasons. + * + * The simplest trick which I was able to invent is to write fake + * db.h including db_185.h and adding -I/usr/include/db3 to CFLAGS. + * Looks ugly but compiles everywhere. + */ + +#include <db_185.h> diff --git a/include-glibc/glibc-bugs.h b/include-glibc/glibc-bugs.h index e69de29b..65e3d8ad 100644 --- a/include-glibc/glibc-bugs.h +++ b/include-glibc/glibc-bugs.h @@ -0,0 +1,20 @@ +#ifndef __GLIBC_BUGS_H__ +#define __GLIBC_BUGS_H__ 1 + +#include <features.h> +#include <sys/types.h> + +#if defined(__GLIBC__) && __GLIBC__ >= 2 + +#ifndef __KERNEL_STRICT_NAMES +#define __KERNEL_STRICT_NAMES 1 +#endif + +#include <linux/types.h> + +typedef __u16 in_port_t; +typedef __u32 in_addr_t; + +#endif + +#endif diff --git a/include-glibc/netinet/in.h b/include-glibc/netinet/in.h index e69de29b..784a66ca 100644 --- a/include-glibc/netinet/in.h +++ b/include-glibc/netinet/in.h @@ -0,0 +1,11 @@ +#ifndef _NETINET_IN_H +#define _NETINET_IN_H 1 + +#include "glibc-bugs.h" +#include <sys/socket.h> +#include <sys/types.h> +#include <linux/in.h> + +#define SOL_IP 0 + +#endif /* netinet/in.h */ diff --git a/include-glibc/netinet/ip.h b/include-glibc/netinet/ip.h index e69de29b..8812e676 100644 --- a/include-glibc/netinet/ip.h +++ b/include-glibc/netinet/ip.h @@ -0,0 +1,9 @@ +#ifndef __NETINET_IP_H +#define __NETINET_IP_H 1 + +#include <glibc-bugs.h> +#include <netinet/in.h> + +#include <linux/ip.h> + +#endif /* netinet/ip.h */ diff --git a/include-glibc/socketbits.h b/include-glibc/socketbits.h index e69de29b..5421d6b8 100644 --- 
a/include-glibc/socketbits.h +++ b/include-glibc/socketbits.h @@ -0,0 +1,270 @@ +/* System-specific socket constants and types. Linux version. + Copyright (C) 1991, 92, 94, 95, 96, 97, 98 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +#ifndef _SOCKETBITS_H + +#define _SOCKETBITS_H 1 +#include <features.h> + +#define __need_size_t +#define __need_NULL +#include <stddef.h> + + +__BEGIN_DECLS + +/* Type for length arguments in socket calls. */ +#ifndef __socklen_t_defined +typedef unsigned int socklen_t; +# define __socklen_t_defined +#endif + +/* Types of sockets. */ +enum __socket_type +{ + SOCK_STREAM = 1, /* Sequenced, reliable, connection-based + byte streams. */ +#define SOCK_STREAM SOCK_STREAM + SOCK_DGRAM = 2, /* Connectionless, unreliable datagrams + of fixed maximum length. */ +#define SOCK_DGRAM SOCK_DGRAM + SOCK_RAW = 3, /* Raw protocol interface. */ +#define SOCK_RAW SOCK_RAW + SOCK_RDM = 4, /* Reliably-delivered messages. */ +#define SOCK_RDM SOCK_RDM + SOCK_SEQPACKET = 5, /* Sequenced, reliable, connection-based, + datagrams of fixed maximum length. 
*/ +#define SOCK_SEQPACKET SOCK_SEQPACKET + SOCK_PACKET = 10 /* Linux specific way of getting packets + at the dev level. For writing rarp and + other similar things on the user level. */ +#define SOCK_PACKET SOCK_PACKET +}; + +/* Protocol families. */ +#define PF_UNSPEC 0 /* Unspecified. */ +#define PF_LOCAL 1 /* Local to host (pipes and file-domain). */ +#define PF_UNIX PF_LOCAL /* Old BSD name for PF_LOCAL. */ +#define PF_FILE PF_LOCAL /* POSIX name for PF_LOCAL. */ +#define PF_INET 2 /* IP protocol family. */ +#define PF_AX25 3 /* Amateur Radio AX.25. */ +#define PF_IPX 4 /* Novell Internet Protocol. */ +#define PF_APPLETALK 5 /* Don't use this. */ +#define PF_NETROM 6 /* Amateur radio NetROM. */ +#define PF_BRIDGE 7 /* Multiprotocol bridge. */ +#define PF_AAL5 8 /* Reserved for Werner's ATM. */ +#define PF_X25 9 /* Reserved for X.25 project. */ +#define PF_INET6 10 /* IP version 6. */ +#define PF_ROSE 11 /* Amateur Radio X.25 PLP */ +#define PF_DECnet 12 /* Reserved for DECnet project */ +#define PF_NETBEUI 13 /* Reserved for 802.2LLC project*/ +#define PF_SECURITY 14 /* Security callback pseudo AF */ +#define PF_KEY 15 /* PF_KEY key management API */ +#define PF_NETLINK 16 +#define PF_ROUTE PF_NETLINK /* Alias to emulate 4.4BSD */ +#define PF_PACKET 17 /* Packet family */ +#define PF_MAX 32 /* For now.. */ + +/* Address families. */ +#define AF_UNSPEC PF_UNSPEC +#define AF_LOCAL PF_LOCAL +#define AF_UNIX PF_UNIX +#define AF_FILE PF_FILE +#define AF_INET PF_INET +#define AF_AX25 PF_AX25 +#define AF_IPX PF_IPX +#define AF_APPLETALK PF_APPLETALK +#define AF_NETROM PF_NETROM +#define AF_BRIDGE PF_BRIDGE +#define AF_AAL5 PF_AAL5 +#define AF_X25 PF_X25 +#define AF_INET6 PF_INET6 +#define AF_ROSE PF_ROSE +#define AF_DECnet PF_DECnet +#define AF_NETBEUI PF_NETBEUI +#define AF_SECURITY PF_SECURITY +#define AF_KEY PF_KEY +#define AF_NETLINK PF_NETLINK +#define AF_ROUTE PF_ROUTE +#define AF_PACKET PF_PACKET +#define AF_MAX PF_MAX + +/* Socket level values.
Others are defined in the appropriate headers. + + XXX These definitions also should go into the appropriate headers as + far as they are available. */ +#define SOL_IPV6 41 +#define SOL_ICMPV6 58 +#define SOL_RAW 255 +#define SOL_AX25 257 +#define SOL_ATALK 258 +#define SOL_NETROM 259 +#define SOL_ROSE 260 +#define SOL_DECNET 261 +#define SOL_X25 262 + +/* Maximum queue length specifiable by listen. */ +#define SOMAXCONN 128 + +/* Get the definition of the macro to define the common sockaddr members. */ +#if __GLIBC_MINOR__ >= 1 +#include <bits/sockaddr.h> +#else +#include <sockaddrcom.h> +#endif + +/* Structure describing a generic socket address. */ +struct sockaddr + { + __SOCKADDR_COMMON (sa_); /* Common data: address family and length. */ + char sa_data[14]; /* Address data. */ + }; + + +/* Bits in the FLAGS argument to `send', `recv', et al. */ +enum + { + MSG_OOB = 0x01, /* Process out-of-band data. */ +#define MSG_OOB MSG_OOB + MSG_PEEK = 0x02, /* Peek at incoming messages. */ +#define MSG_PEEK MSG_PEEK + MSG_DONTROUTE = 0x04, /* Don't use local routing. */ +#define MSG_DONTROUTE MSG_DONTROUTE + MSG_CTRUNC = 0x08, /* Control data lost before delivery. */ +#define MSG_CTRUNC MSG_CTRUNC + MSG_PROXY = 0x10, /* Supply or ask second address. */ +#define MSG_PROXY MSG_PROXY + MSG_TRUNC = 0x20, +#define MSG_TRUNC MSG_TRUNC + MSG_DONTWAIT = 0x40, +#define MSG_DONTWAIT MSG_DONTWAIT + MSG_WAITALL = 0x100, +#define MSG_WAITALL MSG_WAITALL + MSG_ERRQUEUE = 0x2000, +#define MSG_ERRQUEUE MSG_ERRQUEUE + MSG_NOSIGNAL = 0x4000, +#define MSG_NOSIGNAL MSG_NOSIGNAL + }; + + +/* Structure describing messages sent by + `sendmsg' and received by `recvmsg'. */ +struct msghdr + { + __ptr_t msg_name; /* Address to send to/receive from. */ + socklen_t msg_namelen; /* Length of address data. */ + + struct iovec *msg_iov; /* Vector of data to send/receive into. */ + size_t msg_iovlen; /* Number of elements in the vector. 
*/ + + __ptr_t msg_control; /* Ancillary data (eg BSD filedesc passing). */ + size_t msg_controllen; /* Ancillary data buffer length. */ + + int msg_flags; /* Flags on received message. */ + }; + +/* Structure used for storage of ancillary data object information. */ +struct cmsghdr + { + size_t cmsg_len; /* Length of data in cmsg_data plus length + of cmsghdr structure. */ + int cmsg_level; /* Originating protocol. */ + int cmsg_type; /* Protocol specific type. */ +#if !defined __STRICT_ANSI__ && defined __GNUC__ && __GNUC__ >= 2 + unsigned char __cmsg_data[0]; /* Ancillary data. */ +#endif + }; + +/* Ancillary data object manipulation macros. */ +#if !defined __STRICT_ANSI__ && defined __GNUC__ && __GNUC__ >= 2 +# define CMSG_DATA(cmsg) ((cmsg)->__cmsg_data) +#else +# define CMSG_DATA(cmsg) ((unsigned char *) ((struct cmsghdr *) (cmsg) + 1)) +#endif +#define CMSG_NXTHDR(mhdr, cmsg) __cmsg_nxthdr (mhdr, cmsg) +#define CMSG_FIRSTHDR(mhdr) \ + ((size_t) (mhdr)->msg_controllen >= sizeof (struct cmsghdr) \ + ? (struct cmsghdr *) (mhdr)->msg_control : (struct cmsghdr *) NULL) +#define CMSG_ALIGN(len) ( ((len)+sizeof(long)-1) & ~(sizeof(long)-1) ) +#define CMSG_SPACE(len) (CMSG_ALIGN(sizeof(struct cmsghdr)) + CMSG_ALIGN(len)) +#define CMSG_LEN(len) (CMSG_ALIGN(sizeof(struct cmsghdr)) + (len)) + + +#ifndef _EXTERN_INLINE +# define _EXTERN_INLINE extern __inline +#endif +extern struct cmsghdr *__cmsg_nxthdr __P ((struct msghdr *__mhdr, + struct cmsghdr *__cmsg)); +_EXTERN_INLINE struct cmsghdr * +__cmsg_nxthdr (struct msghdr *__mhdr, struct cmsghdr *__cmsg) +{ + if ((size_t) __cmsg->cmsg_len < sizeof (struct cmsghdr)) + /* The kernel header does this so there may be a reason. */ + return NULL; + + __cmsg = (struct cmsghdr *) + ((unsigned char *) __cmsg + CMSG_ALIGN(__cmsg->cmsg_len)); + + if ( (unsigned char *) (__cmsg + 1) >= + (unsigned char *) __mhdr->msg_control + __mhdr->msg_controllen) + /* No more entries. 
*/ + return NULL; + return __cmsg; +} + +/* Socket level message types. This must match the definitions in + <linux/socket.h>. */ +enum + { + SCM_RIGHTS = 0x01, /* Data array contains access rights. */ +#define SCM_RIGHTS SCM_RIGHTS + SCM_CREDENTIALS = 0x02, /* Data array is `struct ucred'. */ +#define SCM_CREDENTIALS SCM_CREDENTIALS + }; + + + +/* Get socket manipulation related informations from kernel headers. */ +#ifdef THIS_IS_CRAP +#ifndef _LINUX_TYPES_H +# define _LINUX_TYPES_H +#endif +#endif + +#include <asm/socket.h> +#include <asm/types.h> + +struct ucred +{ + __u32 pid; + __u32 uid; + __u32 gid; +}; + + +/* Structure used to manipulate the SO_LINGER option. */ +struct linger + { + int l_onoff; /* Nonzero to linger on close. */ + int l_linger; /* Time to linger. */ + }; + +__END_DECLS + +#endif /* socketbits.h */ diff --git a/include/SNAPSHOT.h b/include/SNAPSHOT.h index e69de29b..e8107edf 100644 --- a/include/SNAPSHOT.h +++ b/include/SNAPSHOT.h @@ -0,0 +1 @@ +static char SNAPSHOT[] = "020116"; diff --git a/include/libnetlink.h b/include/libnetlink.h index e69de29b..45d3ad2b 100644 --- a/include/libnetlink.h +++ b/include/libnetlink.h @@ -0,0 +1,46 @@ +#ifndef __LIBNETLINK_H__ +#define __LIBNETLINK_H__ 1 + +#include <asm/types.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> + +struct rtnl_handle +{ + int fd; + struct sockaddr_nl local; + struct sockaddr_nl peer; + __u32 seq; + __u32 dump; +}; + +extern int rtnl_open(struct rtnl_handle *rth, unsigned subscriptions); +extern void rtnl_close(struct rtnl_handle *rth); +extern int rtnl_wilddump_request(struct rtnl_handle *rth, int fam, int type); +extern int rtnl_dump_request(struct rtnl_handle *rth, int type, void *req, int len); +extern int rtnl_dump_filter(struct rtnl_handle *rth, + int (*filter)(struct sockaddr_nl *, struct nlmsghdr *n, void *), + void *arg1, + int (*junk)(struct sockaddr_nl *,struct nlmsghdr *n, void *), + void *arg2); +extern int rtnl_talk(struct rtnl_handle *rtnl, struct 
nlmsghdr *n, pid_t peer, + unsigned groups, struct nlmsghdr *answer, + int (*junk)(struct sockaddr_nl *,struct nlmsghdr *n, void *), + void *jarg); +extern int rtnl_send(struct rtnl_handle *rth, char *buf, int); + + +extern int addattr32(struct nlmsghdr *n, int maxlen, int type, __u32 data); +extern int addattr_l(struct nlmsghdr *n, int maxlen, int type, void *data, int alen); +extern int rta_addattr32(struct rtattr *rta, int maxlen, int type, __u32 data); +extern int rta_addattr_l(struct rtattr *rta, int maxlen, int type, void *data, int alen); + +extern int parse_rtattr(struct rtattr *tb[], int max, struct rtattr *rta, int len); + +extern int rtnl_listen(struct rtnl_handle *, int (*handler)(struct sockaddr_nl *,struct nlmsghdr *n, void *), + void *jarg); +extern int rtnl_from_file(FILE *, int (*handler)(struct sockaddr_nl *,struct nlmsghdr *n, void *), + void *jarg); + +#endif /* __LIBNETLINK_H__ */ + diff --git a/include/ll_map.h b/include/ll_map.h index e69de29b..739f157e 100644 --- a/include/ll_map.h +++ b/include/ll_map.h @@ -0,0 +1,12 @@ +#ifndef __LL_MAP_H__ +#define __LL_MAP_H__ 1 + +extern int ll_remember_index(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg); +extern int ll_init_map(struct rtnl_handle *rth); +extern int ll_name_to_index(char *name); +extern const char *ll_index_to_name(int idx); +extern const char *ll_idx_n2a(int idx, char *buf); +extern int ll_index_to_type(int idx); +extern unsigned ll_index_to_flags(int idx); + +#endif /* __LL_MAP_H__ */ diff --git a/include/rt_names.h b/include/rt_names.h index e69de29b..6ac29204 100644 --- a/include/rt_names.h +++ b/include/rt_names.h @@ -0,0 +1,28 @@ +#ifndef RT_NAMES_H_ +#define RT_NAMES_H_ 1 + +const char* rtnl_rtprot_n2a(int id, char *buf, int len); +const char* rtnl_rtscope_n2a(int id, char *buf, int len); +const char* rtnl_rttable_n2a(int id, char *buf, int len); +const char* rtnl_rtrealm_n2a(int id, char *buf, int len); +const char* rtnl_dsfield_n2a(int id, char *buf, int len); +int 
rtnl_rtprot_a2n(int *id, char *arg); +int rtnl_rtscope_a2n(int *id, char *arg); +int rtnl_rttable_a2n(int *id, char *arg); +int rtnl_rtrealm_a2n(__u32 *id, char *arg); +int rtnl_dsfield_a2n(__u32 *id, char *arg); + +const char *inet_proto_n2a(int proto, char *buf, int len); +int inet_proto_a2n(char *buf); + + +const char * ll_type_n2a(int type, char *buf, int len); + +const char *ll_addr_n2a(unsigned char *addr, int alen, int type, char *buf, int blen); +int ll_addr_a2n(unsigned char *lladdr, int len, char *arg); + +const char * ll_proto_n2a(unsigned short id, char *buf, int len); +int ll_proto_a2n(unsigned short *id, char *buf); + + +#endif diff --git a/include/rtm_map.h b/include/rtm_map.h index e69de29b..70bda7d0 100644 --- a/include/rtm_map.h +++ b/include/rtm_map.h @@ -0,0 +1,10 @@ +#ifndef __RTM_MAP_H__ +#define __RTM_MAP_H__ 1 + +char *rtnl_rtntype_n2a(int id, char *buf, int len); +int rtnl_rtntype_a2n(int *id, char *arg); + +int get_rt_realms(__u32 *realms, char *arg); + + +#endif /* __RTM_MAP_H__ */ diff --git a/include/tcp_diag.h b/include/tcp_diag.h index e69de29b..23014df3 100644 --- a/include/tcp_diag.h +++ b/include/tcp_diag.h @@ -0,0 +1,119 @@ +#ifndef _TCP_DIAG_H_ +#define _TCP_DIAG_H_ 1 + +/* Replace with dynamically allocated value */ +#define NETLINK_TCPDIAG 4 + +/* Just some random number */ +#define TCPDIAG_GETSOCK 18 + +/* Socket identity */ +struct tcpdiag_sockid +{ + __u16 tcpdiag_sport; + __u16 tcpdiag_dport; + __u32 tcpdiag_src[4]; + __u32 tcpdiag_dst[4]; + __u32 tcpdiag_if; + __u32 tcpdiag_cookie[2]; +#define TCPDIAG_NOCOOKIE (~0U) +}; + +/* Request structure */ + +struct tcpdiagreq +{ + __u8 tcpdiag_family; /* Family of addresses.
*/ + __u8 tcpdiag_src_len; + __u8 tcpdiag_dst_len; + __u8 tcpdiag_ext; /* Query extended information */ + + struct tcpdiag_sockid id; + + __u32 tcpdiag_states; /* States to dump */ + __u32 tcpdiag_dbs; /* Tables to dump (NI) */ +}; + +enum +{ + TCPDIAG_REQ_NONE, + TCPDIAG_REQ_BYTECODE, +}; + +#define TCPDIAG_REQ_MAX TCPDIAG_REQ_BYTECODE + +/* Bytecode is sequence of 4 byte commands followed by variable arguments. + * All the commands identified by "code" are conditional jumps forward: + * to offset cc+"yes" or to offset cc+"no". "yes" is supposed to be + * length of the command and its arguments. + */ + +struct tcpdiag_bc_op +{ + unsigned char code; + unsigned char yes; + unsigned short no; +}; + +enum +{ + TCPDIAG_BC_NOP, + TCPDIAG_BC_JMP, + TCPDIAG_BC_S_GE, + TCPDIAG_BC_S_LE, + TCPDIAG_BC_D_GE, + TCPDIAG_BC_D_LE, + TCPDIAG_BC_AUTO, + TCPDIAG_BC_S_COND, + TCPDIAG_BC_D_COND, +}; + +struct tcpdiag_hostcond +{ + __u8 family; + __u8 prefix_len; + int port; + __u32 addr[0]; +}; + +/* Base info structure. It contains socket identity (addrs/ports/cookie) + * and, alas, the information shown by netstat. 
/*
 * A parsed network address or prefix, as filled in by get_addr() and
 * get_prefix() and consumed by the ip/ commands.
 */
typedef struct
{
	__u8 family;	/* address family (AF_INET, AF_INET6, ...) */
	__u8 bytelen;	/* number of bytes of data[] in use; callers pass it as the attribute length */
	__s16 bitlen;	/* prefix length in bits; callers copy it into ifa_prefixlen */
	__u32 data[4];	/* raw address bytes, room for 16 */
} inet_prefix;
get_prefix(inet_prefix *dst, char *arg, int family); + +extern int get_integer(int *val, char *arg, int base); +extern int get_unsigned(unsigned *val, char *arg, int base); +#define get_byte get_u8 +#define get_ushort get_u16 +#define get_short get_s16 +extern int get_u32(__u32 *val, char *arg, int base); +extern int get_u16(__u16 *val, char *arg, int base); +extern int get_s16(__s16 *val, char *arg, int base); +extern int get_u8(__u8 *val, char *arg, int base); +extern int get_s8(__s8 *val, char *arg, int base); + +extern __u8* hexstring_n2a(const __u8 *str, int len, __u8 *buf, int blen); +extern __u8* hexstring_a2n(const __u8 *str, __u8 *buf, int blen); + +extern const char *format_host(int af, int len, void *addr, char *buf, int buflen); +extern const char *rt_addr_n2a(int af, int len, void *addr, char *buf, int buflen); + +void invarg(char *, char *) __attribute__((noreturn)); +void duparg(char *, char *) __attribute__((noreturn)); +void duparg2(char *, char *) __attribute__((noreturn)); +int matches(char *arg, char *pattern); +extern int inet_addr_match(inet_prefix *a, inet_prefix *b, int bits); + +const char *dnet_ntop(int af, const void *addr, char *str, size_t len); +int dnet_pton(int af, const char *src, void *addr); + +const char *ipx_ntop(int af, const void *addr, char *str, size_t len); +int ipx_pton(int af, const char *src, void *addr); + +extern int __iproute2_hz_internal; +extern int __get_hz(void); + +static __inline__ int get_hz(void) +{ + if (__iproute2_hz_internal == 0) + __iproute2_hz_internal = __get_hz(); + return __iproute2_hz_internal; +} + +#endif /* __UTILS_H__ */ diff --git a/ip/Makefile b/ip/Makefile index e69de29b..2aa00518 100644 --- a/ip/Makefile +++ b/ip/Makefile @@ -0,0 +1,22 @@ +IPOBJ=ip.o ipaddress.o iproute.o iprule.o \ + rtm_map.o iptunnel.o ipneigh.o iplink.o ipmaddr.o \ + ipmonitor.o ipmroute.o + +RTMONOBJ=rtmon.o + +ALLOBJ=$(IPOBJ) $(RTMONOBJ) +TARGETS=ip rtmon + +all: $(TARGETS) + +ip: $(IPOBJ) $(LIBNETLINK) $(LIBUTIL) + 
# ABCMaskLen ADDR
#
# Return, as the function's exit status, the classful default prefix length
# for ADDR, chosen from the text before its first dot:
#   1..127   -> 8
#   128..191 -> 16
#   192..223 -> 24
#   empty, non-numeric, 0, or >= 224 -> 0
ABCMaskLen () {
  local class

  class=${1%%.*}

  # An empty or non-numeric first octet used to make the unquoted numeric
  # tests below fail with a syntax error, so every branch was skipped and
  # the function fell through to "return 8".  Reject such input up front.
  case "$class" in
  ""|*[!0-9]*) return 0 ;;
  esac

  if [ "$class" -eq 0 ] || [ "$class" -ge 224 ]; then return 0
  elif [ "$class" -ge 192 ]; then return 24
  elif [ "$class" -ge 128 ]; then return 16
  else return 8; fi
}
1>&2 + exit 1 + fi + pfx="$ipaddr peer $peer" +else + if [ "$pfxlen" = "" ]; then + ABCMaskLen $ipaddr + pfxlen=$? + fi + pfx="$ipaddr/$pfxlen" +fi + +if [ "$ldev" = "$dev" -a "$ipaddr" != "" ]; then + label= +fi + +if [ $deleting -ne 0 ]; then + ip addr del $pfx dev $dev $label || exit 1 + if [ $fwd -eq 0 ]; then RestartRDISC; fi + exit 0 +fi + + +if ! ip link set up dev $dev ; then + echo "Error: cannot enable interface $dev." 1>&2 + exit 1 +fi +if [ "$ipaddr" = "" ]; then exit 0; fi + +if ! arping -q -c 2 -w 3 -D -I $dev $ipaddr ; then + echo "Error: some host already uses address $ipaddr on $dev." 1>&2 + exit 1 +fi + +if ! ip address add $pfx brd + dev $dev $label; then + echo "Error: failed to add $pfx on $dev." 1>&2 + exit 1 +fi + +arping -q -A -c 1 -I $dev $ipaddr +noarp=$? +( sleep 2 ; + arping -q -U -c 1 -I $dev $ipaddr ) >& /dev/null </dev/null & + +ip route add unreachable 224.0.0.0/24 >& /dev/null +ip route add unreachable 255.255.255.255 >& /dev/null +if [ `ip link ls $dev | grep -c MULTICAST` -ge 1 ]; then + ip route add 224.0.0.0/4 dev $dev scope global >& /dev/null +fi + +if [ $fwd -eq 0 ]; then + if [ $noarp -eq 0 ]; then + ip ro append default dev $dev metric 30000 scope global + elif [ "$peer" != "" ]; then + if ping -q -c 2 -w 4 $peer ; then + ip ro append default via $peer dev $dev metric 30001 + fi + fi + RestartRDISC +fi + +exit 0 diff --git a/ip/ip.c b/ip/ip.c index e69de29b..fe379926 100644 --- a/ip/ip.c +++ b/ip/ip.c @@ -0,0 +1,167 @@ +/* + * ip.c "ip" utility frontend. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * + * Changes: + * + * Rani Assaf <rani@magic.metawire.com> 980929: resolve addresses + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <string.h> + +#include "SNAPSHOT.h" +#include "utils.h" +#include "ip_common.h" + +int preferred_family = AF_UNSPEC; +int show_stats = 0; +int resolve_hosts = 0; +int oneline = 0; +char * _SL_ = NULL; + +static void usage(void) __attribute__((noreturn)); + +static void usage(void) +{ + fprintf(stderr, +"Usage: ip [ OPTIONS ] OBJECT { COMMAND | help }\n" +"where OBJECT := { link | addr | route | rule | neigh | tunnel |\n" +" maddr | mroute | monitor }\n" +" OPTIONS := { -V[ersion] | -s[tatistics] | -r[esolve] |\n" +" -f[amily] { inet | inet6 | ipx | dnet | link } | -o[neline] }\n"); + exit(-1); +} + +int main(int argc, char **argv) +{ + char *basename; + + basename = strrchr(argv[0], '/'); + if (basename == NULL) + basename = argv[0]; + else + basename++; + + while (argc > 1) { + char *opt = argv[1]; + if (strcmp(opt,"--") == 0) { + argc--; argv++; + break; + } + if (opt[0] != '-') + break; + if (opt[1] == '-') + opt++; + if (matches(opt, "-family") == 0) { + argc--; + argv++; + if (argc <= 1) + usage(); + if (strcmp(argv[1], "inet") == 0) + preferred_family = AF_INET; + else if (strcmp(argv[1], "inet6") == 0) + preferred_family = AF_INET6; + else if (strcmp(argv[1], "dnet") == 0) + preferred_family = AF_DECnet; + else if (strcmp(argv[1], "link") == 0) + preferred_family = AF_PACKET; + else if (strcmp(argv[1], "ipx") == 0) + preferred_family = AF_IPX; + else if (strcmp(argv[1], "help") == 0) + usage(); + else + invarg(argv[1], "invalid protocol family"); + } else if (strcmp(opt, "-4") == 0) { + preferred_family = AF_INET; + } else if (strcmp(opt, "-6") == 0) { + preferred_family = AF_INET6; + } else if (strcmp(opt, "-0") == 0) { + preferred_family = 
AF_PACKET; + } else if (strcmp(opt, "-I") == 0) { + preferred_family = AF_IPX; + } else if (strcmp(opt, "-D") == 0) { + preferred_family = AF_DECnet; + } else if (matches(opt, "-stats") == 0 || + matches(opt, "-statistics") == 0) { + ++show_stats; + } else if (matches(opt, "-resolve") == 0) { + ++resolve_hosts; + } else if (matches(opt, "-oneline") == 0) { + ++oneline; +#if 0 + } else if (matches(opt, "-numeric") == 0) { + rtnl_names_numeric++; +#endif + } else if (matches(opt, "-Version") == 0) { + printf("ip utility, iproute2-ss%s\n", SNAPSHOT); + exit(0); + } else if (matches(opt, "-help") == 0) { + usage(); + } else { + fprintf(stderr, "Option \"%s\" is unknown, try \"ip -help\".\n", opt); + exit(-1); + } + argc--; argv++; + } + + _SL_ = oneline ? "\\" : "\n" ; + + if (strcmp(basename, "ipaddr") == 0) + return do_ipaddr(argc-1, argv+1); + if (strcmp(basename, "ipmaddr") == 0) + return do_multiaddr(argc-1, argv+1); + if (strcmp(basename, "iproute") == 0) + return do_iproute(argc-1, argv+1); + if (strcmp(basename, "iprule") == 0) + return do_iprule(argc-1, argv+1); + if (strcmp(basename, "ipneigh") == 0) + return do_ipneigh(argc-1, argv+1); + if (strcmp(basename, "iplink") == 0) + return do_iplink(argc-1, argv+1); + if (strcmp(basename, "iptunnel") == 0) + return do_iptunnel(argc-1, argv+1); + if (strcmp(basename, "ipmonitor") == 0) + return do_ipmonitor(argc-1, argv+1); + + if (argc > 1) { + if (matches(argv[1], "address") == 0) + return do_ipaddr(argc-2, argv+2); + if (matches(argv[1], "maddress") == 0) + return do_multiaddr(argc-2, argv+2); + if (matches(argv[1], "route") == 0) + return do_iproute(argc-2, argv+2); + if (matches(argv[1], "rule") == 0) + return do_iprule(argc-2, argv+2); + if (matches(argv[1], "mroute") == 0) + return do_multiroute(argc-2, argv+2); + if (matches(argv[1], "neighbor") == 0 || + matches(argv[1], "neighbour") == 0) + return do_ipneigh(argc-2, argv+2); + if (matches(argv[1], "link") == 0) + return do_iplink(argc-2, argv+2); + if 
(matches(argv[1], "tunnel") == 0 || + strcmp(argv[1], "tunl") == 0) + return do_iptunnel(argc-2, argv+2); + if (matches(argv[1], "monitor") == 0) + return do_ipmonitor(argc-2, argv+2); + if (matches(argv[1], "help") == 0) + usage(); + fprintf(stderr, "Object \"%s\" is unknown, try \"ip help\".\n", argv[1]); + exit(-1); + } + usage(); +} diff --git a/ip/ip_common.h b/ip/ip_common.h index e69de29b..5ac43218 100644 --- a/ip/ip_common.h +++ b/ip/ip_common.h @@ -0,0 +1,20 @@ +extern int print_linkinfo(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg); +extern int print_addrinfo(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg); +extern int print_neigh(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg); +extern int ipaddr_list(int argc, char **argv); +extern int ipaddr_list_link(int argc, char **argv); +extern int iproute_monitor(int argc, char **argv); +extern void iplink_usage(void) __attribute__((noreturn)); +extern void iproute_reset_filter(void); +extern void ipaddr_reset_filter(int); +extern void ipneigh_reset_filter(void); +extern int print_route(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg); +extern int do_ipaddr(int argc, char **argv); +extern int do_iproute(int argc, char **argv); +extern int do_iprule(int argc, char **argv); +extern int do_ipneigh(int argc, char **argv); +extern int do_iptunnel(int argc, char **argv); +extern int do_iplink(int argc, char **argv); +extern int do_ipmonitor(int argc, char **argv); +extern int do_multiaddr(int argc, char **argv); +extern int do_multiroute(int argc, char **argv); diff --git a/ip/ipaddress.c b/ip/ipaddress.c index e69de29b..0d00280c 100644 --- a/ip/ipaddress.c +++ b/ip/ipaddress.c @@ -0,0 +1,898 @@ +/* + * ipaddress.c "ip address". + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Changes: + * Laszlo Valko <valko@linux.karinthy.hu> 990223: address label must be zero terminated + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/ioctl.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/sockios.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> +#include <fnmatch.h> + +#include "rt_names.h" +#include "utils.h" +#include "ll_map.h" +#include "ip_common.h" + +static struct +{ + int ifindex; + int family; + int oneline; + int showqueue; + inet_prefix pfx; + int scope, scopemask; + int flags, flagmask; + int up; + char *label; + int flushed; + char *flushb; + int flushp; + int flushe; + struct rtnl_handle *rth; +} filter; + +static int do_link; + +static void usage(void) __attribute__((noreturn)); + +static void usage(void) +{ + if (do_link) { + iplink_usage(); + } + fprintf(stderr, "Usage: ip addr {add|del} IFADDR dev STRING\n"); + fprintf(stderr, " ip addr {show|flush} [ dev STRING ] [ scope SCOPE-ID ]\n"); + fprintf(stderr, " [ to PREFIX ] [ FLAG-LIST ] [ label PATTERN ]\n"); + fprintf(stderr, "IFADDR := PREFIX | ADDR peer PREFIX\n"); + fprintf(stderr, " [ broadcast ADDR ] [ anycast ADDR ]\n"); + fprintf(stderr, " [ label STRING ] [ scope SCOPE-ID ]\n"); + fprintf(stderr, "SCOPE-ID := [ host | link | global | NUMBER ]\n"); + fprintf(stderr, "FLAG-LIST := [ FLAG-LIST ] FLAG\n"); + fprintf(stderr, "FLAG := [ permanent | dynamic | secondary | primary |\n"); + fprintf(stderr, " tentative | deprecated ]\n"); + exit(-1); +} + +void print_link_flags(FILE *fp, unsigned flags, unsigned mdown) +{ + fprintf(fp, "<"); + flags &= ~IFF_RUNNING; +#define _PF(f) if (flags&IFF_##f) { \ + flags &= ~IFF_##f ; \ + fprintf(fp, #f "%s", flags ? 
/*
 * Query the transmit queue length of interface "name" with the
 * SIOCGIFTXQLEN ioctl and, when it is non-zero, print "qlen N" on stdout
 * (no leading space, no trailing newline).
 *
 * Nothing is printed when the name does not fit in an ifreq or when the
 * helper socket cannot be created; an ioctl failure is reported with
 * perror().
 */
void print_queuelen(char *name)
{
	struct ifreq ifr;
	int s;

	/*
	 * The original copied the name with an unbounded strcpy().  A name
	 * that does not fit in ifr_name (including its NUL) cannot be passed
	 * to the ioctl, so give up before touching anything.
	 */
	if (strlen(name) >= sizeof(ifr.ifr_name))
		return;

	s = socket(AF_INET, SOCK_STREAM, 0);
	if (s < 0)
		return;

	memset(&ifr, 0, sizeof(ifr));
	strcpy(ifr.ifr_name, name);	/* length checked above */
	if (ioctl(s, SIOCGIFTXQLEN, &ifr) < 0) {
		/* tag now spells the request that was actually issued */
		perror("SIOCGIFTXQLEN");
		close(s);
		return;
	}
	close(s);

	if (ifr.ifr_qlen)
		printf("qlen %d", ifr.ifr_qlen);
}
(char*)RTA_DATA(tb[IFLA_IFNAME]) : "<nil>"); + + if (tb[IFLA_LINK]) { + SPRINT_BUF(b1); + int iflink = *(int*)RTA_DATA(tb[IFLA_LINK]); + if (iflink == 0) + fprintf(fp, "@NONE: "); + else { + fprintf(fp, "@%s: ", ll_idx_n2a(iflink, b1)); + m_flag = ll_index_to_flags(iflink); + m_flag = !(m_flag & IFF_UP); + } + } else { + fprintf(fp, ": "); + } + print_link_flags(fp, ifi->ifi_flags, m_flag); + + if (tb[IFLA_MTU]) + fprintf(fp, "mtu %u ", *(int*)RTA_DATA(tb[IFLA_MTU])); + if (tb[IFLA_QDISC]) + fprintf(fp, "qdisc %s ", (char*)RTA_DATA(tb[IFLA_QDISC])); +#ifdef IFLA_MASTER + if (tb[IFLA_MASTER]) { + SPRINT_BUF(b1); + fprintf(fp, "master %s ", ll_idx_n2a(*(int*)RTA_DATA(tb[IFLA_MASTER]), b1)); + } +#endif + if (filter.showqueue) + print_queuelen((char*)RTA_DATA(tb[IFLA_IFNAME])); + + if (!filter.family || filter.family == AF_PACKET) { + SPRINT_BUF(b1); + fprintf(fp, "%s", _SL_); + fprintf(fp, " link/%s ", ll_type_n2a(ifi->ifi_type, b1, sizeof(b1))); + + if (tb[IFLA_ADDRESS]) { + fprintf(fp, "%s", ll_addr_n2a(RTA_DATA(tb[IFLA_ADDRESS]), + RTA_PAYLOAD(tb[IFLA_ADDRESS]), + ifi->ifi_type, + b1, sizeof(b1))); + } + if (tb[IFLA_BROADCAST]) { + if (ifi->ifi_flags&IFF_POINTOPOINT) + fprintf(fp, " peer "); + else + fprintf(fp, " brd "); + fprintf(fp, "%s", ll_addr_n2a(RTA_DATA(tb[IFLA_BROADCAST]), + RTA_PAYLOAD(tb[IFLA_BROADCAST]), + ifi->ifi_type, + b1, sizeof(b1))); + } + } + if (do_link && tb[IFLA_STATS] && show_stats) { + struct net_device_stats slocal; + struct net_device_stats *s = RTA_DATA(tb[IFLA_STATS]); + if (((unsigned long)s) & (sizeof(unsigned long)-1)) { + memcpy(&slocal, s, sizeof(slocal)); + s = &slocal; + } + fprintf(fp, "%s", _SL_); + fprintf(fp, " RX: bytes packets errors dropped overrun mcast %s%s", + s->rx_compressed ? 
"compressed" : "", _SL_); + fprintf(fp, " %-10lu %-8lu %-7lu %-7lu %-7lu %-7lu", + s->rx_bytes, s->rx_packets, s->rx_errors, + s->rx_dropped, s->rx_over_errors, + s->multicast + ); + if (s->rx_compressed) + fprintf(fp, " %-7lu", s->rx_compressed); + if (show_stats > 1) { + fprintf(fp, "%s", _SL_); + fprintf(fp, " RX errors: length crc frame fifo missed%s", _SL_); + fprintf(fp, " %-7lu %-7lu %-7lu %-7lu %-7lu", + s->rx_length_errors, + s->rx_crc_errors, + s->rx_frame_errors, + s->rx_fifo_errors, + s->rx_missed_errors + ); + } + fprintf(fp, "%s", _SL_); + fprintf(fp, " TX: bytes packets errors dropped carrier collsns %s%s", + s->tx_compressed ? "compressed" : "", _SL_); + fprintf(fp, " %-10lu %-8lu %-7lu %-7lu %-7lu %-7lu", + s->tx_bytes, s->tx_packets, s->tx_errors, + s->tx_dropped, s->tx_carrier_errors, s->collisions); + if (s->tx_compressed) + fprintf(fp, " %-7lu", s->tx_compressed); + if (show_stats > 1) { + fprintf(fp, "%s", _SL_); + fprintf(fp, " TX errors: aborted fifo window heartbeat%s", _SL_); + fprintf(fp, " %-7lu %-7lu %-7lu %-7lu", + s->tx_aborted_errors, + s->tx_fifo_errors, + s->tx_window_errors, + s->tx_heartbeat_errors + ); + } + } + fprintf(fp, "\n"); + fflush(fp); + return 0; +} + +static int flush_update(void) +{ + if (rtnl_send(filter.rth, filter.flushb, filter.flushp) < 0) { + perror("Failed to send flush request\n"); + return -1; + } + filter.flushp = 0; + return 0; +} + +int print_addrinfo(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) +{ + FILE *fp = (FILE*)arg; + struct ifaddrmsg *ifa = NLMSG_DATA(n); + int len = n->nlmsg_len; + struct rtattr * rta_tb[IFA_MAX+1]; + char abuf[256]; + SPRINT_BUF(b1); + + if (n->nlmsg_type != RTM_NEWADDR && n->nlmsg_type != RTM_DELADDR) + return 0; + len -= NLMSG_LENGTH(sizeof(*ifa)); + if (len < 0) { + fprintf(stderr, "BUG: wrong nlmsg len %d\n", len); + return -1; + } + + if (filter.flushb && n->nlmsg_type != RTM_NEWADDR) + return 0; + + memset(rta_tb, 0, sizeof(rta_tb)); + parse_rtattr(rta_tb, 
IFA_MAX, IFA_RTA(ifa), n->nlmsg_len - NLMSG_LENGTH(sizeof(*ifa))); + + if (!rta_tb[IFA_LOCAL]) + rta_tb[IFA_LOCAL] = rta_tb[IFA_ADDRESS]; + if (!rta_tb[IFA_ADDRESS]) + rta_tb[IFA_ADDRESS] = rta_tb[IFA_LOCAL]; + + if (filter.ifindex && filter.ifindex != ifa->ifa_index) + return 0; + if ((filter.scope^ifa->ifa_scope)&filter.scopemask) + return 0; + if ((filter.flags^ifa->ifa_flags)&filter.flagmask) + return 0; + if (filter.label) { + SPRINT_BUF(b1); + const char *label; + if (rta_tb[IFA_LABEL]) + label = RTA_DATA(rta_tb[IFA_LABEL]); + else + label = ll_idx_n2a(ifa->ifa_index, b1); + if (fnmatch(filter.label, label, 0) != 0) + return 0; + } + if (filter.pfx.family) { + if (rta_tb[IFA_LOCAL]) { + inet_prefix dst; + memset(&dst, 0, sizeof(dst)); + dst.family = ifa->ifa_family; + memcpy(&dst.data, RTA_DATA(rta_tb[IFA_LOCAL]), RTA_PAYLOAD(rta_tb[IFA_LOCAL])); + if (inet_addr_match(&dst, &filter.pfx, filter.pfx.bitlen)) + return 0; + } + } + + if (filter.flushb) { + struct nlmsghdr *fn; + if (NLMSG_ALIGN(filter.flushp) + n->nlmsg_len > filter.flushe) { + if (flush_update()) + return -1; + } + fn = (struct nlmsghdr*)(filter.flushb + NLMSG_ALIGN(filter.flushp)); + memcpy(fn, n, n->nlmsg_len); + fn->nlmsg_type = RTM_DELADDR; + fn->nlmsg_flags = NLM_F_REQUEST; + fn->nlmsg_seq = ++filter.rth->seq; + filter.flushp = (((char*)fn) + n->nlmsg_len) - filter.flushb; + filter.flushed++; + if (show_stats < 2) + return 0; + } + + if (n->nlmsg_type == RTM_DELADDR) + fprintf(fp, "Deleted "); + + if (filter.oneline || filter.flushb) + fprintf(fp, "%u: %s", ifa->ifa_index, ll_index_to_name(ifa->ifa_index)); + if (ifa->ifa_family == AF_INET) + fprintf(fp, " inet "); + else if (ifa->ifa_family == AF_INET6) + fprintf(fp, " inet6 "); + else if (ifa->ifa_family == AF_DECnet) + fprintf(fp, " dnet "); + else if (ifa->ifa_family == AF_IPX) + fprintf(fp, " ipx "); + else + fprintf(fp, " family %d ", ifa->ifa_family); + + if (rta_tb[IFA_LOCAL]) { + fprintf(fp, "%s", rt_addr_n2a(ifa->ifa_family, + 
RTA_PAYLOAD(rta_tb[IFA_LOCAL]), + RTA_DATA(rta_tb[IFA_LOCAL]), + abuf, sizeof(abuf))); + + if (rta_tb[IFA_ADDRESS] == NULL || + memcmp(RTA_DATA(rta_tb[IFA_ADDRESS]), RTA_DATA(rta_tb[IFA_LOCAL]), 4) == 0) { + fprintf(fp, "/%d ", ifa->ifa_prefixlen); + } else { + fprintf(fp, " peer %s/%d ", + rt_addr_n2a(ifa->ifa_family, + RTA_PAYLOAD(rta_tb[IFA_ADDRESS]), + RTA_DATA(rta_tb[IFA_ADDRESS]), + abuf, sizeof(abuf)), + ifa->ifa_prefixlen); + } + } + + if (rta_tb[IFA_BROADCAST]) { + fprintf(fp, "brd %s ", + rt_addr_n2a(ifa->ifa_family, + RTA_PAYLOAD(rta_tb[IFA_BROADCAST]), + RTA_DATA(rta_tb[IFA_BROADCAST]), + abuf, sizeof(abuf))); + } + if (rta_tb[IFA_ANYCAST]) { + fprintf(fp, "any %s ", + rt_addr_n2a(ifa->ifa_family, + RTA_PAYLOAD(rta_tb[IFA_ANYCAST]), + RTA_DATA(rta_tb[IFA_ANYCAST]), + abuf, sizeof(abuf))); + } + fprintf(fp, "scope %s ", rtnl_rtscope_n2a(ifa->ifa_scope, b1, sizeof(b1))); + if (ifa->ifa_flags&IFA_F_SECONDARY) { + ifa->ifa_flags &= ~IFA_F_SECONDARY; + fprintf(fp, "secondary "); + } + if (ifa->ifa_flags&IFA_F_TENTATIVE) { + ifa->ifa_flags &= ~IFA_F_TENTATIVE; + fprintf(fp, "tentative "); + } + if (ifa->ifa_flags&IFA_F_DEPRECATED) { + ifa->ifa_flags &= ~IFA_F_DEPRECATED; + fprintf(fp, "deprecated "); + } + if (!(ifa->ifa_flags&IFA_F_PERMANENT)) { + fprintf(fp, "dynamic "); + } else + ifa->ifa_flags &= ~IFA_F_PERMANENT; + if (ifa->ifa_flags) + fprintf(fp, "flags %02x ", ifa->ifa_flags); + if (rta_tb[IFA_LABEL]) + fprintf(fp, "%s", (char*)RTA_DATA(rta_tb[IFA_LABEL])); + if (rta_tb[IFA_CACHEINFO]) { + struct ifa_cacheinfo *ci = RTA_DATA(rta_tb[IFA_CACHEINFO]); + char buf[128]; + fprintf(fp, "%s", _SL_); + if (ci->ifa_valid == 0xFFFFFFFFU) + sprintf(buf, "valid_lft forever"); + else + sprintf(buf, "valid_lft %dsec", ci->ifa_valid); + if (ci->ifa_prefered == 0xFFFFFFFFU) + sprintf(buf+strlen(buf), " preferred_lft forever"); + else + sprintf(buf+strlen(buf), " preferred_lft %dsec", ci->ifa_prefered); + fprintf(fp, " %s", buf); + } + fprintf(fp, "\n"); + fflush(fp); 
+ return 0; +} + + +struct nlmsg_list +{ + struct nlmsg_list *next; + struct nlmsghdr h; +}; + +int print_selected_addrinfo(int ifindex, struct nlmsg_list *ainfo, FILE *fp) +{ + for ( ;ainfo ; ainfo = ainfo->next) { + struct nlmsghdr *n = &ainfo->h; + struct ifaddrmsg *ifa = NLMSG_DATA(n); + + if (n->nlmsg_type != RTM_NEWADDR) + continue; + + if (n->nlmsg_len < NLMSG_LENGTH(sizeof(ifa))) + return -1; + + if (ifa->ifa_index != ifindex || + (filter.family && filter.family != ifa->ifa_family)) + continue; + + print_addrinfo(NULL, n, fp); + } + return 0; +} + + +int store_nlmsg(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) +{ + struct nlmsg_list **linfo = (struct nlmsg_list**)arg; + struct nlmsg_list *h; + struct nlmsg_list **lp; + + h = malloc(n->nlmsg_len+sizeof(void*)); + if (h == NULL) + return -1; + + memcpy(&h->h, n, n->nlmsg_len); + h->next = NULL; + + for (lp = linfo; *lp; lp = &(*lp)->next) /* NOTHING */; + *lp = h; + + ll_remember_index(who, n, NULL); + return 0; +} + +int ipaddr_list_or_flush(int argc, char **argv, int flush) +{ + struct nlmsg_list *linfo = NULL; + struct nlmsg_list *ainfo = NULL; + struct nlmsg_list *l; + struct rtnl_handle rth; + char *filter_dev = NULL; + int no_link = 0; + + ipaddr_reset_filter(oneline); + filter.showqueue = 1; + + if (filter.family == AF_UNSPEC) + filter.family = preferred_family; + + if (flush) { + if (argc <= 0) { + fprintf(stderr, "Flush requires arguments.\n"); + return -1; + } + if (filter.family == AF_PACKET) { + fprintf(stderr, "Cannot flush link addresses.\n"); + return -1; + } + } + + while (argc > 0) { + if (strcmp(*argv, "to") == 0) { + NEXT_ARG(); + get_prefix(&filter.pfx, *argv, filter.family); + if (filter.family == AF_UNSPEC) + filter.family = filter.pfx.family; + } else if (strcmp(*argv, "scope") == 0) { + int scope = 0; + NEXT_ARG(); + filter.scopemask = -1; + if (rtnl_rtscope_a2n(&scope, *argv)) { + if (strcmp(*argv, "all") != 0) + invarg("invalid \"scope\"\n", *argv); + scope = 
RT_SCOPE_NOWHERE; + filter.scopemask = 0; + } + filter.scope = scope; + } else if (strcmp(*argv, "up") == 0) { + filter.up = 1; + } else if (strcmp(*argv, "dynamic") == 0) { + filter.flags &= ~IFA_F_PERMANENT; + filter.flagmask |= IFA_F_PERMANENT; + } else if (strcmp(*argv, "permanent") == 0) { + filter.flags |= IFA_F_PERMANENT; + filter.flagmask |= IFA_F_PERMANENT; + } else if (strcmp(*argv, "secondary") == 0) { + filter.flags |= IFA_F_SECONDARY; + filter.flagmask |= IFA_F_SECONDARY; + } else if (strcmp(*argv, "primary") == 0) { + filter.flags &= ~IFA_F_SECONDARY; + filter.flagmask |= IFA_F_SECONDARY; + } else if (strcmp(*argv, "tentative") == 0) { + filter.flags |= IFA_F_TENTATIVE; + filter.flagmask |= IFA_F_TENTATIVE; + } else if (strcmp(*argv, "deprecated") == 0) { + filter.flags |= IFA_F_DEPRECATED; + filter.flagmask |= IFA_F_DEPRECATED; + } else if (strcmp(*argv, "label") == 0) { + NEXT_ARG(); + filter.label = *argv; + } else { + if (strcmp(*argv, "dev") == 0) { + NEXT_ARG(); + } + if (matches(*argv, "help") == 0) + usage(); + if (filter_dev) + duparg2("dev", *argv); + filter_dev = *argv; + } + argv++; argc--; + } + + if (rtnl_open(&rth, 0) < 0) + exit(1); + + if (rtnl_wilddump_request(&rth, preferred_family, RTM_GETLINK) < 0) { + perror("Cannot send dump request"); + exit(1); + } + + if (rtnl_dump_filter(&rth, store_nlmsg, &linfo, NULL, NULL) < 0) { + fprintf(stderr, "Dump terminated\n"); + exit(1); + } + + if (filter_dev) { + filter.ifindex = ll_name_to_index(filter_dev); + if (filter.ifindex <= 0) { + fprintf(stderr, "Device \"%s\" does not exist.\n", filter_dev); + return -1; + } + } + + if (flush) { + int round = 0; + char flushb[4096-512]; + + filter.flushb = flushb; + filter.flushp = 0; + filter.flushe = sizeof(flushb); + filter.rth = &rth; + + for (;;) { + if (rtnl_wilddump_request(&rth, filter.family, RTM_GETADDR) < 0) { + perror("Cannot send dump request"); + exit(1); + } + filter.flushed = 0; + if (rtnl_dump_filter(&rth, print_addrinfo, stdout, 
NULL, NULL) < 0) { + fprintf(stderr, "Flush terminated\n"); + exit(1); + } + if (filter.flushed == 0) { + if (round == 0) { + fprintf(stderr, "Nothing to flush.\n"); + } else if (show_stats) + printf("*** Flush is complete after %d round%s ***\n", round, round>1?"s":""); + fflush(stdout); + return 0; + } + round++; + if (flush_update() < 0) + exit(1); + if (show_stats) { + printf("\n*** Round %d, deleting %d addresses ***\n", round, filter.flushed); + fflush(stdout); + } + } + } + + if (filter.family != AF_PACKET) { + if (rtnl_wilddump_request(&rth, filter.family, RTM_GETADDR) < 0) { + perror("Cannot send dump request"); + exit(1); + } + + if (rtnl_dump_filter(&rth, store_nlmsg, &ainfo, NULL, NULL) < 0) { + fprintf(stderr, "Dump terminated\n"); + exit(1); + } + } + + + if (filter.family && filter.family != AF_PACKET) { + struct nlmsg_list **lp; + lp=&linfo; + + if (filter.oneline) + no_link = 1; + + while ((l=*lp)!=NULL) { + int ok = 0; + struct ifinfomsg *ifi = NLMSG_DATA(&l->h); + struct nlmsg_list *a; + + for (a=ainfo; a; a=a->next) { + struct nlmsghdr *n = &a->h; + struct ifaddrmsg *ifa = NLMSG_DATA(n); + + if (ifa->ifa_index != ifi->ifi_index || + (filter.family && filter.family != ifa->ifa_family)) + continue; + if ((filter.scope^ifa->ifa_scope)&filter.scopemask) + continue; + if ((filter.flags^ifa->ifa_flags)&filter.flagmask) + continue; + if (filter.pfx.family || filter.label) { + struct rtattr *tb[IFA_MAX+1]; + memset(tb, 0, sizeof(tb)); + parse_rtattr(tb, IFA_MAX, IFA_RTA(ifa), IFA_PAYLOAD(n)); + if (!tb[IFA_LOCAL]) + tb[IFA_LOCAL] = tb[IFA_ADDRESS]; + + if (filter.pfx.family && tb[IFA_LOCAL]) { + inet_prefix dst; + memset(&dst, 0, sizeof(dst)); + dst.family = ifa->ifa_family; + memcpy(&dst.data, RTA_DATA(tb[IFA_LOCAL]), RTA_PAYLOAD(tb[IFA_LOCAL])); + if (inet_addr_match(&dst, &filter.pfx, filter.pfx.bitlen)) + continue; + } + if (filter.label) { + SPRINT_BUF(b1); + const char *label; + if (tb[IFA_LABEL]) + label = RTA_DATA(tb[IFA_LABEL]); + else + label 
= ll_idx_n2a(ifa->ifa_index, b1); + if (fnmatch(filter.label, label, 0) != 0) + continue; + } + } + + ok = 1; + break; + } + if (!ok) + *lp = l->next; + else + lp = &l->next; + } + } + + for (l=linfo; l; l = l->next) { + if (no_link || print_linkinfo(NULL, &l->h, stdout) == 0) { + struct ifinfomsg *ifi = NLMSG_DATA(&l->h); + if (filter.family != AF_PACKET) + print_selected_addrinfo(ifi->ifi_index, ainfo, stdout); + } + fflush(stdout); + } + + exit(0); +} + +int ipaddr_list_link(int argc, char **argv) +{ + preferred_family = AF_PACKET; + do_link = 1; + return ipaddr_list_or_flush(argc, argv, 0); +} + +void ipaddr_reset_filter(int oneline) +{ + memset(&filter, 0, sizeof(filter)); + filter.oneline = oneline; +} + +int default_scope(inet_prefix *lcl) +{ + if (lcl->family == AF_INET) { + if (lcl->bytelen >= 1 && *(__u8*)&lcl->data == 127) + return RT_SCOPE_HOST; + } + return 0; +} + +int ipaddr_modify(int cmd, int argc, char **argv) +{ + struct rtnl_handle rth; + struct { + struct nlmsghdr n; + struct ifaddrmsg ifa; + char buf[256]; + } req; + char *d = NULL; + char *l = NULL; + inet_prefix lcl; + inet_prefix peer; + int local_len = 0; + int peer_len = 0; + int brd_len = 0; + int any_len = 0; + int scoped = 0; + + memset(&req, 0, sizeof(req)); + + req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg)); + req.n.nlmsg_flags = NLM_F_REQUEST; + req.n.nlmsg_type = cmd; + req.ifa.ifa_family = preferred_family; + + while (argc > 0) { + if (strcmp(*argv, "peer") == 0 || + strcmp(*argv, "remote") == 0) { + NEXT_ARG(); + + if (peer_len) + duparg("peer", *argv); + get_prefix(&peer, *argv, req.ifa.ifa_family); + peer_len = peer.bytelen; + if (req.ifa.ifa_family == AF_UNSPEC) + req.ifa.ifa_family = peer.family; + addattr_l(&req.n, sizeof(req), IFA_ADDRESS, &peer.data, peer.bytelen); + req.ifa.ifa_prefixlen = peer.bitlen; + } else if (matches(*argv, "broadcast") == 0 || + strcmp(*argv, "brd") == 0) { + inet_prefix addr; + NEXT_ARG(); + if (brd_len) + duparg("broadcast", *argv); + 
if (strcmp(*argv, "+") == 0)
+				brd_len = -1;	/* marker: broadcast derived later, host bits set */
+			else if (strcmp(*argv, "-") == 0)
+				brd_len = -2;	/* marker: broadcast derived later, host bits cleared */
+			else {
+				get_addr(&addr, *argv, req.ifa.ifa_family);
+				if (req.ifa.ifa_family == AF_UNSPEC)
+					req.ifa.ifa_family = addr.family;
+				addattr_l(&req.n, sizeof(req), IFA_BROADCAST, &addr.data, addr.bytelen);
+				brd_len = addr.bytelen;
+			}
+		} else if (strcmp(*argv, "anycast") == 0) {
+			inet_prefix addr;
+			NEXT_ARG();
+			if (any_len)
+				duparg("anycast", *argv);
+			get_addr(&addr, *argv, req.ifa.ifa_family);
+			if (req.ifa.ifa_family == AF_UNSPEC)
+				req.ifa.ifa_family = addr.family;
+			addattr_l(&req.n, sizeof(req), IFA_ANYCAST, &addr.data, addr.bytelen);
+			any_len = addr.bytelen;
+		} else if (strcmp(*argv, "scope") == 0) {
+			int scope = 0;
+			NEXT_ARG();
+			/*
+			 * Message first, offending argument second: the order used
+			 * by the other invarg() callers in this patch.
+			 */
+			if (rtnl_rtscope_a2n(&scope, *argv))
+				invarg("invalid scope value.", *argv);
+			req.ifa.ifa_scope = scope;
+			scoped = 1;
+		} else if (strcmp(*argv, "dev") == 0) {
+			NEXT_ARG();
+			d = *argv;
+		} else if (strcmp(*argv, "label") == 0) {
+			NEXT_ARG();
+			l = *argv;
+			addattr_l(&req.n, sizeof(req), IFA_LABEL, l, strlen(l)+1);
+		} else {
+			/* Anything else is the local address, optionally introduced by "local". */
+			if (strcmp(*argv, "local") == 0) {
+				NEXT_ARG();
+			}
+			if (matches(*argv, "help") == 0)
+				usage();
+			if (local_len)
+				duparg2("local", *argv);
+			get_prefix(&lcl, *argv, req.ifa.ifa_family);
+			if (req.ifa.ifa_family == AF_UNSPEC)
+				req.ifa.ifa_family = lcl.family;
+			addattr_l(&req.n, sizeof(req), IFA_LOCAL, &lcl.data, lcl.bytelen);
+			local_len = lcl.bytelen;
+		}
+		argc--; argv++;
+	}
+	if (d == NULL) {
+		fprintf(stderr, "Not enough information: \"dev\" argument is required.\n");
+		return -1;
+	}
+	if (l && matches(d, l) != 0) {
+		fprintf(stderr, "\"dev\" (%s) must match \"label\" (%s).\n", d, l);
+		exit(1);
+	}
+
+	/* Without an explicit peer the local address doubles as IFA_ADDRESS. */
+	if (peer_len == 0 && local_len && cmd != RTM_DELADDR) {
+		peer = lcl;
+		addattr_l(&req.n, sizeof(req), IFA_ADDRESS, &lcl.data, lcl.bytelen);
+	}
+	/* lcl is only initialized once a local address was parsed (local_len != 0). */
+	if (req.ifa.ifa_prefixlen == 0 && local_len)
+		req.ifa.ifa_prefixlen = lcl.bitlen;
+
+	if (brd_len < 0 && cmd != RTM_DELADDR) {
+		inet_prefix brd;
+		int i;
+ if (req.ifa.ifa_family != AF_INET) { + fprintf(stderr, "Broadcast can be set only for IPv4 addresses\n"); + return -1; + } + brd = peer; + if (brd.bitlen <= 30) { + for (i=31; i>=brd.bitlen; i--) { + if (brd_len == -1) + brd.data[0] |= htonl(1<<(31-i)); + else + brd.data[0] &= ~htonl(1<<(31-i)); + } + addattr_l(&req.n, sizeof(req), IFA_BROADCAST, &brd.data, brd.bytelen); + brd_len = brd.bytelen; + } + } + if (!scoped && cmd != RTM_DELADDR) + req.ifa.ifa_scope = default_scope(&lcl); + + if (rtnl_open(&rth, 0) < 0) + exit(1); + + ll_init_map(&rth); + + if ((req.ifa.ifa_index = ll_name_to_index(d)) == 0) { + fprintf(stderr, "Cannot find device \"%s\"\n", d); + return -1; + } + + if (rtnl_talk(&rth, &req.n, 0, 0, NULL, NULL, NULL) < 0) + exit(2); + + exit(0); +} + +int do_ipaddr(int argc, char **argv) +{ + if (argc < 1) + return ipaddr_list_or_flush(0, NULL, 0); + if (matches(*argv, "add") == 0) + return ipaddr_modify(RTM_NEWADDR, argc-1, argv+1); + if (matches(*argv, "delete") == 0) + return ipaddr_modify(RTM_DELADDR, argc-1, argv+1); + if (matches(*argv, "list") == 0 || matches(*argv, "show") == 0 + || matches(*argv, "lst") == 0) + return ipaddr_list_or_flush(argc-1, argv+1, 0); + if (matches(*argv, "flush") == 0) + return ipaddr_list_or_flush(argc-1, argv+1, 1); + if (matches(*argv, "help") == 0) + usage(); + fprintf(stderr, "Command \"%s\" is unknown, try \"ip address help\".\n", *argv); + exit(-1); +} + diff --git a/ip/iplink.c b/ip/iplink.c index e69de29b..1fc3dcfd 100644 --- a/ip/iplink.c +++ b/ip/iplink.c @@ -0,0 +1,397 @@ +/* + * iplink.c "ip link". + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <errno.h> +#include <sys/socket.h> +#include <linux/if.h> +#include <linux/if_packet.h> +#include <linux/if_ether.h> +#include <linux/sockios.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> +#include <sys/ioctl.h> +#include <linux/sockios.h> + +#include "rt_names.h" +#include "utils.h" +#include "ip_common.h" + + +static void usage(void) __attribute__((noreturn)); + +void iplink_usage(void) +{ + fprintf(stderr, "Usage: ip link set DEVICE { up | down | arp { on | off } |\n"); + fprintf(stderr, " dynamic { on | off } |\n"); + fprintf(stderr, " multicast { on | off } | txqueuelen PACKETS |\n"); + fprintf(stderr, " name NEWNAME |\n"); + fprintf(stderr, " address LLADDR | broadcast LLADDR |\n"); + fprintf(stderr, " mtu MTU }\n"); + fprintf(stderr, " ip link show [ DEVICE ]\n"); + exit(-1); +} + +static void usage(void) +{ + iplink_usage(); +} + +static int on_off(char *msg) +{ + fprintf(stderr, "Error: argument of \"%s\" must be \"on\" or \"off\"\n", msg); + return -1; +} + +static int get_ctl_fd(void) +{ + int s_errno; + int fd; + + fd = socket(PF_INET, SOCK_DGRAM, 0); + if (fd >= 0) + return fd; + s_errno = errno; + fd = socket(PF_PACKET, SOCK_DGRAM, 0); + if (fd >= 0) + return fd; + fd = socket(PF_INET6, SOCK_DGRAM, 0); + if (fd >= 0) + return fd; + errno = s_errno; + perror("Cannot create control socket"); + return -1; +} + +static int do_chflags(char *dev, __u32 flags, __u32 mask) +{ + struct ifreq ifr; + int fd; + int err; + + strcpy(ifr.ifr_name, dev); + fd = get_ctl_fd(); + if (fd < 0) + return -1; + err = ioctl(fd, SIOCGIFFLAGS, &ifr); + if (err) { + perror("SIOCGIFFLAGS"); + close(fd); + return -1; + } + if ((ifr.ifr_flags^flags)&mask) { + ifr.ifr_flags &= ~mask; + ifr.ifr_flags |= mask&flags; + err = ioctl(fd, SIOCSIFFLAGS, &ifr); + if (err) + 
perror("SIOCSIFFLAGS");
+	}
+	close(fd);
+	return err;
+}
+
+/*
+ * Rename interface "dev" to "newdev" with the SIOCSIFNAME ioctl.
+ * Returns 0 on success, -1 on failure (the failing step is reported
+ * on stderr).
+ */
+static int do_changename(char *dev, char *newdev)
+{
+	struct ifreq ifr;
+	int fd;
+	int err;
+
+	/*
+	 * Zero the request first so the bounded copies below are always
+	 * NUL-terminated; names longer than IFNAMSIZ-1 are truncated
+	 * instead of overrunning the ifreq.
+	 */
+	memset(&ifr, 0, sizeof(ifr));
+	strncpy(ifr.ifr_name, dev, IFNAMSIZ - 1);
+	strncpy(ifr.ifr_newname, newdev, IFNAMSIZ - 1);
+	fd = get_ctl_fd();
+	if (fd < 0)
+		return -1;
+	err = ioctl(fd, SIOCSIFNAME, &ifr);
+	if (err) {
+		perror("SIOCSIFNAME");
+		close(fd);
+		return -1;
+	}
+	close(fd);
+	return err;
+}
+
+/*
+ * Set the transmit queue length of "dev" with SIOCSIFTXQLEN.
+ * Returns 0 on success, -1 on failure.
+ */
+static int set_qlen(char *dev, int qlen)
+{
+	struct ifreq ifr;
+	int s;
+
+	s = get_ctl_fd();
+	if (s < 0)
+		return -1;
+
+	memset(&ifr, 0, sizeof(ifr));
+	strncpy(ifr.ifr_name, dev, IFNAMSIZ - 1);
+	ifr.ifr_qlen = qlen;
+	if (ioctl(s, SIOCSIFTXQLEN, &ifr) < 0) {
+		perror("SIOCSIFTXQLEN");
+		close(s);
+		return -1;
+	}
+	close(s);
+
+	return 0;
+}
+
+/*
+ * Set the MTU of "dev" with SIOCSIFMTU.
+ * Returns 0 on success, -1 on failure.
+ */
+static int set_mtu(char *dev, int mtu)
+{
+	struct ifreq ifr;
+	int s;
+
+	s = get_ctl_fd();
+	if (s < 0)
+		return -1;
+
+	memset(&ifr, 0, sizeof(ifr));
+	strncpy(ifr.ifr_name, dev, IFNAMSIZ - 1);
+	ifr.ifr_mtu = mtu;
+	if (ioctl(s, SIOCSIFMTU, &ifr) < 0) {
+		perror("SIOCSIFMTU");
+		close(s);
+		return -1;
+	}
+	close(s);
+
+	return 0;
+}
+
+/*
+ * Look up the link-layer address format of "dev": bind a packet socket
+ * to the interface and read the hardware type and address length back
+ * through getsockname().
+ * Stores the hardware type in *htype and returns the address length,
+ * or -1 on failure.
+ */
+static int get_address(char *dev, int *htype)
+{
+	struct ifreq ifr;
+	struct sockaddr_ll me;
+	socklen_t alen;		/* getsockname() takes a socklen_t *, not an int * */
+	int s;
+
+	s = socket(PF_PACKET, SOCK_DGRAM, 0);
+	if (s < 0) {
+		perror("socket(PF_PACKET)");
+		return -1;
+	}
+
+	memset(&ifr, 0, sizeof(ifr));
+	strncpy(ifr.ifr_name, dev, IFNAMSIZ - 1);
+	if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
+		perror("SIOCGIFINDEX");
+		close(s);
+		return -1;
+	}
+
+	memset(&me, 0, sizeof(me));
+	me.sll_family = AF_PACKET;
+	me.sll_ifindex = ifr.ifr_ifindex;
+	me.sll_protocol = htons(ETH_P_LOOP);
+	if (bind(s, (struct sockaddr*)&me, sizeof(me)) == -1) {
+		perror("bind");
+		close(s);
+		return -1;
+	}
+
+	alen = sizeof(me);
+	if (getsockname(s, (struct sockaddr*)&me, &alen) == -1) {
+		perror("getsockname");
+		close(s);
+		return -1;
+	}
+	close(s);
+	*htype = me.sll_hatype;
+	return me.sll_halen;
+}
+
+static int parse_address(char *dev, int hatype, int halen, char *lla, struct ifreq *ifr)
+{
+	int
alen; + + memset(ifr, 0, sizeof(*ifr)); + strcpy(ifr->ifr_name, dev); + ifr->ifr_hwaddr.sa_family = hatype; + alen = ll_addr_a2n(ifr->ifr_hwaddr.sa_data, 14, lla); + if (alen < 0) + return -1; + if (alen != halen) { + fprintf(stderr, "Wrong address (%s) length: expected %d bytes\n", lla, halen); + return -1; + } + return 0; +} + +static int set_address(struct ifreq *ifr, int brd) +{ + int s; + + s = get_ctl_fd(); + if (s < 0) + return -1; + if (ioctl(s, brd?SIOCSIFHWBROADCAST:SIOCSIFHWADDR, ifr) < 0) { + perror(brd?"SIOCSIFHWBROADCAST":"SIOCSIFHWADDR"); + close(s); + return -1; + } + close(s); + return 0; +} + + +static int do_set(int argc, char **argv) +{ + char *dev = NULL; + __u32 mask = 0; + __u32 flags = 0; + int qlen = -1; + int mtu = -1; + char *newaddr = NULL; + char *newbrd = NULL; + struct ifreq ifr0, ifr1; + char *newname = NULL; + int htype, halen; + + while (argc > 0) { + if (strcmp(*argv, "up") == 0) { + mask |= IFF_UP; + flags |= IFF_UP; + } else if (strcmp(*argv, "down") == 0) { + mask |= IFF_UP; + flags &= ~IFF_UP; + } else if (strcmp(*argv, "name") == 0) { + NEXT_ARG(); + newname = *argv; + } else if (matches(*argv, "address") == 0) { + NEXT_ARG(); + newaddr = *argv; + } else if (matches(*argv, "broadcast") == 0 || + strcmp(*argv, "brd") == 0) { + NEXT_ARG(); + newbrd = *argv; + } else if (matches(*argv, "txqueuelen") == 0 || + strcmp(*argv, "qlen") == 0 || + matches(*argv, "txqlen") == 0) { + NEXT_ARG(); + if (qlen != -1) + duparg("txqueuelen", *argv); + if (get_integer(&qlen, *argv, 0)) + invarg("Invalid \"txqueuelen\" value\n", *argv); + } else if (strcmp(*argv, "mtu") == 0) { + NEXT_ARG(); + if (mtu != -1) + duparg("mtu", *argv); + if (get_integer(&mtu, *argv, 0)) + invarg("Invalid \"mtu\" value\n", *argv); + } else if (strcmp(*argv, "multicast") == 0) { + NEXT_ARG(); + mask |= IFF_MULTICAST; + if (strcmp(*argv, "on") == 0) { + flags |= IFF_MULTICAST; + } else if (strcmp(*argv, "off") == 0) { + flags &= ~IFF_MULTICAST; + } else + return 
on_off("multicast");
+		} else if (strcmp(*argv, "arp") == 0) {
+			NEXT_ARG();
+			/* "arp on" clears IFF_NOARP, "arp off" sets it. */
+			mask |= IFF_NOARP;
+			if (strcmp(*argv, "on") == 0) {
+				flags &= ~IFF_NOARP;
+			} else if (strcmp(*argv, "off") == 0) {
+				flags |= IFF_NOARP;
+			} else
+				return on_off("arp");	/* name the keyword the user typed */
+#ifdef IFF_DYNAMIC
+		} else if (matches(*argv, "dynamic") == 0) {
+			NEXT_ARG();
+			mask |= IFF_DYNAMIC;
+			if (strcmp(*argv, "on") == 0) {
+				flags |= IFF_DYNAMIC;
+			} else if (strcmp(*argv, "off") == 0) {
+				flags &= ~IFF_DYNAMIC;
+			} else
+				return on_off("dynamic");
+#endif
+		} else {
+			/* Anything else is the device name, optionally introduced by "dev". */
+			if (strcmp(*argv, "dev") == 0) {
+				NEXT_ARG();
+			}
+			if (matches(*argv, "help") == 0)
+				usage();
+			if (dev)
+				duparg2("dev", *argv);
+			dev = *argv;
+		}
+		argc--; argv++;
+	}
+
+	if (!dev) {
+		fprintf(stderr, "Not enough of information: \"dev\" argument is required.\n");
+		exit(-1);
+	}
+
+	/* Validate the new addresses before anything is changed on the device. */
+	if (newaddr || newbrd) {
+		halen = get_address(dev, &htype);
+		if (halen < 0)
+			return -1;
+		if (newaddr) {
+			if (parse_address(dev, htype, halen, newaddr, &ifr0) < 0)
+				return -1;
+		}
+		if (newbrd) {
+			if (parse_address(dev, htype, halen, newbrd, &ifr1) < 0)
+				return -1;
+		}
+	}
+
+	if (newname && strcmp(dev, newname)) {
+		if (do_changename(dev, newname) < 0)
+			return -1;
+		dev = newname;
+		/*
+		 * parse_address() stored the old device name in the requests;
+		 * point them at the new name so set_address() below does not
+		 * address an interface that no longer exists.
+		 */
+		if (newaddr) {
+			memset(ifr0.ifr_name, 0, sizeof(ifr0.ifr_name));
+			strncpy(ifr0.ifr_name, dev, sizeof(ifr0.ifr_name) - 1);
+		}
+		if (newbrd) {
+			memset(ifr1.ifr_name, 0, sizeof(ifr1.ifr_name));
+			strncpy(ifr1.ifr_name, dev, sizeof(ifr1.ifr_name) - 1);
+		}
+	}
+	if (qlen != -1) {
+		if (set_qlen(dev, qlen) < 0)
+			return -1;
+	}
+	if (mtu != -1) {
+		if (set_mtu(dev, mtu) < 0)
+			return -1;
+	}
+	if (newaddr || newbrd) {
+		if (newbrd) {
+			if (set_address(&ifr1, 1) < 0)
+				return -1;
+		}
+		if (newaddr) {
+			if (set_address(&ifr0, 0) < 0)
+				return -1;
+		}
+	}
+	/* Flag changes go last, after name, queue length, MTU and addresses. */
+	if (mask)
+		return do_chflags(dev, flags, mask);
+	return 0;
+}
+
+/*
+ * Entry point for "ip link": dispatches to do_set() or to the link
+ * listing; with no arguments it lists the links.
+ */
+int do_iplink(int argc, char **argv)
+{
+	if (argc > 0) {
+		if (matches(*argv, "set") == 0)
+			return do_set(argc-1, argv+1);
+		if (matches(*argv, "show") == 0 ||
+		    matches(*argv, "lst") == 0 ||
+		    matches(*argv, "list") == 0)
+			return ipaddr_list_link(argc-1, argv+1);
+		if (matches(*argv, "help") == 0)
+			usage();
+	} else
+		return ipaddr_list_link(0, NULL);
+
+	fprintf(stderr, "Command \"%s\" is
unknown, try \"ip link help\".\n", *argv); + exit(-1); +} diff --git a/ip/ipmaddr.c b/ip/ipmaddr.c index e69de29b..b2c4adc0 100644 --- a/ip/ipmaddr.c +++ b/ip/ipmaddr.c @@ -0,0 +1,342 @@ +/* + * ipmaddr.c "ip maddress". + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <linux/netdevice.h> +#include <linux/if.h> +#include <linux/if_arp.h> +#include <linux/sockios.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "rt_names.h" +#include "utils.h" + +static struct { + char *dev; + int family; +} filter; + +static void usage(void) __attribute__((noreturn)); + +static void usage(void) +{ + fprintf(stderr, "Usage: ip maddr [ add | del ] MULTIADDR dev STRING\n"); + fprintf(stderr, " ip maddr show [ dev STRING ]\n"); + exit(-1); +} + +static int parse_hex(char *str, unsigned char *addr) +{ + int len=0; + + while (*str) { + int tmp; + if (str[1] == 0) + return -1; + if (sscanf(str, "%02x", &tmp) != 1) + return -1; + addr[len] = tmp; + len++; + str += 2; + } + return len; +} + +struct ma_info +{ + struct ma_info *next; + int index; + int users; + char *features; + char name[IFNAMSIZ]; + inet_prefix addr; +}; + +void maddr_ins(struct ma_info **lst, struct ma_info *m) +{ + struct ma_info *mp; + + for (; (mp=*lst) != NULL; lst = &mp->next) { + if (mp->index > m->index) + break; + } + m->next = *lst; + *lst = m; +} + +void read_dev_mcast(struct ma_info **result_p) +{ + char buf[256]; + FILE *fp = fopen("/proc/net/dev_mcast", "r"); + + if (!fp) + return; + + while (fgets(buf, sizeof(buf), fp)) { + char hexa[256]; 
+		struct ma_info m;
+		int len;
+		int st;
+
+		memset(&m, 0, sizeof(m));
+		/*
+		 * Skip lines that do not carry all five fields; otherwise
+		 * "hexa" would be parsed uninitialized.  The %15s width keeps
+		 * an over-long token from overrunning m.name (IFNAMSIZ bytes).
+		 */
+		if (sscanf(buf, "%d%15s%d%d%255s", &m.index, m.name, &m.users, &st,
+			   hexa) != 5)
+			continue;
+		if (filter.dev && strcmp(filter.dev, m.name))
+			continue;
+		/* parse_hex() writes strlen/2 bytes and does no bounds check itself. */
+		if (strlen(hexa) > 2 * sizeof(m.addr.data))
+			continue;
+
+		m.addr.family = AF_PACKET;
+
+		len = parse_hex(hexa, (unsigned char*)&m.addr.data);
+		if (len >= 0) {
+			struct ma_info *ma = malloc(sizeof(m));
+
+			if (ma == NULL) {
+				perror("malloc");
+				break;
+			}
+			memcpy(ma, &m, sizeof(m));
+			ma->addr.bytelen = len;
+			ma->addr.bitlen = len<<3;
+			if (st)
+				ma->features = "static";
+			maddr_ins(result_p, ma);
+		}
+	}
+	fclose(fp);
+}
+
+/*
+ * Read IPv4 multicast memberships from /proc/net/igmp and insert one
+ * ma_info per group into *result_p.  Lines that do not start with a tab
+ * name the device; tab-indented lines list the groups joined on the
+ * most recently named device.  Silently does nothing if the file
+ * cannot be opened.
+ */
+void read_igmp(struct ma_info **result_p)
+{
+	struct ma_info m;
+	char buf[256];
+	FILE *fp = fopen("/proc/net/igmp", "r");
+
+	if (!fp)
+		return;
+	memset(&m, 0, sizeof(m));
+	fgets(buf, sizeof(buf), fp);	/* discard the first line */
+
+	m.addr.family = AF_INET;
+	m.addr.bitlen = 32;
+	m.addr.bytelen = 4;
+
+	while (fgets(buf, sizeof(buf), fp)) {
+		struct ma_info *ma;
+
+		if (buf[0] != '\t') {
+			/* Device line: remember index and name for the group lines below. */
+			sscanf(buf, "%d%15s", &m.index, m.name);
+			continue;
+		}
+
+		if (filter.dev && strcmp(filter.dev, m.name))
+			continue;
+
+		/* Do not insert a stale copy of the previous group on a parse failure. */
+		if (sscanf(buf, "%08x%d", (__u32*)&m.addr.data, &m.users) != 2)
+			continue;
+
+		/* Allocate only once a record is actually kept (no per-line leak). */
+		ma = malloc(sizeof(m));
+		if (ma == NULL) {
+			perror("malloc");
+			break;
+		}
+		memcpy(ma, &m, sizeof(m));
+		maddr_ins(result_p, ma);
+	}
+	fclose(fp);
+}
+
+
+/*
+ * Read IPv6 multicast memberships from /proc/net/igmp6 and insert one
+ * ma_info per line into *result_p.  Silently does nothing if the file
+ * cannot be opened.
+ */
+void read_igmp6(struct ma_info **result_p)
+{
+	char buf[256];
+	FILE *fp = fopen("/proc/net/igmp6", "r");
+
+	if (!fp)
+		return;
+
+	while (fgets(buf, sizeof(buf), fp)) {
+		char hexa[256];
+		struct ma_info m;
+		int len;
+
+		memset(&m, 0, sizeof(m));
+		/* Same guards as for dev_mcast: complete line, bounded name. */
+		if (sscanf(buf, "%d%15s%255s%d", &m.index, m.name, hexa, &m.users) != 4)
+			continue;
+
+		if (filter.dev && strcmp(filter.dev, m.name))
+			continue;
+		if (strlen(hexa) > 2 * sizeof(m.addr.data))
+			continue;
+
+		m.addr.family = AF_INET6;
+
+		len = parse_hex(hexa, (unsigned char*)&m.addr.data);
+		if (len >= 0) {
+			struct ma_info *ma = malloc(sizeof(m));
+
+			if (ma == NULL) {
+				perror("malloc");
+				break;
+			}
+			memcpy(ma, &m, sizeof(m));
+
+			ma->addr.bytelen = len;
+			ma->addr.bitlen = len<<3;
+			maddr_ins(result_p, ma);
+		}
+	}
+	fclose(fp);
+}
+
+static void print_maddr(FILE *fp, struct ma_info *list)
+{
+	fprintf(fp, "\t");
+
+	if (list->addr.family == AF_PACKET) {
+
SPRINT_BUF(b1); + fprintf(fp, "link %s", ll_addr_n2a((unsigned char*)list->addr.data, + list->addr.bytelen, 0, + b1, sizeof(b1))); + } else { + char abuf[256]; + switch(list->addr.family) { + case AF_INET: + fprintf(fp, "inet "); + break; + case AF_INET6: + fprintf(fp, "inet6 "); + break; + default: + fprintf(fp, "family %d ", list->addr.family); + break; + } + fprintf(fp, "%s", + format_host(list->addr.family, + -1, + list->addr.data, + abuf, sizeof(abuf))); + } + if (list->users != 1) + fprintf(fp, " users %d", list->users); + if (list->features) + fprintf(fp, " %s", list->features); + fprintf(fp, "\n"); +} + +static void print_mlist(FILE *fp, struct ma_info *list) +{ + int cur_index = 0; + + for (; list; list = list->next) { + if (oneline) { + cur_index = list->index; + fprintf(fp, "%d:\t%s%s", cur_index, list->name, _SL_); + } else if (cur_index != list->index) { + cur_index = list->index; + fprintf(fp, "%d:\t%s\n", cur_index, list->name); + } + print_maddr(fp, list); + } +} + +static int multiaddr_list(int argc, char **argv) +{ + struct ma_info *list = NULL; + + if (!filter.family) + filter.family = preferred_family; + + while (argc > 0) { + if (1) { + if (strcmp(*argv, "dev") == 0) { + NEXT_ARG(); + } + if (matches(*argv, "help") == 0) + usage(); + if (filter.dev) + duparg2("dev", *argv); + filter.dev = *argv; + } + argv++; argc--; + } + + if (!filter.family || filter.family == AF_PACKET) + read_dev_mcast(&list); + if (!filter.family || filter.family == AF_INET) + read_igmp(&list); + if (!filter.family || filter.family == AF_INET6) + read_igmp6(&list); + print_mlist(stdout, list); + return 0; +} + +int multiaddr_modify(int cmd, int argc, char **argv) +{ + struct ifreq ifr; + int fd; + + memset(&ifr, 0, sizeof(ifr)); + + if (cmd == RTM_NEWADDR) + cmd = SIOCADDMULTI; + else + cmd = SIOCDELMULTI; + + while (argc > 0) { + if (strcmp(*argv, "dev") == 0) { + NEXT_ARG(); + if (ifr.ifr_name[0]) + duparg("dev", *argv); + strncpy(ifr.ifr_name, *argv, IFNAMSIZ); + } 
else { + if (matches(*argv, "address") == 0) { + NEXT_ARG(); + } + if (matches(*argv, "help") == 0) + usage(); + if (ifr.ifr_hwaddr.sa_data[0]) + duparg("address", *argv); + if (ll_addr_a2n(ifr.ifr_hwaddr.sa_data, 14, *argv) < 0) { + fprintf(stderr, "Error: \"%s\" is not a legal ll address.\n", *argv); + exit(1); + } + } + argc--; argv++; + } + if (ifr.ifr_name[0] == 0) { + fprintf(stderr, "Not enough information: \"dev\" is required.\n"); + exit(-1); + } + + fd = socket(AF_INET, SOCK_DGRAM, 0); + if (fd < 0) { + perror("Cannot create socket"); + exit(1); + } + if (ioctl(fd, cmd, (char*)&ifr) != 0) { + perror("ioctl"); + exit(1); + } + close(fd); + + exit(0); +} + + +int do_multiaddr(int argc, char **argv) +{ + if (argc < 1) + return multiaddr_list(0, NULL); + if (matches(*argv, "add") == 0) + return multiaddr_modify(RTM_NEWADDR, argc-1, argv+1); + if (matches(*argv, "delete") == 0) + return multiaddr_modify(RTM_DELADDR, argc-1, argv+1); + if (matches(*argv, "list") == 0 || matches(*argv, "show") == 0 + || matches(*argv, "lst") == 0) + return multiaddr_list(argc-1, argv+1); + if (matches(*argv, "help") == 0) + usage(); + fprintf(stderr, "Command \"%s\" is unknown, try \"ip maddr help\".\n", *argv); + exit(-1); +} diff --git a/ip/ipmonitor.c b/ip/ipmonitor.c index e69de29b..9ed6bbaf 100644 --- a/ip/ipmonitor.c +++ b/ip/ipmonitor.c @@ -0,0 +1,152 @@ +/* + * ipmonitor.c "ip monitor". + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> +#include <time.h> + +#include "utils.h" +#include "ip_common.h" + +static void usage(void) __attribute__((noreturn)); + +static void usage(void) +{ + fprintf(stderr, "Usage: ip monitor [ all | LISTofOBJECTS ]\n"); + exit(-1); +} + + +int accept_msg(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) +{ + FILE *fp = (FILE*)arg; + + if (n->nlmsg_type == RTM_NEWROUTE || n->nlmsg_type == RTM_DELROUTE) { + print_route(who, n, arg); + return 0; + } + if (n->nlmsg_type == RTM_NEWLINK || n->nlmsg_type == RTM_DELLINK) { + ll_remember_index(who, n, NULL); + print_linkinfo(who, n, arg); + return 0; + } + if (n->nlmsg_type == RTM_NEWADDR || n->nlmsg_type == RTM_DELADDR) { + print_addrinfo(who, n, arg); + return 0; + } + if (n->nlmsg_type == RTM_NEWNEIGH || n->nlmsg_type == RTM_DELNEIGH) { + print_neigh(who, n, arg); + return 0; + } + if (n->nlmsg_type == 15) { + char *tstr; + time_t secs = ((__u32*)NLMSG_DATA(n))[0]; + long usecs = ((__u32*)NLMSG_DATA(n))[1]; + tstr = asctime(localtime(&secs)); + tstr[strlen(tstr)-1] = 0; + fprintf(fp, "Timestamp: %s %lu us\n", tstr, usecs); + return 0; + } + if (n->nlmsg_type == RTM_NEWQDISC || + n->nlmsg_type == RTM_DELQDISC || + n->nlmsg_type == RTM_NEWTCLASS || + n->nlmsg_type == RTM_DELTCLASS || + n->nlmsg_type == RTM_NEWTFILTER || + n->nlmsg_type == RTM_DELTFILTER) + return 0; + if (n->nlmsg_type != NLMSG_ERROR && n->nlmsg_type != NLMSG_NOOP && + n->nlmsg_type != NLMSG_DONE) { + fprintf(fp, "Unknown message: %08x %08x %08x\n", + n->nlmsg_len, n->nlmsg_type, n->nlmsg_flags); + } + return 0; +} + +int do_ipmonitor(int argc, char **argv) +{ + struct rtnl_handle rth; + char *file = NULL; + unsigned groups = ~RTMGRP_TC; + int llink=0; + int laddr=0; + int lroute=0; + + 
ipaddr_reset_filter(1); + iproute_reset_filter(); + ipneigh_reset_filter(); + + while (argc > 0) { + if (matches(*argv, "file") == 0) { + NEXT_ARG(); + file = *argv; + } else if (matches(*argv, "link") == 0) { + llink=1; + groups = 0; + } else if (matches(*argv, "address") == 0) { + laddr=1; + groups = 0; + } else if (matches(*argv, "route") == 0) { + lroute=1; + groups = 0; + } else if (strcmp(*argv, "all") == 0) { + groups = ~RTMGRP_TC; + } else if (matches(*argv, "help") == 0) { + usage(); + } else { + fprintf(stderr, "Argument \"%s\" is unknown, try \"ip monitor help\".\n", *argv); + exit(-1); + } + argc--; argv++; + } + + if (llink) + groups |= RTMGRP_LINK; + if (laddr) { + if (!preferred_family || preferred_family == AF_INET) + groups |= RTMGRP_IPV4_IFADDR; + if (!preferred_family || preferred_family == AF_INET6) + groups |= RTMGRP_IPV6_IFADDR; + } + if (lroute) { + if (!preferred_family || preferred_family == AF_INET) + groups |= RTMGRP_IPV4_ROUTE; + if (!preferred_family || preferred_family == AF_INET6) + groups |= RTMGRP_IPV6_ROUTE; + } + + if (file) { + FILE *fp; + fp = fopen(file, "r"); + if (fp == NULL) { + perror("Cannot fopen"); + exit(-1); + } + return rtnl_from_file(fp, accept_msg, (void*)stdout); + } + + if (rtnl_open(&rth, groups) < 0) + exit(1); + + ll_init_map(&rth); + + if (rtnl_listen(&rth, accept_msg, (void*)stdout) < 0) + exit(2); + + exit(0); +} diff --git a/ip/ipmroute.c b/ip/ipmroute.c index e69de29b..01e876bc 100644 --- a/ip/ipmroute.c +++ b/ip/ipmroute.c @@ -0,0 +1,204 @@ +/* + * ipmroute.c "ip mroute". + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <linux/netdevice.h> +#include <linux/if.h> +#include <linux/if_arp.h> +#include <linux/sockios.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "utils.h" + +char filter_dev[16]; +int filter_family; + +static void usage(void) __attribute__((noreturn)); + +static void usage(void) +{ + fprintf(stderr, "Usage: ip mroute show [ PREFIX ] [ from PREFIX ] [ iif DEVICE ]\n"); +#if 0 + fprintf(stderr, "Usage: ip mroute [ add | del ] DESTINATION from SOURCE [ iif DEVICE ] [ oif DEVICE ]\n"); +#endif + exit(-1); +} + +char *viftable[32]; + +struct rtfilter +{ + inet_prefix mdst; + inet_prefix msrc; +} filter; + +void read_viftable(void) +{ + char buf[256]; + FILE *fp = fopen("/proc/net/ip_mr_vif", "r"); + + if (!fp) + return; + + fgets(buf, sizeof(buf), fp); + + while (fgets(buf, sizeof(buf), fp)) { + int vifi; + char dev[256]; + + if (sscanf(buf, "%d%s", &vifi, dev) < 2) + continue; + + if (vifi<0 || vifi>31) + continue; + + viftable[vifi] = strdup(dev); + } + fclose(fp); +} + +void read_mroute_list(FILE *ofp) +{ + char buf[256]; + FILE *fp = fopen("/proc/net/ip_mr_cache", "r"); + + if (!fp) + return; + + fgets(buf, sizeof(buf), fp); + + while (fgets(buf, sizeof(buf), fp)) { + inet_prefix maddr, msrc; + unsigned pkts, b, w; + int vifi; + char oiflist[256]; + char sbuf[256]; + char mbuf[256]; + char obuf[256]; + + oiflist[0] = 0; + if (sscanf(buf, "%x%x%d%u%u%u%s", maddr.data, msrc.data, &vifi, + &pkts, &b, &w, oiflist) < 6) + continue; + + if (vifi!=-1 && (vifi < 0 || vifi>31)) + continue; + + if (filter_dev[0] && (vifi<0 || strcmp(filter_dev, viftable[vifi]))) + continue; + if (filter.mdst.family && inet_addr_match(&maddr, &filter.mdst, filter.mdst.bitlen)) + continue; + if (filter.msrc.family && 
inet_addr_match(&msrc, &filter.msrc, filter.msrc.bitlen))
+			continue;
+
+		snprintf(obuf, sizeof(obuf), "(%s, %s)",
+			 format_host(AF_INET, 4, &msrc.data[0], sbuf, sizeof(sbuf)),
+			 format_host(AF_INET, 4, &maddr.data[0], mbuf, sizeof(mbuf)));
+
+		fprintf(ofp, "%-32s Iif: ", obuf);
+
+		if (vifi == -1)
+			fprintf(ofp, "unresolved ");
+		else
+			fprintf(ofp, "%-10s ", viftable[vifi]);
+
+		if (oiflist[0]) {
+			char *next = NULL;
+			char *p = oiflist;
+			int ovifi, ottl;
+
+			fprintf(ofp, "Oifs: ");
+
+			/* oiflist is split on spaces into "vif:ttl" tokens. */
+			while (p) {
+				next = strchr(p, ' ');
+				if (next) {
+					*next = 0;
+					next++;
+				}
+				if (sscanf(p, "%d:%d", &ovifi, &ottl)<2) {
+					p = next;
+					continue;
+				}
+				p = next;
+
+				/* The index comes from /proc text; never index past viftable[]. */
+				if (ovifi < 0 ||
+				    ovifi >= (int)(sizeof(viftable)/sizeof(viftable[0])))
+					continue;
+
+				fprintf(ofp, "%s", viftable[ovifi]);
+				if (ottl>1)
+					fprintf(ofp, "(ttl %d) ", ottl);	/* the ttl, not the vif index */
+				else
+					fprintf(ofp, " ");
+			}
+		}
+
+		if (show_stats && b) {
+			fprintf(ofp, "%s %u packets, %u bytes", _SL_, pkts, b);
+			if (w)
+				fprintf(ofp, ", %u arrived on wrong iif.", w);
+		}
+		fprintf(ofp, "\n");
+	}
+	fclose(fp);
+}
+
+
+/*
+ * "ip mroute show": parse the optional "iif DEVICE", "from PREFIX" and
+ * "[to] PREFIX" selectors into the file-level filters, then read the vif
+ * table and print the matching multicast routes to stdout.
+ */
+static int mroute_list(int argc, char **argv)
+{
+	while (argc > 0) {
+		if (strcmp(*argv, "iif") == 0) {
+			NEXT_ARG();
+			strncpy(filter_dev, *argv, sizeof(filter_dev)-1);
+		} else if (matches(*argv, "from") == 0) {
+			NEXT_ARG();
+			get_prefix(&filter.msrc, *argv, AF_INET);
+		} else {
+			if (strcmp(*argv, "to") == 0) {
+				NEXT_ARG();
+			}
+			if (matches(*argv, "help") == 0)
+				usage();
+			get_prefix(&filter.mdst, *argv, AF_INET);
+		}
+		argv++; argc--;
+	}
+
+	read_viftable();
+	read_mroute_list(stdout);
+	return 0;
+}
+
+/*
+ * Entry point for "ip mroute": only listing is implemented; the modify
+ * and get commands are compiled out.
+ */
+int do_multiroute(int argc, char **argv)
+{
+	if (argc < 1)
+		return mroute_list(0, NULL);
+#if 0
+	if (matches(*argv, "add") == 0)
+		return mroute_modify(RTM_NEWADDR, argc-1, argv+1);
+	if (matches(*argv, "delete") == 0)
+		return mroute_modify(RTM_DELADDR, argc-1, argv+1);
+	if (matches(*argv, "get") == 0)
+		return mroute_get(argc-1, argv+1);
+#endif
+	if (matches(*argv, "list") == 0 || matches(*argv, "show") == 0
+	    || matches(*argv, "lst") == 0)
+		return mroute_list(argc-1, argv+1);
+	if
(matches(*argv, "help") == 0) + usage(); + fprintf(stderr, "Command \"%s\" is unknown, try \"ip mroute help\".\n", *argv); + exit(-1); +} diff --git a/ip/ipneigh.c b/ip/ipneigh.c index e69de29b..f8c27900 100644 --- a/ip/ipneigh.c +++ b/ip/ipneigh.c @@ -0,0 +1,484 @@ +/* + * ipneigh.c "ip neigh". + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * + * Changes: + * + * Rani Assaf <rani@magic.metawire.com> 980929: resolve addresses + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <string.h> +#include <sys/time.h> +#include <net/if.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <netinet/ip.h> + +#include "rt_names.h" +#include "utils.h" +#include "ip_common.h" + +#define NUD_VALID (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE|NUD_PROBE|NUD_STALE|NUD_DELAY) + +static struct +{ + int family; + int index; + int state; + int unused_only; + inet_prefix pfx; + int flushed; + char *flushb; + int flushp; + int flushe; + struct rtnl_handle *rth; +} filter; + +static void usage(void) __attribute__((noreturn)); + +static void usage(void) +{ + fprintf(stderr, "Usage: ip neigh { add | del | change | replace } { ADDR [ lladdr LLADDR ]\n" + " [ nud { permanent | noarp | stale | reachable } ]\n" + " | proxy ADDR } [ dev DEV ]\n"); + fprintf(stderr, " ip neigh {show|flush} [ to PREFIX ] [ dev DEV ] [ nud STATE ]\n"); + exit(-1); +} + +int nud_state_a2n(unsigned *state, char *arg) +{ + if (matches(arg, "permanent") == 0) + *state = NUD_PERMANENT; + else if (matches(arg, "reachable") == 0) + *state = NUD_REACHABLE; + else if (strcmp(arg, "noarp") == 0) + *state = NUD_NOARP; + else if (strcmp(arg, "none") == 0) + *state = NUD_NONE; + else if 
(strcmp(arg, "stale") == 0)
+		*state = NUD_STALE;
+	else if (strcmp(arg, "incomplete") == 0)
+		*state = NUD_INCOMPLETE;
+	else if (strcmp(arg, "delay") == 0)
+		*state = NUD_DELAY;
+	else if (strcmp(arg, "probe") == 0)
+		*state = NUD_PROBE;
+	else if (matches(arg, "failed") == 0)
+		*state = NUD_FAILED;
+	else {
+		/* Numeric form: accept only zero or a single bit below 0x100. */
+		if (get_unsigned(state, arg, 0))
+			return -1;
+		if (*state>=0x100 || (*state&((*state)-1)))
+			return -1;
+	}
+	return 0;
+}
+
+/*
+ * Translate a neighbour state value to its name.  Known single states
+ * return a string constant; any other value is formatted as hex into
+ * "buf" (at most "len" bytes) and "buf" is returned.
+ */
+char * nud_state_n2a(__u8 state, char *buf, int len)
+{
+	switch (state) {
+	case NUD_NONE:
+		return "none";
+	case NUD_INCOMPLETE:
+		return "incomplete";
+	case NUD_REACHABLE:
+		return "reachable";
+	case NUD_STALE:
+		return "stale";
+	case NUD_DELAY:
+		return "delay";
+	case NUD_PROBE:
+		return "probe";
+	case NUD_FAILED:
+		return "failed";
+	case NUD_NOARP:
+		return "noarp";
+	case NUD_PERMANENT:
+		return "permanent";
+	default:
+		snprintf(buf, len, "%x", state);
+		return buf;
+	}
+}
+
+/*
+ * Send the filter.flushp bytes of delete requests queued in
+ * filter.flushb and mark the buffer empty.
+ * Returns 0 on success, -1 if the send failed.
+ */
+static int flush_update(void)
+{
+	if (rtnl_send(filter.rth, filter.flushb, filter.flushp) < 0) {
+		/* No trailing newline: perror() appends ": <reason>\n" itself. */
+		perror("Failed to send flush request");
+		return -1;
+	}
+	filter.flushp = 0;
+	return 0;
+}
+
+
+/*
+ * Build a neighbour request of netlink type "cmd" (with the extra
+ * nlmsg flags in "flags") from the command-line arguments and send it.
+ */
+static int ipneigh_modify(int cmd, int flags, int argc, char **argv)
+{
+	struct rtnl_handle rth;
+	struct {
+		struct nlmsghdr 	n;
+		struct ndmsg 		ndm;
+		char   			buf[256];
+	} req;
+	char  *d = NULL;
+	int dst_ok = 0;
+	int lladdr_ok = 0;
+	char * lla = NULL;
+	inet_prefix dst;
+
+	memset(&req, 0, sizeof(req));
+
+	req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg));
+	req.n.nlmsg_flags = NLM_F_REQUEST|flags;
+	req.n.nlmsg_type = cmd;
+	req.ndm.ndm_family = preferred_family;
+	req.ndm.ndm_state = NUD_PERMANENT;	/* default unless "nud" overrides it */
+
+	while (argc > 0) {
+		if (matches(*argv, "lladdr") == 0) {
+			NEXT_ARG();
+			if (lladdr_ok)
+				duparg("lladdr", *argv);
+			lla = *argv;
+			lladdr_ok = 1;
+		} else if (strcmp(*argv, "nud") == 0) {
+			unsigned state;
+			NEXT_ARG();
+			if (nud_state_a2n(&state, *argv))
+				invarg("nud state is bad", *argv);
+			req.ndm.ndm_state = state;
+		} else if
(matches(*argv, "proxy") == 0) {
+			NEXT_ARG();
+			if (matches(*argv, "help") == 0)
+				usage();
+			if (dst_ok)
+				duparg("address", *argv);
+			get_addr(&dst, *argv, preferred_family);
+			dst_ok = 1;
+			req.ndm.ndm_flags |= NTF_PROXY;
+		} else if (strcmp(*argv, "dev") == 0) {
+			NEXT_ARG();
+			d = *argv;
+		} else {
+			/* Anything else is the destination, optionally introduced by "to". */
+			if (strcmp(*argv, "to") == 0) {
+				NEXT_ARG();
+			}
+			/*
+			 * "help" prints the usage text, as in the "proxy" branch
+			 * above, instead of consuming the following word as the
+			 * address.
+			 */
+			if (matches(*argv, "help") == 0)
+				usage();
+			if (dst_ok)
+				duparg2("to", *argv);
+			get_addr(&dst, *argv, preferred_family);
+			dst_ok = 1;
+		}
+		argc--; argv++;
+	}
+	if (d == NULL || !dst_ok || dst.family == AF_UNSPEC) {
+		fprintf(stderr, "Device and destination are required arguments.\n");
+		exit(-1);
+	}
+	req.ndm.ndm_family = dst.family;
+	addattr_l(&req.n, sizeof(req), NDA_DST, &dst.data, dst.bytelen);
+
+	/* The literal "null" means: send no link-layer address attribute. */
+	if (lla && strcmp(lla, "null")) {
+		__u8 llabuf[16];
+		int l;
+
+		l = ll_addr_a2n(llabuf, sizeof(llabuf), lla);
+		/* A negative result is a parse failure, not an attribute length. */
+		if (l < 0)
+			return -1;
+		addattr_l(&req.n, sizeof(req), NDA_LLADDR, llabuf, l);
+	}
+
+	if (rtnl_open(&rth, 0) < 0)
+		exit(1);
+
+	ll_init_map(&rth);
+
+	if ((req.ndm.ndm_ifindex = ll_name_to_index(d)) == 0) {
+		fprintf(stderr, "Cannot find device \"%s\"\n", d);
+		return -1;
+	}
+
+	if (rtnl_talk(&rth, &req.n, 0, 0, NULL, NULL, NULL) < 0)
+		exit(2);
+
+	exit(0);
+}
+
+
+/*
+ * Netlink message callback: apply the file-level filter to one
+ * neighbour message and print it to the FILE * passed as "arg".  While
+ * a flush is in progress, matching entries are queued for deletion
+ * instead.  Returns 0 to continue, -1 on a malformed message or a
+ * failed flush.
+ */
+int print_neigh(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
+{
+	FILE *fp = (FILE*)arg;
+	struct ndmsg *r = NLMSG_DATA(n);
+	int len = n->nlmsg_len;
+	struct rtattr * tb[NDA_MAX+1];
+	char abuf[256];
+
+	if (n->nlmsg_type != RTM_NEWNEIGH && n->nlmsg_type != RTM_DELNEIGH) {
+		fprintf(stderr, "Not RTM_NEWNEIGH: %08x %08x %08x\n",
+			n->nlmsg_len, n->nlmsg_type, n->nlmsg_flags);
+
+		return 0;
+	}
+	len -= NLMSG_LENGTH(sizeof(*r));
+	if (len < 0) {
+		fprintf(stderr, "BUG: wrong nlmsg len %d\n", len);
+		return -1;
+	}
+
+	/* While flushing, only existing entries (RTM_NEWNEIGH) are of interest. */
+	if (filter.flushb && n->nlmsg_type != RTM_NEWNEIGH)
+		return 0;
+
+	if (filter.family && filter.family != r->ndm_family)
+		return 0;
+	if (filter.index && filter.index != r->ndm_ifindex)
+		return 0;
+	if
(!(filter.state&r->ndm_state) && + (r->ndm_state || !(filter.state&0x100)) && + (r->ndm_family != AF_DECnet)) + return 0; + + memset(tb, 0, sizeof(tb)); + parse_rtattr(tb, NDA_MAX, NDA_RTA(r), n->nlmsg_len - NLMSG_LENGTH(sizeof(*r))); + + if (tb[NDA_DST]) { + if (filter.pfx.family) { + inet_prefix dst; + memset(&dst, 0, sizeof(dst)); + dst.family = r->ndm_family; + memcpy(&dst.data, RTA_DATA(tb[NDA_DST]), RTA_PAYLOAD(tb[NDA_DST])); + if (inet_addr_match(&dst, &filter.pfx, filter.pfx.bitlen)) + return 0; + } + } + if (filter.unused_only && tb[NDA_CACHEINFO]) { + struct nda_cacheinfo *ci = RTA_DATA(tb[NDA_CACHEINFO]); + if (ci->ndm_refcnt) + return 0; + } + + if (filter.flushb) { + struct nlmsghdr *fn; + if (NLMSG_ALIGN(filter.flushp) + n->nlmsg_len > filter.flushe) { + if (flush_update()) + return -1; + } + fn = (struct nlmsghdr*)(filter.flushb + NLMSG_ALIGN(filter.flushp)); + memcpy(fn, n, n->nlmsg_len); + fn->nlmsg_type = RTM_DELNEIGH; + fn->nlmsg_flags = NLM_F_REQUEST; + fn->nlmsg_seq = ++filter.rth->seq; + filter.flushp = (((char*)fn) + n->nlmsg_len) - filter.flushb; + filter.flushed++; + if (show_stats < 2) + return 0; + } + + if (tb[NDA_DST]) { + fprintf(fp, "%s ", + format_host(r->ndm_family, + RTA_PAYLOAD(tb[NDA_DST]), + RTA_DATA(tb[NDA_DST]), + abuf, sizeof(abuf))); + } + if (!filter.index && r->ndm_ifindex) + fprintf(fp, "dev %s ", ll_index_to_name(r->ndm_ifindex)); + if (tb[NDA_LLADDR]) { + SPRINT_BUF(b1); + fprintf(fp, "lladdr %s", ll_addr_n2a(RTA_DATA(tb[NDA_LLADDR]), + RTA_PAYLOAD(tb[NDA_LLADDR]), + ll_index_to_type(r->ndm_ifindex), + b1, sizeof(b1))); + } + if (r->ndm_flags & NTF_ROUTER) { + fprintf(fp, " router"); + } + if (tb[NDA_CACHEINFO] && show_stats) { + static int hz; + struct nda_cacheinfo *ci = RTA_DATA(tb[NDA_CACHEINFO]); + if (!hz) + hz = get_hz(); + if (ci->ndm_refcnt) + printf(" ref %d", ci->ndm_refcnt); + fprintf(fp, " used %d/%d/%d", ci->ndm_used/hz, + ci->ndm_confirmed/hz, ci->ndm_updated/hz); + } + + if (r->ndm_state) { + 
SPRINT_BUF(b1); + fprintf(fp, " nud %s", nud_state_n2a(r->ndm_state, b1, sizeof(b1))); + } + fprintf(fp, "\n"); + + fflush(fp); + return 0; +} + +void ipneigh_reset_filter() +{ + memset(&filter, 0, sizeof(filter)); + filter.state = ~0; +} + +int do_show_or_flush(int argc, char **argv, int flush) +{ + char *filter_dev = NULL; + struct rtnl_handle rth; + int state_given = 0; + + ipneigh_reset_filter(); + + if (!filter.family) + filter.family = preferred_family; + + if (flush) { + if (argc <= 0) { + fprintf(stderr, "Flush requires arguments.\n"); + return -1; + } + filter.state = ~(NUD_PERMANENT|NUD_NOARP); + } else + filter.state = 0xFF & ~NUD_NOARP; + + while (argc > 0) { + if (strcmp(*argv, "dev") == 0) { + NEXT_ARG(); + if (filter_dev) + duparg("dev", *argv); + filter_dev = *argv; + } else if (strcmp(*argv, "unused") == 0) { + filter.unused_only = 1; + } else if (strcmp(*argv, "nud") == 0) { + unsigned state; + NEXT_ARG(); + if (!state_given) { + state_given = 1; + filter.state = 0; + } + if (nud_state_a2n(&state, *argv)) { + if (strcmp(*argv, "all") != 0) + invarg("nud state is bad", *argv); + state = ~0; + if (flush) + state &= ~NUD_NOARP; + } + if (state == 0) + state = 0x100; + filter.state |= state; + } else { + if (strcmp(*argv, "to") == 0) { + NEXT_ARG(); + } + if (matches(*argv, "help") == 0) + usage(); + get_prefix(&filter.pfx, *argv, filter.family); + if (filter.family == AF_UNSPEC) + filter.family = filter.pfx.family; + } + argc--; argv++; + } + + if (rtnl_open(&rth, 0) < 0) + exit(1); + + ll_init_map(&rth); + + if (filter_dev) { + if ((filter.index = ll_name_to_index(filter_dev)) == 0) { + fprintf(stderr, "Cannot find device \"%s\"\n", filter_dev); + return -1; + } + } + + if (flush) { + int round = 0; + char flushb[4096-512]; + + filter.flushb = flushb; + filter.flushp = 0; + filter.flushe = sizeof(flushb); + filter.rth = &rth; + filter.state &= ~NUD_FAILED; + + for (;;) { + if (rtnl_wilddump_request(&rth, filter.family, RTM_GETNEIGH) < 0) { + 
perror("Cannot send dump request"); + exit(1); + } + filter.flushed = 0; + if (rtnl_dump_filter(&rth, print_neigh, stdout, NULL, NULL) < 0) { + fprintf(stderr, "Flush terminated\n"); + exit(1); + } + if (filter.flushed == 0) { + if (round == 0) { + fprintf(stderr, "Nothing to flush.\n"); + } else if (show_stats) + printf("*** Flush is complete after %d round%s ***\n", round, round>1?"s":""); + fflush(stdout); + return 0; + } + round++; + if (flush_update() < 0) + exit(1); + if (show_stats) { + printf("\n*** Round %d, deleting %d entries ***\n", round, filter.flushed); + fflush(stdout); + } + } + } + + if (rtnl_wilddump_request(&rth, filter.family, RTM_GETNEIGH) < 0) { + perror("Cannot send dump request"); + exit(1); + } + + if (rtnl_dump_filter(&rth, print_neigh, stdout, NULL, NULL) < 0) { + fprintf(stderr, "Dump terminated\n"); + exit(1); + } + + return 0; +} + +int do_ipneigh(int argc, char **argv) +{ + if (argc > 0) { + if (matches(*argv, "add") == 0) + return ipneigh_modify(RTM_NEWNEIGH, NLM_F_CREATE|NLM_F_EXCL, argc-1, argv+1); + if (matches(*argv, "change") == 0 || + strcmp(*argv, "chg") == 0) + return ipneigh_modify(RTM_NEWNEIGH, NLM_F_REPLACE, argc-1, argv+1); + if (matches(*argv, "replace") == 0) + return ipneigh_modify(RTM_NEWNEIGH, NLM_F_CREATE|NLM_F_REPLACE, argc-1, argv+1); + if (matches(*argv, "delete") == 0) + return ipneigh_modify(RTM_DELNEIGH, 0, argc-1, argv+1); + if (matches(*argv, "get") == 0) { + fprintf(stderr, "Sorry, \"neigh get\" is not implemented :-(\n"); + return -1; + } + if (matches(*argv, "show") == 0 || + matches(*argv, "lst") == 0 || + matches(*argv, "list") == 0) + return do_show_or_flush(argc-1, argv+1, 0); + if (matches(*argv, "flush") == 0) + return do_show_or_flush(argc-1, argv+1, 1); + if (matches(*argv, "help") == 0) + usage(); + } else + return do_show_or_flush(0, NULL, 0); + + fprintf(stderr, "Command \"%s\" is unknown, try \"ip neigh help\".\n", *argv); + exit(-1); +} diff --git a/ip/iproute.c b/ip/iproute.c index 
e69de29b..404f8e0e 100644 --- a/ip/iproute.c +++ b/ip/iproute.c @@ -0,0 +1,1410 @@ +/* + * iproute.c "ip route". + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * + * Changes: + * + * Rani Assaf <rani@magic.metawire.com> 980929: resolve addresses + * Kunihiro Ishiguro <kunihiro@zebra.org> 001102: rtnh_ifindex was not initialized + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <string.h> +#include <sys/time.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <arpa/inet.h> +#include <linux/in_route.h> + +#include "rt_names.h" +#include "utils.h" +#include "ip_common.h" + +#ifndef RTAX_RTTVAR +#define RTAX_RTTVAR RTAX_HOPS +#endif + + +static void usage(void) __attribute__((noreturn)); + +static void usage(void) +{ + fprintf(stderr, "Usage: ip route { list | flush } SELECTOR\n"); + fprintf(stderr, " ip route get ADDRESS [ from ADDRESS iif STRING ]\n"); + fprintf(stderr, " [ oif STRING ] [ tos TOS ]\n"); + fprintf(stderr, " ip route { add | del | change | append | replace | monitor } ROUTE\n"); + fprintf(stderr, "SELECTOR := [ root PREFIX ] [ match PREFIX ] [ exact PREFIX ]\n"); + fprintf(stderr, " [ table TABLE_ID ] [ proto RTPROTO ]\n"); + fprintf(stderr, " [ type TYPE ] [ scope SCOPE ]\n"); + fprintf(stderr, "ROUTE := NODE_SPEC [ INFO_SPEC ]\n"); + fprintf(stderr, "NODE_SPEC := [ TYPE ] PREFIX [ tos TOS ]\n"); + fprintf(stderr, " [ table TABLE_ID ] [ proto RTPROTO ]\n"); + fprintf(stderr, " [ scope SCOPE ] [ metric METRIC ]\n"); + fprintf(stderr, "INFO_SPEC := NH OPTIONS FLAGS [ nexthop NH ]...\n"); + fprintf(stderr, "NH := [ via ADDRESS ] [ dev STRING ] [ weight NUMBER ] NHFLAGS\n"); + 
fprintf(stderr, "OPTIONS := FLAGS [ mtu NUMBER ] [ advmss NUMBER ]\n"); + fprintf(stderr, " [ rtt NUMBER ] [ rttvar NUMBER ]\n"); + fprintf(stderr, " [ window NUMBER] [ cwnd NUMBER ] [ ssthresh REALM ]\n"); + fprintf(stderr, " [ realms REALM ]\n"); + fprintf(stderr, "TYPE := [ unicast | local | broadcast | multicast | throw |\n"); + fprintf(stderr, " unreachable | prohibit | blackhole | nat ]\n"); + fprintf(stderr, "TABLE_ID := [ local | main | default | all | NUMBER ]\n"); + fprintf(stderr, "SCOPE := [ host | link | global | NUMBER ]\n"); + fprintf(stderr, "FLAGS := [ equalize ]\n"); + fprintf(stderr, "NHFLAGS := [ onlink | pervasive ]\n"); + fprintf(stderr, "RTPROTO := [ kernel | boot | static | NUMBER ]\n"); + exit(-1); +} + + +static struct +{ + int tb; + int flushed; + char *flushb; + int flushp; + int flushe; + struct rtnl_handle *rth; + int protocol, protocolmask; + int scope, scopemask; + int type, typemask; + int tos, tosmask; + int iif, iifmask; + int oif, oifmask; + int realm, realmmask; + inet_prefix rprefsrc; + inet_prefix rvia; + inet_prefix rdst; + inet_prefix mdst; + inet_prefix rsrc; + inet_prefix msrc; +} filter; + +static int flush_update(void) +{ + if (rtnl_send(filter.rth, filter.flushb, filter.flushp) < 0) { + perror("Failed to send flush request\n"); + return -1; + } + filter.flushp = 0; + return 0; +} + +int print_route(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) +{ + FILE *fp = (FILE*)arg; + struct rtmsg *r = NLMSG_DATA(n); + int len = n->nlmsg_len; + struct rtattr * tb[RTA_MAX+1]; + char abuf[256]; + inet_prefix dst; + inet_prefix src; + inet_prefix prefsrc; + inet_prefix via; + int host_len = -1; + SPRINT_BUF(b1); + + + if (n->nlmsg_type != RTM_NEWROUTE && n->nlmsg_type != RTM_DELROUTE) { + fprintf(stderr, "Not a route: %08x %08x %08x\n", + n->nlmsg_len, n->nlmsg_type, n->nlmsg_flags); + return 0; + } + if (filter.flushb && n->nlmsg_type != RTM_NEWROUTE) + return 0; + len -= NLMSG_LENGTH(sizeof(*r)); + if (len < 0) { + 
fprintf(stderr, "BUG: wrong nlmsg len %d\n", len); + return -1; + } + + if (r->rtm_family == AF_INET6) + host_len = 128; + else if (r->rtm_family == AF_INET) + host_len = 32; + else if (r->rtm_family == AF_DECnet) + host_len = 16; + else if (r->rtm_family == AF_IPX) + host_len = 80; + + if (r->rtm_family == AF_INET6) { + if (filter.tb) { + if (filter.tb < 0) { + if (!(r->rtm_flags&RTM_F_CLONED)) + return 0; + } else { + if (r->rtm_flags&RTM_F_CLONED) + return 0; + if (filter.tb == RT_TABLE_LOCAL) { + if (r->rtm_type != RTN_LOCAL) + return 0; + } else if (filter.tb == RT_TABLE_MAIN) { + if (r->rtm_type == RTN_LOCAL) + return 0; + } else { + return 0; + } + } + } + } else { + if (filter.tb > 0 && filter.tb != r->rtm_table) + return 0; + } + if ((filter.protocol^r->rtm_protocol)&filter.protocolmask) + return 0; + if ((filter.scope^r->rtm_scope)&filter.scopemask) + return 0; + if ((filter.type^r->rtm_type)&filter.typemask) + return 0; + if ((filter.tos^r->rtm_tos)&filter.tosmask) + return 0; + if (filter.rdst.family && + (r->rtm_family != filter.rdst.family || filter.rdst.bitlen > r->rtm_dst_len)) + return 0; + if (filter.mdst.family && + (r->rtm_family != filter.mdst.family || + (filter.mdst.bitlen >= 0 && filter.mdst.bitlen < r->rtm_dst_len))) + return 0; + if (filter.rsrc.family && + (r->rtm_family != filter.rsrc.family || filter.rsrc.bitlen > r->rtm_src_len)) + return 0; + if (filter.msrc.family && + (r->rtm_family != filter.msrc.family || + (filter.msrc.bitlen >= 0 && filter.msrc.bitlen < r->rtm_src_len))) + return 0; + if (filter.rvia.family && r->rtm_family != filter.rvia.family) + return 0; + if (filter.rprefsrc.family && r->rtm_family != filter.rprefsrc.family) + return 0; + + + memset(tb, 0, sizeof(tb)); + parse_rtattr(tb, RTA_MAX, RTM_RTA(r), len); + + memset(&dst, 0, sizeof(dst)); + dst.family = r->rtm_family; + if (tb[RTA_DST]) + memcpy(&dst.data, RTA_DATA(tb[RTA_DST]), (r->rtm_dst_len+7)/8); + if (filter.rsrc.family || filter.msrc.family) { + memset(&src, 
0, sizeof(src)); + src.family = r->rtm_family; + if (tb[RTA_SRC]) + memcpy(&src.data, RTA_DATA(tb[RTA_SRC]), (r->rtm_src_len+7)/8); + } + if (filter.rvia.bitlen>0) { + memset(&via, 0, sizeof(via)); + via.family = r->rtm_family; + if (tb[RTA_GATEWAY]) + memcpy(&via.data, RTA_DATA(tb[RTA_GATEWAY]), host_len); + } + if (filter.rprefsrc.bitlen>0) { + memset(&prefsrc, 0, sizeof(prefsrc)); + prefsrc.family = r->rtm_family; + if (tb[RTA_PREFSRC]) + memcpy(&prefsrc.data, RTA_DATA(tb[RTA_PREFSRC]), host_len); + } + + if (filter.rdst.family && inet_addr_match(&dst, &filter.rdst, filter.rdst.bitlen)) + return 0; + if (filter.mdst.family && filter.mdst.bitlen >= 0 && + inet_addr_match(&dst, &filter.mdst, r->rtm_dst_len)) + return 0; + + if (filter.rsrc.family && inet_addr_match(&src, &filter.rsrc, filter.rsrc.bitlen)) + return 0; + if (filter.msrc.family && filter.msrc.bitlen >= 0 && + inet_addr_match(&src, &filter.msrc, r->rtm_src_len)) + return 0; + + if (filter.rvia.family && inet_addr_match(&via, &filter.rvia, filter.rvia.bitlen)) + return 0; + if (filter.rprefsrc.family && inet_addr_match(&prefsrc, &filter.rprefsrc, filter.rprefsrc.bitlen)) + return 0; + if (filter.realmmask) { + __u32 realms = 0; + if (tb[RTA_FLOW]) + realms = *(__u32*)RTA_DATA(tb[RTA_FLOW]); + if ((realms^filter.realm)&filter.realmmask) + return 0; + } + if (filter.iifmask) { + int iif = 0; + if (tb[RTA_IIF]) + iif = *(int*)RTA_DATA(tb[RTA_IIF]); + if ((iif^filter.iif)&filter.iifmask) + return 0; + } + if (filter.oifmask) { + int oif = 0; + if (tb[RTA_OIF]) + oif = *(int*)RTA_DATA(tb[RTA_OIF]); + if ((oif^filter.oif)&filter.oifmask) + return 0; + } + if (filter.flushb && + r->rtm_family == AF_INET6 && + r->rtm_dst_len == 0 && + r->rtm_type == RTN_UNREACHABLE && + tb[RTA_PRIORITY] && + *(int*)RTA_DATA(tb[RTA_PRIORITY]) == -1) + return 0; + + if (filter.flushb) { + struct nlmsghdr *fn; + if (NLMSG_ALIGN(filter.flushp) + n->nlmsg_len > filter.flushe) { + if (flush_update()) + return -1; + } + fn = (struct 
nlmsghdr*)(filter.flushb + NLMSG_ALIGN(filter.flushp)); + memcpy(fn, n, n->nlmsg_len); + fn->nlmsg_type = RTM_DELROUTE; + fn->nlmsg_flags = NLM_F_REQUEST; + fn->nlmsg_seq = ++filter.rth->seq; + filter.flushp = (((char*)fn) + n->nlmsg_len) - filter.flushb; + filter.flushed++; + if (show_stats < 2) + return 0; + } + + if (n->nlmsg_type == RTM_DELROUTE) + fprintf(fp, "Deleted "); + if (r->rtm_type != RTN_UNICAST && !filter.type) + fprintf(fp, "%s ", rtnl_rtntype_n2a(r->rtm_type, b1, sizeof(b1))); + + if (tb[RTA_DST]) { + if (r->rtm_dst_len != host_len) { + fprintf(fp, "%s/%u ", rt_addr_n2a(r->rtm_family, + RTA_PAYLOAD(tb[RTA_DST]), + RTA_DATA(tb[RTA_DST]), + abuf, sizeof(abuf)), + r->rtm_dst_len + ); + } else { + fprintf(fp, "%s ", format_host(r->rtm_family, + RTA_PAYLOAD(tb[RTA_DST]), + RTA_DATA(tb[RTA_DST]), + abuf, sizeof(abuf)) + ); + } + } else if (r->rtm_dst_len) { + fprintf(fp, "0/%d ", r->rtm_dst_len); + } else { + fprintf(fp, "default "); + } + if (tb[RTA_SRC]) { + if (r->rtm_src_len != host_len) { + fprintf(fp, "from %s/%u ", rt_addr_n2a(r->rtm_family, + RTA_PAYLOAD(tb[RTA_SRC]), + RTA_DATA(tb[RTA_SRC]), + abuf, sizeof(abuf)), + r->rtm_src_len + ); + } else { + fprintf(fp, "from %s ", format_host(r->rtm_family, + RTA_PAYLOAD(tb[RTA_SRC]), + RTA_DATA(tb[RTA_SRC]), + abuf, sizeof(abuf)) + ); + } + } else if (r->rtm_src_len) { + fprintf(fp, "from 0/%u ", r->rtm_src_len); + } + if (r->rtm_tos && filter.tosmask != -1) { + SPRINT_BUF(b1); + fprintf(fp, "tos %s ", rtnl_dsfield_n2a(r->rtm_tos, b1, sizeof(b1))); + } + if (tb[RTA_GATEWAY] && filter.rvia.bitlen != host_len) { + fprintf(fp, "via %s ", + format_host(r->rtm_family, + RTA_PAYLOAD(tb[RTA_GATEWAY]), + RTA_DATA(tb[RTA_GATEWAY]), + abuf, sizeof(abuf))); + } + if (tb[RTA_OIF] && filter.oifmask != -1) + fprintf(fp, "dev %s ", ll_index_to_name(*(int*)RTA_DATA(tb[RTA_OIF]))); + + if (!(r->rtm_flags&RTM_F_CLONED)) { + if (r->rtm_table != RT_TABLE_MAIN && !filter.tb) + fprintf(fp, " table %s ", 
rtnl_rttable_n2a(r->rtm_table, b1, sizeof(b1))); + if (r->rtm_protocol != RTPROT_BOOT && filter.protocolmask != -1) + fprintf(fp, " proto %s ", rtnl_rtprot_n2a(r->rtm_protocol, b1, sizeof(b1))); + if (r->rtm_scope != RT_SCOPE_UNIVERSE && filter.scopemask != -1) + fprintf(fp, " scope %s ", rtnl_rtscope_n2a(r->rtm_scope, b1, sizeof(b1))); + } + if (tb[RTA_PREFSRC] && filter.rprefsrc.bitlen != host_len) { + /* Do not use format_host(). It is our local addr + and symbolic name will not be useful. + */ + fprintf(fp, " src %s ", + rt_addr_n2a(r->rtm_family, + RTA_PAYLOAD(tb[RTA_PREFSRC]), + RTA_DATA(tb[RTA_PREFSRC]), + abuf, sizeof(abuf))); + } + if (tb[RTA_PRIORITY]) + fprintf(fp, " metric %d ", *(__u32*)RTA_DATA(tb[RTA_PRIORITY])); + if (r->rtm_flags & RTNH_F_DEAD) + fprintf(fp, "dead "); + if (r->rtm_flags & RTNH_F_ONLINK) + fprintf(fp, "onlink "); + if (r->rtm_flags & RTNH_F_PERVASIVE) + fprintf(fp, "pervasive "); + if (r->rtm_flags & RTM_F_EQUALIZE) + fprintf(fp, "equalize "); + if (r->rtm_flags & RTM_F_NOTIFY) + fprintf(fp, "notify "); + + if (tb[RTA_FLOW] && filter.realmmask != ~0U) { + __u32 to = *(__u32*)RTA_DATA(tb[RTA_FLOW]); + __u32 from = to>>16; + to &= 0xFFFF; + fprintf(fp, "realm%s ", from ? "s" : ""); + if (from) { + fprintf(fp, "%s/", + rtnl_rtrealm_n2a(from, b1, sizeof(b1))); + } + fprintf(fp, "%s ", + rtnl_rtrealm_n2a(to, b1, sizeof(b1))); + } + if ((r->rtm_flags&RTM_F_CLONED) && r->rtm_family == AF_INET) { + __u32 flags = r->rtm_flags&~0xFFFF; + int first = 1; + + fprintf(fp, "%s cache ", _SL_); + +#define PRTFL(fl,flname) if (flags&RTCF_##fl) { \ + flags &= ~RTCF_##fl; \ + fprintf(fp, "%s" flname "%s", first ? "<" : "", flags ? 
"," : "> "); \ + first = 0; } + PRTFL(LOCAL, "local"); + PRTFL(REJECT, "reject"); + PRTFL(MULTICAST, "mc"); + PRTFL(BROADCAST, "brd"); + PRTFL(DNAT, "dst-nat"); + PRTFL(SNAT, "src-nat"); + PRTFL(MASQ, "masq"); + PRTFL(DIRECTDST, "dst-direct"); + PRTFL(DIRECTSRC, "src-direct"); + PRTFL(REDIRECTED, "redirected"); + PRTFL(DOREDIRECT, "redirect"); + PRTFL(FAST, "fastroute"); + PRTFL(NOTIFY, "notify"); + PRTFL(TPROXY, "proxy"); +#ifdef RTCF_EQUALIZE + PRTFL(EQUALIZE, "equalize"); +#endif + if (flags) + fprintf(fp, "%s%x> ", first ? "<" : "", flags); + if (tb[RTA_CACHEINFO]) { + struct rta_cacheinfo *ci = RTA_DATA(tb[RTA_CACHEINFO]); + static int hz; + if (!hz) + hz = get_hz(); + if (ci->rta_expires != 0) + fprintf(fp, " expires %dsec", ci->rta_expires/hz); + if (ci->rta_error != 0) + fprintf(fp, " error %d", ci->rta_error); + if (show_stats) { + if (ci->rta_clntref) + fprintf(fp, " users %d", ci->rta_clntref); + if (ci->rta_used != 0) + fprintf(fp, " used %d", ci->rta_used); + if (ci->rta_lastuse != 0) + fprintf(fp, " age %dsec", ci->rta_lastuse/hz); + } +#ifdef RTNETLINK_HAVE_PEERINFO + if (ci->rta_id) + fprintf(fp, " ipid 0x%04x", ci->rta_id); + if (ci->rta_ts || ci->rta_tsage) + fprintf(fp, " ts 0x%x tsage %dsec", ci->rta_ts, ci->rta_tsage); +#endif + } + } else if (r->rtm_family == AF_INET6) { + struct rta_cacheinfo *ci = NULL; + if (tb[RTA_CACHEINFO]) + ci = RTA_DATA(tb[RTA_CACHEINFO]); + if ((r->rtm_flags & RTM_F_CLONED) || (ci && ci->rta_expires)) { + static int hz; + if (!hz) + hz = get_hz(); + if (r->rtm_flags & RTM_F_CLONED) + fprintf(fp, "%s cache ", _SL_); + if (ci->rta_expires) + fprintf(fp, " expires %dsec", ci->rta_expires/hz); + if (ci->rta_error != 0) + fprintf(fp, " error %d", ci->rta_error); + if (show_stats) { + if (ci->rta_clntref) + fprintf(fp, " users %d", ci->rta_clntref); + if (ci->rta_used != 0) + fprintf(fp, " used %d", ci->rta_used); + if (ci->rta_lastuse != 0) + fprintf(fp, " age %dsec", ci->rta_lastuse/hz); + } + } else if (ci) { + if 
(ci->rta_error != 0) + fprintf(fp, " error %d", ci->rta_error); + } + } + if (tb[RTA_METRICS]) { + int i; + unsigned mxlock = 0; + struct rtattr *mxrta[RTAX_MAX+1]; + + memset(mxrta, 0, sizeof(mxrta)); + + parse_rtattr(mxrta, RTAX_MAX, RTA_DATA(tb[RTA_METRICS]), + RTA_PAYLOAD(tb[RTA_METRICS])); + if (mxrta[RTAX_LOCK]) + mxlock = *(unsigned*)RTA_DATA(mxrta[RTAX_LOCK]); + + for (i=2; i<=RTAX_MAX; i++) { + static char *mx_names[] = + { + "mtu", + "window", + "rtt", + "rttvar", + "ssthresh", + "cwnd", + "advmss", + "reordering", + }; + static int hz; + if (mxrta[i] == NULL) + continue; + if (!hz) + hz = get_hz(); + if (i-2 < sizeof(mx_names)/sizeof(char*)) + fprintf(fp, " %s", mx_names[i-2]); + else + fprintf(fp, " metric%d", i); + if (mxlock & (1<<i)) + fprintf(fp, " lock"); + + if (i != RTAX_RTT && i != RTAX_RTTVAR) + fprintf(fp, " %u", *(unsigned*)RTA_DATA(mxrta[i])); + else { + unsigned val = *(unsigned*)RTA_DATA(mxrta[i]); + + val *= 1000; + if (i == RTAX_RTT) + val /= 8; + else + val /= 4; + if (val >= hz) + fprintf(fp, " %ums", val/hz); + else + fprintf(fp, " %.2fms", (float)val/hz); + } + } + } + if (tb[RTA_IIF] && filter.iifmask != -1) { + fprintf(fp, " iif %s", ll_index_to_name(*(int*)RTA_DATA(tb[RTA_IIF]))); + } + if (tb[RTA_MULTIPATH]) { + struct rtnexthop *nh = RTA_DATA(tb[RTA_MULTIPATH]); + int first = 0; + + len = RTA_PAYLOAD(tb[RTA_MULTIPATH]); + + for (;;) { + if (len < sizeof(*nh)) + break; + if (nh->rtnh_len > len) + break; + if (r->rtm_flags&RTM_F_CLONED && r->rtm_type == RTN_MULTICAST) { + if (first) + fprintf(fp, " Oifs:"); + else + fprintf(fp, " "); + } else + fprintf(fp, "%s\tnexthop", _SL_); + if (nh->rtnh_len > sizeof(*nh)) { + memset(tb, 0, sizeof(tb)); + parse_rtattr(tb, RTA_MAX, RTNH_DATA(nh), nh->rtnh_len - sizeof(*nh)); + if (tb[RTA_GATEWAY]) { + fprintf(fp, " via %s ", + format_host(r->rtm_family, + RTA_PAYLOAD(tb[RTA_GATEWAY]), + RTA_DATA(tb[RTA_GATEWAY]), + abuf, sizeof(abuf))); + } + } + if (r->rtm_flags&RTM_F_CLONED && r->rtm_type == 
RTN_MULTICAST) { + fprintf(fp, " %s", ll_index_to_name(nh->rtnh_ifindex)); + if (nh->rtnh_hops != 1) + fprintf(fp, "(ttl>%d)", nh->rtnh_hops); + } else { + fprintf(fp, " dev %s", ll_index_to_name(nh->rtnh_ifindex)); + fprintf(fp, " weight %d", nh->rtnh_hops+1); + } + if (nh->rtnh_flags & RTNH_F_DEAD) + fprintf(fp, " dead"); + if (nh->rtnh_flags & RTNH_F_ONLINK) + fprintf(fp, " onlink"); + if (nh->rtnh_flags & RTNH_F_PERVASIVE) + fprintf(fp, " pervasive"); + len -= NLMSG_ALIGN(nh->rtnh_len); + nh = RTNH_NEXT(nh); + } + } + fprintf(fp, "\n"); + fflush(fp); + return 0; +} + + +int parse_one_nh(struct rtattr *rta, struct rtnexthop *rtnh, int *argcp, char ***argvp) +{ + int argc = *argcp; + char **argv = *argvp; + + while (++argv, --argc > 0) { + if (strcmp(*argv, "via") == 0) { + NEXT_ARG(); + rta_addattr32(rta, 4096, RTA_GATEWAY, get_addr32(*argv)); + rtnh->rtnh_len += sizeof(struct rtattr) + 4; + } else if (strcmp(*argv, "dev") == 0) { + NEXT_ARG(); + if ((rtnh->rtnh_ifindex = ll_name_to_index(*argv)) == 0) { + fprintf(stderr, "Cannot find device \"%s\"\n", *argv); + exit(1); + } + } else if (strcmp(*argv, "weight") == 0) { + unsigned w; + NEXT_ARG(); + if (get_unsigned(&w, *argv, 0) || w == 0 || w > 256) + invarg("\"weight\" is invalid\n", *argv); + rtnh->rtnh_hops = w - 1; + } else if (strcmp(*argv, "onlink") == 0) { + rtnh->rtnh_flags |= RTNH_F_ONLINK; + } else + break; + } + *argcp = argc; + *argvp = argv; + return 0; +} + +int parse_nexthops(struct nlmsghdr *n, struct rtmsg *r, int argc, char **argv) +{ + char buf[1024]; + struct rtattr *rta = (void*)buf; + struct rtnexthop *rtnh; + + rta->rta_type = RTA_MULTIPATH; + rta->rta_len = RTA_LENGTH(0); + rtnh = RTA_DATA(rta); + + while (argc > 0) { + if (strcmp(*argv, "nexthop") != 0) { + fprintf(stderr, "Error: \"nexthop\" or end of line is expected instead of \"%s\"\n", *argv); + exit(-1); + } + if (argc <= 1) { + fprintf(stderr, "Error: unexpected end of line after \"nexthop\"\n"); + exit(-1); + } + memset(rtnh, 0, 
sizeof(*rtnh)); + rtnh->rtnh_len = sizeof(*rtnh); + rta->rta_len += rtnh->rtnh_len; + parse_one_nh(rta, rtnh, &argc, &argv); + rtnh = RTNH_NEXT(rtnh); + } + + if (rta->rta_len > RTA_LENGTH(0)) + addattr_l(n, 1024, RTA_MULTIPATH, RTA_DATA(rta), RTA_PAYLOAD(rta)); + return 0; +} + + +int iproute_modify(int cmd, unsigned flags, int argc, char **argv) +{ + struct rtnl_handle rth; + struct { + struct nlmsghdr n; + struct rtmsg r; + char buf[1024]; + } req; + char mxbuf[256]; + struct rtattr * mxrta = (void*)mxbuf; + unsigned mxlock = 0; + char *d = NULL; + int gw_ok = 0; + int dst_ok = 0; + int nhs_ok = 0; + int scope_ok = 0; + int table_ok = 0; + int proto_ok = 0; + int type_ok = 0; + + memset(&req, 0, sizeof(req)); + + req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); + req.n.nlmsg_flags = NLM_F_REQUEST|flags; + req.n.nlmsg_type = cmd; + req.r.rtm_family = preferred_family; + req.r.rtm_table = RT_TABLE_MAIN; + req.r.rtm_scope = RT_SCOPE_NOWHERE; + + if (cmd != RTM_DELROUTE) { + req.r.rtm_protocol = RTPROT_BOOT; + req.r.rtm_scope = RT_SCOPE_UNIVERSE; + req.r.rtm_type = RTN_UNICAST; + } + + mxrta->rta_type = RTA_METRICS; + mxrta->rta_len = RTA_LENGTH(0); + + while (argc > 0) { + if (strcmp(*argv, "src") == 0) { + inet_prefix addr; + NEXT_ARG(); + get_addr(&addr, *argv, req.r.rtm_family); + if (req.r.rtm_family == AF_UNSPEC) + req.r.rtm_family = addr.family; + addattr_l(&req.n, sizeof(req), RTA_PREFSRC, &addr.data, addr.bytelen); + } else if (strcmp(*argv, "via") == 0) { + inet_prefix addr; + gw_ok = 1; + NEXT_ARG(); + get_addr(&addr, *argv, req.r.rtm_family); + if (req.r.rtm_family == AF_UNSPEC) + req.r.rtm_family = addr.family; + addattr_l(&req.n, sizeof(req), RTA_GATEWAY, &addr.data, addr.bytelen); + } else if (strcmp(*argv, "from") == 0) { + inet_prefix addr; + NEXT_ARG(); + get_prefix(&addr, *argv, req.r.rtm_family); + if (req.r.rtm_family == AF_UNSPEC) + req.r.rtm_family = addr.family; + if (addr.bytelen) + addattr_l(&req.n, sizeof(req), RTA_SRC, &addr.data, 
addr.bytelen); + req.r.rtm_src_len = addr.bitlen; + } else if (strcmp(*argv, "tos") == 0 || + matches(*argv, "dsfield") == 0) { + __u32 tos; + NEXT_ARG(); + if (rtnl_dsfield_a2n(&tos, *argv)) + invarg("\"tos\" value is invalid\n", *argv); + req.r.rtm_tos = tos; + } else if (matches(*argv, "metric") == 0 || + matches(*argv, "priority") == 0 || + matches(*argv, "preference") == 0) { + __u32 metric; + NEXT_ARG(); + if (get_u32(&metric, *argv, 0)) + invarg("\"metric\" value is invalid\n", *argv); + addattr32(&req.n, sizeof(req), RTA_PRIORITY, metric); + } else if (strcmp(*argv, "scope") == 0) { + int scope = 0; + NEXT_ARG(); + if (rtnl_rtscope_a2n(&scope, *argv)) + invarg("invalid \"scope\" value\n", *argv); + req.r.rtm_scope = scope; + scope_ok = 1; + } else if (strcmp(*argv, "mtu") == 0) { + unsigned mtu; + NEXT_ARG(); + if (strcmp(*argv, "lock") == 0) { + mxlock |= (1<<RTAX_MTU); + NEXT_ARG(); + } + if (get_unsigned(&mtu, *argv, 0)) + invarg("\"mtu\" value is invalid\n", *argv); + rta_addattr32(mxrta, sizeof(mxbuf), RTAX_MTU, mtu); +#ifdef RTAX_ADVMSS + } else if (strcmp(*argv, "advmss") == 0) { + unsigned mss; + NEXT_ARG(); + if (strcmp(*argv, "lock") == 0) { + mxlock |= (1<<RTAX_ADVMSS); + NEXT_ARG(); + } + if (get_unsigned(&mss, *argv, 0)) + invarg("\"mss\" value is invalid\n", *argv); + rta_addattr32(mxrta, sizeof(mxbuf), RTAX_ADVMSS, mss); +#endif +#ifdef RTAX_REORDERING + } else if (matches(*argv, "reordering") == 0) { + unsigned reord; + NEXT_ARG(); + if (strcmp(*argv, "lock") == 0) { + mxlock |= (1<<RTAX_REORDERING); + NEXT_ARG(); + } + if (get_unsigned(&reord, *argv, 0)) + invarg("\"reordering\" value is invalid\n", *argv); + rta_addattr32(mxrta, sizeof(mxbuf), RTAX_REORDERING, reord); +#endif + } else if (strcmp(*argv, "rtt") == 0) { + unsigned rtt; + NEXT_ARG(); + if (strcmp(*argv, "lock") == 0) { + mxlock |= (1<<RTAX_RTT); + NEXT_ARG(); + } + if (get_unsigned(&rtt, *argv, 0)) + invarg("\"rtt\" value is invalid\n", *argv); + rta_addattr32(mxrta, 
sizeof(mxbuf), RTAX_RTT, rtt); + } else if (matches(*argv, "window") == 0) { + unsigned win; + NEXT_ARG(); + if (strcmp(*argv, "lock") == 0) { + mxlock |= (1<<RTAX_WINDOW); + NEXT_ARG(); + } + if (get_unsigned(&win, *argv, 0)) + invarg("\"window\" value is invalid\n", *argv); + rta_addattr32(mxrta, sizeof(mxbuf), RTAX_WINDOW, win); + } else if (matches(*argv, "cwnd") == 0) { + unsigned win; + NEXT_ARG(); + if (strcmp(*argv, "lock") == 0) { + mxlock |= (1<<RTAX_CWND); + NEXT_ARG(); + } + if (get_unsigned(&win, *argv, 0)) + invarg("\"cwnd\" value is invalid\n", *argv); + rta_addattr32(mxrta, sizeof(mxbuf), RTAX_CWND, win); + } else if (matches(*argv, "rttvar") == 0) { + unsigned win; + NEXT_ARG(); + if (strcmp(*argv, "lock") == 0) { + mxlock |= (1<<RTAX_RTTVAR); + NEXT_ARG(); + } + if (get_unsigned(&win, *argv, 0)) + invarg("\"rttvar\" value is invalid\n", *argv); + rta_addattr32(mxrta, sizeof(mxbuf), RTAX_RTTVAR, win); + } else if (matches(*argv, "ssthresh") == 0) { + unsigned win; + NEXT_ARG(); + if (strcmp(*argv, "lock") == 0) { + mxlock |= (1<<RTAX_SSTHRESH); + NEXT_ARG(); + } + if (get_unsigned(&win, *argv, 0)) + invarg("\"ssthresh\" value is invalid\n", *argv); + rta_addattr32(mxrta, sizeof(mxbuf), RTAX_SSTHRESH, win); + } else if (matches(*argv, "realms") == 0) { + __u32 realm; + NEXT_ARG(); + if (get_rt_realms(&realm, *argv)) + invarg("\"realm\" value is invalid\n", *argv); + addattr32(&req.n, sizeof(req), RTA_FLOW, realm); + } else if (strcmp(*argv, "onlink") == 0) { + req.r.rtm_flags |= RTNH_F_ONLINK; + } else if (matches(*argv, "equalize") == 0 || + strcmp(*argv, "eql") == 0) { + req.r.rtm_flags |= RTM_F_EQUALIZE; + } else if (strcmp(*argv, "nexthop") == 0) { + nhs_ok = 1; + break; + } else if (matches(*argv, "protocol") == 0) { + int prot; + NEXT_ARG(); + if (rtnl_rtprot_a2n(&prot, *argv)) + invarg("\"protocol\" value is invalid\n", *argv); + req.r.rtm_protocol = prot; + proto_ok =1; + } else if (matches(*argv, "table") == 0) { + int tid; + NEXT_ARG(); + 
if (rtnl_rttable_a2n(&tid, *argv)) + invarg("\"table\" value is invalid\n", *argv); + req.r.rtm_table = tid; + table_ok = 1; + } else if (strcmp(*argv, "dev") == 0 || + strcmp(*argv, "oif") == 0) { + NEXT_ARG(); + d = *argv; + } else { + int type; + inet_prefix dst; + + if (strcmp(*argv, "to") == 0) { + NEXT_ARG(); + } + if ((**argv < '0' || **argv > '9') && + rtnl_rtntype_a2n(&type, *argv) == 0) { + NEXT_ARG(); + req.r.rtm_type = type; + type_ok = 1; + } + + if (matches(*argv, "help") == 0) + usage(); + if (dst_ok) + duparg2("to", *argv); + get_prefix(&dst, *argv, req.r.rtm_family); + if (req.r.rtm_family == AF_UNSPEC) + req.r.rtm_family = dst.family; + req.r.rtm_dst_len = dst.bitlen; + dst_ok = 1; + if (dst.bytelen) + addattr_l(&req.n, sizeof(req), RTA_DST, &dst.data, dst.bytelen); + } + argc--; argv++; + } + + if (rtnl_open(&rth, 0) < 0) + exit(1); + + if (d || nhs_ok) { + int idx; + + ll_init_map(&rth); + + if (d) { + if ((idx = ll_name_to_index(d)) == 0) { + fprintf(stderr, "Cannot find device \"%s\"\n", d); + return -1; + } + addattr32(&req.n, sizeof(req), RTA_OIF, idx); + } + } + + if (mxrta->rta_len > RTA_LENGTH(0)) { + if (mxlock) + rta_addattr32(mxrta, sizeof(mxbuf), RTAX_LOCK, mxlock); + addattr_l(&req.n, sizeof(req), RTA_METRICS, RTA_DATA(mxrta), RTA_PAYLOAD(mxrta)); + } + + if (nhs_ok) + parse_nexthops(&req.n, &req.r, argc, argv); + + if (!table_ok) { + if (req.r.rtm_type == RTN_LOCAL || + req.r.rtm_type == RTN_BROADCAST || + req.r.rtm_type == RTN_NAT || + req.r.rtm_type == RTN_ANYCAST) + req.r.rtm_table = RT_TABLE_LOCAL; + } + if (!scope_ok) { + if (req.r.rtm_type == RTN_LOCAL || + req.r.rtm_type == RTN_NAT) + req.r.rtm_scope = RT_SCOPE_HOST; + else if (req.r.rtm_type == RTN_BROADCAST || + req.r.rtm_type == RTN_MULTICAST || + req.r.rtm_type == RTN_ANYCAST) + req.r.rtm_scope = RT_SCOPE_LINK; + else if (req.r.rtm_type == RTN_UNICAST || + req.r.rtm_type == RTN_UNSPEC) { + if (cmd == RTM_DELROUTE) + req.r.rtm_scope = RT_SCOPE_NOWHERE; + else if (!gw_ok && 
!nhs_ok) + req.r.rtm_scope = RT_SCOPE_LINK; + } + } + + if (req.r.rtm_family == AF_UNSPEC) + req.r.rtm_family = AF_INET; + + if (rtnl_talk(&rth, &req.n, 0, 0, NULL, NULL, NULL) < 0) + exit(2); + + return 0; +} + +static int rtnl_rtcache_request(struct rtnl_handle *rth, int family) +{ + struct { + struct nlmsghdr nlh; + struct rtmsg rtm; + } req; + struct sockaddr_nl nladdr; + + memset(&nladdr, 0, sizeof(nladdr)); + memset(&req, 0, sizeof(req)); + nladdr.nl_family = AF_NETLINK; + + req.nlh.nlmsg_len = sizeof(req); + req.nlh.nlmsg_type = RTM_GETROUTE; + req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_REQUEST; + req.nlh.nlmsg_pid = 0; + req.nlh.nlmsg_seq = rth->dump = ++rth->seq; + req.rtm.rtm_family = family; + req.rtm.rtm_flags |= RTM_F_CLONED; + + return sendto(rth->fd, (void*)&req, sizeof(req), 0, (struct sockaddr*)&nladdr, sizeof(nladdr)); +} + +static int iproute_flush_cache(void) +{ +#define ROUTE_FLUSH_PATH "/proc/sys/net/ipv4/route/flush" + + int len; + int flush_fd = open (ROUTE_FLUSH_PATH, O_WRONLY); + char *buffer = "-1"; + + if (flush_fd < 0) { + fprintf (stderr, "Cannot open \"%s\"\n", ROUTE_FLUSH_PATH); + return -1; + } + + len = strlen (buffer); + + if ((write (flush_fd, (void *)buffer, len)) < len) { + fprintf (stderr, "Cannot flush routing cache\n"); + return -1; + } + close(flush_fd); + return 0; +} + + +static int iproute_list_or_flush(int argc, char **argv, int flush) +{ + int do_ipv6 = preferred_family; + struct rtnl_handle rth; + char *id = NULL; + char *od = NULL; + + iproute_reset_filter(); + filter.tb = RT_TABLE_MAIN; + + if (flush && argc <= 0) { + fprintf(stderr, "\"ip route flush\" requires arguments.\n"); + return -1; + } + + while (argc > 0) { + if (matches(*argv, "table") == 0) { + int tid; + NEXT_ARG(); + if (rtnl_rttable_a2n(&tid, *argv)) { + if (strcmp(*argv, "all") == 0) { + tid = 0; + } else if (strcmp(*argv, "cache") == 0) { + tid = -1; + } else if (strcmp(*argv, "help") == 0) { + usage(); + } else { + invarg("table id value is invalid\n", 
*argv); + } + } + filter.tb = tid; + } else if (matches(*argv, "cached") == 0 || + matches(*argv, "cloned") == 0) { + filter.tb = -1; + } else if (strcmp(*argv, "tos") == 0 || + matches(*argv, "dsfield") == 0) { + __u32 tos; + NEXT_ARG(); + if (rtnl_dsfield_a2n(&tos, *argv)) + invarg("TOS value is invalid\n", *argv); + filter.tos = tos; + filter.tosmask = -1; + } else if (matches(*argv, "protocol") == 0) { + int prot = 0; + NEXT_ARG(); + filter.protocolmask = -1; + if (rtnl_rtprot_a2n(&prot, *argv)) { + if (strcmp(*argv, "all") != 0) + invarg("invalid \"protocol\"\n", *argv); + prot = 0; + filter.protocolmask = 0; + } + filter.protocol = prot; + } else if (matches(*argv, "scope") == 0) { + int scope = 0; + NEXT_ARG(); + filter.scopemask = -1; + if (rtnl_rtscope_a2n(&scope, *argv)) { + if (strcmp(*argv, "all") != 0) + invarg("invalid \"scope\"\n", *argv); + scope = RT_SCOPE_NOWHERE; + filter.scopemask = 0; + } + filter.scope = scope; + } else if (matches(*argv, "type") == 0) { + int type; + NEXT_ARG(); + filter.typemask = -1; + if (rtnl_rtntype_a2n(&type, *argv)) + invarg("node type value is invalid\n", *argv); + filter.type = type; + } else if (strcmp(*argv, "dev") == 0 || + strcmp(*argv, "oif") == 0) { + NEXT_ARG(); + od = *argv; + } else if (strcmp(*argv, "iif") == 0) { + NEXT_ARG(); + id = *argv; + } else if (strcmp(*argv, "via") == 0) { + NEXT_ARG(); + get_prefix(&filter.rvia, *argv, do_ipv6); + } else if (strcmp(*argv, "src") == 0) { + NEXT_ARG(); + get_prefix(&filter.rprefsrc, *argv, do_ipv6); + } else if (matches(*argv, "realms") == 0) { + __u32 realm; + NEXT_ARG(); + if (get_rt_realms(&realm, *argv)) + invarg("invalid realms\n", *argv); + filter.realm = realm; + filter.realmmask = ~0U; + if ((filter.realm&0xFFFF) == 0 && + (*argv)[strlen(*argv) - 1] == '/') + filter.realmmask &= ~0xFFFF; + if ((filter.realm&0xFFFF0000U) == 0 && + (strchr(*argv, '/') == NULL || + (*argv)[0] == '/')) + filter.realmmask &= ~0xFFFF0000U; + } else if (matches(*argv, "from") == 
0) { + NEXT_ARG(); + if (matches(*argv, "root") == 0) { + NEXT_ARG(); + get_prefix(&filter.rsrc, *argv, do_ipv6); + } else if (matches(*argv, "match") == 0) { + NEXT_ARG(); + get_prefix(&filter.msrc, *argv, do_ipv6); + } else { + if (matches(*argv, "exact") == 0) { + NEXT_ARG(); + } + get_prefix(&filter.msrc, *argv, do_ipv6); + filter.rsrc = filter.msrc; + } + } else { + if (matches(*argv, "to") == 0) { + NEXT_ARG(); + } + if (matches(*argv, "root") == 0) { + NEXT_ARG(); + get_prefix(&filter.rdst, *argv, do_ipv6); + } else if (matches(*argv, "match") == 0) { + NEXT_ARG(); + get_prefix(&filter.mdst, *argv, do_ipv6); + } else { + if (matches(*argv, "exact") == 0) { + NEXT_ARG(); + } + get_prefix(&filter.mdst, *argv, do_ipv6); + filter.rdst = filter.mdst; + } + } + argc--; argv++; + } + + if (do_ipv6 == AF_UNSPEC && filter.tb) + do_ipv6 = AF_INET; + + if (rtnl_open(&rth, 0) < 0) + exit(1); + + ll_init_map(&rth); + + if (id || od) { + int idx; + + if (id) { + if ((idx = ll_name_to_index(id)) == 0) { + fprintf(stderr, "Cannot find device \"%s\"\n", id); + return -1; + } + filter.iif = idx; + filter.iifmask = -1; + } + if (od) { + if ((idx = ll_name_to_index(od)) == 0) { + fprintf(stderr, "Cannot find device \"%s\"\n", od); + return -1; + } + filter.oif = idx; + filter.oifmask = -1; + } + } + + if (flush) { + int round = 0; + char flushb[4096-512]; + + if (filter.tb == -1) { + if (do_ipv6 != AF_INET6) { + iproute_flush_cache(); + if (show_stats) + printf("*** IPv4 routing cache is flushed.\n"); + } + if (do_ipv6 == AF_INET) + return 0; + } + + filter.flushb = flushb; + filter.flushp = 0; + filter.flushe = sizeof(flushb); + filter.rth = &rth; + + for (;;) { + if (rtnl_wilddump_request(&rth, do_ipv6, RTM_GETROUTE) < 0) { + perror("Cannot send dump request"); + exit(1); + } + filter.flushed = 0; + if (rtnl_dump_filter(&rth, print_route, stdout, NULL, NULL) < 0) { + fprintf(stderr, "Flush terminated\n"); + exit(1); + } + if (filter.flushed == 0) { + if (round == 0) { + if 
(filter.tb != -1 || do_ipv6 == AF_INET6) + fprintf(stderr, "Nothing to flush.\n"); + } else if (show_stats) + printf("*** Flush is complete after %d round%s ***\n", round, round>1?"s":""); + fflush(stdout); + return 0; + } + round++; + if (flush_update() < 0) + exit(1); + if (show_stats) { + printf("\n*** Round %d, deleting %d entries ***\n", round, filter.flushed); + fflush(stdout); + } + } + } + + if (filter.tb != -1) { + if (rtnl_wilddump_request(&rth, do_ipv6, RTM_GETROUTE) < 0) { + perror("Cannot send dump request"); + exit(1); + } + } else { + if (rtnl_rtcache_request(&rth, do_ipv6) < 0) { + perror("Cannot send dump request"); + exit(1); + } + } + + if (rtnl_dump_filter(&rth, print_route, stdout, NULL, NULL) < 0) { + fprintf(stderr, "Dump terminated\n"); + exit(1); + } + + exit(0); +} + + +int iproute_get(int argc, char **argv) +{ + struct rtnl_handle rth; + struct { + struct nlmsghdr n; + struct rtmsg r; + char buf[1024]; + } req; + char *idev = NULL; + char *odev = NULL; + int connected = 0; + int from_ok = 0; + + memset(&req, 0, sizeof(req)); + + iproute_reset_filter(); + + req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); + req.n.nlmsg_flags = NLM_F_REQUEST; + req.n.nlmsg_type = RTM_GETROUTE; + req.r.rtm_family = preferred_family; + req.r.rtm_table = 0; + req.r.rtm_protocol = 0; + req.r.rtm_scope = 0; + req.r.rtm_type = 0; + req.r.rtm_src_len = 0; + req.r.rtm_dst_len = 0; + req.r.rtm_tos = 0; + + while (argc > 0) { + if (strcmp(*argv, "tos") == 0 || + matches(*argv, "dsfield") == 0) { + __u32 tos; + NEXT_ARG(); + if (rtnl_dsfield_a2n(&tos, *argv)) + invarg("TOS value is invalid\n", *argv); + req.r.rtm_tos = tos; + } else if (matches(*argv, "from") == 0) { + inet_prefix addr; + NEXT_ARG(); + if (matches(*argv, "help") == 0) + usage(); + from_ok = 1; + get_prefix(&addr, *argv, req.r.rtm_family); + if (req.r.rtm_family == AF_UNSPEC) + req.r.rtm_family = addr.family; + if (addr.bytelen) + addattr_l(&req.n, sizeof(req), RTA_SRC, &addr.data, addr.bytelen); 
+ req.r.rtm_src_len = addr.bitlen; + } else if (matches(*argv, "iif") == 0) { + NEXT_ARG(); + idev = *argv; + } else if (matches(*argv, "oif") == 0 || + strcmp(*argv, "dev") == 0) { + NEXT_ARG(); + odev = *argv; + } else if (matches(*argv, "notify") == 0) { + req.r.rtm_flags |= RTM_F_NOTIFY; + } else if (matches(*argv, "connected") == 0) { + connected = 1; + } else { + inet_prefix addr; + if (strcmp(*argv, "to") == 0) { + NEXT_ARG(); + } + if (matches(*argv, "help") == 0) + usage(); + get_prefix(&addr, *argv, req.r.rtm_family); + if (req.r.rtm_family == AF_UNSPEC) + req.r.rtm_family = addr.family; + if (addr.bytelen) + addattr_l(&req.n, sizeof(req), RTA_DST, &addr.data, addr.bytelen); + req.r.rtm_dst_len = addr.bitlen; + } + argc--; argv++; + } + + if (req.r.rtm_dst_len == 0) { + fprintf(stderr, "need at least destination address\n"); + exit(1); + } + + if (rtnl_open(&rth, 0) < 0) + exit(1); + + ll_init_map(&rth); + + if (idev || odev) { + int idx; + + if (idev) { + if ((idx = ll_name_to_index(idev)) == 0) { + fprintf(stderr, "Cannot find device \"%s\"\n", idev); + return -1; + } + addattr32(&req.n, sizeof(req), RTA_IIF, idx); + } + if (odev) { + if ((idx = ll_name_to_index(odev)) == 0) { + fprintf(stderr, "Cannot find device \"%s\"\n", odev); + return -1; + } + addattr32(&req.n, sizeof(req), RTA_OIF, idx); + } + } + + if (req.r.rtm_family == AF_UNSPEC) + req.r.rtm_family = AF_INET; + + if (rtnl_talk(&rth, &req.n, 0, 0, &req.n, NULL, NULL) < 0) + exit(2); + + if (connected && !from_ok) { + struct rtmsg *r = NLMSG_DATA(&req.n); + int len = req.n.nlmsg_len; + struct rtattr * tb[RTA_MAX+1]; + + if (print_route(NULL, &req.n, (void*)stdout) < 0) { + fprintf(stderr, "An error :-)\n"); + exit(1); + } + + if (req.n.nlmsg_type != RTM_NEWROUTE) { + fprintf(stderr, "Not a route?\n"); + return -1; + } + len -= NLMSG_LENGTH(sizeof(*r)); + if (len < 0) { + fprintf(stderr, "Wrong len %d\n", len); + return -1; + } + + memset(tb, 0, sizeof(tb)); + parse_rtattr(tb, RTA_MAX, 
RTM_RTA(r), len); + + if (tb[RTA_PREFSRC]) { + tb[RTA_PREFSRC]->rta_type = RTA_SRC; + r->rtm_src_len = 8*RTA_PAYLOAD(tb[RTA_PREFSRC]); + } else if (!tb[RTA_SRC]) { + fprintf(stderr, "Failed to connect the route\n"); + return -1; + } + if (!odev && tb[RTA_OIF]) + tb[RTA_OIF]->rta_type = 0; + if (tb[RTA_GATEWAY]) + tb[RTA_GATEWAY]->rta_type = 0; + if (!idev && tb[RTA_IIF]) + tb[RTA_IIF]->rta_type = 0; + req.n.nlmsg_flags = NLM_F_REQUEST; + req.n.nlmsg_type = RTM_GETROUTE; + + if (rtnl_talk(&rth, &req.n, 0, 0, &req.n, NULL, NULL) < 0) + exit(2); + } + + if (print_route(NULL, &req.n, (void*)stdout) < 0) { + fprintf(stderr, "An error :-)\n"); + exit(1); + } + + exit(0); +} + +void iproute_reset_filter() +{ + memset(&filter, 0, sizeof(filter)); + filter.mdst.bitlen = -1; + filter.msrc.bitlen = -1; +} + +int do_iproute(int argc, char **argv) +{ + if (argc < 1) + return iproute_list_or_flush(0, NULL, 0); + + if (matches(*argv, "add") == 0) + return iproute_modify(RTM_NEWROUTE, NLM_F_CREATE|NLM_F_EXCL, + argc-1, argv+1); + if (matches(*argv, "change") == 0 || strcmp(*argv, "chg") == 0) + return iproute_modify(RTM_NEWROUTE, NLM_F_REPLACE, + argc-1, argv+1); + if (matches(*argv, "replace") == 0) + return iproute_modify(RTM_NEWROUTE, NLM_F_CREATE|NLM_F_REPLACE, + argc-1, argv+1); + if (matches(*argv, "prepend") == 0) + return iproute_modify(RTM_NEWROUTE, NLM_F_CREATE, + argc-1, argv+1); + if (matches(*argv, "append") == 0) + return iproute_modify(RTM_NEWROUTE, NLM_F_CREATE|NLM_F_APPEND, + argc-1, argv+1); + if (matches(*argv, "test") == 0) + return iproute_modify(RTM_NEWROUTE, NLM_F_EXCL, + argc-1, argv+1); + if (matches(*argv, "delete") == 0) + return iproute_modify(RTM_DELROUTE, 0, + argc-1, argv+1); + if (matches(*argv, "list") == 0 || matches(*argv, "show") == 0 + || matches(*argv, "lst") == 0) + return iproute_list_or_flush(argc-1, argv+1, 0); + if (matches(*argv, "get") == 0) + return iproute_get(argc-1, argv+1); + if (matches(*argv, "flush") == 0) + return 
iproute_list_or_flush(argc-1, argv+1, 1); + if (matches(*argv, "help") == 0) + usage(); + fprintf(stderr, "Command \"%s\" is unknown, try \"ip route help\".\n", *argv); + exit(-1); +} + diff --git a/ip/iprule.c b/ip/iprule.c index e69de29b..457864f8 100644 --- a/ip/iprule.c +++ b/ip/iprule.c @@ -0,0 +1,323 @@ +/* + * iprule.c "ip rule". + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * + * Changes: + * + * Rani Assaf <rani@magic.metawire.com> 980929: resolve addresses + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <arpa/inet.h> +#include <string.h> + +#include "rt_names.h" +#include "utils.h" + +static void usage(void) __attribute__((noreturn)); + +static void usage(void) +{ + fprintf(stderr, "Usage: ip rule [ list | add | del ] SELECTOR ACTION\n"); + fprintf(stderr, "SELECTOR := [ from PREFIX ] [ to PREFIX ] [ tos TOS ] [ fwmark FWMARK ]\n"); + fprintf(stderr, " [ dev STRING ] [ pref NUMBER ]\n"); + fprintf(stderr, "ACTION := [ table TABLE_ID ] [ nat ADDRESS ]\n"); + fprintf(stderr, " [ prohibit | reject | unreachable ]\n"); + fprintf(stderr, " [ realms [SRCREALM/]DSTREALM ]\n"); + fprintf(stderr, "TABLE_ID := [ local | main | default | NUMBER ]\n"); + exit(-1); +} + +int print_rule(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) +{ + FILE *fp = (FILE*)arg; + struct rtmsg *r = NLMSG_DATA(n); + int len = n->nlmsg_len; + int host_len = -1; + struct rtattr * tb[RTA_MAX+1]; + char abuf[256]; + SPRINT_BUF(b1); + + if (n->nlmsg_type != RTM_NEWRULE) + return 0; + + len -= NLMSG_LENGTH(sizeof(*r)); + if (len < 0) + return -1; + + memset(tb, 0, 
sizeof(tb)); + parse_rtattr(tb, RTA_MAX, RTM_RTA(r), len); + + if (r->rtm_family == AF_INET) + host_len = 32; + else if (r->rtm_family == AF_INET6) + host_len = 128; + else if (r->rtm_family == AF_DECnet) + host_len = 16; + else if (r->rtm_family == AF_IPX) + host_len = 80; + + if (tb[RTA_PRIORITY]) + fprintf(fp, "%u:\t", *(unsigned*)RTA_DATA(tb[RTA_PRIORITY])); + else + fprintf(fp, "0:\t"); + + if (tb[RTA_SRC]) { + if (r->rtm_src_len != host_len) { + fprintf(fp, "from %s/%u ", rt_addr_n2a(r->rtm_family, + RTA_PAYLOAD(tb[RTA_SRC]), + RTA_DATA(tb[RTA_SRC]), + abuf, sizeof(abuf)), + r->rtm_src_len + ); + } else { + fprintf(fp, "from %s ", format_host(r->rtm_family, + RTA_PAYLOAD(tb[RTA_SRC]), + RTA_DATA(tb[RTA_SRC]), + abuf, sizeof(abuf)) + ); + } + } else if (r->rtm_src_len) { + fprintf(fp, "from 0/%d ", r->rtm_src_len); + } else { + fprintf(fp, "from all "); + } + + if (tb[RTA_DST]) { + if (r->rtm_dst_len != host_len) { + fprintf(fp, "to %s/%u ", rt_addr_n2a(r->rtm_family, + RTA_PAYLOAD(tb[RTA_DST]), + RTA_DATA(tb[RTA_DST]), + abuf, sizeof(abuf)), + r->rtm_dst_len + ); + } else { + fprintf(fp, "to %s ", format_host(r->rtm_family, + RTA_PAYLOAD(tb[RTA_DST]), + RTA_DATA(tb[RTA_DST]), + abuf, sizeof(abuf))); + } + } else if (r->rtm_dst_len) { + fprintf(fp, "to 0/%d ", r->rtm_dst_len); + } + + if (r->rtm_tos) { + SPRINT_BUF(b1); + fprintf(fp, "tos %s ", rtnl_dsfield_n2a(r->rtm_tos, b1, sizeof(b1))); + } + if (tb[RTA_PROTOINFO]) { + fprintf(fp, "fwmark %8x ", *(__u32*)RTA_DATA(tb[RTA_PROTOINFO])); + } + + if (tb[RTA_IIF]) { + fprintf(fp, "iif %s ", (char*)RTA_DATA(tb[RTA_IIF])); + } + + if (r->rtm_table) + fprintf(fp, "lookup %s ", rtnl_rttable_n2a(r->rtm_table, b1, sizeof(b1))); + + if (tb[RTA_FLOW]) { + __u32 to = *(__u32*)RTA_DATA(tb[RTA_FLOW]); + __u32 from = to>>16; + to &= 0xFFFF; + fprintf(fp, "realms "); /* keyword must be printed even when only a destination realm is set */ + if (from) + fprintf(fp, "%s/", + rtnl_rtrealm_n2a(from, b1, sizeof(b1))); + fprintf(fp, "%s ", + rtnl_rtrealm_n2a(to, b1, sizeof(b1))); + } + + if (r->rtm_type == RTN_NAT) 
{ + if (tb[RTA_GATEWAY]) { + fprintf(fp, "map-to %s ", + format_host(r->rtm_family, + RTA_PAYLOAD(tb[RTA_GATEWAY]), + RTA_DATA(tb[RTA_GATEWAY]), + abuf, sizeof(abuf))); + } else + fprintf(fp, "masquerade"); + } else if (r->rtm_type != RTN_UNICAST) + fprintf(fp, "%s", rtnl_rtntype_n2a(r->rtm_type, b1, sizeof(b1))); + + fprintf(fp, "\n"); + fflush(fp); + return 0; +} + +int iprule_list(int argc, char **argv) +{ + struct rtnl_handle rth; + int af = preferred_family; + + if (af == AF_UNSPEC) + af = AF_INET; + + if (argc > 0) { + fprintf(stderr, "\"ip rule show\" does not take any arguments.\n"); /* listing accepts no selector */ + return -1; + } + + if (rtnl_open(&rth, 0) < 0) + return 1; + + if (rtnl_wilddump_request(&rth, af, RTM_GETRULE) < 0) { + perror("Cannot send dump request"); + return 1; + } + + if (rtnl_dump_filter(&rth, print_rule, stdout, NULL, NULL) < 0) { + fprintf(stderr, "Dump terminated\n"); + return 1; + } + + return 0; +} + + +int iprule_modify(int cmd, int argc, char **argv) +{ + int table_ok = 0; + struct rtnl_handle rth; + struct { + struct nlmsghdr n; + struct rtmsg r; + char buf[1024]; + } req; + + memset(&req, 0, sizeof(req)); + + req.n.nlmsg_type = cmd; + req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); + req.n.nlmsg_flags = NLM_F_REQUEST; + req.r.rtm_family = preferred_family; + req.r.rtm_protocol = RTPROT_BOOT; + req.r.rtm_scope = RT_SCOPE_UNIVERSE; + req.r.rtm_table = 0; + req.r.rtm_type = RTN_UNSPEC; + + if (cmd == RTM_NEWRULE) { + req.n.nlmsg_flags |= NLM_F_CREATE|NLM_F_EXCL; + req.r.rtm_type = RTN_UNICAST; + } + + while (argc > 0) { + if (strcmp(*argv, "from") == 0) { + inet_prefix dst; + NEXT_ARG(); + get_prefix(&dst, *argv, req.r.rtm_family); + req.r.rtm_src_len = dst.bitlen; + addattr_l(&req.n, sizeof(req), RTA_SRC, &dst.data, dst.bytelen); + } else if (strcmp(*argv, "to") == 0) { + inet_prefix dst; + NEXT_ARG(); + get_prefix(&dst, *argv, req.r.rtm_family); + req.r.rtm_dst_len = dst.bitlen; + addattr_l(&req.n, sizeof(req), RTA_DST, &dst.data, dst.bytelen); + } else if 
(matches(*argv, "preference") == 0 || + matches(*argv, "order") == 0 || + matches(*argv, "priority") == 0) { + __u32 pref; + NEXT_ARG(); + if (get_u32(&pref, *argv, 0)) + invarg("preference value is invalid\n", *argv); + addattr32(&req.n, sizeof(req), RTA_PRIORITY, pref); + } else if (strcmp(*argv, "tos") == 0) { + __u32 tos; + NEXT_ARG(); + if (rtnl_dsfield_a2n(&tos, *argv)) + invarg("TOS value is invalid\n", *argv); + req.r.rtm_tos = tos; + } else if (strcmp(*argv, "fwmark") == 0) { + __u32 fwmark; + NEXT_ARG(); + if (get_u32(&fwmark, *argv, 16)) + invarg("fwmark value is invalid\n", *argv); + addattr32(&req.n, sizeof(req), RTA_PROTOINFO, fwmark); + } else if (matches(*argv, "realms") == 0) { + __u32 realm; + NEXT_ARG(); + if (get_rt_realms(&realm, *argv)) + invarg("invalid realms\n", *argv); + addattr32(&req.n, sizeof(req), RTA_FLOW, realm); + } else if (matches(*argv, "table") == 0 || + strcmp(*argv, "lookup") == 0) { + int tid; + NEXT_ARG(); + if (rtnl_rttable_a2n(&tid, *argv)) + invarg("invalid table ID\n", *argv); + req.r.rtm_table = tid; + table_ok = 1; + } else if (strcmp(*argv, "dev") == 0 || + strcmp(*argv, "iif") == 0) { + NEXT_ARG(); + addattr_l(&req.n, sizeof(req), RTA_IIF, *argv, strlen(*argv)+1); + } else if (strcmp(*argv, "nat") == 0 || + matches(*argv, "map-to") == 0) { + NEXT_ARG(); + addattr32(&req.n, sizeof(req), RTA_GATEWAY, get_addr32(*argv)); + req.r.rtm_type = RTN_NAT; + } else { + int type; + + if (strcmp(*argv, "type") == 0) { + NEXT_ARG(); + } + if (matches(*argv, "help") == 0) + usage(); + if (rtnl_rtntype_a2n(&type, *argv)) + invarg("Failed to parse rule type", *argv); + req.r.rtm_type = type; + } + argc--; + argv++; + } + + if (req.r.rtm_family == AF_UNSPEC) + req.r.rtm_family = AF_INET; + + if (!table_ok && cmd == RTM_NEWRULE) + req.r.rtm_table = RT_TABLE_MAIN; + + if (rtnl_open(&rth, 0) < 0) + return 1; + + if (rtnl_talk(&rth, &req.n, 0, 0, NULL, NULL, NULL) < 0) + return 2; + + return 0; +} + +int do_iprule(int argc, char **argv) 
+{ + if (argc < 1) { + return iprule_list(0, NULL); + } else if (matches(argv[0], "list") == 0 || + matches(argv[0], "lst") == 0 || + matches(argv[0], "show") == 0) { + return iprule_list(argc-1, argv+1); + } else if (matches(argv[0], "add") == 0) { + return iprule_modify(RTM_NEWRULE, argc-1, argv+1); + } else if (matches(argv[0], "delete") == 0) { + return iprule_modify(RTM_DELRULE, argc-1, argv+1); + } else if (matches(argv[0], "help") == 0) + usage(); + + fprintf(stderr, "Command \"%s\" is unknown, try \"ip rule help\".\n", *argv); + exit(-1); +} + diff --git a/ip/iptunnel.c b/ip/iptunnel.c index e69de29b..41c262b5 100644 --- a/ip/iptunnel.c +++ b/ip/iptunnel.c @@ -0,0 +1,581 @@ +/* + * iptunnel.c "ip tunnel" + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * + * Changes: + * + * Rani Assaf <rani@magic.metawire.com> 980929: resolve addresses + * Rani Assaf <rani@magic.metawire.com> 980930: do not allow key for ipip/sit + * Phil Karn <karn@ka9q.ampr.org> 990408: "pmtudisc" flag + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <sys/ioctl.h> +#include <linux/if.h> +#include <linux/if_arp.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <arpa/inet.h> +#include <linux/if_tunnel.h> + +#include "rt_names.h" +#include "utils.h" + +static void usage(void) __attribute__((noreturn)); + +static void usage(void) +{ + fprintf(stderr, "Usage: ip tunnel { add | change | del | show } [ NAME ]\n"); + fprintf(stderr, " [ mode { ipip | gre | sit } ] [ remote ADDR ] [ local ADDR ]\n"); + fprintf(stderr, " [ [i|o]seq ] [ [i|o]key KEY ] [ [i|o]csum ]\n"); + fprintf(stderr, " [ ttl TTL ] 
[ tos TOS ] [ [no]pmtudisc ] [ dev PHYS_DEV ]\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Where: NAME := STRING\n"); + fprintf(stderr, " ADDR := { IP_ADDRESS | any }\n"); + fprintf(stderr, " TOS := { NUMBER | inherit }\n"); + fprintf(stderr, " TTL := { 1..255 | inherit }\n"); + fprintf(stderr, " KEY := { DOTTED_QUAD | NUMBER }\n"); + exit(-1); +} + +static int do_ioctl_get_ifindex(char *dev) +{ + struct ifreq ifr; + int fd; + int err; + + strcpy(ifr.ifr_name, dev); + fd = socket(AF_INET, SOCK_DGRAM, 0); + err = ioctl(fd, SIOCGIFINDEX, &ifr); + if (err) + perror("ioctl"); + close(fd); /* release the socket on the error path too */ + if (err) + return 0; + return ifr.ifr_ifindex; +} + +static int do_ioctl_get_iftype(char *dev) +{ + struct ifreq ifr; + int fd; + int err; + + strcpy(ifr.ifr_name, dev); + fd = socket(AF_INET, SOCK_DGRAM, 0); + err = ioctl(fd, SIOCGIFHWADDR, &ifr); + if (err) + perror("ioctl"); + close(fd); /* release the socket on the error path too */ + if (err) + return -1; + return ifr.ifr_addr.sa_family; +} + + +static char * do_ioctl_get_ifname(int idx) +{ + static struct ifreq ifr; + int fd; + int err; + + ifr.ifr_ifindex = idx; + fd = socket(AF_INET, SOCK_DGRAM, 0); + err = ioctl(fd, SIOCGIFNAME, &ifr); + if (err) + perror("ioctl"); + close(fd); /* release the socket on the error path too */ + if (err) + return NULL; + return ifr.ifr_name; +} + + + +static int do_get_ioctl(char *basedev, struct ip_tunnel_parm *p) +{ + struct ifreq ifr; + int fd; + int err; + + snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "%s", basedev); /* bounded copy: basedev can be an unchecked command-line name */ + ifr.ifr_ifru.ifru_data = (void*)p; + fd = socket(AF_INET, SOCK_DGRAM, 0); + err = ioctl(fd, SIOCGETTUNNEL, &ifr); + if (err) + perror("ioctl"); + close(fd); + return err; +} + +static int do_add_ioctl(int cmd, char *basedev, struct ip_tunnel_parm *p) +{ + struct ifreq ifr; + int fd; + int err; + + if (cmd == SIOCCHGTUNNEL && p->name[0]) + strcpy(ifr.ifr_name, p->name); + else + strcpy(ifr.ifr_name, basedev); + ifr.ifr_ifru.ifru_data = (void*)p; + fd = socket(AF_INET, SOCK_DGRAM, 0); + err = ioctl(fd, cmd, &ifr); + if (err) + perror("ioctl"); + close(fd); + return err; +} + +static int do_del_ioctl(char 
*basedev, struct ip_tunnel_parm *p) +{ + struct ifreq ifr; + int fd; + int err; + + if (p->name[0]) + strcpy(ifr.ifr_name, p->name); + else + strcpy(ifr.ifr_name, basedev); + ifr.ifr_ifru.ifru_data = (void*)p; + fd = socket(AF_INET, SOCK_DGRAM, 0); + err = ioctl(fd, SIOCDELTUNNEL, &ifr); + if (err) + perror("ioctl"); + close(fd); + return err; +} + +static int parse_args(int argc, char **argv, int cmd, struct ip_tunnel_parm *p) +{ + int count = 0; + char medium[IFNAMSIZ]; + + memset(p, 0, sizeof(*p)); + memset(&medium, 0, sizeof(medium)); + + p->iph.version = 4; + p->iph.ihl = 5; +#ifndef IP_DF +#define IP_DF 0x4000 /* Flag: "Don't Fragment" */ +#endif + p->iph.frag_off = htons(IP_DF); + + while (argc > 0) { + if (strcmp(*argv, "mode") == 0) { + NEXT_ARG(); + if (strcmp(*argv, "ipip") == 0 || + strcmp(*argv, "ip/ip") == 0) { + if (p->iph.protocol && p->iph.protocol != IPPROTO_IPIP) { + fprintf(stderr,"You managed to ask for more than one tunnel mode.\n"); + exit(-1); + } + p->iph.protocol = IPPROTO_IPIP; + } else if (strcmp(*argv, "gre") == 0 || + strcmp(*argv, "gre/ip") == 0) { + if (p->iph.protocol && p->iph.protocol != IPPROTO_GRE) { + fprintf(stderr,"You managed to ask for more than one tunnel mode.\n"); + exit(-1); + } + p->iph.protocol = IPPROTO_GRE; + } else if (strcmp(*argv, "sit") == 0 || + strcmp(*argv, "ipv6/ip") == 0) { + if (p->iph.protocol && p->iph.protocol != IPPROTO_IPV6) { + fprintf(stderr,"You managed to ask for more than one tunnel mode.\n"); + exit(-1); + } + p->iph.protocol = IPPROTO_IPV6; + } else { + fprintf(stderr,"Cannot guess tunnel mode.\n"); + exit(-1); + } + } else if (strcmp(*argv, "key") == 0) { + unsigned uval; + NEXT_ARG(); + p->i_flags |= GRE_KEY; + p->o_flags |= GRE_KEY; + if (strchr(*argv, '.')) + p->i_key = p->o_key = get_addr32(*argv); + else { + if (get_unsigned(&uval, *argv, 0)<0) { + fprintf(stderr, "invalid value of \"key\"\n"); + exit(-1); + } + p->i_key = p->o_key = htonl(uval); + } + } else if (strcmp(*argv, "ikey") == 
0) { + unsigned uval; + NEXT_ARG(); + p->i_flags |= GRE_KEY; + if (strchr(*argv, '.')) + p->i_key = get_addr32(*argv); /* ikey sets the input key, like the numeric branch below */ + else { + if (get_unsigned(&uval, *argv, 0)<0) { + fprintf(stderr, "invalid value of \"ikey\"\n"); + exit(-1); + } + p->i_key = htonl(uval); + } + } else if (strcmp(*argv, "okey") == 0) { + unsigned uval; + NEXT_ARG(); + p->o_flags |= GRE_KEY; + if (strchr(*argv, '.')) + p->o_key = get_addr32(*argv); + else { + if (get_unsigned(&uval, *argv, 0)<0) { + fprintf(stderr, "invalid value of \"okey\"\n"); + exit(-1); + } + p->o_key = htonl(uval); + } + } else if (strcmp(*argv, "seq") == 0) { + p->i_flags |= GRE_SEQ; + p->o_flags |= GRE_SEQ; + } else if (strcmp(*argv, "iseq") == 0) { + p->i_flags |= GRE_SEQ; + } else if (strcmp(*argv, "oseq") == 0) { + p->o_flags |= GRE_SEQ; + } else if (strcmp(*argv, "csum") == 0) { + p->i_flags |= GRE_CSUM; + p->o_flags |= GRE_CSUM; + } else if (strcmp(*argv, "icsum") == 0) { + p->i_flags |= GRE_CSUM; + } else if (strcmp(*argv, "ocsum") == 0) { + p->o_flags |= GRE_CSUM; + } else if (strcmp(*argv, "nopmtudisc") == 0) { + p->iph.frag_off = 0; + } else if (strcmp(*argv, "pmtudisc") == 0) { + p->iph.frag_off = htons(IP_DF); + } else if (strcmp(*argv, "remote") == 0) { + NEXT_ARG(); + if (strcmp(*argv, "any")) + p->iph.daddr = get_addr32(*argv); + } else if (strcmp(*argv, "local") == 0) { + NEXT_ARG(); + if (strcmp(*argv, "any")) + p->iph.saddr = get_addr32(*argv); + } else if (strcmp(*argv, "dev") == 0) { + NEXT_ARG(); + strncpy(medium, *argv, IFNAMSIZ-1); + } else if (strcmp(*argv, "ttl") == 0) { + unsigned uval; + NEXT_ARG(); + if (strcmp(*argv, "inherit") != 0) { + if (get_unsigned(&uval, *argv, 0)) + invarg("invalid TTL\n", *argv); + if (uval > 255) + invarg("TTL must be <=255\n", *argv); + p->iph.ttl = uval; + } + } else if (strcmp(*argv, "tos") == 0 || + matches(*argv, "dsfield") == 0) { + __u32 uval; + NEXT_ARG(); + if (strcmp(*argv, "inherit") != 0) { + if (rtnl_dsfield_a2n(&uval, *argv)) + invarg("bad TOS value", 
*argv); + p->iph.tos = uval; + } else + p->iph.tos = 1; + } else { + if (strcmp(*argv, "name") == 0) { + NEXT_ARG(); + } + if (matches(*argv, "help") == 0) + usage(); + if (p->name[0]) + duparg2("name", *argv); + strncpy(p->name, *argv, IFNAMSIZ-1); /* leave room for the terminating NUL */ + if (cmd == SIOCCHGTUNNEL && count == 0) { + struct ip_tunnel_parm old_p; + memset(&old_p, 0, sizeof(old_p)); + if (do_get_ioctl(*argv, &old_p)) + return -1; + *p = old_p; + } + } + count++; + argc--; argv++; + } + + + if (p->iph.protocol == 0) { + if (memcmp(p->name, "gre", 3) == 0) + p->iph.protocol = IPPROTO_GRE; + else if (memcmp(p->name, "ipip", 4) == 0) + p->iph.protocol = IPPROTO_IPIP; + else if (memcmp(p->name, "sit", 3) == 0) + p->iph.protocol = IPPROTO_IPV6; + } + + if (p->iph.protocol == IPPROTO_IPIP || p->iph.protocol == IPPROTO_IPV6) { + if ((p->i_flags & GRE_KEY) || (p->o_flags & GRE_KEY)) { + fprintf(stderr, "Keys are not allowed with ipip and sit.\n"); + return -1; + } + } + + if (medium[0]) { + p->link = do_ioctl_get_ifindex(medium); + if (p->link == 0) + return -1; + } + + if (p->i_key == 0 && IN_MULTICAST(ntohl(p->iph.daddr))) { + p->i_key = p->iph.daddr; + p->i_flags |= GRE_KEY; + } + if (p->o_key == 0 && IN_MULTICAST(ntohl(p->iph.daddr))) { + p->o_key = p->iph.daddr; + p->o_flags |= GRE_KEY; + } + if (IN_MULTICAST(ntohl(p->iph.daddr)) && !p->iph.saddr) { + fprintf(stderr, "Broadcast tunnel requires a source address.\n"); + return -1; + } + return 0; +} + + +static int do_add(int cmd, int argc, char **argv) +{ + struct ip_tunnel_parm p; + + if (parse_args(argc, argv, cmd, &p) < 0) + return -1; + + if (p.iph.ttl && p.iph.frag_off == 0) { + fprintf(stderr, "ttl != 0 and nopmtudisc are incompatible\n"); /* spell the keyword as parse_args accepts it */ + return -1; + } + + switch (p.iph.protocol) { + case IPPROTO_IPIP: + return do_add_ioctl(cmd, "tunl0", &p); + case IPPROTO_GRE: + return do_add_ioctl(cmd, "gre0", &p); + case IPPROTO_IPV6: + return do_add_ioctl(cmd, "sit0", &p); + default: + fprintf(stderr, "cannot determine tunnel mode (ipip, gre or 
sit)\n"); + return -1; + } + return -1; +} + +int do_del(int argc, char **argv) +{ + struct ip_tunnel_parm p; + + if (parse_args(argc, argv, SIOCDELTUNNEL, &p) < 0) + return -1; + + switch (p.iph.protocol) { + case IPPROTO_IPIP: + return do_del_ioctl("tunl0", &p); + case IPPROTO_GRE: + return do_del_ioctl("gre0", &p); + case IPPROTO_IPV6: + return do_del_ioctl("sit0", &p); + default: + return do_del_ioctl(p.name, &p); + } + return -1; +} + +void print_tunnel(struct ip_tunnel_parm *p) +{ + char s1[1024]; + char s2[1024]; + char s3[64]; + char s4[64]; + + inet_ntop(AF_INET, &p->i_key, s3, sizeof(s3)); + inet_ntop(AF_INET, &p->o_key, s4, sizeof(s4)); + + /* Do not use format_host() for local addr, + * symbolic name will not be useful. + */ + printf("%s: %s/ip remote %s local %s ", + p->name, + p->iph.protocol == IPPROTO_IPIP ? "ip" : + (p->iph.protocol == IPPROTO_GRE ? "gre" : + (p->iph.protocol == IPPROTO_IPV6 ? "ipv6" : "unknown")), + p->iph.daddr ? format_host(AF_INET, 4, &p->iph.daddr, s1, sizeof(s1)) : "any", + p->iph.saddr ? rt_addr_n2a(AF_INET, 4, &p->iph.saddr, s2, sizeof(s2)) : "any"); + + if (p->link) { + char *n = do_ioctl_get_ifname(p->link); + if (n) + printf(" dev %s ", n); + } + + if (p->iph.ttl) + printf(" ttl %d ", p->iph.ttl); + else + printf(" ttl inherit "); + + if (p->iph.tos) { + SPRINT_BUF(b1); + printf(" tos"); + if (p->iph.tos&1) + printf(" inherit"); + if (p->iph.tos&~1) + printf("%c%s ", p->iph.tos&1 ? 
'/' : ' ', + rtnl_dsfield_n2a(p->iph.tos&~1, b1, sizeof(b1))); + } + + if (!(p->iph.frag_off&htons(IP_DF))) + printf(" nopmtudisc"); + + if ((p->i_flags&GRE_KEY) && (p->o_flags&GRE_KEY) && p->o_key == p->i_key) + printf(" key %s", s3); + else if ((p->i_flags|p->o_flags)&GRE_KEY) { + if (p->i_flags&GRE_KEY) + printf(" ikey %s ", s3); + if (p->o_flags&GRE_KEY) + printf(" okey %s ", s4); + } + + if (p->i_flags&GRE_SEQ) + printf("%s Drop packets out of sequence.", _SL_); /* no trailing newline, same as the sibling lines */ + if (p->i_flags&GRE_CSUM) + printf("%s Checksum in received packet is required.", _SL_); + if (p->o_flags&GRE_SEQ) + printf("%s Sequence packets on output.", _SL_); + if (p->o_flags&GRE_CSUM) + printf("%s Checksum output packets.", _SL_); +} + +static int do_tunnels_list(struct ip_tunnel_parm *p) +{ + char name[IFNAMSIZ]; + unsigned long rx_bytes, rx_packets, rx_errs, rx_drops, + rx_fifo, rx_frame, + tx_bytes, tx_packets, tx_errs, tx_drops, + tx_fifo, tx_colls, tx_carrier, rx_multi; + int type; + struct ip_tunnel_parm p1; + + char buf[512]; + FILE *fp = fopen("/proc/net/dev", "r"); + if (fp == NULL) { + perror("fopen"); + return -1; + } + + fgets(buf, sizeof(buf), fp); + fgets(buf, sizeof(buf), fp); + + while (fgets(buf, sizeof(buf), fp) != NULL) { + char *ptr; + buf[sizeof(buf) - 1] = 0; + if ((ptr = strchr(buf, ':')) == NULL || + (*ptr++ = 0, sscanf(buf, "%s", name) != 1)) { + fprintf(stderr, "Wrong format of /proc/net/dev. 
Sorry.\n"); + return -1; + } + if (sscanf(ptr, "%ld%ld%ld%ld%ld%ld%ld%*d%ld%ld%ld%ld%ld%ld%ld", + &rx_bytes, &rx_packets, &rx_errs, &rx_drops, + &rx_fifo, &rx_frame, &rx_multi, + &tx_bytes, &tx_packets, &tx_errs, &tx_drops, + &tx_fifo, &tx_colls, &tx_carrier) != 14) + continue; + if (p->name[0] && strcmp(p->name, name)) + continue; + type = do_ioctl_get_iftype(name); + if (type == -1) { + fprintf(stderr, "Failed to get type of [%s]\n", name); + continue; + } + if (type != ARPHRD_TUNNEL && type != ARPHRD_IPGRE && type != ARPHRD_SIT) + continue; + memset(&p1, 0, sizeof(p1)); + if (do_get_ioctl(name, &p1)) + continue; + if ((p->link && p1.link != p->link) || + (p->name[0] && strcmp(p1.name, p->name)) || + (p->iph.daddr && p1.iph.daddr != p->iph.daddr) || + (p->iph.saddr && p1.iph.saddr != p->iph.saddr) || + (p->i_key && p1.i_key != p->i_key)) + continue; + print_tunnel(&p1); + if (show_stats) { + printf("%s", _SL_); + printf("RX: Packets Bytes Errors CsumErrs OutOfSeq Mcasts%s", _SL_); + printf(" %-10ld %-12ld %-6ld %-8ld %-8ld %-8ld%s", + rx_packets, rx_bytes, rx_errs, rx_frame, rx_fifo, rx_multi, _SL_); + printf("TX: Packets Bytes Errors DeadLoop NoRoute NoBufs%s", _SL_); + printf(" %-10ld %-12ld %-6ld %-8ld %-8ld %-6ld", + tx_packets, tx_bytes, tx_errs, tx_colls, tx_carrier, tx_drops); + } + printf("\n"); + } + return 0; +} + +static int do_show(int argc, char **argv) +{ + int err; + struct ip_tunnel_parm p; + + if (parse_args(argc, argv, SIOCGETTUNNEL, &p) < 0) + return -1; + + switch (p.iph.protocol) { + case IPPROTO_IPIP: + err = do_get_ioctl(p.name[0] ? p.name : "tunl0", &p); + break; + case IPPROTO_GRE: + err = do_get_ioctl(p.name[0] ? p.name : "gre0", &p); + break; + case IPPROTO_IPV6: + err = do_get_ioctl(p.name[0] ? 
p.name : "sit0", &p); + break; + default: + do_tunnels_list(&p); + return 0; + } + if (err) + return -1; + + print_tunnel(&p); + printf("\n"); + return 0; +} + +int do_iptunnel(int argc, char **argv) +{ + if (argc > 0) { + if (matches(*argv, "add") == 0) + return do_add(SIOCADDTUNNEL, argc-1, argv+1); + if (matches(*argv, "change") == 0) + return do_add(SIOCCHGTUNNEL, argc-1, argv+1); + if (matches(*argv, "del") == 0) + return do_del(argc-1, argv+1); + if (matches(*argv, "show") == 0 || + matches(*argv, "lst") == 0 || + matches(*argv, "list") == 0) + return do_show(argc-1, argv+1); + if (matches(*argv, "help") == 0) + usage(); + } else + return do_show(0, NULL); + + fprintf(stderr, "Command \"%s\" is unknown, try \"ip tunnel help\".\n", *argv); + exit(-1); +} diff --git a/ip/routef b/ip/routef index e69de29b..db43b5df 100644 --- a/ip/routef +++ b/ip/routef @@ -0,0 +1,3 @@ +#! /bin/sh + +exec ip -4 ro flush scope global type unicast diff --git a/ip/routel b/ip/routel index e69de29b..8d1d352a 100644 --- a/ip/routel +++ b/ip/routel @@ -0,0 +1,60 @@ +#!/bin/sh +#$Id$ + +# +# Script created by: Stephen R. van den Berg <srb@cuci.nl>, 1999/04/18 +# Donated to the public domain. +# +# This script transforms the output of "ip" into more readable text. +# "ip" is the Linux-advanced-routing configuration tool part of the +# iproute package. 
+# + +test "X-h" = "X$1" && echo "Usage: $0 [tablenr [raw ip args...]]" && exit 64 + +test -z "$*" && set 0 + +ip route list table "$@" | + while read network rest + do set xx $rest + shift + proto="" + via="" + dev="" + scope="" + src="" + table="" + case $network in + broadcast|local|unreachable) via=$network + network=$1 + shift + ;; + esac + while test $# != 0 + do + key=$1 + val=$2 + eval "$key=$val" + shift 2 + done + echo "$network $via $src $proto $scope $dev $table" + done | awk -F ' ' ' +BEGIN { + format="%15s%-3s %15s %15s %8s %8s%7s %s\n"; + printf(format,"target","","gateway","source","proto","scope","dev","tbl"); + } + { network=$1; + mask=""; + if(match(network,"/")) + { mask=" "substr(network,RSTART+1); + network=substr(network,0,RSTART); + } + via=$2; + src=$3; + proto=$4; + scope=$5; + dev=$6; + table=$7; + printf(format,network,mask,via,src,proto,scope,dev,table); + } +' diff --git a/ip/rtm_map.c b/ip/rtm_map.c index e69de29b..21e818b4 100644 --- a/ip/rtm_map.c +++ b/ip/rtm_map.c @@ -0,0 +1,116 @@ +/* + * rtm_map.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <string.h> +#include <sys/socket.h> +#include <netinet/in.h> + +#include "rt_names.h" +#include "utils.h" + +char *rtnl_rtntype_n2a(int id, char *buf, int len) +{ + switch (id) { + case RTN_UNSPEC: + return "none"; + case RTN_UNICAST: + return "unicast"; + case RTN_LOCAL: + return "local"; + case RTN_BROADCAST: + return "broadcast"; + case RTN_ANYCAST: + return "anycast"; + case RTN_MULTICAST: + return "multicast"; + case RTN_BLACKHOLE: + return "blackhole"; + case RTN_UNREACHABLE: + return "unreachable"; + case RTN_PROHIBIT: + return "prohibit"; + case RTN_THROW: + return "throw"; + case RTN_NAT: + return "nat"; + case RTN_XRESOLVE: + return "xresolve"; + default: + snprintf(buf, len, "%d", id); + return buf; + } +} + + +int rtnl_rtntype_a2n(int *id, char *arg) +{ + char *end; + unsigned long res; + + if (strcmp(arg, "local") == 0) + res = RTN_LOCAL; + else if (strcmp(arg, "nat") == 0) + res = RTN_NAT; + else if (matches(arg, "broadcast") == 0 || + strcmp(arg, "brd") == 0) + res = RTN_BROADCAST; + else if (matches(arg, "anycast") == 0) + res = RTN_ANYCAST; + else if (matches(arg, "multicast") == 0) + res = RTN_MULTICAST; + else if (matches(arg, "prohibit") == 0) + res = RTN_PROHIBIT; + else if (matches(arg, "unreachable") == 0) + res = RTN_UNREACHABLE; + else if (matches(arg, "blackhole") == 0) + res = RTN_BLACKHOLE; + else if (matches(arg, "xresolve") == 0) + res = RTN_XRESOLVE; + else if (matches(arg, "unicast") == 0) + res = RTN_UNICAST; + else if (strcmp(arg, "throw") == 0) + res = RTN_THROW; + else { + res = strtoul(arg, &end, 0); + if (!end || end == arg || *end || res > 255) + return -1; + } + *id = res; + return 0; +} + +int get_rt_realms(__u32 *realms, char *arg) +{ + __u32 realm = 0; + char *p = strchr(arg, '/'); + + *realms = 0; + if (p) { + *p = 0; + if 
(rtnl_rtrealm_a2n(realms, arg)) { + *p = '/'; + return -1; + } + *realms <<= 16; + *p = '/'; + arg = p+1; + } + if (*arg && rtnl_rtrealm_a2n(&realm, arg)) + return -1; + *realms |= realm; + return 0; +} diff --git a/ip/rtmon.c b/ip/rtmon.c index e69de29b..d01bc635 100644 --- a/ip/rtmon.c +++ b/ip/rtmon.c @@ -0,0 +1,177 @@ +/* + * rtmon.c RTnetlink listener. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <net/if.h> +#include <netinet/in.h> +#include <string.h> + +#include "SNAPSHOT.h" + +#include "utils.h" +#include "libnetlink.h" + +int resolve_hosts = 0; +static int init_phase = 1; + +static void write_stamp(FILE *fp) +{ + char buf[128]; + struct nlmsghdr *n1 = (void*)buf; + struct timeval tv; + + n1->nlmsg_type = 15; + n1->nlmsg_flags = 0; + n1->nlmsg_seq = 0; + n1->nlmsg_pid = 0; + n1->nlmsg_len = NLMSG_LENGTH(4*2); + gettimeofday(&tv, NULL); + ((__u32*)NLMSG_DATA(n1))[0] = tv.tv_sec; + ((__u32*)NLMSG_DATA(n1))[1] = tv.tv_usec; + fwrite((void*)n1, 1, NLMSG_ALIGN(n1->nlmsg_len), fp); +} + +static int dump_msg(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) +{ + FILE *fp = (FILE*)arg; + if (!init_phase) + write_stamp(fp); + fwrite((void*)n, 1, NLMSG_ALIGN(n->nlmsg_len), fp); + fflush(fp); + return 0; +} + +void usage(void) +{ + fprintf(stderr, "Usage: rtmon file FILE [ all | LISTofOBJECTS]\n"); + fprintf(stderr, "LISTofOBJECTS := [ link ] [ address ] [ route ]\n"); + exit(-1); +} + +int +main(int argc, char **argv) +{ + FILE *fp; + struct rtnl_handle rth; + int family = AF_UNSPEC; + unsigned groups = ~0U; + int llink = 0; 
+	int laddr = 0;
+	int lroute = 0;
+	char *file = NULL;
+
+	/* Options and object keywords may appear in any order. */
+	while (argc > 1) {
+		if (matches(argv[1], "-family") == 0) {
+			argc--;
+			argv++;
+			if (argc <= 1)
+				usage();
+			if (strcmp(argv[1], "inet") == 0)
+				family = AF_INET;
+			else if (strcmp(argv[1], "inet6") == 0)
+				family = AF_INET6;
+			else if (strcmp(argv[1], "link") == 0)
+				/* Same family as the "-0" shortcut below. */
+				family = AF_PACKET;
+			else if (strcmp(argv[1], "help") == 0)
+				usage();
+			else {
+				fprintf(stderr, "Protocol ID \"%s\" is unknown, try \"rtmon help\".\n", argv[1]);
+				exit(-1);
+			}
+		} else if (strcmp(argv[1], "-4") == 0) {
+			family = AF_INET;
+		} else if (strcmp(argv[1], "-6") == 0) {
+			family = AF_INET6;
+		} else if (strcmp(argv[1], "-0") == 0) {
+			family = AF_PACKET;
+		} else if (matches(argv[1], "-Version") == 0) {
+			printf("rtmon utility, iproute2-ss%s\n", SNAPSHOT);
+			exit(0);
+		} else if (matches(argv[1], "file") == 0) {
+			argc--;
+			argv++;
+			if (argc <= 1)
+				usage();
+			file = argv[1];
+		} else if (matches(argv[1], "link") == 0) {
+			/* Naming an object drops the default "all groups" mask. */
+			llink=1;
+			groups = 0;
+		} else if (matches(argv[1], "address") == 0) {
+			laddr=1;
+			groups = 0;
+		} else if (matches(argv[1], "route") == 0) {
+			lroute=1;
+			groups = 0;
+		} else if (strcmp(argv[1], "all") == 0) {
+			groups = ~0U;
+		} else if (matches(argv[1], "help") == 0) {
+			usage();
+		} else {
+			fprintf(stderr, "Argument \"%s\" is unknown, try \"rtmon help\".\n", argv[1]);
+			exit(-1);
+		}
+		argc--;	argv++;
+	}
+
+	if (file == NULL) {
+		fprintf(stderr, "Not enough information: argument \"file\" is required\n");
+		exit(-1);
+	}
+
+	/* Build the multicast group mask from the requested objects. */
+	if (llink)
+		groups |= RTMGRP_LINK;
+	if (laddr) {
+		if (!family || family == AF_INET)
+			groups |= RTMGRP_IPV4_IFADDR;
+		if (!family || family == AF_INET6)
+			groups |= RTMGRP_IPV6_IFADDR;
+	}
+	if (lroute) {
+		if (!family || family == AF_INET)
+			groups |= RTMGRP_IPV4_ROUTE;
+		if (!family || family == AF_INET6)
+			groups |= RTMGRP_IPV6_ROUTE;
+	}
+
+	fp = fopen(file, "w");
+	if (fp == NULL) {
+		perror("Cannot fopen");
+		exit(-1);
+	}
+
+	if (rtnl_open(&rth, groups) < 0)
+		exit(1);
+ + if (rtnl_wilddump_request(&rth, AF_UNSPEC, RTM_GETLINK) < 0) { + perror("Cannot send dump request"); + exit(1); + } + + write_stamp(fp); + + if (rtnl_dump_filter(&rth, dump_msg, fp, NULL, NULL) < 0) { + fprintf(stderr, "Dump terminated\n"); + return 1; + } + + init_phase = 0; + + if (rtnl_listen(&rth, dump_msg, (void*)fp) < 0) + exit(2); + + exit(0); +} diff --git a/ip/rtpr b/ip/rtpr index e69de29b..c3629fd6 100644 --- a/ip/rtpr +++ b/ip/rtpr @@ -0,0 +1,4 @@ +#! /bin/bash + +exec tr "[\\\\]" "[ +]" diff --git a/lib/Makefile b/lib/Makefile index e69de29b..bc270bff 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -0,0 +1,18 @@ + +UTILOBJ=utils.o rt_names.o ll_types.o ll_proto.o ll_addr.o inet_proto.o + +NLOBJ=ll_map.o libnetlink.o + +all: libnetlink.a libutil.a + +libnetlink.a: $(NLOBJ) + $(AR) rcs $@ $(NLOBJ) + +libutil.a: $(UTILOBJ) $(ADDLIB) + $(AR) rcs $@ $(UTILOBJ) $(ADDLIB) + +install: + +clean: + rm -f $(NLOBJ) $(UTILOBJ) $(ADDLIB) libnetlink.a libutil.a + diff --git a/lib/dnet_ntop.c b/lib/dnet_ntop.c index e69de29b..9500df86 100644 --- a/lib/dnet_ntop.c +++ b/lib/dnet_ntop.c @@ -0,0 +1,98 @@ +#include <errno.h> +#include <sys/types.h> +#include <netinet/in.h> + +#include "utils.h" + +static __inline__ u_int16_t dn_ntohs(u_int16_t addr) +{ + union { + u_int8_t byte[2]; + u_int16_t word; + } u; + + u.word = addr; + return ((u_int16_t)u.byte[0]) | (((u_int16_t)u.byte[1]) << 8); +} + +static __inline__ int do_digit(char *str, u_int16_t *addr, u_int16_t scale, size_t *pos, size_t len, int *started) +{ + u_int16_t tmp = *addr / scale; + + if (*pos == len) + return 1; + + if (((tmp) > 0) || *started || (scale == 1)) { + *str = tmp + '0'; + *started = 1; + (*pos)++; + *addr -= (tmp * scale); + } + + return 0; +} + + +static const char *dnet_ntop1(const struct dn_naddr *dna, char *str, size_t len) +{ + u_int16_t addr = dn_ntohs(*(u_int16_t *)dna->a_addr); + u_int16_t area = addr >> 10; + size_t pos = 0; + int started = 0; + + if (dna->a_len != 2) + return NULL; + 
+ addr &= 0x03ff; + + if (len == 0) + return str; + + if (do_digit(str + pos, &area, 10, &pos, len, &started)) + return str; + + if (do_digit(str + pos, &area, 1, &pos, len, &started)) + return str; + + if (pos == len) + return str; + + *(str + pos) = '.'; + pos++; + started = 0; + + if (do_digit(str + pos, &addr, 1000, &pos, len, &started)) + return str; + + if (do_digit(str + pos, &addr, 100, &pos, len, &started)) + return str; + + if (do_digit(str + pos, &addr, 10, &pos, len, &started)) + return str; + + if (do_digit(str + pos, &addr, 1, &pos, len, &started)) + return str; + + if (pos == len) + return str; + + *(str + pos) = 0; + + return str; +} + + +const char *dnet_ntop(int af, const void *addr, char *str, size_t len) +{ + switch(af) { + case AF_DECnet: + errno = 0; + return dnet_ntop1((struct dn_naddr *)addr, str, len); + default: + errno = EAFNOSUPPORT; + } + + return NULL; +} + + diff --git a/lib/dnet_pton.c b/lib/dnet_pton.c index e69de29b..bd7727ae 100644 --- a/lib/dnet_pton.c +++ b/lib/dnet_pton.c @@ -0,0 +1,71 @@ +#include <errno.h> +#include <sys/types.h> +#include <netinet/in.h> + +#include "utils.h" + +static __inline__ u_int16_t dn_htons(u_int16_t addr) +{ + union { + u_int8_t byte[2]; + u_int16_t word; + } u; + + u.word = addr; + return ((u_int16_t)u.byte[0]) | (((u_int16_t)u.byte[1]) << 8); +} + + +static int dnet_num(const char *src, u_int16_t * dst) +{ + int rv = 0; + int tmp; + *dst = 0; + + while ((tmp = *src++) != 0) { + tmp -= '0'; + if ((tmp < 0) || (tmp > 9)) + return rv; + + rv++; + (*dst) *= 10; + (*dst) += tmp; + } + + return rv; +} + +static int dnet_pton1(const char *src, struct dn_naddr *dna) +{ + u_int16_t area = 0; + u_int16_t node = 0; + int pos; + + pos = dnet_num(src, &area); + if ((pos == 0) || (area > 63) || (*(src + pos) != '.')) + return 0; + pos = dnet_num(src + pos + 1, &node); + if ((pos == 0) || (node > 1023)) + return 0; + dna->a_len = 2; + *(u_int16_t *)dna->a_addr = dn_htons((area << 10) | node); + + return 1; +} + 
+int dnet_pton(int af, const char *src, void *addr) +{ + int err; + + switch (af) { + case AF_DECnet: + errno = 0; + err = dnet_pton1(src, (struct dn_naddr *)addr); + break; + default: + errno = EAFNOSUPPORT; + err = -1; + } + + return err; +} diff --git a/lib/inet_ntop.c b/lib/inet_ntop.c index e69de29b..a3722d67 100644 --- a/lib/inet_ntop.c +++ b/lib/inet_ntop.c @@ -0,0 +1,199 @@ +/* Copyright (c) 1996 by Internet Software Consortium. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM DISCLAIMS + * ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL INTERNET SOFTWARE + * CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL + * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR + * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char rcsid[] = "$Id: inet_ntop.c,v 1.4 1996/09/27 03:24:13 drepper Exp $"; +#endif /* LIBC_SCCS and not lint */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <arpa/nameser.h> +#include <string.h> +#include <errno.h> +#include <stdio.h> + +#include <linux/in6.h> +#ifndef IN6ADDRSZ +#define IN6ADDRSZ sizeof(struct in6_addr) +#endif + +#ifdef SPRINTF_CHAR +# define SPRINTF(x) strlen(sprintf/**/x) +#else +# define SPRINTF(x) ((size_t)sprintf x) +#endif + +/* + * WARNING: Don't even consider trying to compile this on a system where + * sizeof(int) < 4. sizeof(int) > 4 is fine; all the world's not a VAX. 
+ */
+
+static const char *inet_ntop4 __P((const u_char *src, char *dst, size_t size));
+static const char *inet_ntop6 __P((const u_char *src, char *dst, size_t size));
+
+/* char *
+ * inet_ntop(af, src, dst, size)
+ *	convert a network format address to presentation format.
+ * return:
+ *	pointer to presentation format address (`dst'), or NULL (see errno).
+ * author:
+ *	Paul Vixie, 1996.
+ */
+const char *
+inet_ntop(af, src, dst, size)
+	int af;
+	const void *src;
+	char *dst;
+	size_t size;
+{
+	switch (af) {
+	case AF_INET:
+		return (inet_ntop4(src, dst, size));
+	case AF_INET6:
+		return (inet_ntop6(src, dst, size));
+	default:
+		errno = (EAFNOSUPPORT);
+		return (NULL);
+	}
+	/* NOTREACHED */
+}
+
+/* const char *
+ * inet_ntop4(src, dst, size)
+ *	format an IPv4 address, more or less like inet_ntoa()
+ * return:
+ *	`dst' (as a const), or NULL with errno set to ENOSPC when `size'
+ *	cannot hold the text plus its terminating NUL.
+ * notes:
+ *	(1) uses no statics
+ *	(2) takes a u_char* not an in_addr as input
+ * author:
+ *	Paul Vixie, 1996.
+ */
+static const char *
+inet_ntop4(src, dst, size)
+	const u_char *src;
+	char *dst;
+	size_t size;
+{
+	static const char fmt[] = "%u.%u.%u.%u";
+	char tmp[sizeof "255.255.255.255"];
+
+	/*
+	 * sprintf() returns the length without the NUL, and strcpy() below
+	 * writes length + 1 bytes, so a length equal to `size' must fail.
+	 */
+	if (SPRINTF((tmp, fmt, src[0], src[1], src[2], src[3])) >= size) {
+		errno = (ENOSPC);
+		return (NULL);
+	}
+	strcpy(dst, tmp);
+	return (dst);
+}
+
+/* const char *
+ * inet_ntop6(src, dst, size)
+ *	convert IPv6 binary address into presentation (printable) format
+ * author:
+ *	Paul Vixie, 1996.
+ */
+static const char *
+inet_ntop6(src, dst, size)
+	const u_char *src;
+	char *dst;
+	size_t size;
+{
+	/*
+	 * Note that int32_t and int16_t need only be "at least" large enough
+	 * to contain a value of the specified size. On some systems, like
+	 * Crays, there is no such thing as an integer variable with 16 bits.
+	 * Keep this in mind if you think this function should have been coded
+	 * to use pointer overlays. All the world's not a VAX.
+ */ + char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"], *tp; + struct { int base, len; } best, cur; + u_int words[sizeof(struct in6_addr) / INT16SZ]; + int i; + + /* + * Preprocess: + * Copy the input (bytewise) array into a wordwise array. + * Find the longest run of 0x00's in src[] for :: shorthanding. + */ + memset(words, '\0', sizeof words); + for (i = 0; i < IN6ADDRSZ; i++) + words[i / 2] |= (src[i] << ((1 - (i % 2)) << 3)); + best.base = -1; + cur.base = -1; + for (i = 0; i < (IN6ADDRSZ / INT16SZ); i++) { + if (words[i] == 0) { + if (cur.base == -1) + cur.base = i, cur.len = 1; + else + cur.len++; + } else { + if (cur.base != -1) { + if (best.base == -1 || cur.len > best.len) + best = cur; + cur.base = -1; + } + } + } + if (cur.base != -1) { + if (best.base == -1 || cur.len > best.len) + best = cur; + } + if (best.base != -1 && best.len < 2) + best.base = -1; + + /* + * Format the result. + */ + tp = tmp; + for (i = 0; i < (IN6ADDRSZ / INT16SZ); i++) { + /* Are we inside the best run of 0x00's? */ + if (best.base != -1 && i >= best.base && + i < (best.base + best.len)) { + if (i == best.base) + *tp++ = ':'; + continue; + } + /* Are we following an initial run of 0x00s or any real hex? */ + if (i != 0) + *tp++ = ':'; + /* Is this address an encapsulated IPv4? */ + if (i == 6 && best.base == 0 && + (best.len == 6 || (best.len == 5 && words[5] == 0xffff))) { + if (!inet_ntop4(src+12, tp, sizeof tmp - (tp - tmp))) + return (NULL); + tp += strlen(tp); + break; + } + tp += SPRINTF((tp, "%x", words[i])); + } + /* Was it a trailing run of 0x00's? */ + if (best.base != -1 && (best.base + best.len) == (IN6ADDRSZ / INT16SZ)) + *tp++ = ':'; + *tp++ = '\0'; + + /* + * Check for overflow, copy, and we're done. 
+	 */
+	if ((size_t)(tp - tmp) > size) {
+		errno = (ENOSPC);
+		return (NULL);
+	}
+	strcpy(dst, tmp);
+	return (dst);
+}
diff --git a/lib/inet_proto.c b/lib/inet_proto.c
index e69de29b..a55e0e7b 100644
--- a/lib/inet_proto.c
+++ b/lib/inet_proto.c
@@ -0,0 +1,70 @@
+/*
+ * inet_proto.c
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <string.h>
+
+#include "utils.h"
+
+/*
+ * Translate IP protocol number @proto to its name.
+ *
+ * The last successful lookup is kept in a one-entry static cache; on a
+ * cache hit the static copy is returned instead of @buf.  Otherwise the
+ * name from getprotobynumber(), or "ipproto-<n>" when the number is
+ * unknown, is written NUL-terminated into @buf (capacity @len) and @buf
+ * is returned.
+ */
+char *inet_proto_n2a(int proto, char *buf, int len)
+{
+	static char ncache[16];
+	static int icache = -1;
+	struct protoent *pe;
+
+	if (icache >= 0 && proto == icache)
+		return ncache;
+
+	pe = getprotobynumber(proto);
+	if (pe) {
+		/* Cache only names that fit, so ncache is always terminated. */
+		if (strlen(pe->p_name) < sizeof(ncache)) {
+			icache = proto;
+			strcpy(ncache, pe->p_name);
+		}
+		snprintf(buf, len, "%s", pe->p_name);
+		return buf;
+	}
+	snprintf(buf, len, "ipproto-%d", proto);
+	return buf;
+}
+
+/*
+ * Translate protocol name or decimal number @buf to a protocol number.
+ *
+ * Strings starting with a digit are parsed with get_u8(); anything else
+ * is looked up with getprotobyname(), with the last hit kept in a
+ * one-entry static cache.  Returns the number, or -1 on failure.
+ */
+int inet_proto_a2n(char *buf)
+{
+	static char ncache[16];
+	static int icache = -1;
+	struct protoent *pe;
+
+	if (icache>=0 && strcmp(ncache, buf) == 0)
+		return icache;
+
+	if (buf[0] >= '0' && buf[0] <= '9') {
+		__u8 ret;
+		if (get_u8(&ret, buf, 10))
+			return -1;
+		return ret;
+	}
+
+	pe = getprotobyname(buf);
+	if (pe) {
+		/* An unterminated ncache would make the strcmp() above overrun. */
+		if (strlen(pe->p_name) < sizeof(ncache)) {
+			icache = pe->p_proto;
+			strcpy(ncache, pe->p_name);
+		}
+		return pe->p_proto;
+	}
+	return -1;
+}
+
+
diff --git a/lib/inet_pton.c b/lib/inet_pton.c
index e69de29b..99508344 100644
--- a/lib/inet_pton.c
+++ b/lib/inet_pton.c
@@ -0,0 +1,217 @@
+/* Copyright (c) 1996 by Internet Software Consortium.
+ * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM DISCLAIMS + * ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL INTERNET SOFTWARE + * CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL + * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR + * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. + */ + +#if defined(LIBC_SCCS) && !defined(lint) +static char rcsid[] = "$Id: inet_pton.c,v 1.5 1996/09/27 03:24:16 drepper Exp $"; +#endif /* LIBC_SCCS and not lint */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <arpa/nameser.h> +#include <string.h> +#include <errno.h> + +#include <linux/in6.h> +#define IN6ADDRSZ sizeof(struct in6_addr) + +/* + * WARNING: Don't even consider trying to compile this on a system where + * sizeof(int) < 4. sizeof(int) > 4 is fine; all the world's not a VAX. + */ + +static int inet_pton4 __P((const char *src, u_char *dst)); +static int inet_pton6 __P((const char *src, u_char *dst)); + +/* int + * inet_pton(af, src, dst) + * convert from presentation format (which usually means ASCII printable) + * to network format (which is usually some kind of binary format). + * return: + * 1 if the address was valid for the specified address family + * 0 if the address wasn't valid (`dst' is untouched in this case) + * -1 if some other error occurred (`dst' is untouched in this case, too) + * author: + * Paul Vixie, 1996. 
+ */ +int +inet_pton(af, src, dst) + int af; + const char *src; + void *dst; +{ + switch (af) { + case AF_INET: + return (inet_pton4(src, dst)); + case AF_INET6: + return (inet_pton6(src, dst)); + default: + errno = EAFNOSUPPORT; + return (-1); + } + /* NOTREACHED */ +} + +/* int + * inet_pton4(src, dst) + * like inet_aton() but without all the hexadecimal and shorthand. + * return: + * 1 if `src' is a valid dotted quad, else 0. + * notice: + * does not touch `dst' unless it's returning 1. + * author: + * Paul Vixie, 1996. + */ +static int +inet_pton4(src, dst) + const char *src; + u_char *dst; +{ + static const char digits[] = "0123456789"; + int saw_digit, octets, ch; + u_char tmp[INADDRSZ], *tp; + + saw_digit = 0; + octets = 0; + *(tp = tmp) = 0; + while ((ch = *src++) != '\0') { + const char *pch; + + if ((pch = strchr(digits, ch)) != NULL) { + u_int new = *tp * 10 + (pch - digits); + + if (new > 255) + return (0); + *tp = new; + if (! saw_digit) { + if (++octets > 4) + return (0); + saw_digit = 1; + } + } else if (ch == '.' && saw_digit) { + if (octets == 4) + return (0); + *++tp = 0; + saw_digit = 0; + } else + return (0); + } + if (octets < 4) + return (0); + + memcpy(dst, tmp, INADDRSZ); + return (1); +} + +/* int + * inet_pton6(src, dst) + * convert presentation level address to network order binary form. + * return: + * 1 if `src' is a valid [RFC1884 2.2] address, else 0. + * notice: + * (1) does not touch `dst' unless it's returning 1. + * (2) :: in a full address is silently ignored. + * credit: + * inspired by Mark Andrews. + * author: + * Paul Vixie, 1996. 
+ */ +static int +inet_pton6(src, dst) + const char *src; + u_char *dst; +{ + static const char xdigits_l[] = "0123456789abcdef", + xdigits_u[] = "0123456789ABCDEF"; + u_char tmp[IN6ADDRSZ], *tp, *endp, *colonp; + const char *xdigits, *curtok; + int ch, saw_xdigit; + u_int val; + + memset((tp = tmp), '\0', IN6ADDRSZ); + endp = tp + IN6ADDRSZ; + colonp = NULL; + /* Leading :: requires some special handling. */ + if (*src == ':') + if (*++src != ':') + return (0); + curtok = src; + saw_xdigit = 0; + val = 0; + while ((ch = *src++) != '\0') { + const char *pch; + + if ((pch = strchr((xdigits = xdigits_l), ch)) == NULL) + pch = strchr((xdigits = xdigits_u), ch); + if (pch != NULL) { + val <<= 4; + val |= (pch - xdigits); + if (val > 0xffff) + return (0); + saw_xdigit = 1; + continue; + } + if (ch == ':') { + curtok = src; + if (!saw_xdigit) { + if (colonp) + return (0); + colonp = tp; + continue; + } + if (tp + INT16SZ > endp) + return (0); + *tp++ = (u_char) (val >> 8) & 0xff; + *tp++ = (u_char) val & 0xff; + saw_xdigit = 0; + val = 0; + continue; + } + if (ch == '.' && ((tp + INADDRSZ) <= endp) && + inet_pton4(curtok, tp) > 0) { + tp += INADDRSZ; + saw_xdigit = 0; + break; /* '\0' was seen by inet_pton4(). */ + } + return (0); + } + if (saw_xdigit) { + if (tp + INT16SZ > endp) + return (0); + *tp++ = (u_char) (val >> 8) & 0xff; + *tp++ = (u_char) val & 0xff; + } + if (colonp != NULL) { + /* + * Since some memmove()'s erroneously fail to handle + * overlapping regions, we'll do the shift by hand. 
+ */ + const int n = tp - colonp; + int i; + + for (i = 1; i <= n; i++) { + endp[- i] = colonp[n - i]; + colonp[n - i] = 0; + } + tp = endp; + } + if (tp != endp) + return (0); + memcpy(dst, tmp, IN6ADDRSZ); + return (1); +} diff --git a/lib/ipx_ntop.c b/lib/ipx_ntop.c index e69de29b..b2d67902 100644 --- a/lib/ipx_ntop.c +++ b/lib/ipx_ntop.c @@ -0,0 +1,71 @@ +#include <errno.h> +#include <sys/types.h> +#include <netinet/in.h> + +#include "utils.h" + +static __inline__ int do_digit(char *str, u_int32_t addr, u_int32_t scale, size_t *pos, size_t len) +{ + u_int32_t tmp = addr >> (scale * 4); + + if (*pos == len) + return 1; + + tmp &= 0x0f; + if (tmp > 9) + *str = tmp + 'A' - 10; + else + *str = tmp + '0'; + (*pos)++; + + return 0; +} + +static const char *ipx_ntop1(const struct ipx_addr *addr, char *str, size_t len) +{ + int i; + size_t pos = 0; + + if (len == 0) + return str; + + for(i = 7; i >= 0; i--) + if (do_digit(str + pos, ntohl(addr->ipx_net), i, &pos, len)) + return str; + + if (pos == len) + return str; + + *(str + pos) = '.'; + pos++; + + for(i = 0; i < 6; i++) { + if (do_digit(str + pos, addr->ipx_node[i], 1, &pos, len)) + return str; + if (do_digit(str + pos, addr->ipx_node[i], 0, &pos, len)) + return str; + } + + if (pos == len) + return str; + + *(str + pos) = 0; + + return str; +} + + +const char *ipx_ntop(int af, const void *addr, char *str, size_t len) +{ + switch(af) { + case AF_IPX: + errno = 0; + return ipx_ntop1((struct ipx_addr *)addr, str, len); + default: + errno = EAFNOSUPPORT; + } + + return NULL; +} + + diff --git a/lib/ipx_pton.c b/lib/ipx_pton.c index e69de29b..1a52b7f1 100644 --- a/lib/ipx_pton.c +++ b/lib/ipx_pton.c @@ -0,0 +1,107 @@ +#include <errno.h> +#include <string.h> +#include <sys/types.h> +#include <netinet/in.h> + +#include "utils.h" + +static u_int32_t hexget(char c) +{ + if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + if (c >= '0' && c <= '9') + return c - '0'; + + 
return 0xf0; +} + +static int ipx_getnet(u_int32_t *net, const char *str) +{ + int i; + u_int32_t tmp; + + for(i = 0; *str && (i < 8); i++) { + + if ((tmp = hexget(*str)) & 0xf0) { + if (*str == '.') + return 0; + else + return -1; + } + + str++; + (*net) <<= 4; + (*net) |= tmp; + } + + if (*str == 0) + return 0; + + return -1; +} + +static int ipx_getnode(u_int8_t *node, const char *str) +{ + int i; + u_int32_t tmp; + + for(i = 0; i < 6; i++) { + if ((tmp = hexget(*str++)) & 0xf0) + return -1; + node[i] = (u_int8_t)tmp; + node[i] <<= 4; + if ((tmp = hexget(*str++)) & 0xf0) + return -1; + node[i] |= (u_int8_t)tmp; + if (*str == ':') + str++; + } + + return 0; +} + +static int ipx_pton1(const char *src, struct ipx_addr *addr) +{ + char *sep = (char *)src; + int no_node = 0; + + memset(addr, 0, sizeof(struct ipx_addr)); + + while(*sep && (*sep != '.')) + sep++; + + if (*sep != '.') + no_node = 1; + + if (ipx_getnet(&addr->ipx_net, src)) + return 0; + + addr->ipx_net = htonl(addr->ipx_net); + + if (no_node) + return 1; + + if (ipx_getnode(addr->ipx_node, sep + 1)) + return 0; + + return 1; +} + +int ipx_pton(int af, const char *src, void *addr) +{ + int err; + + switch (af) { + case AF_IPX: + errno = 0; + err = ipx_pton1(src, (struct ipx_addr *)addr); + break; + default: + errno = EAFNOSUPPORT; + err = -1; + } + + return err; +} diff --git a/lib/libnetlink.c b/lib/libnetlink.c index e69de29b..a1f39d40 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -0,0 +1,521 @@ +/* + * libnetlink.c RTnetlink service routines. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <net/if_arp.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <string.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/uio.h>
+
+#include "libnetlink.h"
+
+/* Close the netlink socket held by @rth. */
+void rtnl_close(struct rtnl_handle *rth)
+{
+	close(rth->fd);
+}
+
+/*
+ * Open a NETLINK_ROUTE socket, bind it with the multicast group mask
+ * @subscriptions and record the kernel-assigned local address in @rth.
+ * The sequence counter is seeded from the current time.
+ *
+ * Returns 0 on success.  On failure a message is printed, the socket is
+ * closed again, rth->fd is set to -1 and -1 is returned.
+ */
+int rtnl_open(struct rtnl_handle *rth, unsigned subscriptions)
+{
+	socklen_t addr_len;
+
+	/* Clear the whole handle; this also zeroes rth->local. */
+	memset(rth, 0, sizeof(*rth));
+
+	rth->fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	if (rth->fd < 0) {
+		perror("Cannot open netlink socket");
+		return -1;
+	}
+
+	rth->local.nl_family = AF_NETLINK;
+	rth->local.nl_groups = subscriptions;
+
+	if (bind(rth->fd, (struct sockaddr*)&rth->local, sizeof(rth->local)) < 0) {
+		perror("Cannot bind netlink socket");
+		goto fail;
+	}
+	addr_len = sizeof(rth->local);
+	if (getsockname(rth->fd, (struct sockaddr*)&rth->local, &addr_len) < 0) {
+		perror("Cannot getsockname");
+		goto fail;
+	}
+	if (addr_len != sizeof(rth->local)) {
+		fprintf(stderr, "Wrong address length %d\n", (int)addr_len);
+		goto fail;
+	}
+	if (rth->local.nl_family != AF_NETLINK) {
+		fprintf(stderr, "Wrong address family %d\n", rth->local.nl_family);
+		goto fail;
+	}
+	rth->seq = time(NULL);
+	return 0;
+
+fail:
+	close(rth->fd);
+	rth->fd = -1;
+	return -1;
+}
+
+/*
+ * Send a dump request of message type @type for address family @family.
+ * The sequence number used is stored in rth->dump.
+ * Returns the result of sendto().
+ */
+int rtnl_wilddump_request(struct rtnl_handle *rth, int family, int type)
+{
+	struct {
+		struct nlmsghdr nlh;
+		struct rtgenmsg g;
+	} req;
+	struct sockaddr_nl nladdr;
+
+	memset(&nladdr, 0, sizeof(nladdr));
+	nladdr.nl_family = AF_NETLINK;
+
+	/* Zero padding too: the whole struct is handed to the kernel. */
+	memset(&req, 0, sizeof(req));
+	req.nlh.nlmsg_len = sizeof(req);
+	req.nlh.nlmsg_type = type;
+	req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST;
+	req.nlh.nlmsg_pid = 0;
+	req.nlh.nlmsg_seq = rth->dump = ++rth->seq;
+	req.g.rtgen_family = family;
+
+	return sendto(rth->fd, (void*)&req, sizeof(req), 0, (struct sockaddr*)&nladdr, sizeof(nladdr));
+}
+
+int rtnl_send(struct rtnl_handle
*rth, char *buf, int len) +{ + struct sockaddr_nl nladdr; + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + + return sendto(rth->fd, buf, len, 0, (struct sockaddr*)&nladdr, sizeof(nladdr)); +} + +int rtnl_dump_request(struct rtnl_handle *rth, int type, void *req, int len) +{ + struct nlmsghdr nlh; + struct sockaddr_nl nladdr; + struct iovec iov[2] = { { &nlh, sizeof(nlh) }, { req, len } }; + struct msghdr msg = { + (void*)&nladdr, sizeof(nladdr), + iov, 2, + NULL, 0, + 0 + }; + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + + nlh.nlmsg_len = NLMSG_LENGTH(len); + nlh.nlmsg_type = type; + nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST; + nlh.nlmsg_pid = 0; + nlh.nlmsg_seq = rth->dump = ++rth->seq; + + return sendmsg(rth->fd, &msg, 0); +} + +int rtnl_dump_filter(struct rtnl_handle *rth, + int (*filter)(struct sockaddr_nl *, struct nlmsghdr *n, void *), + void *arg1, + int (*junk)(struct sockaddr_nl *,struct nlmsghdr *n, void *), + void *arg2) +{ + char buf[8192]; + struct sockaddr_nl nladdr; + struct iovec iov = { buf, sizeof(buf) }; + + while (1) { + int status; + struct nlmsghdr *h; + + struct msghdr msg = { + (void*)&nladdr, sizeof(nladdr), + &iov, 1, + NULL, 0, + 0 + }; + + status = recvmsg(rth->fd, &msg, 0); + + if (status < 0) { + if (errno == EINTR) + continue; + perror("OVERRUN"); + continue; + } + if (status == 0) { + fprintf(stderr, "EOF on netlink\n"); + return -1; + } + if (msg.msg_namelen != sizeof(nladdr)) { + fprintf(stderr, "sender address length == %d\n", msg.msg_namelen); + exit(1); + } + + h = (struct nlmsghdr*)buf; + while (NLMSG_OK(h, status)) { + int err; + + if (h->nlmsg_pid != rth->local.nl_pid || + h->nlmsg_seq != rth->dump) { + if (junk) { + err = junk(&nladdr, h, arg2); + if (err < 0) + return err; + } + goto skip_it; + } + + if (h->nlmsg_type == NLMSG_DONE) + return 0; + if (h->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *err = (struct nlmsgerr*)NLMSG_DATA(h); + if (h->nlmsg_len 
< NLMSG_LENGTH(sizeof(struct nlmsgerr))) { + fprintf(stderr, "ERROR truncated\n"); + } else { + errno = -err->error; + perror("RTNETLINK answers"); + } + return -1; + } + err = filter(&nladdr, h, arg1); + if (err < 0) + return err; + +skip_it: + h = NLMSG_NEXT(h, status); + } + if (msg.msg_flags & MSG_TRUNC) { + fprintf(stderr, "Message truncated\n"); + continue; + } + if (status) { + fprintf(stderr, "!!!Remnant of size %d\n", status); + exit(1); + } + } +} + +int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, pid_t peer, + unsigned groups, struct nlmsghdr *answer, + int (*junk)(struct sockaddr_nl *,struct nlmsghdr *n, void *), + void *jarg) +{ + int status; + unsigned seq; + struct nlmsghdr *h; + struct sockaddr_nl nladdr; + struct iovec iov = { (void*)n, n->nlmsg_len }; + char buf[8192]; + struct msghdr msg = { + (void*)&nladdr, sizeof(nladdr), + &iov, 1, + NULL, 0, + 0 + }; + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + nladdr.nl_pid = peer; + nladdr.nl_groups = groups; + + n->nlmsg_seq = seq = ++rtnl->seq; + if (answer == NULL) + n->nlmsg_flags |= NLM_F_ACK; + + status = sendmsg(rtnl->fd, &msg, 0); + + if (status < 0) { + perror("Cannot talk to rtnetlink"); + return -1; + } + + iov.iov_base = buf; + + while (1) { + iov.iov_len = sizeof(buf); + status = recvmsg(rtnl->fd, &msg, 0); + + if (status < 0) { + if (errno == EINTR) + continue; + perror("OVERRUN"); + continue; + } + if (status == 0) { + fprintf(stderr, "EOF on netlink\n"); + return -1; + } + if (msg.msg_namelen != sizeof(nladdr)) { + fprintf(stderr, "sender address length == %d\n", msg.msg_namelen); + exit(1); + } + for (h = (struct nlmsghdr*)buf; status >= sizeof(*h); ) { + int err; + int len = h->nlmsg_len; + int l = len - sizeof(*h); + + if (l<0 || len>status) { + if (msg.msg_flags & MSG_TRUNC) { + fprintf(stderr, "Truncated message\n"); + return -1; + } + fprintf(stderr, "!!!malformed message: len=%d\n", len); + exit(1); + } + + if (h->nlmsg_pid != 
rtnl->local.nl_pid || + h->nlmsg_seq != seq) { + if (junk) { + err = junk(&nladdr, h, jarg); + if (err < 0) + return err; + } + continue; + } + + if (h->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *err = (struct nlmsgerr*)NLMSG_DATA(h); + if (l < sizeof(struct nlmsgerr)) { + fprintf(stderr, "ERROR truncated\n"); + } else { + errno = -err->error; + if (errno == 0) { + if (answer) + memcpy(answer, h, h->nlmsg_len); + return 0; + } + perror("RTNETLINK answers"); + } + return -1; + } + if (answer) { + memcpy(answer, h, h->nlmsg_len); + return 0; + } + + fprintf(stderr, "Unexpected reply!!!\n"); + + status -= NLMSG_ALIGN(len); + h = (struct nlmsghdr*)((char*)h + NLMSG_ALIGN(len)); + } + if (msg.msg_flags & MSG_TRUNC) { + fprintf(stderr, "Message truncated\n"); + continue; + } + if (status) { + fprintf(stderr, "!!!Remnant of size %d\n", status); + exit(1); + } + } +} + +int rtnl_listen(struct rtnl_handle *rtnl, + int (*handler)(struct sockaddr_nl *,struct nlmsghdr *n, void *), + void *jarg) +{ + int status; + struct nlmsghdr *h; + struct sockaddr_nl nladdr; + struct iovec iov; + char buf[8192]; + struct msghdr msg = { + (void*)&nladdr, sizeof(nladdr), + &iov, 1, + NULL, 0, + 0 + }; + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + nladdr.nl_pid = 0; + nladdr.nl_groups = 0; + + + iov.iov_base = buf; + + while (1) { + iov.iov_len = sizeof(buf); + status = recvmsg(rtnl->fd, &msg, 0); + + if (status < 0) { + if (errno == EINTR) + continue; + perror("OVERRUN"); + continue; + } + if (status == 0) { + fprintf(stderr, "EOF on netlink\n"); + return -1; + } + if (msg.msg_namelen != sizeof(nladdr)) { + fprintf(stderr, "Sender address length == %d\n", msg.msg_namelen); + exit(1); + } + for (h = (struct nlmsghdr*)buf; status >= sizeof(*h); ) { + int err; + int len = h->nlmsg_len; + int l = len - sizeof(*h); + + if (l<0 || len>status) { + if (msg.msg_flags & MSG_TRUNC) { + fprintf(stderr, "Truncated message\n"); + return -1; + } + fprintf(stderr, 
/*
 * Read netlink messages from the stdio stream rtnl and pass each one
 * to handler(&nladdr, msg, jarg), where nladdr is a zeroed AF_NETLINK
 * address.
 *
 * Returns 0 on clean end of file (EOF exactly on a message boundary),
 * -1 on a read error, a malformed length or a truncated message, or
 * the negative value returned by handler.
 */
int rtnl_from_file(FILE *rtnl,
		   int (*handler)(struct sockaddr_nl *,struct nlmsghdr *n, void *),
		   void *jarg)
{
	struct sockaddr_nl nladdr;
	char buf[8192];
	struct nlmsghdr *h = (void*)buf;

	memset(&nladdr, 0, sizeof(nladdr));
	nladdr.nl_family = AF_NETLINK;
	nladdr.nl_pid = 0;
	nladdr.nl_groups = 0;

	while (1) {
		int err, len;
		int l;
		size_t got;

		/* fread() returns a size_t item count; it is never negative. */
		got = fread(buf, 1, sizeof(*h), rtnl);

		if (got < sizeof(*h)) {
			if (ferror(rtnl)) {
				perror("rtnl_from_file: fread");
				return -1;
			}
			if (got == 0)
				return 0;
			/* EOF in the middle of a header */
			fprintf(stderr, "rtnl-from_file: truncated message\n");
			return -1;
		}

		len = h->nlmsg_len;
		l = len - (int)sizeof(*h);

		if (l<0 || len>(int)sizeof(buf)) {
			fprintf(stderr, "!!!malformed message: len=%d @%lu\n",
				len, (unsigned long)ftell(rtnl));
			return -1;
		}

		/* Payloads are stored padded; a short final pad is tolerated. */
		got = fread(NLMSG_DATA(h), 1, NLMSG_ALIGN(l), rtnl);

		if (got < (size_t)l) {
			if (ferror(rtnl))
				perror("rtnl_from_file: fread");
			else
				fprintf(stderr, "rtnl-from_file: truncated message\n");
			return -1;
		}

		err = handler(&nladdr, h, jarg);
		if (err < 0)
			return err;
	}
}
/*
 * Index a run of routing attributes by type.
 *
 * Walks the attributes starting at rta within len bytes and stores a
 * pointer to each attribute whose type is <= max in tb[type]; a later
 * attribute of the same type replaces an earlier one.  tb must have at
 * least max+1 slots and is not cleared here.
 *
 * Leftover bytes that do not form a valid attribute are reported on
 * stderr.  Always returns 0.
 */
int parse_rtattr(struct rtattr *tb[], int max, struct rtattr *rta, int len)
{
	while (RTA_OK(rta, len)) {
		if (rta->rta_type <= max)
			tb[rta->rta_type] = rta;
		rta = RTA_NEXT(rta,len);
	}
	if (len) {
		/*
		 * Only look at rta->rta_len when a whole attribute header is
		 * still inside the buffer; len can also be negative here when
		 * the final attribute was not padded.
		 */
		if (len >= (int)sizeof(*rta))
			fprintf(stderr, "!!!Deficit %d, rta_len=%d\n", len, rta->rta_len);
		else
			fprintf(stderr, "!!!Deficit %d\n", len);
	}
	return 0;
}
/*
 * Format a link-layer address as text in buf (capacity blen bytes).
 *
 * A 4-byte address on an ARPHRD_TUNNEL, ARPHRD_SIT or ARPHRD_IPGRE
 * device is printed through inet_ntop(AF_INET, ...).  Any other address
 * is printed as colon-separated two-digit lowercase hex octets.
 *
 * The result is always NUL-terminated when blen > 0.  If buf is too
 * small, output stops after the last octet that fits completely.
 *
 * Returns buf, or the result of inet_ntop() for the tunnel case.
 */
const char *ll_addr_n2a(unsigned char *addr, int alen, int type, char *buf, int blen)
{
	int i;
	int l;

	if (alen == 4 &&
	    (type == ARPHRD_TUNNEL || type == ARPHRD_SIT || type == ARPHRD_IPGRE)) {
		return inet_ntop(AF_INET, addr, buf, blen);
	}
	if (blen <= 0)
		return buf;

	/* A zero-length address yields an empty string. */
	buf[0] = 0;
	l = 0;
	for (i=0; i<alen; i++) {
		int room = blen - l;
		int n;

		if (i==0)
			n = snprintf(buf+l, room, "%02x", addr[i]);
		else
			n = snprintf(buf+l, room, ":%02x", addr[i]);
		if (n < 0 || n >= room) {
			/* Did not fit: drop the partial octet and stop. */
			buf[l] = 0;
			break;
		}
		l += n;
	}
	return buf;
}
later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <string.h> + +#include "libnetlink.h" +#include "ll_map.h" + +struct idxmap +{ + struct idxmap * next; + int index; + int type; + int alen; + unsigned flags; + unsigned char addr[8]; + char name[16]; +}; + +static struct idxmap *idxmap[16]; + +int ll_remember_index(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) +{ + int h; + struct ifinfomsg *ifi = NLMSG_DATA(n); + struct idxmap *im, **imp; + struct rtattr *tb[IFLA_MAX+1]; + + if (n->nlmsg_type != RTM_NEWLINK) + return 0; + + if (n->nlmsg_len < NLMSG_LENGTH(sizeof(ifi))) + return -1; + + + memset(tb, 0, sizeof(tb)); + parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), IFLA_PAYLOAD(n)); + if (tb[IFLA_IFNAME] == NULL) + return 0; + + h = ifi->ifi_index&0xF; + + for (imp=&idxmap[h]; (im=*imp)!=NULL; imp = &im->next) + if (im->index == ifi->ifi_index) + break; + + if (im == NULL) { + im = malloc(sizeof(*im)); + if (im == NULL) + return 0; + im->next = *imp; + im->index = ifi->ifi_index; + *imp = im; + } + + im->type = ifi->ifi_type; + im->flags = ifi->ifi_flags; + if (tb[IFLA_ADDRESS]) { + int alen; + im->alen = alen = RTA_PAYLOAD(tb[IFLA_ADDRESS]); + if (alen > sizeof(im->addr)) + alen = sizeof(im->addr); + memcpy(im->addr, RTA_DATA(tb[IFLA_ADDRESS]), alen); + } else { + im->alen = 0; + memset(im->addr, 0, sizeof(im->addr)); + } + strcpy(im->name, RTA_DATA(tb[IFLA_IFNAME])); + return 0; +} + +const char *ll_idx_n2a(int idx, char *buf) +{ + struct idxmap *im; + + if (idx == 0) + return "*"; + for (im = idxmap[idx&0xF]; im; im = im->next) + if (im->index == idx) + return im->name; + snprintf(buf, 16, "if%d", idx); + return buf; +} + + +const char *ll_index_to_name(int idx) +{ + static char nbuf[16]; + + return ll_idx_n2a(idx, nbuf); +} + +int ll_index_to_type(int idx) +{ + 
struct idxmap *im; + + if (idx == 0) + return -1; + for (im = idxmap[idx&0xF]; im; im = im->next) + if (im->index == idx) + return im->type; + return -1; +} + +unsigned ll_index_to_flags(int idx) +{ + struct idxmap *im; + + if (idx == 0) + return 0; + + for (im = idxmap[idx&0xF]; im; im = im->next) + if (im->index == idx) + return im->flags; + return 0; +} + +int ll_name_to_index(char *name) +{ + static char ncache[16]; + static int icache; + struct idxmap *im; + int i; + + if (name == NULL) + return 0; + if (icache && strcmp(name, ncache) == 0) + return icache; + for (i=0; i<16; i++) { + for (im = idxmap[i]; im; im = im->next) { + if (strcmp(im->name, name) == 0) { + icache = im->index; + strcpy(ncache, name); + return im->index; + } + } + } + return 0; +} + +int ll_init_map(struct rtnl_handle *rth) +{ + if (rtnl_wilddump_request(rth, AF_UNSPEC, RTM_GETLINK) < 0) { + perror("Cannot send dump request"); + exit(1); + } + + if (rtnl_dump_filter(rth, ll_remember_index, &idxmap, NULL, NULL) < 0) { + fprintf(stderr, "Dump terminated\n"); + exit(1); + } + return 0; +} diff --git a/lib/ll_proto.c b/lib/ll_proto.c index e69de29b..71f149dc 100644 --- a/lib/ll_proto.c +++ b/lib/ll_proto.c @@ -0,0 +1,127 @@ +/* + * ll_proto.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/ioctl.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/sockios.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "utils.h" + + +#define __PF(f,n) { ETH_P_##f, #n }, +static struct { + int id; + char *name; +} llproto_names[] = { +__PF(LOOP,loop) +__PF(PUP,pup) +#ifdef ETH_P_PUPAT +__PF(PUPAT,pupat) +#endif +__PF(IP,ip) +__PF(X25,x25) +__PF(ARP,arp) +__PF(BPQ,bpq) +#ifdef ETH_P_IEEEPUP +__PF(IEEEPUP,ieeepup) +#endif +#ifdef ETH_P_IEEEPUPAT +__PF(IEEEPUPAT,ieeepupat) +#endif +__PF(DEC,dec) +__PF(DNA_DL,dna_dl) +__PF(DNA_RC,dna_rc) +__PF(DNA_RT,dna_rt) +__PF(LAT,lat) +__PF(DIAG,diag) +__PF(CUST,cust) +__PF(SCA,sca) +__PF(RARP,rarp) +__PF(ATALK,atalk) +__PF(AARP,aarp) +__PF(IPX,ipx) +__PF(IPV6,ipv6) +#ifdef ETH_P_PPP_DISC +__PF(PPP_DISC,ppp_disc) +#endif +#ifdef ETH_P_PPP_SES +__PF(PPP_SES,ppp_ses) +#endif +#ifdef ETH_P_ATMMPOA +__PF(ATMMPOA,atmmpoa) +#endif +#ifdef ETH_P_ATMFATE +__PF(ATMFATE,atmfate) +#endif + +__PF(802_3,802_3) +__PF(AX25,ax25) +__PF(ALL,all) +__PF(802_2,802_2) +__PF(SNAP,snap) +__PF(DDCMP,ddcmp) +__PF(WAN_PPP,wan_ppp) +__PF(PPP_MP,ppp_mp) +__PF(LOCALTALK,localtalk) +__PF(PPPTALK,ppptalk) +__PF(TR_802_2,tr_802_2) +__PF(MOBITEX,mobitex) +__PF(CONTROL,control) +__PF(IRDA,irda) +#ifdef ETH_P_ECONET +__PF(ECONET,econet) +#endif + +{ 0x8100, "802.1Q" }, +{ ETH_P_IP, "ipv4" }, +}; +#undef __PF + + +char * ll_proto_n2a(unsigned short id, char *buf, int len) +{ + int i; + + id = ntohs(id); + + for (i=0; i<sizeof(llproto_names)/sizeof(llproto_names[0]); i++) { + if (llproto_names[i].id == id) + return llproto_names[i].name; + } + snprintf(buf, len, "[%d]", id); + return buf; +} + +int ll_proto_a2n(unsigned short *id, char *buf) +{ + int i; + for (i=0; 
/*
 * Translate an ARPHRD_* hardware type to a short name.
 *
 * Returns the name from the built-in table, or buf filled with
 * "[<type>]" when the type is not listed.
 */
char * ll_type_n2a(int type, char *buf, int len)
{
/* One table row: ARPHRD_<sym> paired with its printable name. */
#define ARPHRD_ROW(sym, text) { ARPHRD_##sym, text },
static struct {
	int type;
	char *name;
} arphrd_names[] = {
{ 0, "generic" },
ARPHRD_ROW(ETHER, "ether")
ARPHRD_ROW(EETHER, "eether")
ARPHRD_ROW(AX25, "ax25")
ARPHRD_ROW(PRONET, "pronet")
ARPHRD_ROW(CHAOS, "chaos")
#ifdef ARPHRD_IEEE802_TR
ARPHRD_ROW(IEEE802, "ieee802")
#else
ARPHRD_ROW(IEEE802, "tr")
#endif
ARPHRD_ROW(ARCNET, "arcnet")
ARPHRD_ROW(APPLETLK, "atalk")
ARPHRD_ROW(DLCI, "dlci")
#ifdef ARPHRD_ATM
ARPHRD_ROW(ATM, "atm")
#endif
ARPHRD_ROW(METRICOM, "metricom")
#ifdef ARPHRD_IEEE1394
ARPHRD_ROW(IEEE1394, "ieee1394")
#endif

ARPHRD_ROW(SLIP, "slip")
ARPHRD_ROW(CSLIP, "cslip")
ARPHRD_ROW(SLIP6, "slip6")
ARPHRD_ROW(CSLIP6, "cslip6")
ARPHRD_ROW(RSRVD, "rsrvd")
ARPHRD_ROW(ADAPT, "adapt")
ARPHRD_ROW(ROSE, "rose")
ARPHRD_ROW(X25, "x25")
#ifdef ARPHRD_HWX25
ARPHRD_ROW(HWX25, "hwx25")
#endif
ARPHRD_ROW(PPP, "ppp")
ARPHRD_ROW(HDLC, "hdlc")
ARPHRD_ROW(LAPB, "lapb")
#ifdef ARPHRD_DDCMP
ARPHRD_ROW(DDCMP, "ddcmp")
#endif
#ifdef ARPHRD_RAWHDLC
ARPHRD_ROW(RAWHDLC, "rawhdlc")
#endif

ARPHRD_ROW(TUNNEL, "ipip")
ARPHRD_ROW(TUNNEL6, "tunnel6")
ARPHRD_ROW(FRAD, "frad")
ARPHRD_ROW(SKIP, "skip")
ARPHRD_ROW(LOOPBACK, "loopback")
ARPHRD_ROW(LOCALTLK, "ltalk")
ARPHRD_ROW(FDDI, "fddi")
ARPHRD_ROW(BIF, "bif")
ARPHRD_ROW(SIT, "sit")
ARPHRD_ROW(IPDDP, "ip/ddp")
ARPHRD_ROW(IPGRE, "gre")
ARPHRD_ROW(PIMREG, "pimreg")
ARPHRD_ROW(HIPPI, "hippi")
ARPHRD_ROW(ASH, "ash")
ARPHRD_ROW(ECONET, "econet")
ARPHRD_ROW(IRDA, "irda")
ARPHRD_ROW(FCPP, "fcpp")
ARPHRD_ROW(FCAL, "fcal")
ARPHRD_ROW(FCPL, "fcpl")
/* Fabric types occupy thirteen consecutive values from ARPHRD_FCFABRIC. */
{ ARPHRD_FCFABRIC, "fcfb0" },
{ ARPHRD_FCFABRIC+1, "fcfb1" },
{ ARPHRD_FCFABRIC+2, "fcfb2" },
{ ARPHRD_FCFABRIC+3, "fcfb3" },
{ ARPHRD_FCFABRIC+4, "fcfb4" },
{ ARPHRD_FCFABRIC+5, "fcfb5" },
{ ARPHRD_FCFABRIC+6, "fcfb6" },
{ ARPHRD_FCFABRIC+7, "fcfb7" },
{ ARPHRD_FCFABRIC+8, "fcfb8" },
{ ARPHRD_FCFABRIC+9, "fcfb9" },
{ ARPHRD_FCFABRIC+10, "fcfb10" },
{ ARPHRD_FCFABRIC+11, "fcfb11" },
{ ARPHRD_FCFABRIC+12, "fcfb12" },
#ifdef ARPHRD_IEEE802_TR
ARPHRD_ROW(IEEE802_TR, "tr")
#endif
#ifdef ARPHRD_IEEE80211
ARPHRD_ROW(IEEE80211, "ieee802.11")
#endif
#ifdef ARPHRD_VOID
ARPHRD_ROW(VOID, "void")
#endif
};
#undef ARPHRD_ROW

	const int rows = sizeof(arphrd_names)/sizeof(arphrd_names[0]);
	int row = 0;

	/* First matching row wins. */
	while (row < rows) {
		if (arphrd_names[row].type == type)
			return arphrd_names[row].name;
		row++;
	}

	snprintf(buf, len, "[%d]", type);
	return buf;
}
/*
 * Load "<id> <name>" pairs from a database file into tab.
 *
 * Each non-blank line that does not start with '#' must hold a decimal
 * or 0x-prefixed hexadecimal id followed by a name.  Names for ids in
 * the range 0 .. size-1 are duplicated with strdup() and stored in
 * tab[id]; ids outside that range are ignored.  A line that cannot be
 * parsed is reported on stderr and stops the load.
 *
 * A missing or unreadable file is silently ignored.
 */
static void rtnl_tab_initialize(char *file, char **tab, int size)
{
	char buf[512];
	FILE *fp;

	fp = fopen(file, "r");
	if (!fp)
		return;
	while (fgets(buf, sizeof(buf), fp)) {
		char *p = buf;
		int id;
		char namebuf[512];

		while (*p == ' ' || *p == '\t')
			p++;
		if (*p == '#' || *p == '\n' || *p == 0)
			continue;
		if (sscanf(p, "0x%x %s\n", &id, namebuf) != 2 &&
		    sscanf(p, "0x%x %s #", &id, namebuf) != 2 &&
		    sscanf(p, "%d %s\n", &id, namebuf) != 2 &&
		    sscanf(p, "%d %s #", &id, namebuf) != 2) {
			fprintf(stderr, "Database %s is corrupted at %s\n",
				file, p);
			/* leave through the fclose() below instead of leaking fp */
			break;
		}

		/* tab has exactly `size` slots, so id == size is out of range */
		if (id<0 || id>=size)
			continue;

		tab[id] = strdup(namebuf);
	}
	fclose(fp);
}
return 0; + } + } + + res = strtoul(arg, &end, 0); + if (!end || end == arg || *end || res > 255) + return -1; + *id = res; + return 0; +} + + + +static char * rtnl_rtscope_tab[256] = { + "global", +}; + +static int rtnl_rtscope_init; + +static void rtnl_rtscope_initialize(void) +{ + rtnl_rtscope_init = 1; + rtnl_rtscope_tab[255] = "nowhere"; + rtnl_rtscope_tab[254] = "host"; + rtnl_rtscope_tab[253] = "link"; + rtnl_rtscope_tab[200] = "site"; + rtnl_tab_initialize("/etc/iproute2/rt_scopes", + rtnl_rtscope_tab, 256); +} + +char * rtnl_rtscope_n2a(int id, char *buf, int len) +{ + if (id<0 || id>=256) { + snprintf(buf, len, "%d", id); + return buf; + } + if (!rtnl_rtscope_tab[id]) { + if (!rtnl_rtscope_init) + rtnl_rtscope_initialize(); + } + if (rtnl_rtscope_tab[id]) + return rtnl_rtscope_tab[id]; + snprintf(buf, len, "%d", id); + return buf; +} + +int rtnl_rtscope_a2n(__u32 *id, char *arg) +{ + static char *cache = NULL; + static unsigned long res; + char *end; + int i; + + if (cache && strcmp(cache, arg) == 0) { + *id = res; + return 0; + } + + if (!rtnl_rtscope_init) + rtnl_rtscope_initialize(); + + for (i=0; i<256; i++) { + if (rtnl_rtscope_tab[i] && + strcmp(rtnl_rtscope_tab[i], arg) == 0) { + cache = rtnl_rtscope_tab[i]; + res = i; + *id = res; + return 0; + } + } + + res = strtoul(arg, &end, 0); + if (!end || end == arg || *end || res > 255) + return -1; + *id = res; + return 0; +} + + + +static char * rtnl_rtrealm_tab[256] = { + "unknown", +}; + +static int rtnl_rtrealm_init; + +static void rtnl_rtrealm_initialize(void) +{ + rtnl_rtrealm_init = 1; + rtnl_tab_initialize("/etc/iproute2/rt_realms", + rtnl_rtrealm_tab, 256); +} + +char * rtnl_rtrealm_n2a(int id, char *buf, int len) +{ + if (id<0 || id>=256) { + snprintf(buf, len, "%d", id); + return buf; + } + if (!rtnl_rtrealm_tab[id]) { + if (!rtnl_rtrealm_init) + rtnl_rtrealm_initialize(); + } + if (rtnl_rtrealm_tab[id]) + return rtnl_rtrealm_tab[id]; + snprintf(buf, len, "%d", id); + return buf; +} + + +int 
rtnl_rtrealm_a2n(__u32 *id, char *arg) +{ + static char *cache = NULL; + static unsigned long res; + char *end; + int i; + + if (cache && strcmp(cache, arg) == 0) { + *id = res; + return 0; + } + + if (!rtnl_rtrealm_init) + rtnl_rtrealm_initialize(); + + for (i=0; i<256; i++) { + if (rtnl_rtrealm_tab[i] && + strcmp(rtnl_rtrealm_tab[i], arg) == 0) { + cache = rtnl_rtrealm_tab[i]; + res = i; + *id = res; + return 0; + } + } + + res = strtoul(arg, &end, 0); + if (!end || end == arg || *end || res > 255) + return -1; + *id = res; + return 0; +} + + + +static char * rtnl_rttable_tab[256] = { + "unspec", +}; + +static int rtnl_rttable_init; + +static void rtnl_rttable_initialize(void) +{ + rtnl_rttable_init = 1; + rtnl_rttable_tab[255] = "local"; + rtnl_rttable_tab[254] = "main"; + rtnl_tab_initialize("/etc/iproute2/rt_tables", + rtnl_rttable_tab, 256); +} + +char * rtnl_rttable_n2a(int id, char *buf, int len) +{ + if (id<0 || id>=256) { + snprintf(buf, len, "%d", id); + return buf; + } + if (!rtnl_rttable_tab[id]) { + if (!rtnl_rttable_init) + rtnl_rttable_initialize(); + } + if (rtnl_rttable_tab[id]) + return rtnl_rttable_tab[id]; + snprintf(buf, len, "%d", id); + return buf; +} + +int rtnl_rttable_a2n(__u32 *id, char *arg) +{ + static char *cache = NULL; + static unsigned long res; + char *end; + int i; + + if (cache && strcmp(cache, arg) == 0) { + *id = res; + return 0; + } + + if (!rtnl_rttable_init) + rtnl_rttable_initialize(); + + for (i=0; i<256; i++) { + if (rtnl_rttable_tab[i] && + strcmp(rtnl_rttable_tab[i], arg) == 0) { + cache = rtnl_rttable_tab[i]; + res = i; + *id = res; + return 0; + } + } + + i = strtoul(arg, &end, 0); + if (!end || end == arg || *end || i > 255) + return -1; + *id = i; + return 0; +} + + +static char * rtnl_rtdsfield_tab[256] = { + "0", +}; + +static int rtnl_rtdsfield_init; + +static void rtnl_rtdsfield_initialize(void) +{ + rtnl_rtdsfield_init = 1; + rtnl_tab_initialize("/etc/iproute2/rt_dsfield", + rtnl_rtdsfield_tab, 256); +} + 
+char * rtnl_dsfield_n2a(int id, char *buf, int len) +{ + if (id<0 || id>=256) { + snprintf(buf, len, "%d", id); + return buf; + } + if (!rtnl_rtdsfield_tab[id]) { + if (!rtnl_rtdsfield_init) + rtnl_rtdsfield_initialize(); + } + if (rtnl_rtdsfield_tab[id]) + return rtnl_rtdsfield_tab[id]; + snprintf(buf, len, "0x%02x", id); + return buf; +} + + +int rtnl_dsfield_a2n(__u32 *id, char *arg) +{ + static char *cache = NULL; + static unsigned long res; + char *end; + int i; + + if (cache && strcmp(cache, arg) == 0) { + *id = res; + return 0; + } + + if (!rtnl_rtdsfield_init) + rtnl_rtdsfield_initialize(); + + for (i=0; i<256; i++) { + if (rtnl_rtdsfield_tab[i] && + strcmp(rtnl_rtdsfield_tab[i], arg) == 0) { + cache = rtnl_rtdsfield_tab[i]; + res = i; + *id = res; + return 0; + } + } + + res = strtoul(arg, &end, 16); + if (!end || end == arg || *end || res > 255) + return -1; + *id = res; + return 0; +} + diff --git a/lib/utils.c b/lib/utils.c index e69de29b..6763be2a 100644 --- a/lib/utils.c +++ b/lib/utils.c @@ -0,0 +1,528 @@ +/* + * utils.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
/*
 * Parse arg as an unsigned integer in the given base (0 = auto-detect,
 * as for strtoul()).
 *
 * Returns 0 and stores the value in *val, or -1 if arg is NULL or
 * empty, carries a minus sign, has trailing characters, or exceeds
 * UINT_MAX.  *val is left untouched on failure.
 */
int get_unsigned(unsigned *val, char *arg, int base)
{
	unsigned long res;
	char *ptr;
	char *p;

	if (!arg || !*arg)
		return -1;

	/*
	 * strtoul() accepts "-N" and returns the negated value, so e.g.
	 * "-18446744073709551615" would come back as 1.  Reject a minus
	 * sign after the optional leading whitespace strtoul() skips.
	 */
	for (p = arg; *p == ' ' || (*p >= '\t' && *p <= '\r'); p++)
		;
	if (*p == '-')
		return -1;

	res = strtoul(arg, &ptr, base);
	if (!ptr || ptr == arg || *ptr || res > UINT_MAX)
		return -1;
	*val = res;
	return 0;
}
base) +{ + long res; + char *ptr; + + if (!arg || !*arg) + return -1; + res = strtol(arg, &ptr, base); + if (!ptr || ptr == arg || *ptr || res > 0x7F || res < -0x80) + return -1; + *val = res; + return 0; +} + +int get_addr_1(inet_prefix *addr, char *name, int family) +{ + char *cp; + unsigned char *ap = (unsigned char*)addr->data; + int i; + + memset(addr, 0, sizeof(*addr)); + + if (strcmp(name, "default") == 0 || + strcmp(name, "all") == 0 || + strcmp(name, "any") == 0) { + if (family == AF_DECnet) + return -1; + addr->family = family; + addr->bytelen = (family == AF_INET6 ? 16 : 4); + addr->bitlen = -1; + return 0; + } + + if (strchr(name, ':')) { + addr->family = AF_INET6; + if (family != AF_UNSPEC && family != AF_INET6) + return -1; + if (inet_pton(AF_INET6, name, addr->data) <= 0) + return -1; + addr->bytelen = 16; + addr->bitlen = -1; + return 0; + } + + if (family == AF_DECnet) { + struct dn_naddr dna; + addr->family = AF_DECnet; + if (dnet_pton(AF_DECnet, name, &dna) <= 0) + return -1; + memcpy(addr->data, dna.a_addr, 2); + addr->bytelen = 2; + addr->bitlen = -1; + return 0; + } + + addr->family = AF_INET; + if (family != AF_UNSPEC && family != AF_INET) + return -1; + addr->bytelen = 4; + addr->bitlen = -1; + for (cp=name, i=0; *cp; cp++) { + if (*cp <= '9' && *cp >= '0') { + ap[i] = 10*ap[i] + (*cp-'0'); + continue; + } + if (*cp == '.' 
&& ++i <= 3) + continue; + return -1; + } + return 0; +} + +int get_prefix_1(inet_prefix *dst, char *arg, int family) +{ + int err; + unsigned plen; + char *slash; + + memset(dst, 0, sizeof(*dst)); + + if (strcmp(arg, "default") == 0 || + strcmp(arg, "any") == 0 || + strcmp(arg, "all") == 0) { + if (family == AF_DECnet) + return -1; + dst->family = family; + dst->bytelen = 0; + dst->bitlen = 0; + return 0; + } + + slash = strchr(arg, '/'); + if (slash) + *slash = 0; + err = get_addr_1(dst, arg, family); + if (err == 0) { + switch(dst->family) { + case AF_INET6: + dst->bitlen = 128; + break; + case AF_DECnet: + dst->bitlen = 16; + break; + default: + case AF_INET: + dst->bitlen = 32; + } + if (slash) { + if (get_integer(&plen, slash+1, 0) || plen > dst->bitlen) { + err = -1; + goto done; + } + dst->bitlen = plen; + } + } +done: + if (slash) + *slash = '/'; + return err; +} + +int get_addr(inet_prefix *dst, char *arg, int family) +{ + if (family == AF_PACKET) { + fprintf(stderr, "Error: \"%s\" may be inet address, but it is not allowed in this context.\n", arg); + exit(1); + } + if (get_addr_1(dst, arg, family)) { + fprintf(stderr, "Error: an inet address is expected rather than \"%s\".\n", arg); + exit(1); + } + return 0; +} + +int get_prefix(inet_prefix *dst, char *arg, int family) +{ + if (family == AF_PACKET) { + fprintf(stderr, "Error: \"%s\" may be inet prefix, but it is not allowed in this context.\n", arg); + exit(1); + } + if (get_prefix_1(dst, arg, family)) { + fprintf(stderr, "Error: an inet prefix is expected rather than \"%s\".\n", arg); + exit(1); + } + return 0; +} + +__u32 get_addr32(char *name) +{ + inet_prefix addr; + if (get_addr_1(&addr, name, AF_INET)) { + fprintf(stderr, "Error: an IP address is expected rather than \"%s\"\n", name); + exit(1); + } + return addr.data[0]; +} + +void incomplete_command() +{ + fprintf(stderr, "Command line is not complete. 
/*
 * Test whether cmd is a (possibly complete) prefix of pattern.
 *
 * Returns 0 when pattern starts with cmd, -1 when cmd is longer than
 * pattern, and otherwise the non-zero memcmp() result for the first
 * strlen(cmd) bytes.
 */
int matches(char *cmd, char *pattern)
{
	/* size_t: keeps the length untruncated and the comparison unsigned */
	size_t len = strlen(cmd);

	if (len > strlen(pattern))
		return -1;
	return memcmp(pattern, cmd, len);
}
: HZ; + + if (getenv("PROC_NET_PSCHED")) { + snprintf(name, sizeof(name)-1, "%s", getenv("PROC_NET_PSCHED")); + } else if (getenv("PROC_ROOT")) { + snprintf(name, sizeof(name)-1, "%s/net/psched", getenv("PROC_ROOT")); + } else { + strcpy(name, "/proc/net/psched"); + } + fp = fopen(name, "r"); + + if (fp) { + unsigned nom, denom; + if (fscanf(fp, "%*08x%*08x%08x%08x", &nom, &denom) == 2) + if (nom == 1000000) + hz = denom; + fclose(fp); + } + if (hz) + return hz; + return HZ; +} + +const char *rt_addr_n2a(int af, int len, void *addr, char *buf, int buflen) +{ + switch (af) { + case AF_INET: + case AF_INET6: + return inet_ntop(af, addr, buf, buflen); + case AF_IPX: + return ipx_ntop(af, addr, buf, buflen); + case AF_DECnet: + { + struct dn_naddr dna = { 2, { 0, 0, }}; + memcpy(dna.a_addr, addr, 2); + return dnet_ntop(af, &dna, buf, buflen); + } + default: + return "???"; + } +} + +#ifdef RESOLVE_HOSTNAMES +struct namerec +{ + struct namerec *next; + inet_prefix addr; + char *name; +}; + +static struct namerec *nht[256]; + +char *resolve_address(char *addr, int len, int af) +{ + struct namerec *n; + struct hostent *h_ent; + unsigned hash; + static int notfirst; + + + if (af == AF_INET6 && ((__u32*)addr)[0] == 0 && + ((__u32*)addr)[1] == 0 && ((__u32*)addr)[2] == htonl(0xffff)) { + af = AF_INET; + addr += 12; + len = 4; + } + + hash = addr[len-1] ^ addr[len-2] ^ addr[len-3] ^ addr[len-4]; + + for (n = nht[hash]; n; n = n->next) { + if (n->addr.family == af && + n->addr.bytelen == len && + memcmp(n->addr.data, addr, len) == 0) + return n->name; + } + if ((n = malloc(sizeof(*n))) == NULL) + return NULL; + n->addr.family = af; + n->addr.bytelen = len; + n->name = NULL; + memcpy(n->addr.data, addr, len); + n->next = nht[hash]; + nht[hash] = n; + if (++notfirst == 1) + sethostent(1); + fflush(stdout); + + if ((h_ent = gethostbyaddr(addr, len, af)) != NULL) + n->name = strdup(h_ent->h_name); + + /* Even if we fail, "negative" entry is remembered. 
/*
 * Render the LEN bytes at STR as lower-case two-digit hex values separated
 * by ':' (for example "00:1a:2b") into BUF, which has room for BLEN bytes.
 *
 * The result is always NUL-terminated when BLEN is positive.  If BUF is too
 * small the output is cut after the last byte that fits completely; a
 * separator is only written when the byte following it fits as well, so the
 * string never ends in a dangling ':'.  With LEN <= 0 the result is the
 * empty string.
 *
 * Returns BUF.
 */
__u8* hexstring_n2a(const __u8 *str, int len, __u8 *buf, int blen)
{
	static const char hexdigits[] = "0123456789abcdef";
	__u8 *ptr = buf;
	int i;

	if (blen <= 0)
		return buf;

	for (i = 0; i < len; i++) {
		/* Two digits plus the terminator must still fit. */
		if (blen < 3)
			break;
		*ptr++ = hexdigits[str[i] >> 4];
		*ptr++ = hexdigits[str[i] & 0x0f];
		blen -= 2;

		if (i == len - 1)
			break;
		/*
		 * The next round needs ':' + two digits + terminator.
		 * Stop here instead of emitting a separator that nothing
		 * could follow.
		 */
		if (blen < 4)
			break;
		*ptr++ = ':';
		blen--;
	}
	/* At least one byte is left at this point in every path. */
	*ptr = 0;
	return buf;
}
$(IFSTATOBJ) + $(CC) $(CFLAGS) $(LDFLAGS) -o ifstat $(IFSTATOBJ) $(LIBNETLINK) -lm + +rtacct: $(RTACCTOBJ) + $(CC) $(CFLAGS) $(LDFLAGS) -o rtacct $(RTACCTOBJ) $(LIBNETLINK) -lm + +arpd: $(ARPDOBJ) + $(CC) $(CFLAGS) $(LDFLAGS) -o arpd $(ARPDOBJ) $(LIBNETLINK) -ldb + +rtstat: $(RTSTATOBJ) + $(CC) $(CFLAGS) $(LDFLAGS) -o rtstat $(RTSTATOBJ) + +ssfilter.c: ssfilter.y + bison ssfilter.y -o ssfilter.c + +install: all + install -m 0755 -s $(TARGETS) $(DESTDIR)$(SBINDIR) + +clean: + rm -f $(ALLOBJ) $(TARGETS) ssfilter.c diff --git a/misc/arpd.c b/misc/arpd.c index e69de29b..4590dafc 100644 --- a/misc/arpd.c +++ b/misc/arpd.c @@ -0,0 +1,846 @@ +/* + * arpd.c ARP helper daemon. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <stdio.h> +#include <syslog.h> +#include <malloc.h> +#include <string.h> +#include <unistd.h> +#include <stdlib.h> +#include <netdb.h> +#include <db.h> +#include <sys/ioctl.h> +#include <sys/poll.h> +#include <errno.h> +#include <fcntl.h> +#include <sys/uio.h> +#include <sys/socket.h> +#include <sys/time.h> +#include <time.h> +#include <signal.h> +#include <linux/if.h> +#include <linux/if_arp.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <linux/if_packet.h> +#include <linux/filter.h> + +#include "libnetlink.h" +#include "utils.h" + +int resolve_hosts; + +DB *dbase; +char *dbname = "/var/lib/arpd/arpd.db"; + +int ifnum; +int *ifvec; +char **ifnames; + +struct dbkey +{ + __u32 iface; + __u32 addr; +}; + +#define IS_NEG(x) (((__u8*)(x))[0] == 0xFF) +#define NEG_TIME(x) (((x)[2]<<24)|((x)[3]<<16)|((x)[4]<<8)|(x)[5]) +#define NEG_AGE(x) ((__u32)time(NULL) - NEG_TIME((__u8*)x)) +#define NEG_VALID(x) (NEG_AGE(x) < negative_timeout) +#define NEG_CNT(x) 
/*
 * Print the command-line synopsis on stderr and terminate with status 1.
 * The option list mirrors the getopt() string used by main():
 * flags -l, -k, -h, -? and the valued options -a, -b, -f, -n, -R, -B.
 */
void usage(void)
{
	fprintf(stderr,
"Usage: arpd [ -lkh? ] [ -a N ] [ -b dbase ] [ -f file ] [ -n timeout ]\n"
"            [ -R rate ] [ -B burst ] [ interfaces ]\n");
	exit(1);
}
1 : active_probing); + fputs(buf, fp); + fclose(fp); + } + } + sysctl_adjusted = 1; +} + +void undo_sysctl_adjustments(void) +{ + int i; + + if (!sysctl_adjusted) + return; + + for (i=0; i<ifnum; i++) { + char buf[128]; + FILE *fp; + + if (active_probing) { + sprintf(buf, "/proc/sys/net/ipv4/neigh/%s/mcast_solicit", ifnames[i]); + if ((fp = fopen(buf, "w")) != NULL) { + strcpy(buf, "3\n"); + fputs(buf, fp); + fclose(fp); + } + } + sprintf(buf, "/proc/sys/net/ipv4/neigh/%s/app_solicit", ifnames[i]); + if ((fp = fopen(buf, "w")) != NULL) { + strcpy(buf, "0\n"); + fputs(buf, fp); + fclose(fp); + } + } + sysctl_adjusted = 0; +} + + +int send_probe(int ifindex, __u32 addr) +{ + struct ifreq ifr; + struct sockaddr_in dst; + int len; + unsigned char buf[256]; + struct arphdr *ah = (struct arphdr*)buf; + unsigned char *p = (unsigned char *)(ah+1); + struct sockaddr_ll sll; + + memset(&ifr, 0, sizeof(ifr)); + ifr.ifr_ifindex = ifindex; + if (ioctl(udp_sock, SIOCGIFNAME, &ifr)) + return -1; + if (ioctl(udp_sock, SIOCGIFHWADDR, &ifr)) + return -1; + if (ifr.ifr_hwaddr.sa_family != ARPHRD_ETHER) + return -1; + if (setsockopt(udp_sock, SOL_SOCKET, SO_BINDTODEVICE, ifr.ifr_name, strlen(ifr.ifr_name)+1) < 0) + return -1; + + dst.sin_family = AF_INET; + dst.sin_port = htons(1025); + dst.sin_addr.s_addr = addr; + if (connect(udp_sock, (struct sockaddr*)&dst, sizeof(dst)) < 0) + return -1; + len = sizeof(dst); + if (getsockname(udp_sock, (struct sockaddr*)&dst, &len) < 0) + return -1; + + ah->ar_hrd = htons(ifr.ifr_hwaddr.sa_family); + ah->ar_pro = htons(ETH_P_IP); + ah->ar_hln = 6; + ah->ar_pln = 4; + ah->ar_op = htons(ARPOP_REQUEST); + + memcpy(p, ifr.ifr_hwaddr.sa_data, ah->ar_hln); + p += ah->ar_hln; + + memcpy(p, &dst.sin_addr, 4); + p+=4; + + sll.sll_family = AF_PACKET; + memset(sll.sll_addr, 0xFF, sizeof(sll.sll_addr)); + sll.sll_ifindex = ifindex; + sll.sll_protocol = htons(ETH_P_ARP); + memcpy(p, &sll.sll_addr, ah->ar_hln); + p+=ah->ar_hln; + + memcpy(p, &addr, 4); + p+=4; 
/*
 * Fill the 6-byte record NDATA with a fresh negative cache entry:
 *   byte 0     0xFF  (the value IS_NEG() tests for)
 *   byte 1     0     (the counter read by NEG_CNT())
 *   bytes 2-5  STAMP, most significant byte first (decoded by NEG_TIME())
 */
void prepare_neg_entry(__u8 *ndata, __u32 stamp)
{
	int k;

	ndata[0] = 0xFF;
	ndata[1] = 0;

	/* Store the stamp from its low byte upwards, walking backwards. */
	for (k = 5; k >= 2; k--) {
		ndata[k] = stamp;
		stamp >>= 8;
	}
}
+ */ + do_sysctl_adjustments(); + return 0; + } + + if (n->nlmsg_type != RTM_GETNEIGH && n->nlmsg_type != RTM_NEWNEIGH) + return 0; + + len -= NLMSG_LENGTH(sizeof(*ndm)); + if (len < 0) + return -1; + + if (ndm->ndm_family != AF_INET || + (ifnum && !handle_if(ndm->ndm_ifindex)) || + ndm->ndm_flags || + ndm->ndm_type != RTN_UNICAST || + !(ndm->ndm_state&~NUD_NOARP)) + return 0; + + memset(tb, 0, sizeof(tb)); + parse_rtattr(tb, NDA_MAX, NDA_RTA(ndm), len); + + if (!tb[NDA_DST]) + return 0; + + key.iface = ndm->ndm_ifindex; + memcpy(&key.addr, RTA_DATA(tb[NDA_DST]), 4); + dbkey.data = &key; + dbkey.size = sizeof(key); + + if (dbase->get(dbase, &dbkey, &dbdat, 0) != 0) { + dbdat.data = 0; + dbdat.size = 0; + } + + if (n->nlmsg_type == RTM_GETNEIGH) { + if (!(n->nlmsg_flags&NLM_F_REQUEST)) + return 0; + + if (!(ndm->ndm_state&(NUD_PROBE|NUD_INCOMPLETE))) { + stats.app_bad++; + return 0; + } + + if (ndm->ndm_state&NUD_PROBE) { + /* If we get this, kernel still has some valid + * address, but unicast probing failed and host + * is either dead or changed its mac address. + * Kernel is going to initiate broadcast resolution. + * OK, we invalidate our information as well. + */ + if (dbdat.data && !IS_NEG(dbdat.data)) + stats.app_neg++; + + dbase->del(dbase, &dbkey, 0); + } else { + /* If we get this kernel does not have any information. + * If we have something tell this to kernel. */ + stats.app_recv++; + if (dbdat.data && !IS_NEG(dbdat.data)) { + stats.app_success++; + respond_to_kernel(key.iface, key.addr, dbdat.data, dbdat.size); + return 0; + } + + /* Sheeit! We have nothing to tell. */ + /* If we have recent negative entry, be silent. 
*/ + if (dbdat.data && NEG_VALID(dbdat.data)) { + if (NEG_CNT(dbdat.data) >= active_probing) { + stats.app_suppressed++; + return 0; + } + do_acct = 1; + } + } + + if (active_probing && + queue_active_probe(ndm->ndm_ifindex, key.addr) == 0 && + do_acct) { + NEG_CNT(dbdat.data)++; + dbase->put(dbase, &dbkey, &dbdat, 0); + } + } else if (n->nlmsg_type == RTM_NEWNEIGH) { + if (n->nlmsg_flags&NLM_F_REQUEST) + return 0; + + if (ndm->ndm_state&NUD_FAILED) { + /* Kernel was not able to resolve. Host is dead. + * Create negative entry if it is not present + * or renew it if it is too old. */ + if (!dbdat.data || + !IS_NEG(dbdat.data) || + !NEG_VALID(dbdat.data)) { + __u8 ndata[6]; + stats.kern_neg++; + prepare_neg_entry(ndata, time(NULL)); + dbdat.data = ndata; + dbdat.size = sizeof(ndata); + dbase->put(dbase, &dbkey, &dbdat, 0); + } + } else if (tb[NDA_LLADDR]) { + if (dbdat.data && !IS_NEG(dbdat.data)) { + if (memcmp(RTA_DATA(tb[NDA_LLADDR]), dbdat.data, dbdat.size) == 0) + return 0; + stats.kern_change++; + } else { + stats.kern_new++; + } + dbdat.data = RTA_DATA(tb[NDA_LLADDR]); + dbdat.size = RTA_PAYLOAD(tb[NDA_LLADDR]); + dbase->put(dbase, &dbkey, &dbdat, 0); + } + } + return 0; +} + +void load_initial_table(void) +{ + rtnl_wilddump_request(&rth, AF_INET, RTM_GETNEIGH); +} + +void get_kern_msg(void) +{ + int status; + struct nlmsghdr *h; + struct sockaddr_nl nladdr; + struct iovec iov; + char buf[8192]; + struct msghdr msg = { + (void*)&nladdr, sizeof(nladdr), + &iov, 1, + NULL, 0, + 0 + }; + + memset(&nladdr, 0, sizeof(nladdr)); + + iov.iov_base = buf; + iov.iov_len = sizeof(buf); + + status = recvmsg(rth.fd, &msg, MSG_DONTWAIT); + + if (status <= 0) + return; + + if (msg.msg_namelen != sizeof(nladdr)) + return; + + if (nladdr.nl_pid) + return; + + for (h = (struct nlmsghdr*)buf; status >= sizeof(*h); ) { + int len = h->nlmsg_len; + int l = len - sizeof(*h); + + if (l < 0 || len > status) + return; + + if (do_one_request(h) < 0) + return; + + status -= 
NLMSG_ALIGN(len); + h = (struct nlmsghdr*)((char*)h + NLMSG_ALIGN(len)); + } +} + +/* Receive gratuitous ARP messages and store them, that's all. */ +void get_arp_pkt(void) +{ + unsigned char buf[1024]; + struct sockaddr_ll sll; + int sll_len = sizeof(sll); + struct arphdr *a = (struct arphdr*)buf; + struct dbkey key; + DBT dbkey, dbdat; + int n; + + n = recvfrom(pset[0].fd, buf, sizeof(buf), MSG_DONTWAIT, (struct sockaddr*)&sll, &sll_len); + if (n < 0) { + if (errno != EINTR && errno != EAGAIN) + syslog(LOG_ERR, "recvfrom: %m"); + return; + } + + if (ifnum && !handle_if(sll.sll_ifindex)) + return; + + /* Sanity checks */ + + if (n < sizeof(*a) || + (a->ar_op != htons(ARPOP_REQUEST) && + a->ar_op != htons(ARPOP_REPLY)) || + a->ar_pln != 4 || + a->ar_pro != htons(ETH_P_IP) || + a->ar_hln != sll.sll_halen || + sizeof(*a) + 2*4 + 2*a->ar_hln > n) + return; + + key.iface = sll.sll_ifindex; + memcpy(&key.addr, (char*)(a+1) + a->ar_hln, 4); + + /* DAD message, ignore. */ + if (key.addr == 0) + return; + + dbkey.data = &key; + dbkey.size = sizeof(key); + + if (dbase->get(dbase, &dbkey, &dbdat, 0) == 0 && !IS_NEG(dbdat.data)) { + if (memcmp(dbdat.data, a+1, dbdat.size) == 0) + return; + stats.arp_change++; + } else { + stats.arp_new++; + } + + dbdat.data = a+1; + dbdat.size = a->ar_hln; + dbase->put(dbase, &dbkey, &dbdat, 0); +} + +void catch_signal(int sig, void (*handler)(int)) +{ + struct sigaction sa; + + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = handler; +#ifdef SA_INTERRUPT + sa.sa_flags = SA_INTERRUPT; +#endif + sigaction(sig, &sa, NULL); +} + +#include <setjmp.h> +sigjmp_buf env; +volatile int in_poll; + +void sig_exit(int signo) +{ + do_exit = 1; + if (in_poll) + siglongjmp(env, 1); +} + +void sig_sync(int signo) +{ + do_sync = 1; + if (in_poll) + siglongjmp(env, 1); +} + +void sig_stats(int signo) +{ + do_sync = 1; + do_stats = 1; + if (in_poll) + siglongjmp(env, 1); +} + +void send_stats(void) +{ + syslog(LOG_INFO, "arp_rcv: n%lu c%lu app_rcv: tot %lu hits 
%lu bad %lu neg %lu sup %lu", + stats.arp_new, stats.arp_change, + + stats.app_recv, stats.app_success, + stats.app_bad, stats.app_neg, stats.app_suppressed + ); + syslog(LOG_INFO, "kern: n%lu c%lu neg %lu arp_send: %lu rlim %lu", + stats.kern_new, stats.kern_change, stats.kern_neg, + + stats.probes_sent, stats.probes_suppressed + ); + do_stats = 0; +} + + +int main(int argc, char **argv) +{ + int opt; + int do_list = 0; + char *do_load = NULL; + + while ((opt = getopt(argc, argv, "h?b:lf:a:n:kR:B:")) != EOF) { + switch (opt) { + case 'b': + dbname = optarg; + break; + case 'f': + if (do_load) { + fprintf(stderr, "Duplicate option -f\n"); + usage(); + } + do_load = optarg; + break; + case 'l': + do_list = 1; + break; + case 'a': + active_probing = atoi(optarg); + break; + case 'n': + negative_timeout = atoi(optarg); + break; + case 'k': + no_kernel_broadcasts = 1; + break; + case 'R': + if ((broadcast_rate = atoi(optarg)) <= 0 || + (broadcast_rate = 1000/broadcast_rate) <= 0) { + fprintf(stderr, "Invalid ARP rate\n"); + exit(-1); + } + break; + case 'B': + if ((broadcast_burst = atoi(optarg)) <= 0 || + (broadcast_burst = 1000*broadcast_burst) <= 0) { + fprintf(stderr, "Invalid ARP burst\n"); + exit(-1); + } + break; + case 'h': + case '?': + default: + usage(); + } + } + argc -= optind; + argv += optind; + + if (argc > 0) { + ifnum = argc; + ifnames = argv; + ifvec = malloc(argc*sizeof(int)); + if (!ifvec) { + perror("malloc"); + exit(-1); + } + } + + if ((udp_sock = socket(AF_INET, SOCK_DGRAM, 0)) < 0) { + perror("socket"); + exit(-1); + } + + if (ifnum) { + int i; + struct ifreq ifr; + memset(&ifr, 0, sizeof(ifr)); + for (i=0; i<ifnum; i++) { + strncpy(ifr.ifr_name, ifnames[i], IFNAMSIZ); + if (ioctl(udp_sock, SIOCGIFINDEX, &ifr)) { + perror("ioctl(SIOCGIFINDEX)"); + exit(-1);; + } + ifvec[i] = ifr.ifr_ifindex; + } + } + + dbase = dbopen(dbname, O_CREAT|O_RDWR, 0644, DB_HASH, NULL); + if (dbase == NULL) { + perror("db_open"); + exit(-1); + } + + if (do_load) { + 
char buf[128]; + FILE *fp; + struct dbkey k; + DBT dbkey, dbdat; + + dbkey.data = &k; + dbkey.size = sizeof(k); + + if (strcmp(do_load, "-") == 0 || strcmp(do_load, "--") == 0) { + fp = stdin; + } else if ((fp = fopen(do_load, "r")) == NULL) { + perror("fopen"); + goto do_abort; + } + + buf[sizeof(buf)-1] = 0; + while (fgets(buf, sizeof(buf)-1, fp)) { + __u8 b1[6]; + char ipbuf[128]; + char macbuf[128]; + + if (buf[0] == '#') + continue; + + if (sscanf(buf, "%u%s%s", &k.iface, ipbuf, macbuf) != 3) { + fprintf(stderr, "Wrong format of input file \"%s\"\n", do_load); + goto do_abort; + } + if (strncmp(macbuf, "FAILED:", 7) == 0) + continue; + if (!inet_aton(ipbuf, (struct in_addr*)&k.addr)) { + fprintf(stderr, "Invalid IP address: \"%s\"\n", ipbuf); + goto do_abort; + } + dbdat.data = hexstring_a2n(macbuf, b1, 6); + if (dbdat.data == NULL) + goto do_abort; + dbdat.size = 6; + + if (dbase->put(dbase, &dbkey, &dbdat, 0)) { + perror("hash->put"); + goto do_abort; + } + } + dbase->sync(dbase, 0); + if (fp != stdin) + fclose(fp); + } + + if (do_list) { + DBT dbkey, dbdat; + printf("%-8s %-15s %s\n", "#Ifindex", "IP", "MAC"); + while (dbase->seq(dbase, &dbkey, &dbdat, R_NEXT) == 0) { + struct dbkey *key = dbkey.data; + if (handle_if(key->iface)) { + if (!IS_NEG(dbdat.data)) { + __u8 b1[18]; + printf("%-8d %-15s %s\n", + key->iface, + inet_ntoa(*(struct in_addr*)&key->addr), + hexstring_n2a(dbdat.data, 6, b1, 18)); + } else { + printf("%-8d %-15s FAILED: %dsec ago\n", + key->iface, + inet_ntoa(*(struct in_addr*)&key->addr), + NEG_AGE(dbdat.data)); + } + } + } + } + + if (do_load || do_list) + goto out; + + pset[0].fd = socket(PF_PACKET, SOCK_DGRAM, 0); + if (pset[0].fd < 0) { + perror("socket"); + exit(-1); + } + + if (1) { + struct sockaddr_ll sll; + memset(&sll, 0, sizeof(sll)); + sll.sll_family = AF_PACKET; + sll.sll_protocol = htons(ETH_P_ARP); + sll.sll_ifindex = (ifnum == 1 ? 
ifvec[0] : 0); + if (bind(pset[0].fd, (struct sockaddr*)&sll, sizeof(sll)) < 0) { + perror("bind"); + goto do_abort; + } + } + + if (rtnl_open(&rth, RTMGRP_NEIGH) < 0) { + perror("rtnl_open"); + goto do_abort; + } + pset[1].fd = rth.fd; + + load_initial_table(); + + if (1) { + int fd; + pid_t pid = fork(); + + if (pid > 0) + _exit(0); + if (pid < 0) { + perror("arpd: fork"); + goto do_abort; + } + + chdir("/"); + fd = open("/dev/null", O_RDWR); + if (fd >= 0) { + dup2(fd, 0); + dup2(fd, 1); + dup2(fd, 2); + if (fd > 2) + close(fd); + } + setsid(); + } + + openlog("arpd", LOG_PID | LOG_CONS, LOG_DAEMON); + catch_signal(SIGINT, sig_exit); + catch_signal(SIGTERM, sig_exit); + catch_signal(SIGHUP, sig_sync); + catch_signal(SIGUSR1, sig_stats); + +#define EVENTS (POLLIN|POLLPRI|POLLERR|POLLHUP) + pset[0].events = EVENTS; + pset[0].revents = 0; + pset[1].events = EVENTS; + pset[1].revents = 0; + + sigsetjmp(env, 1); + + for (;;) { + in_poll = 1; + + if (do_exit) + break; + if (do_sync) { + in_poll = 0; + dbase->sync(dbase, 0); + do_sync = 0; + in_poll = 1; + } + if (do_stats) + send_stats(); + if (poll(pset, 2, 30000) > 0) { + in_poll = 0; + if (pset[0].revents&EVENTS) + get_arp_pkt(); + if (pset[1].revents&EVENTS) + get_kern_msg(); + } else { + do_sync = 1; + } + } + + undo_sysctl_adjustments(); +out: + dbase->close(dbase); + exit(0); + +do_abort: + dbase->close(dbase); + exit(-1); +} diff --git a/misc/ifstat.c b/misc/ifstat.c index e69de29b..67489b9a 100644 --- a/misc/ifstat.c +++ b/misc/ifstat.c @@ -0,0 +1,729 @@ +/* + * ifstat.c handy utility to read net interface statistics + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <string.h> +#include <errno.h> +#include <time.h> +#include <sys/time.h> +#include <fnmatch.h> +#include <sys/file.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/poll.h> +#include <sys/wait.h> +#include <sys/stat.h> +#include <signal.h> +#include <math.h> + +#include <libnetlink.h> +#include <linux/netdevice.h> + +#include <SNAPSHOT.h> + +int dump_zeros = 0; +int reset_history = 0; +int ignore_history = 0; +int no_output = 0; +int no_update = 0; +int scan_interval = 0; +int time_constant = 0; +int show_errors = 0; +double W; +char **patterns; +int npatterns; + +char info_source[128]; +int source_mismatch; + +#define MAXS (sizeof(struct net_device_stats)/sizeof(unsigned long)) + +struct ifstat_ent +{ + struct ifstat_ent *next; + char *name; + int ifindex; + unsigned long long val[MAXS]; + double rate[MAXS]; + unsigned long ival[MAXS]; +}; + +struct ifstat_ent *kern_db; +struct ifstat_ent *hist_db; + +int match(char *id) +{ + int i; + + if (npatterns == 0) + return 1; + + for (i=0; i<npatterns; i++) { + if (!fnmatch(patterns[i], id, 0)) + return 1; + } + return 0; +} + +int get_nlmsg(struct sockaddr_nl *who, struct nlmsghdr *m, void *arg) +{ + struct ifinfomsg *ifi = NLMSG_DATA(m); + struct rtattr * tb[IFLA_MAX+1]; + int len = m->nlmsg_len; + struct ifstat_ent *n; + int i; + + if (m->nlmsg_type != RTM_NEWLINK) + return 0; + + len -= NLMSG_LENGTH(sizeof(*ifi)); + if (len < 0) + return -1; + + if (!(ifi->ifi_flags&IFF_UP)) + return 0; + + memset(tb, 0, sizeof(tb)); + parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len); + if (tb[IFLA_IFNAME] == NULL || tb[IFLA_STATS] == NULL) + return 0; + + n = malloc(sizeof(*n)); + if (!n) + abort(); + n->ifindex = ifi->ifi_index; + n->name = strdup(RTA_DATA(tb[IFLA_IFNAME])); + memcpy(&n->ival, RTA_DATA(tb[IFLA_STATS]), sizeof(n->ival)); + memset(&n->rate, 0, 
sizeof(n->rate)); + for (i=0; i<MAXS; i++) + n->val[i] = n->ival[i]; + n->next = kern_db; + kern_db = n; + return 0; +} + +void load_info(void) +{ + struct ifstat_ent *db, *n; + struct rtnl_handle rth; + + if (rtnl_open(&rth, 0) < 0) + exit(1); + + if (rtnl_wilddump_request(&rth, AF_INET, RTM_GETLINK) < 0) { + perror("Cannot send dump request"); + exit(1); + } + + if (rtnl_dump_filter(&rth, get_nlmsg, NULL, NULL, NULL) < 0) { + fprintf(stderr, "Dump terminated\n"); + exit(1); + } + + rtnl_close(&rth); + + db = kern_db; + kern_db = NULL; + + while (db) { + n = db; + db = db->next; + n->next = kern_db; + kern_db = n; + } +} + +void load_raw_table(FILE *fp) +{ + char buf[4096]; + struct ifstat_ent *db = NULL; + struct ifstat_ent *n; + + while (fgets(buf, sizeof(buf), fp) != NULL) { + char *p; + char *next; + int i; + + if (buf[0] == '#') { + buf[strlen(buf)-1] = 0; + if (info_source[0] && strcmp(info_source, buf+1)) + source_mismatch = 1; + strncpy(info_source, buf+1, sizeof(info_source)-1); + continue; + } + if ((n = malloc(sizeof(*n))) == NULL) + abort(); + + if (!(p = strchr(buf, ' '))) + abort(); + *p++ = 0; + + if (sscanf(buf, "%d", &n->ifindex) != 1) + abort(); + if (!(next = strchr(p, ' '))) + abort(); + *next++ = 0; + + n->name = strdup(p); + p = next; + + for (i=0; i<MAXS; i++) { + unsigned rate; + if (!(next = strchr(p, ' '))) + abort(); + *next++ = 0; + if (sscanf(p, "%llu", n->val+i) != 1) + abort(); + n->ival[i] = (unsigned long)n->val[i]; + p = next; + if (!(next = strchr(p, ' '))) + abort(); + *next++ = 0; + if (sscanf(p, "%u", &rate) != 1) + abort(); + n->rate[i] = rate; + p = next; + } + n->next = db; + db = n; + } + + while (db) { + n = db; + db = db->next; + n->next = kern_db; + kern_db = n; + } +} + +void dump_raw_db(FILE *fp, int to_hist) +{ + struct ifstat_ent *n, *h; + h = hist_db; + fprintf(fp, "#%s\n", info_source); + + for (n=kern_db; n; n=n->next) { + int i; + unsigned long long *vals = n->val; + double *rates = n->rate; + if 
/*
 * Print one "counter rate" cell to FP for index I.
 *
 * The counter vals[I] is printed right-aligned: divided by 1024*1024 with
 * an 'M' suffix when above 1024*1024*1024, divided by 1024 with a 'K'
 * suffix when above 1024*1024, otherwise as is.  The rate rates[I] follows
 * left-aligned in six columns, scaled to 'M' above 1024*1024 and to 'K'
 * above 1024, and truncated to an unsigned integer.  Each part is followed
 * by one space.
 */
void format_rate(FILE *fp, unsigned long long *vals, double *rates, int i)
{
	const unsigned long long count = vals[i];
	const double rate = rates[i];
	char scaled[64];

	if (count > 1024*1024*1024) {
		fprintf(fp, "%7lluM ", count/(1024*1024));
	} else if (count > 1024*1024) {
		fprintf(fp, "%7lluK ", count/1024);
	} else {
		fprintf(fp, "%8llu ", count);
	}

	/* Small rates need no suffix and no temporary string. */
	if (!(rate > 1024)) {
		fprintf(fp, "%-6u ", (unsigned)rate);
		return;
	}

	if (rate > 1024*1024)
		sprintf(scaled, "%uM", (unsigned)(rate/(1024*1024)));
	else
		sprintf(scaled, "%uK", (unsigned)(rate/1024));
	fprintf(fp, "%-6s ", scaled);
}
", "RX Over", "Rate"); + fprintf(fp, "%8s/%-6s\n","TX Coll", "Rate"); + } else { + fprintf(fp, "%-15s ", ""); + fprintf(fp, "%8s/%-6s ", "RX Errs", "Rate"); + fprintf(fp, "%8s/%-6s ", "RX Drop", "Rate"); + fprintf(fp, "%8s/%-6s ", "RX Over", "Rate"); + fprintf(fp, "%8s/%-6s\n","RX Leng", "Rate"); + + fprintf(fp, "%-15s ", ""); + fprintf(fp, "%8s/%-6s ", "RX Crc", "Rate"); + fprintf(fp, "%8s/%-6s ", "RX Frm", "Rate"); + fprintf(fp, "%8s/%-6s ", "RX Fifo", "Rate"); + fprintf(fp, "%8s/%-6s\n","RX Miss", "Rate"); + + fprintf(fp, "%-15s ", ""); + fprintf(fp, "%8s/%-6s ", "TX Errs", "Rate"); + fprintf(fp, "%8s/%-6s ", "TX Drop", "Rate"); + fprintf(fp, "%8s/%-6s ", "TX Coll", "Rate"); + fprintf(fp, "%8s/%-6s\n","TX Carr", "Rate"); + + fprintf(fp, "%-15s ", ""); + fprintf(fp, "%8s/%-6s ", "TX Abrt", "Rate"); + fprintf(fp, "%8s/%-6s ", "TX Fifo", "Rate"); + fprintf(fp, "%8s/%-6s ", "TX Hear", "Rate"); + fprintf(fp, "%8s/%-6s\n","TX Wind", "Rate"); + } +} + +void print_one_if(FILE *fp, struct ifstat_ent *n, unsigned long long *vals) +{ + int i; + fprintf(fp, "%-15s ", n->name); + for (i=0; i<4; i++) + format_rate(fp, vals, n->rate, i); + fprintf(fp, "\n"); + + if (!show_errors) { + fprintf(fp, "%-15s ", ""); + format_pair(fp, vals, 4, 6); + format_pair(fp, vals, 5, 7); + format_rate(fp, vals, n->rate, 11); + format_rate(fp, vals, n->rate, 9); + fprintf(fp, "\n"); + } else { + fprintf(fp, "%-15s ", ""); + format_rate(fp, vals, n->rate, 4); + format_rate(fp, vals, n->rate, 6); + format_rate(fp, vals, n->rate, 11); + format_rate(fp, vals, n->rate, 10); + fprintf(fp, "\n"); + + fprintf(fp, "%-15s ", ""); + format_rate(fp, vals, n->rate, 12); + format_rate(fp, vals, n->rate, 13); + format_rate(fp, vals, n->rate, 14); + format_rate(fp, vals, n->rate, 15); + fprintf(fp, "\n"); + + fprintf(fp, "%-15s ", ""); + format_rate(fp, vals, n->rate, 5); + format_rate(fp, vals, n->rate, 7); + format_rate(fp, vals, n->rate, 9); + format_rate(fp, vals, n->rate, 17); + fprintf(fp, "\n"); + + 
fprintf(fp, "%-15s ", ""); + format_rate(fp, vals, n->rate, 16); + format_rate(fp, vals, n->rate, 18); + format_rate(fp, vals, n->rate, 19); + format_rate(fp, vals, n->rate, 20); + fprintf(fp, "\n"); + } +} + + +void dump_kern_db(FILE *fp) +{ + struct ifstat_ent *n, *h; + h = hist_db; + + print_head(fp); + + for (n=kern_db; n; n=n->next) { + if (!match(n->name)) + continue; + print_one_if(fp, n, n->val); + } +} + + +void dump_incr_db(FILE *fp) +{ + struct ifstat_ent *n, *h; + h = hist_db; + + print_head(fp); + + for (n=kern_db; n; n=n->next) { + int i; + unsigned long long vals[MAXS]; + struct ifstat_ent *h1; + + memcpy(vals, n->val, sizeof(vals)); + + for (h1 = h; h1; h1 = h1->next) { + if (h1->ifindex == n->ifindex) { + for (i = 0; i < MAXS; i++) + vals[i] -= h1->val[i]; + h = h1->next; + break; + } + } + if (!match(n->name)) + continue; + print_one_if(fp, n, vals); + } +} + + +static int children; + +void sigchild(int signo) +{ +} + +void update_db(int interval) +{ + struct ifstat_ent *n, *h; + + n = kern_db; + kern_db = NULL; + + load_info(); + + h = kern_db; + kern_db = n; + + for (n = kern_db; n; n = n->next) { + struct ifstat_ent *h1; + for (h1 = h; h1; h1 = h1->next) { + if (h1->ifindex == n->ifindex) { + int i; + for (i = 0; i < MAXS; i++) { + if ((long)(h1->ival[i] - n->ival[i]) < 0) { + memset(n->ival, 0, sizeof(n->ival)); + break; + } + } + for (i = 0; i < MAXS; i++) { + double sample; + unsigned long incr = h1->ival[i] - n->ival[i]; + n->val[i] += incr; + n->ival[i] = h1->ival[i]; + sample = (double)(incr*1000)/interval; + if (interval >= scan_interval) { + n->rate[i] += W*(sample-n->rate[i]); + } else if (interval >= 1000) { + if (interval >= time_constant) { + n->rate[i] = sample; + } else { + double w = W*(double)interval/scan_interval; + n->rate[i] += w*(sample-n->rate[i]); + } + } + } + + while (h != h1) { + struct ifstat_ent *tmp = h; + h = h->next; + free(tmp->name); + free(tmp); + }; + h = h1->next; + free(h1->name); + free(h1); + break; + } + 
} + } +} + +#define T_DIFF(a,b) (((a).tv_sec-(b).tv_sec)*1000 + ((a).tv_usec-(b).tv_usec)/1000) + + +void server_loop(int fd) +{ + struct timeval snaptime; + struct pollfd p; + p.fd = fd; + p.events = p.revents = POLLIN; + + sprintf(info_source, "%d.%lu sampling_interval=%d time_const=%d", + getpid(), (unsigned long)random(), scan_interval/1000, time_constant/1000); + + load_info(); + + for (;;) { + int status; + int tdiff; + struct timeval now; + gettimeofday(&now, NULL); + tdiff = T_DIFF(now, snaptime); + if (tdiff >= scan_interval) { + update_db(tdiff); + snaptime = now; + tdiff = 0; + } + if (poll(&p, 1, tdiff + scan_interval) > 0 + && (p.revents&POLLIN)) { + int clnt = accept(fd, NULL, NULL); + if (clnt >= 0) { + pid_t pid; + if (children >= 5) { + close(clnt); + } else if ((pid = fork()) != 0) { + if (pid>0) + children++; + close(clnt); + } else { + FILE *fp = fdopen(clnt, "w"); + if (fp) { + if (tdiff > 0) + update_db(tdiff); + dump_raw_db(fp, 0); + } + exit(0); + } + } + } + while (children && waitpid(-1, &status, WNOHANG) > 0) + children--; + } +} + +int verify_forging(int fd) +{ + struct ucred cred; + int olen = sizeof(cred); + if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, (void*)&cred, &olen) || + olen < sizeof(cred)) + return -1; + if (cred.uid == getuid() || cred.uid == 0) + return 0; + return -1; +} + +static void usage(void) __attribute__((noreturn)); + +static void usage(void) +{ + fprintf(stderr, +"Usage: ifstat [ -h?vVzrnasd:t: ] [ PATTERN [ PATTERN ] ]\n" + ); + exit(-1); +} + + +int main(int argc, char *argv[]) +{ + char hist_name[128]; + struct sockaddr_un sun; + FILE *hist_fp = NULL; + int ch; + int fd; + + while ((ch = getopt(argc, argv, "h?vVzrnasd:t:e")) != EOF) { + switch(ch) { + case 'z': + dump_zeros = 1; + break; + case 'r': + reset_history = 1; + break; + case 'a': + ignore_history = 1; + break; + case 's': + no_update = 1; + break; + case 'n': + no_output = 1; + break; + case 'e': + show_errors = 1; + break; + case 'd': + scan_interval 
= 1000*atoi(optarg); + break; + case 't': + if (sscanf(optarg, "%d", &time_constant) != 1 || + time_constant <= 0) { + fprintf(stderr, "ifstat: invalid time constant divisor\n"); + exit(-1); + } + break; + case 'v': + case 'V': + printf("ifstat utility, iproute2-ss%s\n", SNAPSHOT); + exit(0); + case 'h': + case '?': + default: + usage(); + } + } + + argc -= optind; + argv += optind; + + sun.sun_family = AF_UNIX; + sun.sun_path[0] = 0; + sprintf(sun.sun_path+1, "ifstat%d", getuid()); + + if (scan_interval > 0) { + if (time_constant == 0) + time_constant = 60; + time_constant *= 1000; + W = 1 - 1/exp(log(10)*(double)scan_interval/time_constant); + if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { + perror("ifstat: socket"); + exit(-1); + } + if (bind(fd, (struct sockaddr*)&sun, 2+1+strlen(sun.sun_path+1)) < 0) { + perror("ifstat: bind"); + exit(-1); + } + if (listen(fd, 5) < 0) { + perror("ifstat: listen"); + exit(-1); + } + if (fork()) + exit(0); + chdir("/"); + close(0); close(1); close(2); setsid(); + signal(SIGPIPE, SIG_IGN); + signal(SIGCHLD, sigchild); + server_loop(fd); + exit(0); + } + + patterns = argv; + npatterns = argc; + + if (getenv("IFSTAT_HISTORY")) + snprintf(hist_name, sizeof(hist_name), getenv("IFSTAT_HISTORY")); + else + sprintf(hist_name, "/tmp/.ifstat.u%d", getuid()); + + if (reset_history) + unlink(hist_name); + + if (!ignore_history || !no_update) { + struct stat stb; + + fd = open(hist_name, O_RDWR|O_CREAT|O_NOFOLLOW, 0600); + if (fd < 0) { + perror("ifstat: open history file"); + exit(-1); + } + if ((hist_fp = fdopen(fd, "r+")) == NULL) { + perror("ifstat: fdopen history file"); + exit(-1); + } + if (flock(fileno(hist_fp), LOCK_EX)) { + perror("ifstat: flock history file"); + exit(-1); + } + if (fstat(fileno(hist_fp), &stb) != 0) { + perror("ifstat: fstat history file"); + exit(-1); + } + if (stb.st_nlink != 1 || stb.st_uid != getuid()) { + fprintf(stderr, "ifstat: something is so wrong with history file, that I prefer not to proceed.\n"); + 
exit(-1); + } + if (!ignore_history) { + FILE *tfp; + long uptime; + if ((tfp = fopen("/proc/uptime", "r")) != NULL) { + if (fscanf(tfp, "%ld", &uptime) != 1) + uptime = -1; + fclose(tfp); + } + if (uptime >= 0 && time(NULL) >= stb.st_mtime+uptime) { + fprintf(stderr, "ifstat: history is aged out, resetting\n"); + ftruncate(fileno(hist_fp), 0); + } + } + + load_raw_table(hist_fp); + + hist_db = kern_db; + kern_db = NULL; + } + + if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) >= 0 && + (connect(fd, (struct sockaddr*)&sun, 2+1+strlen(sun.sun_path+1)) == 0 + || (strcpy(sun.sun_path+1, "ifstat0"), + connect(fd, (struct sockaddr*)&sun, 2+1+strlen(sun.sun_path+1)) == 0)) + && verify_forging(fd) == 0) { + FILE *sfp = fdopen(fd, "r"); + load_raw_table(sfp); + if (hist_db && source_mismatch) { + fprintf(stderr, "ifstat: history is stale, ignoring it.\n"); + hist_db = NULL; + } + fclose(sfp); + } else { + if (fd >= 0) + close(fd); + if (hist_db && info_source[0] && strcmp(info_source, "kernel")) { + fprintf(stderr, "ifstat: history is stale, ignoring it.\n"); + hist_db = NULL; + info_source[0] = 0; + } + load_info(); + if (info_source[0] == 0) + strcpy(info_source, "kernel"); + } + + if (!no_output) { + if (ignore_history || hist_db == NULL) + dump_kern_db(stdout); + else + dump_incr_db(stdout); + } + if (!no_update) { + ftruncate(fileno(hist_fp), 0); + rewind(hist_fp); + dump_raw_db(hist_fp, 1); + fflush(hist_fp); + } + exit(0); +} diff --git a/misc/netbug b/misc/netbug index e69de29b..6d13c8ee 100644 --- a/misc/netbug +++ b/misc/netbug @@ -0,0 +1,53 @@ +#! 
/bin/bash + +echo -n "Send network configuration summary to [ENTER means kuznet@ms2.inr.ac.ru] " +IFS="" read mail || exit 1 +[ -z "$mail" ] && mail=kuznet@ms2.inr.ac.ru + + +netbug="" +while [ "$netbug" = "" ]; do + netbug=`echo netbug.$$.$RANDOM` + if [ -e /tmp/$netbug ]; then + netbug="" + fi +done + +tmppath=/tmp/$netbug + +trap "rm -rf $tmppath $tmppath.tar.gz" 0 SIGINT + +mkdir $tmppath +mkdir $tmppath/net + +cat /proc/slabinfo > $tmppath/slabinfo +cat /proc/net/netstat > $tmppath/net/netstat +cat /proc/net/unix > $tmppath/net/unix +cat /proc/net/packet > $tmppath/net/packet +cat /proc/net/netlink > $tmppath/net/netlink +cat /proc/net/psched > $tmppath/net/psched +cat /proc/net/softnet_stat > $tmppath/net/softnet_stat +cat /proc/net/sockstat > $tmppath/net/sockstat +cat /proc/net/tcp > $tmppath/net/tcp +cat /proc/net/udp > $tmppath/net/udp +cat /proc/net/raw > $tmppath/net/raw +cat /proc/net/snmp > $tmppath/net/snmp + +ss -aioem -D $tmppath/tcpdiag + +if [ -e /proc/net/tcp6 ]; then + cat /proc/net/sockstat6 > $tmppath/net/sockstat6 + cat /proc/net/tcp6 > $tmppath/net/tcp6 + cat /proc/net/udp6 > $tmppath/net/udp6 + cat /proc/net/raw6 > $tmppath/net/raw6 + cat /proc/net/snmp6 > $tmppath/net/snmp6 +fi + +cd /tmp +tar c $netbug | gzip -9c > $netbug.tar.gz + +uuencode $netbug.tar.gz $netbug.tar.gz | mail -s $netbug "$mail" + +echo "Sending to <$mail>; subject is $netbug" + +exit 0 diff --git a/misc/nstat.c b/misc/nstat.c index e69de29b..9580ccf3 100644 --- a/misc/nstat.c +++ b/misc/nstat.c @@ -0,0 +1,614 @@ +/* + * nstat.c handy utility to read counters /proc/net/netstat and snmp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <string.h> +#include <errno.h> +#include <time.h> +#include <sys/time.h> +#include <fnmatch.h> +#include <sys/file.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/poll.h> +#include <sys/wait.h> +#include <sys/stat.h> +#include <signal.h> +#include <math.h> + +#include <SNAPSHOT.h> + +int dump_zeros = 0; +int reset_history = 0; +int ignore_history = 0; +int no_output = 0; +int no_update = 0; +int scan_interval = 0; +int time_constant = 0; +double W; +char **patterns; +int npatterns; + +char info_source[128]; +int source_mismatch; + +int generic_proc_open(char *env, char *name) +{ + char store[128]; + char *p = getenv(env); + if (!p) { + p = getenv("PROC_ROOT") ? : "/proc"; + snprintf(store, sizeof(store)-1, "%s/%s", p, name); + p = store; + } + return open(store, O_RDONLY); +} + +int net_netstat_open(void) +{ + return generic_proc_open("PROC_NET_NETSTAT", "net/netstat"); +} + +int net_snmp_open(void) +{ + return generic_proc_open("PROC_NET_SNMP", "net/snmp"); +} + +int net_snmp6_open(void) +{ + return generic_proc_open("PROC_NET_SNMP6", "net/snmp6"); +} + +struct nstat_ent +{ + struct nstat_ent *next; + char *id; + unsigned long long val; + unsigned long ival; + double rate; +}; + +struct nstat_ent *kern_db; +struct nstat_ent *hist_db; + +char *useless_numbers[] = { +"IpForwarding", "IpDefaultTTL", +"TcpRtoAlgorithm", "TcpRtoMin", "TcpRtoMax", +"TcpMaxConn", "TcpCurrEstab" +}; + +int useless_number(char *id) +{ + int i; + for (i=0; i<sizeof(useless_numbers)/sizeof(*useless_numbers); i++) + if (strcmp(id, useless_numbers[i]) == 0) + return 1; + return 0; +} + +int match(char *id) +{ + int i; + + if (npatterns == 0) + return 1; + + for (i=0; i<npatterns; i++) { + if (!fnmatch(patterns[i], id, 0)) + return 1; + } + return 0; +} + +void load_good_table(FILE *fp) +{ + char buf[4096]; + struct 
nstat_ent *db = NULL; + struct nstat_ent *n; + + while (fgets(buf, sizeof(buf), fp) != NULL) { + int nr; + unsigned long long val; + double rate; + char idbuf[256]; + if (buf[0] == '#') { + buf[strlen(buf)-1] = 0; + if (info_source[0] && strcmp(info_source, buf+1)) + source_mismatch = 1; + strncpy(info_source, buf+1, sizeof(info_source)-1); + continue; + } + nr = sscanf(buf, "%s%llu%lg", idbuf, &val, &rate); + if (nr < 2) + abort(); + if (nr < 3) + rate = 0; + if (useless_number(idbuf)) + continue; + if ((n = malloc(sizeof(*n))) == NULL) + abort(); + n->id = strdup(idbuf); + n->ival = (unsigned long)val; + n->val = val; + n->rate = rate; + n->next = db; + db = n; + } + + while (db) { + n = db; + db = db->next; + n->next = kern_db; + kern_db = n; + } +} + + +void load_ugly_table(FILE *fp) +{ + char buf[4096]; + struct nstat_ent *db = NULL; + struct nstat_ent *n; + + while (fgets(buf, sizeof(buf), fp) != NULL) { + char idbuf[256]; + int off; + char *p; + + p = strchr(buf, ':'); + if (!p) + abort(); + *p = 0; + strcpy(idbuf, buf); + off = strlen(idbuf); + p += 2; + + while (*p) { + char *next; + if ((next = strchr(p, ' ')) != NULL) + *next++ = 0; + else if ((next = strchr(p, '\n')) != NULL) + *next++ = 0; + strcpy(idbuf+off, p); + n = malloc(sizeof(*n)); + if (!n) + abort(); + n->id = strdup(idbuf); + n->rate = 0; + n->next = db; + db = n; + p = next; + } + n = db; + if (fgets(buf, sizeof(buf), fp) == NULL) + abort(); + do { + p = strrchr(buf, ' '); + if (!p) + abort(); + *p = 0; + if (sscanf(p+1, "%lu", &n->ival) != 1) + abort(); + n->val = n->ival; + /* Trick to skip "dummy" trailing ICMP MIB in 2.4 */ + if (strcmp(idbuf, "IcmpOutAddrMaskReps") == 0) + idbuf[5] = 0; + else + n = n->next; + } while (p > buf + off + 2); + } + + while (db) { + n = db; + db = db->next; + if (useless_number(n->id)) { + free(n->id); + free(n); + } else { + n->next = kern_db; + kern_db = n; + } + } +} + +void load_snmp(void) +{ + FILE *fp = fdopen(net_snmp_open(), "r"); + if (fp) { + 
load_ugly_table(fp); + fclose(fp); + } +} + +void load_snmp6(void) +{ + FILE *fp = fdopen(net_snmp6_open(), "r"); + if (fp) { + load_good_table(fp); + fclose(fp); + } +} + +void load_netstat(void) +{ + FILE *fp = fdopen(net_netstat_open(), "r"); + if (fp) { + load_ugly_table(fp); + fclose(fp); + } +} + +void dump_kern_db(FILE *fp, int to_hist) +{ + struct nstat_ent *n, *h; + h = hist_db; + fprintf(fp, "#%s\n", info_source); + for (n=kern_db; n; n=n->next) { + unsigned long long val = n->val; + if (!dump_zeros && !val && !n->rate) + continue; + if (!match(n->id)) { + struct nstat_ent *h1; + if (!to_hist) + continue; + for (h1 = h; h1; h1 = h1->next) { + if (strcmp(h1->id, n->id) == 0) { + val = h1->val; + h = h1->next; + break; + } + } + } + fprintf(fp, "%-32s%-16llu%6.1f\n", n->id, val, n->rate); + } +} + +void dump_incr_db(FILE *fp) +{ + struct nstat_ent *n, *h; + h = hist_db; + fprintf(fp, "#%s\n", info_source); + for (n=kern_db; n; n=n->next) { + int ovfl = 0; + unsigned long long val = n->val; + struct nstat_ent *h1; + for (h1 = h; h1; h1 = h1->next) { + if (strcmp(h1->id, n->id) == 0) { + if (val < h1->val) { + ovfl = 1; + val = h1->val; + } + val -= h1->val; + h = h1->next; + break; + } + } + if (!dump_zeros && !val && !n->rate) + continue; + if (!match(n->id)) + continue; + fprintf(fp, "%-32s%-16llu%6.1f%s\n", n->id, val, + n->rate, ovfl?" 
(overflow)":""); + } +} + +static int children; + +void sigchild(int signo) +{ +} + +void update_db(int interval) +{ + struct nstat_ent *n, *h; + + n = kern_db; + kern_db = NULL; + + load_netstat(); + load_snmp6(); + load_snmp(); + + h = kern_db; + kern_db = n; + + for (n = kern_db; n; n = n->next) { + struct nstat_ent *h1; + for (h1 = h; h1; h1 = h1->next) { + if (strcmp(h1->id, n->id) == 0) { + double sample; + unsigned long incr = h1->ival - n->ival; + n->val += incr; + n->ival = h1->ival; + sample = (double)(incr*1000)/interval; + if (interval >= scan_interval) { + n->rate += W*(sample-n->rate); + } else if (interval >= 1000) { + if (interval >= time_constant) { + n->rate = sample; + } else { + double w = W*(double)interval/scan_interval; + n->rate += w*(sample-n->rate); + } + } + + while (h != h1) { + struct nstat_ent *tmp = h; + h = h->next; + free(tmp->id); + free(tmp); + }; + h = h1->next; + free(h1->id); + free(h1); + break; + } + } + } +} + +#define T_DIFF(a,b) (((a).tv_sec-(b).tv_sec)*1000 + ((a).tv_usec-(b).tv_usec)/1000) + + +void server_loop(int fd) +{ + struct timeval snaptime; + struct pollfd p; + p.fd = fd; + p.events = p.revents = POLLIN; + + sprintf(info_source, "%d.%lu sampling_interval=%d time_const=%d", + getpid(), (unsigned long)random(), scan_interval/1000, time_constant/1000); + + load_netstat(); + load_snmp6(); + load_snmp(); + + for (;;) { + int status; + int tdiff; + struct timeval now; + gettimeofday(&now, NULL); + tdiff = T_DIFF(now, snaptime); + if (tdiff >= scan_interval) { + update_db(tdiff); + snaptime = now; + tdiff = 0; + } + if (poll(&p, 1, tdiff + scan_interval) > 0 + && (p.revents&POLLIN)) { + int clnt = accept(fd, NULL, NULL); + if (clnt >= 0) { + pid_t pid; + if (children >= 5) { + close(clnt); + } else if ((pid = fork()) != 0) { + if (pid>0) + children++; + close(clnt); + } else { + FILE *fp = fdopen(clnt, "w"); + if (fp) { + if (tdiff > 0) + update_db(tdiff); + dump_kern_db(fp, 0); + } + exit(0); + } + } + } + while 
(children && waitpid(-1, &status, WNOHANG) > 0) + children--; + } +} + +int verify_forging(int fd) +{ + struct ucred cred; + int olen = sizeof(cred); + if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, (void*)&cred, &olen) || + olen < sizeof(cred)) + return -1; + if (cred.uid == getuid() || cred.uid == 0) + return 0; + return -1; +} + +static void usage(void) __attribute__((noreturn)); + +static void usage(void) +{ + fprintf(stderr, +"Usage: nstat [ -h?vVzrnasd:t: ] [ PATTERN [ PATTERN ] ]\n" + ); + exit(-1); +} + + +int main(int argc, char *argv[]) +{ + char hist_name[128]; + struct sockaddr_un sun; + FILE *hist_fp = NULL; + int ch; + int fd; + + while ((ch = getopt(argc, argv, "h?vVzrnasd:t:")) != EOF) { + switch(ch) { + case 'z': + dump_zeros = 1; + break; + case 'r': + reset_history = 1; + break; + case 'a': + ignore_history = 1; + break; + case 's': + no_update = 1; + break; + case 'n': + no_output = 1; + break; + case 'd': + scan_interval = 1000*atoi(optarg); + break; + case 't': + if (sscanf(optarg, "%d", &time_constant) != 1 || + time_constant <= 0) { + fprintf(stderr, "nstat: invalid time constant divisor\n"); + exit(-1); + } + break; + case 'v': + case 'V': + printf("nstat utility, iproute2-ss%s\n", SNAPSHOT); + exit(0); + case 'h': + case '?': + default: + usage(); + } + } + + argc -= optind; + argv += optind; + + sun.sun_family = AF_UNIX; + sun.sun_path[0] = 0; + sprintf(sun.sun_path+1, "nstat%d", getuid()); + + if (scan_interval > 0) { + if (time_constant == 0) + time_constant = 60; + time_constant *= 1000; + W = 1 - 1/exp(log(10)*(double)scan_interval/time_constant); + if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { + perror("nstat: socket"); + exit(-1); + } + if (bind(fd, (struct sockaddr*)&sun, 2+1+strlen(sun.sun_path+1)) < 0) { + perror("nstat: bind"); + exit(-1); + } + if (listen(fd, 5) < 0) { + perror("nstat: listen"); + exit(-1); + } + if (fork()) + exit(0); + chdir("/"); + close(0); close(1); close(2); setsid(); + signal(SIGPIPE, SIG_IGN); + 
signal(SIGCHLD, sigchild); + server_loop(fd); + exit(0); + } + + patterns = argv; + npatterns = argc; + + if (getenv("NSTAT_HISTORY")) + snprintf(hist_name, sizeof(hist_name), getenv("NSTAT_HISTORY")); + else + sprintf(hist_name, "/tmp/.nstat.u%d", getuid()); + + if (reset_history) + unlink(hist_name); + + if (!ignore_history || !no_update) { + struct stat stb; + + fd = open(hist_name, O_RDWR|O_CREAT|O_NOFOLLOW, 0600); + if (fd < 0) { + perror("nstat: open history file"); + exit(-1); + } + if ((hist_fp = fdopen(fd, "r+")) == NULL) { + perror("nstat: fdopen history file"); + exit(-1); + } + if (flock(fileno(hist_fp), LOCK_EX)) { + perror("nstat: flock history file"); + exit(-1); + } + if (fstat(fileno(hist_fp), &stb) != 0) { + perror("nstat: fstat history file"); + exit(-1); + } + if (stb.st_nlink != 1 || stb.st_uid != getuid()) { + fprintf(stderr, "nstat: something is so wrong with history file, that I prefer not to proceed.\n"); + exit(-1); + } + if (!ignore_history) { + FILE *tfp; + long uptime; + if ((tfp = fopen("/proc/uptime", "r")) != NULL) { + if (fscanf(tfp, "%ld", &uptime) != 1) + uptime = -1; + fclose(tfp); + } + if (uptime >= 0 && time(NULL) >= stb.st_mtime+uptime) { + fprintf(stderr, "nstat: history is aged out, resetting\n"); + ftruncate(fileno(hist_fp), 0); + } + } + + load_good_table(hist_fp); + + hist_db = kern_db; + kern_db = NULL; + } + + if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) >= 0 && + (connect(fd, (struct sockaddr*)&sun, 2+1+strlen(sun.sun_path+1)) == 0 + || (strcpy(sun.sun_path+1, "nstat0"), + connect(fd, (struct sockaddr*)&sun, 2+1+strlen(sun.sun_path+1)) == 0)) + && verify_forging(fd) == 0) { + FILE *sfp = fdopen(fd, "r"); + load_good_table(sfp); + if (hist_db && source_mismatch) { + fprintf(stderr, "nstat: history is stale, ignoring it.\n"); + hist_db = NULL; + } + fclose(sfp); + } else { + if (fd >= 0) + close(fd); + if (hist_db && info_source[0] && strcmp(info_source, "kernel")) { + fprintf(stderr, "nstat: history is stale, ignoring 
it.\n"); + hist_db = NULL; + info_source[0] = 0; + } + load_netstat(); + load_snmp6(); + load_snmp(); + if (info_source[0] == 0) + strcpy(info_source, "kernel"); + } + + if (!no_output) { + if (ignore_history || hist_db == NULL) + dump_kern_db(stdout, 0); + else + dump_incr_db(stdout); + } + if (!no_update) { + ftruncate(fileno(hist_fp), 0); + rewind(hist_fp); + dump_kern_db(hist_fp, 1); + fflush(hist_fp); + } + exit(0); +} diff --git a/misc/rtacct.c b/misc/rtacct.c index e69de29b..5c6748b9 100644 --- a/misc/rtacct.c +++ b/misc/rtacct.c @@ -0,0 +1,625 @@ +/* + * rtacct.c Applet to display contents of /proc/net/rt_acct. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <string.h> +#include <errno.h> +#include <time.h> +#include <sys/time.h> +#include <fnmatch.h> +#include <sys/file.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/poll.h> +#include <sys/wait.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <signal.h> +#include <math.h> + +#include "rt_names.h" + +#include <SNAPSHOT.h> + +int reset_history = 0; +int ignore_history = 0; +int no_output = 0; +int no_update = 0; +int scan_interval = 0; +int time_constant = 0; +int dump_zeros = 0; +unsigned long magic_number = 0; +double W; + +int generic_proc_open(char *env, char *name) +{ + char store[1024]; + char *p = getenv(env); + if (!p) { + p = getenv("PROC_ROOT") ? 
: "/proc"; + snprintf(store, sizeof(store)-1, "%s/%s", p, name); + p = store; + } + return open(store, O_RDONLY); +} + +int net_rtacct_open(void) +{ + return generic_proc_open("PROC_NET_RTACCT", "net/rt_acct"); +} + +__u32 rmap[256/4]; + +struct rtacct_data +{ + __u32 ival[256*4]; + + unsigned long long val[256*4]; + double rate[256*4]; + __u8 signature[128]; +}; + +struct rtacct_data kern_db_static; + +struct rtacct_data *kern_db = &kern_db_static; +struct rtacct_data *hist_db; + +void nread(int fd, char *buf, int tot) +{ + int count = 0; + + while (count < tot) { + int n = read(fd, buf+count, tot-count); + if (n < 0) { + if (errno == EINTR) + continue; + exit(-1); + } + if (n == 0) + exit(-1); + count += n; + } +} + + +__u32 *read_kern_table(__u32 *tbl) +{ + static __u32 *tbl_ptr; + int fd; + + if (magic_number) { + if (tbl_ptr != NULL) + return tbl_ptr; + + fd = open("/dev/mem", O_RDONLY); + if (fd < 0) { + perror("magic open"); + exit(-1); + } + tbl_ptr = mmap(NULL, 4096, + PROT_READ, + MAP_SHARED, + fd, magic_number); + if ((unsigned long)tbl_ptr == ~0UL) { + perror("magic mmap"); + exit(-1); + } + close(fd); + return tbl_ptr; + } + + fd = net_rtacct_open(); + if (fd >= 0) { + nread(fd, (char*)tbl, 256*16); + close(fd); + } else { + memset(tbl, 0, 256*16); + } + return tbl; +} + +void format_rate(FILE *fp, double rate) +{ + char temp[64]; + + if (rate > 1024*1024) { + sprintf(temp, "%uM", (unsigned)rint(rate/(1024*1024))); + fprintf(fp, " %-10s", temp); + } else if (rate > 1024) { + sprintf(temp, "%uK", (unsigned)rint(rate/1024)); + fprintf(fp, " %-10s", temp); + } else + fprintf(fp, " %-10u", (unsigned)rate); +} + +void format_count(FILE *fp, unsigned long long val) +{ + if (val > 1024*1024*1024) + fprintf(fp, " %10lluM", val/(1024*1024)); + else if (val > 1024*1024) + fprintf(fp, " %10lluK", val/1024); + else + fprintf(fp, " %10llu", val); +} + +void dump_abs_db(FILE *fp) +{ + int realm; + char b1[16]; + + if (!no_output) { + fprintf(fp, "#%s\n", 
kern_db->signature); + fprintf(fp, +"%-10s " +"%-10s " +"%-10s " +"%-10s " +"%-10s " +"\n" + , "Realm", "BytesTo", "PktsTo", "BytesFrom", "PktsFrom"); + fprintf(fp, +"%-10s " +"%-10s " +"%-10s " +"%-10s " +"%-10s " +"\n" + , "", "BPSTo", "PPSTo", "BPSFrom", "PPSFrom"); + + } + + for (realm=0; realm<256; realm++) { + int i; + unsigned long long *val; + double *rate; + + if (!(rmap[realm>>5] & (1<<(realm&0x1f)))) + continue; + + val = &kern_db->val[realm*4]; + rate = &kern_db->rate[realm*4]; + + if (!dump_zeros && + !val[0] && !rate[0] && + !val[1] && !rate[1] && + !val[2] && !rate[2] && + !val[3] && !rate[3]) + continue; + + if (hist_db) { + memcpy(&hist_db->val[realm*4], val, sizeof(*val)*4); + } + + if (no_output) + continue; + + fprintf(fp, "%-10s", rtnl_rtrealm_n2a(realm, b1, sizeof(b1))); + for (i = 0; i < 4; i++) + format_count(fp, val[i]); + fprintf(fp, "\n%-10s", ""); + for (i = 0; i < 4; i++) + format_rate(fp, rate[i]); + fprintf(fp, "\n"); + } +} + + +void dump_incr_db(FILE *fp) +{ + int k, realm; + char b1[16]; + + if (!no_output) { + fprintf(fp, "#%s\n", kern_db->signature); + fprintf(fp, +"%-10s " +"%-10s " +"%-10s " +"%-10s " +"%-10s " +"\n" + , "Realm", "BytesTo", "PktsTo", "BytesFrom", "PktsFrom"); + fprintf(fp, +"%-10s " +"%-10s " +"%-10s " +"%-10s " +"%-10s " +"\n" + , "", "BPSTo", "PPSTo", "BPSFrom", "PPSFrom"); + } + + for (realm=0; realm<256; realm++) { + int ovfl = 0; + int i; + unsigned long long *val; + double *rate; + unsigned long long rval[4]; + + if (!(rmap[realm>>5] & (1<<(realm&0x1f)))) + continue; + + val = &kern_db->val[realm*4]; + rate = &kern_db->rate[realm*4]; + + for (k=0; k<4; k++) { + rval[k] = val[k]; + if (rval[k] < hist_db->val[realm*4+k]) + ovfl = 1; + else + rval[k] -= hist_db->val[realm*4+k]; + } + if (ovfl) { + for (k=0; k<4; k++) + rval[k] = val[k]; + } + if (hist_db) { + memcpy(&hist_db->val[realm*4], val, sizeof(*val)*4); + } + + if (no_output) + continue; + + if (!dump_zeros && + !rval[0] && !rate[0] && + !rval[1] && 
!rate[1] && + !rval[2] && !rate[2] && + !rval[3] && !rate[3]) + continue; + + + fprintf(fp, "%-10s", rtnl_rtrealm_n2a(realm, b1, sizeof(b1))); + for (i = 0; i < 4; i++) + format_count(fp, rval[i]); + fprintf(fp, "\n%-10s", ""); + for (i = 0; i < 4; i++) + format_rate(fp, rate[i]); + fprintf(fp, "\n"); + } +} + + +static int children; + +void sigchild(int signo) +{ +} + +/* Server side only: read kernel data, update tables, calculate rates. */ + +void update_db(int interval) +{ + int i; + __u32 *ival; + __u32 _ival[256*4]; + + ival = read_kern_table(_ival); + + for (i=0; i<256*4; i++) { + double sample; + __u32 incr = ival[i] - kern_db->ival[i]; + + if (ival[i] == 0 && incr == 0 && + kern_db->val[i] == 0 && kern_db->rate[i] == 0) + continue; + + kern_db->val[i] += incr; + kern_db->ival[i] = ival[i]; + sample = (double)(incr*1000)/interval; + if (interval >= scan_interval) { + kern_db->rate[i] += W*(sample-kern_db->rate[i]); + } else if (interval >= 1000) { + if (interval >= time_constant) { + kern_db->rate[i] = sample; + } else { + double w = W*(double)interval/scan_interval; + kern_db->rate[i] += w*(sample-kern_db->rate[i]); + } + } + } +} + +void send_db(int fd) +{ + int tot = 0; + + while (tot < sizeof(*kern_db)) { + int n = write(fd, ((char*)kern_db) + tot, sizeof(*kern_db)-tot); + if (n < 0) { + if (errno == EINTR) + continue; + return; + } + tot += n; + } +} + + + +#define T_DIFF(a,b) (((a).tv_sec-(b).tv_sec)*1000 + ((a).tv_usec-(b).tv_usec)/1000) + + +void pad_kern_table(struct rtacct_data *dat, __u32 *ival) +{ + int i; + memset(dat->rate, 0, sizeof(dat->rate)); + if (dat->ival != ival) + memcpy(dat->ival, ival, sizeof(dat->ival)); + for (i=0; i<256*4; i++) + dat->val[i] = ival[i]; +} + +void server_loop(int fd) +{ + struct timeval snaptime; + struct pollfd p; + p.fd = fd; + p.events = p.revents = POLLIN; + + sprintf(kern_db->signature, "%d.%lu sampling_interval=%d time_const=%d", + getpid(), (unsigned long)random(), scan_interval/1000, time_constant/1000); + 
+ pad_kern_table(kern_db, read_kern_table(kern_db->ival)); + + for (;;) { + int status; + int tdiff; + struct timeval now; + gettimeofday(&now, NULL); + tdiff = T_DIFF(now, snaptime); + if (tdiff >= scan_interval) { + update_db(tdiff); + snaptime = now; + tdiff = 0; + } + if (poll(&p, 1, tdiff + scan_interval) > 0 + && (p.revents&POLLIN)) { + int clnt = accept(fd, NULL, NULL); + if (clnt >= 0) { + pid_t pid; + if (children >= 5) { + close(clnt); + } else if ((pid = fork()) != 0) { + if (pid>0) + children++; + close(clnt); + } else { + if (tdiff > 0) + update_db(tdiff); + send_db(clnt); + exit(0); + } + } + } + while (children && waitpid(-1, &status, WNOHANG) > 0) + children--; + } +} + +int verify_forging(int fd) +{ + struct ucred cred; + int olen = sizeof(cred); + if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, (void*)&cred, &olen) || + olen < sizeof(cred)) + return -1; + if (cred.uid == getuid() || cred.uid == 0) + return 0; + return -1; +} + +static void usage(void) __attribute__((noreturn)); + +static void usage(void) +{ + fprintf(stderr, +"Usage: rtacct [ -h?vVzrnasd:t: ] [ ListOfRealms ]\n" + ); + exit(-1); +} + +int main(int argc, char *argv[]) +{ + char hist_name[128]; + struct sockaddr_un sun; + int ch; + int fd; + + while ((ch = getopt(argc, argv, "h?vVzrM:nasd:t:")) != EOF) { + switch(ch) { + case 'z': + dump_zeros = 1; + break; + case 'r': + reset_history = 1; + break; + case 'a': + ignore_history = 1; + break; + case 's': + no_update = 1; + break; + case 'n': + no_output = 1; + break; + case 'd': + scan_interval = 1000*atoi(optarg); + break; + case 't': + if (sscanf(optarg, "%d", &time_constant) != 1 || + time_constant <= 0) { + fprintf(stderr, "rtacct: invalid time constant divisor\n"); + exit(-1); + } + break; + case 'v': + case 'V': + printf("rtacct utility, iproute2-ss%s\n", SNAPSHOT); + exit(0); + case 'M': + /* Some secret undocumented option, nobody + * is expected to ask about its sense. See? 
+ */ + sscanf(optarg, "%lx", &magic_number); + break; + case 'h': + case '?': + default: + usage(); + } + } + + argc -= optind; + argv += optind; + + if (argc) { + while (argc > 0) { + __u32 realm; + if (rtnl_rtrealm_a2n(&realm, argv[0])) { + fprintf(stderr, "Warning: realm \"%s\" does not exist.\n", argv[0]); + exit(-1); + } + rmap[realm>>5] |= (1<<(realm&0x1f)); + argc--; argv++; + } + } else { + memset(rmap, ~0, sizeof(rmap)); + /* Always suppress zeros. */ + dump_zeros = 0; + } + + sun.sun_family = AF_UNIX; + sun.sun_path[0] = 0; + sprintf(sun.sun_path+1, "rtacct%d", getuid()); + + if (scan_interval > 0) { + if (time_constant == 0) + time_constant = 60; + time_constant *= 1000; + W = 1 - 1/exp(log(10)*(double)scan_interval/time_constant); + if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { + perror("rtacct: socket"); + exit(-1); + } + if (bind(fd, (struct sockaddr*)&sun, 2+1+strlen(sun.sun_path+1)) < 0) { + perror("rtacct: bind"); + exit(-1); + } + if (listen(fd, 5) < 0) { + perror("rtacct: listen"); + exit(-1); + } + if (fork()) + exit(0); + chdir("/"); + close(0); close(1); close(2); setsid(); + signal(SIGPIPE, SIG_IGN); + signal(SIGCHLD, sigchild); + server_loop(fd); + exit(0); + } + + if (getenv("RTACCT_HISTORY")) + snprintf(hist_name, sizeof(hist_name), getenv("RTACCT_HISTORY")); + else + sprintf(hist_name, "/tmp/.rtacct.u%d", getuid()); + + if (reset_history) + unlink(hist_name); + + if (!ignore_history || !no_update) { + struct stat stb; + + fd = open(hist_name, O_RDWR|O_CREAT|O_NOFOLLOW, 0600); + if (fd < 0) { + perror("rtacct: open history file"); + exit(-1); + } + if (flock(fd, LOCK_EX)) { + perror("rtacct: flock history file"); + exit(-1); + } + if (fstat(fd, &stb) != 0) { + perror("rtacct: fstat history file"); + exit(-1); + } + if (stb.st_nlink != 1 || stb.st_uid != getuid()) { + fprintf(stderr, "rtacct: something is so wrong with history file, that I prefer not to proceed.\n"); + exit(-1); + } + if (stb.st_size != sizeof(*hist_db)) + write(fd, 
kern_db, sizeof(*hist_db)); + + hist_db = mmap(NULL, sizeof(*hist_db), + PROT_READ|PROT_WRITE, + no_update ? MAP_PRIVATE : MAP_SHARED, + fd, 0); + + if ((unsigned long)hist_db == ~0UL) { + perror("mmap"); + exit(-1); + } + + if (!ignore_history) { + FILE *tfp; + long uptime; + if ((tfp = fopen("/proc/uptime", "r")) != NULL) { + if (fscanf(tfp, "%ld", &uptime) != 1) + uptime = -1; + fclose(tfp); + } + + if (uptime >= 0 && time(NULL) >= stb.st_mtime+uptime) { + fprintf(stderr, "rtacct: history is aged out, resetting\n"); + memset(hist_db, 0, sizeof(*hist_db)); + } + } + + close(fd); + } + + if ((fd = socket(AF_UNIX, SOCK_STREAM, 0)) >= 0 && + (connect(fd, (struct sockaddr*)&sun, 2+1+strlen(sun.sun_path+1)) == 0 + || (strcpy(sun.sun_path+1, "rtacct0"), + connect(fd, (struct sockaddr*)&sun, 2+1+strlen(sun.sun_path+1)) == 0)) + && verify_forging(fd) == 0) { + nread(fd, (char*)kern_db, sizeof(*kern_db)); + if (hist_db && hist_db->signature[0] && + strcmp(kern_db->signature, hist_db->signature)) { + fprintf(stderr, "rtacct: history is stale, ignoring it.\n"); + hist_db = NULL; + } + close(fd); + } else { + if (fd >= 0) + close(fd); + + if (hist_db && hist_db->signature[0] && + strcmp(hist_db->signature, "kernel")) { + fprintf(stderr, "rtacct: history is stale, ignoring it.\n"); + hist_db = NULL; + } + + pad_kern_table(kern_db, read_kern_table(kern_db->ival)); + strcpy(kern_db->signature, "kernel"); + } + + if (ignore_history || hist_db == NULL) + dump_abs_db(stdout); + else + dump_incr_db(stdout); + + exit(0); +} diff --git a/misc/rtstat.c b/misc/rtstat.c index e69de29b..feed6cf2 100644 --- a/misc/rtstat.c +++ b/misc/rtstat.c @@ -0,0 +1,172 @@ +/* rtstat.c: A program for route cache monitoring + * + * Copyright 2001 by Robert Olsson <robert.olsson@its.uu.se> + * Uppsala University, Sweden + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; 
either version 2 of the License, or + * (at your option) any later version. + * + * Additional credits: + * Martin Josefsson <gandalf@wlug.westbo.se> 010828 bug fix + * + */ + +#include <stdio.h> +#include <unistd.h> +#include <getopt.h> + +#define VERSION "0.33 010829" + +extern char *optarg; +extern int optind, opterr, optopt; + +FILE *fp; +unsigned rt_size, in_hit[2], in_slow_tot[2], in_slow_mc[2], + in_no_rt[2], in_brd[2], in_martian_dst[2], in_martian_src[2], + out_hit[2], out_slow_tot[2], out_slow_mc[2]; + + +/* Read (and summarize for SMP) the different stats vars. */ + +void scan_line(int i) +{ + unsigned temp[10]; + + in_hit[i] = 0; + in_slow_tot[i] = 0; + in_slow_mc[i] = 0; + in_no_rt[i] = 0; + in_brd[i] = 0; + in_martian_dst[i] = 0; + in_martian_src[i] = 0; + out_hit[i] = 0; + out_slow_tot[i] = 0; + out_slow_mc[i] = 0; + + while(!feof(fp)) { + fscanf(fp, "%x %x %x %x %x %x %x %x %x %x %x\n", + &rt_size, + &temp[0], /* in_hit */ + &temp[1], /* in_slow_tot */ + &temp[2], /* in_slow_mc */ + &temp[3], /* in_no_rt */ + &temp[4], /* in_brd */ + &temp[5], /* in_martian_dst */ + &temp[6], /* in_martian_src */ + &temp[7], /* out_hit */ + &temp[8], /* out_slow_tot */ + &temp[9] /* out_slow_mc */ + ); + + in_hit[i] += temp[0]; + in_slow_tot[i] += temp[1]; + in_slow_mc[i] += temp[2]; + in_no_rt[i] += temp[3]; + in_brd[i] += temp[4]; + in_martian_dst[i] += temp[5]; + in_martian_src[i] += temp[6]; + out_hit[i] += temp[7]; + out_slow_tot[i] += temp[8]; + out_slow_mc[i] += temp[9]; + } + return; +} + +void print_hdr_line(void) +{ + printf(" size IN: hit tot mc no_rt bcast madst masrc OUT: hit tot mc\n"); +} + +int usage(int exit_code) +{ + fprintf(stderr, "rtstat Version %s\n", VERSION); + fprintf(stderr, " -help\n"); + fprintf(stderr, " -i interval\n"); + fprintf(stderr, " -s subject [0-2]\n"); + fprintf(stderr, "\n"); + print_hdr_line(); + fprintf(stderr, "\n"); + fprintf(stderr, "size == route cache size\n"); + fprintf(stderr, "hit == IN: total number of cache hits 
per sec\n"); + fprintf(stderr, "tot == IN: total number of cache misses per sec\n"); + fprintf(stderr, "mc == IN: mulicast cache misses per sec\n"); + fprintf(stderr, "no_rt == IN: route table misses per sec\n"); + fprintf(stderr, "bcast == IN: broadcast cache misses per sec\n"); + fprintf(stderr, "madst == IN: dst martians per sec\n"); + fprintf(stderr, "masrc == IN: src martians per sec\n"); + + fprintf(stderr, "hit == OUT: total number of cache hits per sec\n"); + fprintf(stderr, "tot == OUT: total number of cache misses per sec\n"); + fprintf(stderr, "mc == OUT: mulicast cache misses per sec\n"); + + exit(exit_code); +} + +int main(int argc, char **argv) +{ + int c, i=1, interval=2, hdr=2; + + while ((c=getopt(argc, argv,"h?s:i:")) != EOF) + switch (c) + { + + case '?': + case 'h': usage(0); + + case 'i': sscanf(optarg, "%u", &interval); + break; + + case 's': sscanf(optarg, "%u", &hdr); + break; + + default: usage(1); + } + + if(interval < 1 ) interval=1; + + if ((fp = fopen("/proc/net/rt_cache_stat", "r"))); + else + { + perror("fopen"); + exit(-1); + } + + if(hdr > 0) print_hdr_line(); + + for(;1;i++) { + + if(hdr > 1 && (! 
(i % 20))) print_hdr_line(); + + scan_line(0); + sleep(interval); + rewind(fp); + scan_line(1); + rewind(fp); + + printf("%5u %9u %7u %5u %5u %5u %5u %5u %9u %7u %6u\n", + rt_size, + (in_hit[1] - in_hit[0])/interval, + (in_slow_tot[1] - in_slow_tot[0])/interval, + (in_slow_mc[1] - in_slow_mc[0])/interval, + (in_no_rt[1] - in_no_rt[0])/interval, + (in_brd[1] - in_brd[0])/interval, + (in_martian_dst[1] - in_martian_dst[0])/interval, + (in_martian_src[1] - in_martian_src[0])/interval, + + (out_hit[1] - out_hit[0])/interval, + (out_slow_tot[1] - out_slow_tot[0])/interval, + (out_slow_mc[1] - out_slow_mc[0])/interval + ); + } + return 1; +} + +/* + * Compile: + gcc -g -O2 -Wall -o rtstat rtstat.c +*/ + + + diff --git a/misc/ss.c b/misc/ss.c index e69de29b..3918bdef 100644 --- a/misc/ss.c +++ b/misc/ss.c @@ -0,0 +1,2672 @@ +/* + * ss.c "sockstat", socket statistics + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
/*
 * Open a /proc-style statistics file read-only.
 *
 * env:  name of an environment variable that, when set, holds the complete
 *       path of the file to open (overrides everything else).
 * name: path relative to the proc root, used when `env` is not set.  The
 *       root is $PROC_ROOT if set, otherwise "/proc".
 *
 * Returns the descriptor from open(2), or -1 with errno set.  A composed
 * path that does not fit the local buffer fails with ENAMETOOLONG instead
 * of silently opening a truncated name.
 */
int generic_proc_open(char *env, char *name)
{
	char store[128];
	char *p = getenv(env);

	if (!p) {
		const char *root = getenv("PROC_ROOT");
		int len;

		if (!root)
			root = "/proc";
		len = snprintf(store, sizeof(store), "%s/%s", root, name);
		if (len < 0 || (size_t)len >= sizeof(store)) {
			errno = ENAMETOOLONG;
			return -1;
		}
		p = store;
	}
	/* Open the selected path: `store` is only filled in when the
	 * override variable is absent. */
	return open(p, O_RDONLY);
}
: "/proc/", sizeof(name)/2); + name[sizeof(name)/2] = 0; + if (strlen(name) == 0 || + name[strlen(name)-1] != '/') + strcat(name, "/"); + nameoff = strlen(name); + if ((dir = opendir(name)) == NULL) + return 0; + + while ((d = readdir(dir)) != NULL) { + DIR *dir1; + struct dirent *d1; + int pid; + int pos; + char crap; + char process[16]; + + if (sscanf(d->d_name, "%d%c", &pid, &crap) != 1) + continue; + + sprintf(name+nameoff, "%d/fd/", pid); + pos = strlen(name); + if ((dir1 = opendir(name)) == NULL) + continue; + + process[0] = 0; + + while ((d1 = readdir(dir1)) != NULL) { + int fd, n; + char lnk[64]; + + if (sscanf(d1->d_name, "%d%c", &fd, &crap) != 1) + continue; + + sprintf(name+pos, "%d", fd); + n = readlink(name, lnk, sizeof(lnk)-1); + if (n != pattern_len || + memcmp(lnk, pattern, n)) + continue; + + if (ptr-buf >= buflen-1) + break; + + if (process[0] == 0) { + char tmp[1024]; + FILE *fp; + snprintf(tmp, sizeof(tmp), "%s/%d/stat", + getenv("PROC_ROOT") ? : "/proc", pid); + if ((fp = fopen(tmp, "r")) != NULL) { + fscanf(fp, "%*d (%[^)])", process); + fclose(fp); + } + } + + snprintf(ptr, buflen-(ptr-buf), "(\"%s\",%d,%d),", process, pid, fd); + ptr += strlen(ptr); + cnt++; + } + closedir(dir1); + } + closedir(dir); + if (ptr != buf) + ptr[-1] = 0; + return cnt; +} + + +/* Get stats from slab */ + +struct slabstat +{ + int socks; + int tcp_ports; + int tcp_tws; + int tcp_syns; + int skbs; +}; + +struct slabstat slabstat; + +const char *slabstat_ids[] = +{ + "sock", + "tcp_bind_bucket", + "tcp_tw_bucket", + "tcp_open_request", + "skbuff_head_cache", +}; + +int get_slabstat(struct slabstat *s) +{ + char buf[256]; + FILE *fp; + int cnt; + + memset(s, 0, sizeof(*s)); + + if ((fp = fdopen(slabinfo_open(), "r")) == NULL) + return -1; + + cnt = sizeof(*s)/sizeof(int); + + fgets(buf, sizeof(buf), fp); + while(fgets(buf, sizeof(buf), fp) != NULL) { + int i; + for (i=0; i<sizeof(slabstat_ids)/sizeof(slabstat_ids[0]); i++) { + if (memcmp(buf, slabstat_ids[i], 
strlen(slabstat_ids[i])) == 0) { + sscanf(buf, "%*s%d", ((int *)s) + i); + cnt--; + break; + } + } + if (cnt <= 0) + break; + } + + fclose(fp); + return 0; +} + + + + +char *sstate_name[] = { + "UNKNOWN", + "ESTAB", + "SYN-SENT", + "SYN-RECV", + "FIN-WAIT-1", + "FIN-WAIT-2", + "TIME-WAIT", + "UNCONN", + "CLOSE-WAIT", + "LAST-ACK", + "LISTEN", + "CLOSING", +}; + +char *sstate_namel[] = { + "UNKNOWN", + "established", + "syn-sent", + "syn-recv", + "fin-wait-1", + "fin-wait-2", + "time-wait", + "unconnected", + "close-wait", + "last-ack", + "listening", + "closing", +}; + +struct tcpstat +{ + inet_prefix local; + inet_prefix remote; + int lport; + int rport; + int state; + int rq, wq; + int timer; + int timeout; + int retrs; + int ino; + int probes; + int uid; + int refcnt; + unsigned long long sk; + int rto, ato, qack, cwnd, ssthresh; +}; + +char *tmr_name[] = { + "off", + "on", + "keepalive", + "timewait", + "persist", + "unknown" +}; + +char *print_ms_timer(int timeout) +{ + static char buf[64]; + int secs, msecs, minutes; + if (timeout < 0) + timeout = 0; + secs = timeout/1000; + minutes = secs/60; + secs = secs%60; + msecs = timeout%1000; + buf[0] = 0; + if (minutes) { + msecs = 0; + snprintf(buf, sizeof(buf)-16, "%dmin", minutes); + if (minutes > 9) + secs = 0; + } + if (secs) { + if (secs > 9) + msecs = 0; + sprintf(buf+strlen(buf), "%d%s", secs, msecs ? "." 
: "sec"); + } + if (msecs) + sprintf(buf+strlen(buf), "%03dms", msecs); + return buf; +}; + +char *print_hz_timer(int timeout) +{ + int hz = get_hz(); + return print_ms_timer(((timeout*1000) + hz-1)/hz); +}; + +struct scache +{ + struct scache *next; + int port; + char *name; + const char *proto; +}; + +struct scache *rlist; + +void init_service_resolver(void) +{ + char buf[128]; + FILE *fp = popen("/usr/sbin/rpcinfo -p 2>/dev/null", "r"); + if (fp) { + fgets(buf, sizeof(buf), fp); + while (fgets(buf, sizeof(buf), fp) != NULL) { + unsigned int progn, port; + char proto[128], prog[128]; + if (sscanf(buf, "%u %*d %s %u %s", &progn, proto, + &port, prog+4) == 4) { + struct scache *c = malloc(sizeof(*c)); + if (c) { + c->port = port; + memcpy(prog, "rpc.", 4); + c->name = strdup(prog); + if (strcmp(proto, TCP_PROTO) == 0) + c->proto = TCP_PROTO; + else if (strcmp(proto, UDP_PROTO) == 0) + c->proto = UDP_PROTO; + else + c->proto = NULL; + c->next = rlist; + rlist = c; + } + } + } + } +} + +const char *__resolve_service(int port) +{ + struct scache *c; + + for (c = rlist; c; c = c->next) { + if (c->port == port && c->proto == dg_proto) + return c->name; + } + + /* Even do not try default linux ephemeral port ranges: + * default /etc/services contains so much of useless crap + * wouldbe "allocated" to this area that resolution + * is really harmful. I shrug each time when seeing + * "socks" or "cfinger" in dumps. 
+ */ + if (port < 32768 && (port < 1024 || port > 4999)) { + static int notfirst; + struct servent *se; + if (!notfirst) { + setservent(1); + notfirst = 1; + } + se = getservbyport(htons(port), dg_proto); + if (se) + return se->s_name; + } + + return NULL; +} + + +const char *resolve_service(int port) +{ + static char buf[128]; + static struct scache cache[256]; + + if (port == 0) { + buf[0] = '*'; + buf[1] = 0; + return buf; + } + + if (resolve_services) { + if (dg_proto == RAW_PROTO) { + return inet_proto_n2a(port, buf, sizeof(buf)); + } else { + struct scache *c; + const char *res; + int hash = (port^(((unsigned long)dg_proto)>>2))&255; + + for (c = &cache[hash]; c; c = c->next) { + if (c->port == port && + c->proto == dg_proto) { + if (c->name) + return c->name; + goto do_numeric; + } + } + + if ((res = __resolve_service(port)) != NULL) { + if ((c = malloc(sizeof(*c))) == NULL) + goto do_numeric; + } else { + c = &cache[hash]; + if (c->name) + free(c->name); + } + c->port = port; + c->name = NULL; + c->proto = dg_proto; + if (res) { + c->name = strdup(res); + c->next = cache[hash].next; + cache[hash].next = c; + } + if (c->name) + return c->name; + } + } + + do_numeric: + sprintf(buf, "%u", port); + return buf; +} + +void formatted_print(inet_prefix *a, int port) +{ + char buf[1024]; + const char *ap = buf; + int est_len; + + est_len = addr_width; + + if (a->family == AF_INET) { + if (a->data[0] == 0) { + buf[0] = '*'; + buf[1] = 0; + } else { + ap = format_host(AF_INET, 4, a->data, buf, sizeof(buf)); + } + } else { + ap = format_host(a->family, 16, a->data, buf, sizeof(buf)); + est_len = strlen(ap); + if (est_len <= addr_width) + est_len = addr_width; + else + est_len = addr_width + ((est_len-addr_width+3)/4)*4; + } + printf("%*s:%-*s ", est_len, ap, serv_width, resolve_service(port)); +} + +struct aafilter +{ + inet_prefix addr; + int port; + struct aafilter *next; +}; + +int inet2_addr_match(inet_prefix *a, inet_prefix *p, int plen) +{ + if 
(!inet_addr_match(a, p, plen)) + return 0; + /* Cursed "v4 mapped" addresses: v4 mapped socket matches + * pure IPv4 rule, but v4-mapped rule selects only v4-mapped + * sockets. Fair? */ + if (p->family == AF_INET && a->family == AF_INET6) { + if (a->data[0] == 0 && a->data[1] == 0 && + a->data[2] == htonl(0xffff)) { + inet_prefix tmp = *a; + tmp.data[0] = a->data[3]; + return inet_addr_match(&tmp, p, plen); + } + } + return 1; +} + +int unix_match(inet_prefix *a, inet_prefix *p) +{ + char *addr, *pattern; + memcpy(&addr, a->data, sizeof(addr)); + memcpy(&pattern, p->data, sizeof(pattern)); + if (pattern == NULL) + return 1; + if (addr == NULL) + addr = ""; + return !fnmatch(pattern, addr, 0); +} + +int run_ssfilter(struct ssfilter *f, struct tcpstat *s) +{ + switch (f->type) { + case SSF_S_AUTO: + { + static int low, high=65535; + + if (s->local.family == AF_UNIX) { + char *p; + memcpy(&p, s->local.data, sizeof(p)); + return p == NULL || (p[0] == '@' && strlen(p) == 6 && + strspn(p+1, "0123456789abcdef") == 5); + } + if (s->local.family == AF_PACKET) + return s->lport == 0 && s->local.data == 0; + if (s->local.family == AF_NETLINK) + return s->lport < 0; + + if (!low) { + FILE *fp = fdopen(ephemeral_ports_open(), "r"); + if (fp) { + fscanf(fp, "%d%d", &low, &high); + fclose(fp); + } + } + return s->lport >= low && s->lport <= high; + } + case SSF_DCOND: + { + struct aafilter *a = (void*)f->pred; + if (a->addr.family == AF_UNIX) + return unix_match(&s->remote, &a->addr); + if (a->port != -1 && a->port != s->rport) + return 0; + if (a->addr.bitlen) { + do { + if (!inet2_addr_match(&s->remote, &a->addr, a->addr.bitlen)) + return 1; + } while ((a = a->next) != NULL); + return 0; + } + return 1; + } + case SSF_SCOND: + { + struct aafilter *a = (void*)f->pred; + if (a->addr.family == AF_UNIX) + return unix_match(&s->local, &a->addr); + if (a->port != -1 && a->port != s->lport) + return 0; + if (a->addr.bitlen) { + do { + if (!inet2_addr_match(&s->local, &a->addr, 
a->addr.bitlen)) + return 1; + } while ((a = a->next) != NULL); + return 0; + } + return 1; + } + case SSF_D_GE: + { + struct aafilter *a = (void*)f->pred; + return s->rport >= a->port; + } + case SSF_D_LE: + { + struct aafilter *a = (void*)f->pred; + return s->rport <= a->port; + } + case SSF_S_GE: + { + struct aafilter *a = (void*)f->pred; + return s->lport >= a->port; + } + case SSF_S_LE: + { + struct aafilter *a = (void*)f->pred; + return s->lport <= a->port; + } + + /* Yup. It is recursion. Sorry. */ + case SSF_AND: + return run_ssfilter(f->pred, s) && run_ssfilter(f->post, s); + case SSF_OR: + return run_ssfilter(f->pred, s) || run_ssfilter(f->post, s); + case SSF_NOT: + return !run_ssfilter(f->pred, s); + default: + abort(); + } +} + +/* Relocate external jumps by reloc. */ +void ssfilter_patch(char *a, int len, int reloc) +{ + while (len > 0) { + struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)a; + if (op->no == len+4) + op->no += reloc; + len -= op->yes; + a += op->yes; + } + if (len < 0) + abort(); +} + +int ssfilter_bytecompile(struct ssfilter *f, char **bytecode) +{ + switch (f->type) { + case SSF_S_AUTO: + { + if (!(*bytecode=malloc(4))) abort(); + ((struct tcpdiag_bc_op*)*bytecode)[0] = (struct tcpdiag_bc_op){ TCPDIAG_BC_AUTO, 4, 8 }; + return 8; + } + case SSF_DCOND: + case SSF_SCOND: + { + struct aafilter *a = (void*)f->pred; + struct aafilter *b; + char *ptr; + int code = (f->type == SSF_DCOND ? TCPDIAG_BC_D_COND : TCPDIAG_BC_S_COND); + int len = 0; + + for (b=a; b; b=b->next) { + len += 4 + sizeof(struct tcpdiag_hostcond); + if (a->addr.family == AF_INET6) + len += 16; + else + len += 4; + if (b->next) + len += 4; + } + if (!(ptr = malloc(len))) abort(); + *bytecode = ptr; + for (b=a; b; b=b->next) { + struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op *)ptr; + int alen = (a->addr.family == AF_INET6 ? 
16 : 4); + int oplen = alen + 4 + sizeof(struct tcpdiag_hostcond); + struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(ptr+4); + + *op = (struct tcpdiag_bc_op){ code, oplen, oplen+4 }; + cond->family = a->addr.family; + cond->port = a->port; + cond->prefix_len = a->addr.bitlen; + memcpy(cond->addr, a->addr.data, alen); + ptr += oplen; + if (b->next) { + op = (struct tcpdiag_bc_op *)ptr; + *op = (struct tcpdiag_bc_op){ TCPDIAG_BC_JMP, 4, len - (ptr-*bytecode)}; + ptr += 4; + } + } + return ptr - *bytecode; + } + case SSF_D_GE: + { + struct aafilter *x = (void*)f->pred; + if (!(*bytecode=malloc(8))) abort(); + ((struct tcpdiag_bc_op*)*bytecode)[0] = (struct tcpdiag_bc_op){ TCPDIAG_BC_D_GE, 8, 12 }; + ((struct tcpdiag_bc_op*)*bytecode)[1] = (struct tcpdiag_bc_op){ 0, 0, x->port }; + return 8; + } + case SSF_D_LE: + { + struct aafilter *x = (void*)f->pred; + if (!(*bytecode=malloc(8))) abort(); + ((struct tcpdiag_bc_op*)*bytecode)[0] = (struct tcpdiag_bc_op){ TCPDIAG_BC_D_LE, 8, 12 }; + ((struct tcpdiag_bc_op*)*bytecode)[1] = (struct tcpdiag_bc_op){ 0, 0, x->port }; + return 8; + } + case SSF_S_GE: + { + struct aafilter *x = (void*)f->pred; + if (!(*bytecode=malloc(8))) abort(); + ((struct tcpdiag_bc_op*)*bytecode)[0] = (struct tcpdiag_bc_op){ TCPDIAG_BC_S_GE, 8, 12 }; + ((struct tcpdiag_bc_op*)*bytecode)[1] = (struct tcpdiag_bc_op){ 0, 0, x->port }; + return 8; + } + case SSF_S_LE: + { + struct aafilter *x = (void*)f->pred; + if (!(*bytecode=malloc(8))) abort(); + ((struct tcpdiag_bc_op*)*bytecode)[0] = (struct tcpdiag_bc_op){ TCPDIAG_BC_S_LE, 8, 12 }; + ((struct tcpdiag_bc_op*)*bytecode)[1] = (struct tcpdiag_bc_op){ 0, 0, x->port }; + return 8; + } + + case SSF_AND: + { + char *a1, *a2, *a, l1, l2; + l1 = ssfilter_bytecompile(f->pred, &a1); + l2 = ssfilter_bytecompile(f->post, &a2); + if (!(a = malloc(l1+l2))) abort(); + memcpy(a, a1, l1); + memcpy(a+l1, a2, l2); + free(a1); free(a2); + ssfilter_patch(a, l1, l2); + *bytecode = a; + return l1+l2; + } + case 
SSF_OR: + { + char *a1, *a2, *a, l1, l2; + l1 = ssfilter_bytecompile(f->pred, &a1); + l2 = ssfilter_bytecompile(f->post, &a2); + if (!(a = malloc(l1+l2+4))) abort(); + memcpy(a, a1, l1); + memcpy(a+l1+4, a2, l2); + free(a1); free(a2); + *(struct tcpdiag_bc_op*)(a+l1) = (struct tcpdiag_bc_op){ TCPDIAG_BC_JMP, 4, l2+4 }; + *bytecode = a; + return l1+l2+4; + } + case SSF_NOT: + { + char *a1, *a, l1; + l1 = ssfilter_bytecompile(f->pred, &a1); + if (!(a = malloc(l1+4))) abort(); + memcpy(a, a1, l1); + free(a1); + *(struct tcpdiag_bc_op*)(a+l1) = (struct tcpdiag_bc_op){ TCPDIAG_BC_JMP, 4, 8 }; + *bytecode = a; + return l1+4; + } + default: + abort(); + } +} + +int remember_he(struct aafilter *a, struct hostent *he) +{ + char **ptr = he->h_addr_list; + int cnt = 0; + int len; + + if (he->h_addrtype == AF_INET) + len = 4; + else if (he->h_addrtype == AF_INET6) + len = 16; + else + return 0; + + while (*ptr) { + struct aafilter *b = a; + if (a->addr.bitlen) { + if ((b = malloc(sizeof(*b))) == NULL) + return cnt; + *b = *a; + b->next = a->next; + a->next = b; + } + memcpy(b->addr.data, *ptr, len); + b->addr.bytelen = len; + b->addr.bitlen = len*8; + b->addr.family = he->h_addrtype; + ptr++; + cnt++; + } + return cnt; +} + +int get_dns_host(struct aafilter *a, char *addr, int fam) +{ + static int notfirst; + int cnt = 0; + struct hostent *he; + + a->addr.bitlen = 0; + if (!notfirst) { + sethostent(1); + notfirst = 1; + } + he = gethostbyname2(addr, fam == AF_UNSPEC ? 
AF_INET : fam); + if (he) + cnt = remember_he(a, he); + if (fam == AF_UNSPEC) { + he = gethostbyname2(addr, AF_INET6); + if (he) + cnt += remember_he(a, he); + } + return !cnt; +} + +int xll_initted = 0; + +void xll_init(void) +{ + struct rtnl_handle rth; + rtnl_open(&rth, 0); + ll_init_map(&rth); + rtnl_close(&rth); + xll_initted = 1; +} + +const char *xll_index_to_name(int index) +{ + if (!xll_initted) + xll_init(); + return ll_index_to_name(index); +} + +int xll_name_to_index(char *dev) +{ + if (!xll_initted) + xll_init(); + return ll_name_to_index(dev); +} + +void *parse_hostcond(char *addr) +{ + char *port = NULL; + struct aafilter a; + struct aafilter *res; + int fam = preferred_family; + + memset(&a, 0, sizeof(a)); + a.port = -1; + + if (fam == AF_UNIX || strncmp(addr, "unix:", 5) == 0) { + char *p; + a.addr.family = AF_UNIX; + if (strncmp(addr, "unix:", 5) == 0) + addr+=5; + p = strdup(addr); + a.addr.bitlen = 8*strlen(p); + memcpy(a.addr.data, &p, sizeof(p)); + goto out; + } + + if (fam == AF_PACKET || strncmp(addr, "link:", 5) == 0) { + a.addr.family = AF_PACKET; + a.addr.bitlen = 0; + if (strncmp(addr, "link:", 5) == 0) + addr+=5; + port = strchr(addr, ':'); + if (port) { + *port = 0; + if (port[1] && strcmp(port+1, "*")) { + if (get_integer(&a.port, port+1, 0)) { + if ((a.port = xll_name_to_index(port+1)) <= 0) + return NULL; + } + } + } + if (addr[0] && strcmp(addr, "*")) { + unsigned short tmp; + a.addr.bitlen = 32; + if (ll_proto_a2n(&tmp, addr)) + return NULL; + a.addr.data[0] = ntohs(tmp); + } + goto out; + } + + if (fam == AF_NETLINK || strncmp(addr, "netlink:", 8) == 0) { + a.addr.family = AF_NETLINK; + a.addr.bitlen = 0; + if (strncmp(addr, "netlink:", 8) == 0) + addr+=8; + port = strchr(addr, ':'); + if (port) { + *port = 0; + if (port[1] && strcmp(port+1, "*")) { + if (get_integer(&a.port, port+1, 0)) { + if (strcmp(port+1, "kernel") == 0) + a.port = 0; + else + return NULL; + } + } + } + if (addr[0] && strcmp(addr, "*")) { + a.addr.bitlen = 
32; + if (get_u32(a.addr.data, addr, 0)) { + if (strcmp(addr, "rtnl") == 0) + a.addr.data[0] = 0; + else if (strcmp(addr, "fw") == 0) + a.addr.data[0] = 3; + else if (strcmp(addr, "tcpdiag") == 0) + a.addr.data[0] = 4; + else + return NULL; + } + } + goto out; + } + + if (strncmp(addr, "inet:", 5) == 0) { + addr += 5; + fam = AF_INET; + } else if (strncmp(addr, "inet6:", 6) == 0) { + addr += 6; + fam = AF_INET6; + } + + /* URL-like literal [] */ + if (addr[0] == '[') { + addr++; + if ((port = strchr(addr, ']')) == NULL) + return NULL; + *port++ = 0; + } else if (addr[0] == '*') { + port = addr+1; + } else { + port = strrchr(strchr(addr, '/') ? : addr, ':'); + } + if (port && *port) { + if (*port != ':') + return NULL; + *port++ = 0; + if (*port && *port != '*') { + if (get_integer(&a.port, port, 0)) { + struct servent *se1 = NULL; + struct servent *se2 = NULL; + if (current_filter.dbs&(1<<UDP_DB)) + se1 = getservbyname(port, UDP_PROTO); + if (current_filter.dbs&(1<<TCP_DB)) + se2 = getservbyname(port, TCP_PROTO); + if (se1 && se2 && se1->s_port != se2->s_port) { + fprintf(stderr, "Error: ambiguous port \"%s\".\n", port); + return NULL; + } + if (!se1) + se1 = se2; + if (se1) { + a.port = ntohs(se1->s_port); + } else { + struct scache *s; + for (s = rlist; s; s = s->next) { + if ((s->proto == UDP_PROTO && + (current_filter.dbs&(1<<UDP_DB))) || + (s->proto == TCP_PROTO && + (current_filter.dbs&(1<<TCP_DB)))) { + if (s->name && strcmp(s->name, port) == 0) { + if (a.port > 0 && a.port != s->port) { + fprintf(stderr, "Error: ambiguous port \"%s\".\n", port); + return NULL; + } + a.port = s->port; + } + } + } + if (a.port <= 0) { + fprintf(stderr, "Error: \"%s\" does not look like a port.\n", port); + return NULL; + } + } + } + } + } + if (addr && *addr && *addr != '*') { + if (get_prefix_1(&a.addr, addr, fam)) { + if (get_dns_host(&a, addr, fam)) { + fprintf(stderr, "Error: an inet prefix is expected rather than \"%s\".\n", addr); + return NULL; + } + } + } + + out: + 
res = malloc(sizeof(*res)); + if (res) + memcpy(res, &a, sizeof(a)); + return res; +} + +int tcp_show_line(char *line, struct filter *f, int family) +{ + struct tcpstat s; + char *loc, *rem, *data; + char opt[256]; + int n; + char *p; + + if ((p = strchr(line, ':')) == NULL) + return -1; + loc = p+2; + + if ((p = strchr(loc, ':')) == NULL) + return -1; + p[5] = 0; + rem = p+6; + + if ((p = strchr(rem, ':')) == NULL) + return -1; + p[5] = 0; + data = p+6; + + do { + int state = (data[1] >= 'A') ? (data[1] - 'A' + 10) : (data[1] - '0'); + + if (!(f->states & (1<<state))) + return 0; + } while (0); + + s.local.family = s.remote.family = family; + if (family == AF_INET) { + sscanf(loc, "%x:%x", s.local.data, (unsigned*)&s.lport); + sscanf(rem, "%x:%x", s.remote.data, (unsigned*)&s.rport); + s.local.bytelen = s.remote.bytelen = 4; + } else { + sscanf(loc, "%08x%08x%08x%08x:%x", + s.local.data, + s.local.data+1, + s.local.data+2, + s.local.data+3, + &s.lport); + sscanf(rem, "%08x%08x%08x%08x:%x", + s.remote.data, + s.remote.data+1, + s.remote.data+2, + s.remote.data+3, + &s.rport); + s.local.bytelen = s.remote.bytelen = 16; + } + + if (f->f && run_ssfilter(f->f, &s) == 0) + return 0; + + opt[0] = 0; + n = sscanf(data, "%x %x:%x %x:%x %x %d %d %d %d %llx %d %d %d %d %d %[^\n]\n", + &s.state, &s.wq, &s.rq, + &s.timer, &s.timeout, &s.retrs, &s.uid, &s.probes, &s.ino, + &s.refcnt, &s.sk, &s.rto, &s.ato, &s.qack, + &s.cwnd, &s.ssthresh, opt); + + if (n < 17) + opt[0] = 0; + + if (n < 12) { + s.rto = 0; + s.cwnd = 2; + s.ssthresh = -1; + s.ato = s.qack = 0; + } + + if (netid_width) + printf("%-*s ", netid_width, "tcp"); + if (state_width) + printf("%-*s ", state_width, sstate_name[s.state]); + + printf("%-6d %-6d ", s.rq, s.wq); + + formatted_print(&s.local, s.lport); + formatted_print(&s.remote, s.rport); + + if (show_options) { + if (s.timer) { + if (s.timer > 4) + s.timer = 5; + printf(" timer:(%s,%s,%d)", + tmr_name[s.timer], + print_hz_timer(s.timeout), + s.timer != 1 ? 
s.probes : s.retrs); + } + } + if (show_tcpinfo) { + if (s.rto && s.rto != 3*get_hz()) + printf(" rto:%g", (double)s.rto/get_hz()); + if (s.ato) + printf(" ato:%g", (double)s.ato/get_hz()); + if (s.cwnd != 2) + printf(" cwnd:%d", s.cwnd); + if (s.ssthresh != -1) + printf(" ssthresh:%d", s.ssthresh); + if (s.qack/2) + printf(" qack:%d", s.qack/2); + if (s.qack&1) + printf(" bidir"); + } + if (show_users) { + char ubuf[4096]; + if (find_users(s.ino, ubuf, sizeof(ubuf)) > 0) + printf(" users:(%s)", ubuf); + } + if (show_details) { + if (s.uid) + printf(" uid:%u", (unsigned)s.uid); + printf(" ino:%u", (unsigned)s.ino); + printf(" sk:%llx", s.sk); + if (opt[0]) + printf(" opt:\"%s\"", opt); + } + printf("\n"); + + return 0; +} + +int generic_record_read(int fd, char *buf, int bufsize, + int (*worker)(char*, struct filter *, int), + struct filter *f, int fam) +{ + int n; + int recsize; + int eof = 0; + char *p; + + /* Load the first chunk and calculate record length from it. */ + n = read(fd, buf, bufsize); + if (n < 0) + goto outerr; + /* I _know_ that this is wrong, do not remind. :-) + * But this works nowadays. 
*/ + if (n < bufsize) + eof = 1; + p = memchr(buf, '\n', n); + if (p == NULL || (p-buf) >= n) + goto outwrongformat; + recsize = (p-buf)+1; + p = buf+recsize; + + for (;;) { + while ((p+recsize) - buf <= n) { + if (p[recsize-1] != '\n') + goto outwrongformat; + p[recsize-1] = 0; + if (worker(p, f, fam) < 0) + goto done; + p += recsize; + } + if (!eof) { + int remains = (buf+bufsize) - p; + memcpy(buf, p, remains); + p = buf+remains; + n = read(fd, p, (buf+bufsize) - p); + if (n < 0) + goto outerr; + if (n < (buf+bufsize) - p) { + eof = 1; + if (n == 0) { + if (remains) + goto outwrongformat; + goto done; + } + } + n += remains; + p = buf; + } else { + if (p != buf+n) + goto outwrongformat; + goto done; + } + } +done: + return 0; + +outwrongformat: + errno = EINVAL; +outerr: + return -1; +} + + +int tcp_show_sock(struct nlmsghdr *nlh, struct filter *f) +{ + struct tcpdiagmsg *r = NLMSG_DATA(nlh); + struct tcpstat s; + + s.state = r->tcpdiag_state; + s.local.family = s.remote.family = r->tcpdiag_family; + s.lport = ntohs(r->id.tcpdiag_sport); + s.rport = ntohs(r->id.tcpdiag_dport); + if (s.local.family == AF_INET) { + s.local.bytelen = s.remote.bytelen = 4; + } else { + s.local.bytelen = s.remote.bytelen = 16; + } + memcpy(s.local.data, r->id.tcpdiag_src, s.local.bytelen); + memcpy(s.remote.data, r->id.tcpdiag_dst, s.local.bytelen); + + if (f && f->f && run_ssfilter(f->f, &s) == 0) + return 0; + + if (netid_width) + printf("%-*s ", netid_width, "tcp"); + if (state_width) + printf("%-*s ", state_width, sstate_name[s.state]); + + printf("%-6d %-6d ", r->tcpdiag_rqueue, r->tcpdiag_wqueue); + + formatted_print(&s.local, s.lport); + formatted_print(&s.remote, s.rport); + + if (show_options) { + if (r->tcpdiag_timer) { + if (r->tcpdiag_timer > 4) + r->tcpdiag_timer = 5; + printf(" timer:(%s,%s,%d)", + tmr_name[r->tcpdiag_timer], + print_ms_timer(r->tcpdiag_expires), + r->tcpdiag_retrans); + } + } + if (show_users) { + char ubuf[4096]; + if (find_users(r->tcpdiag_inode, 
ubuf, sizeof(ubuf)) > 0) + printf(" users:(%s)", ubuf); + } + if (show_details) { + if (r->tcpdiag_uid) + printf(" uid:%u", (unsigned)r->tcpdiag_uid); + printf(" ino:%u", (unsigned)r->tcpdiag_inode); + printf(" sk:%08x", r->id.tcpdiag_cookie[0]); + if (r->id.tcpdiag_cookie[1] != 0) + printf("%08x", r->id.tcpdiag_cookie[1]); + } + if (show_mem || show_tcpinfo) { + struct rtattr * tb[TCPDIAG_MAX+1]; + struct tcpdiag_meminfo *minfo = NULL; + struct tcp_info *info = NULL; + + memset(tb, 0, sizeof(tb)); + parse_rtattr(tb, TCPDIAG_MAX, (struct rtattr*)(r+1), + nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*r))); + if (tb[TCPDIAG_MEMINFO]) + minfo = RTA_DATA(tb[TCPDIAG_MEMINFO]); + if (tb[TCPDIAG_INFO]) + info = RTA_DATA(tb[TCPDIAG_INFO]); + if (minfo) { + printf(" mem:(r%u,w%u,f%u,t%u)", + minfo->tcpdiag_rmem, + minfo->tcpdiag_wmem, + minfo->tcpdiag_fmem, + minfo->tcpdiag_tmem); + } + if (info) { +#ifdef TCP_INFO + if (info->tcpi_rto && info->tcpi_rto != 3000000) + printf(" rto:%g", (double)info->tcpi_rto/1000); + if (info->tcpi_rtt) + printf(" rtt:%g/%g", (double)info->tcpi_rtt/1000, + (double)info->tcpi_rttvar/1000); + if (info->tcpi_ato) + printf(" ato:%g", (double)info->tcpi_ato/1000); + if (info->tcpi_snd_cwnd != 2) + printf(" cwnd:%d", info->tcpi_snd_cwnd); + if (info->tcpi_snd_ssthresh < 0xFFFF) + printf(" ssthresh:%d", info->tcpi_snd_ssthresh); +#else +#warning No TCP_INFO. Please, do not repeat this experiment, use right kernel. 
+ printf(" MORE_INFO_PROVIDED_YOU_COMPILED_SS_RIGHT"); +#endif + } + } + printf("\n"); + + return 0; + +} + +int tcp_show_netlink(struct filter *f, FILE *dump_fp) +{ + int fd; + struct sockaddr_nl nladdr; + struct { + struct nlmsghdr nlh; + struct tcpdiagreq r; + } req; + char *bc = NULL; + int bclen; + struct msghdr msg; + struct rtattr rta; + char buf[8192]; + struct iovec iov[3]; + + if ((fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_TCPDIAG)) < 0) + return -1; + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + + req.nlh.nlmsg_len = sizeof(req); + req.nlh.nlmsg_type = TCPDIAG_GETSOCK; + req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST; + req.nlh.nlmsg_pid = 0; + req.nlh.nlmsg_seq = 123456; + memset(&req.r, 0, sizeof(req.r)); + req.r.tcpdiag_family = AF_INET; + req.r.tcpdiag_states = f->states; + if (show_mem) + req.r.tcpdiag_ext |= (1<<(TCPDIAG_MEMINFO-1)); + if (show_tcpinfo) + req.r.tcpdiag_ext |= (1<<(TCPDIAG_INFO-1)); + + iov[0] = (struct iovec){ &req, sizeof(req) }; + if (f->f) { + bclen = ssfilter_bytecompile(f->f, &bc); + rta.rta_type = TCPDIAG_REQ_BYTECODE; + rta.rta_len = RTA_LENGTH(bclen); + iov[1] = (struct iovec){ &rta, sizeof(rta) }; + iov[2] = (struct iovec){ bc, bclen }; + req.nlh.nlmsg_len += RTA_LENGTH(bclen); + } + + msg = (struct msghdr) { + (void*)&nladdr, sizeof(nladdr), + iov, f->f ? 
3 : 1, + NULL, 0, + 0 + }; + + if (sendmsg(fd, &msg, 0) < 0) + return -1; + + + iov[0] = (struct iovec){ buf, sizeof(buf) }; + + while (1) { + int status; + struct nlmsghdr *h; + + msg = (struct msghdr) { + (void*)&nladdr, sizeof(nladdr), + iov, 1, + NULL, 0, + 0 + }; + + status = recvmsg(fd, &msg, 0); + + if (status < 0) { + if (errno == EINTR) + continue; + perror("OVERRUN"); + continue; + } + if (status == 0) { + fprintf(stderr, "EOF on netlink\n"); + return 0; + } + + if (dump_fp) + fwrite(buf, 1, NLMSG_ALIGN(status), dump_fp); + + h = (struct nlmsghdr*)buf; + while (NLMSG_OK(h, status)) { + int err; + + if (/*h->nlmsg_pid != rth->local.nl_pid ||*/ + h->nlmsg_seq != 123456) + goto skip_it; + + if (h->nlmsg_type == NLMSG_DONE) + return 0; + if (h->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *err = (struct nlmsgerr*)NLMSG_DATA(h); + if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr))) { + fprintf(stderr, "ERROR truncated\n"); + } else { + errno = -err->error; + perror("TCPDIAG answers"); + } + return 0; + } + if (!dump_fp) { + err = tcp_show_sock(h, NULL); + if (err < 0) + return err; + } + +skip_it: + h = NLMSG_NEXT(h, status); + } + if (msg.msg_flags & MSG_TRUNC) { + fprintf(stderr, "Message truncated\n"); + continue; + } + if (status) { + fprintf(stderr, "!!!Remnant of size %d\n", status); + exit(1); + } + } + return 0; +} + +int tcp_show_netlink_file(struct filter *f) +{ + FILE *fp; + char buf[8192]; + + if ((fp = fopen(getenv("TCPDIAG_FILE"), "r")) == NULL) { + perror("fopen($TCPDIAG_FILE)"); + return -1; + } + + while (1) { + int status, err; + struct nlmsghdr *h = (struct nlmsghdr*)buf; + + status = fread(buf, 1, sizeof(*h), fp); + if (status < 0) { + perror("Reading header from $TCPDIAG_FILE"); + return -1; + } + if (status != sizeof(*h)) { + perror("Unexpected EOF reading $TCPDIAG_FILE"); + return -1; + } + + status = fread(h+1, 1, NLMSG_ALIGN(h->nlmsg_len-sizeof(*h)), fp); + + if (status < 0) { + perror("Reading $TCPDIAG_FILE"); + return -1; + } 
+ if (status + sizeof(*h) < h->nlmsg_len) { + perror("Unexpected EOF reading $TCPDIAG_FILE"); + return -1; + } + + /* The only legal exit point */ + if (h->nlmsg_type == NLMSG_DONE) + return 0; + + if (h->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *err = (struct nlmsgerr*)NLMSG_DATA(h); + if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr))) { + fprintf(stderr, "ERROR truncated\n"); + } else { + errno = -err->error; + perror("TCPDIAG answered"); + } + return -1; + } + + err = tcp_show_sock(h, f); + if (err < 0) + return err; + } +} + +int tcp_show(struct filter *f) +{ + int fd = -1; + char *buf = NULL; + int bufsize = 64*1024; + + dg_proto = TCP_PROTO; + + if (getenv("TCPDIAG_FILE")) + return tcp_show_netlink_file(f); + + if (!getenv("PROC_NET_TCP") && !getenv("PROC_ROOT") + && tcp_show_netlink(f, NULL) == 0) + return 0; + + /* Sigh... We have to parse /proc/net/tcp... */ + + /* Estimate amount of sockets and try to allocate + * huge buffer to read all the table at one read. + * Limit it by 16MB though. The assumption is: as soon as + * kernel was able to hold information about N connections, + * it is able to give us some memory for snapshot. 
+ */ + if (1) { + int guess = slabstat.socks+slabstat.tcp_syns; + if (f->states&(1<<SS_TIME_WAIT)) + guess += slabstat.tcp_tws; + if (guess > (16*1024*1024)/128) + guess = (16*1024*1024)/128; + guess *= 128; + if (guess > bufsize) + bufsize = guess; + } + while (bufsize >= 64*1024) { + if ((buf = malloc(bufsize)) != NULL) + break; + bufsize /= 2; + } + if (buf == NULL) { + errno = ENOMEM; + return -1; + } + + if (f->families & (1<<AF_INET)) { + if ((fd = net_tcp_open()) < 0) + goto outerr; + if (generic_record_read(fd, buf, bufsize, tcp_show_line, f, AF_INET)) + goto outerr; + close(fd); + } + + if ((f->families & (1<<AF_INET6)) && + (fd = net_tcp6_open()) >= 0) { + if (generic_record_read(fd, buf, bufsize, tcp_show_line, f, AF_INET6)) + goto outerr; + close(fd); + } + + free(buf); + return 0; + +outerr: + do { + int saved_errno = errno; + if (buf) + free(buf); + if (fd >= 0) + close(fd); + errno = saved_errno; + return -1; + } while (0); +} + + +int dgram_show_line(char *line, struct filter *f, int family) +{ + struct tcpstat s; + char *loc, *rem, *data; + char opt[256]; + int n; + char *p; + + if ((p = strchr(line, ':')) == NULL) + return -1; + loc = p+2; + + if ((p = strchr(loc, ':')) == NULL) + return -1; + p[5] = 0; + rem = p+6; + + if ((p = strchr(rem, ':')) == NULL) + return -1; + p[5] = 0; + data = p+6; + + do { + int state = (data[1] >= 'A') ? 
(data[1] - 'A' + 10) : (data[1] - '0'); + + if (!(f->states & (1<<state))) + return 0; + } while (0); + + s.local.family = s.remote.family = family; + if (family == AF_INET) { + sscanf(loc, "%x:%x", s.local.data, (unsigned*)&s.lport); + sscanf(rem, "%x:%x", s.remote.data, (unsigned*)&s.rport); + s.local.bytelen = s.remote.bytelen = 4; + } else { + sscanf(loc, "%08x%08x%08x%08x:%x", + s.local.data, + s.local.data+1, + s.local.data+2, + s.local.data+3, + &s.lport); + sscanf(rem, "%08x%08x%08x%08x:%x", + s.remote.data, + s.remote.data+1, + s.remote.data+2, + s.remote.data+3, + &s.rport); + s.local.bytelen = s.remote.bytelen = 16; + } + + if (f->f && run_ssfilter(f->f, &s) == 0) + return 0; + + opt[0] = 0; + n = sscanf(data, "%x %x:%x %*x:%*x %*x %d %*d %d %d %llx %[^\n]\n", + &s.state, &s.wq, &s.rq, + &s.uid, &s.ino, + &s.refcnt, &s.sk, opt); + + if (n < 9) + opt[0] = 0; + + if (netid_width) + printf("%-*s ", netid_width, dg_proto); + if (state_width) + printf("%-*s ", state_width, sstate_name[s.state]); + + printf("%-6d %-6d ", s.rq, s.wq); + + formatted_print(&s.local, s.lport); + formatted_print(&s.remote, s.rport); + + if (show_users) { + char ubuf[4096]; + if (find_users(s.ino, ubuf, sizeof(ubuf)) > 0) + printf(" users:(%s)", ubuf); + } + + if (show_details) { + if (s.uid) + printf(" uid=%u", (unsigned)s.uid); + printf(" ino=%u", (unsigned)s.ino); + printf(" sk=%llx", s.sk); + if (opt[0]) + printf(" opt:\"%s\"", opt); + } + printf("\n"); + + return 0; +} + + +int udp_show(struct filter *f) +{ + int fd = -1; + char buf[8192]; + int bufsize = sizeof(buf); + + dg_proto = UDP_PROTO; + + if (f->families&(1<<AF_INET)) { + if ((fd = net_udp_open()) < 0) + goto outerr; + if (generic_record_read(fd, buf, bufsize, dgram_show_line, f, AF_INET)) + goto outerr; + close(fd); + } + + if ((f->families&(1<<AF_INET6)) && + (fd = net_udp6_open()) >= 0) { + if (generic_record_read(fd, buf, bufsize, dgram_show_line, f, AF_INET6)) + goto outerr; + close(fd); + } + return 0; + +outerr: 
+ do { + int saved_errno = errno; + if (fd >= 0) + close(fd); + errno = saved_errno; + return -1; + } while (0); +} + +int raw_show(struct filter *f) +{ + int fd = -1; + char buf[8192]; + int bufsize = sizeof(buf); + + dg_proto = RAW_PROTO; + + if (f->families&(1<<AF_INET)) { + if ((fd = net_raw_open()) < 0) + goto outerr; + if (generic_record_read(fd, buf, bufsize, dgram_show_line, f, AF_INET)) + goto outerr; + close(fd); + } + + if ((f->families&(1<<AF_INET6)) && + (fd = net_raw6_open()) >= 0) { + if (generic_record_read(fd, buf, bufsize, dgram_show_line, f, AF_INET6)) + goto outerr; + close(fd); + } + return 0; + +outerr: + do { + int saved_errno = errno; + if (fd >= 0) + close(fd); + errno = saved_errno; + return -1; + } while (0); +} + + +struct unixstat +{ + struct unixstat *next; + int ino; + int peer; + int rq; + int wq; + int state; + int type; + char *name; +}; + + + +int unix_state_map[] = { SS_CLOSE, SS_SYN_SENT, + SS_ESTABLISHED, SS_CLOSING }; + + +#define MAX_UNIX_REMEMBER (1024*1024/sizeof(struct unixstat)) + +void unix_list_free(struct unixstat *list) +{ + while (list) { + struct unixstat *s = list; + list = list->next; + if (s->name) + free(s->name); + free(s); + } +} + +void unix_list_print(struct unixstat *list, struct filter *f) +{ + struct unixstat *s; + char *peer; + + for (s = list; s; s = s->next) { + if (!(f->states & (1<<s->state))) + continue; + if (s->type == SOCK_STREAM && !(f->dbs&(1<<UNIX_ST_DB))) + continue; + if (s->type == SOCK_DGRAM && !(f->dbs&(1<<UNIX_DG_DB))) + continue; + + peer = "*"; + if (s->peer) { + struct unixstat *p; + for (p = list; p; p = p->next) { + if (s->peer == p->ino) + break; + } + if (!p) { + peer = "?"; + } else { + peer = p->name ? 
: "*"; + } + } + + if (f->f) { + struct tcpstat tst; + tst.local.family = AF_UNIX; + tst.remote.family = AF_UNIX; + memcpy(tst.local.data, &s->name, sizeof(s->name)); + if (strcmp(peer, "*") == 0) + memset(tst.remote.data, 0, sizeof(peer)); + else + memcpy(tst.remote.data, &peer, sizeof(peer)); + if (run_ssfilter(f->f, &tst) == 0) + continue; + } + + if (netid_width) + printf("%-*s ", netid_width, + s->type == SOCK_STREAM ? "u_str" : "u_dgr"); + if (state_width) + printf("%-*s ", state_width, sstate_name[s->state]); + printf("%-6d %-6d ", s->rq, s->wq); + printf("%*s %-*d %*s %-*d", + addr_width, s->name ? : "*", serv_width, s->ino, + addr_width, peer, serv_width, s->peer); + if (show_users) { + char ubuf[4096]; + if (find_users(s->ino, ubuf, sizeof(ubuf)) > 0) + printf(" users:(%s)", ubuf); + } + printf("\n"); + } +} + +int unix_show(struct filter *f) +{ + FILE *fp; + char buf[256]; + char name[128]; + int newformat = 0; + int cnt; + struct unixstat *list = NULL; + + if ((fp = fdopen(net_unix_open(), "r")) == NULL) + return -1; + fgets(buf, sizeof(buf)-1, fp); + + if (memcmp(buf, "Peer", 4) == 0) + newformat = 1; + cnt = 0; + + while (fgets(buf, sizeof(buf)-1, fp)) { + struct unixstat *u, **insp; + int flags; + + if (!(u = malloc(sizeof(*u)))) + break; + u->name = NULL; + + if (sscanf(buf, "%x: %x %x %x %x %x %d %s", + &u->peer, &u->rq, &u->wq, &flags, &u->type, + &u->state, &u->ino, name) < 8) + name[0] = 0; + + if (flags&(1<<16)) { + u->state = SS_LISTEN; + } else { + u->state = unix_state_map[u->state-1]; + if (u->type == SOCK_DGRAM && + u->state == SS_CLOSE && + u->peer) + u->state = SS_ESTABLISHED; + } + + if (!newformat) { + u->peer = 0; + u->rq = 0; + u->wq = 0; + } + + insp = &list; + while (*insp) { + if (u->type < (*insp)->type || + (u->type == (*insp)->type && + u->ino < (*insp)->ino)) + break; + insp = &(*insp)->next; + } + u->next = *insp; + *insp = u; + + if (name[0]) { + if ((u->name = malloc(strlen(name)+1)) == NULL) + break; + strcpy(u->name, 
name); + } + if (++cnt > MAX_UNIX_REMEMBER) { + unix_list_print(list, f); + unix_list_free(list); + list = NULL; + cnt = 0; + } + } + + if (list) { + unix_list_print(list, f); + unix_list_free(list); + list = NULL; + cnt = 0; + } + + return 0; +} + + +int packet_show(struct filter *f) +{ + FILE *fp; + char buf[256]; + int type; + int prot; + int iface; + int state; + int rq; + int uid; + int ino; + unsigned long long sk; + + if (!(f->states & (1<<SS_CLOSE))) + return 0; + + if ((fp = fdopen(net_packet_open(), "r")) == NULL) + return -1; + fgets(buf, sizeof(buf)-1, fp); + + while (fgets(buf, sizeof(buf)-1, fp)) { + sscanf(buf, "%llx %*d %d %x %d %d %u %u %u", + &sk, + &type, &prot, &iface, &state, + &rq, &uid, &ino); + + if (type == SOCK_RAW && !(f->dbs&(1<<PACKET_R_DB))) + continue; + if (type == SOCK_DGRAM && !(f->dbs&(1<<PACKET_DG_DB))) + continue; + if (f->f) { + struct tcpstat tst; + tst.local.family = AF_PACKET; + tst.remote.family = AF_PACKET; + tst.rport = 0; + tst.lport = iface; + tst.local.data[0] = prot; + tst.remote.data[0] = 0; + if (run_ssfilter(f->f, &tst) == 0) + continue; + } + + if (netid_width) + printf("%-*s ", netid_width, + type == SOCK_RAW ? 
"p_raw" : "p_dgr"); + if (state_width) + printf("%-*s ", state_width, "UNCONN"); + printf("%-6d %-6d ", rq, 0); + if (prot == 3) { + printf("%*s:", addr_width, "*"); + } else { + char tb[16]; + printf("%*s:", addr_width, + ll_proto_n2a(htons(prot), tb, sizeof(tb))); + } + if (iface == 0) { + printf("%-*s ", serv_width, "*"); + } else { + printf("%-*s ", serv_width, xll_index_to_name(iface)); + } + printf("%*s*%-*s", + addr_width, "", serv_width, ""); + + if (show_users) { + char ubuf[4096]; + if (find_users(ino, ubuf, sizeof(ubuf)) > 0) + printf(" users:(%s)", ubuf); + } + if (show_details) { + printf(" ino=%u uid=%u sk=%llx", ino, uid, sk); + } + printf("\n"); + } + + return 0; +} + +int netlink_show(struct filter *f) +{ + FILE *fp; + char buf[256]; + int prot, pid; + unsigned groups; + int rq, wq, rc; + unsigned long long sk, cb; + + if (!(f->states & (1<<SS_CLOSE))) + return 0; + + if ((fp = fdopen(net_netlink_open(), "r")) == NULL) + return -1; + fgets(buf, sizeof(buf)-1, fp); + + while (fgets(buf, sizeof(buf)-1, fp)) { + sscanf(buf, "%llx %d %d %x %d %d %llx %d", + &sk, + &prot, &pid, &groups, &rq, &wq, &cb, &rc); + + if (f->f) { + struct tcpstat tst; + tst.local.family = AF_NETLINK; + tst.remote.family = AF_NETLINK; + tst.rport = -1; + tst.lport = pid; + tst.local.data[0] = prot; + tst.remote.data[0] = 0; + if (run_ssfilter(f->f, &tst) == 0) + continue; + } + + if (netid_width) + printf("%-*s ", netid_width, "nl"); + if (state_width) + printf("%-*s ", state_width, "UNCONN"); + printf("%-6d %-6d ", rq, wq); + if (resolve_services && prot == 0) + printf("%*s:", addr_width, "rtnl"); + else if (resolve_services && prot == 3) + printf("%*s:", addr_width, "fw"); + else if (resolve_services && prot == 4) + printf("%*s:", addr_width, "tcpdiag"); + else + printf("%*d:", addr_width, prot); + if (pid == -1) { + printf("%-*s ", serv_width, "*"); + } else if (resolve_services) { + int done = 0; + if (!pid) { + done = 1; + printf("%-*s ", serv_width, "kernel"); + } else if 
(pid > 0) { + char procname[64]; + FILE *fp; + sprintf(procname, "%s/%d/stat", + getenv("PROC_ROOT") ? : "/proc", pid); + if ((fp = fopen(procname, "r")) != NULL) { + if (fscanf(fp, "%*d (%[^)])", procname) == 1) { + sprintf(procname+strlen(procname), "/%d", pid); + printf("%-*s ", serv_width, procname); + done = 1; + } + fclose(fp); + } + } + if (!done) + printf("%-*d ", serv_width, pid); + } else { + printf("%-*d ", serv_width, pid); + } + printf("%*s*%-*s", + addr_width, "", serv_width, ""); + + if (show_details) { + printf(" sk=%llx cb=%llx groups=0x%08x", sk, cb, groups); + } + printf("\n"); + } + + return 0; +} + +struct snmpstat +{ + int tcp_estab; +}; + +int get_snmp_int(char *proto, char *key, int *result) +{ + char buf[1024]; + FILE *fp; + int protolen = strlen(proto); + int keylen = strlen(key); + + *result = 0; + + if ((fp = fdopen(net_snmp_open(), "r")) == NULL) + return -1; + + while (fgets(buf, sizeof(buf), fp) != NULL) { + char *p = buf; + int pos = 0; + if (memcmp(buf, proto, protolen)) + continue; + while ((p = strchr(p, ' ')) != NULL) { + pos++; + p++; + if (memcmp(p, key, keylen) == 0 && + (p[keylen] == ' ' || p[keylen] == '\n')) + break; + } + if (fgets(buf, sizeof(buf), fp) == NULL) + break; + if (memcmp(buf, proto, protolen)) + break; + p = buf; + while ((p = strchr(p, ' ')) != NULL) { + p++; + if (--pos == 0) { + sscanf(p, "%d", result); + fclose(fp); + return 0; + } + } + } + + fclose(fp); + errno = ESRCH; + return -1; +} + + +/* Get stats from sockstat */ + +struct sockstat +{ + int socks; + int tcp_mem; + int tcp_total; + int tcp_orphans; + int tcp_tws; + int tcp4_hashed; + int udp4; + int raw4; + int frag4; + int frag4_mem; + int tcp6_hashed; + int udp6; + int raw6; + int frag6; + int frag6_mem; +}; + +static void get_sockstat_line(char *line, struct sockstat *s) +{ + char id[256], rem[256]; + + if (sscanf(line, "%[^ ] %[^\n]\n", id, rem) != 2) + return; + + if (strcmp(id, "sockets:") == 0) + sscanf(rem, "%*s%d", &s->socks); + else if 
(strcmp(id, "UDP:") == 0) + sscanf(rem, "%*s%d", &s->udp4); + else if (strcmp(id, "UDP6:") == 0) + sscanf(rem, "%*s%d", &s->udp6); + else if (strcmp(id, "RAW:") == 0) + sscanf(rem, "%*s%d", &s->raw4); + else if (strcmp(id, "RAW6:") == 0) + sscanf(rem, "%*s%d", &s->raw6); + else if (strcmp(id, "TCP6:") == 0) + sscanf(rem, "%*s%d", &s->tcp6_hashed); + else if (strcmp(id, "FRAG:") == 0) + sscanf(rem, "%*s%d%*s%d", &s->frag4, &s->frag4_mem); + else if (strcmp(id, "FRAG6:") == 0) + sscanf(rem, "%*s%d%*s%d", &s->frag6, &s->frag6_mem); + else if (strcmp(id, "TCP:") == 0) + sscanf(rem, "%*s%d%*s%d%*s%d%*s%d%*s%d", + &s->tcp4_hashed, + &s->tcp_orphans, &s->tcp_tws, &s->tcp_total, &s->tcp_mem); +} + +int get_sockstat(struct sockstat *s) +{ + char buf[256]; + FILE *fp; + + memset(s, 0, sizeof(*s)); + + if ((fp = fdopen(net_sockstat_open(), "r")) == NULL) + return -1; + while(fgets(buf, sizeof(buf), fp) != NULL) + get_sockstat_line(buf, s); + fclose(fp); + + if ((fp = fdopen(net_sockstat6_open(), "r")) == NULL) + return 0; + while(fgets(buf, sizeof(buf), fp) != NULL) + get_sockstat_line(buf, s); + fclose(fp); + + return 0; +} + +int print_summary(void) +{ + struct sockstat s; + struct snmpstat sn; + + if (get_sockstat(&s) < 0) + perror("ss: get_sockstat"); + if (get_snmp_int("Tcp:", "CurrEstab", &sn.tcp_estab) < 0) + perror("ss: get_snmpstat"); + + printf("Total: %d (kernel %d)\n", s.socks, slabstat.socks); + + printf("TCP: %d (estab %d, closed %d, orphaned %d, synrecv %d, timewait %d/%d), ports %d\n", + s.tcp_total + slabstat.tcp_syns + s.tcp_tws, + sn.tcp_estab, + s.tcp_total - (s.tcp4_hashed+s.tcp6_hashed-s.tcp_tws), + s.tcp_orphans, + slabstat.tcp_syns, + s.tcp_tws, slabstat.tcp_tws, + slabstat.tcp_ports + ); + + printf("\n"); + printf("Transport Total IP IPv6\n"); + printf("* %-9d %-9s %-9s\n", slabstat.socks, "-", "-"); + printf("RAW %-9d %-9d %-9d\n", s.raw4+s.raw6, s.raw4, s.raw6); + printf("UDP %-9d %-9d %-9d\n", s.udp4+s.udp6, s.udp4, s.udp6); + printf("TCP %-9d %-9d 
%-9d\n", s.tcp4_hashed+s.tcp6_hashed, s.tcp4_hashed, s.tcp6_hashed); + printf("INET %-9d %-9d %-9d\n", + s.raw4+s.udp4+s.tcp4_hashed+ + s.raw6+s.udp6+s.tcp6_hashed, + s.raw4+s.udp4+s.tcp4_hashed, + s.raw6+s.udp6+s.tcp6_hashed); + printf("FRAG %-9d %-9d %-9d\n", s.frag4+s.frag6, s.frag4, s.frag6); + + printf("\n"); + + return 0; +} + + +static void usage(void) __attribute__((noreturn)); + +static void usage(void) +{ + fprintf(stderr, +"Usage: ss [ OPTIONS ]\n" +" ss [ OPTIONS ] [ FILTER ]\n" +"where OPTIONS := { -h[elp] | -V[ersion] | -n[umeric] | -r[esolve] |\n" +" -a[ll] -l[istening] -o[ptions] -e[xtended] -p[rocesses]\n" +" -A QUERY } -s[ummary]\n" +" -f[amily] { inet | inet6 | link | unix } }\n" +" QUERY := {all|inet|tcp|udp|raw|unix|packet|netlink}[,QUERY]\n" +" FILTER := [ state TCP-STATE ] [ EXPRESSION ]\n" +); + exit(-1); +} + + +int scan_state(char *state) +{ + int i; + if (strcasecmp(state, "close") == 0 || + strcasecmp(state, "closed") == 0) + return (1<<SS_CLOSE); + if (strcasecmp(state, "syn-rcv") == 0) + return (1<<SS_SYN_RECV); + if (matches(state, "established") == 0) + return (1<<SS_ESTABLISHED); + if (strcasecmp(state, "all") == 0) + return SS_ALL; + if (strcasecmp(state, "connected") == 0) + return SS_ALL & ~((1<<SS_CLOSE)|(1<<SS_LISTEN)); + if (matches(state, "synchronized") == 0) + return SS_ALL & ~((1<<SS_CLOSE)|(1<<SS_LISTEN)|(1<<SS_SYN_SENT)); + if (strcasecmp(state, "bucket") == 0) + return (1<<SS_SYN_RECV)|(1<<SS_TIME_WAIT); + if (strcasecmp(state, "big") == 0) + return SS_ALL & ~((1<<SS_SYN_RECV)|(1<<SS_TIME_WAIT)); + for (i=0; i<SS_MAX; i++) { + if (matches(state, sstate_namel[i]) == 0) + return (1<<i); + } + return 0; +} + + +int main(int argc, char *argv[]) +{ + int do_default = 1; + int saw_states = 0; + int saw_query = 0; + int do_summary = 0; + char *dump_tcpdiag = NULL; + FILE *filter_fp = NULL; + int ch; + + memset(¤t_filter, 0, sizeof(current_filter)); + + current_filter.states = default_filter.states; + + while ((ch = 
getopt(argc, argv, "h?aletuwxnro460spfmiA:D:F:vV")) != EOF) { + switch(ch) { + case 'n': + resolve_services = 0; + break; + case 'r': + resolve_hosts = 1; + break; + case 'o': + show_options = 1; + break; + case 'e': + show_options = 1; + show_details++; + break; + case 'm': + show_mem = 1; + break; + case 'i': + show_tcpinfo = 1; + break; + case 'p': + show_users++; + break; + case 't': + current_filter.dbs |= (1<<TCP_DB); + do_default = 0; + break; + case 'u': + current_filter.dbs |= (1<<UDP_DB); + do_default = 0; + break; + case 'w': + current_filter.dbs |= (1<<RAW_DB); + do_default = 0; + break; + case 'x': + current_filter.dbs |= UNIX_DBM; + do_default = 0; + break; + case 'a': + current_filter.states = SS_ALL; + break; + case 'l': + current_filter.states = (1<<SS_LISTEN); + break; + case '4': + preferred_family = AF_INET; + break; + case '6': + preferred_family = AF_INET6; + break; + case '0': + preferred_family = AF_PACKET; + break; + case 'f': + if (strcmp(optarg, "inet") == 0) + preferred_family = AF_INET; + else if (strcmp(optarg, "inet6") == 0) + preferred_family = AF_INET6; + else if (strcmp(optarg, "link") == 0) + preferred_family = AF_PACKET; + else if (strcmp(optarg, "unix") == 0) + preferred_family = AF_UNIX; + else if (strcmp(optarg, "netlink") == 0) + preferred_family = AF_NETLINK; + else if (strcmp(optarg, "help") == 0) + usage(); + else { + fprintf(stderr, "ss: \"%s\" is invalid family\n", optarg); + usage(); + } + break; + case 'A': + { + char *p, *p1; + if (!saw_query) { + current_filter.dbs = 0; + saw_query = 1; + do_default = 0; + } + p = p1 = optarg; + do { + if ((p1 = strchr(p, ',')) != NULL) + *p1 = 0; + if (strcmp(p, "all") == 0) { + current_filter.dbs = ALL_DB; + } else if (strcmp(p, "inet") == 0) { + current_filter.dbs |= (1<<TCP_DB)|(1<<UDP_DB)|(1<<RAW_DB); + } else if (strcmp(p, "udp") == 0) { + current_filter.dbs |= (1<<UDP_DB); + } else if (strcmp(p, "tcp") == 0) { + current_filter.dbs |= (1<<TCP_DB); + } else if (strcmp(p, "raw") 
== 0) { + current_filter.dbs |= (1<<RAW_DB); + } else if (strcmp(p, "unix") == 0) { + current_filter.dbs |= UNIX_DBM; + } else if (matches(p, "unix_stream") == 0 || + strcmp(p, "u_str") == 0) { + current_filter.dbs |= (1<<UNIX_ST_DB); + } else if (matches(p, "unix_dgram") == 0 || + strcmp(p, "u_dgr") == 0) { + current_filter.dbs |= (1<<UNIX_DG_DB); + } else if (strcmp(p, "packet") == 0) { + current_filter.dbs |= PACKET_DBM; + } else if (strcmp(p, "packet_raw") == 0 || + strcmp(p, "p_raw") == 0) { + current_filter.dbs |= (1<<PACKET_R_DB); + } else if (strcmp(p, "packet_dgram") == 0 || + strcmp(p, "p_dgr") == 0) { + current_filter.dbs |= (1<<PACKET_DG_DB); + } else if (strcmp(p, "netlink") == 0) { + current_filter.dbs |= (1<<NETLINK_DB); + } else { + fprintf(stderr, "ss: \"%s\" is illegal socket table id\n", p); + usage(); + } + p = p1 + 1; + } while (p1); + break; + } + case 's': + do_summary = 1; + break; + case 'D': + dump_tcpdiag = optarg; + break; + case 'F': + if (filter_fp) { + fprintf(stderr, "More than one filter file\n"); + exit(-1); + } + if (optarg[0] == '-') + filter_fp = stdin; + else + filter_fp = fopen(optarg, "r"); + if (!filter_fp) { + perror("fopen filter file"); + exit(-1); + } + break; + case 'v': + case 'V': + printf("ss utility, iproute2-ss%s\n", SNAPSHOT); + exit(0); + case 'h': + case '?': + default: + usage(); + } + } + + argc -= optind; + argv += optind; + + get_slabstat(&slabstat); + + if (do_summary) { + print_summary(); + if (do_default && argc == 0) + exit(0); + } + + if (do_default) + current_filter.dbs = default_filter.dbs; + + if (preferred_family == AF_UNSPEC) { + if (!(current_filter.dbs&~UNIX_DBM)) + preferred_family = AF_UNIX; + else if (!(current_filter.dbs&~PACKET_DBM)) + preferred_family = AF_PACKET; + else if (!(current_filter.dbs&~(1<<NETLINK_DB))) + preferred_family = AF_NETLINK; + } + + if (preferred_family != AF_UNSPEC) { + int mask2; + if (preferred_family == AF_INET || + preferred_family == AF_INET6) { + mask2= 
(1<<TCP_DB); + if (!do_default) + mask2 = (1<<UDP_DB)|(1<<RAW_DB); + } else if (preferred_family == AF_PACKET) { + mask2 = PACKET_DBM; + } else if (preferred_family == AF_UNIX) { + mask2 = UNIX_DBM; + } else if (preferred_family == AF_NETLINK) { + mask2 = (1<<NETLINK_DB); + } else { + mask2 = 0; + } + + if (do_default) + current_filter.dbs = mask2; + else + current_filter.dbs &= mask2; + current_filter.families = (1<<preferred_family); + } else { + if (!do_default) + current_filter.families = ~0; + else + current_filter.families = default_filter.families; + } + if (current_filter.dbs == 0) { + fprintf(stderr, "ss: no socket tables to show with such filter.\n"); + exit(0); + } + if (current_filter.families == 0) { + fprintf(stderr, "ss: no families to show with such filter.\n"); + exit(0); + } + + if (resolve_services && resolve_hosts && + (current_filter.dbs&(UNIX_DBM|(1<<TCP_DB)|(1<<UDP_DB)))) + init_service_resolver(); + + /* Now parse filter... */ + if (argc == 0 && filter_fp) { + if (ssfilter_parse(¤t_filter.f, 0, NULL, filter_fp)) + usage(); + } + + while (argc > 0) { + if (strcmp(*argv, "state") == 0) { + NEXT_ARG(); + if (!saw_states) + current_filter.states = 0; + current_filter.states |= scan_state(*argv); + saw_states = 1; + } else if (strcmp(*argv, "exclude") == 0 || + strcmp(*argv, "excl") == 0) { + NEXT_ARG(); + if (!saw_states) + current_filter.states = SS_ALL; + current_filter.states &= ~scan_state(*argv); + saw_states = 1; + } else { + if (ssfilter_parse(¤t_filter.f, argc, argv, filter_fp)) + usage(); + break; + } + argc--; argv++; + } + + if (current_filter.states == 0) { + fprintf(stderr, "ss: no socket states to show with such filter.\n"); + exit(0); + } + + if (dump_tcpdiag) { + FILE *dump_fp = stdout; + if (!(current_filter.dbs & (1<<TCP_DB))) { + fprintf(stderr, "ss: tcpdiag dump requested and no tcp in filter.\n"); + exit(0); + } + if (dump_tcpdiag[0] != '-') { + dump_fp = fopen(dump_tcpdiag, "w"); + if (!dump_tcpdiag) { + perror("fopen dump 
file"); + exit(-1); + } + } + tcp_show_netlink(¤t_filter, dump_fp); + fflush(dump_fp); + exit(0); + } + + netid_width = 0; + if (current_filter.dbs&(current_filter.dbs-1)) + netid_width = 5; + + state_width = 0; + if (current_filter.states&(current_filter.states-1)) + state_width = 10; + + screen_width = 80; + if (isatty(STDOUT_FILENO)) { + struct winsize w; + + if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &w) != -1) { + if (w.ws_col > 0) + screen_width = w.ws_col; + } + } + + addrp_width = screen_width; + addrp_width -= netid_width+1; + addrp_width -= state_width+1; + addrp_width -= 14; + + if (addrp_width&1) { + if (netid_width) + netid_width++; + else if (state_width) + state_width++; + } + + addrp_width /= 2; + addrp_width--; + + serv_width = resolve_services ? 7 : 5; + + if (addrp_width < 15+serv_width+1) + addrp_width = 15+serv_width+1; + + addr_width = addrp_width - serv_width - 1; + + if (netid_width) + printf("%-*s ", netid_width, "Netid"); + if (state_width) + printf("%-*s ", state_width, "State"); + printf("%-6s %-6s ", "Recv-Q", "Send-Q"); + + printf("%*s:%-*s %*s:%-*s\n", + addr_width, "Local Address", serv_width, "Port", + addr_width, "Peer Address", serv_width, "Port"); + +//printf("%08x %08x %08x\n", current_filter.dbs, current_filter.states, current_filter.families); + fflush(stdout); + + if (current_filter.dbs & (1<<NETLINK_DB)) + netlink_show(¤t_filter); + if (current_filter.dbs & PACKET_DBM) + packet_show(¤t_filter); + if (current_filter.dbs & UNIX_DBM) + unix_show(¤t_filter); + if (current_filter.dbs & (1<<RAW_DB)) + raw_show(¤t_filter); + if (current_filter.dbs & (1<<UDP_DB)) + udp_show(¤t_filter); + if (current_filter.dbs & (1<<TCP_DB)) + tcp_show(¤t_filter); + return 0; +} diff --git a/misc/ssfilter.h b/misc/ssfilter.h index e69de29b..00b92e3d 100644 --- a/misc/ssfilter.h +++ b/misc/ssfilter.h @@ -0,0 +1,21 @@ +#define SSF_DCOND 0 +#define SSF_SCOND 1 +#define SSF_OR 2 +#define SSF_AND 3 +#define SSF_NOT 4 +#define SSF_D_GE 5 +#define SSF_D_LE 6 
+#define SSF_S_GE 7 +#define SSF_S_LE 8 +#define SSF_S_AUTO 9 + +struct ssfilter +{ + int type; + struct ssfilter *post; + struct ssfilter *pred; +}; + +int ssfilter_parse(struct ssfilter **f, int argc, char **argv, FILE *fp); +void *parse_hostcond(char*); + diff --git a/misc/ssfilter.y b/misc/ssfilter.y index e69de29b..f47ab2fd 100644 --- a/misc/ssfilter.y +++ b/misc/ssfilter.y @@ -0,0 +1,274 @@ +%{ + +#include <stdio.h> +#include <malloc.h> +#include <string.h> +#include "ssfilter.h" + +typedef struct ssfilter * ssfilter_t; + +#define YYSTYPE ssfilter_t + +static struct ssfilter * alloc_node(int type, void *pred) +{ + struct ssfilter *n = malloc(sizeof(*n)); + if (n == NULL) + abort(); + n->type = type; + n->pred = pred; + n->post = NULL; + return n; +} + +static char **yy_argv; +static int yy_argc; +static FILE *yy_fp; +static ssfilter_t *yy_ret; + +static int yylex(void); + +static void yyerror(char *s) +{ + fprintf(stderr, "ss: bison bellows (while parsing filter): \"%s!\"", s); +} + +%} + +%token HOSTCOND DCOND SCOND DPORT SPORT LEQ GEQ NEQ AUTOBOUND +%left '|' +%left '&' +%nonassoc '!' 
+ +%% +applet: null expr + { + *yy_ret = $2; + $$ = $2; + } + | null + ; +null: /* NOTHING */ { $$ = NULL; } + ; +expr: DCOND HOSTCOND + { + $$ = alloc_node(SSF_DCOND, $2); + } + | SCOND HOSTCOND + { + $$ = alloc_node(SSF_SCOND, $2); + } + | DPORT GEQ HOSTCOND + { + $$ = alloc_node(SSF_D_GE, $3); + } + | DPORT LEQ HOSTCOND + { + $$ = alloc_node(SSF_D_LE, $3); + } + | DPORT '>' HOSTCOND + { + $$ = alloc_node(SSF_NOT, alloc_node(SSF_D_LE, $3)); + } + | DPORT '<' HOSTCOND + { + $$ = alloc_node(SSF_NOT, alloc_node(SSF_D_GE, $3)); + } + | DPORT '=' HOSTCOND + { + $$ = alloc_node(SSF_DCOND, $3); + } + | DPORT NEQ HOSTCOND + { + $$ = alloc_node(SSF_NOT, alloc_node(SSF_DCOND, $3)); + } + + | SPORT GEQ HOSTCOND + { + $$ = alloc_node(SSF_S_GE, $3); + } + | SPORT LEQ HOSTCOND + { + $$ = alloc_node(SSF_S_LE, $3); + } + | SPORT '>' HOSTCOND + { + $$ = alloc_node(SSF_NOT, alloc_node(SSF_S_LE, $3)); + } + | SPORT '<' HOSTCOND + { + $$ = alloc_node(SSF_NOT, alloc_node(SSF_S_GE, $3)); + } + | SPORT '=' HOSTCOND + { + $$ = alloc_node(SSF_SCOND, $3); + } + | SPORT NEQ HOSTCOND + { + $$ = alloc_node(SSF_NOT, alloc_node(SSF_SCOND, $3)); + } + + | AUTOBOUND + { + $$ = alloc_node(SSF_S_AUTO, NULL); + } + | expr '|' expr + { + $$ = alloc_node(SSF_OR, $1); + $$->post = $3; + } + | expr expr + { + $$ = alloc_node(SSF_AND, $1); + $$->post = $2; + } + | expr '&' expr + + { + $$ = alloc_node(SSF_AND, $1); + $$->post = $3; + } + | '!' expr + { + $$ = alloc_node(SSF_NOT, $2); + } + | '(' expr ')' + { + $$ = $2; + } +; +%% + +static char *get_token_from_line(char **ptr) +{ + char *tok, *cp = *ptr; + + while (*cp == ' ' || *cp == '\t') cp++; + + if (*cp == 0) { + *ptr = cp; + return NULL; + } + + tok = cp; + + while (*cp != 0 && *cp != ' ' && *cp != '\t') { + /* Backslash escapes everything. 
*/ + if (*cp == '\\') { + char *tp; + for (tp = cp; tp != tok; tp--) + *tp = *(tp-1); + cp++; + tok++; + if (*cp == 0) + break; + } + cp++; + } + if (*cp) + *cp++ = 0; + *ptr = cp; + return tok; +} + +int yylex(void) +{ + static char argbuf[1024]; + static char *tokptr = argbuf; + static int argc; + char *curtok; + + do { + while (*tokptr == 0) { + tokptr = NULL; + if (argc < yy_argc) { + tokptr = yy_argv[argc]; + argc++; + } else if (yy_fp) { + while (tokptr == NULL) { + if (fgets(argbuf, sizeof(argbuf)-1, yy_fp) == NULL) + return 0; + argbuf[sizeof(argbuf)-1] = 0; + if (strlen(argbuf) == sizeof(argbuf) - 1) { + fprintf(stderr, "Too long line in filter"); + exit(-1); + } + if (argbuf[strlen(argbuf)-1] == '\n') + argbuf[strlen(argbuf)-1] = 0; + if (argbuf[0] == '#' || argbuf[0] == '0') + continue; + tokptr = argbuf; + } + } else { + return 0; + } + } + } while ((curtok = get_token_from_line(&tokptr)) == NULL); + + if (strcmp(curtok, "!") == 0 || + strcmp(curtok, "not") == 0) + return '!'; + if (strcmp(curtok, "&") == 0 || + strcmp(curtok, "&&") == 0 || + strcmp(curtok, "and") == 0) + return '&'; + if (strcmp(curtok, "|") == 0 || + strcmp(curtok, "||") == 0 || + strcmp(curtok, "or") == 0) + return '|'; + if (strcmp(curtok, "(") == 0) + return '('; + if (strcmp(curtok, ")") == 0) + return ')'; + if (strcmp(curtok, "dst") == 0) + return DCOND; + if (strcmp(curtok, "src") == 0) + return SCOND; + if (strcmp(curtok, "dport") == 0) + return DPORT; + if (strcmp(curtok, "sport") == 0) + return SPORT; + if (strcmp(curtok, ">=") == 0 || + strcmp(curtok, "ge") == 0 || + strcmp(curtok, "geq") == 0) + return GEQ; + if (strcmp(curtok, "<=") == 0 || + strcmp(curtok, "le") == 0 || + strcmp(curtok, "leq") == 0) + return LEQ; + if (strcmp(curtok, "!=") == 0 || + strcmp(curtok, "ne") == 0 || + strcmp(curtok, "neq") == 0) + return NEQ; + if (strcmp(curtok, "=") == 0 || + strcmp(curtok, "==") == 0 || + strcmp(curtok, "eq") == 0) + return '='; + if (strcmp(curtok, ">") == 0 || + 
strcmp(curtok, "gt") == 0) + return '>'; + if (strcmp(curtok, "<") == 0 || + strcmp(curtok, "lt") == 0) + return '<'; + if (strcmp(curtok, "autobound") == 0) + return AUTOBOUND; + yylval = (void*)parse_hostcond(curtok); + if (yylval == NULL) { + fprintf(stderr, "Cannot parse dst/src address.\n"); + exit(1); + } + return HOSTCOND; +} + +int ssfilter_parse(struct ssfilter **f, int argc, char **argv, FILE *fp) +{ + yy_argc = argc; + yy_argv = argv; + yy_fp = fp; + yy_ret = f; + + if (yyparse()) { + fprintf(stderr, " Sorry.\n"); + return -1; + } + return 0; +} diff --git a/tc/Makefile b/tc/Makefile index e69de29b..ec1d3399 100644 --- a/tc/Makefile +++ b/tc/Makefile @@ -0,0 +1,54 @@ +TCOBJ=tc.o tc_qdisc.o tc_class.o tc_filter.o tc_util.o m_police.o m_estimator.o + +include ../Config + +TCMODULES := +TCMODULES += q_fifo.o +TCMODULES += q_sfq.o +TCMODULES += q_red.o +TCMODULES += q_prio.o +TCMODULES += q_tbf.o +TCMODULES += q_cbq.o +TCMODULES += f_rsvp.o +TCMODULES += f_u32.o +TCMODULES += f_route.o +TCMODULES += f_fw.o +ifeq ($(TC_CONFIG_DIFFSERV),y) + TCMODULES += q_dsmark.o + TCMODULES += q_gred.o + TCMODULES += f_tcindex.o + TCMODULES += q_ingress.o +endif +ifeq ($(TC_CONFIG_ATM),y) + TCMODULES += q_atm.o + LDLIBS += -latm +endif + +#TCMODULES += q_csz.o +#TCMODULES += q_hpfq.o +#TCMODULES += q_hfsc.o + +TCOBJ += $(TCMODULES) + +TCLIB := tc_core.o +TCLIB += tc_red.o +TCLIB += tc_cbq.o +TCLIB += tc_estimator.o + +LDLIBS += -L. -ltc -lm -ldl +LDFLAGS += -Wl,-export-dynamic + +all: libtc.a tc + +tc: $(TCOBJ) $(LIBNETLINK) $(LIBUTIL) $(TCLIB) + +libtc.a: $(TCLIB) + $(AR) rcs $@ $(TCLIB) + +install: all + install -m 0755 -s tc $(DESTDIR)$(SBINDIR) + + +clean: + rm -f $(TCOBJ) $(TCLIB) libtc.a tc + diff --git a/tc/README.last b/tc/README.last index e69de29b..9400438a 100644 --- a/tc/README.last +++ b/tc/README.last @@ -0,0 +1,47 @@ +Kernel code and interface. 
+-------------------------- + +* Compile time switches + +There is only one, but very important, compile time switch. +It is not settable by "make config", but should be selected +manually and after a bit of thinking in <include/net/pkt_sched.h> + +PSCHED_CLOCK_SOURCE can take three values: + + PSCHED_GETTIMEOFDAY + PSCHED_JIFFIES + PSCHED_CPU + + + PSCHED_GETTIMEOFDAY + +Default setting is the most conservative PSCHED_GETTIMEOFDAY. +It is very slow both because of weird slowness of do_gettimeofday() +and because it forces code to use unnatural "timeval" format, +where microseconds and seconds fields are separate. +Besides that, it will misbehave when delays exceed 2 seconds +(e.g. very slow links or classes bound to a small slice of bandwidth). +To sum up: as soon as you get it working, select the correct clock +source and forget about PSCHED_GETTIMEOFDAY forever. + + + PSCHED_JIFFIES + +Clock is derived from jiffies. On architectures with HZ=100 +granularity of this clock is not enough to make reasonable +bindings to real time. However, taking into account Linux +architecture problems, which force us to use artificial +integrated clock in any case, this switch is not so bad +for scheduling even on high speed networks, though policing +is not reliable. + + + PSCHED_CPU + +It is available only for alpha and pentiums with correct +CPU timestamp. It is the fastest way, use it when it is available, +but remember: not all pentiums have this facility, and +a lot of them have clock, broken by APM etc. etc. + + diff --git a/tc/f_fw.c b/tc/f_fw.c index e69de29b..3c5e3e2f 100644 --- a/tc/f_fw.c +++ b/tc/f_fw.c @@ -0,0 +1,116 @@ +/* + * f_fw.c FW filter. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version.
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "utils.h" +#include "tc_util.h" + +static void explain(void) +{ + fprintf(stderr, "Usage: ... fw [ classid CLASSID ] [ police POLICE_SPEC ]\n"); + fprintf(stderr, " POLICE_SPEC := ... look at TBF\n"); + fprintf(stderr, " CLASSID := X:Y\n"); +} + +#define usage() return(-1) + +static int fw_parse_opt(struct filter_util *qu, char *handle, int argc, char **argv, struct nlmsghdr *n) +{ + struct tc_police tp; + struct tcmsg *t = NLMSG_DATA(n); + struct rtattr *tail; + + memset(&tp, 0, sizeof(tp)); + + if (handle) { + if (get_u32(&t->tcm_handle, handle, 0)) { + fprintf(stderr, "Illegal \"handle\"\n"); + return -1; + } + } + + if (argc == 0) + return 0; + + tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len)); + addattr_l(n, 4096, TCA_OPTIONS, NULL, 0); + + while (argc > 0) { + if (matches(*argv, "classid") == 0 || + matches(*argv, "flowid") == 0) { + unsigned handle; + NEXT_ARG(); + if (get_tc_classid(&handle, *argv)) { + fprintf(stderr, "Illegal \"classid\"\n"); + return -1; + } + addattr_l(n, 4096, TCA_FW_CLASSID, &handle, 4); + } else if (matches(*argv, "police") == 0) { + NEXT_ARG(); + if (parse_police(&argc, &argv, TCA_FW_POLICE, n)) { + fprintf(stderr, "Illegal \"police\"\n"); + return -1; + } + continue; + } else if (strcmp(*argv, "help") == 0) { + explain(); + return -1; + } else { + fprintf(stderr, "What is \"%s\"?\n", *argv); + explain(); + return -1; + } + argc--; argv++; + } + tail->rta_len = (((void*)n)+n->nlmsg_len) - (void*)tail; + return 0; +} + +static int fw_print_opt(struct filter_util *qu, FILE *f, struct rtattr *opt, __u32 handle) +{ + struct rtattr *tb[TCA_FW_MAX+1]; + + if (opt == NULL) + return 0; + + memset(tb, 0, sizeof(tb)); + if (opt) + parse_rtattr(tb, 
TCA_FW_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)); + + if (handle) + fprintf(f, "handle 0x%x ", handle); + + if (tb[TCA_FW_CLASSID]) { + SPRINT_BUF(b1); + fprintf(f, "classid %s ", sprint_tc_classid(*(__u32*)RTA_DATA(tb[TCA_FW_CLASSID]), b1)); + } + + if (tb[TCA_FW_POLICE]) + tc_print_police(f, tb[TCA_FW_POLICE]); + return 0; +} + +struct filter_util fw_util = { + NULL, + "fw", + fw_parse_opt, + fw_print_opt, +}; diff --git a/tc/f_route.c b/tc/f_route.c index e69de29b..f13c28b5 100644 --- a/tc/f_route.c +++ b/tc/f_route.c @@ -0,0 +1,175 @@ +/* + * f_route.c ROUTE filter. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "utils.h" +#include "rt_names.h" +#include "tc_util.h" + +static void explain(void) +{ + fprintf(stderr, "Usage: ... route [ from REALM | fromif TAG ] [ to REALM ]\n"); + fprintf(stderr, " [ flowid CLASSID ] [ police POLICE_SPEC ]\n"); + fprintf(stderr, " POLICE_SPEC := ... 
look at TBF\n"); + fprintf(stderr, " CLASSID := X:Y\n"); +} + +#define usage() return(-1) + +static int route_parse_opt(struct filter_util *qu, char *handle, int argc, char **argv, struct nlmsghdr *n) +{ + struct tc_police tp; + struct tcmsg *t = NLMSG_DATA(n); + struct rtattr *tail; + __u32 fh = 0xFFFF8000; + __u32 order = 0; + + memset(&tp, 0, sizeof(tp)); + + if (handle) { + if (get_u32(&t->tcm_handle, handle, 0)) { + fprintf(stderr, "Illegal \"handle\"\n"); + return -1; + } + } + + if (argc == 0) + return 0; + + tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len)); + addattr_l(n, 4096, TCA_OPTIONS, NULL, 0); + + while (argc > 0) { + if (matches(*argv, "to") == 0) { + __u32 id; + NEXT_ARG(); + if (rtnl_rtrealm_a2n(&id, *argv)) { + fprintf(stderr, "Illegal \"to\"\n"); + return -1; + } + addattr_l(n, 4096, TCA_ROUTE4_TO, &id, 4); + fh &= ~0x80FF; + fh |= id&0xFF; + } else if (matches(*argv, "from") == 0) { + __u32 id; + NEXT_ARG(); + if (rtnl_rtrealm_a2n(&id, *argv)) { + fprintf(stderr, "Illegal \"from\"\n"); + return -1; + } + addattr_l(n, 4096, TCA_ROUTE4_FROM, &id, 4); + fh &= 0xFFFF; + fh |= id<<16; + } else if (matches(*argv, "fromif") == 0) { + struct rtnl_handle rth; + __u32 id; + NEXT_ARG(); + if (rtnl_open(&rth, 0) == 0) { + ll_init_map(&rth); + rtnl_close(&rth); + } + if ((id=ll_name_to_index(*argv)) <= 0) { + fprintf(stderr, "Illegal \"fromif\"\n"); + return -1; + } + addattr_l(n, 4096, TCA_ROUTE4_IIF, &id, 4); + fh &= 0xFFFF; + fh |= (0x8000|id)<<16; + } else if (matches(*argv, "classid") == 0 || + strcmp(*argv, "flowid") == 0) { + unsigned handle; + NEXT_ARG(); + if (get_tc_classid(&handle, *argv)) { + fprintf(stderr, "Illegal \"classid\"\n"); + return -1; + } + addattr_l(n, 4096, TCA_ROUTE4_CLASSID, &handle, 4); + } else if (matches(*argv, "police") == 0) { + NEXT_ARG(); + if (parse_police(&argc, &argv, TCA_ROUTE4_POLICE, n)) { + fprintf(stderr, "Illegal \"police\"\n"); + return -1; + } + continue; + } else if (matches(*argv, "order") == 0) 
{ + NEXT_ARG(); + if (get_u32(&order, *argv, 0)) { + fprintf(stderr, "Illegal \"order\"\n"); + return -1; + } + } else if (strcmp(*argv, "help") == 0) { + explain(); + return -1; + } else { + fprintf(stderr, "What is \"%s\"?\n", *argv); + explain(); + return -1; + } + argc--; argv++; + } + tail->rta_len = (((void*)n)+n->nlmsg_len) - (void*)tail; + if (order) { + fh &= ~0x7F00; + fh |= (order<<8)&0x7F00; + } + if (!t->tcm_handle) + t->tcm_handle = fh; + return 0; +} + +static int route_print_opt(struct filter_util *qu, FILE *f, struct rtattr *opt, __u32 handle) +{ + struct rtattr *tb[TCA_ROUTE4_MAX+1]; + SPRINT_BUF(b1); + + if (opt == NULL) + return 0; + + memset(tb, 0, sizeof(tb)); + if (opt) + parse_rtattr(tb, TCA_ROUTE4_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)); + + if (handle) + fprintf(f, "fh 0x%08x ", handle); + if (handle&0x7F00) + fprintf(f, "order %d ", (handle>>8)&0x7F); + + if (tb[TCA_ROUTE4_CLASSID]) { + SPRINT_BUF(b1); + fprintf(f, "flowid %s ", sprint_tc_classid(*(__u32*)RTA_DATA(tb[TCA_ROUTE4_CLASSID]), b1)); + } + if (tb[TCA_ROUTE4_TO]) + fprintf(f, "to %s ", rtnl_rtrealm_n2a(*(__u32*)RTA_DATA(tb[TCA_ROUTE4_TO]), b1, sizeof(b1))); + if (tb[TCA_ROUTE4_FROM]) + fprintf(f, "from %s ", rtnl_rtrealm_n2a(*(__u32*)RTA_DATA(tb[TCA_ROUTE4_FROM]), b1, sizeof(b1))); + if (tb[TCA_ROUTE4_IIF]) + fprintf(f, "fromif %s", ll_index_to_name(*(int*)RTA_DATA(tb[TCA_ROUTE4_IIF]))); + if (tb[TCA_ROUTE4_POLICE]) + tc_print_police(f, tb[TCA_ROUTE4_POLICE]); + return 0; +} + +struct filter_util route_util = { + NULL, + "route", + route_parse_opt, + route_print_opt, +}; diff --git a/tc/f_rsvp.c b/tc/f_rsvp.c index e69de29b..3d9b5283 100644 --- a/tc/f_rsvp.c +++ b/tc/f_rsvp.c @@ -0,0 +1,408 @@ +/* + * q_rsvp.c RSVP filter. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "rt_names.h" +#include "utils.h" +#include "tc_util.h" + +static void explain(void) +{ + fprintf(stderr, "Usage: ... rsvp ipproto PROTOCOL session DST[/PORT | GPI ]\n"); + fprintf(stderr, " [ sender SRC[/PORT | GPI ]\n"); + fprintf(stderr, " [ classid CLASSID ] [ police POLICE_SPEC ]\n"); + fprintf(stderr, " [ tunnelid ID ] [ tunnel ID skip NUMBER ]\n"); + fprintf(stderr, "Where: GPI := { flowlabel NUMBER | spi/ah SPI | spi/esp SPI |\n"); + fprintf(stderr, " u{8|16|32} NUMBER mask MASK at OFFSET}\n"); + fprintf(stderr, " POLICE_SPEC := ... look at TBF\n"); + fprintf(stderr, " FILTERID := X:Y\n"); +} + +#define usage() return(-1) + +int get_addr_and_pi(int *argc_p, char ***argv_p, inet_prefix * addr, + struct tc_rsvp_pinfo *pinfo, int dir, int family) +{ + int argc = *argc_p; + char **argv = *argv_p; + char *p = strchr(*argv, '/'); + struct tc_rsvp_gpi *pi = dir ? 
&pinfo->dpi : &pinfo->spi; + + if (p) { + __u16 tmp; + + if (get_u16(&tmp, p+1, 0)) + return -1; + + if (dir == 0) { + /* Source port: u16 at offset 0 */ + pi->key = htonl(((__u32)tmp)<<16); + pi->mask = htonl(0xFFFF0000); + } else { + /* Destination port: u16 at offset 2 */ + pi->key = htonl(((__u32)tmp)); + pi->mask = htonl(0x0000FFFF); + } + pi->offset = 0; + *p = 0; + } + if (get_addr_1(addr, *argv, family)) + return -1; + if (p) + *p = '/'; + + argc--; argv++; + + if (pi->mask || argc <= 0) + goto done; + + if (strcmp(*argv, "spi/ah") == 0 || + strcmp(*argv, "gpi/ah") == 0) { + __u32 gpi; + NEXT_ARG(); + if (get_u32(&gpi, *argv, 0)) + return -1; + pi->mask = htonl(0xFFFFFFFF); + pi->key = htonl(gpi); + pi->offset = 4; + if (pinfo->protocol == 0) + pinfo->protocol = IPPROTO_AH; + argc--; argv++; + } else if (strcmp(*argv, "spi/esp") == 0 || + strcmp(*argv, "gpi/esp") == 0) { + __u32 gpi; + NEXT_ARG(); + if (get_u32(&gpi, *argv, 0)) + return -1; + pi->mask = htonl(0xFFFFFFFF); + pi->key = htonl(gpi); + pi->offset = 0; + if (pinfo->protocol == 0) + pinfo->protocol = IPPROTO_ESP; + argc--; argv++; + } else if (strcmp(*argv, "flowlabel") == 0) { + __u32 flabel; + NEXT_ARG(); + if (get_u32(&flabel, *argv, 0)) + return -1; + if (family != AF_INET6) + return -1; + pi->mask = htonl(0x000FFFFF); + pi->key = htonl(flabel) & pi->mask; + pi->offset = -40; + argc--; argv++; + } else if (strcmp(*argv, "u32") == 0 || + strcmp(*argv, "u16") == 0 || + strcmp(*argv, "u8") == 0) { + int sz = 1; + __u32 tmp; + __u32 mask = 0xff; + if (strcmp(*argv, "u32") == 0) { + sz = 4; + mask = 0xffff; + } else if (strcmp(*argv, "u16") == 0) { + mask = 0xffffffff; + sz = 2; + } + NEXT_ARG(); + if (get_u32(&tmp, *argv, 0)) + return -1; + argc--; argv++; + if (strcmp(*argv, "mask") == 0) { + NEXT_ARG(); + if (get_u32(&mask, *argv, 16)) + return -1; + argc--; argv++; + } + if (strcmp(*argv, "at") == 0) { + NEXT_ARG(); + if (get_integer(&pi->offset, *argv, 0)) + return -1; + argc--; argv++; + } + 
if (sz == 1) { + if ((pi->offset & 3) == 0) { + mask <<= 24; + tmp <<= 24; + } else if ((pi->offset & 3) == 1) { + mask <<= 16; + tmp <<= 16; + } else if ((pi->offset & 3) == 3) { + mask <<= 8; + tmp <<= 8; + } + } else if (sz == 2) { + if ((pi->offset & 3) == 0) { + mask <<= 16; + tmp <<= 16; + } + } + pi->offset &= ~3; + pi->mask = htonl(mask); + pi->key = htonl(tmp) & pi->mask; + } + +done: + *argc_p = argc; + *argv_p = argv; + return 0; +} + + +static int rsvp_parse_opt(struct filter_util *qu, char *handle, int argc, char **argv, struct nlmsghdr *n) +{ + int family = strcmp(qu->id, "rsvp") == 0 ? AF_INET : AF_INET6; + struct tc_rsvp_pinfo pinfo; + struct tc_police tp; + struct tcmsg *t = NLMSG_DATA(n); + int pinfo_ok = 0; + struct rtattr *tail; + + memset(&pinfo, 0, sizeof(pinfo)); + memset(&tp, 0, sizeof(tp)); + + if (handle) { + if (get_u32(&t->tcm_handle, handle, 0)) { + fprintf(stderr, "Illegal \"handle\"\n"); + return -1; + } + } + + if (argc == 0) + return 0; + + tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len)); + addattr_l(n, 4096, TCA_OPTIONS, NULL, 0); + + while (argc > 0) { + if (matches(*argv, "session") == 0) { + inet_prefix addr; + NEXT_ARG(); + if (get_addr_and_pi(&argc, &argv, &addr, &pinfo, 1, family)) { + fprintf(stderr, "Illegal \"session\"\n"); + return -1; + } + addattr_l(n, 4096, TCA_RSVP_DST, &addr.data, addr.bytelen); + if (pinfo.dpi.mask || pinfo.protocol) + pinfo_ok++; + continue; + } else if (matches(*argv, "sender") == 0 || + matches(*argv, "flowspec") == 0) { + inet_prefix addr; + NEXT_ARG(); + if (get_addr_and_pi(&argc, &argv, &addr, &pinfo, 0, family)) { + fprintf(stderr, "Illegal \"sender\"\n"); + return -1; + } + addattr_l(n, 4096, TCA_RSVP_SRC, &addr.data, addr.bytelen); + if (pinfo.spi.mask || pinfo.protocol) + pinfo_ok++; + continue; + } else if (matches("ipproto", *argv) == 0) { + int num; + NEXT_ARG(); + num = inet_proto_a2n(*argv); + if (num < 0) { + fprintf(stderr, "Illegal \"ipproto\"\n"); + return -1; + } + 
pinfo.protocol = num; + pinfo_ok++; + } else if (matches(*argv, "classid") == 0 || + strcmp(*argv, "flowid") == 0) { + unsigned handle; + NEXT_ARG(); + if (get_tc_classid(&handle, *argv)) { + fprintf(stderr, "Illegal \"classid\"\n"); + return -1; + } + addattr_l(n, 4096, TCA_RSVP_CLASSID, &handle, 4); + } else if (strcmp(*argv, "tunnelid") == 0) { + unsigned tid; + NEXT_ARG(); + if (get_unsigned(&tid, *argv, 0)) { + fprintf(stderr, "Illegal \"tunnelid\"\n"); + return -1; + } + pinfo.tunnelid = tid; + pinfo_ok++; + } else if (strcmp(*argv, "tunnel") == 0) { + unsigned tid; + NEXT_ARG(); + if (get_unsigned(&tid, *argv, 0)) { + fprintf(stderr, "Illegal \"tunnel\"\n"); + return -1; + } + addattr_l(n, 4096, TCA_RSVP_CLASSID, &tid, 4); + NEXT_ARG(); + if (strcmp(*argv, "skip") == 0) { + NEXT_ARG(); + } + if (get_unsigned(&tid, *argv, 0)) { + fprintf(stderr, "Illegal \"skip\"\n"); + return -1; + } + pinfo.tunnelhdr = tid; + pinfo_ok++; + } else if (matches(*argv, "police") == 0) { + NEXT_ARG(); + if (parse_police(&argc, &argv, TCA_RSVP_POLICE, n)) { + fprintf(stderr, "Illegal \"police\"\n"); + return -1; + } + continue; + } else if (strcmp(*argv, "help") == 0) { + explain(); + return -1; + } else { + fprintf(stderr, "What is \"%s\"?\n", *argv); + explain(); + return -1; + } + argc--; argv++; + } + + if (pinfo_ok) + addattr_l(n, 4096, TCA_RSVP_PINFO, &pinfo, sizeof(pinfo)); + tail->rta_len = (((void*)n)+n->nlmsg_len) - (void*)tail; + return 0; +} + +static char * sprint_spi(struct tc_rsvp_gpi *pi, int dir, char *buf) +{ + if (pi->offset == 0) { + if (dir && pi->mask == htonl(0xFFFF)) { + snprintf(buf, SPRINT_BSIZE-1, "/%d", htonl(pi->key)); + return buf; + } + if (!dir && pi->mask == htonl(0xFFFF0000)) { + snprintf(buf, SPRINT_BSIZE-1, "/%d", htonl(pi->key)>>16); + return buf; + } + if (pi->mask == htonl(0xFFFFFFFF)) { + snprintf(buf, SPRINT_BSIZE-1, " spi/esp 0x%08x", htonl(pi->key)); + return buf; + } + } else if (pi->offset == 4 && pi->mask == htonl(0xFFFFFFFF)) { + 
snprintf(buf, SPRINT_BSIZE-1, " spi/ah 0x%08x", htonl(pi->key)); + return buf; + } else if (pi->offset == -40 && pi->mask == htonl(0x000FFFFF)) { + snprintf(buf, SPRINT_BSIZE-1, " flowlabel 0x%05x", htonl(pi->key)); + return buf; + } + snprintf(buf, SPRINT_BSIZE-1, " u32 0x%08x mask %08x at %d", + htonl(pi->key), htonl(pi->mask), pi->offset); + return buf; +} + +static int rsvp_print_opt(struct filter_util *qu, FILE *f, struct rtattr *opt, __u32 handle) +{ + int family = strcmp(qu->id, "rsvp") == 0 ? AF_INET : AF_INET6; + struct rtattr *tb[TCA_RSVP_MAX+1]; + struct tc_rsvp_pinfo *pinfo = NULL; + + if (opt == NULL) + return 0; + + memset(tb, 0, sizeof(tb)); + if (opt) + parse_rtattr(tb, TCA_RSVP_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)); + + if (handle) + fprintf(f, "fh 0x%08x ", handle); + + if (tb[TCA_RSVP_PINFO]) { + if (RTA_PAYLOAD(tb[TCA_RSVP_PINFO]) < sizeof(*pinfo)) + return -1; + + pinfo = RTA_DATA(tb[TCA_RSVP_PINFO]); + } + + if (tb[TCA_RSVP_CLASSID]) { + SPRINT_BUF(b1); + if (!pinfo || pinfo->tunnelhdr == 0) + fprintf(f, "flowid %s ", sprint_tc_classid(*(__u32*)RTA_DATA(tb[TCA_RSVP_CLASSID]), b1)); + else + fprintf(f, "tunnel %d skip %d ", *(__u32*)RTA_DATA(tb[TCA_RSVP_CLASSID]), pinfo->tunnelhdr); + } else if (pinfo && pinfo->tunnelhdr) + fprintf(f, "tunnel [BAD] skip %d ", pinfo->tunnelhdr); + + if (tb[TCA_RSVP_DST]) { + char buf[128]; + fprintf(f, "session "); + if (inet_ntop(family, RTA_DATA(tb[TCA_RSVP_DST]), buf, sizeof(buf)) == 0) + fprintf(f, " [INVALID DADDR] "); + else + fprintf(f, "%s", buf); + if (pinfo && pinfo->dpi.mask) { + SPRINT_BUF(b2); + fprintf(f, "%s ", sprint_spi(&pinfo->dpi, 1, b2)); + } else + fprintf(f, " "); + } else { + if (pinfo && pinfo->dpi.mask) { + SPRINT_BUF(b2); + fprintf(f, "session [NONE]%s ", sprint_spi(&pinfo->dpi, 1, b2)); + } else + fprintf(f, "session NONE "); + } + + if (pinfo && pinfo->protocol) { + SPRINT_BUF(b1); + fprintf(f, "ipproto %s ", inet_proto_n2a(pinfo->protocol, b1, sizeof(b1))); + } + if (pinfo && 
pinfo->tunnelid) + fprintf(f, "tunnelid %d ", pinfo->tunnelid); + if (tb[TCA_RSVP_SRC]) { + char buf[128]; + fprintf(f, "sender "); + if (inet_ntop(family, RTA_DATA(tb[TCA_RSVP_SRC]), buf, sizeof(buf)) == 0) { + fprintf(f, "[BAD]"); + } else { + fprintf(f, " %s", buf); + } + if (pinfo && pinfo->spi.mask) { + SPRINT_BUF(b2); + fprintf(f, "%s ", sprint_spi(&pinfo->spi, 0, b2)); + } else + fprintf(f, " "); + } else if (pinfo && pinfo->spi.mask) { + SPRINT_BUF(b2); + fprintf(f, "sender [NONE]%s ", sprint_spi(&pinfo->spi, 0, b2)); + } + if (tb[TCA_RSVP_POLICE]) + tc_print_police(f, tb[TCA_RSVP_POLICE]); + return 0; +} + +struct filter_util rsvp_util = { + NULL, + "rsvp", + rsvp_parse_opt, + rsvp_print_opt, +}; + +struct filter_util rsvp6_util = { + NULL, + "rsvp6", + rsvp_parse_opt, + rsvp_print_opt, +}; diff --git a/tc/f_tcindex.c b/tc/f_tcindex.c index e69de29b..59397487 100644 --- a/tc/f_tcindex.c +++ b/tc/f_tcindex.c @@ -0,0 +1,186 @@ +/* + * f_tcindex.c Traffic control index filter + * + * Written 1998,1999 by Werner Almesberger + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <string.h> +#include <netinet/in.h> + +#include "utils.h" +#include "tc_util.h" + +static void explain(void) +{ + fprintf(stderr," Usage: ... 
tcindex [ hash SIZE ] [ mask MASK ]" + " [ shift SHIFT ]\n"); + fprintf(stderr," [ pass_on | fall_through ]\n"); + fprintf(stderr," [ classid CLASSID ] " + "[ police POLICE_SPEC ]\n"); +} + + +#define usage() return(-1) + + +static int tcindex_parse_opt(struct filter_util *qu, char *handle, int argc, + char **argv, struct nlmsghdr *n) +{ + struct tcmsg *t = NLMSG_DATA(n); + struct rtattr *tail; + char *end; + + if (handle) { + t->tcm_handle = strtoul(handle,&end,0); + if (*end) { + fprintf(stderr, "Illegal filter ID\n"); + return -1; + } + } + if (!argc) return 0; + tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len)); + addattr_l(n,4096,TCA_OPTIONS,NULL,0); + while (argc) { + if (!strcmp(*argv,"hash")) { + int hash; + + NEXT_ARG(); + hash = strtoul(*argv,&end,0); + if (*end || !hash || hash > 0x10000) { + explain(); + return -1; + } + addattr_l(n,4096,TCA_TCINDEX_HASH,&hash,sizeof(hash)); + } + else if (!strcmp(*argv,"mask")) { + __u16 mask; + + NEXT_ARG(); + mask = strtoul(*argv,&end,0); + if (*end) { + explain(); + return -1; + } + addattr_l(n,4096,TCA_TCINDEX_MASK,&mask,sizeof(mask)); + } + else if (!strcmp(*argv,"shift")) { + int shift; + + NEXT_ARG(); + shift = strtoul(*argv,&end,0); + if (*end) { + explain(); + return -1; + } + addattr_l(n,4096,TCA_TCINDEX_SHIFT,&shift, + sizeof(shift)); + } + else if (!strcmp(*argv,"fall_through")) { + int value = 1; + + addattr_l(n,4096,TCA_TCINDEX_FALL_THROUGH,&value, + sizeof(value)); + } + else if (!strcmp(*argv,"pass_on")) { + int value = 0; + + addattr_l(n,4096,TCA_TCINDEX_FALL_THROUGH,&value, + sizeof(value)); + } + else if (!strcmp(*argv,"classid")) { + __u32 handle; + + NEXT_ARG(); + if (get_tc_classid(&handle,*argv)) { + fprintf(stderr, "Illegal \"classid\"\n"); + return -1; + } + addattr_l(n, 4096, TCA_TCINDEX_CLASSID, &handle, 4); + } + else if (!strcmp(*argv,"police")) { + NEXT_ARG(); + if (parse_police(&argc, &argv, TCA_TCINDEX_POLICE, n)) { + fprintf(stderr, "Illegal \"police\"\n"); + return -1; + } 
+ continue; + } + else { + explain(); + return -1; + } + argc--; + argv++; + } + tail->rta_len = (((void*)n)+n->nlmsg_len) - (void*)tail; + return 0; +} + + +static int tcindex_print_opt(struct filter_util *qu, FILE *f, + struct rtattr *opt, __u32 handle) +{ + struct rtattr *tb[TCA_TCINDEX_MAX+1]; + + if (!opt) return 0; + + memset(tb, 0, sizeof(tb)); + parse_rtattr(tb, TCA_TCINDEX_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)); + + if (handle != ~0) fprintf(f,"handle 0x%04x ",handle); + if (tb[TCA_TCINDEX_HASH]) { + __u16 hash; + + if (RTA_PAYLOAD(tb[TCA_TCINDEX_HASH]) < sizeof(hash)) + return -1; + hash = *(__u16 *) RTA_DATA(tb[TCA_TCINDEX_HASH]); + fprintf(f,"hash %d ",hash); + } + if (tb[TCA_TCINDEX_MASK]) { + __u16 mask; + + if (RTA_PAYLOAD(tb[TCA_TCINDEX_MASK]) < sizeof(mask)) + return -1; + mask = *(__u16 *) RTA_DATA(tb[TCA_TCINDEX_MASK]); + fprintf(f,"mask 0x%04x ",mask); + } + if (tb[TCA_TCINDEX_SHIFT]) { + int shift; + + if (RTA_PAYLOAD(tb[TCA_TCINDEX_SHIFT]) < sizeof(shift)) + return -1; + shift = *(int *) RTA_DATA(tb[TCA_TCINDEX_SHIFT]); + fprintf(f,"shift %d ",shift); + } + if (tb[TCA_TCINDEX_FALL_THROUGH]) { + int fall_through; + + if (RTA_PAYLOAD(tb[TCA_TCINDEX_FALL_THROUGH]) < + sizeof(fall_through)) + return -1; + fall_through = *(int *) RTA_DATA(tb[TCA_TCINDEX_FALL_THROUGH]); + fprintf(f,fall_through ? "fall_through " : "pass_on "); + } + if (tb[TCA_TCINDEX_CLASSID]) { + SPRINT_BUF(b1); + fprintf(f, "classid %s ",sprint_tc_classid(*(__u32 *) + RTA_DATA(tb[TCA_TCINDEX_CLASSID]), b1)); + } + if (tb[TCA_TCINDEX_POLICE]) { + fprintf(f, "\n"); + tc_print_police(f, tb[TCA_TCINDEX_POLICE]); + } + return 0; +} + +struct filter_util tcindex_util = { + NULL, + "tcindex", + tcindex_parse_opt, + tcindex_print_opt, +}; diff --git a/tc/f_u32.c b/tc/f_u32.c index e69de29b..3e76e9cf 100644 --- a/tc/f_u32.c +++ b/tc/f_u32.c @@ -0,0 +1,977 @@ +/* + * q_u32.c U32 filter. 
+ * + * This program is free software; you can u32istribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "utils.h" +#include "tc_util.h" + +static void explain(void) +{ + fprintf(stderr, "Usage: ... u32 [ match SELECTOR ... ] [ link HTID ] [ classid CLASSID ]\n"); + fprintf(stderr, " [ police POLICE_SPEC ] [ offset OFFSET_SPEC ]\n"); + fprintf(stderr, " [ ht HTID ] [ hashkey HASHKEY_SPEC ]\n"); + fprintf(stderr, " [ sample SAMPLE ]\n"); + fprintf(stderr, "or u32 divisor DIVISOR\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Where: SELECTOR := SAMPLE SAMPLE ...\n"); + fprintf(stderr, " SAMPLE := { ip | ip6 | udp | tcp | icmp | u{32|16|8} } SAMPLE_ARGS\n"); + fprintf(stderr, " FILTERID := X:Y:Z\n"); +} + +#define usage() return(-1) + +int get_u32_handle(__u32 *handle, char *str) +{ + __u32 htid=0, hash=0, nodeid=0; + char *tmp = strchr(str, ':'); + + if (tmp == NULL) { + if (memcmp("0x", str, 2) == 0) + return get_u32(handle, str, 16); + return -1; + } + htid = strtoul(str, &tmp, 16); + if (tmp == str && *str != ':' && *str != 0) + return -1; + if (htid>=0x1000) + return -1; + if (*tmp) { + str = tmp+1; + hash = strtoul(str, &tmp, 16); + if (tmp == str && *str != ':' && *str != 0) + return -1; + if (hash>=0x100) + return -1; + if (*tmp) { + str = tmp+1; + nodeid = strtoul(str, &tmp, 16); + if (tmp == str && *str != 0) + return -1; + if (nodeid>=0x1000) + return -1; + } + } + *handle = (htid<<20)|(hash<<12)|nodeid; + return 0; +} + +char * sprint_u32_handle(__u32 handle, char *buf) +{ + int bsize = SPRINT_BSIZE-1; + __u32 htid = 
TC_U32_HTID(handle); + __u32 hash = TC_U32_HASH(handle); + __u32 nodeid = TC_U32_NODE(handle); + char *b = buf; + + if (handle == 0) { + snprintf(b, bsize, "none"); + return b; + } + if (htid) { + int l = snprintf(b, bsize, "%x:", htid>>20); + bsize -= l; + b += l; + } + if (nodeid|hash) { + if (hash) { + int l = snprintf(b, bsize, "%x", hash); + bsize -= l; + b += l; + } + if (nodeid) { + int l = snprintf(b, bsize, ":%x", nodeid); + bsize -= l; + b += l; + } + } + if (show_raw) + snprintf(b, bsize, "[%08x] ", handle); + return buf; +} + +static int pack_key(struct tc_u32_sel *sel, __u32 key, __u32 mask, int off, int offmask) +{ + int i; + int hwm = sel->nkeys; + + key &= mask; + + for (i=0; i<hwm; i++) { + if (sel->keys[i].off == off && sel->keys[i].offmask == offmask) { + __u32 intersect = mask&sel->keys[i].mask; + + if ((key^sel->keys[i].val) & intersect) + return -1; + sel->keys[i].val |= key; + sel->keys[i].mask |= mask; + return 0; + } + } + + if (hwm >= 128) + return -1; + if (off % 4) + return -1; + sel->keys[hwm].val = key; + sel->keys[hwm].mask = mask; + sel->keys[hwm].off = off; + sel->keys[hwm].offmask = offmask; + sel->nkeys++; + return 0; +} + +static int pack_key32(struct tc_u32_sel *sel, __u32 key, __u32 mask, int off, int offmask) +{ + key = htonl(key); + mask = htonl(mask); + return pack_key(sel, key, mask, off, offmask); +} + +static int pack_key16(struct tc_u32_sel *sel, __u32 key, __u32 mask, int off, int offmask) +{ + if (key > 0xFFFF || mask > 0xFFFF) + return -1; + + if ((off & 3) == 0) { + key <<= 16; + mask <<= 16; + } + off &= ~3; + key = htonl(key); + mask = htonl(mask); + + return pack_key(sel, key, mask, off, offmask); +} + +static int pack_key8(struct tc_u32_sel *sel, __u32 key, __u32 mask, int off, int offmask) +{ + if (key > 0xFF || mask > 0xFF) + return -1; + + if ((off & 3) == 0) { + key <<= 24; + mask <<= 24; + } else if ((off & 3) == 1) { + key <<= 16; + mask <<= 16; + } else if ((off & 3) == 2) { + key <<= 8; + mask <<= 8; + } 
+ off &= ~3; + key = htonl(key); + mask = htonl(mask); + + return pack_key(sel, key, mask, off, offmask); +} + + +int parse_at(int *argc_p, char ***argv_p, int *off, int *offmask) +{ + int argc = *argc_p; + char **argv = *argv_p; + char *p = *argv; + + if (argc <= 0) + return -1; + + if (strlen(p) > strlen("nexthdr+") && + memcmp(p, "nexthdr+", strlen("nexthdr+")) == 0) { + *offmask = -1; + p += strlen("nexthdr+"); + } else if (matches(*argv, "nexthdr+") == 0) { + NEXT_ARG(); + *offmask = -1; + p = *argv; + } + + if (get_integer(off, p, 0)) + return -1; + argc--; argv++; + + *argc_p = argc; + *argv_p = argv; + return 0; +} + + +static int parse_u32(int *argc_p, char ***argv_p, struct tc_u32_sel *sel, int off, int offmask) +{ + int res = -1; + int argc = *argc_p; + char **argv = *argv_p; + __u32 key; + __u32 mask; + + if (argc < 2) + return -1; + + if (get_u32(&key, *argv, 0)) + return -1; + argc--; argv++; + + if (get_u32(&mask, *argv, 16)) + return -1; + argc--; argv++; + + if (argc > 0 && strcmp(argv[0], "at") == 0) { + NEXT_ARG(); + if (parse_at(&argc, &argv, &off, &offmask)) + return -1; + } + + res = pack_key32(sel, key, mask, off, offmask); + *argc_p = argc; + *argv_p = argv; + return res; +} + +static int parse_u16(int *argc_p, char ***argv_p, struct tc_u32_sel *sel, int off, int offmask) +{ + int res = -1; + int argc = *argc_p; + char **argv = *argv_p; + __u32 key; + __u32 mask; + + if (argc < 2) + return -1; + + if (get_u32(&key, *argv, 0)) + return -1; + argc--; argv++; + + if (get_u32(&mask, *argv, 16)) + return -1; + argc--; argv++; + + if (argc > 0 && strcmp(argv[0], "at") == 0) { + NEXT_ARG(); + if (parse_at(&argc, &argv, &off, &offmask)) + return -1; + } + res = pack_key16(sel, key, mask, off, offmask); + *argc_p = argc; + *argv_p = argv; + return res; +} + +static int parse_u8(int *argc_p, char ***argv_p, struct tc_u32_sel *sel, int off, int offmask) +{ + int res = -1; + int argc = *argc_p; + char **argv = *argv_p; + __u32 key; + __u32 mask; + + if 
(argc < 2) + return -1; + + if (get_u32(&key, *argv, 0)) + return -1; + argc--; argv++; + + if (get_u32(&mask, *argv, 16)) + return -1; + argc--; argv++; + + if (key > 0xFF || mask > 0xFF) + return -1; + + if (argc > 0 && strcmp(argv[0], "at") == 0) { + NEXT_ARG(); + if (parse_at(&argc, &argv, &off, &offmask)) + return -1; + } + + res = pack_key8(sel, key, mask, off, offmask); + *argc_p = argc; + *argv_p = argv; + return res; +} + +static int parse_ip_addr(int *argc_p, char ***argv_p, struct tc_u32_sel *sel, int off) +{ + int res = -1; + int argc = *argc_p; + char **argv = *argv_p; + inet_prefix addr; + __u32 mask; + int offmask = 0; + + if (argc < 1) + return -1; + + if (get_prefix_1(&addr, *argv, AF_INET)) + return -1; + argc--; argv++; + + if (argc > 0 && strcmp(argv[0], "at") == 0) { + NEXT_ARG(); + if (parse_at(&argc, &argv, &off, &offmask)) + return -1; + } + + mask = 0; + if (addr.bitlen) + mask = htonl(0xFFFFFFFF<<(32-addr.bitlen)); + if (pack_key(sel, addr.data[0], mask, off, offmask) < 0) + return -1; + res = 0; + + *argc_p = argc; + *argv_p = argv; + return res; +} + +static int parse_ip6_addr(int *argc_p, char ***argv_p, struct tc_u32_sel *sel, int off) +{ + int res = -1; + int argc = *argc_p; + char **argv = *argv_p; + int plen = 128; + int i; + inet_prefix addr; + int offmask = 0; + + if (argc < 1) + return -1; + + if (get_prefix_1(&addr, *argv, AF_INET6)) + return -1; + argc--; argv++; + + if (argc > 0 && strcmp(argv[0], "at") == 0) { + NEXT_ARG(); + if (parse_at(&argc, &argv, &off, &offmask)) + return -1; + } + + plen = addr.bitlen; + for (i=0; i<plen; i+=32) { + if (((i+31)&~0x1F)<=plen) { + if ((res = pack_key(sel, addr.data[i/32], 0xFFFFFFFF, off+4*(i/32), offmask)) < 0) + return -1; + } else if (i<plen) { + __u32 mask = htonl(0xFFFFFFFF<<(32-(plen-i))); + if ((res = pack_key(sel, addr.data[i/32], mask, off+4*(i/32), offmask)) < 0) + return -1; + } + } + res = 0; + + *argc_p = argc; + *argv_p = argv; + return res; +} + +static int parse_ip(int 
*argc_p, char ***argv_p, struct tc_u32_sel *sel) +{ + int res = -1; + int argc = *argc_p; + char **argv = *argv_p; + + if (argc < 2) + return -1; + + if (strcmp(*argv, "src") == 0) { + NEXT_ARG(); + res = parse_ip_addr(&argc, &argv, sel, 12); + goto done; + } + if (strcmp(*argv, "dst") == 0) { + NEXT_ARG(); + res = parse_ip_addr(&argc, &argv, sel, 16); + goto done; + } + if (strcmp(*argv, "tos") == 0 || + matches(*argv, "dsfield") == 0) { + NEXT_ARG(); + res = parse_u8(&argc, &argv, sel, 1, 0); + goto done; + } + if (strcmp(*argv, "ihl") == 0) { + NEXT_ARG(); + res = parse_u8(&argc, &argv, sel, 0, 0); + goto done; + } + if (strcmp(*argv, "protocol") == 0) { + NEXT_ARG(); + res = parse_u8(&argc, &argv, sel, 9, 0); + goto done; + } + if (matches(*argv, "precedence") == 0) { + NEXT_ARG(); + res = parse_u8(&argc, &argv, sel, 1, 0); + goto done; + } + if (strcmp(*argv, "nofrag") == 0) { + argc--; argv++; + res = pack_key16(sel, 0, 0x3FFF, 6, 0); + goto done; + } + if (strcmp(*argv, "firstfrag") == 0) { + argc--; argv++; + res = pack_key16(sel, 0, 0x1FFF, 6, 0); + goto done; + } + if (strcmp(*argv, "df") == 0) { + argc--; argv++; + res = pack_key16(sel, 0x4000, 0x4000, 6, 0); + goto done; + } + if (strcmp(*argv, "mf") == 0) { + argc--; argv++; + res = pack_key16(sel, 0x2000, 0x2000, 6, 0); + goto done; + } + if (strcmp(*argv, "dport") == 0) { + NEXT_ARG(); + res = parse_u16(&argc, &argv, sel, 22, 0); + goto done; + } + if (strcmp(*argv, "sport") == 0) { + NEXT_ARG(); + res = parse_u16(&argc, &argv, sel, 20, 0); + goto done; + } + if (strcmp(*argv, "icmp_type") == 0) { + NEXT_ARG(); + res = parse_u8(&argc, &argv, sel, 20, 0); + goto done; + } + if (strcmp(*argv, "icmp_code") == 0) { + NEXT_ARG(); + res = parse_u8(&argc, &argv, sel, 21, 0); /* code is the byte after icmp_type (20); fixed offset, so offmask is 0 like every other field here */ + goto done; + } + return -1; + +done: + *argc_p = argc; + *argv_p = argv; + return res; +} + +static int parse_ip6(int *argc_p, char ***argv_p, struct tc_u32_sel *sel) +{ + int res = -1; + int argc = *argc_p; + char **argv = *argv_p; +
+ if (argc < 2) + return -1; + + if (strcmp(*argv, "src") == 0) { + NEXT_ARG(); + res = parse_ip6_addr(&argc, &argv, sel, 8); + goto done; + } + if (strcmp(*argv, "dst") == 0) { + NEXT_ARG(); + res = parse_ip6_addr(&argc, &argv, sel, 24); + goto done; + } + if (strcmp(*argv, "priority") == 0) { + NEXT_ARG(); + res = parse_u8(&argc, &argv, sel, 0, 0); + goto done; + } + if (strcmp(*argv, "protocol") == 0) { + NEXT_ARG(); + res = parse_u8(&argc, &argv, sel, 6, 0); + goto done; + } + if (strcmp(*argv, "flowlabel") == 0) { + NEXT_ARG(); + res = parse_u32(&argc, &argv, sel, 0, 0); + goto done; + } + if (strcmp(*argv, "dport") == 0) { + NEXT_ARG(); + res = parse_u16(&argc, &argv, sel, 42, 0); + goto done; + } + if (strcmp(*argv, "sport") == 0) { + NEXT_ARG(); + res = parse_u16(&argc, &argv, sel, 40, 0); + goto done; + } + if (strcmp(*argv, "icmp_type") == 0) { + NEXT_ARG(); + res = parse_u8(&argc, &argv, sel, 40, 0); + goto done; + } + if (strcmp(*argv, "icmp_code") == 0) { + NEXT_ARG(); + res = parse_u8(&argc, &argv, sel, 41, 0); /* fixed offset like icmp_type at 40; a non-zero offmask would mark the key as nexthdr-relative */ + goto done; + } + return -1; + +done: + *argc_p = argc; + *argv_p = argv; + return res; +} + +#define parse_tcp parse_udp +static int parse_udp(int *argc_p, char ***argv_p, struct tc_u32_sel *sel) +{ + int res = -1; + int argc = *argc_p; + char **argv = *argv_p; + + if (argc < 2) + return -1; + + if (strcmp(*argv, "src") == 0) { + NEXT_ARG(); + res = parse_u16(&argc, &argv, sel, 0, -1); + goto done; + } + if (strcmp(*argv, "dst") == 0) { + NEXT_ARG(); + res = parse_u16(&argc, &argv, sel, 2, -1); + goto done; + } + return -1; + +done: + *argc_p = argc; + *argv_p = argv; + return res; +} + +static int parse_icmp(int *argc_p, char ***argv_p, struct tc_u32_sel *sel) +{ + int res = -1; + int argc = *argc_p; + char **argv = *argv_p; + + if (argc < 2) + return -1; + + if (strcmp(*argv, "type") == 0) { + NEXT_ARG(); + res = parse_u8(&argc, &argv, sel, 0, -1); + goto done; + } + if (strcmp(*argv, "code") == 0) { + NEXT_ARG(); + res = parse_u8(&argc,
&argv, sel, 1, -1); + goto done; + } + return -1; + +done: + *argc_p = argc; + *argv_p = argv; + return res; +} + + + +static int parse_selector(int *argc_p, char ***argv_p, struct tc_u32_sel *sel) +{ + int argc = *argc_p; + char **argv = *argv_p; + int res = -1; + + if (argc <= 0) + return -1; + + if (matches(*argv, "u32") == 0) { + NEXT_ARG(); + res = parse_u32(&argc, &argv, sel, 0, 0); + goto done; + } + if (matches(*argv, "u16") == 0) { + NEXT_ARG(); + res = parse_u16(&argc, &argv, sel, 0, 0); + goto done; + } + if (matches(*argv, "u8") == 0) { + NEXT_ARG(); + res = parse_u8(&argc, &argv, sel, 0, 0); + goto done; + } + if (matches(*argv, "ip") == 0) { + NEXT_ARG(); + res = parse_ip(&argc, &argv, sel); + goto done; + } + if (matches(*argv, "ip6") == 0) { + NEXT_ARG(); + res = parse_ip6(&argc, &argv, sel); + goto done; + } + if (matches(*argv, "udp") == 0) { + NEXT_ARG(); + res = parse_udp(&argc, &argv, sel); + goto done; + } + if (matches(*argv, "tcp") == 0) { + NEXT_ARG(); + res = parse_tcp(&argc, &argv, sel); + goto done; + } + if (matches(*argv, "icmp") == 0) { + NEXT_ARG(); + res = parse_icmp(&argc, &argv, sel); + goto done; + } + return -1; + +done: + *argc_p = argc; + *argv_p = argv; + return res; +} + +static int parse_offset(int *argc_p, char ***argv_p, struct tc_u32_sel *sel) +{ + int argc = *argc_p; + char **argv = *argv_p; + + while (argc > 0) { + if (matches(*argv, "plus") == 0) { + int off; + NEXT_ARG(); + if (get_integer(&off, *argv, 0)) + return -1; + sel->off = off; + sel->flags |= TC_U32_OFFSET; + } else if (matches(*argv, "at") == 0) { + int off; + NEXT_ARG(); + if (get_integer(&off, *argv, 0)) + return -1; + sel->offoff = off; + if (off%2) { + fprintf(stderr, "offset \"at\" must be even\n"); + return -1; + } + sel->flags |= TC_U32_VAROFFSET; + } else if (matches(*argv, "mask") == 0) { + __u16 mask; + NEXT_ARG(); + if (get_u16(&mask, *argv, 16)) + return -1; + sel->offmask = htons(mask); + sel->flags |= TC_U32_VAROFFSET; + } else if 
(matches(*argv, "shift") == 0) { + int shift; + NEXT_ARG(); + if (get_integer(&shift, *argv, 0)) + return -1; + sel->offshift = shift; + sel->flags |= TC_U32_VAROFFSET; + } else if (matches(*argv, "eat") == 0) { + sel->flags |= TC_U32_EAT; + } else { + break; + } + argc--; argv++; + } + + *argc_p = argc; + *argv_p = argv; + return 0; +} + +static int parse_hashkey(int *argc_p, char ***argv_p, struct tc_u32_sel *sel) +{ + int argc = *argc_p; + char **argv = *argv_p; + + while (argc > 0) { + if (matches(*argv, "mask") == 0) { + __u32 mask; + NEXT_ARG(); + if (get_u32(&mask, *argv, 16)) + return -1; + sel->hmask = htonl(mask); + } else if (matches(*argv, "at") == 0) { + int num; + NEXT_ARG(); + if (get_integer(&num, *argv, 0)) + return -1; + if (num%4) + return -1; + sel->hoff = num; + } else { + break; + } + argc--; argv++; + } + + *argc_p = argc; + *argv_p = argv; + return 0; +} + +static int u32_parse_opt(struct filter_util *qu, char *handle, int argc, char **argv, struct nlmsghdr *n) +{ + struct { + struct tc_u32_sel sel; + struct tc_u32_key keys[128]; + } sel; + struct tcmsg *t = NLMSG_DATA(n); + struct rtattr *tail; + int sel_ok = 0; + int sample_ok = 0; + __u32 htid = 0; + __u32 order = 0; + + memset(&sel, 0, sizeof(sel)); + + if (handle && get_u32_handle(&t->tcm_handle, handle)) { + fprintf(stderr, "Illegal filter ID\n"); + return -1; + } + + if (argc == 0) + return 0; + + tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len)); + addattr_l(n, 4096, TCA_OPTIONS, NULL, 0); + + while (argc > 0) { + if (matches(*argv, "match") == 0) { + NEXT_ARG(); + if (parse_selector(&argc, &argv, &sel.sel)) { + fprintf(stderr, "Illegal \"match\"\n"); + return -1; + } + sel_ok++; + continue; + } else if (matches(*argv, "offset") == 0) { + NEXT_ARG(); + if (parse_offset(&argc, &argv, &sel.sel)) { + fprintf(stderr, "Illegal \"offset\"\n"); + return -1; + } + continue; + } else if (matches(*argv, "hashkey") == 0) { + NEXT_ARG(); + if (parse_hashkey(&argc, &argv, &sel.sel)) { 
+ fprintf(stderr, "Illegal \"hashkey\"\n"); + return -1; + } + continue; + } else if (matches(*argv, "classid") == 0 || + strcmp(*argv, "flowid") == 0) { + unsigned handle; + NEXT_ARG(); + if (get_tc_classid(&handle, *argv)) { + fprintf(stderr, "Illegal \"classid\"\n"); + return -1; + } + addattr_l(n, 4096, TCA_U32_CLASSID, &handle, 4); + sel.sel.flags |= TC_U32_TERMINAL; + } else if (matches(*argv, "divisor") == 0) { + unsigned divisor; + NEXT_ARG(); + if (get_unsigned(&divisor, *argv, 0) || divisor == 0 || + divisor > 0x100) { + fprintf(stderr, "Illegal \"divisor\"\n"); + return -1; + } + addattr_l(n, 4096, TCA_U32_DIVISOR, &divisor, 4); + } else if (matches(*argv, "order") == 0) { + NEXT_ARG(); + if (get_u32(&order, *argv, 0)) { + fprintf(stderr, "Illegal \"order\"\n"); + return -1; + } + } else if (strcmp(*argv, "link") == 0) { + unsigned handle; + NEXT_ARG(); + if (get_u32_handle(&handle, *argv)) { + fprintf(stderr, "Illegal \"link\"\n"); + return -1; + } + if (handle && TC_U32_NODE(handle)) { + fprintf(stderr, "\"link\" must be a hash table.\n"); + return -1; + } + addattr_l(n, 4096, TCA_U32_LINK, &handle, 4); + } else if (strcmp(*argv, "ht") == 0) { + unsigned handle; + NEXT_ARG(); + if (get_u32_handle(&handle, *argv)) { + fprintf(stderr, "Illegal \"ht\"\n"); + return -1; + } + if (handle && TC_U32_NODE(handle)) { + fprintf(stderr, "\"ht\" must be a hash table.\n"); + return -1; + } + if (sample_ok) + htid = (htid&0xFF000)|(handle&0xFFF00000); + else + htid = (handle&0xFFFFF000); + } else if (strcmp(*argv, "sample") == 0) { + __u32 hash; + struct { + struct tc_u32_sel sel; + struct tc_u32_key keys[4]; + } sel2; + memset(&sel2, 0, sizeof(sel2)); NEXT_ARG(); /* sel2 was otherwise uninitialized, yet sel2.sel.nkeys is tested right after parse_selector() */ + if (parse_selector(&argc, &argv, &sel2.sel)) { + fprintf(stderr, "Illegal \"sample\"\n"); + return -1; + } + if (sel2.sel.nkeys != 1) { + fprintf(stderr, "\"sample\" must contain exactly ONE key.\n"); + return -1; + } + hash = sel2.sel.keys[0].val&sel2.sel.keys[0].mask; + hash ^= hash>>16; + hash ^= hash>>8; + htid =
((hash<<12)&0xFF000)|(htid&0xFFF00000); + sample_ok = 1; + continue; + } else if (matches(*argv, "police") == 0) { + NEXT_ARG(); + if (parse_police(&argc, &argv, TCA_U32_POLICE, n)) { + fprintf(stderr, "Illegal \"police\"\n"); + return -1; + } + continue; + } else if (strcmp(*argv, "help") == 0) { + explain(); + return -1; + } else { + fprintf(stderr, "What is \"%s\"?\n", *argv); + explain(); + return -1; + } + argc--; argv++; + } + + if (order) { + if (TC_U32_NODE(t->tcm_handle) && order != TC_U32_NODE(t->tcm_handle)) { + fprintf(stderr, "\"order\" contradicts \"handle\"\n"); + return -1; + } + t->tcm_handle |= order; + } + + if (htid) + addattr_l(n, 4096, TCA_U32_HASH, &htid, 4); + if (sel_ok) + addattr_l(n, 4096, TCA_U32_SEL, &sel, sizeof(sel.sel)+sel.sel.nkeys*sizeof(struct tc_u32_key)); + tail->rta_len = (((void*)n)+n->nlmsg_len) - (void*)tail; + return 0; +} + +static int u32_print_opt(struct filter_util *qu, FILE *f, struct rtattr *opt, __u32 handle) +{ + struct rtattr *tb[TCA_U32_MAX+1]; + struct tc_u32_sel *sel = NULL; + + if (opt == NULL) + return 0; + + memset(tb, 0, sizeof(tb)); + if (opt) + parse_rtattr(tb, TCA_U32_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)); + + if (handle) { + SPRINT_BUF(b1); + fprintf(f, "fh %s ", sprint_u32_handle(handle, b1)); + } + if (TC_U32_NODE(handle)) { + fprintf(f, "order %d ", TC_U32_NODE(handle)); + } + + if (tb[TCA_U32_SEL]) { + if (RTA_PAYLOAD(tb[TCA_U32_SEL]) < sizeof(*sel)) + return -1; + + sel = RTA_DATA(tb[TCA_U32_SEL]); + } + + if (tb[TCA_U32_DIVISOR]) { + fprintf(f, "ht divisor %d ", *(__u32*)RTA_DATA(tb[TCA_U32_DIVISOR])); + } else if (tb[TCA_U32_HASH]) { + __u32 htid = *(__u32*)RTA_DATA(tb[TCA_U32_HASH]); + fprintf(f, "key ht %x bkt %x ", TC_U32_USERHTID(htid), TC_U32_HASH(htid)); + } else { + fprintf(f, "??? "); + } + if (tb[TCA_U32_CLASSID]) { + SPRINT_BUF(b1); + fprintf(f, "%sflowid %s ", + !sel || !(sel->flags&TC_U32_TERMINAL) ?
"*" : "", + sprint_tc_classid(*(__u32*)RTA_DATA(tb[TCA_U32_CLASSID]), b1)); + } else if (sel && sel->flags&TC_U32_TERMINAL) { + fprintf(f, "terminal flowid ??? "); + } + if (tb[TCA_U32_LINK]) { + SPRINT_BUF(b1); + fprintf(f, "link %s ", sprint_u32_handle(*(__u32*)RTA_DATA(tb[TCA_U32_LINK]), b1)); + } + if (tb[TCA_U32_POLICE]) { + fprintf(f, "\n"); + tc_print_police(f, tb[TCA_U32_POLICE]); + } + + if (sel) { + int i; + struct tc_u32_key *key = sel->keys; + + if (sel->nkeys) { + for (i=0; i<sel->nkeys; i++, key++) + fprintf(f, "\n match %08x/%08x at %s%d", + (unsigned int)ntohl(key->val), + (unsigned int)ntohl(key->mask), + key->offmask ? "nexthdr+" : "", + key->off); + } + + if (sel->flags&(TC_U32_VAROFFSET|TC_U32_OFFSET)) { + fprintf(f, "\n offset "); + if (sel->flags&TC_U32_VAROFFSET) + fprintf(f, "%04x>>%d at %d ", ntohs(sel->offmask), sel->offshift, sel->offoff); + if (sel->off) + fprintf(f, "plus %d ", sel->off); + } + if (sel->flags&TC_U32_EAT) + fprintf(f, " eat "); + + if (sel->hmask) { + fprintf(f, "\n hash mask %08x at %d ", + (unsigned int)htonl(sel->hmask), sel->hoff); + } + } + + return 0; +} + +struct filter_util u32_util = { + NULL, + "u32", + u32_parse_opt, + u32_print_opt, +}; diff --git a/tc/m_estimator.c b/tc/m_estimator.c index e69de29b..0f9808e5 100644 --- a/tc/m_estimator.c +++ b/tc/m_estimator.c @@ -0,0 +1,64 @@ +/* + * m_estimator.c Parse/print estimator module options. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version.
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "utils.h" +#include "tc_util.h" + +static void est_help(void) __attribute__((noreturn)); + +static void est_help(void) +{ + fprintf(stderr, "Usage: ... estimator INTERVAL TIME-CONST\n"); + fprintf(stderr, " INTERVAL is interval between measurements\n"); + fprintf(stderr, " TIME-CONST is averaging time constant\n"); + fprintf(stderr, "Example: ... est 1sec 8sec\n"); + exit(-1); +} + +int parse_estimator(int *p_argc, char ***p_argv, struct tc_estimator *est) +{ + int argc = *p_argc; + char **argv = *p_argv; + unsigned A, time_const; + + NEXT_ARG(); + if (est->ewma_log) + duparg("estimator", *argv); + if (matches(*argv, "help") == 0) + est_help(); + if (get_usecs(&A, *argv)) + invarg("estimator", "invalid estimator interval"); + NEXT_ARG(); + if (matches(*argv, "help") == 0) + est_help(); + if (get_usecs(&time_const, *argv)) + invarg("estimator", "invalid estimator time constant"); + if (tc_setup_estimator(A, time_const, est) < 0) { + fprintf(stderr, "Error: estimator parameters are out of range.\n"); + exit(-1); + } + if (show_raw) + fprintf(stderr, "[estimator i=%u e=%u]\n", est->interval, est->ewma_log); + *p_argc = argc; + *p_argv = argv; + return 0; +} diff --git a/tc/m_police.c b/tc/m_police.c index e69de29b..0e76efc5 100644 --- a/tc/m_police.c +++ b/tc/m_police.c @@ -0,0 +1,328 @@ +/* + * m_police.c Parse/print policing module options. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version.
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * FIXES: 19990619 - J Hadi Salim (hadi@cyberus.ca) + * simple addattr packaging fix. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "utils.h" +#include "tc_util.h" + +static void explain(void) +{ + fprintf(stderr, "Usage: ... police rate BPS burst BYTES[/BYTES] [ mtu BYTES[/BYTES] ]\n"); + fprintf(stderr, " [ peakrate BPS ] [ avrate BPS ]\n"); + fprintf(stderr, " [ ACTION ]\n"); + fprintf(stderr, "Where: ACTION := reclassify | drop | continue \n"); +} + +static void explain1(char *arg) +{ + fprintf(stderr, "Illegal \"%s\"\n", arg); +} + +#define usage() return(-1) + + +char *police_action_n2a(int action, char *buf, int len) +{ + switch (action) { + case -1: + return "continue"; + break; + case TC_POLICE_OK: + return "pass"; + break; + case TC_POLICE_SHOT: + return "drop"; + break; + case TC_POLICE_RECLASSIFY: + return "reclassify"; + default: + snprintf(buf, len, "%d", action); + return buf; + } +} + +int police_action_a2n(char *arg, int *result) +{ + int res; + + if (matches(arg, "continue") == 0) + res = -1; + else if (matches(arg, "drop") == 0) + res = TC_POLICE_SHOT; + else if (matches(arg, "shot") == 0) + res = TC_POLICE_SHOT; + else if (matches(arg, "pass") == 0) + res = TC_POLICE_OK; + else if (strcmp(arg, "ok") == 0) + res = TC_POLICE_OK; + else if (matches(arg, "reclassify") == 0) + res = TC_POLICE_RECLASSIFY; + else { + char dummy; + if (sscanf(arg, "%d%c", &res, &dummy) != 1) + return -1; + } + *result = res; + return 0; +} + + +int get_police_result(int *action, int *result, char *arg) +{ + char *p = strchr(arg, '/'); + + if (p) + *p = 0; + + if (police_action_a2n(arg, action)) { + if (p) + *p = '/'; + return -1; + } + + if (p) { + *p = '/'; + if (police_action_a2n(p+1, result)) + return -1; + } + return 0; +} + +int 
parse_police(int *argc_p, char ***argv_p, int tca_id, struct nlmsghdr *n) +{ + int argc = *argc_p; + char **argv = *argv_p; + int res = -1; + int ok=0; + struct tc_police p; + __u32 rtab[256]; + __u32 ptab[256]; + __u32 avrate = 0; + int presult = 0; + unsigned buffer=0, mtu=0, mpu=0; + int Rcell_log=-1, Pcell_log = -1; + struct rtattr *tail; + + memset(&p, 0, sizeof(p)); + p.action = TC_POLICE_RECLASSIFY; + + if (argc <= 0) + return -1; + + while (argc > 0) { + if (matches(*argv, "index") == 0) { + NEXT_ARG(); + if (get_u32(&p.index, *argv, 16)) { + fprintf(stderr, "Illegal \"index\"\n"); + return -1; + } + } else if (matches(*argv, "burst") == 0 || + strcmp(*argv, "buffer") == 0 || + strcmp(*argv, "maxburst") == 0) { + NEXT_ARG(); + if (buffer) { + fprintf(stderr, "Double \"buffer/burst\" spec\n"); + return -1; + } + if (get_size_and_cell(&buffer, &Rcell_log, *argv) < 0) { + explain1("buffer"); + return -1; + } + } else if (strcmp(*argv, "mtu") == 0 || + strcmp(*argv, "minburst") == 0) { + NEXT_ARG(); + if (mtu) { + fprintf(stderr, "Double \"mtu/minburst\" spec\n"); + return -1; + } + if (get_size_and_cell(&mtu, &Pcell_log, *argv) < 0) { + explain1("mtu"); + return -1; + } + } else if (strcmp(*argv, "mpu") == 0) { + NEXT_ARG(); + if (mpu) { + fprintf(stderr, "Double \"mpu\" spec\n"); + return -1; + } + if (get_size(&mpu, *argv)) { + explain1("mpu"); + return -1; + } + } else if (strcmp(*argv, "rate") == 0) { + NEXT_ARG(); + if (p.rate.rate) { + fprintf(stderr, "Double \"rate\" spec\n"); + return -1; + } + if (get_rate(&p.rate.rate, *argv)) { + explain1("rate"); + return -1; + } + } else if (strcmp(*argv, "avrate") == 0) { + NEXT_ARG(); + if (avrate) { + fprintf(stderr, "Double \"avrate\" spec\n"); + return -1; + } + if (get_rate(&avrate, *argv)) { + explain1("avrate"); + return -1; + } + } else if (matches(*argv, "peakrate") == 0) { + NEXT_ARG(); + if (p.peakrate.rate) { + fprintf(stderr, "Double \"peakrate\" spec\n"); + return -1; + } + if 
(get_rate(&p.peakrate.rate, *argv)) { + explain1("peakrate"); + return -1; + } + } else if (matches(*argv, "reclassify") == 0) { + p.action = TC_POLICE_RECLASSIFY; + } else if (matches(*argv, "drop") == 0 || + matches(*argv, "shot") == 0) { + p.action = TC_POLICE_SHOT; + } else if (matches(*argv, "continue") == 0) { + p.action = TC_POLICE_UNSPEC; + } else if (matches(*argv, "pass") == 0) { + p.action = TC_POLICE_OK; + } else if (strcmp(*argv, "action") == 0) { + NEXT_ARG(); + if (get_police_result(&p.action, &presult, *argv)) { + fprintf(stderr, "Illegal \"action\"\n"); + return -1; + } + } else if (strcmp(*argv, "help") == 0) { + explain(); + return -1; + } else { + break; + } + ok++; + argc--; argv++; + } + + if (!ok) + return -1; + + if (p.rate.rate && !buffer) { + fprintf(stderr, "\"burst\" requires \"rate\".\n"); + return -1; + } + if (p.peakrate.rate) { + if (!p.rate.rate) { + fprintf(stderr, "\"peakrate\" requires \"rate\".\n"); + return -1; + } + if (!mtu) { + fprintf(stderr, "\"mtu\" is required, if \"peakrate\" is requested.\n"); + return -1; + } + } + + if (p.rate.rate) { + if ((Rcell_log = tc_calc_rtable(p.rate.rate, rtab, Rcell_log, mtu, mpu)) < 0) { + fprintf(stderr, "TBF: failed to calculate rate table.\n"); + return -1; + } + p.burst = tc_calc_xmittime(p.rate.rate, buffer); + p.rate.cell_log = Rcell_log; + p.rate.mpu = mpu; + } + p.mtu = mtu; + if (p.peakrate.rate) { + if ((Pcell_log = tc_calc_rtable(p.peakrate.rate, ptab, Pcell_log, mtu, mpu)) < 0) { + fprintf(stderr, "POLICE: failed to calculate peak rate table.\n"); + return -1; + } + p.peakrate.cell_log = Pcell_log; + p.peakrate.mpu = mpu; + } + + tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len)); + addattr_l(n, 1024, tca_id, NULL, 0); + addattr_l(n, 2024, TCA_POLICE_TBF, &p, sizeof(p)); + if (p.rate.rate) + addattr_l(n, 3024, TCA_POLICE_RATE, rtab, 1024); + if (p.peakrate.rate) + addattr_l(n, 4096, TCA_POLICE_PEAKRATE, ptab, 1024); + if (avrate) + addattr32(n, 4096, 
TCA_POLICE_AVRATE, avrate); + if (presult) + addattr32(n, 4096, TCA_POLICE_RESULT, presult); +#if 0 +#endif + + tail->rta_len = (((void*)n)+NLMSG_ALIGN(n->nlmsg_len)) - (void*)tail; + res = 0; + + *argc_p = argc; + *argv_p = argv; + return res; +} + + +int tc_print_police(FILE *f, struct rtattr *arg) +{ + SPRINT_BUF(b1); + struct tc_police *p; + struct rtattr *tb[TCA_POLICE_MAX+1]; + unsigned buffer; + + if (arg == NULL) + return 0; + + memset(tb, 0, sizeof(tb)); + parse_rtattr(tb, TCA_POLICE_MAX, RTA_DATA(arg), RTA_PAYLOAD(arg)); + + if (tb[TCA_POLICE_TBF] == NULL) { + fprintf(f, "[NULL police tbf]"); + return 0; + } + if (RTA_PAYLOAD(tb[TCA_POLICE_TBF]) < sizeof(*p)) { + fprintf(f, "[truncated police tbf]"); + return -1; + } + p = RTA_DATA(tb[TCA_POLICE_TBF]); + + fprintf(f, "police %x ", p->index); + fprintf(f, "action %s", police_action_n2a(p->action, b1, sizeof(b1))); + if (tb[TCA_POLICE_RESULT]) { + fprintf(f, "/%s ", police_action_n2a(*(int*)RTA_DATA(tb[TCA_POLICE_RESULT]), b1, sizeof(b1))); + } else + fprintf(f, " "); + fprintf(f, "rate %s ", sprint_rate(p->rate.rate, b1)); + buffer = ((double)p->rate.rate*tc_core_tick2usec(p->burst))/1000000; + fprintf(f, "burst %s ", sprint_size(buffer, b1)); + fprintf(f, "mtu %s ", sprint_size(p->mtu, b1)); + if (show_raw) + fprintf(f, "[%08x] ", p->burst); + if (p->peakrate.rate) + fprintf(f, "peakrate %s ", sprint_rate(p->peakrate.rate, b1)); + if (tb[TCA_POLICE_AVRATE]) + fprintf(f, "avrate %s ", sprint_rate(*(__u32*)RTA_DATA(tb[TCA_POLICE_AVRATE]), b1)); + + return 0; +} + diff --git a/tc/q_atm.c b/tc/q_atm.c index e69de29b..d1745387 100644 --- a/tc/q_atm.c +++ b/tc/q_atm.c @@ -0,0 +1,268 @@ +/* + * q_atm.c ATM. 
+ * + * Hacked 1998-2000 by Werner Almesberger, EPFL ICA + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <ctype.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <sys/ioctl.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> +#include <atm.h> +#include <linux/atmdev.h> +#include <linux/atmarp.h> + +#include "utils.h" +#include "tc_util.h" + + +#define MAX_HDR_LEN 64 + +#define usage() return(-1) + + +static int atm_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n) +{ + if (argc) { + fprintf(stderr,"Usage: atm\n"); + return -1; + } + return 0; +} + + +static void explain(void) +{ + fprintf(stderr, "Usage: ... atm ( pvc ADDR | svc ADDR [ sap SAP ] ) " + "[ qos QOS ] [ sndbuf BYTES ]\n"); + fprintf(stderr, " [ hdr HEX... ] [ excess ( CLASSID | clp ) ] " + "[ clip ]\n"); +} + + +static int atm_parse_class_opt(struct qdisc_util *qu, int argc, char **argv, + struct nlmsghdr *n) +{ + struct sockaddr_atmsvc addr; + struct atm_qos qos; + struct atm_sap sap; + unsigned char hdr[MAX_HDR_LEN]; + __u32 excess = 0; + struct rtattr *tail; + int sndbuf = 0; + int hdr_len = -1; + int set_clip = 0; + int s; + + memset(&addr,0,sizeof(addr)); + (void) text2qos("aal5,ubr:sdu=9180,rx:none",&qos,0); + (void) text2sap("blli:l2=iso8802",&sap,0); + while (argc > 0) { + if (!strcmp(*argv,"pvc")) { + NEXT_ARG(); + if (text2atm(*argv,(struct sockaddr *) &addr, + sizeof(addr),T2A_PVC | T2A_NAME) < 0) { + explain(); + return -1; + } + } + else if (!strcmp(*argv,"svc")) { + NEXT_ARG(); + if (text2atm(*argv,(struct sockaddr *) &addr, + sizeof(addr),T2A_SVC | T2A_NAME) < 0) { + explain(); + return -1; + } + } + else if (!strcmp(*argv,"qos")) { + NEXT_ARG(); + if (text2qos(*argv,&qos,0) < 0) { + explain(); + return -1; + } + } + else if (!strcmp(*argv,"sndbuf")) { + char *end; + + NEXT_ARG(); + sndbuf = strtol(*argv,&end,0); + if (*end) { + explain(); + return -1; + } + } + else if 
(!strcmp(*argv,"sap")) { + NEXT_ARG(); + if (addr.sas_family != AF_ATMSVC || + text2sap(*argv,&sap,T2A_NAME) < 0) { + explain(); + return -1; + } + } + else if (!strcmp(*argv,"hdr")) { + unsigned char *ptr; + char *walk; + + NEXT_ARG(); + ptr = hdr; + for (walk = *argv; *walk; walk++) { + int tmp; + + if (ptr == hdr+MAX_HDR_LEN) { + fprintf(stderr,"header is too long\n"); + return -1; + } + if (*walk == '.') continue; + if (!isxdigit(walk[0]) || !walk[1] || + !isxdigit(walk[1])) { + explain(); + return -1; + } + sscanf(walk,"%2x",&tmp); + *ptr++ = tmp; + walk++; + } + hdr_len = ptr-hdr; + } + else if (!strcmp(*argv,"excess")) { + NEXT_ARG(); + if (!strcmp(*argv,"clp")) excess = 0; + else if (get_tc_classid(&excess,*argv)) { + explain(); + return -1; + } + } + else if (!strcmp(*argv,"clip")) { + set_clip = 1; + } + else { + explain(); + return 1; + } + argc--; + argv++; + } + s = socket(addr.sas_family,SOCK_DGRAM,0); + if (s < 0) { + perror("socket"); + return -1; + } + if (setsockopt(s,SOL_ATM,SO_ATMQOS,&qos,sizeof(qos)) < 0) { + perror("SO_ATMQOS"); + return -1; + } + if (sndbuf) + if (setsockopt(s,SOL_SOCKET,SO_SNDBUF,&sndbuf,sizeof(sndbuf)) < 0) { + perror("SO_SNDBUF"); + return -1; + } + if (addr.sas_family == AF_ATMSVC && setsockopt(s,SOL_ATM,SO_ATMSAP, + &sap,sizeof(sap)) < 0) { + perror("SO_ATMSAP"); + return -1; + } + if (connect(s,(struct sockaddr *) &addr,addr.sas_family == AF_ATMPVC ? 
+ sizeof(struct sockaddr_atmpvc) : sizeof(addr)) < 0) { + perror("connect"); + return -1; + } + if (set_clip) + if (ioctl(s,ATMARP_MKIP,0) < 0) { + perror("ioctl ATMARP_MKIP"); + return -1; + } + tail = (struct rtattr *) (((void *) n)+NLMSG_ALIGN(n->nlmsg_len)); + addattr_l(n,1024,TCA_OPTIONS,NULL,0); + addattr_l(n,1024,TCA_ATM_FD,&s,sizeof(s)); + if (excess) addattr_l(n,1024,TCA_ATM_EXCESS,&excess,sizeof(excess)); + if (hdr_len != -1) addattr_l(n,1024,TCA_ATM_HDR,hdr,hdr_len); + tail->rta_len = (((void *) n)+NLMSG_ALIGN(n->nlmsg_len))-(void *) tail; + return 0; +} + + + +static int atm_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) +{ + struct rtattr *tb[TCA_ATM_MAX+1]; + char buffer[MAX_ATM_ADDR_LEN+1]; + + if (!opt) return 0; + memset(tb, 0, sizeof(tb)); + parse_rtattr(tb, TCA_ATM_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)); + if (tb[TCA_ATM_ADDR]) { + if (RTA_PAYLOAD(tb[TCA_ATM_ADDR]) < + sizeof(struct sockaddr_atmpvc)) + fprintf(stderr,"ATM: address too short\n"); + else { + if (atm2text(buffer,MAX_ATM_ADDR_LEN, + RTA_DATA(tb[TCA_ATM_ADDR]),A2T_PRETTY | A2T_NAME) < + 0) fprintf(stderr,"atm2text error\n"); + fprintf(f,"pvc %s ",buffer); + } + } + if (tb[TCA_ATM_HDR]) { + int i; + + fprintf(f,"hdr"); + for (i = 0; i < RTA_PAYLOAD(tb[TCA_ATM_HDR]); i++) + fprintf(f,"%c%02x",i ? '.' 
: ' ', + ((unsigned char *) RTA_DATA(tb[TCA_ATM_HDR]))[i]); + if (!i) fprintf(f," ."); + fprintf(f," "); + } + if (tb[TCA_ATM_EXCESS]) { + __u32 excess; + + if (RTA_PAYLOAD(tb[TCA_ATM_EXCESS]) < sizeof(excess)) + fprintf(stderr,"ATM: excess class ID too short\n"); + else { + excess = *(__u32 *) RTA_DATA(tb[TCA_ATM_EXCESS]); + if (!excess) fprintf(f,"excess clp "); + else { + char buf[64]; + + print_tc_classid(buf,sizeof(buf),excess); + fprintf(f,"excess %s ",buf); + } + } + } + if (tb[TCA_ATM_STATE]) { + static const char *map[] = { ATM_VS2TXT_MAP }; + int state; + + if (RTA_PAYLOAD(tb[TCA_ATM_STATE]) < sizeof(state)) + fprintf(stderr,"ATM: state field too short\n"); + else { + state = *(int *) RTA_DATA(tb[TCA_ATM_STATE]); + fprintf(f,"%s ",map[state]); + } + } + return 0; +} + + +static int atm_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats) +{ + return 0; +} + + +struct qdisc_util atm_util = { + NULL, + "atm", + atm_parse_opt, + atm_print_opt, + atm_print_xstats, + + atm_parse_class_opt, + atm_print_opt +}; diff --git a/tc/q_cbq.c b/tc/q_cbq.c index e69de29b..51ed87a2 100644 --- a/tc/q_cbq.c +++ b/tc/q_cbq.c @@ -0,0 +1,555 @@ +/* + * q_cbq.c CBQ. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "utils.h" +#include "tc_util.h" +#include "tc_cbq.h" + +static void explain_class(void) +{ + fprintf(stderr, "Usage: ... 
cbq bandwidth BPS rate BPS maxburst PKTS [ avpkt BYTES ]\n"); + fprintf(stderr, " [ minburst PKTS ] [ bounded ] [ isolated ]\n"); + fprintf(stderr, " [ allot BYTES ] [ mpu BYTES ] [ weight RATE ]\n"); + fprintf(stderr, " [ prio NUMBER ] [ cell BYTES ] [ ewma LOG ]\n"); + fprintf(stderr, " [ estimator INTERVAL TIME_CONSTANT ]\n"); + fprintf(stderr, " [ split CLASSID ] [ defmap MASK/CHANGE ]\n"); +} + +static void explain(void) +{ + fprintf(stderr, "Usage: ... cbq bandwidth BPS avpkt BYTES [ mpu BYTES ]\n"); + fprintf(stderr, " [ cell BYTES ] [ ewma LOG ]\n"); +} + +static void explain1(char *arg) +{ + fprintf(stderr, "Illegal \"%s\"\n", arg); +} + +#define usage() return(-1) + +static int cbq_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n) +{ + struct tc_ratespec r; + struct tc_cbq_lssopt lss; + __u32 rtab[256]; + unsigned mpu=0, avpkt=0, allot=0; + int cell_log=-1; + int ewma_log=-1; + struct rtattr *tail; + + memset(&lss, 0, sizeof(lss)); + memset(&r, 0, sizeof(r)); + + while (argc > 0) { + if (strcmp(*argv, "bandwidth") == 0 || + strcmp(*argv, "rate") == 0) { + NEXT_ARG(); + if (get_rate(&r.rate, *argv)) { + explain1("bandwidth"); + return -1; + } + } else if (strcmp(*argv, "ewma") == 0) { + NEXT_ARG(); + if (get_unsigned(&ewma_log, *argv, 0)) { + explain1("ewma"); + return -1; + } + if (ewma_log > 31) { + fprintf(stderr, "ewma_log must be < 32\n"); + return -1; + } + } else if (strcmp(*argv, "cell") == 0) { + unsigned cell; + int i; + NEXT_ARG(); + if (get_size(&cell, *argv)) { + explain1("cell"); + return -1; + } + for (i=0; i<32; i++) + if ((1<<i) == cell) + break; + if (i>=32) { + fprintf(stderr, "cell must be 2^n\n"); + return -1; + } + cell_log = i; + } else if (strcmp(*argv, "avpkt") == 0) { + NEXT_ARG(); + if (get_size(&avpkt, *argv)) { + explain1("avpkt"); + return -1; + } + } else if (strcmp(*argv, "mpu") == 0) { + NEXT_ARG(); + if (get_size(&mpu, *argv)) { + explain1("mpu"); + return -1; + } + } else if (strcmp(*argv, 
"allot") == 0) { + NEXT_ARG(); + /* Accept and ignore "allot" for backward compatibility */ + if (get_size(&allot, *argv)) { + explain1("allot"); + return -1; + } + } else if (strcmp(*argv, "help") == 0) { + explain(); + return -1; + } else { + fprintf(stderr, "What is \"%s\"?\n", *argv); + explain(); + return -1; + } + argc--; argv++; + } + + /* OK. All options are parsed. */ + + if (r.rate == 0) { + fprintf(stderr, "CBQ: bandwidth is required parameter.\n"); + return -1; + } + if (avpkt == 0) { + fprintf(stderr, "CBQ: \"avpkt\" is required.\n"); + return -1; + } + if (allot < (avpkt*3)/2) + allot = (avpkt*3)/2; + + if ((cell_log = tc_calc_rtable(r.rate, rtab, cell_log, allot, mpu)) < 0) { + fprintf(stderr, "CBQ: failed to calculate rate table.\n"); + return -1; + } + r.cell_log = cell_log; + r.mpu = mpu; + + if (ewma_log < 0) + ewma_log = TC_CBQ_DEF_EWMA; + lss.ewma_log = ewma_log; + lss.maxidle = tc_cbq_calc_maxidle(r.rate, r.rate, avpkt, lss.ewma_log, 0); + lss.change = TCF_CBQ_LSS_MAXIDLE|TCF_CBQ_LSS_EWMA|TCF_CBQ_LSS_AVPKT; + lss.avpkt = avpkt; + + tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len)); + addattr_l(n, 1024, TCA_OPTIONS, NULL, 0); + addattr_l(n, 1024, TCA_CBQ_RATE, &r, sizeof(r)); + addattr_l(n, 1024, TCA_CBQ_LSSOPT, &lss, sizeof(lss)); + addattr_l(n, 3024, TCA_CBQ_RTAB, rtab, 1024); + if (show_raw) { + int i; + for (i=0; i<256; i++) + printf("%u ", rtab[i]); + printf("\n"); + } + tail->rta_len = (((void*)n)+NLMSG_ALIGN(n->nlmsg_len)) - (void*)tail; + return 0; +} + +static int cbq_parse_class_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n) +{ + int wrr_ok=0, fopt_ok=0; + struct tc_ratespec r; + struct tc_cbq_lssopt lss; + struct tc_cbq_wrropt wrr; + struct tc_cbq_fopt fopt; + struct tc_cbq_ovl ovl; + __u32 rtab[256]; + unsigned mpu=0; + int cell_log=-1; + int ewma_log=-1; + unsigned bndw = 0; + unsigned minburst=0, maxburst=0; + struct rtattr *tail; + + memset(&r, 0, sizeof(r)); + memset(&lss, 0, sizeof(lss)); + 
memset(&wrr, 0, sizeof(wrr)); + memset(&fopt, 0, sizeof(fopt)); + memset(&ovl, 0, sizeof(ovl)); + + while (argc > 0) { + if (strcmp(*argv, "rate") == 0) { + NEXT_ARG(); + if (get_rate(&r.rate, *argv)) { + explain1("rate"); + return -1; + } + } else if (strcmp(*argv, "bandwidth") == 0) { + NEXT_ARG(); + if (get_rate(&bndw, *argv)) { + explain1("bandwidth"); + return -1; + } + } else if (strcmp(*argv, "minidle") == 0) { + NEXT_ARG(); + if (get_u32(&lss.minidle, *argv, 0)) { + explain1("minidle"); + return -1; + } + lss.change |= TCF_CBQ_LSS_MINIDLE; + } else if (strcmp(*argv, "minburst") == 0) { + NEXT_ARG(); + if (get_u32(&minburst, *argv, 0)) { + explain1("minburst"); + return -1; + } + lss.change |= TCF_CBQ_LSS_OFFTIME; + } else if (strcmp(*argv, "maxburst") == 0) { + NEXT_ARG(); + if (get_u32(&maxburst, *argv, 0)) { + explain1("maxburst"); + return -1; + } + lss.change |= TCF_CBQ_LSS_MAXIDLE; + } else if (strcmp(*argv, "bounded") == 0) { + lss.flags |= TCF_CBQ_LSS_BOUNDED; + lss.change |= TCF_CBQ_LSS_FLAGS; + } else if (strcmp(*argv, "borrow") == 0) { + lss.flags &= ~TCF_CBQ_LSS_BOUNDED; + lss.change |= TCF_CBQ_LSS_FLAGS; + } else if (strcmp(*argv, "isolated") == 0) { + lss.flags |= TCF_CBQ_LSS_ISOLATED; + lss.change |= TCF_CBQ_LSS_FLAGS; + } else if (strcmp(*argv, "sharing") == 0) { + lss.flags &= ~TCF_CBQ_LSS_ISOLATED; + lss.change |= TCF_CBQ_LSS_FLAGS; + } else if (strcmp(*argv, "ewma") == 0) { + NEXT_ARG(); + if (get_u32(&ewma_log, *argv, 0)) { + explain1("ewma"); + return -1; + } + if (ewma_log > 31) { + fprintf(stderr, "ewma_log must be < 32\n"); + return -1; + } + lss.change |= TCF_CBQ_LSS_EWMA; + } else if (strcmp(*argv, "cell") == 0) { + unsigned cell; + int i; + NEXT_ARG(); + if (get_size(&cell, *argv)) { + explain1("cell"); + return -1; + } + for (i=0; i<32; i++) + if ((1<<i) == cell) + break; + if (i>=32) { + fprintf(stderr, "cell must be 2^n\n"); + return -1; + } + cell_log = i; + } else if (strcmp(*argv, "prio") == 0) { + unsigned prio; + 
NEXT_ARG(); + if (get_u32(&prio, *argv, 0)) { + explain1("prio"); + return -1; + } + if (prio > TC_CBQ_MAXPRIO) { + fprintf(stderr, "\"prio\" must be number in the range 1...%d\n", TC_CBQ_MAXPRIO); + return -1; + } + wrr.priority = prio; + wrr_ok++; + } else if (strcmp(*argv, "allot") == 0) { + NEXT_ARG(); + if (get_size(&wrr.allot, *argv)) { + explain1("allot"); + return -1; + } + } else if (strcmp(*argv, "avpkt") == 0) { + NEXT_ARG(); + if (get_size(&lss.avpkt, *argv)) { + explain1("avpkt"); + return -1; + } + lss.change |= TCF_CBQ_LSS_AVPKT; + } else if (strcmp(*argv, "mpu") == 0) { + NEXT_ARG(); + if (get_size(&mpu, *argv)) { + explain1("mpu"); + return -1; + } + } else if (strcmp(*argv, "weight") == 0) { + NEXT_ARG(); + if (get_size(&wrr.weight, *argv)) { + explain1("weight"); + return -1; + } + wrr_ok++; + } else if (strcmp(*argv, "split") == 0) { + NEXT_ARG(); + if (get_tc_classid(&fopt.split, *argv)) { + fprintf(stderr, "Invalid split node ID.\n"); + usage(); + } + fopt_ok++; + } else if (strcmp(*argv, "defmap") == 0) { + int err; + NEXT_ARG(); + err = sscanf(*argv, "%08x/%08x", &fopt.defmap, &fopt.defchange); + if (err < 1) { + fprintf(stderr, "Invalid defmap, should be MASK32[/MASK]\n"); + return -1; + } + if (err == 1) + fopt.defchange = ~0; + fopt_ok++; + } else if (strcmp(*argv, "help") == 0) { + explain_class(); + return -1; + } else { + fprintf(stderr, "What is \"%s\"?\n", *argv); + explain_class(); + return -1; + } + argc--; argv++; + } + + /* OK. All options are parsed. */ + + /* 1. 
Prepare link sharing scheduler parameters */ + if (r.rate) { + unsigned pktsize = wrr.allot; + if (wrr.allot < (lss.avpkt*3)/2) + wrr.allot = (lss.avpkt*3)/2; + if ((cell_log = tc_calc_rtable(r.rate, rtab, cell_log, pktsize, mpu)) < 0) { + fprintf(stderr, "CBQ: failed to calculate rate table.\n"); + return -1; + } + r.cell_log = cell_log; + r.mpu = mpu; + } + if (ewma_log < 0) + ewma_log = TC_CBQ_DEF_EWMA; + lss.ewma_log = ewma_log; + if (lss.change&(TCF_CBQ_LSS_OFFTIME|TCF_CBQ_LSS_MAXIDLE)) { + if (lss.avpkt == 0) { + fprintf(stderr, "CBQ: avpkt is required for max/minburst.\n"); + return -1; + } + if (bndw==0 || r.rate == 0) { + fprintf(stderr, "CBQ: bandwidth&rate are required for max/minburst.\n"); + return -1; + } + } + if (wrr.priority == 0 && (n->nlmsg_flags&NLM_F_EXCL)) { + wrr_ok = 1; + wrr.priority = TC_CBQ_MAXPRIO; + if (wrr.allot == 0) + wrr.allot = (lss.avpkt*3)/2; + } + if (wrr_ok) { + if (wrr.weight == 0) + wrr.weight = (wrr.priority == TC_CBQ_MAXPRIO) ? 1 : r.rate; + if (wrr.allot == 0) { + fprintf(stderr, "CBQ: \"allot\" is required to set WRR parameters.\n"); + return -1; + } + } + if (lss.change&TCF_CBQ_LSS_MAXIDLE) { + lss.maxidle = tc_cbq_calc_maxidle(bndw, r.rate, lss.avpkt, ewma_log, maxburst); + lss.change |= TCF_CBQ_LSS_MAXIDLE; + lss.change |= TCF_CBQ_LSS_EWMA|TCF_CBQ_LSS_AVPKT; + } + if (lss.change&TCF_CBQ_LSS_OFFTIME) { + lss.offtime = tc_cbq_calc_offtime(bndw, r.rate, lss.avpkt, ewma_log, minburst); + lss.change |= TCF_CBQ_LSS_OFFTIME; + lss.change |= TCF_CBQ_LSS_EWMA|TCF_CBQ_LSS_AVPKT; + } + if (lss.change&TCF_CBQ_LSS_MINIDLE) { + lss.minidle <<= lss.ewma_log; + lss.change |= TCF_CBQ_LSS_EWMA; + } + + tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len)); + addattr_l(n, 1024, TCA_OPTIONS, NULL, 0); + if (lss.change) { + lss.change |= TCF_CBQ_LSS_FLAGS; + addattr_l(n, 1024, TCA_CBQ_LSSOPT, &lss, sizeof(lss)); + } + if (wrr_ok) + addattr_l(n, 1024, TCA_CBQ_WRROPT, &wrr, sizeof(wrr)); + if (fopt_ok) + addattr_l(n, 1024, 
TCA_CBQ_FOPT, &fopt, sizeof(fopt)); + if (r.rate) { + addattr_l(n, 1024, TCA_CBQ_RATE, &r, sizeof(r)); + addattr_l(n, 3024, TCA_CBQ_RTAB, rtab, 1024); + if (show_raw) { + int i; + for (i=0; i<256; i++) + printf("%u ", rtab[i]); + printf("\n"); + } + } + tail->rta_len = (((void*)n)+NLMSG_ALIGN(n->nlmsg_len)) - (void*)tail; + return 0; +} + + +static int cbq_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) +{ + struct rtattr *tb[TCA_CBQ_MAX+1]; + struct tc_ratespec *r = NULL; + struct tc_cbq_lssopt *lss = NULL; + struct tc_cbq_wrropt *wrr = NULL; + struct tc_cbq_fopt *fopt = NULL; + struct tc_cbq_ovl *ovl = NULL; + + if (opt == NULL) + return 0; + + memset(tb, 0, sizeof(tb)); + parse_rtattr(tb, TCA_CBQ_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)); + + if (tb[TCA_CBQ_RATE]) { + if (RTA_PAYLOAD(tb[TCA_CBQ_RATE]) < sizeof(*r)) + fprintf(stderr, "CBQ: too short rate opt\n"); + else + r = RTA_DATA(tb[TCA_CBQ_RATE]); + } + if (tb[TCA_CBQ_LSSOPT]) { + if (RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT]) < sizeof(*lss)) + fprintf(stderr, "CBQ: too short lss opt\n"); + else + lss = RTA_DATA(tb[TCA_CBQ_LSSOPT]); + } + if (tb[TCA_CBQ_WRROPT]) { + if (RTA_PAYLOAD(tb[TCA_CBQ_WRROPT]) < sizeof(*wrr)) + fprintf(stderr, "CBQ: too short wrr opt\n"); + else + wrr = RTA_DATA(tb[TCA_CBQ_WRROPT]); + } + if (tb[TCA_CBQ_FOPT]) { + if (RTA_PAYLOAD(tb[TCA_CBQ_FOPT]) < sizeof(*fopt)) + fprintf(stderr, "CBQ: too short fopt\n"); + else + fopt = RTA_DATA(tb[TCA_CBQ_FOPT]); + } + if (tb[TCA_CBQ_OVL_STRATEGY]) { + if (RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY]) < sizeof(*ovl)) + fprintf(stderr, "CBQ: too short overlimit strategy %u/%u\n", + RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY]), sizeof(*ovl)); + else + ovl = RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY]); + } + + if (r) { + char buf[64]; + print_rate(buf, sizeof(buf), r->rate); + fprintf(f, "rate %s ", buf); + if (show_details) { + fprintf(f, "cell %ub ", 1<<r->cell_log); + if (r->mpu) + fprintf(f, "mpu %ub ", r->mpu); + } + } + if (lss && lss->flags) { + int comma=0; + 
fprintf(f, "("); + if (lss->flags&TCF_CBQ_LSS_BOUNDED) { + fprintf(f, "bounded"); + comma=1; + } + if (lss->flags&TCF_CBQ_LSS_ISOLATED) { + if (comma) + fprintf(f, ","); + fprintf(f, "isolated"); + } + fprintf(f, ") "); + } + if (wrr) { + if (wrr->priority != TC_CBQ_MAXPRIO) + fprintf(f, "prio %u", wrr->priority); + else + fprintf(f, "prio no-transmit"); + if (show_details) { + char buf[64]; + fprintf(f, "/%u ", wrr->cpriority); + if (wrr->weight != 1) { + print_rate(buf, sizeof(buf), wrr->weight); + fprintf(f, "weight %s ", buf); + } + if (wrr->allot) + fprintf(f, "allot %ub ", wrr->allot); + } + } + if (lss && show_details) { + fprintf(f, "\nlevel %u ewma %u avpkt %ub ", lss->level, lss->ewma_log, lss->avpkt); + if (lss->maxidle) { + fprintf(f, "maxidle %luus ", tc_core_tick2usec(lss->maxidle>>lss->ewma_log)); + if (show_raw) + fprintf(f, "[%08x] ", lss->maxidle); + } + if (lss->minidle!=0x7fffffff) { + fprintf(f, "minidle %luus ", tc_core_tick2usec(lss->minidle>>lss->ewma_log)); + if (show_raw) + fprintf(f, "[%08x] ", lss->minidle); + } + if (lss->offtime) { + fprintf(f, "offtime %luus ", tc_core_tick2usec(lss->offtime)); + if (show_raw) + fprintf(f, "[%08x] ", lss->offtime); + } + } + if (fopt && show_details) { + char buf[64]; + print_tc_classid(buf, sizeof(buf), fopt->split); + fprintf(f, "\nsplit %s ", buf); + if (fopt->defmap) { + fprintf(f, "defmap %08x", fopt->defmap); + } + } + return 0; +} + +static int cbq_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats) +{ + struct tc_cbq_xstats *st; + + if (xstats == NULL) + return 0; + + if (RTA_PAYLOAD(xstats) < sizeof(*st)) + return -1; + + st = RTA_DATA(xstats); + fprintf(f, " borrowed %u overactions %u avgidle %g undertime %g", st->borrows, + st->overactions, (double)st->avgidle, (double)st->undertime); + return 0; +} + +struct qdisc_util cbq_util = { + NULL, + "cbq", + cbq_parse_opt, + cbq_print_opt, + cbq_print_xstats, + + cbq_parse_class_opt, + cbq_print_opt, +}; + diff --git a/tc/q_csz.c 
b/tc/q_csz.c index e69de29b..e2734cda 100644 --- a/tc/q_csz.c +++ b/tc/q_csz.c @@ -0,0 +1,61 @@ +/* + * q_csz.c CSZ. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "utils.h" +#include "tc_util.h" + +static void explain() +{ + fprintf(stderr, "Usage: ... csz \n"); +} + +static void explain1(char *arg) +{ + fprintf(stderr, "Illegal \"%s\"\n", arg); +} + + +#define usage() return(-1) + +static int csz_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n) +{ + return -1; +} + +static int csz_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) +{ + return -1; +} + +static int csz_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats) +{ + return -1; +} + +struct qdisc_util csz_util = { + NULL, + "csz", + csz_parse_opt, + csz_print_opt, + csz_print_xstats, +}; + diff --git a/tc/q_dsmark.c b/tc/q_dsmark.c index e69de29b..8a1cd4d8 100644 --- a/tc/q_dsmark.c +++ b/tc/q_dsmark.c @@ -0,0 +1,186 @@ +/* + * q_dsmark.c Differentiated Services field marking. 
+ * + * Hacked 1998,1999 by Werner Almesberger, EPFL ICA + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "utils.h" +#include "tc_util.h" + + +#define usage() return(-1) + + +static void explain(void) +{ + fprintf(stderr,"Usage: dsmark indices INDICES [ default_index " + "DEFAULT_INDEX ] [ set_tc_index ]\n"); +} + + +static int dsmark_parse_opt(struct qdisc_util *qu, int argc, char **argv, + struct nlmsghdr *n) +{ + struct rtattr *tail; + __u16 ind; + char *end; + int dflt,set_tc_index; + + ind = set_tc_index = 0; + dflt = -1; + while (argc > 0) { + if (!strcmp(*argv,"indices")) { + NEXT_ARG(); + ind = strtoul(*argv,&end,0); + if (*end) { + explain(); + return -1; + } + } + else if (!strcmp(*argv,"default_index") || !strcmp(*argv, + "default")) { + NEXT_ARG(); + dflt = strtoul(*argv,&end,0); + if (*end) { + explain(); + return -1; + } + } + else if (!strcmp(*argv,"set_tc_index")) { + set_tc_index = 1; + } + else { + explain(); + return -1; + } + argc--; + argv++; + } + if (!ind) { + explain(); + return -1; + } + tail = (struct rtattr *) (((void *) n)+NLMSG_ALIGN(n->nlmsg_len)); + addattr_l(n,1024,TCA_OPTIONS,NULL,0); + addattr_l(n,1024,TCA_DSMARK_INDICES,&ind,sizeof(ind)); + if (dflt != -1) { + __u16 tmp = dflt; + + addattr_l(n,1024,TCA_DSMARK_DEFAULT_INDEX,&tmp,sizeof(tmp)); + } + if (set_tc_index) addattr_l(n,1024,TCA_DSMARK_SET_TC_INDEX,NULL,0); + tail->rta_len = (((void *) n)+n->nlmsg_len)-(void *) tail; + return 0; +} + + +static void explain_class(void) +{ + fprintf(stderr, "Usage: ... 
dsmark [ mask MASK ] [ value VALUE ]\n"); +} + + +static int dsmark_parse_class_opt(struct qdisc_util *qu, int argc, char **argv, + struct nlmsghdr *n) +{ + struct rtattr *tail; + __u8 tmp; + char *end; + + tail = (struct rtattr *) (((void *) n)+NLMSG_ALIGN(n->nlmsg_len)); + addattr_l(n,1024,TCA_OPTIONS,NULL,0); + while (argc > 0) { + if (!strcmp(*argv,"mask")) { + NEXT_ARG(); + tmp = strtoul(*argv,&end,0); + if (*end) { + explain_class(); + return -1; + } + addattr_l(n,1024,TCA_DSMARK_MASK,&tmp,1); + } + else if (!strcmp(*argv,"value")) { + NEXT_ARG(); + tmp = strtoul(*argv,&end,0); + if (*end) { + explain_class(); + return -1; + } + addattr_l(n,1024,TCA_DSMARK_VALUE,&tmp,1); + } + else { + explain_class(); + return -1; + } + argc--; + argv++; + } + tail->rta_len = (((void *) n)+n->nlmsg_len)-(void *) tail; + return 0; +} + + + +static int dsmark_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) +{ + struct rtattr *tb[TCA_DSMARK_MAX+1]; + + if (!opt) return 0; + memset(tb, 0, sizeof(tb)); + parse_rtattr(tb, TCA_DSMARK_MAX, RTA_DATA(opt), RTA_PAYLOAD(opt)); + if (tb[TCA_DSMARK_MASK]) { + if (!RTA_PAYLOAD(tb[TCA_DSMARK_MASK])) + fprintf(stderr,"dsmark: empty mask\n"); + else fprintf(f,"mask 0x%02x ", + *(__u8 *) RTA_DATA(tb[TCA_DSMARK_MASK])); + } + if (tb[TCA_DSMARK_VALUE]) { + if (!RTA_PAYLOAD(tb[TCA_DSMARK_VALUE])) + fprintf(stderr,"dsmark: empty value\n"); + else fprintf(f,"value 0x%02x ", + *(__u8 *) RTA_DATA(tb[TCA_DSMARK_VALUE])); + } + if (tb[TCA_DSMARK_INDICES]) { + if (RTA_PAYLOAD(tb[TCA_DSMARK_INDICES]) < sizeof(__u16)) + fprintf(stderr,"dsmark: indices too short\n"); + else fprintf(f,"indices 0x%04x ", + *(__u16 *) RTA_DATA(tb[TCA_DSMARK_INDICES])); + } + if (tb[TCA_DSMARK_DEFAULT_INDEX]) { + if (RTA_PAYLOAD(tb[TCA_DSMARK_DEFAULT_INDEX]) < sizeof(__u16)) + fprintf(stderr,"dsmark: default_index too short\n"); + else fprintf(f,"default_index 0x%04x ", + *(__u16 *) RTA_DATA(tb[TCA_DSMARK_DEFAULT_INDEX])); + } + if (tb[TCA_DSMARK_SET_TC_INDEX]) 
fprintf(f,"set_tc_index "); + return 0; +} + + +static int dsmark_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats) +{ + return 0; +} + + +struct qdisc_util dsmark_util = { + NULL, + "dsmark", + dsmark_parse_opt, + dsmark_print_opt, + dsmark_print_xstats, + + dsmark_parse_class_opt, + dsmark_print_opt +}; diff --git a/tc/q_fifo.c b/tc/q_fifo.c index e69de29b..4cb9fded 100644 --- a/tc/q_fifo.c +++ b/tc/q_fifo.c @@ -0,0 +1,101 @@ +/* + * q_fifo.c FIFO. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "utils.h" +#include "tc_util.h" + +static void explain(void) +{ + fprintf(stderr, "Usage: ... 
[p|b]fifo [ limit NUMBER ]\n"); +} + +#define usage() return(-1) + +static int fifo_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n) +{ + int ok=0; + struct tc_fifo_qopt opt; + memset(&opt, 0, sizeof(opt)); + + while (argc > 0) { + if (strcmp(*argv, "limit") == 0) { + NEXT_ARG(); + if (get_size(&opt.limit, *argv)) { + fprintf(stderr, "Illegal \"limit\"\n"); + return -1; + } + ok++; + } else if (strcmp(*argv, "help") == 0) { + explain(); + return -1; + } else { + fprintf(stderr, "What is \"%s\"?\n", *argv); + explain(); + return -1; + } + argc--; argv++; + } + + if (ok) + addattr_l(n, 1024, TCA_OPTIONS, &opt, sizeof(opt)); + return 0; +} + +static int fifo_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) +{ + struct tc_fifo_qopt *qopt; + + if (opt == NULL) + return 0; + + if (RTA_PAYLOAD(opt) < sizeof(*qopt)) + return -1; + qopt = RTA_DATA(opt); + if (strcmp(qu->id, "bfifo") == 0) { + SPRINT_BUF(b1); + fprintf(f, "limit %s", sprint_size(qopt->limit, b1)); + } else + fprintf(f, "limit %up", qopt->limit); + return 0; +} + +static int fifo_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats) +{ + return 0; +} + + +struct qdisc_util bfifo_util = { + NULL, + "bfifo", + fifo_parse_opt, + fifo_print_opt, + fifo_print_xstats, +}; + +struct qdisc_util pfifo_util = { + NULL, + "pfifo", + fifo_parse_opt, + fifo_print_opt, + fifo_print_xstats, +}; diff --git a/tc/q_gred.c b/tc/q_gred.c index e69de29b..b63f8ae7 100644 --- a/tc/q_gred.c +++ b/tc/q_gred.c @@ -0,0 +1,345 @@ +/* + * q_gred.c GRED. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: J Hadi Salim(hadi@nortelnetworks.com) + * code ruthlessly ripped from + * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "utils.h" +#include "tc_util.h" + +#include "tc_red.h" + + +#if 0 +#define DPRINTF(format,args...) fprintf(stderr,format,##args) +#else +#define DPRINTF(format,args...) +#endif + +static void explain(void) +{ + fprintf(stderr, "Usage: ... gred DP drop-probability limit BYTES " + "min BYTES max BYTES\n"); + fprintf(stderr, " avpkt BYTES burst PACKETS probability PROBABILITY " + "bandwidth KBPS\n"); + fprintf(stderr, " [prio value]\n"); + fprintf(stderr," OR ...\n"); + fprintf(stderr," gred setup DPs <num of DPs> default <default DP> " + "[grio]\n"); +} + +#define usage() return(-1) + +static int init_gred(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n) +{ + + struct rtattr *tail; + struct tc_gred_sopt opt; + memset(&opt, 0, sizeof(struct tc_gred_sopt)); + + while (argc > 0) { + DPRINTF(stderr,"init_gred: invoked with %s\n",*argv); + if (strcmp(*argv, "DPs") == 0) { + NEXT_ARG(); + DPRINTF(stderr,"init_gred: next_arg with %s\n",*argv); + opt.DPs=strtol(*argv, (char **)NULL, 10); + if (opt.DPs >MAX_DPs) { /* need a better error check */ + fprintf(stderr, "DPs =%u \n",opt.DPs); + fprintf(stderr, "Illegal \"DPs\"\n"); + fprintf(stderr, "GRED: only %d DPs are " + "currently supported\n",MAX_DPs); + return -1; + } + } else if (strcmp(*argv, "default") == 0) { + NEXT_ARG(); + opt.def_DP=strtol(*argv, (char **)NULL, 10); + if (!opt.DPs) { + fprintf(stderr, "\"default DP\" must be " + "defined after DPs\n"); + return -1; + } +#if 0 + if (opt.def_DP>opt.DPs-1) { +#endif + if (opt.def_DP>opt.DPs) { +/* + fprintf(stderr, "\"default DP\" must be less than %d\nNote: DP runs from 0 to %d for %d 
DPs\n",opt.DPs,opt.DPs-1,opt.DPs); +*/ + fprintf(stderr, "\"default DP\" must be less than %d\n",opt.DPs); + return -1; + } + } else if (strcmp(*argv, "grio") == 0) { + opt.grio=1; + } else if (strcmp(*argv, "help") == 0) { + explain(); + return -1; + } else { + fprintf(stderr, "What is \"%s\"?\n", *argv); + explain(); + return -1; + } + argc--; argv++; +} + +if ((!opt.DPs) || (!opt.def_DP)) +{ + fprintf(stderr, "Illegal gred setup parameters \n"); + return -1; +} +DPRINTF("TC_GRED: sending DPs=%d default=%d\n",opt.DPs,opt.def_DP); + n->nlmsg_flags|=NLM_F_CREATE; + tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len)); + + addattr_l(n, 1024, TCA_OPTIONS, NULL, 0); + addattr_l(n, 1024, TCA_GRED_DPS, &opt, sizeof(struct tc_gred_sopt)); + tail->rta_len = (((void*)n)+NLMSG_ALIGN(n->nlmsg_len)) - (void*)tail; +return 0; +} +/* +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +*/ +static int gred_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n) +{ + int ok=0; + struct tc_gred_qopt opt; + unsigned burst = 0; + unsigned avpkt = 0; + double probability = 0.02; + unsigned rate = 0; + int wlog; + __u8 sbuf[256]; + struct rtattr *tail; + + memset(&opt, 0, sizeof(opt)); + + while (argc > 0) { + if (strcmp(*argv, "limit") == 0) { + NEXT_ARG(); + if (get_size(&opt.limit, *argv)) { + fprintf(stderr, "Illegal \"limit\"\n"); + return -1; + } + ok++; + } else if (strcmp(*argv, "setup") == 0) { + if (ok) { + fprintf(stderr, "Illegal \"setup\"\n"); + return -1; + } + return init_gred(qu,argc-1, argv+1,n); + + } else if (strcmp(*argv, "min") == 0) { + NEXT_ARG(); + if (get_size(&opt.qth_min, *argv)) { + fprintf(stderr, "Illegal \"min\"\n"); + return -1; + } + ok++; + } else if (strcmp(*argv, "max") == 0) { + NEXT_ARG(); + if (get_size(&opt.qth_max, *argv)) { + fprintf(stderr, "Illegal \"max\"\n"); + return -1; + } + ok++; + } else if (strcmp(*argv, "DP") == 0) { + NEXT_ARG(); + opt.DP=strtol(*argv, (char **)NULL, 10); + DPRINTF 
("\n ******* DP =%u\n",opt.DP); + if (opt.DP >MAX_DPs) { /* need a better error check */ + fprintf(stderr, "DP =%u \n",opt.DP); + fprintf(stderr, "Illegal \"DP\"\n"); + fprintf(stderr, "GRED: only %d DPs are currently supported\n",MAX_DPs); + return -1; + } +#if 0 + return -1; + } +#endif + ok++; + } else if (strcmp(*argv, "burst") == 0) { + NEXT_ARG(); + if (get_unsigned(&burst, *argv, 0)) { + fprintf(stderr, "Illegal \"burst\"\n"); + return -1; + } + ok++; + } else if (strcmp(*argv, "avpkt") == 0) { + NEXT_ARG(); + if (get_size(&avpkt, *argv)) { + fprintf(stderr, "Illegal \"avpkt\"\n"); + return -1; + } + ok++; + } else if (strcmp(*argv, "probability") == 0) { + NEXT_ARG(); + if (sscanf(*argv, "%lg", &probability) != 1) { + fprintf(stderr, "Illegal \"probability\"\n"); + return -1; + } + ok++; + } else if (strcmp(*argv, "prio") == 0) { + NEXT_ARG(); + opt.prio=strtol(*argv, (char **)NULL, 10); + /* some error check here */ + ok++; + } else if (strcmp(*argv, "bandwidth") == 0) { + NEXT_ARG(); + if (get_rate(&rate, *argv)) { + fprintf(stderr, "Illegal \"bandwidth\"\n"); + return -1; + } + ok++; + } else if (strcmp(*argv, "help") == 0) { + explain(); + return -1; + } else { + fprintf(stderr, "What is \"%s\"?\n", *argv); + explain(); + return -1; + } + argc--; argv++; + } + + if (!ok) + return 0; + + if (rate == 0) + get_rate(&rate, "10Mbit"); + + if (!opt.qth_min || !opt.qth_max || !burst || !opt.limit || !avpkt || + (opt.DP<0)) { + fprintf(stderr, "Required parameter (min, max, burst, limit, " + "avpket, DP) is missing\n"); + return -1; + } + + if ((wlog = tc_red_eval_ewma(opt.qth_min, burst, avpkt)) < 0) { + fprintf(stderr, "GRED: failed to calculate EWMA constant.\n"); + return -1; + } + if (wlog >= 10) + fprintf(stderr, "GRED: WARNING. 
Burst %d seems to be to " + "large.\n", burst); + opt.Wlog = wlog; + if ((wlog = tc_red_eval_P(opt.qth_min, opt.qth_max, probability)) < 0) { + fprintf(stderr, "GRED: failed to calculate probability.\n"); + return -1; + } + opt.Plog = wlog; + if ((wlog = tc_red_eval_idle_damping(opt.Wlog, avpkt, rate, sbuf)) < 0) + { + fprintf(stderr, "GRED: failed to calculate idle damping " + "table.\n"); + return -1; + } + opt.Scell_log = wlog; + + tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len)); + + addattr_l(n, 1024, TCA_OPTIONS, NULL, 0); + addattr_l(n, 1024, TCA_GRED_PARMS, &opt, sizeof(opt)); + addattr_l(n, 1024, TCA_GRED_STAB, sbuf, 256); + tail->rta_len = (((void*)n)+NLMSG_ALIGN(n->nlmsg_len)) - (void*)tail; + return 0; +} + +static int gred_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) +{ + struct rtattr *tb[TCA_GRED_STAB+1]; + struct tc_gred_qopt *qopt; + int i; + SPRINT_BUF(b1); + SPRINT_BUF(b2); + SPRINT_BUF(b3); + SPRINT_BUF(b4); + SPRINT_BUF(b5); + + if (opt == NULL) + return 0; + + memset(tb, 0, sizeof(tb)); + parse_rtattr(tb, TCA_GRED_STAB, RTA_DATA(opt), RTA_PAYLOAD(opt)); + + if (tb[TCA_GRED_PARMS] == NULL) + return -1; +#if 0 + sopt = RTA_DATA(tb[TCA_GRED_DPS]); + if (RTA_PAYLOAD(tb[TCA_GRED_DPS]) < sizeof(*sopt)) { + printf("\n GRED DPs message smaller than expected\n"); + return -1; + } + + DPRINTF(f, "\n\tDPs:%d Default DP %d\n ", + sopt->DPs, sopt->def_DP); +#endif + qopt = RTA_DATA(tb[TCA_GRED_PARMS]); + if (RTA_PAYLOAD(tb[TCA_GRED_PARMS]) < sizeof(*qopt)*MAX_DPs) { + fprintf(f,"\n GRED received message smaller than expected\n"); + return -1; + } + + +#if 0 + + for (i=0;i<sopt->DPs;i++) +#endif +/* Bad hack! 
should really return a proper message as shown above*/ + + for (i=0;i<MAX_DPs;i++, qopt++) { + if (qopt->DP >= MAX_DPs) continue; + fprintf(f, "\n DP:%d (prio %d) Average Queue %s Measured " + "Queue %s ", + qopt->DP, + qopt->prio, + sprint_size(qopt->qave, b4), + sprint_size(qopt->backlog, b5)); + fprintf(f, "\n\t Packet drops: %d (forced %d early %d) ", + qopt->forced+qopt->early, + qopt->forced, + qopt->early); + fprintf(f, "\n\t Packet totals: %u (bytes %u) ", + qopt->packets, + qopt->bytesin); + if (show_details) + fprintf(f, "\n limit %s min %s max %s ", + sprint_size(qopt->limit, b1), + sprint_size(qopt->qth_min, b2), + sprint_size(qopt->qth_max, b3)); + fprintf(f, "ewma %u Plog %u Scell_log %u", + qopt->Wlog, qopt->Plog, qopt->Scell_log); + } + return 0; +} + +static int gred_print_xstats(struct qdisc_util *qu, FILE *f, + struct rtattr *xstats) +{ + return 0; +} + + +struct qdisc_util gred_util = { + NULL, + "gred", + gred_parse_opt, + gred_print_opt, + gred_print_xstats, +}; diff --git a/tc/q_hfsc.c b/tc/q_hfsc.c index e69de29b..b9b7b751 100644 --- a/tc/q_hfsc.c +++ b/tc/q_hfsc.c @@ -0,0 +1,61 @@ +/* + * q_hfsc.c HFSC. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "utils.h" +#include "tc_util.h" + +static void explain() +{ + fprintf(stderr, "Usage: ... 
hfsc \n"); +} + +static void explain1(char *arg) +{ + fprintf(stderr, "Illegal \"%s\"\n", arg); +} + + +#define usage() return(-1) + +static int hfsc_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n) +{ + return -1; +} + +static int hfsc_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) +{ + return -1; +} + +static int hfsc_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats) +{ + return -1; +} + +struct qdisc_util hfsc_util = { + NULL, + "hfsc", + hfsc_parse_opt, + hfsc_print_opt, + hfsc_print_xstats, +}; + diff --git a/tc/q_hpfq.c b/tc/q_hpfq.c index e69de29b..c2963669 100644 --- a/tc/q_hpfq.c +++ b/tc/q_hpfq.c @@ -0,0 +1,61 @@ +/* + * q_hpfq.c HPFQ. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "utils.h" +#include "tc_util.h" + +static void explain() +{ + fprintf(stderr, "Usage: ... 
hpfq \n"); +} + +static void explain1(char *arg) +{ + fprintf(stderr, "Illegal \"%s\"\n", arg); +} + + +#define usage() return(-1) + +static int hpfq_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n) +{ + return -1; +} + +static int hpfq_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) +{ + return -1; +} + +static int hpfq_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats) +{ + return -1; +} + +struct qdisc_util hpfq_util = { + NULL, + "hpfq", + hpfq_parse_opt, + hpfq_print_opt, + hpfq_print_xstats, +}; + diff --git a/tc/q_ingress.c b/tc/q_ingress.c index e69de29b..0a089062 100644 --- a/tc/q_ingress.c +++ b/tc/q_ingress.c @@ -0,0 +1,76 @@ +/* + * + * q_ingress.c INGRESS. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: J Hadi Salim + * + * This is here just in case it is needed + * useless right now; might be useful in the future + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "utils.h" +#include "tc_util.h" + +static void explain(void) +{ + fprintf(stderr, "Usage: ... 
ingress \n"); +} + +#define usage() return(-1) + +static int ingress_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n) +{ + + if (argc > 0) { + while (argc > 0) { + + if (strcmp(*argv, "handle") == 0) { + NEXT_ARG(); + argc--; argv++; + } else { + fprintf(stderr, "What is \"%s\"?\n", *argv); + explain(); + return -1; + } + } + } + + addattr_l(n, 1024, TCA_OPTIONS, NULL, 0); + return 0; +} + +static int ingress_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) +{ + + fprintf(f, "---------------- "); + return 0; +} + +static int ingress_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats) +{ + return 0; +} + +struct qdisc_util ingress_util = { + NULL, + "ingress", + ingress_parse_opt, + ingress_print_opt, + ingress_print_xstats, +}; diff --git a/tc/q_prio.c b/tc/q_prio.c index e69de29b..ddda601b 100644 --- a/tc/q_prio.c +++ b/tc/q_prio.c @@ -0,0 +1,127 @@ +/* + * q_prio.c PRIO. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Changes: + * + * Ole Husgaard <sparre@login.dknet.dk>: 990513: prio2band map was always reset. + * J Hadi Salim <hadi@cyberus.ca>: 990609: priomap fix. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "utils.h" +#include "tc_util.h" + +static void explain(void) +{ + fprintf(stderr, "Usage: ... 
prio bands NUMBER priomap P1 P2...\n"); +} + +#define usage() return(-1) + +static int prio_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n) +{ + int ok=0; + int pmap_mode = 0; + int idx = 0; + struct tc_prio_qopt opt={3,{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }}; + + while (argc > 0) { + if (strcmp(*argv, "bands") == 0) { + if (pmap_mode) + explain(); + NEXT_ARG(); + if (get_integer(&opt.bands, *argv, 10)) { + fprintf(stderr, "Illegal \"bands\"\n"); + return -1; + } + ok++; + } else if (strcmp(*argv, "priomap") == 0) { + if (pmap_mode) { + fprintf(stderr, "Error: duplicate priomap\n"); + return -1; + } + pmap_mode = 1; + } else if (strcmp(*argv, "help") == 0) { + explain(); + return -1; + } else { + unsigned band; + if (!pmap_mode) { + fprintf(stderr, "What is \"%s\"?\n", *argv); + explain(); + return -1; + } + if (get_unsigned(&band, *argv, 10)) { + fprintf(stderr, "Illegal \"priomap\" element\n"); + return -1; + } + if (band > opt.bands) { + fprintf(stderr, "\"priomap\" element is out of bands\n"); + return -1; + } + if (idx > TC_PRIO_MAX) { + fprintf(stderr, "\"priomap\" index > TC_PRIO_MAX=%u\n", TC_PRIO_MAX); + return -1; + } + opt.priomap[idx++] = band; + } + argc--; argv++; + } + +/* + if (pmap_mode) { + for (; idx < TC_PRIO_MAX; idx++) + opt.priomap[idx] = opt.priomap[TC_PRIO_BESTEFFORT]; + } +*/ + addattr_l(n, 1024, TCA_OPTIONS, &opt, sizeof(opt)); + return 0; +} + +static int prio_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) +{ + int i; + struct tc_prio_qopt *qopt; + + if (opt == NULL) + return 0; + + if (RTA_PAYLOAD(opt) < sizeof(*qopt)) + return -1; + qopt = RTA_DATA(opt); + fprintf(f, "bands %u priomap ", qopt->bands); + for (i=0; i<=TC_PRIO_MAX; i++) + fprintf(f, " %d", qopt->priomap[i]); + return 0; +} + +static int prio_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats) +{ + return 0; +} + + +struct qdisc_util prio_util = { + NULL, + "prio", + prio_parse_opt, + prio_print_opt, + 
prio_print_xstats, +}; + diff --git a/tc/q_red.c b/tc/q_red.c index e69de29b..c156d47c 100644 --- a/tc/q_red.c +++ b/tc/q_red.c @@ -0,0 +1,222 @@ +/* + * q_red.c RED. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "utils.h" +#include "tc_util.h" + +#include "tc_red.h" + +static void explain(void) +{ + fprintf(stderr, "Usage: ... red limit BYTES min BYTES max BYTES avpkt BYTES burst PACKETS\n"); + fprintf(stderr, " probability PROBABILITY bandwidth KBPS [ ecn ]\n"); +} + +#define usage() return(-1) + +static int red_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n) +{ + int ok=0; + struct tc_red_qopt opt; + unsigned burst = 0; + unsigned avpkt = 0; + double probability = 0.02; + unsigned rate = 0; + int ecn_ok = 0; + int wlog; + __u8 sbuf[256]; + struct rtattr *tail; + + memset(&opt, 0, sizeof(opt)); + + while (argc > 0) { + if (strcmp(*argv, "limit") == 0) { + NEXT_ARG(); + if (get_size(&opt.limit, *argv)) { + fprintf(stderr, "Illegal \"limit\"\n"); + return -1; + } + ok++; + } else if (strcmp(*argv, "min") == 0) { + NEXT_ARG(); + if (get_size(&opt.qth_min, *argv)) { + fprintf(stderr, "Illegal \"min\"\n"); + return -1; + } + ok++; + } else if (strcmp(*argv, "max") == 0) { + NEXT_ARG(); + if (get_size(&opt.qth_max, *argv)) { + fprintf(stderr, "Illegal \"max\"\n"); + return -1; + } + ok++; + } else if (strcmp(*argv, "burst") == 0) { + NEXT_ARG(); + if (get_unsigned(&burst, *argv, 0)) { + fprintf(stderr, "Illegal \"burst\"\n"); + return -1; + } + ok++; + } 
else if (strcmp(*argv, "avpkt") == 0) { + NEXT_ARG(); + if (get_size(&avpkt, *argv)) { + fprintf(stderr, "Illegal \"avpkt\"\n"); + return -1; + } + ok++; + } else if (strcmp(*argv, "probability") == 0) { + NEXT_ARG(); + if (sscanf(*argv, "%lg", &probability) != 1) { + fprintf(stderr, "Illegal \"probability\"\n"); + return -1; + } + ok++; + } else if (strcmp(*argv, "bandwidth") == 0) { + NEXT_ARG(); + if (get_rate(&rate, *argv)) { + fprintf(stderr, "Illegal \"bandwidth\"\n"); + return -1; + } + ok++; + } else if (strcmp(*argv, "ecn") == 0) { + ecn_ok = 1; + ok++; + } else if (strcmp(*argv, "help") == 0) { + explain(); + return -1; + } else { + fprintf(stderr, "What is \"%s\"?\n", *argv); + explain(); + return -1; + } + argc--; argv++; + } + + if (!ok) + return 0; + + if (rate == 0) + get_rate(&rate, "10Mbit"); + + if (!opt.qth_min || !opt.qth_max || !burst || !opt.limit || !avpkt) { + fprintf(stderr, "Required parameter (min, max, burst, limit, avpket) is missing\n"); + return -1; + } + + if ((wlog = tc_red_eval_ewma(opt.qth_min, burst, avpkt)) < 0) { + fprintf(stderr, "RED: failed to calculate EWMA constant.\n"); + return -1; + } + if (wlog >= 10) + fprintf(stderr, "RED: WARNING. 
Burst %d seems to be to large.\n", burst); + opt.Wlog = wlog; + if ((wlog = tc_red_eval_P(opt.qth_min, opt.qth_max, probability)) < 0) { + fprintf(stderr, "RED: failed to calculate probability.\n"); + return -1; + } + opt.Plog = wlog; + if ((wlog = tc_red_eval_idle_damping(opt.Wlog, avpkt, rate, sbuf)) < 0) { + fprintf(stderr, "RED: failed to calculate idle damping table.\n"); + return -1; + } + opt.Scell_log = wlog; + if (ecn_ok) { +#ifdef TC_RED_ECN + opt.flags |= TC_RED_ECN; +#else + fprintf(stderr, "RED: ECN support is missing in this binary.\n"); + return -1; +#endif + } + + tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len)); + + addattr_l(n, 1024, TCA_OPTIONS, NULL, 0); + addattr_l(n, 1024, TCA_RED_PARMS, &opt, sizeof(opt)); + addattr_l(n, 1024, TCA_RED_STAB, sbuf, 256); + tail->rta_len = (((void*)n)+NLMSG_ALIGN(n->nlmsg_len)) - (void*)tail; + return 0; +} + +static int red_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) +{ + struct rtattr *tb[TCA_RED_STAB+1]; + struct tc_red_qopt *qopt; + SPRINT_BUF(b1); + SPRINT_BUF(b2); + SPRINT_BUF(b3); + + if (opt == NULL) + return 0; + + memset(tb, 0, sizeof(tb)); + parse_rtattr(tb, TCA_RED_STAB, RTA_DATA(opt), RTA_PAYLOAD(opt)); + + if (tb[TCA_RED_PARMS] == NULL) + return -1; + qopt = RTA_DATA(tb[TCA_RED_PARMS]); + if (RTA_PAYLOAD(tb[TCA_RED_PARMS]) < sizeof(*qopt)) + return -1; + fprintf(f, "limit %s min %s max %s ", + sprint_size(qopt->limit, b1), + sprint_size(qopt->qth_min, b2), + sprint_size(qopt->qth_max, b3)); +#ifdef TC_RED_ECN + if (qopt->flags & TC_RED_ECN) + fprintf(f, "ecn "); +#endif + if (show_details) { + fprintf(f, "ewma %u Plog %u Scell_log %u", + qopt->Wlog, qopt->Plog, qopt->Scell_log); + } + return 0; +} + +static int red_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats) +{ +#ifdef TC_RED_ECN + struct tc_red_xstats *st; + + if (xstats == NULL) + return 0; + + if (RTA_PAYLOAD(xstats) < sizeof(*st)) + return -1; + + st = RTA_DATA(xstats); + fprintf(f, " 
marked %u early %u pdrop %u other %u", + st->marked, st->early, st->pdrop, st->other); + return 0; + +#endif + return 0; +} + + +struct qdisc_util red_util = { + NULL, + "red", + red_parse_opt, + red_print_opt, + red_print_xstats, +}; diff --git a/tc/q_sfq.c b/tc/q_sfq.c index e69de29b..d7a3c0fa 100644 --- a/tc/q_sfq.c +++ b/tc/q_sfq.c @@ -0,0 +1,115 @@ +/* + * q_sfq.c SFQ. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "utils.h" +#include "tc_util.h" + +static void explain(void) +{ + fprintf(stderr, "Usage: ... 
sfq [ limit NUMBER ] [ perturb SECS ] [ quantum BYTES ]\n"); +} + +#define usage() return(-1) + +static int sfq_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n) +{ + int ok=0; + struct tc_sfq_qopt opt; + + memset(&opt, 0, sizeof(opt)); + + while (argc > 0) { + if (strcmp(*argv, "quantum") == 0) { + NEXT_ARG(); + if (get_size(&opt.quantum, *argv)) { + fprintf(stderr, "Illegal \"limit\"\n"); + return -1; + } + ok++; + } else if (strcmp(*argv, "perturb") == 0) { + NEXT_ARG(); + if (get_integer(&opt.perturb_period, *argv, 0)) { + fprintf(stderr, "Illegal \"perturb\"\n"); + return -1; + } + ok++; + } else if (strcmp(*argv, "limit") == 0) { + NEXT_ARG(); + if (get_u32(&opt.limit, *argv, 0)) { + fprintf(stderr, "Illegal \"limit\"\n"); + return -1; + } + if (opt.limit < 2) { + fprintf(stderr, "Illegal \"limit\", must be > 1\n"); + return -1; + } + ok++; + } else if (strcmp(*argv, "help") == 0) { + explain(); + return -1; + } else { + fprintf(stderr, "What is \"%s\"?\n", *argv); + explain(); + return -1; + } + argc--; argv++; + } + + if (ok) + addattr_l(n, 1024, TCA_OPTIONS, &opt, sizeof(opt)); + return 0; +} + +static int sfq_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) +{ + struct tc_sfq_qopt *qopt; + SPRINT_BUF(b1); + + if (opt == NULL) + return 0; + + if (RTA_PAYLOAD(opt) < sizeof(*qopt)) + return -1; + qopt = RTA_DATA(opt); + fprintf(f, "limit %up ", qopt->limit); + fprintf(f, "quantum %s ", sprint_size(qopt->quantum, b1)); + if (show_details) { + fprintf(f, "flows %u/%u ", qopt->flows, qopt->divisor); + } + if (qopt->perturb_period) + fprintf(f, "perturb %dsec ", qopt->perturb_period); + return 0; +} + +static int sfq_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats) +{ + return 0; +} + + +struct qdisc_util sfq_util = { + NULL, + "sfq", + sfq_parse_opt, + sfq_print_opt, + sfq_print_xstats, +}; diff --git a/tc/q_tbf.c b/tc/q_tbf.c index e69de29b..01d514fb 100644 --- a/tc/q_tbf.c +++ b/tc/q_tbf.c @@ -0,0 
+1,272 @@ +/* + * q_tbf.c TBF. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "utils.h" +#include "tc_util.h" + +static void explain(void) +{ + fprintf(stderr, "Usage: ... tbf limit BYTES burst BYTES[/BYTES] rate KBPS [ mtu BYTES[/BYTES] ]\n"); + fprintf(stderr, " [ peakrate KBPS ] [ latency TIME ]\n"); +} + +static void explain1(char *arg) +{ + fprintf(stderr, "Illegal \"%s\"\n", arg); +} + + +#define usage() return(-1) + +static int tbf_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n) +{ + int ok=0; + struct tc_tbf_qopt opt; + __u32 rtab[256]; + __u32 ptab[256]; + unsigned buffer=0, mtu=0, mpu=0, latency=0; + int Rcell_log=-1, Pcell_log = -1; + struct rtattr *tail; + + memset(&opt, 0, sizeof(opt)); + + while (argc > 0) { + if (matches(*argv, "limit") == 0) { + NEXT_ARG(); + if (opt.limit || latency) { + fprintf(stderr, "Double \"limit/latency\" spec\n"); + return -1; + } + if (get_size(&opt.limit, *argv)) { + explain1("limit"); + return -1; + } + ok++; + } else if (matches(*argv, "latency") == 0) { + NEXT_ARG(); + if (opt.limit || latency) { + fprintf(stderr, "Double \"limit/latency\" spec\n"); + return -1; + } + if (get_usecs(&latency, *argv)) { + explain1("latency"); + return -1; + } + ok++; + } else if (matches(*argv, "burst") == 0 || + strcmp(*argv, "buffer") == 0 || + strcmp(*argv, "maxburst") == 0) { + NEXT_ARG(); + if (buffer) { + fprintf(stderr, "Double \"buffer/burst\" spec\n"); + return -1; + } + if (get_size_and_cell(&buffer, &Rcell_log, 
*argv) < 0) { + explain1("buffer"); + return -1; + } + ok++; + } else if (strcmp(*argv, "mtu") == 0 || + strcmp(*argv, "minburst") == 0) { + NEXT_ARG(); + if (mtu) { + fprintf(stderr, "Double \"mtu/minburst\" spec\n"); + return -1; + } + if (get_size_and_cell(&mtu, &Pcell_log, *argv) < 0) { + explain1("mtu"); + return -1; + } + ok++; + } else if (strcmp(*argv, "mpu") == 0) { + NEXT_ARG(); + if (mpu) { + fprintf(stderr, "Double \"mpu\" spec\n"); + return -1; + } + if (get_size(&mpu, *argv)) { + explain1("mpu"); + return -1; + } + ok++; + } else if (strcmp(*argv, "rate") == 0) { + NEXT_ARG(); + if (opt.rate.rate) { + fprintf(stderr, "Double \"rate\" spec\n"); + return -1; + } + if (get_rate(&opt.rate.rate, *argv)) { + explain1("rate"); + return -1; + } + ok++; + } else if (matches(*argv, "peakrate") == 0) { + NEXT_ARG(); + if (opt.peakrate.rate) { + fprintf(stderr, "Double \"peakrate\" spec\n"); + return -1; + } + if (get_rate(&opt.peakrate.rate, *argv)) { + explain1("peakrate"); + return -1; + } + ok++; + } else if (strcmp(*argv, "help") == 0) { + explain(); + return -1; + } else { + fprintf(stderr, "What is \"%s\"?\n", *argv); + explain(); + return -1; + } + argc--; argv++; + } + + if (!ok) + return 0; + + if (opt.rate.rate == 0 || !buffer) { + fprintf(stderr, "Both \"rate\" and \"burst\" are required.\n"); + return -1; + } + if (opt.peakrate.rate) { + if (!mtu) { + fprintf(stderr, "\"mtu\" is required, if \"peakrate\" is requested.\n"); + return -1; + } + } + + if (opt.limit == 0 && latency == 0) { + fprintf(stderr, "Either \"limit\" or \"latency\" are required.\n"); + return -1; + } + + if (opt.limit == 0) { + double lim = opt.rate.rate*(double)latency/1000000 + buffer; + if (opt.peakrate.rate) { + double lim2 = opt.peakrate.rate*(double)latency/1000000 + mtu; + if (lim2 < lim) + lim = lim2; + } + opt.limit = lim; + } + + if ((Rcell_log = tc_calc_rtable(opt.rate.rate, rtab, Rcell_log, mtu, mpu)) < 0) { + fprintf(stderr, "TBF: failed to calculate rate table.\n"); 
+ return -1; + } + opt.buffer = tc_calc_xmittime(opt.rate.rate, buffer); + opt.rate.cell_log = Rcell_log; + opt.rate.mpu = mpu; + if (opt.peakrate.rate) { + if ((Pcell_log = tc_calc_rtable(opt.peakrate.rate, ptab, Pcell_log, mtu, mpu)) < 0) { + fprintf(stderr, "TBF: failed to calculate peak rate table.\n"); + return -1; + } + opt.mtu = tc_calc_xmittime(opt.peakrate.rate, mtu); + opt.peakrate.cell_log = Pcell_log; + opt.peakrate.mpu = mpu; + } + + tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len)); + addattr_l(n, 1024, TCA_OPTIONS, NULL, 0); + addattr_l(n, 2024, TCA_TBF_PARMS, &opt, sizeof(opt)); + addattr_l(n, 3024, TCA_TBF_RTAB, rtab, 1024); + if (opt.peakrate.rate) + addattr_l(n, 4096, TCA_TBF_PTAB, ptab, 1024); + tail->rta_len = (((void*)n)+NLMSG_ALIGN(n->nlmsg_len)) - (void*)tail; + return 0; +} + +static int tbf_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) +{ + struct rtattr *tb[TCA_TBF_PTAB+1]; + struct tc_tbf_qopt *qopt; + double buffer, mtu; + double latency; + SPRINT_BUF(b1); + SPRINT_BUF(b2); + + if (opt == NULL) + return 0; + + memset(tb, 0, sizeof(tb)); + parse_rtattr(tb, TCA_TBF_PTAB, RTA_DATA(opt), RTA_PAYLOAD(opt)); + + if (tb[TCA_TBF_PARMS] == NULL) + return -1; + + qopt = RTA_DATA(tb[TCA_TBF_PARMS]); + if (RTA_PAYLOAD(tb[TCA_TBF_PARMS]) < sizeof(*qopt)) + return -1; + fprintf(f, "rate %s ", sprint_rate(qopt->rate.rate, b1)); + buffer = ((double)qopt->rate.rate*tc_core_tick2usec(qopt->buffer))/1000000; + if (show_details) { + fprintf(f, "burst %s/%u mpu %s ", sprint_size(buffer, b1), + 1<<qopt->rate.cell_log, sprint_size(qopt->rate.mpu, b2)); + } else { + fprintf(f, "burst %s ", sprint_size(buffer, b1)); + } + if (show_raw) + fprintf(f, "[%08x] ", qopt->buffer); + if (qopt->peakrate.rate) { + fprintf(f, "peakrate %s ", sprint_rate(qopt->peakrate.rate, b1)); + if (qopt->mtu || qopt->peakrate.mpu) { + mtu = ((double)qopt->peakrate.rate*tc_core_tick2usec(qopt->mtu))/1000000; + if (show_details) { + fprintf(f, "mtu %s/%u mpu 
%s ", sprint_size(mtu, b1), + 1<<qopt->peakrate.cell_log, sprint_size(qopt->peakrate.mpu, b2)); + } else { + fprintf(f, "minburst %s ", sprint_size(mtu, b1)); + } + if (show_raw) + fprintf(f, "[%08x] ", qopt->mtu); + } + } + + if (show_raw) + fprintf(f, "limit %s ", sprint_size(qopt->limit, b1)); + + latency = 1000000*(qopt->limit/(double)qopt->rate.rate) - tc_core_tick2usec(qopt->buffer); + if (qopt->peakrate.rate) { + double lat2 = 1000000*(qopt->limit/(double)qopt->peakrate.rate) - tc_core_tick2usec(qopt->mtu); + if (lat2 > latency) + latency = lat2; + } + fprintf(f, "lat %s ", sprint_usecs(tc_core_tick2usec(latency), b1)); + + return 0; +} + +static int tbf_print_xstats(struct qdisc_util *qu, FILE *f, struct rtattr *xstats) +{ + return 0; +} + +struct qdisc_util tbf_util = { + NULL, + "tbf", + tbf_parse_opt, + tbf_print_opt, + tbf_print_xstats, +}; + diff --git a/tc/tc.c b/tc/tc.c index e69de29b..35b3a95c 100644 --- a/tc/tc.c +++ b/tc/tc.c @@ -0,0 +1,306 @@ +/* + * tc.c "tc" utility frontend. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + * Fixes: + * + * Petri Mattila <petri@prihateam.fi> 990308: wrong memset's resulted in faults + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <dlfcn.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> +#include <errno.h> + +#include "SNAPSHOT.h" +#include "utils.h" +#include "tc_util.h" +#include "tc_common.h" + +int show_stats = 0; +int show_details = 0; +int show_raw = 0; +int resolve_hosts = 0; + +void *BODY; +static struct qdisc_util * qdisc_list; +static struct filter_util * filter_list; + +static int print_noqopt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) +{ + if (opt && RTA_PAYLOAD(opt)) + fprintf(f, "[Unknown qdisc, optlen=%u] ", RTA_PAYLOAD(opt)); + return 0; +} + +static int parse_noqopt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n) +{ + if (argc) { + fprintf(stderr, "Unknown qdisc \"%s\", hence option \"%s\" is unparsable\n", qu->id, *argv); + return -1; + } + return 0; +} + +static int print_nofopt(struct filter_util *qu, FILE *f, struct rtattr *opt, __u32 fhandle) +{ + if (opt && RTA_PAYLOAD(opt)) + fprintf(f, "fh %08x [Unknown filter, optlen=%u] ", fhandle, RTA_PAYLOAD(opt)); + else if (fhandle) + fprintf(f, "fh %08x ", fhandle); + return 0; +} + +static int parse_nofopt(struct filter_util *qu, char *fhandle, int argc, char **argv, struct nlmsghdr *n) +{ + __u32 handle; + + if (argc) { + fprintf(stderr, "Unknown filter \"%s\", hence option \"%s\" is unparsable\n", qu->id, *argv); + return -1; + } + if (fhandle) { + struct tcmsg *t = NLMSG_DATA(n); + if (get_u32(&handle, fhandle, 16)) { + fprintf(stderr, "Unparsable filter ID \"%s\"\n", fhandle); + return -1; + } + t->tcm_handle = handle; + } + return 0; +} + +#if 0 +/* Builtin filter types */ + +static int f_parse_noopt(struct filter_util *qu, char *fhandle, int argc, char **argv, struct 
nlmsghdr *n) +{ + if (argc || fhandle) { + fprintf(stderr, "Filter \"%s\" has no options.\n", qu->id); + return -1; + } + return 0; +} +#endif + +struct qdisc_util *get_qdisc_kind(char *str) +{ + void *dlh; + char buf[256]; + struct qdisc_util *q; + + for (q = qdisc_list; q; q = q->next) + if (strcmp(q->id, str) == 0) + return q; + + snprintf(buf, sizeof(buf), "q_%s.so", str); + dlh = dlopen(buf, RTLD_LAZY); + if (dlh == NULL) { + dlh = BODY; + if (dlh == NULL) { + dlh = BODY = dlopen(NULL, RTLD_LAZY); + if (dlh == NULL) + goto noexist; + } + } + + snprintf(buf, sizeof(buf), "%s_util", str); + q = dlsym(dlh, buf); + if (q == NULL) + goto noexist; + +reg: + q->next = qdisc_list; + qdisc_list = q; + return q; + +noexist: + q = malloc(sizeof(*q)); + if (q) { + memset(q, 0, sizeof(*q)); + strncpy(q->id, str, 15); + q->parse_qopt = parse_noqopt; + q->print_qopt = print_noqopt; + goto reg; + } + return q; +} + + +struct filter_util *get_filter_kind(char *str) +{ + void *dlh; + char buf[256]; + struct filter_util *q; + + for (q = filter_list; q; q = q->next) + if (strcmp(q->id, str) == 0) + return q; + + snprintf(buf, sizeof(buf), "f_%s.so", str); + dlh = dlopen(buf, RTLD_LAZY); + if (dlh == NULL) { + dlh = BODY; + if (dlh == NULL) { + dlh = BODY = dlopen(NULL, RTLD_LAZY); + if (dlh == NULL) + goto noexist; + } + } + + snprintf(buf, sizeof(buf), "%s_util", str); + q = dlsym(dlh, buf); + if (q == NULL) + goto noexist; + +reg: + q->next = filter_list; + filter_list = q; + return q; + +noexist: + q = malloc(sizeof(*q)); + if (q) { + memset(q, 0, sizeof(*q)); + strncpy(q->id, str, 15); + q->parse_fopt = parse_nofopt; + q->print_fopt = print_nofopt; + goto reg; + } + return q; +} + +static void usage(void) __attribute__((noreturn)); + +static void usage(void) +{ + fprintf(stderr, "Usage: tc [ OPTIONS ] OBJECT { COMMAND | help }\n" + "where OBJECT := { qdisc | class | filter }\n" + " OPTIONS := { -s[tatistics] | -d[etails] | -r[aw] | -b[atch] file }\n"); + exit(-1); +} + + + 
+int main(int argc, char **argv) +{ + char *basename; + + basename = strrchr(argv[0], '/'); + if (basename == NULL) + basename = argv[0]; + else + basename++; + + + /* batch mode */ + if (argc > 1 && matches(argv[1], "-batch") == 0) { + FILE *batch; + char line[400]; + char *largv[100]; + int largc, ret=0; +#define BMAXARG (sizeof(largv)/sizeof(char *)-2) + + if (argc != 3) { + fprintf(stderr, "Wrong number of arguments in batch mode\n"); + exit(-1); + } + if (matches(argv[2], "-") != 0) { + if ((batch = fopen(argv[2], "r")) == NULL) { + fprintf(stderr, "Cannot open file \"%s\" for reading: %s=n", argv[2], strerror(errno)); + exit(-1); + } + } else { + if ((batch = fdopen(0, "r")) == NULL) { + fprintf(stderr, "Cannot open stdin for reading: %s=n", strerror(errno)); + exit(-1); + } + } + + tc_core_init(); + + while (fgets(line, sizeof(line)-1, batch)) { + if (line[strlen(line)-1]=='\n') { + line[strlen(line)-1] = '\0'; + } else { + fprintf(stderr, "No newline at the end of line, looks like to long (%d chars or more)\n", strlen(line)); + exit(-1); + } + largc = 0; + largv[largc]=strtok(line, " "); + while ((largv[++largc]=strtok(NULL, " ")) != NULL) { + if (largc > BMAXARG) { + fprintf(stderr, "Over %d arguments in batch mode, enough!\n", BMAXARG); + exit(-1); + } + } + + if (matches(largv[0], "qdisc") == 0) { + ret += do_qdisc(largc-1, largv+1); + } else if (matches(largv[0], "class") == 0) { + ret += do_class(largc-1, largv+1); + } else if (matches(largv[0], "filter") == 0) { + ret += do_filter(largc-1, largv+1); + } else if (matches(largv[0], "help") == 0) { + usage(); /* note that usage() doesn't return */ + } else { + fprintf(stderr, "Object \"%s\" is unknown, try \"tc help\".\n", largv[1]); + exit(-1); + } + } + fclose(batch); + exit(0); /* end of batch, that's all */ + } + + while (argc > 1) { + if (argv[1][0] != '-') + break; + if (matches(argv[1], "-stats") == 0 || + matches(argv[1], "-statistics") == 0) { + ++show_stats; + } else if (matches(argv[1], 
"-details") == 0) { + ++show_details; + } else if (matches(argv[1], "-raw") == 0) { + ++show_raw; + } else if (matches(argv[1], "-Version") == 0) { + printf("tc utility, iproute2-ss%s\n", SNAPSHOT); + exit(0); + } else if (matches(argv[1], "-help") == 0) { + usage(); + } else { + fprintf(stderr, "Option \"%s\" is unknown, try \"tc -help\".\n", argv[1]); + exit(-1); + } + argc--; argv++; + } + + tc_core_init(); + + if (argc > 1) { + if (matches(argv[1], "qdisc") == 0) + return do_qdisc(argc-2, argv+2); + if (matches(argv[1], "class") == 0) + return do_class(argc-2, argv+2); + if (matches(argv[1], "filter") == 0) + return do_filter(argc-2, argv+2); + if (matches(argv[1], "help") == 0) + usage(); + fprintf(stderr, "Object \"%s\" is unknown, try \"tc help\".\n", argv[1]); + exit(-1); + } + + usage(); +} diff --git a/tc/tc_cbq.c b/tc/tc_cbq.c index e69de29b..0abcc9da 100644 --- a/tc/tc_cbq.c +++ b/tc/tc_cbq.c @@ -0,0 +1,57 @@ +/* + * tc_cbq.c CBQ maintanance routines. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
/*
 * Compute the CBQ "maxidle" value, in scheduler ticks, from the link
 * bandwidth bndw, the class rate, the average packet size avpkt, the EWMA
 * log constant ewma_log and the allowed burst maxburst (in packets).
 * The result is converted with tc_core_usec2tick().
 */
unsigned tc_cbq_calc_maxidle(unsigned bndw, unsigned rate, unsigned avpkt,
			     int ewma_log, unsigned maxburst)
{
	double scale = 1<<ewma_log;
	double g = 1.0 - 1.0/scale;
	double xmt = (double)avpkt/bndw;
	double maxidle = xmt*(1-g);

	if (bndw != rate && maxburst) {
		double vxmt = (double)avpkt/rate - xmt;

		vxmt *= (pow(g, -(double)maxburst) - 1);
		/* Use the larger of the two estimates. */
		if (vxmt > maxidle)
			maxidle = vxmt;
	}
	return tc_core_usec2tick(maxidle*scale*1000000);
}
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> +#include <math.h> + +#include "utils.h" +#include "tc_util.h" +#include "tc_common.h" + +static void usage(void) __attribute__((noreturn)); + +static void usage(void) +{ + fprintf(stderr, "Usage: tc class [ add | del | change | get ] dev STRING\n"); + fprintf(stderr, " [ classid CLASSID ] [ root | parent CLASSID ]\n"); + fprintf(stderr, " [ [ QDISC_KIND ] [ help | OPTIONS ] ]\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " tc class show [ dev STRING ] [ root | parent CLASSID ]\n"); + fprintf(stderr, "Where:\n"); + fprintf(stderr, "QDISC_KIND := { prio | cbq | etc. }\n"); + fprintf(stderr, "OPTIONS := ... 
try tc class add <desired QDISC_KIND> help\n"); + exit(-1); +} + +int tc_class_modify(int cmd, unsigned flags, int argc, char **argv) +{ + struct rtnl_handle rth; + struct { + struct nlmsghdr n; + struct tcmsg t; + char buf[4096]; + } req; + struct qdisc_util *q = NULL; + struct tc_estimator est; + char d[16]; + char k[16]; + + memset(&req, 0, sizeof(req)); + memset(&est, 0, sizeof(est)); + memset(d, 0, sizeof(d)); + memset(k, 0, sizeof(k)); + + req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)); + req.n.nlmsg_flags = NLM_F_REQUEST|flags; + req.n.nlmsg_type = cmd; + req.t.tcm_family = AF_UNSPEC; + + while (argc > 0) { + if (strcmp(*argv, "dev") == 0) { + NEXT_ARG(); + if (d[0]) + duparg("dev", *argv); + strncpy(d, *argv, sizeof(d)-1); + } else if (strcmp(*argv, "classid") == 0) { + __u32 handle; + NEXT_ARG(); + if (req.t.tcm_handle) + duparg("classid", *argv); + if (get_tc_classid(&handle, *argv)) + invarg(*argv, "invalid class ID"); + req.t.tcm_handle = handle; + } else if (strcmp(*argv, "root") == 0) { + if (req.t.tcm_parent) { + fprintf(stderr, "Error: \"root\" is duplicate parent ID.\n"); + exit(-1); + } + req.t.tcm_parent = TC_H_ROOT; + } else if (strcmp(*argv, "parent") == 0) { + __u32 handle; + NEXT_ARG(); + if (req.t.tcm_parent) + duparg("parent", *argv); + if (get_tc_classid(&handle, *argv)) + invarg(*argv, "invalid parent ID"); + req.t.tcm_parent = handle; + } else if (matches(*argv, "estimator") == 0) { + if (parse_estimator(&argc, &argv, &est)) + return -1; + } else if (matches(*argv, "help") == 0) { + usage(); + } else { + strncpy(k, *argv, sizeof(k)-1); + + q = get_qdisc_kind(k); + argc--; argv++; + break; + } + argc--; argv++; + } + + if (k[0]) + addattr_l(&req.n, sizeof(req), TCA_KIND, k, strlen(k)+1); + if (est.ewma_log) + addattr_l(&req.n, sizeof(req), TCA_RATE, &est, sizeof(est)); + + if (q) { + if (q->parse_copt == NULL) { + fprintf(stderr, "Error: Qdisc \"%s\" is classless.\n", k); + exit(1); + } + if (q->parse_copt(q, argc, argv, &req.n)) + 
exit(1); + } else { + if (argc) { + if (matches(*argv, "help") == 0) + usage(); + fprintf(stderr, "Garbage instead of arguments \"%s ...\". Try \"tc class help\".", *argv); + exit(-1); + } + } + + if (rtnl_open(&rth, 0) < 0) { + fprintf(stderr, "Cannot open rtnetlink\n"); + exit(1); + } + + if (d[0]) { + ll_init_map(&rth); + + if ((req.t.tcm_ifindex = ll_name_to_index(d)) == 0) { + fprintf(stderr, "Cannot find device \"%s\"\n", d); + exit(1); + } + } + + if (rtnl_talk(&rth, &req.n, 0, 0, NULL, NULL, NULL) < 0) + exit(2); + + rtnl_close(&rth); + return 0; +} + +void print_class_tcstats(FILE *fp, struct tc_stats *st) +{ + SPRINT_BUF(b1); + + fprintf(fp, " Sent %llu bytes %u pkts (dropped %u, overlimits %u) ", + (unsigned long long)st->bytes, st->packets, st->drops, st->overlimits); + if (st->bps || st->pps || st->qlen || st->backlog) { + fprintf(fp, "\n "); + if (st->bps || st->pps) { + fprintf(fp, "rate "); + if (st->bps) + fprintf(fp, "%s ", sprint_rate(st->bps, b1)); + if (st->pps) + fprintf(fp, "%upps ", st->pps); + } + if (st->qlen || st->backlog) { + fprintf(fp, "backlog "); + if (st->backlog) + fprintf(fp, "%s ", sprint_size(st->backlog, b1)); + if (st->qlen) + fprintf(fp, "%up ", st->qlen); + } + } +} + +int filter_ifindex; +__u32 filter_qdisc; + +int print_class(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) +{ + FILE *fp = (FILE*)arg; + struct tcmsg *t = NLMSG_DATA(n); + int len = n->nlmsg_len; + struct rtattr * tb[TCA_MAX+1]; + struct qdisc_util *q; + char abuf[256]; + + if (n->nlmsg_type != RTM_NEWTCLASS && n->nlmsg_type != RTM_DELTCLASS) { + fprintf(stderr, "Not a class\n"); + return 0; + } + len -= NLMSG_LENGTH(sizeof(*t)); + if (len < 0) { + fprintf(stderr, "Wrong len %d\n", len); + return -1; + } + if (filter_qdisc && TC_H_MAJ(t->tcm_handle^filter_qdisc)) + return 0; + + memset(tb, 0, sizeof(tb)); + parse_rtattr(tb, TCA_MAX, TCA_RTA(t), len); + + if (tb[TCA_KIND] == NULL) { + fprintf(stderr, "NULL kind\n"); + return -1; + } + + if 
(n->nlmsg_type == RTM_DELTCLASS) + fprintf(fp, "deleted "); + + abuf[0] = 0; + if (t->tcm_handle) { + if (filter_qdisc) + print_tc_classid(abuf, sizeof(abuf), TC_H_MIN(t->tcm_handle)); + else + print_tc_classid(abuf, sizeof(abuf), t->tcm_handle); + } + fprintf(fp, "class %s %s ", (char*)RTA_DATA(tb[TCA_KIND]), abuf); + + if (filter_ifindex == 0) + fprintf(fp, "dev %s ", ll_index_to_name(t->tcm_ifindex)); + + if (t->tcm_parent == TC_H_ROOT) + fprintf(fp, "root "); + else { + if (filter_qdisc) + print_tc_classid(abuf, sizeof(abuf), TC_H_MIN(t->tcm_parent)); + else + print_tc_classid(abuf, sizeof(abuf), t->tcm_parent); + fprintf(fp, "parent %s ", abuf); + } + if (t->tcm_info) + fprintf(fp, "leaf %x: ", t->tcm_info>>16); + q = get_qdisc_kind(RTA_DATA(tb[TCA_KIND])); + if (tb[TCA_OPTIONS]) { + if (q && q->print_copt) + q->print_copt(q, fp, tb[TCA_OPTIONS]); + else + fprintf(fp, "[cannot parse class parameters]"); + } + fprintf(fp, "\n"); + if (show_stats) { + if (tb[TCA_STATS]) { + if (RTA_PAYLOAD(tb[TCA_STATS]) < sizeof(struct tc_stats)) + fprintf(fp, "statistics truncated"); + else { + struct tc_stats st; + memcpy(&st, RTA_DATA(tb[TCA_STATS]), sizeof(st)); + print_class_tcstats(fp, &st); + fprintf(fp, "\n"); + } + } + if (q && tb[TCA_XSTATS]) { + q->print_xstats(q, fp, tb[TCA_XSTATS]); + fprintf(fp, "\n"); + } + } + fflush(fp); + return 0; +} + + +int tc_class_list(int argc, char **argv) +{ + struct tcmsg t; + struct rtnl_handle rth; + char d[16]; + + memset(&t, 0, sizeof(t)); + t.tcm_family = AF_UNSPEC; + memset(d, 0, sizeof(d)); + + while (argc > 0) { + if (strcmp(*argv, "dev") == 0) { + NEXT_ARG(); + if (d[0]) + duparg("dev", *argv); + strncpy(d, *argv, sizeof(d)-1); + } else if (strcmp(*argv, "qdisc") == 0) { + NEXT_ARG(); + if (filter_qdisc) + duparg("qdisc", *argv); + if (get_qdisc_handle(&filter_qdisc, *argv)) + invarg(*argv, "invalid qdisc ID"); + } else if (strcmp(*argv, "root") == 0) { + if (t.tcm_parent) { + fprintf(stderr, "Error: \"root\" is duplicate 
parent ID\n"); + exit(-1); + } + t.tcm_parent = TC_H_ROOT; + } else if (strcmp(*argv, "parent") == 0) { + __u32 handle; + if (t.tcm_parent) + duparg("parent", *argv); + NEXT_ARG(); + if (get_tc_classid(&handle, *argv)) + invarg(*argv, "invalid parent ID"); + t.tcm_parent = handle; + } else if (matches(*argv, "help") == 0) { + usage(); + } else { + fprintf(stderr, "What is \"%s\"? Try \"tc class help\".\n", *argv); + exit(-1); + } + + argc--; argv++; + } + + if (rtnl_open(&rth, 0) < 0) { + fprintf(stderr, "Cannot open rtnetlink\n"); + exit(1); + } + + ll_init_map(&rth); + + if (d[0]) { + if ((t.tcm_ifindex = ll_name_to_index(d)) == 0) { + fprintf(stderr, "Cannot find device \"%s\"\n", d); + exit(1); + } + filter_ifindex = t.tcm_ifindex; + } + + if (rtnl_dump_request(&rth, RTM_GETTCLASS, &t, sizeof(t)) < 0) { + perror("Cannot send dump request"); + exit(1); + } + + if (rtnl_dump_filter(&rth, print_class, stdout, NULL, NULL) < 0) { + fprintf(stderr, "Dump terminated\n"); + exit(1); + } + + rtnl_close(&rth); + return 0; +} + +int do_class(int argc, char **argv) +{ + if (argc < 1) + return tc_class_list(0, NULL); + if (matches(*argv, "add") == 0) + return tc_class_modify(RTM_NEWTCLASS, NLM_F_EXCL|NLM_F_CREATE, argc-1, argv+1); + if (matches(*argv, "change") == 0) + return tc_class_modify(RTM_NEWTCLASS, 0, argc-1, argv+1); + if (matches(*argv, "replace") == 0) + return tc_class_modify(RTM_NEWTCLASS, NLM_F_CREATE, argc-1, argv+1); + if (matches(*argv, "delete") == 0) + return tc_class_modify(RTM_DELTCLASS, 0, argc-1, argv+1); +#if 0 + if (matches(*argv, "get") == 0) + return tc_class_get(RTM_GETTCLASS, 0, argc-1, argv+1); +#endif + if (matches(*argv, "list") == 0 || matches(*argv, "show") == 0 + || matches(*argv, "lst") == 0) + return tc_class_list(argc-1, argv+1); + if (matches(*argv, "help") == 0) + usage(); + fprintf(stderr, "Command \"%s\" is unknown, try \"tc class help\".\n", *argv); + return -1; +} diff --git a/tc/tc_common.h b/tc/tc_common.h index 
e69de29b..d695ca2d 100644 --- a/tc/tc_common.h +++ b/tc/tc_common.h @@ -0,0 +1,5 @@ +extern int do_qdisc(int argc, char **argv); +extern int do_class(int argc, char **argv); +extern int do_filter(int argc, char **argv); + +extern int parse_estimator(int *p_argc, char ***p_argv, struct tc_estimator *est); diff --git a/tc/tc_core.c b/tc/tc_core.c index e69de29b..55586741 100644 --- a/tc/tc_core.c +++ b/tc/tc_core.c @@ -0,0 +1,85 @@ +/* + * tc_core.c TC core library. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <math.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "tc_core.h" + +static __u32 t2us=1; +static __u32 us2t=1; +static double tick_in_usec = 1; + +long tc_core_usec2tick(long usec) +{ + return usec*tick_in_usec; +} + +long tc_core_tick2usec(long tick) +{ + return tick/tick_in_usec; +} + +unsigned tc_calc_xmittime(unsigned rate, unsigned size) +{ + return tc_core_usec2tick(1000000*((double)size/rate)); +} + +/* + rtab[pkt_len>>cell_log] = pkt_xmit_time + */ + +int tc_calc_rtable(unsigned bps, __u32 *rtab, int cell_log, unsigned mtu, + unsigned mpu) +{ + int i; + + if (mtu == 0) + mtu = 2047; + + if (cell_log < 0) { + cell_log = 0; + while ((mtu>>cell_log) > 255) + cell_log++; + } + for (i=0; i<256; i++) { + unsigned sz = (i<<cell_log); + if (sz < mpu) + sz = mpu; + rtab[i] = tc_core_usec2tick(1000000*((double)sz/bps)); + } + return cell_log; +} + +int tc_core_init() +{ + FILE *fp = fopen("/proc/net/psched", "r"); + + if (fp == NULL) + return -1; + + if (fscanf(fp, "%08x%08x", &t2us, &us2t) != 2) { + fclose(fp); 
+ return -1; + } + fclose(fp); + tick_in_usec = (double)t2us/us2t; + return 0; +} diff --git a/tc/tc_core.h b/tc/tc_core.h index e69de29b..1d2257ee 100644 --- a/tc/tc_core.h +++ b/tc/tc_core.h @@ -0,0 +1,16 @@ +#ifndef _TC_CORE_H_ +#define _TC_CORE_H_ 1 + +#include <asm/types.h> +#include <linux/pkt_sched.h> + +long tc_core_usec2tick(long usec); +long tc_core_tick2usec(long tick); +unsigned tc_calc_xmittime(unsigned rate, unsigned size); +int tc_calc_rtable(unsigned bps, __u32 *rtab, int cell_log, unsigned mtu, unsigned mpu); + +int tc_setup_estimator(unsigned A, unsigned time_const, struct tc_estimator *est); + +int tc_core_init(void); + +#endif diff --git a/tc/tc_estimator.c b/tc/tc_estimator.c index e69de29b..434db0fe 100644 --- a/tc/tc_estimator.c +++ b/tc/tc_estimator.c @@ -0,0 +1,44 @@ +/* + * tc_core.c TC core library. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <math.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "tc_core.h" + +int tc_setup_estimator(unsigned A, unsigned time_const, struct tc_estimator *est) +{ + for (est->interval=0; est->interval<=5; est->interval++) { + if (A <= (1<<est->interval)*(1000000/4)) + break; + } + if (est->interval > 5) + return -1; + est->interval -= 2; + for (est->ewma_log=1; est->ewma_log<32; est->ewma_log++) { + double w = 1.0 - 1.0/(1<<est->ewma_log); + if (A/(-log(w)) > time_const) + break; + } + est->ewma_log--; + if (est->ewma_log==0 || est->ewma_log >= 31) + return -1; + return 0; +} diff --git a/tc/tc_filter.c b/tc/tc_filter.c index e69de29b..300c3e70 100644 --- a/tc/tc_filter.c +++ b/tc/tc_filter.c @@ -0,0 +1,388 @@ +/* + * tc_filter.c "tc filter". + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <net/if.h> +#include <net/if_arp.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> +#include <linux/if_ether.h> + +#include "rt_names.h" +#include "utils.h" +#include "tc_util.h" +#include "tc_common.h" + +static void usage(void) __attribute__((noreturn)); + +static void usage(void) +{ + fprintf(stderr, "Usage: tc filter [ add | del | change | get ] dev STRING\n"); + fprintf(stderr, " [ pref PRIO ] [ protocol PROTO ]\n"); + fprintf(stderr, " [ estimator INTERVAL TIME_CONSTANT ]\n"); + fprintf(stderr, " [ root | classid CLASSID ] [ handle FILTERID ]\n"); + fprintf(stderr, " [ [ FILTER_TYPE ] [ help | OPTIONS ] ]\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " tc filter show [ dev STRING ] [ root | parent CLASSID ]\n"); + fprintf(stderr, "Where:\n"); + fprintf(stderr, "FILTER_TYPE := { rsvp | u32 | fw | route | etc. }\n"); + fprintf(stderr, "FILTERID := ... format depends on classifier, see there\n"); + fprintf(stderr, "OPTIONS := ... 
try tc filter add <desired FILTER_KIND> help\n"); + exit(-1); +} + + +int tc_filter_modify(int cmd, unsigned flags, int argc, char **argv) +{ + struct rtnl_handle rth; + struct { + struct nlmsghdr n; + struct tcmsg t; + char buf[4096]; + } req; + struct filter_util *q = NULL; + __u32 prio = 0; + __u32 protocol = 0; + char *fhandle = NULL; + char d[16]; + char k[16]; + struct tc_estimator est; + + memset(&req, 0, sizeof(req)); + memset(&est, 0, sizeof(est)); + memset(d, 0, sizeof(d)); + memset(k, 0, sizeof(k)); + memset(&req, 0, sizeof(req)); + + req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)); + req.n.nlmsg_flags = NLM_F_REQUEST|flags; + req.n.nlmsg_type = cmd; + req.t.tcm_family = AF_UNSPEC; + + while (argc > 0) { + if (strcmp(*argv, "dev") == 0) { + NEXT_ARG(); + if (d[0]) + duparg("dev", *argv); + strncpy(d, *argv, sizeof(d)-1); + } else if (strcmp(*argv, "root") == 0) { + if (req.t.tcm_parent) { + fprintf(stderr, "Error: \"root\" is duplicate parent ID\n"); + exit(-1); + } + req.t.tcm_parent = TC_H_ROOT; + } else if (strcmp(*argv, "parent") == 0) { + __u32 handle; + NEXT_ARG(); + if (req.t.tcm_parent) + duparg("parent", *argv); + if (get_tc_classid(&handle, *argv)) + invarg(*argv, "Invalid parent ID"); + req.t.tcm_parent = handle; + } else if (strcmp(*argv, "handle") == 0) { + NEXT_ARG(); + if (fhandle) + duparg("handle", *argv); + fhandle = *argv; + } else if (matches(*argv, "preference") == 0 || + matches(*argv, "priority") == 0) { + NEXT_ARG(); + if (prio) + duparg("priority", *argv); + if (get_u32(&prio, *argv, 0)) + invarg(*argv, "invalid prpriority value"); + } else if (matches(*argv, "protocol") == 0) { + __u16 id; + NEXT_ARG(); + if (protocol) + duparg("protocol", *argv); + if (ll_proto_a2n(&id, *argv)) + invarg(*argv, "invalid protocol"); + protocol = id; + } else if (matches(*argv, "estimator") == 0) { + if (parse_estimator(&argc, &argv, &est) < 0) + return -1; + } else if (matches(*argv, "help") == 0) { + usage(); + } else { + strncpy(k, *argv, 
sizeof(k)-1); + + q = get_filter_kind(k); + argc--; argv++; + break; + } + + argc--; argv++; + } + + req.t.tcm_info = TC_H_MAKE(prio<<16, protocol); + + if (k[0]) + addattr_l(&req.n, sizeof(req), TCA_KIND, k, strlen(k)+1); + + if (q) { + if (q->parse_fopt(q, fhandle, argc, argv, &req.n)) + exit(1); + } else { + if (fhandle) { + fprintf(stderr, "Must specify filter type when using " + "\"handle\"\n"); + exit(-1); + } + if (argc) { + if (matches(*argv, "help") == 0) + usage(); + fprintf(stderr, "Garbage instead of arguments \"%s ...\". Try \"tc filter help\".\n", *argv); + exit(-1); + } + } + if (est.ewma_log) + addattr_l(&req.n, sizeof(req), TCA_RATE, &est, sizeof(est)); + + + if (rtnl_open(&rth, 0) < 0) { + fprintf(stderr, "Cannot open rtnetlink\n"); + exit(1); + } + + if (d[0]) { + ll_init_map(&rth); + + if ((req.t.tcm_ifindex = ll_name_to_index(d)) == 0) { + fprintf(stderr, "Cannot find device \"%s\"\n", d); + exit(1); + } + } + + if (rtnl_talk(&rth, &req.n, 0, 0, NULL, NULL, NULL) < 0) + exit(2); + + rtnl_close(&rth); + return 0; +} + +static __u32 filter_parent; +static int filter_ifindex; +static __u32 filter_prio; +static __u32 filter_protocol; + +int print_filter(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) +{ + FILE *fp = (FILE*)arg; + struct tcmsg *t = NLMSG_DATA(n); + int len = n->nlmsg_len; + struct rtattr * tb[TCA_MAX+1]; + struct filter_util *q; + char abuf[256]; + + if (n->nlmsg_type != RTM_NEWTFILTER && n->nlmsg_type != RTM_DELTFILTER) { + fprintf(stderr, "Not a filter\n"); + return 0; + } + len -= NLMSG_LENGTH(sizeof(*t)); + if (len < 0) { + fprintf(stderr, "Wrong len %d\n", len); + return -1; + } + + memset(tb, 0, sizeof(tb)); + parse_rtattr(tb, TCA_MAX, TCA_RTA(t), len); + + if (tb[TCA_KIND] == NULL) { + fprintf(stderr, "NULL kind\n"); + return -1; + } + + if (n->nlmsg_type == RTM_DELTFILTER) + fprintf(fp, "deleted "); + + fprintf(fp, "filter "); + if (!filter_ifindex || filter_ifindex != t->tcm_ifindex) + fprintf(fp, "dev %s ", 
ll_index_to_name(t->tcm_ifindex)); + + if (!filter_parent || filter_parent != t->tcm_parent) { + if (t->tcm_parent == TC_H_ROOT) + fprintf(fp, "root "); + else { + print_tc_classid(abuf, sizeof(abuf), t->tcm_parent); + fprintf(fp, "parent %s ", abuf); + } + } + if (t->tcm_info) { + __u32 protocol = TC_H_MIN(t->tcm_info); + __u32 prio = TC_H_MAJ(t->tcm_info)>>16; + if (!filter_protocol || filter_protocol != protocol) { + if (protocol) { + SPRINT_BUF(b1); + fprintf(fp, "protocol %s ", + ll_proto_n2a(protocol, b1, sizeof(b1))); + } + } + if (!filter_prio || filter_prio != prio) { + if (prio) + fprintf(fp, "pref %u ", prio); + } + } + fprintf(fp, "%s ", (char*)RTA_DATA(tb[TCA_KIND])); + q = get_filter_kind(RTA_DATA(tb[TCA_KIND])); + if (tb[TCA_OPTIONS]) { + if (q) + q->print_fopt(q, fp, tb[TCA_OPTIONS], t->tcm_handle); + else + fprintf(fp, "[cannot parse parameters]"); + } + fprintf(fp, "\n"); + + if (show_stats) { + if (tb[TCA_STATS]) { + if (RTA_PAYLOAD(tb[TCA_STATS]) < sizeof(struct tc_stats)) + fprintf(fp, "statistics truncated"); + else { + struct tc_stats st; + memcpy(&st, RTA_DATA(tb[TCA_STATS]), sizeof(st)); + print_tcstats(fp, &st); + fprintf(fp, "\n"); + } + } + } + fflush(fp); + return 0; +} + + +int tc_filter_list(int argc, char **argv) +{ + struct tcmsg t; + struct rtnl_handle rth; + char d[16]; + __u32 prio = 0; + __u32 protocol = 0; + char *fhandle = NULL; + + memset(&t, 0, sizeof(t)); + t.tcm_family = AF_UNSPEC; + memset(d, 0, sizeof(d)); + + while (argc > 0) { + if (strcmp(*argv, "dev") == 0) { + NEXT_ARG(); + if (d[0]) + duparg("dev", *argv); + strncpy(d, *argv, sizeof(d)-1); + } else if (strcmp(*argv, "root") == 0) { + if (t.tcm_parent) { + fprintf(stderr, "Error: \"root\" is duplicate parent ID\n"); + exit(-1); + } + filter_parent = t.tcm_parent = TC_H_ROOT; + } else if (strcmp(*argv, "parent") == 0) { + __u32 handle; + NEXT_ARG(); + if (t.tcm_parent) + duparg("parent", *argv); + if (get_tc_classid(&handle, *argv)) + invarg(*argv, "invalid parent 
ID"); + filter_parent = t.tcm_parent = handle; + } else if (strcmp(*argv, "handle") == 0) { + NEXT_ARG(); + if (fhandle) + duparg("handle", *argv); + fhandle = *argv; + } else if (matches(*argv, "preference") == 0 || + matches(*argv, "priority") == 0) { + NEXT_ARG(); + if (prio) + duparg("priority", *argv); + if (get_u32(&prio, *argv, 0)) + invarg(*argv, "invalid preference"); + filter_prio = prio; + } else if (matches(*argv, "protocol") == 0) { + __u16 res; + NEXT_ARG(); + if (protocol) + duparg("protocol", *argv); + if (ll_proto_a2n(&res, *argv)) + invarg(*argv, "invalid protocol"); + protocol = res; + filter_protocol = protocol; + } else if (matches(*argv, "help") == 0) { + usage(); + } else { + fprintf(stderr, " What is \"%s\"? Try \"tc filter help\"\n", *argv); + exit(-1); + } + + argc--; argv++; + } + + t.tcm_info = TC_H_MAKE(prio<<16, protocol); + + if (rtnl_open(&rth, 0) < 0) { + fprintf(stderr, "Cannot open rtnetlink\n"); + exit(1); + } + + ll_init_map(&rth); + + if (d[0]) { + if ((t.tcm_ifindex = ll_name_to_index(d)) == 0) { + fprintf(stderr, "Cannot find device \"%s\"\n", d); + exit(1); + } + filter_ifindex = t.tcm_ifindex; + } + + if (rtnl_dump_request(&rth, RTM_GETTFILTER, &t, sizeof(t)) < 0) { + perror("Cannot send dump request"); + exit(1); + } + + if (rtnl_dump_filter(&rth, print_filter, stdout, NULL, NULL) < 0) { + fprintf(stderr, "Dump terminated\n"); + exit(1); + } + + rtnl_close(&rth); + return 0; +} + +int do_filter(int argc, char **argv) +{ + if (argc < 1) + return tc_filter_list(0, NULL); + if (matches(*argv, "add") == 0) + return tc_filter_modify(RTM_NEWTFILTER, NLM_F_EXCL|NLM_F_CREATE, argc-1, argv+1); + if (matches(*argv, "change") == 0) + return tc_filter_modify(RTM_NEWTFILTER, 0, argc-1, argv+1); + if (matches(*argv, "replace") == 0) + return tc_filter_modify(RTM_NEWTFILTER, NLM_F_CREATE, argc-1, argv+1); + if (matches(*argv, "delete") == 0) + return tc_filter_modify(RTM_DELTFILTER, 0, argc-1, argv+1); +#if 0 + if (matches(*argv, "get") 
== 0) + return tc_filter_get(RTM_GETTFILTER, 0, argc-1, argv+1); +#endif + if (matches(*argv, "list") == 0 || matches(*argv, "show") == 0 + || matches(*argv, "lst") == 0) + return tc_filter_list(argc-1, argv+1); + if (matches(*argv, "help") == 0) + usage(); + fprintf(stderr, "Command \"%s\" is unknown, try \"tc filter help\".\n", *argv); + exit(-1); +} + diff --git a/tc/tc_qdisc.c b/tc/tc_qdisc.c index e69de29b..361ca8aa 100644 --- a/tc/tc_qdisc.c +++ b/tc/tc_qdisc.c @@ -0,0 +1,353 @@ +/* + * tc_qdisc.c "tc qdisc". + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * J Hadi Salim: Extension to ingress + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> +#include <math.h> + +#include "utils.h" +#include "tc_util.h" +#include "tc_common.h" + +static void usage(void) __attribute__((noreturn)); + +static void usage(void) +{ + fprintf(stderr, "Usage: tc qdisc [ add | del | replace | change | get ] dev STRING\n"); + fprintf(stderr, " [ handle QHANDLE ] [ root | ingress | parent CLASSID ]\n"); + fprintf(stderr, " [ estimator INTERVAL TIME_CONSTANT ]\n"); + fprintf(stderr, " [ [ QDISC_KIND ] [ help | OPTIONS ] ]\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " tc qdisc show [ dev STRING ] [ingress]\n"); + fprintf(stderr, "Where:\n"); + fprintf(stderr, "QDISC_KIND := { [p|b]fifo | tbf | prio | cbq | red | etc. }\n"); + fprintf(stderr, "OPTIONS := ... 
try tc qdisc add <desired QDISC_KIND> help\n"); + exit(-1); +} + +int tc_qdisc_modify(int cmd, unsigned flags, int argc, char **argv) +{ + struct rtnl_handle rth; + struct { + struct nlmsghdr n; + struct tcmsg t; + char buf[4096]; + } req; + struct qdisc_util *q = NULL; + struct tc_estimator est; + char d[16]; + char k[16]; + + memset(&req, 0, sizeof(req)); + memset(&est, 0, sizeof(est)); + memset(&d, 0, sizeof(d)); + memset(&k, 0, sizeof(k)); + + req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)); + req.n.nlmsg_flags = NLM_F_REQUEST|flags; + req.n.nlmsg_type = cmd; + req.t.tcm_family = AF_UNSPEC; + + while (argc > 0) { + if (strcmp(*argv, "dev") == 0) { + NEXT_ARG(); + if (d[0]) + duparg("dev", *argv); + strncpy(d, *argv, sizeof(d)-1); + } else if (strcmp(*argv, "handle") == 0) { + __u32 handle; + if (req.t.tcm_handle) + duparg("handle", *argv); + NEXT_ARG(); + if (get_qdisc_handle(&handle, *argv)) + invarg(*argv, "invalid qdisc ID"); + req.t.tcm_handle = handle; + } else if (strcmp(*argv, "root") == 0) { + if (req.t.tcm_parent) { + fprintf(stderr, "Error: \"root\" is duplicate parent ID\n"); + exit(-1); + } + req.t.tcm_parent = TC_H_ROOT; +#ifdef TC_H_INGRESS + } else if (strcmp(*argv, "ingress") == 0) { + if (req.t.tcm_parent) { + fprintf(stderr, "Error: \"ingress\" is a duplicate parent ID\n"); + exit(-1); + } + req.t.tcm_parent = TC_H_INGRESS; + strncpy(k, "ingress", sizeof(k)-1); + q = get_qdisc_kind(k); + req.t.tcm_handle = 0xffff0000; + + argc--; argv++; + break; +#endif + } else if (strcmp(*argv, "parent") == 0) { + __u32 handle; + NEXT_ARG(); + if (req.t.tcm_parent) + duparg("parent", *argv); + if (get_tc_classid(&handle, *argv)) + invarg(*argv, "invalid parent ID"); + req.t.tcm_parent = handle; + } else if (matches(*argv, "estimator") == 0) { + if (parse_estimator(&argc, &argv, &est)) + return -1; + } else if (matches(*argv, "help") == 0) { + usage(); + } else { + strncpy(k, *argv, sizeof(k)-1); + + q = get_qdisc_kind(k); + argc--; argv++; + break; + } 
+ argc--; argv++; + } + + if (k[0]) + addattr_l(&req.n, sizeof(req), TCA_KIND, k, strlen(k)+1); + if (est.ewma_log) + addattr_l(&req.n, sizeof(req), TCA_RATE, &est, sizeof(est)); + + if (q) { + if (q->parse_qopt(q, argc, argv, &req.n)) + exit(1); + } else { + if (argc) { + if (matches(*argv, "help") == 0) + usage(); + + fprintf(stderr, "Garbage instead of arguments \"%s ...\". Try \"tc qdisc help\".\n", *argv); + exit(-1); + } + } + + if (rtnl_open(&rth, 0) < 0) { + fprintf(stderr, "Cannot open rtnetlink\n"); + exit(1); + } + + if (d[0]) { + int idx; + + ll_init_map(&rth); + + if ((idx = ll_name_to_index(d)) == 0) { + fprintf(stderr, "Cannot find device \"%s\"\n", d); + exit(1); + } + req.t.tcm_ifindex = idx; + } + + if (rtnl_talk(&rth, &req.n, 0, 0, NULL, NULL, NULL) < 0) + exit(2); + + rtnl_close(&rth); + return 0; +} + +void print_tcstats(FILE *fp, struct tc_stats *st) +{ + SPRINT_BUF(b1); + + fprintf(fp, " Sent %llu bytes %u pkts (dropped %u, overlimits %u) ", + (unsigned long long)st->bytes, st->packets, st->drops, st->overlimits); + if (st->bps || st->pps || st->qlen || st->backlog) { + fprintf(fp, "\n "); + if (st->bps || st->pps) { + fprintf(fp, "rate "); + if (st->bps) + fprintf(fp, "%s ", sprint_rate(st->bps, b1)); + if (st->pps) + fprintf(fp, "%upps ", st->pps); + } + if (st->qlen || st->backlog) { + fprintf(fp, "backlog "); + if (st->backlog) + fprintf(fp, "%s ", sprint_size(st->backlog, b1)); + if (st->qlen) + fprintf(fp, "%up ", st->qlen); + } + } +} + +static int filter_ifindex; + +int print_qdisc(struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) +{ + FILE *fp = (FILE*)arg; + struct tcmsg *t = NLMSG_DATA(n); + int len = n->nlmsg_len; + struct rtattr * tb[TCA_MAX+1]; + struct qdisc_util *q; + char abuf[256]; + + if (n->nlmsg_type != RTM_NEWQDISC && n->nlmsg_type != RTM_DELQDISC) { + fprintf(stderr, "Not a qdisc\n"); + return 0; + } + len -= NLMSG_LENGTH(sizeof(*t)); + if (len < 0) { + fprintf(stderr, "Wrong len %d\n", len); + return -1; + } + + 
if (filter_ifindex && filter_ifindex != t->tcm_ifindex) + return 0; + + memset(tb, 0, sizeof(tb)); + parse_rtattr(tb, TCA_MAX, TCA_RTA(t), len); + + if (tb[TCA_KIND] == NULL) { + fprintf(stderr, "NULL kind\n"); + return -1; + } + + if (n->nlmsg_type == RTM_DELQDISC) + fprintf(fp, "deleted "); + + fprintf(fp, "qdisc %s %x: ", (char*)RTA_DATA(tb[TCA_KIND]), t->tcm_handle>>16); + if (filter_ifindex == 0) + fprintf(fp, "dev %s ", ll_index_to_name(t->tcm_ifindex)); + if (t->tcm_parent == TC_H_ROOT) + fprintf(fp, "root "); + else if (t->tcm_parent) { + print_tc_classid(abuf, sizeof(abuf), t->tcm_parent); + fprintf(fp, "parent %s ", abuf); + } + if (t->tcm_info != 1) { + fprintf(fp, "refcnt %d ", t->tcm_info); + } + q = get_qdisc_kind(RTA_DATA(tb[TCA_KIND])); + if (tb[TCA_OPTIONS]) { + if (q) + q->print_qopt(q, fp, tb[TCA_OPTIONS]); + else + fprintf(fp, "[cannot parse qdisc parameters]"); + } + fprintf(fp, "\n"); + if (show_stats) { + if (tb[TCA_STATS]) { + if (RTA_PAYLOAD(tb[TCA_STATS]) < sizeof(struct tc_stats)) + fprintf(fp, "statistics truncated"); + else { + struct tc_stats st; + memcpy(&st, RTA_DATA(tb[TCA_STATS]), sizeof(st)); + print_tcstats(fp, &st); + fprintf(fp, "\n"); + } + } + if (q && tb[TCA_XSTATS]) { + q->print_xstats(q, fp, tb[TCA_XSTATS]); + fprintf(fp, "\n"); + } + } + fflush(fp); + return 0; +} + + +int tc_qdisc_list(int argc, char **argv) +{ + struct tcmsg t; + struct rtnl_handle rth; + char d[16]; + + memset(&t, 0, sizeof(t)); + t.tcm_family = AF_UNSPEC; + memset(&d, 0, sizeof(d)); + + while (argc > 0) { + if (strcmp(*argv, "dev") == 0) { + NEXT_ARG(); + strncpy(d, *argv, sizeof(d)-1); +#ifdef TC_H_INGRESS + } else if (strcmp(*argv, "ingress") == 0) { + if (t.tcm_parent) { + fprintf(stderr, "Duplicate parent ID\n"); + usage(); + } + t.tcm_parent = TC_H_INGRESS; +#endif + } else if (matches(*argv, "help") == 0) { + usage(); + } else { + fprintf(stderr, "What is \"%s\"? 
Try \"tc qdisc help\".\n", *argv); + return -1; + } + + argc--; argv++; + } + + if (rtnl_open(&rth, 0) < 0) { + fprintf(stderr, "Cannot open rtnetlink\n"); + exit(1); + } + + ll_init_map(&rth); + + if (d[0]) { + if ((t.tcm_ifindex = ll_name_to_index(d)) == 0) { + fprintf(stderr, "Cannot find device \"%s\"\n", d); + exit(1); + } + filter_ifindex = t.tcm_ifindex; + } + + if (rtnl_dump_request(&rth, RTM_GETQDISC, &t, sizeof(t)) < 0) { + perror("Cannot send dump request"); + exit(1); + } + + if (rtnl_dump_filter(&rth, print_qdisc, stdout, NULL, NULL) < 0) { + fprintf(stderr, "Dump terminated\n"); + exit(1); + } + + rtnl_close(&rth); + return 0; +} + +int do_qdisc(int argc, char **argv) +{ + if (argc < 1) + return tc_qdisc_list(0, NULL); + if (matches(*argv, "add") == 0) + return tc_qdisc_modify(RTM_NEWQDISC, NLM_F_EXCL|NLM_F_CREATE, argc-1, argv+1); + if (matches(*argv, "change") == 0) + return tc_qdisc_modify(RTM_NEWQDISC, 0, argc-1, argv+1); + if (matches(*argv, "replace") == 0) + return tc_qdisc_modify(RTM_NEWQDISC, NLM_F_CREATE|NLM_F_REPLACE, argc-1, argv+1); + if (matches(*argv, "link") == 0) + return tc_qdisc_modify(RTM_NEWQDISC, NLM_F_REPLACE, argc-1, argv+1); + if (matches(*argv, "delete") == 0) + return tc_qdisc_modify(RTM_DELQDISC, 0, argc-1, argv+1); +#if 0 + if (matches(*argv, "get") == 0) + return tc_qdisc_get(RTM_GETQDISC, 0, argc-1, argv+1); +#endif + if (matches(*argv, "list") == 0 || matches(*argv, "show") == 0 + || matches(*argv, "lst") == 0) + return tc_qdisc_list(argc-1, argv+1); + if (matches(*argv, "help") == 0) + usage(); + fprintf(stderr, "Command \"%s\" is unknown, try \"tc qdisc help\".\n", *argv); + return -1; +} diff --git a/tc/tc_red.c b/tc/tc_red.c index e69de29b..385e7af1 100644 --- a/tc/tc_red.c +++ b/tc/tc_red.c @@ -0,0 +1,97 @@ +/* + * tc_red.c RED maintanance routines. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <math.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> + +#include "tc_core.h" +#include "tc_red.h" + +/* + Plog = log(prob/(qmax - qmin)) + */ +int tc_red_eval_P(unsigned qmin, unsigned qmax, double prob) +{ + int i = qmax - qmin; + + if (i <= 0) + return -1; + + prob /= i; + + for (i=0; i<32; i++) { + if (prob > 1.0) + break; + prob *= 2; + } + if (i>=32) + return -1; + return i; +} + +/* + burst + 1 - qmin/avpkt < (1-(1-W)^burst)/W + */ + +int tc_red_eval_ewma(unsigned qmin, unsigned burst, unsigned avpkt) +{ + int wlog = 1; + double W = 0.5; + double a = (double)burst + 1 - (double)qmin/avpkt; + + if (a < 1.0) + return -1; + for (wlog=1; wlog<32; wlog++, W /= 2) { + if (a <= (1 - pow(1-W, burst))/W) + return wlog; + } + return -1; +} + +/* + Stab[t>>Scell_log] = -log(1-W) * t/xmit_time + */ + +int tc_red_eval_idle_damping(int Wlog, unsigned avpkt, unsigned bps, __u8 *sbuf) +{ + double xmit_time = tc_core_usec2tick(1000000*(double)avpkt/bps); + double lW = -log(1.0 - 1.0/(1<<Wlog))/xmit_time; + double maxtime = 31/lW; + int clog; + int i; + double tmp; + + tmp = maxtime; + for (clog=0; clog<32; clog++) { + if (maxtime/(1<<clog) < 512) + break; + } + if (clog >= 32) + return -1; + + sbuf[0] = 0; + for (i=1; i<255; i++) { + sbuf[i] = (i<<clog)*lW; + if (sbuf[i] > 31) + sbuf[i] = 31; + } + sbuf[255] = 31; + return clog; +} diff --git a/tc/tc_red.h b/tc/tc_red.h index e69de29b..6f6b09e3 100644 --- a/tc/tc_red.h +++ b/tc/tc_red.h @@ -0,0 +1,8 @@ +#ifndef _TC_RED_H_ +#define _TC_RED_H_ 
1 + +extern int tc_red_eval_P(unsigned qmin, unsigned qmax, double prob); +extern int tc_red_eval_ewma(unsigned qmin, unsigned burst, unsigned avpkt); +extern int tc_red_eval_idle_damping(int wlog, unsigned avpkt, unsigned bandwidth, __u8 *sbuf); + +#endif diff --git a/tc/tc_util.c b/tc/tc_util.c index e69de29b..d1355391 100644 --- a/tc/tc_util.c +++ b/tc/tc_util.c @@ -0,0 +1,313 @@ +/* + * tc_util.c Misc TC utility functions. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> + * + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <syslog.h> +#include <fcntl.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <string.h> +#include <math.h> + +#include "utils.h" +#include "tc_util.h" + +int get_qdisc_handle(__u32 *h, char *str) +{ + __u32 maj; + char *p; + + maj = TC_H_UNSPEC; + if (strcmp(str, "none") == 0) + goto ok; + maj = strtoul(str, &p, 16); + if (p == str) + return -1; + maj <<= 16; + if (*p != ':' && *p!=0) + return -1; +ok: + *h = maj; + return 0; +} + +int get_tc_classid(__u32 *h, char *str) +{ + __u32 maj, min; + char *p; + + maj = TC_H_ROOT; + if (strcmp(str, "root") == 0) + goto ok; + maj = TC_H_UNSPEC; + if (strcmp(str, "none") == 0) + goto ok; + maj = strtoul(str, &p, 16); + if (p == str) { + maj = 0; + if (*p != ':') + return -1; + } + if (*p == ':') { + maj <<= 16; + str = p+1; + min = strtoul(str, &p, 16); + if (*p != 0) + return -1; + maj |= min; + } else if (*p != 0) + return -1; + +ok: + *h = maj; + return 0; +} + +int print_tc_classid(char *buf, int len, __u32 h) +{ + if (h == TC_H_ROOT) + sprintf(buf, "root"); + else if (h == TC_H_UNSPEC) + snprintf(buf, len, "none"); + else if (TC_H_MAJ(h) == 0) + snprintf(buf, len, 
/*
 * get_rate - parse a rate specification into bytes per second.
 *
 * The text is a floating point number followed by an optional,
 * case-insensitive unit:
 *	(none)	bits per second	(value / 8)
 *	bps	bytes per second
 *	kbps	value * 1024
 *	mbps	value * 1024 * 1024
 *	kbit	value * 1024 / 8
 *	mbit	value * 1024 * 1024 / 8
 *
 * @rate: receives the result on success; untouched on failure.
 * @str:  text to parse.
 *
 * Returns 0 on success, -1 if no number is present, the unit is
 * unknown, or the result is negative, not a number, or too large for
 * an unsigned int (converting such a double would be undefined).
 */
int get_rate(unsigned *rate, char *str)
{
	char *p;
	double bps = strtod(str, &p);

	if (p == str)
		return -1;

	if (*p) {
		if (strcasecmp(p, "kbps") == 0)
			bps *= 1024;
		else if (strcasecmp(p, "mbps") == 0)
			bps *= 1024*1024;
		else if (strcasecmp(p, "mbit") == 0)
			bps *= 1024*1024/8;
		else if (strcasecmp(p, "kbit") == 0)
			bps *= 1024/8;
		else if (strcasecmp(p, "bps") != 0)
			return -1;
	} else
		bps /= 8;

	/* negated form so that NaN ("nan" is valid strtod input) is rejected too */
	if (!(bps >= 0 && bps <= (double)~0U))
		return -1;

	*rate = bps;
	return 0;
}
/*
 * get_size - parse a size specification into bytes.
 *
 * The text is a floating point number followed by an optional,
 * case-insensitive unit:
 *	(none), b	bytes
 *	k, kb		value * 1024
 *	m, mb		value * 1024 * 1024
 *	kbit		value * 1024 / 8
 *	mbit		value * 1024 * 1024 / 8
 *
 * @size: receives the result on success; untouched on failure.
 * @str:  text to parse.
 *
 * Returns 0 on success, -1 if no number is present, the unit is
 * unknown, or the result is negative, not a number, or too large for
 * an unsigned int (converting such a double would be undefined).
 */
int get_size(unsigned *size, char *str)
{
	double sz;
	char *p;

	sz = strtod(str, &p);
	if (p == str)
		return -1;

	if (*p) {
		if (strcasecmp(p, "kb") == 0 || strcasecmp(p, "k")==0)
			sz *= 1024;
		else if (strcasecmp(p, "mb") == 0 || strcasecmp(p, "m")==0)
			sz *= 1024*1024;
		else if (strcasecmp(p, "mbit") == 0)
			sz *= 1024*1024/8;
		else if (strcasecmp(p, "kbit") == 0)
			sz *= 1024/8;
		else if (strcasecmp(p, "b") != 0)
			return -1;
	}

	/* negated form so that NaN ("nan" is valid strtod input) is rejected too */
	if (!(sz >= 0 && sz <= (double)~0U))
		return -1;

	*size = sz;
	return 0;
}
strcpy(buf, "???"); + return buf; +} + +int print_qdisc_handle(char *buf, int len, __u32 h) +{ + snprintf(buf, len, "%x:", TC_H_MAJ(h)>>16); + return 0; +} + +char * sprint_qdisc_handle(__u32 h, char *buf) +{ + if (print_qdisc_handle(buf, SPRINT_BSIZE-1, h)) + strcpy(buf, "???"); + return buf; +} + + diff --git a/tc/tc_util.h b/tc/tc_util.h index e69de29b..bdc88d1f 100644 --- a/tc/tc_util.h +++ b/tc/tc_util.h @@ -0,0 +1,57 @@ +#ifndef _TC_UTIL_H_ +#define _TC_UTIL_H_ 1 + +#include <linux/pkt_sched.h> +#include <linux/pkt_cls.h> +#include "tc_core.h" + +struct qdisc_util +{ + struct qdisc_util *next; + char id[16]; + int (*parse_qopt)(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n); + int (*print_qopt)(struct qdisc_util *qu, FILE *f, struct rtattr *opt); + int (*print_xstats)(struct qdisc_util *qu, FILE *f, struct rtattr *xstats); + + int (*parse_copt)(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n); + int (*print_copt)(struct qdisc_util *qu, FILE *f, struct rtattr *opt); +}; + +struct filter_util +{ + struct filter_util *next; + char id[16]; + int (*parse_fopt)(struct filter_util *qu, char *fhandle, int argc, char **argv, struct nlmsghdr *n); + int (*print_fopt)(struct filter_util *qu, FILE *f, struct rtattr *opt, __u32 fhandle); +}; + + +extern struct qdisc_util *get_qdisc_kind(char *str); +extern struct filter_util *get_filter_kind(char *str); + +extern int get_qdisc_handle(__u32 *h, char *str); +extern int get_rate(unsigned *rate, char *str); +extern int get_size(unsigned *size, char *str); +extern int get_size_and_cell(unsigned *size, int *cell_log, char *str); +extern int get_usecs(unsigned *usecs, char *str); +extern int print_rate(char *buf, int len, __u32 rate); +extern int print_size(char *buf, int len, __u32 size); +extern int print_qdisc_handle(char *buf, int len, __u32 h); +extern int print_usecs(char *buf, int len, __u32 usecs); +extern char * sprint_rate(__u32 rate, char *buf); +extern char * sprint_size(__u32 size, 
char *buf); +extern char * sprint_qdisc_handle(__u32 h, char *buf); +extern char * sprint_tc_classid(__u32 h, char *buf); +extern char * sprint_usecs(__u32 usecs, char *buf); + +extern void print_tcstats(FILE *fp, struct tc_stats *st); + +extern int get_tc_classid(__u32 *h, char *str); +extern int print_tc_classid(char *buf, int len, __u32 h); +extern char * sprint_tc_classid(__u32 h, char *buf); + +extern int tc_print_police(FILE *f, struct rtattr *tb); +extern int parse_police(int *, char ***, int, struct nlmsghdr *); + + +#endif -- 2.39.2