diff --git a/sys/netinet/ip_dummynet.c b/sys/netinet/ip_dummynet.c index 9791a7fcac2..28e877a9aa3 100644 --- a/sys/netinet/ip_dummynet.c +++ b/sys/netinet/ip_dummynet.c @@ -1,38 +1,47 @@ /* - * Copyright (c) 1998 Luigi Rizzo + * Copyright (c) 1998-2000 Luigi Rizzo, Universita` di Pisa + * Portions Copyright (c) 2000 Akamba Corp. + * All rights reserved * - * Redistribution and use in source forms, with and without modification, - * are permitted provided that this entire comment appears intact. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. * - * Redistribution in binary form may occur without any restrictions. - * Obviously, it would be nice if you gave credit where credit is due - * but requiring it would be too onerous. - * - * This software is provided ``AS IS'' without any warranties of any kind. + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. * * $FreeBSD$ */ +#define DEB(x) +#define DDB(x) x + /* * This module implements IP dummynet, a bandwidth limiter/delay emulator * used in conjunction with the ipfw package. * - * Changes: + * Most important Changes: * - * 980821: changed conventions in the queueing logic - * packets passed from dummynet to ip_in/out are prepended with - * a vestigial mbuf type MT_DUMMYNET which contains a pointer - * to the matching rule. - * ip_input/output will extract the parameters, free the vestigial mbuf, - * and do the processing. - * - * 980519: fixed behaviour when deleting rules. - * 980518: added splimp()/splx() to protect against races + * 000106: large rewrite, use heaps to handle very many pipes. * 980513: initial release + * + * include files marked with XXX are probably not needed */ -/* include files marked with XXX are probably not needed */ - #include #include #include @@ -60,54 +69,63 @@ #include #endif +/* + * the addresses/ports of last pkt matched by the firewall are + * in this structure. This is so that we can easily find them without + * navigating through the mbuf. + */ +struct dn_flow_id dn_last_pkt ; + +/* + * we keep a private variable for the simulation time, but probably + * it would be better to use the already existing one "softticks" + * (in sys/kern/kern_timer.c) + */ +static dn_key curr_time = 0 ; /* current simulation time */ + +static int dn_hash_size = 64 ; /* default hash size */ + +/* statistics on number of queue searches and search steps */ +static int searches, search_steps ; + +static struct dn_heap ready_heap, extract_heap ; +static int heap_init(struct dn_heap *h, int size) ; +static int heap_insert (struct dn_heap *h, dn_key key1, void *p); +static void heap_extract(struct dn_heap *h); +static void transmit_event(struct dn_pipe *pipe); +static void ready_event(struct dn_flow_queue *q); + static struct dn_pipe *all_pipes = NULL ; /* list of all pipes */ -static int dn_debug = 0 ; /* verbose */ -static int dn_calls = 0 ; /* number of calls */ -static int dn_idle = 1; #ifdef SYSCTL_NODE -SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet"); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dn_debug, 0, ""); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, calls, CTLFLAG_RD, &dn_calls, 0, ""); -SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, idle, CTLFLAG_RD, &dn_idle, 0, ""); +SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, + CTLFLAG_RW, 0, "Dummynet"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size, + CTLFLAG_RD, &dn_hash_size, 0, "Default hash table size"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, curr_time, + CTLFLAG_RD, &curr_time, 0, "Current tick"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap, + CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap, + CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches, + CTLFLAG_RD, &searches, 0, "Number of queue searches"); +SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps, + CTLFLAG_RD, &search_steps, 0, "Number of queue search steps"); #endif static int ip_dn_ctl(struct sockopt *sopt); static void rt_unref(struct rtentry *); static void dummynet(void *); -static void dn_restart(void); -static void dn_move(struct dn_pipe *pipe, int immediate); static void dummynet_flush(void); /* - * the following is needed when deleting a pipe, because rules can + * ip_fw_chain is used when deleting a pipe, because ipfw rules can * hold references to the pipe. */ extern LIST_HEAD (ip_fw_head, ip_fw_chain) ip_fw_chain; -/* - * invoked to reschedule the periodic task if necessary. - * Should only be called when dn_idle = 1 ; - */ -static void -dn_restart() -{ - struct dn_pipe *pipe; - - if (!dn_idle) - return; - - for (pipe = all_pipes ; pipe ; pipe = pipe->next ) { - /* if there any pipe that needs work, restart */ - if (pipe->r.head || pipe->p.head || pipe->numbytes < 0 ) { - dn_idle = 0; - timeout(dummynet, NULL, 1); - return ; - } - } -} - static void rt_unref(struct rtentry *rt) { @@ -119,124 +137,192 @@ rt_unref(struct rtentry *rt) } /* - * move packets from R-queue to P-queue + * Heap management functions. + * + * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2. + * Some macros help finding parent/children so we can optimize them. + # + * heap_init() is called to expand the heap when needed. + * Increment size in blocks of 256 entries (which make one 4KB page) + * XXX failure to allocate a new element is a pretty bad failure + * as we basically stall a whole queue forever!! + * Returns 1 on error, 0 on success + */ +#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 ) +#define HEAP_LEFT(x) ( 2*(x) + 1 ) +#define HEAP_IS_LEFT(x) ( (x) & 1 ) +#define HEAP_RIGHT(x) ( 2*(x) + 1 ) +#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; } +#define HEAP_INCREMENT 255 + +static int +heap_init(struct dn_heap *h, int new_size) +{ + struct dn_heap_entry *p; + + if (h->size >= new_size ) { + printf("heap_init, Bogus call, have %d want %d\n", + h->size, new_size); + return 0 ; + } + new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ; + p = malloc(new_size * sizeof(*p), M_IPFW, M_DONTWAIT ); + if (p == NULL) { + printf(" heap_init, resize %d failed\n", new_size ); + return 1 ; /* error */ + } + if (h->size > 0) { + bcopy(h->p, p, h->size * sizeof(*p) ); + free(h->p, M_IPFW); + } + h->p = p ; + h->size = new_size ; + return 0 ; +} + +/* + * Insert element in heap. Normally, p != NULL, we insert p in + * a new position and bubble up. If p == NULL, then the element is + * already in place, and key is the position where to start the + * bubble-up. + * Returns 1 on failure (cannot allocate new heap entry) + */ +static int +heap_insert(struct dn_heap *h, dn_key key1, void *p) +{ + int son = h->elements ; + + if (p == NULL) /* data already there, set starting point */ + son = key1 ; + else { /* insert new element at the end, possibly resize */ + son = h->elements ; + if (son == h->size) /* need resize... */ + if (heap_init(h, h->elements+1) ) + return 1 ; /* failure... */ + h->p[son].object = p ; + h->p[son].key = key1 ; + h->elements++ ; + } + while (son > 0) { /* bubble up */ + int father = HEAP_FATHER(son) ; + struct dn_heap_entry tmp ; + + if (DN_KEY_LT( h->p[father].key, h->p[son].key ) ) + break ; /* found right position */ + /* son smaller than father, swap and try again */ + HEAP_SWAP(h->p[son], h->p[father], tmp) ; + son = father ; + } + return 0 ; +} + +/* + * remove top element from heap */ static void -dn_move(struct dn_pipe *pipe, int immediate) +heap_extract(struct dn_heap *h) +{ + int child, father, max = h->elements - 1 ; + if (max < 0) + return ; + + /* move up smallest child */ + father = 0 ; + child = HEAP_LEFT(father) ; /* left child */ + while (child <= max) { /* valid entry */ + if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) ) + child = child+1 ; /* take right child, otherwise left */ + h->p[father] = h->p[child] ; + father = child ; + child = HEAP_LEFT(child) ; /* left child for next loop */ + } + h->elements-- ; + if (father != max) { + /* + * Fill hole with last entry and bubble up, reusing the insert code + */ + h->p[father] = h->p[max] ; + heap_insert(h, father, NULL); /* this one cannot fail */ + } +} + +/* + * heapify() will reorganize data inside an array to maintain the + * heap property. It is needed when we delete a bunch of entries. + */ +static void +heapify(struct dn_heap *h) { - struct dn_pkt *pkt; - - /* - * consistency check, should catch new pipes which are - * not initialized properly. - */ - if ( pipe->p.head == NULL && - pipe->ticks_from_last_insert != pipe->delay) { - printf("Warning, empty pipe and delay %d (should be %d)\n", - pipe->ticks_from_last_insert, pipe->delay); - pipe->ticks_from_last_insert = pipe->delay; + int father, i ; + struct dn_heap_entry tmp ; + + for (i = h->elements - 1 ; i > 0 ; i-- ) { + father = HEAP_FATHER(i) ; + if ( DN_KEY_LT(h->p[i].key, h->p[father].key) ) + HEAP_SWAP(h->p[father], h->p[i], tmp) ; } - /* this ought to go in dn_dequeue() */ - if (!immediate && pipe->ticks_from_last_insert < pipe->delay) - pipe->ticks_from_last_insert++; - if ((pkt = pipe->r.head) != NULL) { +} +/* + * --- end of heap management functions --- + */ + +/* + * Scheduler functions -- transmit_event(), ready_event() + * + * transmit_event() is called when the delay-line needs to enter + * the scheduler, either because of existing pkts getting ready, + * or new packets entering the queue. The event handled is the delivery + * time of the packet. + * + * ready_event() does something similar with flow queues, and the + * event handled is the finish time of the head pkt. + * + * In both cases, we make sure that the data structures are consistent + * before passing pkts out, because this might trigger recursive + * invocations of the procedures. + */ +static void +transmit_event(struct dn_pipe *pipe) +{ + struct dn_pkt *pkt ; + + while ( (pkt = pipe->p.head) && DN_KEY_LEQ(pkt->output_time, curr_time) ) { /* - * Move at most numbytes bytes from src and move to dst. - * delay is set to ticks_from_last_insert, which - * is reset after the first insertion; + * first unlink, then call procedures, since ip_input() can invoke + * ip_output() and viceversa, thus causing nested calls */ - while ( pkt ) { - int len = pkt->dn_m->m_pkthdr.len ; - - /* - * queue limitation: pass packets down if the len is - * such that the pkt would go out before the next tick. - */ - if (pipe->bandwidth) { - int len_scaled = len*8*hz ; - /* numbytes is in bit/sec, scaled 8*hz ... */ - if (pipe->numbytes < len_scaled) - break; - pipe->numbytes -= len_scaled; - } - pipe->r_len--; /* elements in queue */ - pipe->r_len_bytes -= len ; - - /* - * to add delay jitter, must act here. A lower value - * (bounded to 0) means lower delay. - */ - pkt->delay = pipe->ticks_from_last_insert; - pipe->ticks_from_last_insert = 0; - /* compensate the decrement done next in dn_dequeue */ - if (!immediate && pkt->delay >0 && pipe->p.head==NULL) - pkt->delay++; - if (pipe->p.head == NULL) - pipe->p.head = pkt; - else - (struct dn_pkt *)pipe->p.tail->dn_next = pkt; - pipe->p.tail = pkt; - pkt = (struct dn_pkt *)pkt->dn_next; - pipe->p.tail->dn_next = NULL; - } - pipe->r.head = pkt; - - /*** XXX just a sanity check */ - if ( ( pkt == NULL && pipe->r_len != 0) || - ( pkt != NULL && pipe->r_len == 0) ) - printf("-- Warning, pipe head %p len %d\n", - (void *)pkt, pipe->r_len); - } - - /* - * deliver packets downstream after the delay in the P-queue. - */ - - if (pipe->p.head == NULL) - return; - if (!immediate) - pipe->p.head->delay--; - while ( (pkt = pipe->p.head) && pkt->delay < 1) { - /* - * first unlink, then call procedures since ip_input() - * can result in a call to ip_output cnd viceversa, - * thus causing nested calls - */ - pipe->p.head = (struct dn_pkt *) pkt->dn_next ; + pipe->p.head = DN_NEXT(pkt) ; /* - * the trick to avoid flow-id settings here is to prepend a - * vestigial mbuf to the packet, with the following values: - * m_type = MT_DUMMYNET - * m_next = the actual mbuf to be processed by ip_input/output - * m_data = the matching rule - * The vestigial element is the same memory area used by - * the dn_pkt, and IS FREED HERE because it can contain - * parameters passed to the called routine. The buffer IS NOT - * A REAL MBUF, just a block of memory acquired with malloc(). + * The actual mbuf is preceded by a struct dn_pkt, resembling an mbuf + * (NOT A REAL one, just a small block of malloc'ed memory) with + * m_type = MT_DUMMYNET + * m_next = actual mbuf to be processed by ip_input/output + * m_data = the matching rule + * and some other fields. + * The block IS FREED HERE because it contains parameters passed + * to the called routine. */ switch (pkt->dn_dir) { - case DN_TO_IP_OUT: { - struct route *ro = &(pkt->ro) ; - - (void)ip_output((struct mbuf *)pkt, (struct mbuf *)pkt->ifp, - ro, pkt->flags, NULL); - rt_unref (ro->ro_rt) ; - } + case DN_TO_IP_OUT: + (void)ip_output((struct mbuf *)pkt, NULL, NULL, 0, NULL); + rt_unref (pkt->ro.ro_rt) ; break ; + case DN_TO_IP_IN : ip_input((struct mbuf *)pkt) ; break ; + #ifdef BRIDGE case DN_TO_BDG_FWD : { struct mbuf *m = (struct mbuf *)pkt ; - bdg_forward(&m, pkt->ifp); if (m) m_freem(m); } break ; #endif + default: printf("dummynet: bad switch %d!\n", pkt->dn_dir); m_freem(pkt->dn_m); @@ -244,47 +330,191 @@ dn_move(struct dn_pipe *pipe, int immediate) } FREE(pkt, M_IPFW); } + /* if there are leftover packets, put into the heap for next event */ + if ( (pkt = pipe->p.head) ) + heap_insert(&extract_heap, pkt->output_time, pipe ) ; + /* XXX should check errors on heap_insert, by draining the + * whole pipe p and hoping in the future we are more successful + */ } + /* - * this is the periodic task that moves packets between the R- - * and the P- queue + * ready_event() is invoked every time the queue must enter the + * scheduler, either because the first packet arrives, or because + * a previously scheduled event fired. + * On invokation, drain as many pkts as possible (could be 0) and then + * if there are leftover packets reinsert the pkt in the scheduler. */ -/*ARGSUSED*/ -void +static void +ready_event(struct dn_flow_queue *q) +{ + struct dn_pkt *pkt; + struct dn_pipe *p = q->p ; + int p_was_empty = (p->p.head == NULL) ; + + while ( (pkt = q->r.head) != NULL ) { + int len = pkt->dn_m->m_pkthdr.len; + int len_scaled = p->bandwidth ? len*8*hz : 0 ; + /* + * bandwidth==0 (no limit) means we can drain as many pkts as + * needed from the queue. Setting len_scaled = 0 does the job. + */ + if (len_scaled > q->numbytes ) + break ; + /* + * extract pkt from queue, compute output time (could be now) + * and put into delay line (p_queue) + */ + q->numbytes -= len_scaled ; + q->r.head = DN_NEXT(pkt) ; + q->len-- ; + q->len_bytes -= len ; + + pkt->output_time = curr_time + p->delay ; + + if (p->p.head == NULL) + p->p.head = pkt; + else + DN_NEXT(p->p.tail) = pkt; + p->p.tail = pkt; + DN_NEXT(p->p.tail) = NULL; + } + /* + * If the delay line was empty call transmit_event(p) now. + * Otherwise, the scheduler will take care of it. + */ + if (p_was_empty) + transmit_event(p); + /* + * If we have more packets queued, schedule next ready event + * (can only occur when bandwidth != 0, otherwise we would have + * flushed the whole queue in the previous loop). + * To this purpose compute how many ticks to go for the next + * event, accounting for packet size and residual credit. This means + * we compute the finish time of the packet. + */ + if ( (pkt = q->r.head) != NULL ) { /* this implies bandwidth != 0 */ + dn_key t ; + t = (pkt->dn_m->m_pkthdr.len*8*hz - q->numbytes + p->bandwidth - 1 ) / + p->bandwidth ; + q->numbytes += t * p->bandwidth ; + heap_insert(&ready_heap, curr_time + t, (void *)q ); + /* XXX should check errors on heap_insert, and drain the whole + * queue on error hoping next time we are luckier. + */ + } +} + +/* + * this is called once per tick, or HZ times per second. It is used to + * increment the current tick counter and schedule expired events. + */ +static void dummynet(void * __unused unused) { - struct dn_pipe *p ; + void *p ; /* generic parameter to handler */ + struct dn_heap *h ; int s ; - dn_calls++ ; - for (p = all_pipes ; p ; p = p->next ) { + s = splnet(); /* avoid network interrupts... */ + curr_time++ ; + h = &ready_heap ; + while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) { /* - * Increment the amount of data that can be sent. However, - * don't do that if the channel is idle - * (r.head == NULL && numbytes >= bandwidth). - * This bug fix is from tim shepard (shep@bbn.com) + * XXX if the event is late, we should probably credit the queue + * by q->p->bandwidth * (delta_ticks). On the other hand, i dont + * think this can ever occur with this code (i.e. curr_time will + * still be incremented by one at each tick. Things might be + * different if we were using the counter from the high priority + * timer. */ - s = splimp(); - if (p->r.head != NULL || p->numbytes < p->bandwidth ) - p->numbytes += p->bandwidth ; - dn_move(p, 0); /* is it really 0 (also below) ? */ - splx(s); + if (h->p[0].key != curr_time) + printf("-- dummynet: warning, event is %d ticks late\n", + curr_time - h->p[0].key); + p = h->p[0].object ; + heap_extract(h); /* need to extract before processing */ + ready_event(p) ; + } + h = &extract_heap ; + while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) { + if (h->p[0].key != curr_time) /* XXX same as above */ + printf("-- dummynet: warning, event is %d ticks late\n", + curr_time - h->p[0].key); + p = h->p[0].object ; + heap_extract(&extract_heap); + transmit_event(p); } - - /* - * finally, if some queue has data, restart the timer. - */ - s = splimp(); - dn_idle = 1; - dn_restart(); splx(s); + timeout(dummynet, NULL, 1); +} + +/* + * Given a pipe and a pkt in dn_last_pkt, find a matching queue + * after appropriate masking. The queue is moved to front + * so that further searches take less time. + * XXX if the queue is longer than some threshold should consider + * purging old unused entries. They will get in the way every time + * we have a new flow. + */ +static struct dn_flow_queue * +find_queue(struct dn_pipe *pipe) +{ + int i = 0 ; /* we need i and q for new allocations */ + struct dn_flow_queue *q, *prev; + + if ( !(pipe->flags & DN_HAVE_FLOW_MASK) ) + q = pipe->rq[0] ; + else { + /* first, do the masking */ + dn_last_pkt.dst_ip &= pipe->flow_mask.dst_ip ; + dn_last_pkt.src_ip &= pipe->flow_mask.src_ip ; + dn_last_pkt.dst_port &= pipe->flow_mask.dst_port ; + dn_last_pkt.src_port &= pipe->flow_mask.src_port ; + dn_last_pkt.proto &= pipe->flow_mask.proto ; + /* then, hash function */ + i = ( (dn_last_pkt.dst_ip) & 0xffff ) ^ + ( (dn_last_pkt.dst_ip >> 15) & 0xffff ) ^ + ( (dn_last_pkt.src_ip << 1) & 0xffff ) ^ + ( (dn_last_pkt.src_ip >> 16 ) & 0xffff ) ^ + (dn_last_pkt.dst_port << 1) ^ (dn_last_pkt.src_port) ^ + (dn_last_pkt.proto ); + i = i % pipe->rq_size ; + /* finally, scan the current list for a match */ + searches++ ; + for (prev=NULL, q = pipe->rq[i] ; q ; prev = q , q = q->next ) { + search_steps++; + if (bcmp(&dn_last_pkt, &(q->id), sizeof(q->id) ) == 0) + break ; /* found */ + } + if (q && prev != NULL) { /* found and not in front */ + prev->next = q->next ; + q->next = pipe->rq[i] ; + pipe->rq[i] = q ; + } + } + if (q == NULL) { /* no match, need to allocate a new entry */ + q = malloc(sizeof(*q), M_IPFW, M_DONTWAIT) ; + if (q == NULL) { + printf("sorry, cannot allocate new flow\n"); + return NULL ; + } + bzero(q, sizeof(*q) ); /* needed */ + q->id = dn_last_pkt ; + q->p = pipe ; + q->hash_slot = i ; + q->next = pipe->rq[i] ; + pipe->rq[i] = q ; + pipe->rq_elements++ ; + DEB(printf("++ new queue (%d) for 0x%08x/0x%04x -> 0x%08x/0x%04x\n", + pipe->rq_elements, + dn_last_pkt.src_ip, dn_last_pkt.src_port, + dn_last_pkt.dst_ip, dn_last_pkt.dst_port); ) + } + return q ; } /* * dummynet hook for packets. - * input and output use the same code, so i use bit 16 in the pipe - * number to chose the direction: 1 for output packets, 0 for input. - * for input, only m is significant. For output, also the others. */ int dummynet_io(int pipe_nr, int dir, @@ -293,174 +523,196 @@ dummynet_io(int pipe_nr, int dir, struct ip_fw_chain *rule, int flags) { struct dn_pkt *pkt; - struct dn_pipe *pipe; + struct dn_pipe *p; int len = m->m_pkthdr.len ; + struct dn_flow_queue *q = NULL ; + int s ; - int s=splimp(); + s = splimp(); + /* XXX check the spl protection. It might be unnecessary since we + * run this at splnet() already. + */ + + DEB(printf("-- last_pkt dst 0x%08x/0x%04x src 0x%08x/0x%04x\n", + dn_last_pkt.dst_ip, dn_last_pkt.dst_port, + dn_last_pkt.src_ip, dn_last_pkt.src_port);) pipe_nr &= 0xffff ; /* * locate pipe. First time is expensive, next have direct access. */ - - if ( (pipe = rule->rule->pipe_ptr) == NULL ) { - for (pipe=all_pipes; pipe && pipe->pipe_nr !=pipe_nr; pipe=pipe->next) + if ( (p = rule->rule->pipe_ptr) == NULL ) { + for (p = all_pipes; p && p->pipe_nr != pipe_nr; p = p->next) ; - if (pipe == NULL) { - splx(s); - if (dn_debug) - printf("warning, pkt for no pipe %d\n", pipe_nr); - m_freem(m); - return 0 ; - } else - rule->rule->pipe_ptr = pipe ; + if (p == NULL) + goto dropit ; /* this pipe does not exist! */ + rule->rule->pipe_ptr = p ; /* record pipe ptr for the future */ } - + q = find_queue(p); /* - * should i drop ? - * This section implements random packet drop. + * update statistics, then do various check on reasons to drop pkt */ - if ( (pipe->plr && random() < pipe->plr) || - (pipe->queue_size && pipe->r_len >= pipe->queue_size) || - (pipe->queue_size_bytes && - len + pipe->r_len_bytes > pipe->queue_size_bytes) || - (pkt = (struct dn_pkt *)malloc(sizeof (*pkt), - M_IPFW, M_NOWAIT) ) == NULL ) { - splx(s); - if (dn_debug) - printf("-- dummynet: drop from pipe %d, have %d pks, %d bytes\n", - pipe_nr, pipe->r_len, pipe->r_len_bytes); - pipe->r_drops++ ; - m_freem(m); - return 0 ; /* XXX error */ - } - bzero(pkt, sizeof(*pkt) ); - /* build and enqueue packet */ + if ( q == NULL ) + goto dropit ; /* cannot allocate queue */ + q->tot_bytes += len ; + q->tot_pkts++ ; + if ( p->plr && random() < p->plr ) + goto dropit ; /* random pkt drop */ + if ( p->queue_size && q->len >= p->queue_size) + goto dropit ; /* queue count overflow */ + if ( p->queue_size_bytes && len + q->len_bytes > p->queue_size_bytes) + goto dropit ; /* queue size overflow */ + /* + * can implement RED drops here if needed. + */ + + pkt = (struct dn_pkt *)malloc(sizeof (*pkt), M_IPFW, M_NOWAIT) ; + if ( pkt == NULL ) + goto dropit ; /* cannot allocate packet header */ + /* ok, i can handle the pkt now... */ + bzero(pkt, sizeof(*pkt) ); /* XXX expensive, see if we can remove it*/ + /* build and enqueue packet + parameters */ pkt->hdr.mh_type = MT_DUMMYNET ; (struct ip_fw_chain *)pkt->hdr.mh_data = rule ; - pkt->dn_next = NULL; + DN_NEXT(pkt) = NULL; pkt->dn_m = m; pkt->dn_dir = dir ; - pkt->delay = 0; pkt->ifp = ifp; if (dir == DN_TO_IP_OUT) { /* - * we need to copy *ro because for icmp pkts (and maybe others) - * the caller passed a pointer into the stack. + * We need to copy *ro because for ICMP pkts (and maybe others) + * the caller passed a pointer into the stack; and, dst might + * also be a pointer into *ro so it needs to be updated. */ pkt->ro = *ro; if (ro->ro_rt) ro->ro_rt->rt_refcnt++ ; /* XXX */ - /* - * and again, dst might be a pointer into *ro... - */ if (dst == (struct sockaddr_in *)&ro->ro_dst) /* dst points into ro */ dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ; - pkt->dn_dst = dst; /* XXX this can't be right! */ - /* - * 'flags' also need to be kept for later packet treatment - * such as IPSEC. IPSEC consider sending packet's m->m_pkthdr.rcvif - * as 'socket *' at ip_output(), if IP_SOCKINMRCVIF is set. - */ - pkt->flags = flags; + pkt->dn_dst = dst; + pkt->flags = flags ; } - if (pipe->r.head == NULL) - pipe->r.head = pkt; + if (q->r.head == NULL) + q->r.head = pkt; else - (struct dn_pkt *)pipe->r.tail->dn_next = pkt; - pipe->r.tail = pkt; - pipe->r_len++; - pipe->r_len_bytes += len ; + DN_NEXT(q->r.tail) = pkt; + q->r.tail = pkt; + q->len++; + q->len_bytes += len ; - /* - * here we could implement RED if we like to + /* + * If queue was empty (this is first pkt) then call ready_event() + * now to make the pkt go out at the right time. Otherwise we are done, + * as there must be a ready event already scheduled. */ - - if (pipe->r.head == pkt) { /* process immediately */ - dn_move(pipe, 1); - } - if (dn_idle) - dn_restart(); + if (q->r.head == pkt) /* r_queue was empty */ + ready_event( q ); splx(s); return 0; + +dropit: + splx(s); + if (q) + q->drops++ ; + m_freem(m); + return 0 ; /* XXX should I return an error ? */ } +/* + * below, the rt_unref is only needed when (pkt->dn_dir == DN_TO_IP_OUT) + * Doing this would probably save us the initial bzero of dn_pkt + */ +#define DN_FREE_PKT(pkt) { \ + struct dn_pkt *n = pkt ; \ + rt_unref ( n->ro.ro_rt ) ; \ + m_freem(n->dn_m); \ + pkt = DN_NEXT(n) ; \ + free(n, M_IPFW) ; } /* * dispose all packets queued on a pipe */ static void purge_pipe(struct dn_pipe *pipe) { - struct dn_pkt *pkt, *n ; - struct rtentry *tmp_rt ; + struct dn_pkt *pkt ; + struct dn_flow_queue *q, *qn ; + int i ; - for (pkt = pipe->r.head ; pkt ; ) { - rt_unref (tmp_rt = pkt->ro.ro_rt ) ; - m_freem(pkt->dn_m); - n = pkt ; - pkt = (struct dn_pkt *)pkt->dn_next ; - free(n, M_IPFW) ; - } - for (pkt = pipe->p.head ; pkt ; ) { - rt_unref (tmp_rt = pkt->ro.ro_rt ) ; - m_freem(pkt->dn_m); - n = pkt ; - pkt = (struct dn_pkt *)pkt->dn_next ; - free(n, M_IPFW) ; - } + for (i = 0 ; i < pipe->rq_size ; i++ ) + for (q = pipe->rq[i] ; q ; q = qn ) { + for (pkt = q->r.head ; pkt ; ) + DN_FREE_PKT(pkt) ; + qn = q->next ; + free(q, M_IPFW); + } + for (pkt = pipe->p.head ; pkt ; ) + DN_FREE_PKT(pkt) ; } /* - * delete all pipes returning memory + * Delete all pipes and heaps returning memory. Must also + * remove references from all ipfw rules to all pipes. */ static void dummynet_flush() { - struct dn_pipe *q, *p = all_pipes ; - int s = splnet() ; + struct dn_pipe *curr_p, *p ; + struct ip_fw_chain *chain ; + int s ; - all_pipes = NULL ; + s = splnet() ; + + /* remove all references to pipes ...*/ + for (chain= ip_fw_chain.lh_first ; chain; chain = chain->chain.le_next) + chain->rule->pipe_ptr = NULL ; + /* prevent future matches... */ + p = all_pipes ; + all_pipes = NULL ; + /* and free heaps so we don't have unwanted events */ + if (ready_heap.size >0 ) + free(ready_heap.p, M_IPFW); + ready_heap.elements = ready_heap.size = 0 ; + if (extract_heap.size >0 ) + free(extract_heap.p, M_IPFW); + extract_heap.elements = extract_heap.size = 0 ; splx(s) ; /* - * purge all queued pkts and delete all pipes + * Now purge all queued pkts and delete all pipes */ for ( ; p ; ) { purge_pipe(p); - q = p ; + curr_p = p ; p = p->next ; - free(q, M_IPFW); + free(curr_p->rq, M_IPFW); + free(curr_p, M_IPFW); } } extern struct ip_fw_chain *ip_fw_default_rule ; /* - * when a firewall rule is deleted, scan all pipes and remove the flow-id + * when a firewall rule is deleted, scan all queues and remove the flow-id * from packets matching this rule. */ void dn_rule_delete(void *r) { struct dn_pipe *p ; - int matches = 0 ; + struct dn_flow_queue *q ; + struct dn_pkt *pkt ; + int i ; for ( p = all_pipes ; p ; p = p->next ) { - struct dn_pkt *x ; - for (x = p->r.head ; x ; x = (struct dn_pkt *)x->dn_next ) - if (x->hdr.mh_data == r) { - matches++ ; - x->hdr.mh_data = (void *)ip_fw_default_rule ; - } - for (x = p->p.head ; x ; x = (struct dn_pkt *)x->dn_next ) - if (x->hdr.mh_data == r) { - matches++ ; - x->hdr.mh_data = (void *)ip_fw_default_rule ; - } + for (i = 0 ; i < p->rq_size ; i++) + for (q = p->rq[i] ; q ; q = q->next ) + for (pkt = q->r.head ; pkt ; pkt = DN_NEXT(pkt) ) + if (pkt->hdr.mh_data == r) + pkt->hdr.mh_data = (void *)ip_fw_default_rule ; + for (pkt = p->p.head ; pkt ; pkt = DN_NEXT(pkt) ) + if (pkt->hdr.mh_data == r) + pkt->hdr.mh_data = (void *)ip_fw_default_rule ; } - printf("dn_rule_delete, r %p, default %p%s, %d matches\n", - (void *)r, (void *)ip_fw_default_rule, - r == ip_fw_default_rule ? " AARGH!":"", matches); } /* @@ -472,7 +724,7 @@ ip_dn_ctl(struct sockopt *sopt) { int error = 0 ; size_t size ; - char *buf, *bp ; + char *buf, *bp ; /* bp is the "copy-pointer" */ struct dn_pipe *p, tmp_pipe ; struct dn_pipe *x, *a, *b ; @@ -487,92 +739,116 @@ ip_dn_ctl(struct sockopt *sopt) case IP_DUMMYNET_GET : for (p = all_pipes, size = 0 ; p ; p = p->next ) - size += sizeof( *p ) ; + size += sizeof( *p ) + + p->rq_elements * sizeof(struct dn_flow_queue); buf = malloc(size, M_TEMP, M_WAITOK); if (buf == 0) { error = ENOBUFS ; break ; } for (p = all_pipes, bp = buf ; p ; p = p->next ) { - struct dn_pipe *q = (struct dn_pipe *)bp ; + int i ; + struct dn_pipe *pipe_bp = (struct dn_pipe *)bp ; + struct dn_flow_queue *q; - bcopy(p, bp, sizeof( *p ) ); /* - * return bw and delay in bits/s and ms, respectively + * copy the pipe descriptor into *bp, convert delay back to ms, + * then copy the queue descriptor(s) one at a time. */ - q->delay = (q->delay * 1000) / hz ; + bcopy(p, bp, sizeof( *p ) ); + pipe_bp->delay = (pipe_bp->delay * 1000) / hz ; bp += sizeof( *p ) ; + for (i = 0 ; i < p->rq_size ; i++) + for (q = p->rq[i] ; q ; q = q->next, bp += sizeof(*q) ) + bcopy(q, bp, sizeof( *q ) ); } error = sooptcopyout(sopt, buf, size); FREE(buf, M_TEMP); break ; + case IP_DUMMYNET_FLUSH : - dummynet_flush() ; + dummynet_flush() ; break ; + case IP_DUMMYNET_CONFIGURE : p = &tmp_pipe ; error = sooptcopyin(sopt, p, sizeof *p, sizeof *p); if (error) break ; - /* - * The config program passes parameters as follows: - * bandwidth = bits/second (0 = no limits); - * delay = ms - * must be translated in ticks. - * queue_size = slots (0 = no limit) - * queue_size_bytes = bytes (0 = no limit) - * only one can be set, must be bound-checked - */ - p->delay = ( p->delay * hz ) / 1000 ; - if (p->queue_size == 0 && p->queue_size_bytes == 0) - p->queue_size = 50 ; - if (p->queue_size != 0 ) /* buffers are prevailing */ - p->queue_size_bytes = 0 ; - if (p->queue_size > 100) - p->queue_size = 50 ; - if (p->queue_size_bytes > 1024*1024) - p->queue_size_bytes = 1024*1024 ; -#if 0 - printf("ip_dn: config pipe %d %d bit/s %d ms %d bufs\n", - p->pipe_nr, - p->bandwidth * 8 * hz , - p->delay * 1000 / hz , p->queue_size); -#endif - for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; + /* + * The config program passes parameters as follows: + * bandwidth = bits/second (0 means no limits); + * delay = millisec., must be translated into ticks. + * queue_size = slots (0 means no limit) + * queue_size_bytes = bytes (0 means no limit) + * only one can be set, must be bound-checked + */ + p->delay = ( p->delay * hz ) / 1000 ; + if (p->queue_size == 0 && p->queue_size_bytes == 0) + p->queue_size = 50 ; + if (p->queue_size != 0 ) /* buffers are prevailing */ + p->queue_size_bytes = 0 ; + if (p->queue_size > 100) + p->queue_size = 50 ; + if (p->queue_size_bytes > 1024*1024) + p->queue_size_bytes = 1024*1024 ; + for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; a = b , b = b->next) ; - if (b && b->pipe_nr == p->pipe_nr) { - /* XXX should spl and flush old pipe... */ - b->bandwidth = p->bandwidth ; - b->delay = p->delay ; - b->ticks_from_last_insert = p->delay ; - b->queue_size = p->queue_size ; - b->queue_size_bytes = p->queue_size_bytes ; - b->plr = p->plr ; - } else { - int s ; - x = malloc(sizeof(struct dn_pipe), M_IPFW, M_DONTWAIT) ; - if (x == NULL) { - printf("ip_dummynet.c: sorry no memory\n"); + if (b && b->pipe_nr == p->pipe_nr) { + b->bandwidth = p->bandwidth ; + b->delay = p->delay ; + b->queue_size = p->queue_size ; + b->queue_size_bytes = p->queue_size_bytes ; + b->plr = p->plr ; + b->flow_mask = p->flow_mask ; + b->flags = p->flags ; + } else { /* completely new pipe */ + int s ; + x = malloc(sizeof(struct dn_pipe), M_IPFW, M_DONTWAIT) ; + if (x == NULL) { + printf("ip_dummynet.c: no memory for new pipe\n"); error = ENOSPC ; break ; - } - bzero(x, sizeof(*x) ); - x->bandwidth = p->bandwidth ; - x->delay = p->delay ; - x->ticks_from_last_insert = p->delay ; - x->pipe_nr = p->pipe_nr ; - x->queue_size = p->queue_size ; - x->queue_size_bytes = p->queue_size_bytes ; - x->plr = p->plr ; - - s = splnet() ; - x->next = b ; - if (a == NULL) - all_pipes = x ; - else - a->next = x ; - splx(s); } + bzero(x, sizeof(*x) ); + x->bandwidth = p->bandwidth ; + x->delay = p->delay ; + x->pipe_nr = p->pipe_nr ; + x->queue_size = p->queue_size ; + x->queue_size_bytes = p->queue_size_bytes ; + x->plr = p->plr ; + x->flow_mask = p->flow_mask ; + x->flags = p->flags ; + if (x->flags & DN_HAVE_FLOW_MASK) {/* allocate some slots */ + int l = p->rq_size ; + if (l == 0) + l = dn_hash_size ; + if (l < 4) + l = 4 ; + else if (l > 1024) + l = 1024 ; + x->rq_size = l ; + } else /* one is enough for null mask */ + x->rq_size = 1 ; + x->rq = malloc(x->rq_size * sizeof(struct dn_flow_queue *), + M_IPFW, M_DONTWAIT) ; + if (x->rq == NULL ) { + printf("sorry, cannot allocate queue\n"); + free(x, M_IPFW); + error = ENOSPC ; + break ; + } + bzero(x->rq, x->rq_size * sizeof(struct dn_flow_queue *) ); + x->rq_elements = 0 ; + + s = splnet() ; + x->next = b ; + if (a == NULL) + all_pipes = x ; + else + a->next = x ; + splx(s); + } break ; case IP_DUMMYNET_DEL : @@ -581,48 +857,82 @@ ip_dn_ctl(struct sockopt *sopt) if (error) break ; - for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; + for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ; a = b , b = b->next) ; - if (b && b->pipe_nr == p->pipe_nr) { /* found pipe */ - int s = splnet() ; - struct ip_fw_chain *chain = ip_fw_chain.lh_first; + if (b && b->pipe_nr == p->pipe_nr) { /* found pipe */ + int s ; + struct ip_fw_chain *chain ; - if (a == NULL) - all_pipes = b->next ; - else - a->next = b->next ; - /* - * remove references to this pipe from the ip_fw rules. - */ - for (; chain; chain = chain->chain.le_next) { - register struct ip_fw *const f = chain->rule; - if (f->pipe_ptr == b) - f->pipe_ptr = NULL ; + s = splnet() ; + chain = ip_fw_chain.lh_first; + + if (a == NULL) + all_pipes = b->next ; + else + a->next = b->next ; + /* + * remove references to this pipe from the ip_fw rules. + */ + for (; chain; chain = chain->chain.le_next) + if (chain->rule->pipe_ptr == b) + chain->rule->pipe_ptr = NULL ; + /* remove all references to b from heaps */ + if (ready_heap.elements > 0) { + struct dn_heap *h = &ready_heap ; + int i = 0, found = 0 ; + while ( i < h->elements ) { + if (((struct dn_flow_queue *)(h->p[i].object))->p == b) { + /* found one */ + h->elements-- ; + h->p[i] = h->p[h->elements] ; + found++ ; + } else + i++ ; } - splx(s); - purge_pipe(b); /* remove pkts from here */ - free(b, M_IPFW); + if (found) + heapify(h); } - break ; + if (extract_heap.elements > 0) { + struct dn_heap *h = &extract_heap ; + int i = 0, found = 0 ; + while ( i < h->elements ) { + if (h->p[i].object == b) { /* found one */ + h->elements-- ; + h->p[i] = h->p[h->elements] ; + found++ ; + } else + i++ ; + } + if (found) + heapify(h); + } + splx(s); + purge_pipe(b); /* remove pkts from here */ + free(b->rq, M_IPFW); + free(b, M_IPFW); } + break ; + } return error ; } static void ip_dn_init(void) { - printf("DUMMYNET initialized (990811)\n"); + printf("DUMMYNET initialized (000106)\n"); all_pipes = NULL ; + ready_heap.size = ready_heap.elements = 0 ; + extract_heap.size = extract_heap.elements = 0 ; ip_dn_ctl_ptr = ip_dn_ctl; + timeout(dummynet, NULL, 1); } -static ip_dn_ctl_t *old_dn_ctl_ptr; +static ip_dn_ctl_t *old_dn_ctl_ptr ; static int dummynet_modevent(module_t mod, int type, void *data) { - int s; - + int s ; switch (type) { case MOD_LOAD: s = splnet(); @@ -632,19 +942,19 @@ dummynet_modevent(module_t mod, int type, void *data) break; case MOD_UNLOAD: s = splnet(); - ip_dn_ctl_ptr = old_dn_ctl_ptr; + ip_dn_ctl_ptr = old_dn_ctl_ptr; splx(s); dummynet_flush(); - break; + break ; default: - break; + break ; } - return 0; + return 0 ; } static moduledata_t dummynet_mod = { "dummynet", dummynet_modevent, NULL -}; +} ; DECLARE_MODULE(dummynet, dummynet_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); diff --git a/sys/netinet/ip_dummynet.h b/sys/netinet/ip_dummynet.h index 61ca6ddf365..dbfa8a254d4 100644 --- a/sys/netinet/ip_dummynet.h +++ b/sys/netinet/ip_dummynet.h @@ -1,14 +1,28 @@ /* - * Copyright (c) 1998 Luigi Rizzo + * Copyright (c) 1998-2000 Luigi Rizzo, Universita` di Pisa + * Portions Copyright (c) 2000 Akamba Corp. + * All rights reserved * - * Redistribution and use in source forms, with and without modification, - * are permitted provided that this entire comment appears intact. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. * - * Redistribution in binary form may occur without any restrictions. - * Obviously, it would be nice if you gave credit where credit is due - * but requiring it would be too onerous. - * - * This software is provided ``AS IS'' without any warranties of any kind. + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. * * $FreeBSD$ */ @@ -18,8 +32,7 @@ /* * Definition of dummynet data structures. - * Dummynet handles a list of pipes, each one identified by a unique - * number (hopefully the list is short so we use a linked list). + * We first start with the heap which is used by the scheduler. * * Each list contains a set of parameters identifying the pipe, and * a set of packets queued on the pipe itself. @@ -28,6 +41,31 @@ * is pretty simple and this makes the code more portable. */ +typedef u_int32_t dn_key ; /* sorting key */ +#define DN_KEY_LT(a,b) ((int)((a)-(b)) < 0) +#define DN_KEY_LEQ(a,b) ((int)((a)-(b)) <= 0) +#define DN_KEY_GT(a,b) ((int)((a)-(b)) > 0) +#define DN_KEY_GEQ(a,b) ((int)((a)-(b)) >= 0) + +struct dn_heap_entry { + dn_key key ; /* sorting key. Topmost element is smallest one */ + void *object ; /* object pointer */ +} ; + +struct dn_heap { + int size ; + int elements ; + struct dn_heap_entry *p ; /* really an array of "size" entries */ +} ; + +/* + * MT_DUMMYNET is a new (fake) mbuf type that is prepended to the + * packet when it comes out of a pipe. The definition + * ought to go in /sys/sys/mbuf.h but here it is less intrusive. + */ + +#define MT_DUMMYNET MT_CONTROL + /* * struct dn_pkt identifies a packet in the dummynet queue. The * first part is really an m_hdr for implementation purposes, and some @@ -38,19 +76,19 @@ struct dn_pkt { struct m_hdr hdr ; #define dn_next hdr.mh_nextpkt /* next element in queue */ +#define DN_NEXT(x) (struct dn_pkt *)(x)->dn_next #define dn_m hdr.mh_next /* packet to be forwarded */ -#define dn_dst hdr.mh_len /* dst, for ip_output */ -#define dn_dir hdr.mh_flags /* IP_FW_F_IN or IP_FW_F_OUT */ - int delay; /* stays queued until delay=0 */ - struct ifnet *ifp; /* interface, for ip_output */ - struct route ro; /* route, for ip_output. MUST COPY */ - int flags; /* flags, for ip_output */ +/* #define dn_dst hdr.mh_len -* dst, for ip_output */ +#define dn_dir hdr.mh_flags /* action when pkt extracted from a queue */ +#define DN_TO_IP_OUT 1 +#define DN_TO_IP_IN 2 +#define DN_TO_BDG_FWD 3 -#ifdef DUMMYNET_DEBUG - struct timeval beg, mid; /* testing only */ - int act_delay; /* testing only */ - int in_delay; /* testing only */ -#endif + dn_key output_time; /* when the pkt is due for delivery */ + struct ifnet *ifp; /* interface, for ip_output */ + struct sockaddr_in *dn_dst ; + struct route ro; /* route, for ip_output. MUST COPY */ + int flags ; /* flags, for ip_output (IPv6 ?) */ }; struct dn_queue { @@ -58,53 +96,64 @@ struct dn_queue { } ; /* - * descriptor of a pipe. The flags field will be used to speed up the - * forwarding code paths, in case some of the parameters are not - * used. + * Flow mask/flow id for each queue. + */ +struct dn_flow_id { + u_int32_t dst_ip, src_ip ; + u_int16_t dst_port, src_port ; + u_int8_t proto ; +} ; + +/* + * We use per flow queues. Hashing is used to select the right slot, + * then we scan the list to match the flow-id. + * The pipe is shared as it is only a delay line and thus one is enough. + */ +struct dn_flow_queue { + struct dn_flow_queue *next ; + struct dn_flow_id id ; + struct dn_pipe *p ; /* parent pipe */ + struct dn_queue r; + long numbytes ; + u_int len ; + u_int len_bytes ; + + u_int64_t tot_pkts ; /* statistics counters */ + u_int64_t tot_bytes ; + u_int32_t drops ; + int hash_slot ; /* debugging/diagnostic */ +} ; + +/* + * Pipe descriptor. Contains global parameters, delay-line queue, + * and the hash array of the per-flow queues. */ struct dn_pipe { /* a pipe */ struct dn_pipe *next ; u_short pipe_nr ; /* number */ u_short flags ; /* to speed up things */ -#define DN_HAVE_BW 1 -#define DN_HAVE_QUEUE 2 -#define DN_HAVE_DELAY 4 +#define DN_HAVE_FLOW_MASK 8 int bandwidth; /* really, bytes/tick. */ int queue_size ; int queue_size_bytes ; int delay ; /* really, ticks */ int plr ; /* pkt loss rate (2^31-1 means 100%) */ - struct dn_queue r; - int r_len; /* elements in r_queue */ - int r_len_bytes; /* bytes in r_queue */ - int r_drops; /* drops from r_queue */ struct dn_queue p ; - int ticks_from_last_insert; - long numbytes; /* which can send or receive */ + struct dn_flow_id flow_mask ; + int rq_size ; + int rq_elements ; + struct dn_flow_queue **rq ; /* array of rq_size entries */ }; -/* - * The following is used to define a new mbuf type that is - * prepended to the packet when it comes out of a pipe. The definition - * ought to go in /sys/sys/mbuf.h but here it is less intrusive. - */ - -#define MT_DUMMYNET MT_CONTROL -/* - * what to do of a packet when it comes out of a pipe - */ -#define DN_TO_IP_OUT 1 -#define DN_TO_IP_IN 2 -#define DN_TO_BDG_FWD 3 - #ifdef _KERNEL MALLOC_DECLARE(M_IPFW); typedef int ip_dn_ctl_t __P((struct sockopt *)) ; extern ip_dn_ctl_t *ip_dn_ctl_ptr; +extern struct dn_flow_id dn_last_pkt ; void dn_rule_delete(void *r); /* used in ip_fw.c */ int dummynet_io(int pipe, int dir,