Index: linux-2.6.11-w/include/linux/pkt_sched.h =================================================================== --- linux-2.6.11-w/include/linux/pkt_sched.h (revision 2) +++ linux-2.6.11-w/include/linux/pkt_sched.h (revision 3) @@ -272,6 +272,28 @@ __u32 ctokens; }; + +/* PERFLOW section */ + +#define TC_PERFLOW_DEFAULTLIMIT 1024 + +struct tc_perflow_qopt +{ + struct tc_ratespec rate; + struct tc_ratespec ceil; + __u32 limit; + __u32 qlen; +}; +enum +{ + TCA_PERFLOW_UNSPEC, + TCA_PERFLOW_PARMS, + __TCA_PERFLOW_MAX, +}; + +#define TCA_PERFLOW_MAX (__TCA_PERFLOW_MAX - 1) + + /* HFSC section */ struct tc_hfsc_qopt Index: linux-2.6.11-w/net/sched/Kconfig =================================================================== --- linux-2.6.11-w/net/sched/Kconfig (revision 2) +++ linux-2.6.11-w/net/sched/Kconfig (revision 3) @@ -192,6 +192,15 @@ To compile this code as a module, choose M here: the module will be called sch_gred. +config NET_SCH_PERFLOW + tristate "PERFLOW queue" + depends on NET_SCHED + ---help--- + Say Y here if you want to use per flow rate control. + + To compile this code as a module, choose M here: the + module will be called sch_perflow. + config NET_SCH_DSMARK tristate "Diffserv field marker" depends on NET_SCHED Index: linux-2.6.11-w/net/sched/Makefile =================================================================== --- linux-2.6.11-w/net/sched/Makefile (revision 2) +++ linux-2.6.11-w/net/sched/Makefile (revision 3) @@ -19,6 +19,7 @@ obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o obj-$(CONFIG_NET_SCH_RED) += sch_red.o obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o +obj-$(CONFIG_NET_SCH_PERFLOW) += sch_perflow.o obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o Index: linux-2.6.11-w/net/sched/sch_perflow.c =================================================================== --- linux-2.6.11-w/net/sched/sch_perflow.c (revision 0) +++ linux-2.6.11-w/net/sched/sch_perflow.c (revision 3) @@ -0,0 +1,555 @@ +/* + * net/sched/sch_perflow.c Per flow rate control queue. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Wang Jian , 2005 + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Per flow rate control algorithm + +Description +----------- + + When a new packet arrives, we lookup in the table to see if it + belongs to a flow being traced. If not, we create new entry for it. + + For a packet belongs to a flow being traced, we see if it has + available token to send it out. + + The 'rate' and 'ceil' have same meaning of HTB qdisc. The 'limit' + parameter defined how many flows we will trace, which defaults to + 1024. + + Any flow entry without traffic in LIFETIME seconds will be wiped + to free slot. + + +Algorithm +--------- + + The algorithm is simple and naive. We have a token bucket, which + generate grate/10 (= rate * limit / 10) token in 1/10 seconds. + + A flow can always send when it is under rate. When a flow is over + rate but under ceil, every time it borrows, it is put into borrow + list, so it may receive penalty. + + When a under-rate flow sends but token is short, penalty is added + to one of flows in borrow list, and this punished flow is + removed from borrow list. + + When a flow carrying penalty has packet, the penalty is + added to used token. This means, this flow can borrow less than + last time (if in a new time slot), or can borrow less + (ceil - rate - penalty). + + There are other rules that handle fairness and low rate situation. + See code for details. + + NOTE: the implementation of algorithm is not timer driven, but + packet driven. + + The ideas behind this algorithm are + + 1. We assume that perflow qdisc has rate * limit guaranteed. + 2. We can't affect past. We can only affect future. + 3. If a flow's borrowing leads to overlimit, we let this flow + borrow less in future. + 4. With a round robin punishment style, a flow borrows more times, + it stays in borrow list more times, and so receives more penalty. + (But we should consider more about this, I think the fairness + should be improved by sort borrow list.) + + +Security Consideration +---------------------- + + It's dangerous to create new entry for any new packets not belongs + to a flow being traced. For example, syn flood attack on a single + port may cause thousands of entries created. The 'limit' parameter + is used to set limit on maximum entries we will create. + + But even with 'limit', port scan can fill the slots and make valid + new flow has no slot available and not be traced. + + One of solution is to use netfilter MARK for classification + (a.k.a. cls_fw.c). Only established session is given a fw mark, and + then, if port scan use a spoofing source address, no fw mark gotten. + + +Application +----------- + + You should use HTB or other classful qdisc to enclose this qdisc. + + */ + + +#define PERFLOW_HSIZE 251 +#define PERFLOW_LIFETIME (10*HZ) + +struct perflow_entry +{ + u32 src; + u32 dst; + u16 sport; + u16 dport; + + struct list_head hlist; + struct list_head borrow; + struct timer_list timer; + struct perflow_sched_data *q; + u32 jiffies; + u32 used; + u32 penalty; + + u8 protocol; +}; + +struct perflow_sched_data +{ + /* parameters */ + u32 rate; /* guaranted rate */ + u32 ceil; /* upper bound */ + u32 limit; /* maximum flows we trace */ + u32 qlen; /* maximum queue length */ + + /* variables */ + u32 grate; /* aggregative rate */ + + u32 jiffies; + u32 token; + u32 flow_count; /* how many flows we are tracing */ + struct list_head ht[PERFLOW_HSIZE]; /* hash table */ + struct list_head borrow; /* lists of borrowing flow */ +}; + +struct commonhdr +{ + u16 source; + u16 dest; +}; + +static __inline__ u32 perflow_hash(struct perflow_entry *tuple) +{ + return (jhash_3words(tuple->src ^ tuple->protocol, + tuple->dst, + (tuple->sport << 16 | tuple->dport), + 0x5E83AD03) % PERFLOW_HSIZE); +} + +static __inline__ int perflow_is_valid(struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + + if (skb->protocol != __constant_htons(ETH_P_IP)) + return 0; + + if (iph->protocol != IPPROTO_TCP && iph->protocol != IPPROTO_TCP + && iph->protocol != IPPROTO_SCTP) + return 0; + + return 1; +} + +static __inline__ void perflow_flow_timer(unsigned long arg) +{ + struct perflow_entry *e = (struct perflow_entry *) arg; + + e->q->flow_count--; + list_del_init(&e->hlist); + if (!list_empty(&e->borrow)) + list_del_init(&e->borrow); + kfree(e); +} + +static __inline__ struct perflow_entry * +perflow_new_flow(u32 hash, struct perflow_entry *tuple, struct Qdisc *sch) +{ + struct perflow_sched_data *q = qdisc_priv(sch); + struct perflow_entry *e; + + if (q->flow_count >= q->limit) + return NULL; + + e = kmalloc(sizeof(*e), GFP_KERNEL); + if (!e) + return NULL; + + q->flow_count++; + + memset(e, 0, sizeof(*e)); + + e->src = tuple->src; + e->dst = tuple->dst; + e->sport = tuple->sport; + e->dport = tuple->dport; + e->protocol = tuple->protocol; + + INIT_LIST_HEAD(&e->hlist); + INIT_LIST_HEAD(&e->borrow); + init_timer(&e->timer); + e->timer.function = perflow_flow_timer; + e->timer.data = (unsigned long) e; + /* sync flow's tick to queue's tick */ + e->jiffies = q->jiffies; + e->q = q; + + list_add(&e->hlist, q->ht + hash); + + return e; +} + +static __inline__ void +perflow_fill_tuple(struct perflow_entry *tuple, struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + struct commonhdr *ch; + + memset(tuple, 0, sizeof(struct perflow_entry)); +#if 1 + /* not a traceable flow, do nothing */ + if (!perflow_is_valid(skb)) + return; +#endif + + ch = (struct commonhdr *)((void *)iph + (iph->ihl << 2)); + tuple->src = iph->saddr; + tuple->dst = iph->daddr; + tuple->sport = ch->source; + tuple->dport = ch->dest; + tuple->protocol = iph->protocol; +} + +static struct perflow_entry * +perflow_find_flow(u32 *hash, struct perflow_entry *flow, struct Qdisc *sch) +{ + struct perflow_sched_data *q = qdisc_priv(sch); + struct list_head *h, *head; + struct perflow_entry *e; + + *hash = perflow_hash(flow); + head = q->ht + (*hash); + + list_for_each(h, head) { + e = list_entry(h, struct perflow_entry, hlist); + if (e->src == flow->src + && e->dst == flow->dst + && e->sport == flow->sport + && e->dport == flow->dport + && e->protocol == flow->protocol) + del_timer(&e->timer); + return e; + } + return NULL; +} + +static void perflow_punish(int penalty, struct list_head *borrow) +{ + struct perflow_entry *e; + struct list_head *h; + + if (!list_empty(borrow)) { + h = borrow->next; + list_del_init(h); + e = list_entry(h, struct perflow_entry, hlist); + e->penalty += penalty; + } +} + +static __inline__ void +perflow_adjust_used(struct perflow_entry *flow, u32 rate, u32 ceil) +{ + int cycles; + + /* still in this time slot */ + if ((jiffies - flow->jiffies) <= HZ) { + if ((ceil - flow->used) > flow->penalty) { + flow->used += flow->penalty; + flow->penalty = 0; + } else { + flow->used = ceil; + flow->penalty -= ceil - flow->used; + } + return; + } + + /* the packet arrives after cycles slots */ + cycles = (jiffies - flow->jiffies) / HZ; + flow->jiffies += cycles * HZ; + + if ((cycles * ceil - flow->used) > flow->penalty) { + flow->used = 0; + flow->penalty = 0; + } else { + flow->used = flow->penalty + flow->used - cycles * ceil; + if (flow->used > ceil) { + flow->used = ceil; + flow->penalty -= ceil; + } + } +} + +static int perflow_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct perflow_sched_data *q = qdisc_priv(sch); + struct perflow_entry tuple, *flow; + u32 hash; + + /* not a traceable flow */ + if (!perflow_is_valid(skb)) + goto drop; + + /* if max qlen is set and current queue is full, drop */ + if (q->qlen && sch->q.qlen >= q->qlen) { + printk(KERN_WARNING "qlen overlimit drop\n"); + goto drop; + } + + perflow_fill_tuple(&tuple, skb); + flow = perflow_find_flow(&hash, &tuple, sch); + + if (flow == NULL) { + flow = perflow_new_flow(hash, &tuple, sch); + if (flow == NULL) + goto drop; + } + + /* renew timer for flow */ + flow->timer.expires = jiffies + PERFLOW_LIFETIME; + add_timer(&flow->timer); + + if ((jiffies - q->jiffies) > HZ) { + q->token = q->grate; + q->jiffies += (jiffies - q->jiffies) / HZ * HZ; + } + + /* renew used for this flow */ + perflow_adjust_used(flow, q->rate, q->ceil); + + /* we always satisfy flow under rate */ + if (flow->used < q->rate) { + flow->used += skb->len; + if (flow->used > q->rate) { + /* if we borrow, we may receive penalty */ + if (list_empty(&flow->borrow)) + list_add_tail(&flow->borrow, &q->borrow); + } + if (flow->used > q->ceil) + flow->penalty += flow->used - q->ceil; + if (q->token < skb->len) { + q->token = 0; + perflow_punish(skb->len - q->token, &q->borrow); + } else { + q->token -= skb->len; + } + } else { + if (q->token < skb->len || flow->used >= q->ceil) + goto drop; + + flow->used += skb->len; + q->token -= skb->len; + + if (list_empty(&flow->borrow)) + list_add_tail(&flow->borrow, &q->borrow); + else + list_move(&flow->borrow, &q->borrow); + + if (flow->used > q->ceil) + flow->penalty += flow->used - q->ceil; + } + +enqueue: + sch->qstats.backlog += skb->len; + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + __skb_queue_tail(&sch->q, skb); + return NET_XMIT_SUCCESS; + +drop: + sch->qstats.overlimits++; + kfree_skb(skb); + sch->qstats.drops++; + return NET_XMIT_CN; +} + +static int perflow_requeue(struct sk_buff *skb, struct Qdisc *sch) +{ + __skb_queue_head(&sch->q, skb); + sch->qstats.backlog += skb->len; + sch->qstats.requeues++; + return 0; +} + +static struct sk_buff *perflow_dequeue(struct Qdisc *sch) +{ + struct sk_buff *skb; + + skb = __skb_dequeue(&sch->q); + if (skb) { + sch->qstats.backlog -= skb->len; + } + return skb; +} + +static unsigned int perflow_drop(struct Qdisc *sch) +{ + struct sk_buff *skb; + + skb = __skb_dequeue_tail(&sch->q); + if (skb) { + unsigned int len = skb->len; + sch->qstats.backlog -= len; + sch->qstats.drops++; + kfree_skb(skb); + return len; + } + return 0; +} + +static void perflow_reset(struct Qdisc *sch) +{ + skb_queue_purge(&sch->q); + sch->qstats.backlog = 0; +} + +static int perflow_change(struct Qdisc *sch, struct rtattr *opt) +{ + struct perflow_sched_data *q = qdisc_priv(sch); + struct tc_perflow_qopt *ctl; + + /* if limit and qlen are lowered, and we are in a over limit + * state, we still don't drop flow or drop packet. + * sch->q will decrease to allowed length + * q->flow_count will decrease because no new flow is traced + */ + if (opt == NULL) { + return -EINVAL; + } else { + ctl = RTA_DATA(opt); + if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) + return -EINVAL; + + q->rate = ctl->rate.rate; + q->ceil = ctl->ceil.rate; + q->limit = ctl->limit; + q->qlen = ctl->qlen; + + if (q->limit == 0) + q->limit = 1024; + /* aggregative rate is (rate * 1.05 * limit) */ + q->grate = (q->rate + q->rate/20) * q->limit; + + q->jiffies = jiffies; + } + + return 0; +} + +static int perflow_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct perflow_sched_data *q = qdisc_priv(sch); + int i, ret; + + ret = perflow_change(sch, opt); + if (ret) + return ret; + + INIT_LIST_HEAD(&q->borrow); + + for (i = 0; i < PERFLOW_HSIZE; i++) + INIT_LIST_HEAD(q->ht + i); + + return 0; +} + +static void perflow_destroy(struct Qdisc *sch) +{ + struct perflow_sched_data *q = qdisc_priv(sch); + struct list_head *h, *n; + struct perflow_entry *e; + int i; + + for (i = 0; i < PERFLOW_HSIZE; i++) { + list_for_each_safe (h, n, q->ht + i) { + e = list_entry(h, struct perflow_entry, hlist); + del_timer(&e->timer); + if (!list_empty(&e->borrow)) + list_del_init(&e->borrow); + list_del(h); + kfree(e); + } + } +} + +static int perflow_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + return -1; +} + +static struct Qdisc_ops perflow_qdisc_ops = { + .next = NULL, + .cl_ops = NULL, + .id = "perflow", + .priv_size = sizeof(struct perflow_sched_data), + .enqueue = perflow_enqueue, + .dequeue = perflow_dequeue, + .requeue = perflow_requeue, + .drop = perflow_drop, + .init = perflow_init, + .reset = perflow_reset, + .destroy = perflow_destroy, + .change = perflow_change, + .dump = perflow_dump, + .owner = THIS_MODULE, +}; + +static int __init perflow_module_init(void) +{ + return register_qdisc(&perflow_qdisc_ops); +} + +static void __exit perflow_module_exit(void) +{ + unregister_qdisc(&perflow_qdisc_ops); +} + +module_init(perflow_module_init); +module_exit(perflow_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Wang Jian ");