net/sched/sch_sfq.c

   1 /*
   2  * net/sched/sch_sfq.c  Stochastic Fairness Queueing discipline.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  10  */
  11
  12 #include <linux/module.h>
  13 #include <linux/types.h>
  14 #include <linux/kernel.h>
  15 #include <linux/jiffies.h>
  16 #include <linux/string.h>
  17 #include <linux/in.h>
  18 #include <linux/errno.h>
  19 #include <linux/init.h>
  20 #include <linux/skbuff.h>
  21 #include <linux/jhash.h>
  22 #include <linux/slab.h>
  23 #include <linux/vmalloc.h>
  24 #include <net/netlink.h>
  25 #include <net/pkt_sched.h>
  26 #include <net/flow_keys.h>
  27 #include <net/red.h>
  28
  29
  30 /*      Stochastic Fairness Queuing algorithm.
  31         =======================================
  32
  33         Source:
  34         Paul E. McKenney "Stochastic Fairness Queuing",
  35         IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
  36
  37         Paul E. McKenney "Stochastic Fairness Queuing",
  38         "Interworking: Research and Experience", v.2, 1991, p.113-131.
  39
  40
  41         See also:
  42         M. Shreedhar and George Varghese "Efficient Fair
  43         Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
  44
  45
  46         This is not the thing that is usually called (W)FQ nowadays.
  47         It does not use any timestamp mechanism, but instead
  48         processes queues in round-robin order.
  49
  50         ADVANTAGE:
  51
  52         - It is very cheap. Both CPU and memory requirements are minimal.
  53
  54         DRAWBACKS:
  55
  56         - "Stochastic" -> It is not 100% fair.
  57         When hash collisions occur, several flows are considered as one.
  58
  59         - "Round-robin" -> It introduces larger delays than virtual clock
  60         based schemes, and should not be used for isolating interactive
  61         traffic from non-interactive. It means, that this scheduler
  62         should be used as leaf of CBQ or P3, which put interactive traffic
  63         to higher priority band.
  64
  65         We still need true WFQ for top level CSZ, but using WFQ
  66         for the best effort traffic is absolutely pointless:
  67         SFQ is superior for this purpose.
  68
  69         IMPLEMENTATION:
  70         This implementation limits :
  71         - maximal queue length per flow to 127 packets.
  72         - max mtu to 2^18-1;
  73         - max 65408 flows,
  74         - number of hash buckets to 65536.
  75
  76         It is easy to increase these values, but not in flight.  */
  77
  78 #define SFQ_MAX_DEPTH           127 /* max number of packets per flow */
  79 #define SFQ_DEFAULT_FLOWS       128
  80 #define SFQ_MAX_FLOWS           (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */
  81 #define SFQ_EMPTY_SLOT          0xffff
  82 #define SFQ_DEFAULT_HASH_DIVISOR 1024
  83
  84 /* We use 16 bits to store allot, and want to handle packets up to 64K
  85  * Scale allot by 8 (1<<3) so that no overflow occurs.
  86  */
  87 #define SFQ_ALLOT_SHIFT         3
  88 #define SFQ_ALLOT_SIZE(X)       DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT)
  89
  90 /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */
  91 typedef u16 sfq_index;
  92
  93 /*
  94  * We dont use pointers to save space.
  95  * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array
  96  * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH]
  97  * are 'pointers' to dep[] array
  98  */
  99 struct sfq_head {
 100         sfq_index       next;
 101         sfq_index       prev;
 102 };
 103
 104 struct sfq_slot {
 105         struct sk_buff  *skblist_next;
 106         struct sk_buff  *skblist_prev;
 107         sfq_index       qlen; /* number of skbs in skblist */
 108         sfq_index       next; /* next slot in sfq RR chain */
 109         struct sfq_head dep; /* anchor in dep[] chains */
 110         unsigned short  hash; /* hash value (index in ht[]) */
 111         short           allot; /* credit for this slot */
 112
 113         unsigned int    backlog;
 114         struct red_vars vars;
 115 };
 116
 117 struct sfq_sched_data {
 118 /* frequently used fields */
 119         int             limit;          /* limit of total number of packets in this qdisc */
 120         unsigned int    divisor;        /* number of slots in hash table */
 121         u8              headdrop;
 122         u8              maxdepth;       /* limit of packets per flow */
 123
 124         u32             perturbation;
 125         u8              cur_depth;      /* depth of longest slot */
 126         u8              flags;
 127         unsigned short  scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
 128         struct tcf_proto *filter_list;
 129         sfq_index       *ht;            /* Hash table ('divisor' slots) */
 130         struct sfq_slot *slots;         /* Flows table ('maxflows' entries) */
 131
 132         struct red_parms *red_parms;
 133         struct tc_sfqred_stats stats;
 134         struct sfq_slot *tail;          /* current slot in round */
 135
 136         struct sfq_head dep[SFQ_MAX_DEPTH + 1];
 137                                         /* Linked lists of slots, indexed by depth
 138                                          * dep[0] : list of unused flows
 139                                          * dep[1] : list of flows with 1 packet
 140                                          * dep[X] : list of flows with X packets
 141                                          */
 142
 143         unsigned int    maxflows;       /* number of flows in flows array */
 144         int             perturb_period;
 145         unsigned int    quantum;        /* Allotment per round: MUST BE >= MTU */
 146         struct timer_list perturb_timer;
 147 };
 148
 149 /*
 150  * sfq_head are either in a sfq_slot or in dep[] array
 151  */
 152 static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val)
 153 {
 154         if (val < SFQ_MAX_FLOWS)
 155                 return &q->slots[val].dep;
 156         return &q->dep[val - SFQ_MAX_FLOWS];
 157 }
 158
 159 /*
 160  * In order to be able to quickly rehash our queue when timer changes
 161  * q->perturbation, we store flow_keys in skb->cb[]
 162  */
 163 struct sfq_skb_cb {
 164        struct flow_keys        keys;
 165 };
 166
 167 static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb)
 168 {
 169         qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb));
 170         return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data;
 171 }
 172
 173 static unsigned int sfq_hash(const struct sfq_sched_data *q,
 174                              const struct sk_buff *skb)
 175 {
 176         const struct flow_keys *keys = &sfq_skb_cb(skb)->keys;
 177         unsigned int hash;
 178
 179         hash = jhash_3words((__force u32)keys->dst,
 180                             (__force u32)keys->src ^ keys->ip_proto,
 181                             (__force u32)keys->ports, q->perturbation);
 182         return hash & (q->divisor - 1);
 183 }
 184
 185 static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 186                                  int *qerr)
 187 {
 188         struct sfq_sched_data *q = qdisc_priv(sch);
 189         struct tcf_result res;
 190         int result;
 191
 192         if (TC_H_MAJ(skb->priority) == sch->handle &&
 193             TC_H_MIN(skb->priority) > 0 &&
 194             TC_H_MIN(skb->priority) <= q->divisor)
 195                 return TC_H_MIN(skb->priority);
 196
 197         if (!q->filter_list) {
 198                 skb_flow_dissect(skb, &sfq_skb_cb(skb)->keys);
 199                 return sfq_hash(q, skb) + 1;
 200         }
 201
 202         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 203         result = tc_classify(skb, q->filter_list, &res);
 204         if (result >= 0) {
 205 #ifdef CONFIG_NET_CLS_ACT
 206                 switch (result) {
 207                 case TC_ACT_STOLEN:
 208                 case TC_ACT_QUEUED:
 209                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
 210                 case TC_ACT_SHOT:
 211                         return 0;
 212                 }
 213 #endif
 214                 if (TC_H_MIN(res.classid) <= q->divisor)
 215                         return TC_H_MIN(res.classid);
 216         }
 217         return 0;
 218 }
 219
 220 /*
 221  * x : slot number [0 .. SFQ_MAX_FLOWS - 1]
 222  */
 223 static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
 224 {
 225         sfq_index p, n;
 226         struct sfq_slot *slot = &q->slots[x];
 227         int qlen = slot->qlen;
 228
 229         p = qlen + SFQ_MAX_FLOWS;
 230         n = q->dep[qlen].next;
 231
 232         slot->dep.next = n;
 233         slot->dep.prev = p;
 234
 235         q->dep[qlen].next = x;          /* sfq_dep_head(q, p)->next = x */
 236         sfq_dep_head(q, n)->prev = x;
 237 }
 238
 239 #define sfq_unlink(q, x, n, p)                  \
 240         do {                                    \
 241                 n = q->slots[x].dep.next;       \
 242                 p = q->slots[x].dep.prev;       \
 243                 sfq_dep_head(q, p)->next = n;   \
 244                 sfq_dep_head(q, n)->prev = p;   \
 245         } while (0)
 246
 247
 248 static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
 249 {
 250         sfq_index p, n;
 251         int d;
 252
 253         sfq_unlink(q, x, n, p);
 254
 255         d = q->slots[x].qlen--;
 256         if (n == p && q->cur_depth == d)
 257                 q->cur_depth--;
 258         sfq_link(q, x);
 259 }
 260
 261 static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
 262 {
 263         sfq_index p, n;
 264         int d;
 265
 266         sfq_unlink(q, x, n, p);
 267
 268         d = ++q->slots[x].qlen;
 269         if (q->cur_depth < d)
 270                 q->cur_depth = d;
 271         sfq_link(q, x);
 272 }
 273
 274 /* helper functions : might be changed when/if skb use a standard list_head */
 275
 276 /* remove one skb from tail of slot queue */
 277 static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot)
 278 {
 279         struct sk_buff *skb = slot->skblist_prev;
 280
 281         slot->skblist_prev = skb->prev;
 282         skb->prev->next = (struct sk_buff *)slot;
 283         skb->next = skb->prev = NULL;
 284         return skb;
 285 }
 286
 287 /* remove one skb from head of slot queue */
 288 static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot)
 289 {
 290         struct sk_buff *skb = slot->skblist_next;
 291
 292         slot->skblist_next = skb->next;
 293         skb->next->prev = (struct sk_buff *)slot;
 294         skb->next = skb->prev = NULL;
 295         return skb;
 296 }
 297
 298 static inline void slot_queue_init(struct sfq_slot *slot)
 299 {
 300         memset(slot, 0, sizeof(*slot));
 301         slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot;
 302 }
 303
 304 /* add skb to slot queue (tail add) */
 305 static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb)
 306 {
 307         skb->prev = slot->skblist_prev;
 308         skb->next = (struct sk_buff *)slot;
 309         slot->skblist_prev->next = skb;
 310         slot->skblist_prev = skb;
 311 }
 312
 313 #define slot_queue_walk(slot, skb)              \
 314         for (skb = slot->skblist_next;          \
 315              skb != (struct sk_buff *)slot;     \
 316              skb = skb->next)
 317
 318 static unsigned int sfq_drop(struct Qdisc *sch)
 319 {
 320         struct sfq_sched_data *q = qdisc_priv(sch);
 321         sfq_index x, d = q->cur_depth;
 322         struct sk_buff *skb;
 323         unsigned int len;
 324         struct sfq_slot *slot;
 325
 326         /* Queue is full! Find the longest slot and drop tail packet from it */
 327         if (d > 1) {
 328                 x = q->dep[d].next;
 329                 slot = &q->slots[x];
 330 drop:
 331                 skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot);
 332                 len = qdisc_pkt_len(skb);
 333                 slot->backlog -= len;
 334                 sfq_dec(q, x);
 335                 kfree_skb(skb);
 336                 sch->q.qlen--;
 337                 sch->qstats.drops++;
 338                 sch->qstats.backlog -= len;
 339                 return len;
 340         }
 341
 342         if (d == 1) {
 343                 /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
 344                 x = q->tail->next;
 345                 slot = &q->slots[x];
 346                 q->tail->next = slot->next;
 347                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 348                 goto drop;
 349         }
 350
 351         return 0;
 352 }
 353
 354 /* Is ECN parameter configured */
 355 static int sfq_prob_mark(const struct sfq_sched_data *q)
 356 {
 357         return q->flags & TC_RED_ECN;
 358 }
 359
 360 /* Should packets over max threshold just be marked */
 361 static int sfq_hard_mark(const struct sfq_sched_data *q)
 362 {
 363         return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN;
 364 }
 365
 366 static int sfq_headdrop(const struct sfq_sched_data *q)
 367 {
 368         return q->headdrop;
 369 }
 370
 371 static int
 372 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 373 {
 374         struct sfq_sched_data *q = qdisc_priv(sch);
 375         unsigned int hash;
 376         sfq_index x, qlen;
 377         struct sfq_slot *slot;
 378         int uninitialized_var(ret);
 379         struct sk_buff *head;
 380         int delta;
 381
 382         hash = sfq_classify(skb, sch, &ret);
 383         if (hash == 0) {
 384                 if (ret & __NET_XMIT_BYPASS)
 385                         sch->qstats.drops++;
 386                 kfree_skb(skb);
 387                 return ret;
 388         }
 389         hash--;
 390
 391         x = q->ht[hash];
 392         slot = &q->slots[x];
 393         if (x == SFQ_EMPTY_SLOT) {
 394                 x = q->dep[0].next; /* get a free slot */
 395                 if (x >= SFQ_MAX_FLOWS)
 396                         return qdisc_drop(skb, sch);
 397                 q->ht[hash] = x;
 398                 slot = &q->slots[x];
 399                 slot->hash = hash;
 400                 slot->backlog = 0; /* should already be 0 anyway... */
 401                 red_set_vars(&slot->vars);
 402                 goto enqueue;
 403         }
 404         if (q->red_parms) {
 405                 slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms,
 406                                                         &slot->vars,
 407                                                         slot->backlog);
 408                 switch (red_action(q->red_parms,
 409                                    &slot->vars,
 410                                    slot->vars.qavg)) {
 411                 case RED_DONT_MARK:
 412                         break;
 413
 414                 case RED_PROB_MARK:
 415                         sch->qstats.overlimits++;
 416                         if (sfq_prob_mark(q)) {
 417                                 /* We know we have at least one packet in queue */
 418                                 if (sfq_headdrop(q) &&
 419                                     INET_ECN_set_ce(slot->skblist_next)) {
 420                                         q->stats.prob_mark_head++;
 421                                         break;
 422                                 }
 423                                 if (INET_ECN_set_ce(skb)) {
 424                                         q->stats.prob_mark++;
 425                                         break;
 426                                 }
 427                         }
 428                         q->stats.prob_drop++;
 429                         goto congestion_drop;
 430
 431                 case RED_HARD_MARK:
 432                         sch->qstats.overlimits++;
 433                         if (sfq_hard_mark(q)) {
 434                                 /* We know we have at least one packet in queue */
 435                                 if (sfq_headdrop(q) &&
 436                                     INET_ECN_set_ce(slot->skblist_next)) {
 437                                         q->stats.forced_mark_head++;
 438                                         break;
 439                                 }
 440                                 if (INET_ECN_set_ce(skb)) {
 441                                         q->stats.forced_mark++;
 442                                         break;
 443                                 }
 444                         }
 445                         q->stats.forced_drop++;
 446                         goto congestion_drop;
 447                 }
 448         }
 449
 450         if (slot->qlen >= q->maxdepth) {
 451 congestion_drop:
 452                 if (!sfq_headdrop(q))
 453                         return qdisc_drop(skb, sch);
 454
 455                 /* We know we have at least one packet in queue */
 456                 head = slot_dequeue_head(slot);
 457                 delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb);
 458                 sch->qstats.backlog -= delta;
 459                 slot->backlog -= delta;
 460                 qdisc_drop(head, sch);
 461
 462                 slot_queue_add(slot, skb);
 463                 return NET_XMIT_CN;
 464         }
 465
 466 enqueue:
 467         sch->qstats.backlog += qdisc_pkt_len(skb);
 468         slot->backlog += qdisc_pkt_len(skb);
 469         slot_queue_add(slot, skb);
 470         sfq_inc(q, x);
 471         if (slot->qlen == 1) {          /* The flow is new */
 472                 if (q->tail == NULL) {  /* It is the first flow */
 473                         slot->next = x;
 474                 } else {
 475                         slot->next = q->tail->next;
 476                         q->tail->next = x;
 477                 }
 478                 /* We put this flow at the end of our flow list.
 479                  * This might sound unfair for a new flow to wait after old ones,
 480                  * but we could endup servicing new flows only, and freeze old ones.
 481                  */
 482                 q->tail = slot;
 483                 /* We could use a bigger initial quantum for new flows */
 484                 slot->allot = q->scaled_quantum;
 485         }
 486         if (++sch->q.qlen <= q->limit)
 487                 return NET_XMIT_SUCCESS;
 488
 489         qlen = slot->qlen;
 490         sfq_drop(sch);
 491         /* Return Congestion Notification only if we dropped a packet
 492          * from this flow.
 493          */
 494         if (qlen != slot->qlen)
 495                 return NET_XMIT_CN;
 496
 497         /* As we dropped a packet, better let upper stack know this */
 498         qdisc_tree_decrease_qlen(sch, 1);
 499         return NET_XMIT_SUCCESS;
 500 }
 501
 502 static struct sk_buff *
 503 sfq_dequeue(struct Qdisc *sch)
 504 {
 505         struct sfq_sched_data *q = qdisc_priv(sch);
 506         struct sk_buff *skb;
 507         sfq_index a, next_a;
 508         struct sfq_slot *slot;
 509
 510         /* No active slots */
 511         if (q->tail == NULL)
 512                 return NULL;
 513
 514 next_slot:
 515         a = q->tail->next;
 516         slot = &q->slots[a];
 517         if (slot->allot <= 0) {
 518                 q->tail = slot;
 519                 slot->allot += q->scaled_quantum;
 520                 goto next_slot;
 521         }
 522         skb = slot_dequeue_head(slot);
 523         sfq_dec(q, a);
 524         qdisc_bstats_update(sch, skb);
 525         sch->q.qlen--;
 526         sch->qstats.backlog -= qdisc_pkt_len(skb);
 527         slot->backlog -= qdisc_pkt_len(skb);
 528         /* Is the slot empty? */
 529         if (slot->qlen == 0) {
 530                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 531                 next_a = slot->next;
 532                 if (a == next_a) {
 533                         q->tail = NULL; /* no more active slots */
 534                         return skb;
 535                 }
 536                 q->tail->next = next_a;
 537         } else {
 538                 slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb));
 539         }
 540         return skb;
 541 }
 542
 543 static void
 544 sfq_reset(struct Qdisc *sch)
 545 {
 546         struct sk_buff *skb;
 547
 548         while ((skb = sfq_dequeue(sch)) != NULL)
 549                 kfree_skb(skb);
 550 }
 551
 552 /*
 553  * When q->perturbation is changed, we rehash all queued skbs
 554  * to avoid OOO (Out Of Order) effects.
 555  * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change
 556  * counters.
 557  */
 558 static void sfq_rehash(struct Qdisc *sch)
 559 {
 560         struct sfq_sched_data *q = qdisc_priv(sch);
 561         struct sk_buff *skb;
 562         int i;
 563         struct sfq_slot *slot;
 564         struct sk_buff_head list;
 565         int dropped = 0;
 566
 567         __skb_queue_head_init(&list);
 568
 569         for (i = 0; i < q->maxflows; i++) {
 570                 slot = &q->slots[i];
 571                 if (!slot->qlen)
 572                         continue;
 573                 while (slot->qlen) {
 574                         skb = slot_dequeue_head(slot);
 575                         sfq_dec(q, i);
 576                         __skb_queue_tail(&list, skb);
 577                 }
 578                 slot->backlog = 0;
 579                 red_set_vars(&slot->vars);
 580                 q->ht[slot->hash] = SFQ_EMPTY_SLOT;
 581         }
 582         q->tail = NULL;
 583
 584         while ((skb = __skb_dequeue(&list)) != NULL) {
 585                 unsigned int hash = sfq_hash(q, skb);
 586                 sfq_index x = q->ht[hash];
 587
 588                 slot = &q->slots[x];
 589                 if (x == SFQ_EMPTY_SLOT) {
 590                         x = q->dep[0].next; /* get a free slot */
 591                         if (x >= SFQ_MAX_FLOWS) {
 592 drop:                           sch->qstats.backlog -= qdisc_pkt_len(skb);
 593                                 kfree_skb(skb);
 594                                 dropped++;
 595                                 continue;
 596                         }
 597                         q->ht[hash] = x;
 598                         slot = &q->slots[x];
 599                         slot->hash = hash;
 600                 }
 601                 if (slot->qlen >= q->maxdepth)
 602                         goto drop;
 603                 slot_queue_add(slot, skb);
 604                 if (q->red_parms)
 605                         slot->vars.qavg = red_calc_qavg(q->red_parms,
 606                                                         &slot->vars,
 607                                                         slot->backlog);
 608                 slot->backlog += qdisc_pkt_len(skb);
 609                 sfq_inc(q, x);
 610                 if (slot->qlen == 1) {          /* The flow is new */
 611                         if (q->tail == NULL) {  /* It is the first flow */
 612                                 slot->next = x;
 613                         } else {
 614                                 slot->next = q->tail->next;
 615                                 q->tail->next = x;
 616                         }
 617                         q->tail = slot;
 618                         slot->allot = q->scaled_quantum;
 619                 }
 620         }
 621         sch->q.qlen -= dropped;
 622         qdisc_tree_decrease_qlen(sch, dropped);
 623 }
 624
 625 static void sfq_perturbation(unsigned long arg)
 626 {
 627         struct Qdisc *sch = (struct Qdisc *)arg;
 628         struct sfq_sched_data *q = qdisc_priv(sch);
 629         spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));
 630
 631         spin_lock(root_lock);
 632         q->perturbation = net_random();
 633         if (!q->filter_list && q->tail)
 634                 sfq_rehash(sch);
 635         spin_unlock(root_lock);
 636
 637         if (q->perturb_period)
 638                 mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
 639 }
 640
 641 static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 642 {
 643         struct sfq_sched_data *q = qdisc_priv(sch);
 644         struct tc_sfq_qopt *ctl = nla_data(opt);
 645         struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
 646         unsigned int qlen;
 647         struct red_parms *p = NULL;
 648
 649         if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
 650                 return -EINVAL;
 651         if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1)))
 652                 ctl_v1 = nla_data(opt);
 653         if (ctl->divisor &&
 654             (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536))
 655                 return -EINVAL;
 656         if (ctl_v1 && ctl_v1->qth_min) {
 657                 p = kmalloc(sizeof(*p), GFP_KERNEL);
 658                 if (!p)
 659                         return -ENOMEM;
 660         }
 661         sch_tree_lock(sch);
 662         if (ctl->quantum) {
 663                 q->quantum = ctl->quantum;
 664                 q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
 665         }
 666         q->perturb_period = ctl->perturb_period * HZ;
 667         if (ctl->flows)
 668                 q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
 669         if (ctl->divisor) {
 670                 q->divisor = ctl->divisor;
 671                 q->maxflows = min_t(u32, q->maxflows, q->divisor);
 672         }
 673         if (ctl_v1) {
 674                 if (ctl_v1->depth)
 675                         q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
 676                 if (p) {
 677                         swap(q->red_parms, p);
 678                         red_set_parms(q->red_parms,
 679                                       ctl_v1->qth_min, ctl_v1->qth_max,
 680                                       ctl_v1->Wlog,
 681                                       ctl_v1->Plog, ctl_v1->Scell_log,
 682                                       NULL,
 683                                       ctl_v1->max_P);
 684                 }
 685                 q->flags = ctl_v1->flags;
 686                 q->headdrop = ctl_v1->headdrop;
 687         }
 688         if (ctl->limit) {
 689                 q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
 690                 q->maxflows = min_t(u32, q->maxflows, q->limit);
 691         }
 692
 693         qlen = sch->q.qlen;
 694         while (sch->q.qlen > q->limit)
 695                 sfq_drop(sch);
 696         qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
 697
 698         del_timer(&q->perturb_timer);
 699         if (q->perturb_period) {
 700                 mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
 701                 q->perturbation = net_random();
 702         }
 703         sch_tree_unlock(sch);
 704         kfree(p);
 705         return 0;
 706 }
 707
 708 static void *sfq_alloc(size_t sz)
 709 {
 710         void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN);
 711
 712         if (!ptr)
 713                 ptr = vmalloc(sz);
 714         return ptr;
 715 }
 716
 717 static void sfq_free(void *addr)
 718 {
 719         if (addr) {
 720                 if (is_vmalloc_addr(addr))
 721                         vfree(addr);
 722                 else
 723                         kfree(addr);
 724         }
 725 }
 726
 727 static void sfq_destroy(struct Qdisc *sch)
 728 {
 729         struct sfq_sched_data *q = qdisc_priv(sch);
 730
 731         tcf_destroy_chain(&q->filter_list);
 732         q->perturb_period = 0;
 733         del_timer_sync(&q->perturb_timer);
 734         sfq_free(q->ht);
 735         sfq_free(q->slots);
 736         kfree(q->red_parms);
 737 }
 738
 739 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 740 {
 741         struct sfq_sched_data *q = qdisc_priv(sch);
 742         int i;
 743
 744         q->perturb_timer.function = sfq_perturbation;
 745         q->perturb_timer.data = (unsigned long)sch;
 746         init_timer_deferrable(&q->perturb_timer);
 747
 748         for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) {
 749                 q->dep[i].next = i + SFQ_MAX_FLOWS;
 750                 q->dep[i].prev = i + SFQ_MAX_FLOWS;
 751         }
 752
 753         q->limit = SFQ_MAX_DEPTH;
 754         q->maxdepth = SFQ_MAX_DEPTH;
 755         q->cur_depth = 0;
 756         q->tail = NULL;
 757         q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
 758         q->maxflows = SFQ_DEFAULT_FLOWS;
 759         q->quantum = psched_mtu(qdisc_dev(sch));
 760         q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
 761         q->perturb_period = 0;
 762         q->perturbation = net_random();
 763
 764         if (opt) {
 765                 int err = sfq_change(sch, opt);
 766                 if (err)
 767                         return err;
 768         }
 769
 770         q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor);
 771         q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows);
 772         if (!q->ht || !q->slots) {
 773                 sfq_destroy(sch);
 774                 return -ENOMEM;
 775         }
 776         for (i = 0; i < q->divisor; i++)
 777                 q->ht[i] = SFQ_EMPTY_SLOT;
 778
 779         for (i = 0; i < q->maxflows; i++) {
 780                 slot_queue_init(&q->slots[i]);
 781                 sfq_link(q, i);
 782         }
 783         if (q->limit >= 1)
 784                 sch->flags |= TCQ_F_CAN_BYPASS;
 785         else
 786                 sch->flags &= ~TCQ_F_CAN_BYPASS;
 787         return 0;
 788 }
 789
 790 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 791 {
 792         struct sfq_sched_data *q = qdisc_priv(sch);
 793         unsigned char *b = skb_tail_pointer(skb);
 794         struct tc_sfq_qopt_v1 opt;
 795         struct red_parms *p = q->red_parms;
 796
 797         memset(&opt, 0, sizeof(opt));
 798         opt.v0.quantum  = q->quantum;
 799         opt.v0.perturb_period = q->perturb_period / HZ;
 800         opt.v0.limit    = q->limit;
 801         opt.v0.divisor  = q->divisor;
 802         opt.v0.flows    = q->maxflows;
 803         opt.depth       = q->maxdepth;
 804         opt.headdrop    = q->headdrop;
 805
 806         if (p) {
 807                 opt.qth_min     = p->qth_min >> p->Wlog;
 808                 opt.qth_max     = p->qth_max >> p->Wlog;
 809                 opt.Wlog        = p->Wlog;
 810                 opt.Plog        = p->Plog;
 811                 opt.Scell_log   = p->Scell_log;
 812                 opt.max_P       = p->max_P;
 813         }
 814         memcpy(&opt.stats, &q->stats, sizeof(opt.stats));
 815         opt.flags       = q->flags;
 816
 817         if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
 818                 goto nla_put_failure;
 819
 820         return skb->len;
 821
 822 nla_put_failure:
 823         nlmsg_trim(skb, b);
 824         return -1;
 825 }
 826
 827 static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
 828 {
 829         return NULL;
 830 }
 831
 832 static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
 833 {
 834         return 0;
 835 }
 836
 837 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
 838                               u32 classid)
 839 {
 840         /* we cannot bypass queue discipline anymore */
 841         sch->flags &= ~TCQ_F_CAN_BYPASS;
 842         return 0;
 843 }
 844
 845 static void sfq_put(struct Qdisc *q, unsigned long cl)
 846 {
 847 }
 848
 849 static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
 850 {
 851         struct sfq_sched_data *q = qdisc_priv(sch);
 852
 853         if (cl)
 854                 return NULL;
 855         return &q->filter_list;
 856 }
 857
 858 static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
 859                           struct sk_buff *skb, struct tcmsg *tcm)
 860 {
 861         tcm->tcm_handle |= TC_H_MIN(cl);
 862         return 0;
 863 }
 864
 865 static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
 866                                 struct gnet_dump *d)
 867 {
 868         struct sfq_sched_data *q = qdisc_priv(sch);
 869         sfq_index idx = q->ht[cl - 1];
 870         struct gnet_stats_queue qs = { 0 };
 871         struct tc_sfq_xstats xstats = { 0 };
 872
 873         if (idx != SFQ_EMPTY_SLOT) {
 874                 const struct sfq_slot *slot = &q->slots[idx];
 875
 876                 xstats.allot = slot->allot << SFQ_ALLOT_SHIFT;
 877                 qs.qlen = slot->qlen;
 878                 qs.backlog = slot->backlog;
 879         }
 880         if (gnet_stats_copy_queue(d, &qs) < 0)
 881                 return -1;
 882         return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
 883 }
 884
 885 static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 886 {
 887         struct sfq_sched_data *q = qdisc_priv(sch);
 888         unsigned int i;
 889
 890         if (arg->stop)
 891                 return;
 892
 893         for (i = 0; i < q->divisor; i++) {
 894                 if (q->ht[i] == SFQ_EMPTY_SLOT ||
 895                     arg->count < arg->skip) {
 896                         arg->count++;
 897                         continue;
 898                 }
 899                 if (arg->fn(sch, i + 1, arg) < 0) {
 900                         arg->stop = 1;
 901                         break;
 902                 }
 903                 arg->count++;
 904         }
 905 }
 906
 907 static const struct Qdisc_class_ops sfq_class_ops = {
 908         .leaf           =       sfq_leaf,
 909         .get            =       sfq_get,
 910         .put            =       sfq_put,
 911         .tcf_chain      =       sfq_find_tcf,
 912         .bind_tcf       =       sfq_bind,
 913         .unbind_tcf     =       sfq_put,
 914         .dump           =       sfq_dump_class,
 915         .dump_stats     =       sfq_dump_class_stats,
 916         .walk           =       sfq_walk,
 917 };
 918
 919 static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
 920         .cl_ops         =       &sfq_class_ops,
 921         .id             =       "sfq",
 922         .priv_size      =       sizeof(struct sfq_sched_data),
 923         .enqueue        =       sfq_enqueue,
 924         .dequeue        =       sfq_dequeue,
 925         .peek           =       qdisc_peek_dequeued,
 926         .drop           =       sfq_drop,
 927         .init           =       sfq_init,
 928         .reset          =       sfq_reset,
 929         .destroy        =       sfq_destroy,
 930         .change         =       NULL,
 931         .dump           =       sfq_dump,
 932         .owner          =       THIS_MODULE,
 933 };
 934
 935 static int __init sfq_module_init(void)
 936 {
 937         return register_qdisc(&sfq_qdisc_ops);
 938 }
 939 static void __exit sfq_module_exit(void)
 940 {
 941         unregister_qdisc(&sfq_qdisc_ops);
 942 }
 943 module_init(sfq_module_init)
 944 module_exit(sfq_module_exit)
 945 MODULE_LICENSE("GPL");