Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wirel...
[linux-drm-fsl-dcu.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37
38 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
39                         struct nlmsghdr *n, u32 clid,
40                         struct Qdisc *old, struct Qdisc *new);
41 static int tclass_notify(struct net *net, struct sk_buff *oskb,
42                          struct nlmsghdr *n, struct Qdisc *q,
43                          unsigned long cl, int event);
44
45 /*
46
47    Short review.
48    -------------
49
50    This file consists of two interrelated parts:
51
52    1. queueing disciplines manager frontend.
53    2. traffic classes manager frontend.
54
55    Generally, queueing discipline ("qdisc") is a black box,
56    which is able to enqueue packets and to dequeue them (when
57    device is ready to send something) in order and at times
58    determined by algorithm hidden in it.
59
60    qdiscs are divided into two categories:
61    - "queues", which have no internal structure visible from outside.
62    - "schedulers", which split all the packets to "traffic classes",
63      using "packet classifiers" (look at cls_api.c)
64
65    In turn, classes may have child qdiscs (as a rule, queues)
66    attached to them etc. etc. etc.
67
68    The goal of the routines in this file is to translate
69    information supplied by user in the form of handles
70    to more intelligible for kernel form, to make some sanity
71    checks and part of work, which is common to all qdiscs
72    and to provide rtnetlink notifications.
73
74    All real intelligent work is done inside qdisc modules.
75
76
77
78    Every discipline has two major routines: enqueue and dequeue.
79
80    ---dequeue
81
82    dequeue usually returns a skb to send. It is allowed to return NULL,
83    but it does not mean that queue is empty, it just means that
84    discipline does not want to send anything this time.
85    Queue is really empty if q->q.qlen == 0.
86    For complicated disciplines with multiple queues q->q is not
87    real packet queue, but however q->q.qlen must be valid.
88
89    ---enqueue
90
91    enqueue returns 0 if the packet was enqueued successfully.
92    If the packet (this one or another one) was dropped, it returns
93    a non-zero error code.
94    NET_XMIT_DROP        - this packet dropped
95      Expected action: do not backoff, but wait until queue will clear.
96    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
97      Expected action: backoff or ignore
98    NET_XMIT_POLICED     - dropped by police.
99      Expected action: backoff or error to real-time apps.
100
101    Auxiliary routines:
102
103    ---peek
104
105    like dequeue but without removing a packet from the queue
106
107    ---reset
108
109    returns qdisc to initial state: purge all buffers, clear all
110    timers, counters (except for statistics) etc.
111
112    ---init
113
114    initializes newly created qdisc.
115
116    ---destroy
117
118    destroys resources allocated by init and during lifetime of qdisc.
119
120    ---change
121
122    changes qdisc parameters.
123  */
124
125 /* Protects list of registered TC modules. It is pure SMP lock. */
126 static DEFINE_RWLOCK(qdisc_mod_lock);
127
128
129 /************************************************
130  *      Queueing disciplines manipulation.      *
131  ************************************************/
132
133
134 /* The list of all installed queueing disciplines. */
135
136 static struct Qdisc_ops *qdisc_base;
137
138 /* Register/unregister queueing discipline */
139
140 int register_qdisc(struct Qdisc_ops *qops)
141 {
142         struct Qdisc_ops *q, **qp;
143         int rc = -EEXIST;
144
145         write_lock(&qdisc_mod_lock);
146         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
147                 if (!strcmp(qops->id, q->id))
148                         goto out;
149
150         if (qops->enqueue == NULL)
151                 qops->enqueue = noop_qdisc_ops.enqueue;
152         if (qops->peek == NULL) {
153                 if (qops->dequeue == NULL)
154                         qops->peek = noop_qdisc_ops.peek;
155                 else
156                         goto out_einval;
157         }
158         if (qops->dequeue == NULL)
159                 qops->dequeue = noop_qdisc_ops.dequeue;
160
161         if (qops->cl_ops) {
162                 const struct Qdisc_class_ops *cops = qops->cl_ops;
163
164                 if (!(cops->get && cops->put && cops->walk && cops->leaf))
165                         goto out_einval;
166
167                 if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
168                         goto out_einval;
169         }
170
171         qops->next = NULL;
172         *qp = qops;
173         rc = 0;
174 out:
175         write_unlock(&qdisc_mod_lock);
176         return rc;
177
178 out_einval:
179         rc = -EINVAL;
180         goto out;
181 }
182 EXPORT_SYMBOL(register_qdisc);
183
184 int unregister_qdisc(struct Qdisc_ops *qops)
185 {
186         struct Qdisc_ops *q, **qp;
187         int err = -ENOENT;
188
189         write_lock(&qdisc_mod_lock);
190         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
191                 if (q == qops)
192                         break;
193         if (q) {
194                 *qp = q->next;
195                 q->next = NULL;
196                 err = 0;
197         }
198         write_unlock(&qdisc_mod_lock);
199         return err;
200 }
201 EXPORT_SYMBOL(unregister_qdisc);
202
/* Get default qdisc if not otherwise specified */
void qdisc_get_default(char *name, size_t len)
{
        /* qdisc_mod_lock guards default_qdisc_ops against a concurrent
         * qdisc_set_default().
         */
        read_lock(&qdisc_mod_lock);
        strlcpy(name, default_qdisc_ops->id, len);
        read_unlock(&qdisc_mod_lock);
}
210
211 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
212 {
213         struct Qdisc_ops *q = NULL;
214
215         for (q = qdisc_base; q; q = q->next) {
216                 if (!strcmp(name, q->id)) {
217                         if (!try_module_get(q->owner))
218                                 q = NULL;
219                         break;
220                 }
221         }
222
223         return q;
224 }
225
/* Set new default qdisc to use.
 *
 * Requires CAP_NET_ADMIN.  If the named qdisc is not registered, the
 * lock is dropped to try a module load, then the lookup is retried.
 * Returns 0 on success, -EPERM or -ENOENT on failure.
 */
int qdisc_set_default(const char *name)
{
        const struct Qdisc_ops *ops;

        if (!capable(CAP_NET_ADMIN))
                return -EPERM;

        write_lock(&qdisc_mod_lock);
        ops = qdisc_lookup_default(name);
        if (!ops) {
                /* Not found, drop lock and try to load module */
                write_unlock(&qdisc_mod_lock);
                request_module("sch_%s", name);
                write_lock(&qdisc_mod_lock);

                ops = qdisc_lookup_default(name);
        }

        if (ops) {
                /* Set new default: the lookup took a reference on the new
                 * ops' module; release the old default's reference.
                 */
                module_put(default_qdisc_ops->owner);
                default_qdisc_ops = ops;
        }
        write_unlock(&qdisc_mod_lock);

        return ops ? 0 : -ENOENT;
}
254
255 /* We know handle. Find qdisc among all qdisc's attached to device
256    (root qdisc, all its children, children of children etc.)
257  */
258
259 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
260 {
261         struct Qdisc *q;
262
263         if (!(root->flags & TCQ_F_BUILTIN) &&
264             root->handle == handle)
265                 return root;
266
267         list_for_each_entry(q, &root->list, list) {
268                 if (q->handle == handle)
269                         return q;
270         }
271         return NULL;
272 }
273
/* Link a qdisc onto its device root's list so qdisc_match_from_root()
 * can find it by handle.  Root and ingress qdiscs are not listed: the
 * root anchors the list itself and ingress is reached separately.
 */
void qdisc_list_add(struct Qdisc *q)
{
        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
                list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
}
EXPORT_SYMBOL(qdisc_list_add);
280
/* Remove a qdisc from the per-device list; mirror of qdisc_list_add(),
 * so root and ingress qdiscs (which were never listed) are skipped.
 */
void qdisc_list_del(struct Qdisc *q)
{
        if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
                list_del(&q->list);
}
EXPORT_SYMBOL(qdisc_list_del);
287
288 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
289 {
290         struct Qdisc *q;
291
292         q = qdisc_match_from_root(dev->qdisc, handle);
293         if (q)
294                 goto out;
295
296         if (dev_ingress_queue(dev))
297                 q = qdisc_match_from_root(
298                         dev_ingress_queue(dev)->qdisc_sleeping,
299                         handle);
300 out:
301         return q;
302 }
303
304 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
305 {
306         unsigned long cl;
307         struct Qdisc *leaf;
308         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
309
310         if (cops == NULL)
311                 return NULL;
312         cl = cops->get(p, classid);
313
314         if (cl == 0)
315                 return NULL;
316         leaf = cops->leaf(p, cl);
317         cops->put(p, cl);
318         return leaf;
319 }
320
321 /* Find queueing discipline by name */
322
323 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
324 {
325         struct Qdisc_ops *q = NULL;
326
327         if (kind) {
328                 read_lock(&qdisc_mod_lock);
329                 for (q = qdisc_base; q; q = q->next) {
330                         if (nla_strcmp(kind, q->id) == 0) {
331                                 if (!try_module_get(q->owner))
332                                         q = NULL;
333                                 break;
334                         }
335                 }
336                 read_unlock(&qdisc_mod_lock);
337         }
338         return q;
339 }
340
341 /* The linklayer setting were not transferred from iproute2, in older
342  * versions, and the rate tables lookup systems have been dropped in
343  * the kernel. To keep backward compatible with older iproute2 tc
344  * utils, we detect the linklayer setting by detecting if the rate
345  * table were modified.
346  *
347  * For linklayer ATM table entries, the rate table will be aligned to
348  * 48 bytes, thus some table entries will contain the same value.  The
349  * mpu (min packet unit) is also encoded into the old rate table, thus
350  * starting from the mpu, we find low and high table entries for
351  * mapping this cell.  If these entries contain the same value, when
352  * the rate tables have been modified for linklayer ATM.
353  *
354  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
355  * and then roundup to the next cell, calc the table entry one below,
356  * and compare.
357  */
static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
{
        int low       = roundup(r->mpu, 48);    /* 48-byte cell at/above mpu */
        int high      = roundup(low+1, 48);     /* the next cell up */
        int cell_low  = low >> r->cell_log;
        int cell_high = (high >> r->cell_log) - 1;

        /* rtab is too inaccurate at rates > 100Mbit/s */
        if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
                pr_debug("TC linklayer: Giving up ATM detection\n");
                return TC_LINKLAYER_ETHERNET;
        }

        /* Equal entries for two distinct cells means the table was
         * ATM-aligned (see the block comment above this function).
         */
        if ((cell_high > cell_low) && (cell_high < 256)
            && (rtab[cell_low] == rtab[cell_high])) {
                pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
                         cell_low, cell_high, rtab[cell_high]);
                return TC_LINKLAYER_ATM;
        }
        return TC_LINKLAYER_ETHERNET;
}
379
380 static struct qdisc_rate_table *qdisc_rtab_list;
381
/* Look up or create a shared rate table for spec @r and the 1024-byte
 * table blob carried in @tab.  Tables are reference counted and shared
 * among qdiscs with identical spec and data.  Returns NULL on invalid
 * input or allocation failure.
 */
struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
        struct qdisc_rate_table *rtab;

        if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
            nla_len(tab) != TC_RTAB_SIZE)
                return NULL;

        /* Reuse an existing table with identical spec and data. */
        for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
                if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
                    !memcmp(&rtab->data, nla_data(tab), 1024)) {
                        rtab->refcnt++;
                        return rtab;
                }
        }

        rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
        if (rtab) {
                rtab->rate = *r;
                rtab->refcnt = 1;
                memcpy(rtab->data, nla_data(tab), 1024);
                /* Old iproute2 did not fill in linklayer; infer it from
                 * the table contents (see __detect_linklayer()).
                 */
                if (r->linklayer == TC_LINKLAYER_UNAWARE)
                        r->linklayer = __detect_linklayer(r, rtab->data);
                rtab->next = qdisc_rtab_list;
                qdisc_rtab_list = rtab;
        }
        return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);
411
412 void qdisc_put_rtab(struct qdisc_rate_table *tab)
413 {
414         struct qdisc_rate_table *rtab, **rtabp;
415
416         if (!tab || --tab->refcnt)
417                 return;
418
419         for (rtabp = &qdisc_rtab_list;
420              (rtab = *rtabp) != NULL;
421              rtabp = &rtab->next) {
422                 if (rtab == tab) {
423                         *rtabp = rtab->next;
424                         kfree(rtab);
425                         return;
426                 }
427         }
428 }
429 EXPORT_SYMBOL(qdisc_put_rtab);
430
431 static LIST_HEAD(qdisc_stab_list);
432 static DEFINE_SPINLOCK(qdisc_stab_lock);
433
434 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
435         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
436         [TCA_STAB_DATA] = { .type = NLA_BINARY },
437 };
438
/* Parse a TCA_STAB nest into a (possibly shared) size table.
 *
 * Size tables map real packet lengths to scheduling lengths (see
 * __qdisc_calculate_pkt_len()).  Identical tables are shared via a
 * refcount under qdisc_stab_lock.  Returns a table or an ERR_PTR().
 */
static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
        struct nlattr *tb[TCA_STAB_MAX + 1];
        struct qdisc_size_table *stab;
        struct tc_sizespec *s;
        unsigned int tsize = 0;
        u16 *tab = NULL;
        int err;

        err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
        if (err < 0)
                return ERR_PTR(err);
        if (!tb[TCA_STAB_BASE])
                return ERR_PTR(-EINVAL);

        s = nla_data(tb[TCA_STAB_BASE]);

        if (s->tsize > 0) {
                if (!tb[TCA_STAB_DATA])
                        return ERR_PTR(-EINVAL);
                tab = nla_data(tb[TCA_STAB_DATA]);
                tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
        }

        /* The data attribute must carry exactly s->tsize u16 entries. */
        if (tsize != s->tsize || (!tab && tsize > 0))
                return ERR_PTR(-EINVAL);

        spin_lock(&qdisc_stab_lock);

        /* Share an existing table with identical spec and data. */
        list_for_each_entry(stab, &qdisc_stab_list, list) {
                if (memcmp(&stab->szopts, s, sizeof(*s)))
                        continue;
                if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
                        continue;
                stab->refcnt++;
                spin_unlock(&qdisc_stab_lock);
                return stab;
        }

        spin_unlock(&qdisc_stab_lock);

        /* No match: allocate outside the spinlock (GFP_KERNEL may sleep)... */
        stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
        if (!stab)
                return ERR_PTR(-ENOMEM);

        stab->refcnt = 1;
        stab->szopts = *s;
        if (tsize > 0)
                memcpy(stab->data, tab, tsize * sizeof(u16));

        /* ...then publish the new table on the global list. */
        spin_lock(&qdisc_stab_lock);
        list_add_tail(&stab->list, &qdisc_stab_list);
        spin_unlock(&qdisc_stab_lock);

        return stab;
}
495
/* RCU callback: free a size table once all readers are done with it. */
static void stab_kfree_rcu(struct rcu_head *head)
{
        kfree(container_of(head, struct qdisc_size_table, rcu));
}
500
/* Drop a reference to a size table.  On the last put the table is
 * unlinked from the global list and freed after a BH RCU grace period
 * (via call_rcu_bh()).  NULL is a no-op.
 */
void qdisc_put_stab(struct qdisc_size_table *tab)
{
        if (!tab)
                return;

        spin_lock(&qdisc_stab_lock);

        if (--tab->refcnt == 0) {
                list_del(&tab->list);
                call_rcu_bh(&tab->rcu, stab_kfree_rcu);
        }

        spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);
516
/* Dump a size table's parameters into a TCA_STAB nest on @skb.
 * Returns skb->len on success, -1 if the skb ran out of room.
 * Note only szopts is dumped, not the table data itself.
 */
static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
        struct nlattr *nest;

        nest = nla_nest_start(skb, TCA_STAB);
        if (nest == NULL)
                goto nla_put_failure;
        if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
                goto nla_put_failure;
        nla_nest_end(skb, nest);

        return skb->len;

nla_put_failure:
        return -1;
}
533
/* Compute the qdisc-visible packet length from a size table.
 *
 * Adds the configured overhead, indexes the table by the cell-aligned,
 * cell_log-shifted length, linearly extrapolates past the last entry,
 * scales by size_log and stores the result (clamped to >= 1) into
 * qdisc_skb_cb(skb)->pkt_len.
 */
void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
{
        int pkt_len, slot;

        pkt_len = skb->len + stab->szopts.overhead;
        if (unlikely(!stab->szopts.tsize))
                goto out;

        slot = pkt_len + stab->szopts.cell_align;
        if (unlikely(slot < 0))
                slot = 0;

        slot >>= stab->szopts.cell_log;
        if (likely(slot < stab->szopts.tsize))
                pkt_len = stab->data[slot];
        else
                /* Beyond the table: extrapolate from the last entry. */
                pkt_len = stab->data[stab->szopts.tsize - 1] *
                                (slot / stab->szopts.tsize) +
                                stab->data[slot % stab->szopts.tsize];

        pkt_len <<= stab->szopts.size_log;
out:
        if (unlikely(pkt_len < 1))
                pkt_len = 1;
        qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
561
562 void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
563 {
564         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
565                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
566                         txt, qdisc->ops->id, qdisc->handle >> 16);
567                 qdisc->flags |= TCQ_F_WARN_NONWC;
568         }
569 }
570 EXPORT_SYMBOL(qdisc_warn_nonwc);
571
/* hrtimer callback: the throttle deadline has passed, so clear the
 * qdisc's throttled state and reschedule its root for transmission.
 */
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
        struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
                                                 timer);

        qdisc_unthrottled(wd->qdisc);
        __netif_schedule(qdisc_root(wd->qdisc));

        return HRTIMER_NORESTART;
}
582
/* Initialize a watchdog for @qdisc: an absolute-time monotonic hrtimer
 * firing qdisc_watchdog().
 */
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
        hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
        wd->timer.function = qdisc_watchdog;
        wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);
590
/* Arm the watchdog to fire at absolute time @expires (nanoseconds) and
 * mark the qdisc throttled.  Skipped entirely while the root qdisc is
 * deactivated, so a dying device does not re-arm timers.
 */
void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
{
        if (test_bit(__QDISC_STATE_DEACTIVATED,
                     &qdisc_root_sleeping(wd->qdisc)->state))
                return;

        qdisc_throttled(wd->qdisc);

        hrtimer_start(&wd->timer,
                      ns_to_ktime(expires),
                      HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
604
/* Stop a pending watchdog timer and clear the throttled state. */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
        hrtimer_cancel(&wd->timer);
        qdisc_unthrottled(wd->qdisc);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
611
612 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
613 {
614         unsigned int size = n * sizeof(struct hlist_head), i;
615         struct hlist_head *h;
616
617         if (size <= PAGE_SIZE)
618                 h = kmalloc(size, GFP_KERNEL);
619         else
620                 h = (struct hlist_head *)
621                         __get_free_pages(GFP_KERNEL, get_order(size));
622
623         if (h != NULL) {
624                 for (i = 0; i < n; i++)
625                         INIT_HLIST_HEAD(&h[i]);
626         }
627         return h;
628 }
629
630 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
631 {
632         unsigned int size = n * sizeof(struct hlist_head);
633
634         if (size <= PAGE_SIZE)
635                 kfree(h);
636         else
637                 free_pages((unsigned long)h, get_order(size));
638 }
639
/* Double the class hash table when its load factor exceeds 0.75.
 * The new bucket array is allocated outside the qdisc tree lock; the
 * rehash and table swap run under sch_tree_lock() so readers never
 * observe a half-migrated table.  Allocation failure leaves the old
 * table in place (growing is best-effort).
 */
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
        struct Qdisc_class_common *cl;
        struct hlist_node *next;
        struct hlist_head *nhash, *ohash;
        unsigned int nsize, nmask, osize;
        unsigned int i, h;

        /* Rehash when load factor exceeds 0.75 */
        if (clhash->hashelems * 4 <= clhash->hashsize * 3)
                return;
        nsize = clhash->hashsize * 2;
        nmask = nsize - 1;
        nhash = qdisc_class_hash_alloc(nsize);
        if (nhash == NULL)
                return;

        ohash = clhash->hash;
        osize = clhash->hashsize;

        sch_tree_lock(sch);
        for (i = 0; i < osize; i++) {
                /* _safe variant: each entry is moved to the new table. */
                hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
                        h = qdisc_class_hash(cl->classid, nmask);
                        hlist_add_head(&cl->hnode, &nhash[h]);
                }
        }
        clhash->hash     = nhash;
        clhash->hashsize = nsize;
        clhash->hashmask = nmask;
        sch_tree_unlock(sch);

        qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);
675
676 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
677 {
678         unsigned int size = 4;
679
680         clhash->hash = qdisc_class_hash_alloc(size);
681         if (clhash->hash == NULL)
682                 return -ENOMEM;
683         clhash->hashsize  = size;
684         clhash->hashmask  = size - 1;
685         clhash->hashelems = 0;
686         return 0;
687 }
688 EXPORT_SYMBOL(qdisc_class_hash_init);
689
/* Release the bucket array; the class objects themselves are untouched. */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
        qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
695
696 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
697                              struct Qdisc_class_common *cl)
698 {
699         unsigned int h;
700
701         INIT_HLIST_NODE(&cl->hnode);
702         h = qdisc_class_hash(cl->classid, clhash->hashmask);
703         hlist_add_head(&cl->hnode, &clhash->hash[h]);
704         clhash->hashelems++;
705 }
706 EXPORT_SYMBOL(qdisc_class_hash_insert);
707
/* Unlink class @cl from its bucket and update the element count.
 * NOTE(review): no locking here — callers appear to serialize hash
 * updates themselves (cf. sch_tree_lock() use in qdisc_class_hash_grow()).
 */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
                             struct Qdisc_class_common *cl)
{
        hlist_del(&cl->hnode);
        clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
715
/* Allocate an unique handle from space managed by kernel
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
        int i = 0x8000;         /* try at most the full handle space */
        static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

        do {
                autohandle += TC_H_MAKE(0x10000U, 0);
                /* Skip the reserved TC_H_ROOT value by wrapping around. */
                if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
                        autohandle = TC_H_MAKE(0x80000000U, 0);
                /* A handle not in use on this device is ours. */
                if (!qdisc_lookup(dev, autohandle))
                        return autohandle;
                cond_resched();
        } while (--i > 0);

        /* Space exhausted: no free handle on this device. */
        return 0;
}
735
/* Propagate a queue-length decrease of @n packets up the qdisc tree.
 *
 * Each ancestor's q.qlen is reduced and its drop counter bumped by the
 * same amount; classful ancestors with a qlen_notify hook are informed
 * (presumably so they can deactivate now-empty classes — see the
 * individual qdisc implementations).  Stops at the root or at ingress.
 */
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
        const struct Qdisc_class_ops *cops;
        unsigned long cl;
        u32 parentid;
        int drops;

        if (n == 0)
                return;
        drops = max_t(int, n, 0);
        while ((parentid = sch->parent)) {
                /* Ingress qdiscs have no meaningful parent qlen. */
                if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
                        return;

                sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
                if (sch == NULL) {
                        WARN_ON(parentid != TC_H_ROOT);
                        return;
                }
                cops = sch->ops->cl_ops;
                if (cops->qlen_notify) {
                        cl = cops->get(sch, parentid);
                        cops->qlen_notify(sch, cl);
                        cops->put(sch, cl);
                }
                sch->q.qlen -= n;
                sch->qstats.drops += drops;
        }
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
766
767 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
768                                struct nlmsghdr *n, u32 clid,
769                                struct Qdisc *old, struct Qdisc *new)
770 {
771         if (new || old)
772                 qdisc_notify(net, skb, n, clid, old, new);
773
774         if (old)
775                 qdisc_destroy(old);
776 }
777
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
                       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
                       struct Qdisc *new, struct Qdisc *old)
{
        struct Qdisc *q = old;
        struct net *net = dev_net(dev);
        int err = 0;

        if (parent == NULL) {
                /* Grafting at the device root (egress) or at ingress. */
                unsigned int i, num_q, ingress;

                ingress = 0;
                num_q = dev->num_tx_queues;
                if ((q && q->flags & TCQ_F_INGRESS) ||
                    (new && new->flags & TCQ_F_INGRESS)) {
                        /* Ingress is a single queue. */
                        num_q = 1;
                        ingress = 1;
                        if (!dev_ingress_queue(dev))
                                return -ENOENT;
                }

                /* Quiesce the device while the qdisc tree is swapped. */
                if (dev->flags & IFF_UP)
                        dev_deactivate(dev);

                /* Multiqueue-aware qdiscs attach per-queue themselves,
                 * so skip the per-queue grafting loop below.
                 */
                if (new && new->ops->attach) {
                        new->ops->attach(new);
                        num_q = 0;
                }

                for (i = 0; i < num_q; i++) {
                        struct netdev_queue *dev_queue = dev_ingress_queue(dev);

                        if (!ingress)
                                dev_queue = netdev_get_tx_queue(dev, i);

                        old = dev_graft_qdisc(dev_queue, new);
                        /* One reference per tx queue beyond the first. */
                        if (new && i > 0)
                                atomic_inc(&new->refcnt);

                        if (!ingress)
                                qdisc_destroy(old);
                }

                if (!ingress) {
                        notify_and_destroy(net, skb, n, classid,
                                           dev->qdisc, new);
                        if (new && !new->ops->attach)
                                atomic_inc(&new->refcnt);
                        /* No new qdisc means fall back to noop. */
                        dev->qdisc = new ? : &noop_qdisc;
                } else {
                        notify_and_destroy(net, skb, n, classid, old, new);
                }

                if (dev->flags & IFF_UP)
                        dev_activate(dev);
        } else {
                /* Grafting into a class of a classful parent qdisc. */
                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

                err = -EOPNOTSUPP;
                if (cops && cops->graft) {
                        unsigned long cl = cops->get(parent, classid);
                        if (cl) {
                                err = cops->graft(parent, cl, new, &old);
                                cops->put(parent, cl);
                        } else
                                err = -ENOENT;
                }
                if (!err)
                        notify_and_destroy(net, skb, n, classid, old, new);
        }
        return err;
}
859
860 /* lockdep annotation is needed for ingress; egress gets it only for name */
861 static struct lock_class_key qdisc_tx_lock;
862 static struct lock_class_key qdisc_rx_lock;
863
864 /*
865    Allocate and initialize new qdisc.
866
867    Parameters are passed via opt.
868  */
869
870 static struct Qdisc *
871 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
872              struct Qdisc *p, u32 parent, u32 handle,
873              struct nlattr **tca, int *errp)
874 {
875         int err;
876         struct nlattr *kind = tca[TCA_KIND];
877         struct Qdisc *sch;
878         struct Qdisc_ops *ops;
879         struct qdisc_size_table *stab;
880
881         ops = qdisc_lookup_ops(kind);
882 #ifdef CONFIG_MODULES
883         if (ops == NULL && kind != NULL) {
884                 char name[IFNAMSIZ];
885                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
886                         /* We dropped the RTNL semaphore in order to
887                          * perform the module load.  So, even if we
888                          * succeeded in loading the module we have to
889                          * tell the caller to replay the request.  We
890                          * indicate this using -EAGAIN.
891                          * We replay the request because the device may
892                          * go away in the mean time.
893                          */
894                         rtnl_unlock();
895                         request_module("sch_%s", name);
896                         rtnl_lock();
897                         ops = qdisc_lookup_ops(kind);
898                         if (ops != NULL) {
899                                 /* We will try again qdisc_lookup_ops,
900                                  * so don't keep a reference.
901                                  */
902                                 module_put(ops->owner);
903                                 err = -EAGAIN;
904                                 goto err_out;
905                         }
906                 }
907         }
908 #endif
909
910         err = -ENOENT;
911         if (ops == NULL)
912                 goto err_out;
913
914         sch = qdisc_alloc(dev_queue, ops);
915         if (IS_ERR(sch)) {
916                 err = PTR_ERR(sch);
917                 goto err_out2;
918         }
919
920         sch->parent = parent;
921
922         if (handle == TC_H_INGRESS) {
923                 sch->flags |= TCQ_F_INGRESS;
924                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
925                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
926         } else {
927                 if (handle == 0) {
928                         handle = qdisc_alloc_handle(dev);
929                         err = -ENOMEM;
930                         if (handle == 0)
931                                 goto err_out3;
932                 }
933                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
934                 if (!netif_is_multiqueue(dev))
935                         sch->flags |= TCQ_F_ONETXQUEUE;
936         }
937
938         sch->handle = handle;
939
940         if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
941                 if (tca[TCA_STAB]) {
942                         stab = qdisc_get_stab(tca[TCA_STAB]);
943                         if (IS_ERR(stab)) {
944                                 err = PTR_ERR(stab);
945                                 goto err_out4;
946                         }
947                         rcu_assign_pointer(sch->stab, stab);
948                 }
949                 if (tca[TCA_RATE]) {
950                         spinlock_t *root_lock;
951
952                         err = -EOPNOTSUPP;
953                         if (sch->flags & TCQ_F_MQROOT)
954                                 goto err_out4;
955
956                         if ((sch->parent != TC_H_ROOT) &&
957                             !(sch->flags & TCQ_F_INGRESS) &&
958                             (!p || !(p->flags & TCQ_F_MQROOT)))
959                                 root_lock = qdisc_root_sleeping_lock(sch);
960                         else
961                                 root_lock = qdisc_lock(sch);
962
963                         err = gen_new_estimator(&sch->bstats, &sch->rate_est,
964                                                 root_lock, tca[TCA_RATE]);
965                         if (err)
966                                 goto err_out4;
967                 }
968
969                 qdisc_list_add(sch);
970
971                 return sch;
972         }
973 err_out3:
974         dev_put(dev);
975         kfree((char *) sch - sch->padded);
976 err_out2:
977         module_put(ops->owner);
978 err_out:
979         *errp = err;
980         return NULL;
981
982 err_out4:
983         /*
984          * Any broken qdiscs that would require a ops->reset() here?
985          * The qdisc was never in action so it shouldn't be necessary.
986          */
987         qdisc_put_stab(rtnl_dereference(sch->stab));
988         if (ops->destroy)
989                 ops->destroy(sch);
990         goto err_out3;
991 }
992
/* Apply a change request to an existing qdisc: new options, a new size
 * table and/or a new rate estimator.  Called under RTNL.
 * Returns 0 on success or a negative errno; a failed ->change leaves
 * the old size table and estimator untouched.
 */
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		/* A qdisc without a ->change op cannot be reconfigured */
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	/* Publish the new size table (NULL if none was given) before
	 * releasing the old one; readers dereference sch->stab under RCU,
	 * so the assignment order here matters.
	 */
	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
					    qdisc_root_sleeping_lock(sch),
					    tca[TCA_RATE]);
	}
out:
	return 0;
}
1028
/* Walker state for loop detection: p is the qdisc being grafted,
 * depth is the current recursion level (bounded in check_loop_fn).
 */
struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;
	int			depth;
};
1034
1035 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
1036
1037 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1038 {
1039         struct check_loop_arg   arg;
1040
1041         if (q->ops->cl_ops == NULL)
1042                 return 0;
1043
1044         arg.w.stop = arg.w.skip = arg.w.count = 0;
1045         arg.w.fn = check_loop_fn;
1046         arg.depth = depth;
1047         arg.p = p;
1048         q->ops->cl_ops->walk(q, &arg.w);
1049         return arg.w.stop ? -ELOOP : 0;
1050 }
1051
1052 static int
1053 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1054 {
1055         struct Qdisc *leaf;
1056         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1057         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1058
1059         leaf = cops->leaf(q, cl);
1060         if (leaf) {
1061                 if (leaf == arg->p || arg->depth > 7)
1062                         return -ELOOP;
1063                 return check_loop(leaf, arg->p, arg->depth + 1);
1064         }
1065         return 0;
1066 }
1067
1068 /*
1069  * Delete/get qdisc.
1070  */
1071
/* RTM_DELQDISC / RTM_GETQDISC handler.  Locates the target qdisc either
 * through its parent classid (tcm_parent) or directly by handle, then
 * either grafts it away (delete) or reports its state (get).
 * Called under RTNL.
 */
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	/* Deleting modifies state; only privileged callers may do it */
	if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				/* q is the child attached at class clid of p */
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q)
			return -ENOENT;

		/* If a handle was also supplied it must match the leaf */
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		/* No parent given: address the qdisc directly by handle */
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		/* handle == 0 means a builtin/default qdisc: not deletable */
		if (q->handle == 0)
			return -ENOENT;
		/* Grafting NULL in q's place detaches and destroys it;
		 * qdisc_graft() also emits the netlink notification.
		 */
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}
1135
1136 /*
1137  * Create/change qdisc.
1138  */
1139
/* RTM_NEWQDISC handler: create a new qdisc, replace an existing one, or
 * change its parameters, depending on tcm_parent/tcm_handle and the
 * NLM_F_CREATE/REPLACE/EXCL flags.  May drop RTNL to load a scheduler
 * module, in which case qdisc_create() returns -EAGAIN and the whole
 * request is replayed from scratch.  Called under RTNL.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				/* p is the parent qdisc, q its current child
				 * at class clid (may be NULL).
				 */
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				/* Caller named a specific handle: refuse to
				 * displace a different child unless REPLACE.
				 */
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				/* Moving q under p must not create a cycle */
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know, that some child q is already
				 *   attached to this parent and have choice:
				 *   either to change it or to create/graft new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, requestor wanted to say,
				 *   that qdisc tcm_handle is not expected
				 *   to exist, so that we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of hole in API, we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft, if
				 *   user gave KIND, which does not match existing.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags & NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev))
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err);
		else
			err = -ENOENT;
	} else {
		struct netdev_queue *dev_queue;

		/* Let a classful parent pick the tx queue if it can */
		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	}
	if (q == NULL) {
		/* -EAGAIN: RTNL was dropped for module load; replay */
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}
1293
/* Fill one netlink message describing qdisc @q (kind, options, size
 * table and statistics).  Returns skb->len on success or -1, in which
 * case everything appended here is trimmed off again so the skb is left
 * exactly as it was on entry.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	/* Remember the tail so a partial message can be trimmed on failure */
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	/* tcm_info carries the refcount in qdisc dumps */
	tcm->tcm_info = atomic_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	/* Refresh the cached queue length before dumping stats */
	q->qstats.qlen = q->q.qlen;

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1347
1348 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1349 {
1350         return (q->flags & TCQ_F_BUILTIN) ? true : false;
1351 }
1352
/* Send an RTNLGRP_TC notification describing a qdisc change: a
 * RTM_DELQDISC record for @old and/or a RTM_NEWQDISC record for @new.
 * Builtin qdiscs are skipped; if nothing ends up in the message (both
 * qdiscs ignored) the skb is freed and -EINVAL is returned.
 */
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	/* Echo back to the requesting socket when one exists */
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}
1383
/* Dump @root and every qdisc on its ->list into @skb for a netlink dump.
 * *q_idx_p is the running per-device qdisc index (dump cursor); entries
 * below s_q_idx were emitted in a previous pass and are skipped.
 * Returns 0 when done, -1 when the skb filled up mid-dump.
 */
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	/* The root itself first, then its list of sub-qdiscs */
	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	/* Write the cursor back even on partial dumps so the next
	 * netlink callback resumes where this one stopped.
	 */
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}
1423
/* RTM_GETQDISC dump handler: walk every netdev in the namespace under
 * RCU and dump its tx-root and ingress qdisc trees.  cb->args[0] is the
 * device index cursor, cb->args[1] the per-device qdisc cursor.
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	rcu_read_lock();
	idx = 0;
	for_each_netdev_rcu(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		/* Only the device we stopped at keeps its qdisc cursor;
		 * later devices are dumped from the beginning.
		 */
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	rcu_read_unlock();

	/* Save cursors for the next dump callback */
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
1466
1467
1468
1469 /************************************************
1470  *      Traffic classes manipulation.           *
1471  ************************************************/
1472
1473
1474
/* RTM_{NEW,DEL,GET}TCLASS handler: resolve the owning qdisc from the
 * partially-specified parent/handle classids, look the class up via the
 * qdisc's class ops, and dispatch to create/change, delete or get.
 * Called under RTNL.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	/* Only RTM_GETTCLASS is allowed without CAP_NET_ADMIN */
	if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0         - parent is root class.
	   parent == X:Y         - parent is a node in hierarchy.
	   parent == 0:Y         - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0         - generate handle from kernel pool.
	   handle == 0:Y         - class is X:Y, where X:0 is qdisc.
	   handle == X:Y         - clear.
	   handle == X:0         - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	/* NOTE: "portid" here is the parent classid, reusing the variable */
	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	/* cops->get takes a reference that is dropped at "out" below */
	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		/* A nonexistent class may only be created, never
		 * changed, deleted or queried.
		 */
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = -EOPNOTSUPP;
			if (cops->delete)
				err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	/* Create or change: delegate to the qdisc's class ops */
	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl);
	if (err == 0)
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}
1605
1606
/* Fill one netlink message describing class @cl of qdisc @q.  Mirrors
 * tc_fill_qdisc(): returns skb->len on success, or -1 after trimming
 * any partially-written message off the skb.
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	/* Remember the tail so a partial message can be trimmed on failure */
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	/* The class dump callback overrides parent/handle as appropriate */
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1651
1652 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1653                          struct nlmsghdr *n, struct Qdisc *q,
1654                          unsigned long cl, int event)
1655 {
1656         struct sk_buff *skb;
1657         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1658
1659         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1660         if (!skb)
1661                 return -ENOBUFS;
1662
1663         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1664                 kfree_skb(skb);
1665                 return -EINVAL;
1666         }
1667
1668         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1669                               n->nlmsg_flags & NLM_F_ECHO);
1670 }
1671
/* Walker state for class dumps: carries the dump skb and the netlink
 * callback (cursor + request header) into qdisc_class_dump().
 */
struct qdisc_dump_args {
	struct qdisc_walker	w;
	struct sk_buff		*skb;
	struct netlink_callback *cb;
};
1677
1678 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1679 {
1680         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1681
1682         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1683                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1684 }
1685
/* Dump all classes of one qdisc @q.  *t_p is the per-device qdisc index
 * (cursor in cb->args[0]'s terms); cb->args[1] is the class cursor
 * within the qdisc being resumed.  Returns -1 when the skb filled up.
 */
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	/* Skip builtins, already-dumped qdiscs, classless qdiscs, and
	 * qdiscs not matching an explicit parent filter — but still
	 * advance the index so the cursor stays consistent.
	 */
	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	/* A fresh qdisc (past the resume point) starts with clean cursors */
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	/* Remember how far we got so an interrupted walk can resume */
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
1714
1715 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1716                                struct tcmsg *tcm, struct netlink_callback *cb,
1717                                int *t_p, int s_t)
1718 {
1719         struct Qdisc *q;
1720
1721         if (!root)
1722                 return 0;
1723
1724         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1725                 return -1;
1726
1727         list_for_each_entry(q, &root->list, list) {
1728                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1729                         return -1;
1730         }
1731
1732         return 0;
1733 }
1734
/* RTM_GETTCLASS dump handler: dump the classes of the tx-root and
 * ingress qdisc trees of one device (tcm_ifindex).  cb->args[0] is the
 * per-device qdisc index cursor.
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	/* Malformed/short request: nothing to dump */
	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	/* dev_get_by_index takes a reference, released via dev_put below */
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
1767
1768 /* Main classifier routine: scans classifier chain attached
1769  * to this qdisc, (optionally) tests for protocol and asks
1770  * specific classifiers.
1771  */
1772 int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1773                        struct tcf_result *res)
1774 {
1775         __be16 protocol = skb->protocol;
1776         int err;
1777
1778         for (; tp; tp = tp->next) {
1779                 if (tp->protocol != protocol &&
1780                     tp->protocol != htons(ETH_P_ALL))
1781                         continue;
1782                 err = tp->classify(skb, tp, res);
1783
1784                 if (err >= 0) {
1785 #ifdef CONFIG_NET_CLS_ACT
1786                         if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1787                                 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1788 #endif
1789                         return err;
1790                 }
1791         }
1792         return -1;
1793 }
1794 EXPORT_SYMBOL(tc_classify_compat);
1795
/* Wrapper around tc_classify_compat() that implements the
 * TC_ACT_RECLASSIFY verdict: when a classifier asks for the packet to
 * be classified again, restart from the head of the chain.  The number
 * of restarts is tracked in skb->tc_verd and bounded by MAX_REC_LOOP
 * to guard against an infinite reclassification loop.
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
                struct tcf_result *res)
{
        int err = 0;
#ifdef CONFIG_NET_CLS_ACT
        const struct tcf_proto *otp = tp;       /* remember head of chain */
reclassify:
#endif

        err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
        if (err == TC_ACT_RECLASSIFY) {
                /* Loop counter lives in the packet's tc_verd field. */
                u32 verd = G_TC_VERD(skb->tc_verd);
                tp = otp;

                if (verd++ >= MAX_REC_LOOP) {
                        net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
                                               tp->q->ops->id,
                                               tp->prio & 0xffff,
                                               ntohs(tp->protocol));
                        /* Too many reclassifications: drop the packet. */
                        return TC_ACT_SHOT;
                }
                skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
                goto reclassify;
        }
#endif
        return err;
}
EXPORT_SYMBOL(tc_classify);
1825
1826 void tcf_destroy(struct tcf_proto *tp)
1827 {
1828         tp->ops->destroy(tp);
1829         module_put(tp->ops->owner);
1830         kfree(tp);
1831 }
1832
1833 void tcf_destroy_chain(struct tcf_proto **fl)
1834 {
1835         struct tcf_proto *tp;
1836
1837         while ((tp = *fl) != NULL) {
1838                 *fl = tp->next;
1839                 tcf_destroy(tp);
1840         }
1841 }
1842 EXPORT_SYMBOL(tcf_destroy_chain);
1843
1844 #ifdef CONFIG_PROC_FS
/* seq_file show callback for /proc/net/psched.  Emits four hex words:
 * nanoseconds per microsecond, nanoseconds per psched tick, the
 * constant 1000000 (legacy clock-resolution field kept for userspace
 * compatibility — presumably; confirm against tc's reader), and the
 * CLOCK_MONOTONIC hrtimer resolution expressed as a frequency in Hz.
 */
static int psched_show(struct seq_file *seq, void *v)
{
        struct timespec ts;

        hrtimer_get_res(CLOCK_MONOTONIC, &ts);
        seq_printf(seq, "%08x %08x %08x %08x\n",
                   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
                   1000000,
                   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

        return 0;
}
1857
/* Open handler: /proc/net/psched is a single-record seq_file. */
static int psched_open(struct inode *inode, struct file *file)
{
        return single_open(file, psched_show, NULL);
}
1862
/* File operations for the read-only /proc/net/psched entry. */
static const struct file_operations psched_fops = {
        .owner = THIS_MODULE,
        .open = psched_open,
        .read  = seq_read,
        .llseek = seq_lseek,
        .release = single_release,
};
1870
1871 static int __net_init psched_net_init(struct net *net)
1872 {
1873         struct proc_dir_entry *e;
1874
1875         e = proc_create("psched", 0, net->proc_net, &psched_fops);
1876         if (e == NULL)
1877                 return -ENOMEM;
1878
1879         return 0;
1880 }
1881
/* Per-namespace teardown: remove the namespace's /proc/net/psched. */
static void __net_exit psched_net_exit(struct net *net)
{
        remove_proc_entry("psched", net->proc_net);
}
1886 #else
/* CONFIG_PROC_FS disabled: nothing to set up per namespace. */
static int __net_init psched_net_init(struct net *net)
{
        return 0;
}
1891
/* CONFIG_PROC_FS disabled: nothing to tear down per namespace. */
static void __net_exit psched_net_exit(struct net *net)
{
}
1895 #endif
1896
/* Hook psched's /proc entry into network-namespace creation/teardown. */
static struct pernet_operations psched_net_ops = {
        .init = psched_net_init,
        .exit = psched_net_exit,
};
1901
1902 static int __init pktsched_init(void)
1903 {
1904         int err;
1905
1906         err = register_pernet_subsys(&psched_net_ops);
1907         if (err) {
1908                 pr_err("pktsched_init: "
1909                        "cannot initialize per netns operations\n");
1910                 return err;
1911         }
1912
1913         register_qdisc(&pfifo_fast_ops);
1914         register_qdisc(&pfifo_qdisc_ops);
1915         register_qdisc(&bfifo_qdisc_ops);
1916         register_qdisc(&pfifo_head_drop_qdisc_ops);
1917         register_qdisc(&mq_qdisc_ops);
1918
1919         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1920         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1921         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1922         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1923         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1924         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1925
1926         return 0;
1927 }
1928
1929 subsys_initcall(pktsched_init);