Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dtor/input
[linux-drm-fsl-dcu.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37
/*
 * Prototypes for the rtnetlink notification helpers.  qdisc_notify() is
 * called from notify_and_destroy() before any definition of it appears
 * in this file, hence the forward declaration.
 */
38 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
39                         struct nlmsghdr *n, u32 clid,
40                         struct Qdisc *old, struct Qdisc *new);
41 static int tclass_notify(struct net *net, struct sk_buff *oskb,
42                          struct nlmsghdr *n, struct Qdisc *q,
43                          unsigned long cl, int event);
44
45 /*
46
47    Short review.
48    -------------
49
50    This file consists of two interrelated parts:
51
52    1. queueing disciplines manager frontend.
53    2. traffic classes manager frontend.
54
55    Generally, queueing discipline ("qdisc") is a black box,
56    which is able to enqueue packets and to dequeue them (when
57    device is ready to send something) in order and at times
58    determined by algorithm hidden in it.
59
60    qdiscs are divided into two categories:
61    - "queues", which have no internal structure visible from outside.
62    - "schedulers", which split all the packets to "traffic classes",
63      using "packet classifiers" (look at cls_api.c)
64
65    In turn, classes may have child qdiscs (as rule, queues)
66    attached to them etc. etc. etc.
67
68    The goal of the routines in this file is to translate
69    information supplied by the user in the form of handles
70    into a form more intelligible to the kernel, to perform sanity
71    checks and the part of the work that is common to all qdiscs,
72    and to provide rtnetlink notifications.
73
74    All real intelligent work is done inside qdisc modules.
75
76
77
78    Every discipline has two major routines: enqueue and dequeue.
79
80    ---dequeue
81
82    dequeue usually returns a skb to send. It is allowed to return NULL,
83    but it does not mean that queue is empty, it just means that
84    discipline does not want to send anything this time.
85    Queue is really empty if q->q.qlen == 0.
86    For complicated disciplines with multiple queues q->q is not
87    a real packet queue, but q->q.qlen must still be valid.
88
89    ---enqueue
90
91    enqueue returns 0 if the packet was enqueued successfully.
92    If a packet (this one or another one) was dropped, it returns
93    a non-zero error code.
94    NET_XMIT_DROP        - this packet dropped
95      Expected action: do not backoff, but wait until queue will clear.
96    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
97      Expected action: backoff or ignore
98    NET_XMIT_POLICED     - dropped by police.
99      Expected action: backoff or error to real-time apps.
100
101    Auxiliary routines:
102
103    ---peek
104
105    like dequeue but without removing a packet from the queue
106
107    ---reset
108
109    returns qdisc to initial state: purge all buffers, clear all
110    timers, counters (except for statistics) etc.
111
112    ---init
113
114    initializes newly created qdisc.
115
116    ---destroy
117
118    destroys resources allocated by init and during lifetime of qdisc.
119
120    ---change
121
122    changes qdisc parameters.
123  */
124
125 /* Protects the list of registered qdisc ops (qdisc_base) and is also
 * taken around the accesses to default_qdisc_ops in this file.  It is a
 * pure SMP lock.
 */
126 static DEFINE_RWLOCK(qdisc_mod_lock);
127
128
129 /************************************************
130  *      Queueing disciplines manipulation.      *
131  ************************************************/
132
133
134 /* Head of the list of all registered queueing disciplines, singly linked
 * through Qdisc_ops->next.  register_qdisc() appends at the tail.
 */
135
136 static struct Qdisc_ops *qdisc_base;
137
138 /* Register/unregister queueing discipline */
139
140 int register_qdisc(struct Qdisc_ops *qops)
141 {
142         struct Qdisc_ops *q, **qp;
143         int rc = -EEXIST;
144
145         write_lock(&qdisc_mod_lock);
146         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
147                 if (!strcmp(qops->id, q->id))
148                         goto out;
149
150         if (qops->enqueue == NULL)
151                 qops->enqueue = noop_qdisc_ops.enqueue;
152         if (qops->peek == NULL) {
153                 if (qops->dequeue == NULL)
154                         qops->peek = noop_qdisc_ops.peek;
155                 else
156                         goto out_einval;
157         }
158         if (qops->dequeue == NULL)
159                 qops->dequeue = noop_qdisc_ops.dequeue;
160
161         if (qops->cl_ops) {
162                 const struct Qdisc_class_ops *cops = qops->cl_ops;
163
164                 if (!(cops->get && cops->put && cops->walk && cops->leaf))
165                         goto out_einval;
166
167                 if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
168                         goto out_einval;
169         }
170
171         qops->next = NULL;
172         *qp = qops;
173         rc = 0;
174 out:
175         write_unlock(&qdisc_mod_lock);
176         return rc;
177
178 out_einval:
179         rc = -EINVAL;
180         goto out;
181 }
182 EXPORT_SYMBOL(register_qdisc);
183
184 int unregister_qdisc(struct Qdisc_ops *qops)
185 {
186         struct Qdisc_ops *q, **qp;
187         int err = -ENOENT;
188
189         write_lock(&qdisc_mod_lock);
190         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
191                 if (q == qops)
192                         break;
193         if (q) {
194                 *qp = q->next;
195                 q->next = NULL;
196                 err = 0;
197         }
198         write_unlock(&qdisc_mod_lock);
199         return err;
200 }
201 EXPORT_SYMBOL(unregister_qdisc);
202
203 /* Get default qdisc if not otherwise specified */
204 void qdisc_get_default(char *name, size_t len)
205 {
206         read_lock(&qdisc_mod_lock);
207         strlcpy(name, default_qdisc_ops->id, len);
208         read_unlock(&qdisc_mod_lock);
209 }
210
211 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
212 {
213         struct Qdisc_ops *q = NULL;
214
215         for (q = qdisc_base; q; q = q->next) {
216                 if (!strcmp(name, q->id)) {
217                         if (!try_module_get(q->owner))
218                                 q = NULL;
219                         break;
220                 }
221         }
222
223         return q;
224 }
225
226 /* Set new default qdisc to use */
227 int qdisc_set_default(const char *name)
228 {
229         const struct Qdisc_ops *ops;
230
231         if (!capable(CAP_NET_ADMIN))
232                 return -EPERM;
233
234         write_lock(&qdisc_mod_lock);
235         ops = qdisc_lookup_default(name);
236         if (!ops) {
237                 /* Not found, drop lock and try to load module */
238                 write_unlock(&qdisc_mod_lock);
239                 request_module("sch_%s", name);
240                 write_lock(&qdisc_mod_lock);
241
242                 ops = qdisc_lookup_default(name);
243         }
244
245         if (ops) {
246                 /* Set new default */
247                 module_put(default_qdisc_ops->owner);
248                 default_qdisc_ops = ops;
249         }
250         write_unlock(&qdisc_mod_lock);
251
252         return ops ? 0 : -ENOENT;
253 }
254
255 /* We know handle. Find qdisc among all qdisc's attached to device
256    (root qdisc, all its children, children of children etc.)
257  */
258
259 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
260 {
261         struct Qdisc *q;
262
263         if (!(root->flags & TCQ_F_BUILTIN) &&
264             root->handle == handle)
265                 return root;
266
267         list_for_each_entry(q, &root->list, list) {
268                 if (q->handle == handle)
269                         return q;
270         }
271         return NULL;
272 }
273
274 static void qdisc_list_add(struct Qdisc *q)
275 {
276         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
277                 list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
278 }
279
280 void qdisc_list_del(struct Qdisc *q)
281 {
282         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
283                 list_del(&q->list);
284 }
285 EXPORT_SYMBOL(qdisc_list_del);
286
287 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
288 {
289         struct Qdisc *q;
290
291         q = qdisc_match_from_root(dev->qdisc, handle);
292         if (q)
293                 goto out;
294
295         if (dev_ingress_queue(dev))
296                 q = qdisc_match_from_root(
297                         dev_ingress_queue(dev)->qdisc_sleeping,
298                         handle);
299 out:
300         return q;
301 }
302
303 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
304 {
305         unsigned long cl;
306         struct Qdisc *leaf;
307         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
308
309         if (cops == NULL)
310                 return NULL;
311         cl = cops->get(p, classid);
312
313         if (cl == 0)
314                 return NULL;
315         leaf = cops->leaf(p, cl);
316         cops->put(p, cl);
317         return leaf;
318 }
319
320 /* Find queueing discipline by name */
321
322 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
323 {
324         struct Qdisc_ops *q = NULL;
325
326         if (kind) {
327                 read_lock(&qdisc_mod_lock);
328                 for (q = qdisc_base; q; q = q->next) {
329                         if (nla_strcmp(kind, q->id) == 0) {
330                                 if (!try_module_get(q->owner))
331                                         q = NULL;
332                                 break;
333                         }
334                 }
335                 read_unlock(&qdisc_mod_lock);
336         }
337         return q;
338 }
339
340 /* The linklayer setting were not transferred from iproute2, in older
341  * versions, and the rate tables lookup systems have been dropped in
342  * the kernel. To keep backward compatible with older iproute2 tc
343  * utils, we detect the linklayer setting by detecting if the rate
344  * table were modified.
345  *
346  * For linklayer ATM table entries, the rate table will be aligned to
347  * 48 bytes, thus some table entries will contain the same value.  The
348  * mpu (min packet unit) is also encoded into the old rate table, thus
349  * starting from the mpu, we find low and high table entries for
350  * mapping this cell.  If these entries contain the same value, then
351  * the rate tables have been modified for linklayer ATM.
352  *
353  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
354  * and then roundup to the next cell, calc the table entry one below,
355  * and compare.
356  */
357 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
358 {
359         int low       = roundup(r->mpu, 48);
360         int high      = roundup(low+1, 48);
361         int cell_low  = low >> r->cell_log;
362         int cell_high = (high >> r->cell_log) - 1;
363
364         /* rtab is too inaccurate at rates > 100Mbit/s */
365         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
366                 pr_debug("TC linklayer: Giving up ATM detection\n");
367                 return TC_LINKLAYER_ETHERNET;
368         }
369
370         if ((cell_high > cell_low) && (cell_high < 256)
371             && (rtab[cell_low] == rtab[cell_high])) {
372                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
373                          cell_low, cell_high, rtab[cell_high]);
374                 return TC_LINKLAYER_ATM;
375         }
376         return TC_LINKLAYER_ETHERNET;
377 }
378
/* Singly linked list (through ->next) of the rate tables handed out by
 * qdisc_get_rtab().  Entries are reference counted and are unlinked and
 * freed by qdisc_put_rtab() when the count drops to zero.
 */
379 static struct qdisc_rate_table *qdisc_rtab_list;
380
381 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
382 {
383         struct qdisc_rate_table *rtab;
384
385         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
386             nla_len(tab) != TC_RTAB_SIZE)
387                 return NULL;
388
389         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
390                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
391                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
392                         rtab->refcnt++;
393                         return rtab;
394                 }
395         }
396
397         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
398         if (rtab) {
399                 rtab->rate = *r;
400                 rtab->refcnt = 1;
401                 memcpy(rtab->data, nla_data(tab), 1024);
402                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
403                         r->linklayer = __detect_linklayer(r, rtab->data);
404                 rtab->next = qdisc_rtab_list;
405                 qdisc_rtab_list = rtab;
406         }
407         return rtab;
408 }
409 EXPORT_SYMBOL(qdisc_get_rtab);
410
411 void qdisc_put_rtab(struct qdisc_rate_table *tab)
412 {
413         struct qdisc_rate_table *rtab, **rtabp;
414
415         if (!tab || --tab->refcnt)
416                 return;
417
418         for (rtabp = &qdisc_rtab_list;
419              (rtab = *rtabp) != NULL;
420              rtabp = &rtab->next) {
421                 if (rtab == tab) {
422                         *rtabp = rtab->next;
423                         kfree(rtab);
424                         return;
425                 }
426         }
427 }
428 EXPORT_SYMBOL(qdisc_put_rtab);
429
/* List of all size tables currently in use; qdisc_stab_lock is held
 * around every walk and modification of it in this file.
 */
430 static LIST_HEAD(qdisc_stab_list);
431 static DEFINE_SPINLOCK(qdisc_stab_lock);
432
/* Netlink policy used by qdisc_get_stab() to parse a nested TCA_STAB
 * attribute: TCA_STAB_BASE carries a struct tc_sizespec, TCA_STAB_DATA is
 * a binary blob.
 */
433 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
434         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
435         [TCA_STAB_DATA] = { .type = NLA_BINARY },
436 };
437
438 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
439 {
440         struct nlattr *tb[TCA_STAB_MAX + 1];
441         struct qdisc_size_table *stab;
442         struct tc_sizespec *s;
443         unsigned int tsize = 0;
444         u16 *tab = NULL;
445         int err;
446
447         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
448         if (err < 0)
449                 return ERR_PTR(err);
450         if (!tb[TCA_STAB_BASE])
451                 return ERR_PTR(-EINVAL);
452
453         s = nla_data(tb[TCA_STAB_BASE]);
454
455         if (s->tsize > 0) {
456                 if (!tb[TCA_STAB_DATA])
457                         return ERR_PTR(-EINVAL);
458                 tab = nla_data(tb[TCA_STAB_DATA]);
459                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
460         }
461
462         if (tsize != s->tsize || (!tab && tsize > 0))
463                 return ERR_PTR(-EINVAL);
464
465         spin_lock(&qdisc_stab_lock);
466
467         list_for_each_entry(stab, &qdisc_stab_list, list) {
468                 if (memcmp(&stab->szopts, s, sizeof(*s)))
469                         continue;
470                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
471                         continue;
472                 stab->refcnt++;
473                 spin_unlock(&qdisc_stab_lock);
474                 return stab;
475         }
476
477         spin_unlock(&qdisc_stab_lock);
478
479         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
480         if (!stab)
481                 return ERR_PTR(-ENOMEM);
482
483         stab->refcnt = 1;
484         stab->szopts = *s;
485         if (tsize > 0)
486                 memcpy(stab->data, tab, tsize * sizeof(u16));
487
488         spin_lock(&qdisc_stab_lock);
489         list_add_tail(&stab->list, &qdisc_stab_list);
490         spin_unlock(&qdisc_stab_lock);
491
492         return stab;
493 }
494
495 static void stab_kfree_rcu(struct rcu_head *head)
496 {
497         kfree(container_of(head, struct qdisc_size_table, rcu));
498 }
499
500 void qdisc_put_stab(struct qdisc_size_table *tab)
501 {
502         if (!tab)
503                 return;
504
505         spin_lock(&qdisc_stab_lock);
506
507         if (--tab->refcnt == 0) {
508                 list_del(&tab->list);
509                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
510         }
511
512         spin_unlock(&qdisc_stab_lock);
513 }
514 EXPORT_SYMBOL(qdisc_put_stab);
515
516 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
517 {
518         struct nlattr *nest;
519
520         nest = nla_nest_start(skb, TCA_STAB);
521         if (nest == NULL)
522                 goto nla_put_failure;
523         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
524                 goto nla_put_failure;
525         nla_nest_end(skb, nest);
526
527         return skb->len;
528
529 nla_put_failure:
530         return -1;
531 }
532
533 void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
534 {
535         int pkt_len, slot;
536
537         pkt_len = skb->len + stab->szopts.overhead;
538         if (unlikely(!stab->szopts.tsize))
539                 goto out;
540
541         slot = pkt_len + stab->szopts.cell_align;
542         if (unlikely(slot < 0))
543                 slot = 0;
544
545         slot >>= stab->szopts.cell_log;
546         if (likely(slot < stab->szopts.tsize))
547                 pkt_len = stab->data[slot];
548         else
549                 pkt_len = stab->data[stab->szopts.tsize - 1] *
550                                 (slot / stab->szopts.tsize) +
551                                 stab->data[slot % stab->szopts.tsize];
552
553         pkt_len <<= stab->szopts.size_log;
554 out:
555         if (unlikely(pkt_len < 1))
556                 pkt_len = 1;
557         qdisc_skb_cb(skb)->pkt_len = pkt_len;
558 }
559 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
560
561 void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
562 {
563         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
564                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
565                         txt, qdisc->ops->id, qdisc->handle >> 16);
566                 qdisc->flags |= TCQ_F_WARN_NONWC;
567         }
568 }
569 EXPORT_SYMBOL(qdisc_warn_nonwc);
570
571 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
572 {
573         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
574                                                  timer);
575
576         qdisc_unthrottled(wd->qdisc);
577         __netif_schedule(qdisc_root(wd->qdisc));
578
579         return HRTIMER_NORESTART;
580 }
581
582 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
583 {
584         hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
585         wd->timer.function = qdisc_watchdog;
586         wd->qdisc = qdisc;
587 }
588 EXPORT_SYMBOL(qdisc_watchdog_init);
589
590 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
591 {
592         if (test_bit(__QDISC_STATE_DEACTIVATED,
593                      &qdisc_root_sleeping(wd->qdisc)->state))
594                 return;
595
596         qdisc_throttled(wd->qdisc);
597
598         hrtimer_start(&wd->timer,
599                       ns_to_ktime(expires),
600                       HRTIMER_MODE_ABS);
601 }
602 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
603
604 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
605 {
606         hrtimer_cancel(&wd->timer);
607         qdisc_unthrottled(wd->qdisc);
608 }
609 EXPORT_SYMBOL(qdisc_watchdog_cancel);
610
611 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
612 {
613         unsigned int size = n * sizeof(struct hlist_head), i;
614         struct hlist_head *h;
615
616         if (size <= PAGE_SIZE)
617                 h = kmalloc(size, GFP_KERNEL);
618         else
619                 h = (struct hlist_head *)
620                         __get_free_pages(GFP_KERNEL, get_order(size));
621
622         if (h != NULL) {
623                 for (i = 0; i < n; i++)
624                         INIT_HLIST_HEAD(&h[i]);
625         }
626         return h;
627 }
628
629 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
630 {
631         unsigned int size = n * sizeof(struct hlist_head);
632
633         if (size <= PAGE_SIZE)
634                 kfree(h);
635         else
636                 free_pages((unsigned long)h, get_order(size));
637 }
638
639 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
640 {
641         struct Qdisc_class_common *cl;
642         struct hlist_node *next;
643         struct hlist_head *nhash, *ohash;
644         unsigned int nsize, nmask, osize;
645         unsigned int i, h;
646
647         /* Rehash when load factor exceeds 0.75 */
648         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
649                 return;
650         nsize = clhash->hashsize * 2;
651         nmask = nsize - 1;
652         nhash = qdisc_class_hash_alloc(nsize);
653         if (nhash == NULL)
654                 return;
655
656         ohash = clhash->hash;
657         osize = clhash->hashsize;
658
659         sch_tree_lock(sch);
660         for (i = 0; i < osize; i++) {
661                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
662                         h = qdisc_class_hash(cl->classid, nmask);
663                         hlist_add_head(&cl->hnode, &nhash[h]);
664                 }
665         }
666         clhash->hash     = nhash;
667         clhash->hashsize = nsize;
668         clhash->hashmask = nmask;
669         sch_tree_unlock(sch);
670
671         qdisc_class_hash_free(ohash, osize);
672 }
673 EXPORT_SYMBOL(qdisc_class_hash_grow);
674
675 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
676 {
677         unsigned int size = 4;
678
679         clhash->hash = qdisc_class_hash_alloc(size);
680         if (clhash->hash == NULL)
681                 return -ENOMEM;
682         clhash->hashsize  = size;
683         clhash->hashmask  = size - 1;
684         clhash->hashelems = 0;
685         return 0;
686 }
687 EXPORT_SYMBOL(qdisc_class_hash_init);
688
689 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
690 {
691         qdisc_class_hash_free(clhash->hash, clhash->hashsize);
692 }
693 EXPORT_SYMBOL(qdisc_class_hash_destroy);
694
695 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
696                              struct Qdisc_class_common *cl)
697 {
698         unsigned int h;
699
700         INIT_HLIST_NODE(&cl->hnode);
701         h = qdisc_class_hash(cl->classid, clhash->hashmask);
702         hlist_add_head(&cl->hnode, &clhash->hash[h]);
703         clhash->hashelems++;
704 }
705 EXPORT_SYMBOL(qdisc_class_hash_insert);
706
707 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
708                              struct Qdisc_class_common *cl)
709 {
710         hlist_del(&cl->hnode);
711         clhash->hashelems--;
712 }
713 EXPORT_SYMBOL(qdisc_class_hash_remove);
714
715 /* Allocate an unique handle from space managed by kernel
716  * Possible range is [8000-FFFF]:0000 (0x8000 values)
717  */
718 static u32 qdisc_alloc_handle(struct net_device *dev)
719 {
720         int i = 0x8000;
721         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
722
723         do {
724                 autohandle += TC_H_MAKE(0x10000U, 0);
725                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
726                         autohandle = TC_H_MAKE(0x80000000U, 0);
727                 if (!qdisc_lookup(dev, autohandle))
728                         return autohandle;
729                 cond_resched();
730         } while (--i > 0);
731
732         return 0;
733 }
734
735 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
736 {
737         const struct Qdisc_class_ops *cops;
738         unsigned long cl;
739         u32 parentid;
740         int drops;
741
742         if (n == 0)
743                 return;
744         drops = max_t(int, n, 0);
745         while ((parentid = sch->parent)) {
746                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
747                         return;
748
749                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
750                 if (sch == NULL) {
751                         WARN_ON(parentid != TC_H_ROOT);
752                         return;
753                 }
754                 cops = sch->ops->cl_ops;
755                 if (cops->qlen_notify) {
756                         cl = cops->get(sch, parentid);
757                         cops->qlen_notify(sch, cl);
758                         cops->put(sch, cl);
759                 }
760                 sch->q.qlen -= n;
761                 sch->qstats.drops += drops;
762         }
763 }
764 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
765
766 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
767                                struct nlmsghdr *n, u32 clid,
768                                struct Qdisc *old, struct Qdisc *new)
769 {
770         if (new || old)
771                 qdisc_notify(net, skb, n, clid, old, new);
772
773         if (old)
774                 qdisc_destroy(old);
775 }
776
777 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
778  * to device "dev".
779  *
780  * When appropriate send a netlink notification using 'skb'
781  * and "n".
782  *
783  * On success, destroy old qdisc.
784  */
785
786 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
787                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
788                        struct Qdisc *new, struct Qdisc *old)
789 {
790         struct Qdisc *q = old;
791         struct net *net = dev_net(dev);
792         int err = 0;
793
794         if (parent == NULL) {
795                 unsigned int i, num_q, ingress;
796
797                 ingress = 0;
798                 num_q = dev->num_tx_queues;
799                 if ((q && q->flags & TCQ_F_INGRESS) ||
800                     (new && new->flags & TCQ_F_INGRESS)) {
801                         num_q = 1;
802                         ingress = 1;
803                         if (!dev_ingress_queue(dev))
804                                 return -ENOENT;
805                 }
806
807                 if (dev->flags & IFF_UP)
808                         dev_deactivate(dev);
809
810                 if (new && new->ops->attach) {
811                         new->ops->attach(new);
812                         num_q = 0;
813                 }
814
815                 for (i = 0; i < num_q; i++) {
816                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
817
818                         if (!ingress)
819                                 dev_queue = netdev_get_tx_queue(dev, i);
820
821                         old = dev_graft_qdisc(dev_queue, new);
822                         if (new && i > 0)
823                                 atomic_inc(&new->refcnt);
824
825                         if (!ingress)
826                                 qdisc_destroy(old);
827                 }
828
829                 if (!ingress) {
830                         notify_and_destroy(net, skb, n, classid,
831                                            dev->qdisc, new);
832                         if (new && !new->ops->attach)
833                                 atomic_inc(&new->refcnt);
834                         dev->qdisc = new ? : &noop_qdisc;
835                 } else {
836                         notify_and_destroy(net, skb, n, classid, old, new);
837                 }
838
839                 if (dev->flags & IFF_UP)
840                         dev_activate(dev);
841         } else {
842                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
843
844                 err = -EOPNOTSUPP;
845                 if (cops && cops->graft) {
846                         unsigned long cl = cops->get(parent, classid);
847                         if (cl) {
848                                 err = cops->graft(parent, cl, new, &old);
849                                 cops->put(parent, cl);
850                         } else
851                                 err = -ENOENT;
852                 }
853                 if (!err)
854                         notify_and_destroy(net, skb, n, classid, old, new);
855         }
856         return err;
857 }
858
859 /* lockdep annotation is needed for ingress; egress gets it only for name.
 * qdisc_create() assigns qdisc_rx_lock to the lock of a qdisc created with
 * the TC_H_INGRESS handle and qdisc_tx_lock to the lock of any other qdisc.
 */
860 static struct lock_class_key qdisc_tx_lock;
861 static struct lock_class_key qdisc_rx_lock;
862
863 /*
864    Allocate and initialize new qdisc.
865
866    Parameters are passed via opt.
867  */
868
869 static struct Qdisc *
870 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
871              struct Qdisc *p, u32 parent, u32 handle,
872              struct nlattr **tca, int *errp)
873 {
874         int err;
875         struct nlattr *kind = tca[TCA_KIND];
876         struct Qdisc *sch;
877         struct Qdisc_ops *ops;
878         struct qdisc_size_table *stab;
879
880         ops = qdisc_lookup_ops(kind);
881 #ifdef CONFIG_MODULES
882         if (ops == NULL && kind != NULL) {
883                 char name[IFNAMSIZ];
884                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
885                         /* We dropped the RTNL semaphore in order to
886                          * perform the module load.  So, even if we
887                          * succeeded in loading the module we have to
888                          * tell the caller to replay the request.  We
889                          * indicate this using -EAGAIN.
890                          * We replay the request because the device may
891                          * go away in the mean time.
892                          */
893                         rtnl_unlock();
894                         request_module("sch_%s", name);
895                         rtnl_lock();
896                         ops = qdisc_lookup_ops(kind);
897                         if (ops != NULL) {
898                                 /* We will try again qdisc_lookup_ops,
899                                  * so don't keep a reference.
900                                  */
901                                 module_put(ops->owner);
902                                 err = -EAGAIN;
903                                 goto err_out;
904                         }
905                 }
906         }
907 #endif
908
909         err = -ENOENT;
910         if (ops == NULL)
911                 goto err_out;
912
913         sch = qdisc_alloc(dev_queue, ops);
914         if (IS_ERR(sch)) {
915                 err = PTR_ERR(sch);
916                 goto err_out2;
917         }
918
919         sch->parent = parent;
920
921         if (handle == TC_H_INGRESS) {
922                 sch->flags |= TCQ_F_INGRESS;
923                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
924                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
925         } else {
926                 if (handle == 0) {
927                         handle = qdisc_alloc_handle(dev);
928                         err = -ENOMEM;
929                         if (handle == 0)
930                                 goto err_out3;
931                 }
932                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
933                 if (!netif_is_multiqueue(dev))
934                         sch->flags |= TCQ_F_ONETXQUEUE;
935         }
936
937         sch->handle = handle;
938
939         if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
940                 if (tca[TCA_STAB]) {
941                         stab = qdisc_get_stab(tca[TCA_STAB]);
942                         if (IS_ERR(stab)) {
943                                 err = PTR_ERR(stab);
944                                 goto err_out4;
945                         }
946                         rcu_assign_pointer(sch->stab, stab);
947                 }
948                 if (tca[TCA_RATE]) {
949                         spinlock_t *root_lock;
950
951                         err = -EOPNOTSUPP;
952                         if (sch->flags & TCQ_F_MQROOT)
953                                 goto err_out4;
954
955                         if ((sch->parent != TC_H_ROOT) &&
956                             !(sch->flags & TCQ_F_INGRESS) &&
957                             (!p || !(p->flags & TCQ_F_MQROOT)))
958                                 root_lock = qdisc_root_sleeping_lock(sch);
959                         else
960                                 root_lock = qdisc_lock(sch);
961
962                         err = gen_new_estimator(&sch->bstats, &sch->rate_est,
963                                                 root_lock, tca[TCA_RATE]);
964                         if (err)
965                                 goto err_out4;
966                 }
967
968                 qdisc_list_add(sch);
969
970                 return sch;
971         }
972 err_out3:
973         dev_put(dev);
974         kfree((char *) sch - sch->padded);
975 err_out2:
976         module_put(ops->owner);
977 err_out:
978         *errp = err;
979         return NULL;
980
981 err_out4:
982         /*
983          * Any broken qdiscs that would require a ops->reset() here?
984          * The qdisc was never in action so it shouldn't be necessary.
985          */
986         qdisc_put_stab(rtnl_dereference(sch->stab));
987         if (ops->destroy)
988                 ops->destroy(sch);
989         goto err_out3;
990 }
991
992 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
993 {
994         struct qdisc_size_table *ostab, *stab = NULL;
995         int err = 0;
996
997         if (tca[TCA_OPTIONS]) {
998                 if (sch->ops->change == NULL)
999                         return -EINVAL;
1000                 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
1001                 if (err)
1002                         return err;
1003         }
1004
1005         if (tca[TCA_STAB]) {
1006                 stab = qdisc_get_stab(tca[TCA_STAB]);
1007                 if (IS_ERR(stab))
1008                         return PTR_ERR(stab);
1009         }
1010
1011         ostab = rtnl_dereference(sch->stab);
1012         rcu_assign_pointer(sch->stab, stab);
1013         qdisc_put_stab(ostab);
1014
1015         if (tca[TCA_RATE]) {
1016                 /* NB: ignores errors from replace_estimator
1017                    because change can't be undone. */
1018                 if (sch->flags & TCQ_F_MQROOT)
1019                         goto out;
1020                 gen_replace_estimator(&sch->bstats, &sch->rate_est,
1021                                             qdisc_root_sleeping_lock(sch),
1022                                             tca[TCA_RATE]);
1023         }
1024 out:
1025         return 0;
1026 }
1027
1028 struct check_loop_arg {
1029         struct qdisc_walker     w;
1030         struct Qdisc            *p;
1031         int                     depth;
1032 };
1033
1034 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
1035
1036 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1037 {
1038         struct check_loop_arg   arg;
1039
1040         if (q->ops->cl_ops == NULL)
1041                 return 0;
1042
1043         arg.w.stop = arg.w.skip = arg.w.count = 0;
1044         arg.w.fn = check_loop_fn;
1045         arg.depth = depth;
1046         arg.p = p;
1047         q->ops->cl_ops->walk(q, &arg.w);
1048         return arg.w.stop ? -ELOOP : 0;
1049 }
1050
1051 static int
1052 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1053 {
1054         struct Qdisc *leaf;
1055         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1056         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1057
1058         leaf = cops->leaf(q, cl);
1059         if (leaf) {
1060                 if (leaf == arg->p || arg->depth > 7)
1061                         return -ELOOP;
1062                 return check_loop(leaf, arg->p, arg->depth + 1);
1063         }
1064         return 0;
1065 }
1066
1067 /*
1068  * Delete/get qdisc.
1069  */
1070
1071 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1072 {
1073         struct net *net = sock_net(skb->sk);
1074         struct tcmsg *tcm = nlmsg_data(n);
1075         struct nlattr *tca[TCA_MAX + 1];
1076         struct net_device *dev;
1077         u32 clid;
1078         struct Qdisc *q = NULL;
1079         struct Qdisc *p = NULL;
1080         int err;
1081
1082         if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN))
1083                 return -EPERM;
1084
1085         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1086         if (err < 0)
1087                 return err;
1088
1089         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1090         if (!dev)
1091                 return -ENODEV;
1092
1093         clid = tcm->tcm_parent;
1094         if (clid) {
1095                 if (clid != TC_H_ROOT) {
1096                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1097                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1098                                 if (!p)
1099                                         return -ENOENT;
1100                                 q = qdisc_leaf(p, clid);
1101                         } else if (dev_ingress_queue(dev)) {
1102                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1103                         }
1104                 } else {
1105                         q = dev->qdisc;
1106                 }
1107                 if (!q)
1108                         return -ENOENT;
1109
1110                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1111                         return -EINVAL;
1112         } else {
1113                 q = qdisc_lookup(dev, tcm->tcm_handle);
1114                 if (!q)
1115                         return -ENOENT;
1116         }
1117
1118         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1119                 return -EINVAL;
1120
1121         if (n->nlmsg_type == RTM_DELQDISC) {
1122                 if (!clid)
1123                         return -EINVAL;
1124                 if (q->handle == 0)
1125                         return -ENOENT;
1126                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1127                 if (err != 0)
1128                         return err;
1129         } else {
1130                 qdisc_notify(net, skb, n, clid, NULL, q);
1131         }
1132         return 0;
1133 }
1134
1135 /*
1136  * Create/change qdisc.
1137  */
1138
1139 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1140 {
1141         struct net *net = sock_net(skb->sk);
1142         struct tcmsg *tcm;
1143         struct nlattr *tca[TCA_MAX + 1];
1144         struct net_device *dev;
1145         u32 clid;
1146         struct Qdisc *q, *p;
1147         int err;
1148
1149         if (!capable(CAP_NET_ADMIN))
1150                 return -EPERM;
1151
1152 replay:
1153         /* Reinit, just in case something touches this. */
1154         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1155         if (err < 0)
1156                 return err;
1157
1158         tcm = nlmsg_data(n);
1159         clid = tcm->tcm_parent;
1160         q = p = NULL;
1161
1162         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1163         if (!dev)
1164                 return -ENODEV;
1165
1166
1167         if (clid) {
1168                 if (clid != TC_H_ROOT) {
1169                         if (clid != TC_H_INGRESS) {
1170                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1171                                 if (!p)
1172                                         return -ENOENT;
1173                                 q = qdisc_leaf(p, clid);
1174                         } else if (dev_ingress_queue_create(dev)) {
1175                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1176                         }
1177                 } else {
1178                         q = dev->qdisc;
1179                 }
1180
1181                 /* It may be default qdisc, ignore it */
1182                 if (q && q->handle == 0)
1183                         q = NULL;
1184
1185                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1186                         if (tcm->tcm_handle) {
1187                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1188                                         return -EEXIST;
1189                                 if (TC_H_MIN(tcm->tcm_handle))
1190                                         return -EINVAL;
1191                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1192                                 if (!q)
1193                                         goto create_n_graft;
1194                                 if (n->nlmsg_flags & NLM_F_EXCL)
1195                                         return -EEXIST;
1196                                 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1197                                         return -EINVAL;
1198                                 if (q == p ||
1199                                     (p && check_loop(q, p, 0)))
1200                                         return -ELOOP;
1201                                 atomic_inc(&q->refcnt);
1202                                 goto graft;
1203                         } else {
1204                                 if (!q)
1205                                         goto create_n_graft;
1206
1207                                 /* This magic test requires explanation.
1208                                  *
1209                                  *   We know, that some child q is already
1210                                  *   attached to this parent and have choice:
1211                                  *   either to change it or to create/graft new one.
1212                                  *
1213                                  *   1. We are allowed to create/graft only
1214                                  *   if CREATE and REPLACE flags are set.
1215                                  *
1216                                  *   2. If EXCL is set, requestor wanted to say,
1217                                  *   that qdisc tcm_handle is not expected
1218                                  *   to exist, so that we choose create/graft too.
1219                                  *
1220                                  *   3. The last case is when no flags are set.
1221                                  *   Alas, it is sort of hole in API, we
1222                                  *   cannot decide what to do unambiguously.
1223                                  *   For now we select create/graft, if
1224                                  *   user gave KIND, which does not match existing.
1225                                  */
1226                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1227                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1228                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1229                                      (tca[TCA_KIND] &&
1230                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1231                                         goto create_n_graft;
1232                         }
1233                 }
1234         } else {
1235                 if (!tcm->tcm_handle)
1236                         return -EINVAL;
1237                 q = qdisc_lookup(dev, tcm->tcm_handle);
1238         }
1239
1240         /* Change qdisc parameters */
1241         if (q == NULL)
1242                 return -ENOENT;
1243         if (n->nlmsg_flags & NLM_F_EXCL)
1244                 return -EEXIST;
1245         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1246                 return -EINVAL;
1247         err = qdisc_change(q, tca);
1248         if (err == 0)
1249                 qdisc_notify(net, skb, n, clid, NULL, q);
1250         return err;
1251
1252 create_n_graft:
1253         if (!(n->nlmsg_flags & NLM_F_CREATE))
1254                 return -ENOENT;
1255         if (clid == TC_H_INGRESS) {
1256                 if (dev_ingress_queue(dev))
1257                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1258                                          tcm->tcm_parent, tcm->tcm_parent,
1259                                          tca, &err);
1260                 else
1261                         err = -ENOENT;
1262         } else {
1263                 struct netdev_queue *dev_queue;
1264
1265                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1266                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1267                 else if (p)
1268                         dev_queue = p->dev_queue;
1269                 else
1270                         dev_queue = netdev_get_tx_queue(dev, 0);
1271
1272                 q = qdisc_create(dev, dev_queue, p,
1273                                  tcm->tcm_parent, tcm->tcm_handle,
1274                                  tca, &err);
1275         }
1276         if (q == NULL) {
1277                 if (err == -EAGAIN)
1278                         goto replay;
1279                 return err;
1280         }
1281
1282 graft:
1283         err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1284         if (err) {
1285                 if (q)
1286                         qdisc_destroy(q);
1287                 return err;
1288         }
1289
1290         return 0;
1291 }
1292
1293 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1294                          u32 portid, u32 seq, u16 flags, int event)
1295 {
1296         struct tcmsg *tcm;
1297         struct nlmsghdr  *nlh;
1298         unsigned char *b = skb_tail_pointer(skb);
1299         struct gnet_dump d;
1300         struct qdisc_size_table *stab;
1301
1302         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1303         if (!nlh)
1304                 goto out_nlmsg_trim;
1305         tcm = nlmsg_data(nlh);
1306         tcm->tcm_family = AF_UNSPEC;
1307         tcm->tcm__pad1 = 0;
1308         tcm->tcm__pad2 = 0;
1309         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1310         tcm->tcm_parent = clid;
1311         tcm->tcm_handle = q->handle;
1312         tcm->tcm_info = atomic_read(&q->refcnt);
1313         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1314                 goto nla_put_failure;
1315         if (q->ops->dump && q->ops->dump(q, skb) < 0)
1316                 goto nla_put_failure;
1317         q->qstats.qlen = q->q.qlen;
1318
1319         stab = rtnl_dereference(q->stab);
1320         if (stab && qdisc_dump_stab(skb, stab) < 0)
1321                 goto nla_put_failure;
1322
1323         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1324                                          qdisc_root_sleeping_lock(q), &d) < 0)
1325                 goto nla_put_failure;
1326
1327         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1328                 goto nla_put_failure;
1329
1330         if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1331             gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1332             gnet_stats_copy_queue(&d, &q->qstats) < 0)
1333                 goto nla_put_failure;
1334
1335         if (gnet_stats_finish_copy(&d) < 0)
1336                 goto nla_put_failure;
1337
1338         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1339         return skb->len;
1340
1341 out_nlmsg_trim:
1342 nla_put_failure:
1343         nlmsg_trim(skb, b);
1344         return -1;
1345 }
1346
1347 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1348 {
1349         return (q->flags & TCQ_F_BUILTIN) ? true : false;
1350 }
1351
1352 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1353                         struct nlmsghdr *n, u32 clid,
1354                         struct Qdisc *old, struct Qdisc *new)
1355 {
1356         struct sk_buff *skb;
1357         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1358
1359         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1360         if (!skb)
1361                 return -ENOBUFS;
1362
1363         if (old && !tc_qdisc_dump_ignore(old)) {
1364                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1365                                   0, RTM_DELQDISC) < 0)
1366                         goto err_out;
1367         }
1368         if (new && !tc_qdisc_dump_ignore(new)) {
1369                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1370                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1371                         goto err_out;
1372         }
1373
1374         if (skb->len)
1375                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1376                                       n->nlmsg_flags & NLM_F_ECHO);
1377
1378 err_out:
1379         kfree_skb(skb);
1380         return -EINVAL;
1381 }
1382
1383 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1384                               struct netlink_callback *cb,
1385                               int *q_idx_p, int s_q_idx)
1386 {
1387         int ret = 0, q_idx = *q_idx_p;
1388         struct Qdisc *q;
1389
1390         if (!root)
1391                 return 0;
1392
1393         q = root;
1394         if (q_idx < s_q_idx) {
1395                 q_idx++;
1396         } else {
1397                 if (!tc_qdisc_dump_ignore(q) &&
1398                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1399                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1400                         goto done;
1401                 q_idx++;
1402         }
1403         list_for_each_entry(q, &root->list, list) {
1404                 if (q_idx < s_q_idx) {
1405                         q_idx++;
1406                         continue;
1407                 }
1408                 if (!tc_qdisc_dump_ignore(q) &&
1409                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1410                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1411                         goto done;
1412                 q_idx++;
1413         }
1414
1415 out:
1416         *q_idx_p = q_idx;
1417         return ret;
1418 done:
1419         ret = -1;
1420         goto out;
1421 }
1422
1423 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1424 {
1425         struct net *net = sock_net(skb->sk);
1426         int idx, q_idx;
1427         int s_idx, s_q_idx;
1428         struct net_device *dev;
1429
1430         s_idx = cb->args[0];
1431         s_q_idx = q_idx = cb->args[1];
1432
1433         rcu_read_lock();
1434         idx = 0;
1435         for_each_netdev_rcu(net, dev) {
1436                 struct netdev_queue *dev_queue;
1437
1438                 if (idx < s_idx)
1439                         goto cont;
1440                 if (idx > s_idx)
1441                         s_q_idx = 0;
1442                 q_idx = 0;
1443
1444                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1445                         goto done;
1446
1447                 dev_queue = dev_ingress_queue(dev);
1448                 if (dev_queue &&
1449                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1450                                        &q_idx, s_q_idx) < 0)
1451                         goto done;
1452
1453 cont:
1454                 idx++;
1455         }
1456
1457 done:
1458         rcu_read_unlock();
1459
1460         cb->args[0] = idx;
1461         cb->args[1] = q_idx;
1462
1463         return skb->len;
1464 }
1465
1466
1467
1468 /************************************************
1469  *      Traffic classes manipulation.           *
1470  ************************************************/
1471
1472
1473
1474 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
1475 {
1476         struct net *net = sock_net(skb->sk);
1477         struct tcmsg *tcm = nlmsg_data(n);
1478         struct nlattr *tca[TCA_MAX + 1];
1479         struct net_device *dev;
1480         struct Qdisc *q = NULL;
1481         const struct Qdisc_class_ops *cops;
1482         unsigned long cl = 0;
1483         unsigned long new_cl;
1484         u32 portid;
1485         u32 clid;
1486         u32 qid;
1487         int err;
1488
1489         if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN))
1490                 return -EPERM;
1491
1492         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1493         if (err < 0)
1494                 return err;
1495
1496         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1497         if (!dev)
1498                 return -ENODEV;
1499
1500         /*
1501            parent == TC_H_UNSPEC - unspecified parent.
1502            parent == TC_H_ROOT   - class is root, which has no parent.
1503            parent == X:0         - parent is root class.
1504            parent == X:Y         - parent is a node in hierarchy.
1505            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1506
1507            handle == 0:0         - generate handle from kernel pool.
1508            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1509            handle == X:Y         - clear.
1510            handle == X:0         - root class.
1511          */
1512
1513         /* Step 1. Determine qdisc handle X:0 */
1514
1515         portid = tcm->tcm_parent;
1516         clid = tcm->tcm_handle;
1517         qid = TC_H_MAJ(clid);
1518
1519         if (portid != TC_H_ROOT) {
1520                 u32 qid1 = TC_H_MAJ(portid);
1521
1522                 if (qid && qid1) {
1523                         /* If both majors are known, they must be identical. */
1524                         if (qid != qid1)
1525                                 return -EINVAL;
1526                 } else if (qid1) {
1527                         qid = qid1;
1528                 } else if (qid == 0)
1529                         qid = dev->qdisc->handle;
1530
1531                 /* Now qid is genuine qdisc handle consistent
1532                  * both with parent and child.
1533                  *
1534                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
1535                  */
1536                 if (portid)
1537                         portid = TC_H_MAKE(qid, portid);
1538         } else {
1539                 if (qid == 0)
1540                         qid = dev->qdisc->handle;
1541         }
1542
1543         /* OK. Locate qdisc */
1544         q = qdisc_lookup(dev, qid);
1545         if (!q)
1546                 return -ENOENT;
1547
1548         /* An check that it supports classes */
1549         cops = q->ops->cl_ops;
1550         if (cops == NULL)
1551                 return -EINVAL;
1552
1553         /* Now try to get class */
1554         if (clid == 0) {
1555                 if (portid == TC_H_ROOT)
1556                         clid = qid;
1557         } else
1558                 clid = TC_H_MAKE(qid, clid);
1559
1560         if (clid)
1561                 cl = cops->get(q, clid);
1562
1563         if (cl == 0) {
1564                 err = -ENOENT;
1565                 if (n->nlmsg_type != RTM_NEWTCLASS ||
1566                     !(n->nlmsg_flags & NLM_F_CREATE))
1567                         goto out;
1568         } else {
1569                 switch (n->nlmsg_type) {
1570                 case RTM_NEWTCLASS:
1571                         err = -EEXIST;
1572                         if (n->nlmsg_flags & NLM_F_EXCL)
1573                                 goto out;
1574                         break;
1575                 case RTM_DELTCLASS:
1576                         err = -EOPNOTSUPP;
1577                         if (cops->delete)
1578                                 err = cops->delete(q, cl);
1579                         if (err == 0)
1580                                 tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1581                         goto out;
1582                 case RTM_GETTCLASS:
1583                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1584                         goto out;
1585                 default:
1586                         err = -EINVAL;
1587                         goto out;
1588                 }
1589         }
1590
1591         new_cl = cl;
1592         err = -EOPNOTSUPP;
1593         if (cops->change)
1594                 err = cops->change(q, clid, portid, tca, &new_cl);
1595         if (err == 0)
1596                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1597
1598 out:
1599         if (cl)
1600                 cops->put(q, cl);
1601
1602         return err;
1603 }
1604
1605
1606 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1607                           unsigned long cl,
1608                           u32 portid, u32 seq, u16 flags, int event)
1609 {
1610         struct tcmsg *tcm;
1611         struct nlmsghdr  *nlh;
1612         unsigned char *b = skb_tail_pointer(skb);
1613         struct gnet_dump d;
1614         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1615
1616         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1617         if (!nlh)
1618                 goto out_nlmsg_trim;
1619         tcm = nlmsg_data(nlh);
1620         tcm->tcm_family = AF_UNSPEC;
1621         tcm->tcm__pad1 = 0;
1622         tcm->tcm__pad2 = 0;
1623         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1624         tcm->tcm_parent = q->handle;
1625         tcm->tcm_handle = q->handle;
1626         tcm->tcm_info = 0;
1627         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1628                 goto nla_put_failure;
1629         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1630                 goto nla_put_failure;
1631
1632         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1633                                          qdisc_root_sleeping_lock(q), &d) < 0)
1634                 goto nla_put_failure;
1635
1636         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1637                 goto nla_put_failure;
1638
1639         if (gnet_stats_finish_copy(&d) < 0)
1640                 goto nla_put_failure;
1641
1642         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1643         return skb->len;
1644
1645 out_nlmsg_trim:
1646 nla_put_failure:
1647         nlmsg_trim(skb, b);
1648         return -1;
1649 }
1650
1651 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1652                          struct nlmsghdr *n, struct Qdisc *q,
1653                          unsigned long cl, int event)
1654 {
1655         struct sk_buff *skb;
1656         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1657
1658         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1659         if (!skb)
1660                 return -ENOBUFS;
1661
1662         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1663                 kfree_skb(skb);
1664                 return -EINVAL;
1665         }
1666
1667         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1668                               n->nlmsg_flags & NLM_F_ECHO);
1669 }
1670
1671 struct qdisc_dump_args {
1672         struct qdisc_walker     w;
1673         struct sk_buff          *skb;
1674         struct netlink_callback *cb;
1675 };
1676
1677 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1678 {
1679         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1680
1681         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1682                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1683 }
1684
1685 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1686                                 struct tcmsg *tcm, struct netlink_callback *cb,
1687                                 int *t_p, int s_t)
1688 {
1689         struct qdisc_dump_args arg;
1690
1691         if (tc_qdisc_dump_ignore(q) ||
1692             *t_p < s_t || !q->ops->cl_ops ||
1693             (tcm->tcm_parent &&
1694              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1695                 (*t_p)++;
1696                 return 0;
1697         }
1698         if (*t_p > s_t)
1699                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1700         arg.w.fn = qdisc_class_dump;
1701         arg.skb = skb;
1702         arg.cb = cb;
1703         arg.w.stop  = 0;
1704         arg.w.skip = cb->args[1];
1705         arg.w.count = 0;
1706         q->ops->cl_ops->walk(q, &arg.w);
1707         cb->args[1] = arg.w.count;
1708         if (arg.w.stop)
1709                 return -1;
1710         (*t_p)++;
1711         return 0;
1712 }
1713
1714 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1715                                struct tcmsg *tcm, struct netlink_callback *cb,
1716                                int *t_p, int s_t)
1717 {
1718         struct Qdisc *q;
1719
1720         if (!root)
1721                 return 0;
1722
1723         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1724                 return -1;
1725
1726         list_for_each_entry(q, &root->list, list) {
1727                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1728                         return -1;
1729         }
1730
1731         return 0;
1732 }
1733
1734 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1735 {
1736         struct tcmsg *tcm = nlmsg_data(cb->nlh);
1737         struct net *net = sock_net(skb->sk);
1738         struct netdev_queue *dev_queue;
1739         struct net_device *dev;
1740         int t, s_t;
1741
1742         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
1743                 return 0;
1744         dev = dev_get_by_index(net, tcm->tcm_ifindex);
1745         if (!dev)
1746                 return 0;
1747
1748         s_t = cb->args[0];
1749         t = 0;
1750
1751         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1752                 goto done;
1753
1754         dev_queue = dev_ingress_queue(dev);
1755         if (dev_queue &&
1756             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1757                                 &t, s_t) < 0)
1758                 goto done;
1759
1760 done:
1761         cb->args[0] = t;
1762
1763         dev_put(dev);
1764         return skb->len;
1765 }
1766
1767 /* Main classifier routine: scans classifier chain attached
1768  * to this qdisc, (optionally) tests for protocol and asks
1769  * specific classifiers.
1770  */
1771 int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1772                        struct tcf_result *res)
1773 {
1774         __be16 protocol = skb->protocol;
1775         int err;
1776
1777         for (; tp; tp = tp->next) {
1778                 if (tp->protocol != protocol &&
1779                     tp->protocol != htons(ETH_P_ALL))
1780                         continue;
1781                 err = tp->classify(skb, tp, res);
1782
1783                 if (err >= 0) {
1784 #ifdef CONFIG_NET_CLS_ACT
1785                         if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1786                                 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1787 #endif
1788                         return err;
1789                 }
1790         }
1791         return -1;
1792 }
1793 EXPORT_SYMBOL(tc_classify_compat);
1794
/*
 * Classify @skb against the chain @tp.
 *
 * Without CONFIG_NET_CLS_ACT this is just tc_classify_compat().
 * With it, a TC_ACT_RECLASSIFY result restarts classification from the
 * same chain head.  The restart depth is kept in the verdict field of
 * skb->tc_verd; once it has reached MAX_REC_LOOP a rate-limited notice
 * is logged and TC_ACT_SHOT is returned instead.
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
                struct tcf_result *res)
{
#ifdef CONFIG_NET_CLS_ACT
        for (;;) {
                u32 depth;
                int verdict = tc_classify_compat(skb, tp, res);

                if (verdict != TC_ACT_RECLASSIFY)
                        return verdict;

                depth = G_TC_VERD(skb->tc_verd);
                if (depth >= MAX_REC_LOOP) {
                        net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
                                               tp->q->ops->id,
                                               tp->prio & 0xffff,
                                               ntohs(tp->protocol));
                        return TC_ACT_SHOT;
                }

                /* Record one more pass before going round again. */
                depth++;
                skb->tc_verd = SET_TC_VERD(skb->tc_verd, depth);
        }
#else
        return tc_classify_compat(skb, tp, res);
#endif
}
EXPORT_SYMBOL(tc_classify);
1824
1825 void tcf_destroy(struct tcf_proto *tp)
1826 {
1827         tp->ops->destroy(tp);
1828         module_put(tp->ops->owner);
1829         kfree(tp);
1830 }
1831
1832 void tcf_destroy_chain(struct tcf_proto **fl)
1833 {
1834         struct tcf_proto *tp;
1835
1836         while ((tp = *fl) != NULL) {
1837                 *fl = tp->next;
1838                 tcf_destroy(tp);
1839         }
1840 }
1841 EXPORT_SYMBOL(tcf_destroy_chain);
1842
1843 #ifdef CONFIG_PROC_FS
1844 static int psched_show(struct seq_file *seq, void *v)
1845 {
1846         struct timespec ts;
1847
1848         hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1849         seq_printf(seq, "%08x %08x %08x %08x\n",
1850                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1851                    1000000,
1852                    (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1853
1854         return 0;
1855 }
1856
1857 static int psched_open(struct inode *inode, struct file *file)
1858 {
1859         return single_open(file, psched_show, NULL);
1860 }
1861
1862 static const struct file_operations psched_fops = {
1863         .owner = THIS_MODULE,
1864         .open = psched_open,
1865         .read  = seq_read,
1866         .llseek = seq_lseek,
1867         .release = single_release,
1868 };
1869
1870 static int __net_init psched_net_init(struct net *net)
1871 {
1872         struct proc_dir_entry *e;
1873
1874         e = proc_create("psched", 0, net->proc_net, &psched_fops);
1875         if (e == NULL)
1876                 return -ENOMEM;
1877
1878         return 0;
1879 }
1880
1881 static void __net_exit psched_net_exit(struct net *net)
1882 {
1883         remove_proc_entry("psched", net->proc_net);
1884 }
1885 #else
1886 static int __net_init psched_net_init(struct net *net)
1887 {
1888         return 0;
1889 }
1890
1891 static void __net_exit psched_net_exit(struct net *net)
1892 {
1893 }
1894 #endif
1895
1896 static struct pernet_operations psched_net_ops = {
1897         .init = psched_net_init,
1898         .exit = psched_net_exit,
1899 };
1900
1901 static int __init pktsched_init(void)
1902 {
1903         int err;
1904
1905         err = register_pernet_subsys(&psched_net_ops);
1906         if (err) {
1907                 pr_err("pktsched_init: "
1908                        "cannot initialize per netns operations\n");
1909                 return err;
1910         }
1911
1912         register_qdisc(&pfifo_fast_ops);
1913         register_qdisc(&pfifo_qdisc_ops);
1914         register_qdisc(&bfifo_qdisc_ops);
1915         register_qdisc(&pfifo_head_drop_qdisc_ops);
1916         register_qdisc(&mq_qdisc_ops);
1917
1918         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1919         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1920         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1921         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1922         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1923         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1924
1925         return 0;
1926 }
1927
1928 subsys_initcall(pktsched_init);