0a82653efc88e726b610a249ead90809c91af0b2
[linux-drm-fsl-dcu.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60 #include <net/nexthop.h>
61
62 #include <asm/uaccess.h>
63
64 #ifdef CONFIG_SYSCTL
65 #include <linux/sysctl.h>
66 #endif
67
/* Result of the neighbour reachability check used while scoring a route.
 * Every failure value is negative; rt6_score_route() passes negative
 * results straight back to its caller, so the sign matters.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD	= -3,
	RT6_NUD_FAIL_PROBE	= -2,
	RT6_NUD_FAIL_DO_RR	= -1,
	RT6_NUD_SUCCEED		= 1,
};
74
75 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
76 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int      ip6_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
87 static int              ip6_pkt_prohibit(struct sk_buff *skb);
88 static int              ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
89 static void             ip6_link_failure(struct sk_buff *skb);
90 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
91                                            struct sk_buff *skb, u32 mtu);
92 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
93                                         struct sk_buff *skb);
94 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
95 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
96
97 #ifdef CONFIG_IPV6_ROUTE_INFO
98 static struct rt6_info *rt6_add_route_info(struct net *net,
99                                            const struct in6_addr *prefix, int prefixlen,
100                                            const struct in6_addr *gwaddr, int ifindex,
101                                            unsigned int pref);
102 static struct rt6_info *rt6_get_route_info(struct net *net,
103                                            const struct in6_addr *prefix, int prefixlen,
104                                            const struct in6_addr *gwaddr, int ifindex);
105 #endif
106
107 struct uncached_list {
108         spinlock_t              lock;
109         struct list_head        head;
110 };
111
112 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
113
114 static void rt6_uncached_list_add(struct rt6_info *rt)
115 {
116         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
117
118         rt->dst.flags |= DST_NOCACHE;
119         rt->rt6i_uncached_list = ul;
120
121         spin_lock_bh(&ul->lock);
122         list_add_tail(&rt->rt6i_uncached, &ul->head);
123         spin_unlock_bh(&ul->lock);
124 }
125
126 static void rt6_uncached_list_del(struct rt6_info *rt)
127 {
128         if (!list_empty(&rt->rt6i_uncached)) {
129                 struct uncached_list *ul = rt->rt6i_uncached_list;
130
131                 spin_lock_bh(&ul->lock);
132                 list_del(&rt->rt6i_uncached);
133                 spin_unlock_bh(&ul->lock);
134         }
135 }
136
137 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
138 {
139         struct net_device *loopback_dev = net->loopback_dev;
140         int cpu;
141
142         for_each_possible_cpu(cpu) {
143                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
144                 struct rt6_info *rt;
145
146                 spin_lock_bh(&ul->lock);
147                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
148                         struct inet6_dev *rt_idev = rt->rt6i_idev;
149                         struct net_device *rt_dev = rt->dst.dev;
150
151                         if (rt_idev && (rt_idev->dev == dev || !dev) &&
152                             rt_idev->dev != loopback_dev) {
153                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
154                                 in6_dev_put(rt_idev);
155                         }
156
157                         if (rt_dev && (rt_dev == dev || !dev) &&
158                             rt_dev != loopback_dev) {
159                                 rt->dst.dev = loopback_dev;
160                                 dev_hold(rt->dst.dev);
161                                 dev_put(rt_dev);
162                         }
163                 }
164                 spin_unlock_bh(&ul->lock);
165         }
166 }
167
168 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
169 {
170         return dst_metrics_write_ptr(rt->dst.from);
171 }
172
173 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
174 {
175         struct rt6_info *rt = (struct rt6_info *)dst;
176
177         if (rt->rt6i_flags & RTF_PCPU)
178                 return rt6_pcpu_cow_metrics(rt);
179         else if (rt->rt6i_flags & RTF_CACHE)
180                 return NULL;
181         else
182                 return dst_cow_metrics_generic(dst, old);
183 }
184
185 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
186                                              struct sk_buff *skb,
187                                              const void *daddr)
188 {
189         struct in6_addr *p = &rt->rt6i_gateway;
190
191         if (!ipv6_addr_any(p))
192                 return (const void *) p;
193         else if (skb)
194                 return &ipv6_hdr(skb)->daddr;
195         return daddr;
196 }
197
198 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
199                                           struct sk_buff *skb,
200                                           const void *daddr)
201 {
202         struct rt6_info *rt = (struct rt6_info *) dst;
203         struct neighbour *n;
204
205         daddr = choose_neigh_daddr(rt, skb, daddr);
206         n = __ipv6_neigh_lookup(dst->dev, daddr);
207         if (n)
208                 return n;
209         return neigh_create(&nd_tbl, daddr, dst->dev);
210 }
211
212 static struct dst_ops ip6_dst_ops_template = {
213         .family                 =       AF_INET6,
214         .gc                     =       ip6_dst_gc,
215         .gc_thresh              =       1024,
216         .check                  =       ip6_dst_check,
217         .default_advmss         =       ip6_default_advmss,
218         .mtu                    =       ip6_mtu,
219         .cow_metrics            =       ipv6_cow_metrics,
220         .destroy                =       ip6_dst_destroy,
221         .ifdown                 =       ip6_dst_ifdown,
222         .negative_advice        =       ip6_negative_advice,
223         .link_failure           =       ip6_link_failure,
224         .update_pmtu            =       ip6_rt_update_pmtu,
225         .redirect               =       rt6_do_redirect,
226         .local_out              =       __ip6_local_out,
227         .neigh_lookup           =       ip6_neigh_lookup,
228 };
229
230 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
231 {
232         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
233
234         return mtu ? : dst->dev->mtu;
235 }
236
237 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
238                                          struct sk_buff *skb, u32 mtu)
239 {
240 }
241
/* dst_ops->redirect handler for blackhole dsts: intentionally a no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst,
				      struct sock *sk,
				      struct sk_buff *skb)
{
	/* nothing to do */
}
246
247 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
248                                          unsigned long old)
249 {
250         return NULL;
251 }
252
253 static struct dst_ops ip6_dst_blackhole_ops = {
254         .family                 =       AF_INET6,
255         .destroy                =       ip6_dst_destroy,
256         .check                  =       ip6_dst_check,
257         .mtu                    =       ip6_blackhole_mtu,
258         .default_advmss         =       ip6_default_advmss,
259         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
260         .redirect               =       ip6_rt_blackhole_redirect,
261         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
262         .neigh_lookup           =       ip6_neigh_lookup,
263 };
264
265 static const u32 ip6_template_metrics[RTAX_MAX] = {
266         [RTAX_HOPLIMIT - 1] = 0,
267 };
268
269 static const struct rt6_info ip6_null_entry_template = {
270         .dst = {
271                 .__refcnt       = ATOMIC_INIT(1),
272                 .__use          = 1,
273                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
274                 .error          = -ENETUNREACH,
275                 .input          = ip6_pkt_discard,
276                 .output         = ip6_pkt_discard_out,
277         },
278         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
279         .rt6i_protocol  = RTPROT_KERNEL,
280         .rt6i_metric    = ~(u32) 0,
281         .rt6i_ref       = ATOMIC_INIT(1),
282 };
283
284 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
285
286 static const struct rt6_info ip6_prohibit_entry_template = {
287         .dst = {
288                 .__refcnt       = ATOMIC_INIT(1),
289                 .__use          = 1,
290                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
291                 .error          = -EACCES,
292                 .input          = ip6_pkt_prohibit,
293                 .output         = ip6_pkt_prohibit_out,
294         },
295         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
296         .rt6i_protocol  = RTPROT_KERNEL,
297         .rt6i_metric    = ~(u32) 0,
298         .rt6i_ref       = ATOMIC_INIT(1),
299 };
300
301 static const struct rt6_info ip6_blk_hole_entry_template = {
302         .dst = {
303                 .__refcnt       = ATOMIC_INIT(1),
304                 .__use          = 1,
305                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
306                 .error          = -EINVAL,
307                 .input          = dst_discard,
308                 .output         = dst_discard_sk,
309         },
310         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
311         .rt6i_protocol  = RTPROT_KERNEL,
312         .rt6i_metric    = ~(u32) 0,
313         .rt6i_ref       = ATOMIC_INIT(1),
314 };
315
316 #endif
317
318 /* allocate dst with ip6_dst_ops */
319 static struct rt6_info *__ip6_dst_alloc(struct net *net,
320                                         struct net_device *dev,
321                                         int flags)
322 {
323         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
324                                         0, DST_OBSOLETE_FORCE_CHK, flags);
325
326         if (rt) {
327                 struct dst_entry *dst = &rt->dst;
328
329                 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
330                 INIT_LIST_HEAD(&rt->rt6i_siblings);
331                 INIT_LIST_HEAD(&rt->rt6i_uncached);
332         }
333         return rt;
334 }
335
336 static struct rt6_info *ip6_dst_alloc(struct net *net,
337                                       struct net_device *dev,
338                                       int flags)
339 {
340         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
341
342         if (rt) {
343                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
344                 if (rt->rt6i_pcpu) {
345                         int cpu;
346
347                         for_each_possible_cpu(cpu) {
348                                 struct rt6_info **p;
349
350                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
351                                 /* no one shares rt */
352                                 *p =  NULL;
353                         }
354                 } else {
355                         dst_destroy((struct dst_entry *)rt);
356                         return NULL;
357                 }
358         }
359
360         return rt;
361 }
362
363 static void ip6_dst_destroy(struct dst_entry *dst)
364 {
365         struct rt6_info *rt = (struct rt6_info *)dst;
366         struct dst_entry *from = dst->from;
367         struct inet6_dev *idev;
368
369         dst_destroy_metrics_generic(dst);
370         free_percpu(rt->rt6i_pcpu);
371         rt6_uncached_list_del(rt);
372
373         idev = rt->rt6i_idev;
374         if (idev) {
375                 rt->rt6i_idev = NULL;
376                 in6_dev_put(idev);
377         }
378
379         dst->from = NULL;
380         dst_release(from);
381 }
382
383 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
384                            int how)
385 {
386         struct rt6_info *rt = (struct rt6_info *)dst;
387         struct inet6_dev *idev = rt->rt6i_idev;
388         struct net_device *loopback_dev =
389                 dev_net(dev)->loopback_dev;
390
391         if (dev != loopback_dev) {
392                 if (idev && idev->dev == dev) {
393                         struct inet6_dev *loopback_idev =
394                                 in6_dev_get(loopback_dev);
395                         if (loopback_idev) {
396                                 rt->rt6i_idev = loopback_idev;
397                                 in6_dev_put(idev);
398                         }
399                 }
400         }
401 }
402
403 static bool rt6_check_expired(const struct rt6_info *rt)
404 {
405         if (rt->rt6i_flags & RTF_EXPIRES) {
406                 if (time_after(jiffies, rt->dst.expires))
407                         return true;
408         } else if (rt->dst.from) {
409                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
410         }
411         return false;
412 }
413
414 /* Multipath route selection:
415  *   Hash based function using packet header and flowlabel.
416  * Adapted from fib_info_hashfn()
417  */
418 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
419                                const struct flowi6 *fl6)
420 {
421         unsigned int val = fl6->flowi6_proto;
422
423         val ^= ipv6_addr_hash(&fl6->daddr);
424         val ^= ipv6_addr_hash(&fl6->saddr);
425
426         /* Work only if this not encapsulated */
427         switch (fl6->flowi6_proto) {
428         case IPPROTO_UDP:
429         case IPPROTO_TCP:
430         case IPPROTO_SCTP:
431                 val ^= (__force u16)fl6->fl6_sport;
432                 val ^= (__force u16)fl6->fl6_dport;
433                 break;
434
435         case IPPROTO_ICMPV6:
436                 val ^= (__force u16)fl6->fl6_icmp_type;
437                 val ^= (__force u16)fl6->fl6_icmp_code;
438                 break;
439         }
440         /* RFC6438 recommands to use flowlabel */
441         val ^= (__force u32)fl6->flowlabel;
442
443         /* Perhaps, we need to tune, this function? */
444         val = val ^ (val >> 7) ^ (val >> 12);
445         return val % candidate_count;
446 }
447
448 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
449                                              struct flowi6 *fl6, int oif,
450                                              int strict)
451 {
452         struct rt6_info *sibling, *next_sibling;
453         int route_choosen;
454
455         route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
456         /* Don't change the route, if route_choosen == 0
457          * (siblings does not include ourself)
458          */
459         if (route_choosen)
460                 list_for_each_entry_safe(sibling, next_sibling,
461                                 &match->rt6i_siblings, rt6i_siblings) {
462                         route_choosen--;
463                         if (route_choosen == 0) {
464                                 if (rt6_score_route(sibling, oif, strict) < 0)
465                                         break;
466                                 match = sibling;
467                                 break;
468                         }
469                 }
470         return match;
471 }
472
473 /*
474  *      Route lookup. Any table->tb6_lock is implied.
475  */
476
477 static inline struct rt6_info *rt6_device_match(struct net *net,
478                                                     struct rt6_info *rt,
479                                                     const struct in6_addr *saddr,
480                                                     int oif,
481                                                     int flags)
482 {
483         struct rt6_info *local = NULL;
484         struct rt6_info *sprt;
485
486         if (!oif && ipv6_addr_any(saddr))
487                 goto out;
488
489         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
490                 struct net_device *dev = sprt->dst.dev;
491
492                 if (oif) {
493                         if (dev->ifindex == oif)
494                                 return sprt;
495                         if (dev->flags & IFF_LOOPBACK) {
496                                 if (!sprt->rt6i_idev ||
497                                     sprt->rt6i_idev->dev->ifindex != oif) {
498                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
499                                                 continue;
500                                         if (local && (!oif ||
501                                                       local->rt6i_idev->dev->ifindex == oif))
502                                                 continue;
503                                 }
504                                 local = sprt;
505                         }
506                 } else {
507                         if (ipv6_chk_addr(net, saddr, dev,
508                                           flags & RT6_LOOKUP_F_IFACE))
509                                 return sprt;
510                 }
511         }
512
513         if (oif) {
514                 if (local)
515                         return local;
516
517                 if (flags & RT6_LOOKUP_F_IFACE)
518                         return net->ipv6.ip6_null_entry;
519         }
520 out:
521         return rt;
522 }
523
524 #ifdef CONFIG_IPV6_ROUTER_PREF
525 struct __rt6_probe_work {
526         struct work_struct work;
527         struct in6_addr target;
528         struct net_device *dev;
529 };
530
531 static void rt6_probe_deferred(struct work_struct *w)
532 {
533         struct in6_addr mcaddr;
534         struct __rt6_probe_work *work =
535                 container_of(w, struct __rt6_probe_work, work);
536
537         addrconf_addr_solict_mult(&work->target, &mcaddr);
538         ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
539         dev_put(work->dev);
540         kfree(work);
541 }
542
543 static void rt6_probe(struct rt6_info *rt)
544 {
545         struct neighbour *neigh;
546         /*
547          * Okay, this does not seem to be appropriate
548          * for now, however, we need to check if it
549          * is really so; aka Router Reachability Probing.
550          *
551          * Router Reachability Probe MUST be rate-limited
552          * to no more than one per minute.
553          */
554         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
555                 return;
556         rcu_read_lock_bh();
557         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
558         if (neigh) {
559                 write_lock(&neigh->lock);
560                 if (neigh->nud_state & NUD_VALID)
561                         goto out;
562         }
563
564         if (!neigh ||
565             time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
566                 struct __rt6_probe_work *work;
567
568                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
569
570                 if (neigh && work)
571                         __neigh_set_probe_once(neigh);
572
573                 if (neigh)
574                         write_unlock(&neigh->lock);
575
576                 if (work) {
577                         INIT_WORK(&work->work, rt6_probe_deferred);
578                         work->target = rt->rt6i_gateway;
579                         dev_hold(rt->dst.dev);
580                         work->dev = rt->dst.dev;
581                         schedule_work(&work->work);
582                 }
583         } else {
584 out:
585                 write_unlock(&neigh->lock);
586         }
587         rcu_read_unlock_bh();
588 }
589 #else
/* Without CONFIG_IPV6_ROUTER_PREF router probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
	/* nothing to do */
}
593 #endif
594
595 /*
596  * Default Router Selection (RFC 2461 6.3.6)
597  */
598 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
599 {
600         struct net_device *dev = rt->dst.dev;
601         if (!oif || dev->ifindex == oif)
602                 return 2;
603         if ((dev->flags & IFF_LOOPBACK) &&
604             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
605                 return 1;
606         return 0;
607 }
608
609 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
610 {
611         struct neighbour *neigh;
612         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
613
614         if (rt->rt6i_flags & RTF_NONEXTHOP ||
615             !(rt->rt6i_flags & RTF_GATEWAY))
616                 return RT6_NUD_SUCCEED;
617
618         rcu_read_lock_bh();
619         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
620         if (neigh) {
621                 read_lock(&neigh->lock);
622                 if (neigh->nud_state & NUD_VALID)
623                         ret = RT6_NUD_SUCCEED;
624 #ifdef CONFIG_IPV6_ROUTER_PREF
625                 else if (!(neigh->nud_state & NUD_FAILED))
626                         ret = RT6_NUD_SUCCEED;
627                 else
628                         ret = RT6_NUD_FAIL_PROBE;
629 #endif
630                 read_unlock(&neigh->lock);
631         } else {
632                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
633                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
634         }
635         rcu_read_unlock_bh();
636
637         return ret;
638 }
639
640 static int rt6_score_route(struct rt6_info *rt, int oif,
641                            int strict)
642 {
643         int m;
644
645         m = rt6_check_dev(rt, oif);
646         if (!m && (strict & RT6_LOOKUP_F_IFACE))
647                 return RT6_NUD_FAIL_HARD;
648 #ifdef CONFIG_IPV6_ROUTER_PREF
649         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
650 #endif
651         if (strict & RT6_LOOKUP_F_REACHABLE) {
652                 int n = rt6_check_neigh(rt);
653                 if (n < 0)
654                         return n;
655         }
656         return m;
657 }
658
659 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
660                                    int *mpri, struct rt6_info *match,
661                                    bool *do_rr)
662 {
663         int m;
664         bool match_do_rr = false;
665
666         if (rt6_check_expired(rt))
667                 goto out;
668
669         m = rt6_score_route(rt, oif, strict);
670         if (m == RT6_NUD_FAIL_DO_RR) {
671                 match_do_rr = true;
672                 m = 0; /* lowest valid score */
673         } else if (m == RT6_NUD_FAIL_HARD) {
674                 goto out;
675         }
676
677         if (strict & RT6_LOOKUP_F_REACHABLE)
678                 rt6_probe(rt);
679
680         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
681         if (m > *mpri) {
682                 *do_rr = match_do_rr;
683                 *mpri = m;
684                 match = rt;
685         }
686 out:
687         return match;
688 }
689
690 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
691                                      struct rt6_info *rr_head,
692                                      u32 metric, int oif, int strict,
693                                      bool *do_rr)
694 {
695         struct rt6_info *rt, *match, *cont;
696         int mpri = -1;
697
698         match = NULL;
699         cont = NULL;
700         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
701                 if (rt->rt6i_metric != metric) {
702                         cont = rt;
703                         break;
704                 }
705
706                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
707         }
708
709         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
710                 if (rt->rt6i_metric != metric) {
711                         cont = rt;
712                         break;
713                 }
714
715                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
716         }
717
718         if (match || !cont)
719                 return match;
720
721         for (rt = cont; rt; rt = rt->dst.rt6_next)
722                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
723
724         return match;
725 }
726
727 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
728 {
729         struct rt6_info *match, *rt0;
730         struct net *net;
731         bool do_rr = false;
732
733         rt0 = fn->rr_ptr;
734         if (!rt0)
735                 fn->rr_ptr = rt0 = fn->leaf;
736
737         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
738                              &do_rr);
739
740         if (do_rr) {
741                 struct rt6_info *next = rt0->dst.rt6_next;
742
743                 /* no entries matched; do round-robin */
744                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
745                         next = fn->leaf;
746
747                 if (next != rt0)
748                         fn->rr_ptr = next;
749         }
750
751         net = dev_net(rt0->dst.dev);
752         return match ? match : net->ipv6.ip6_null_entry;
753 }
754
755 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
756 {
757         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
758 }
759
760 #ifdef CONFIG_IPV6_ROUTE_INFO
761 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
762                   const struct in6_addr *gwaddr)
763 {
764         struct net *net = dev_net(dev);
765         struct route_info *rinfo = (struct route_info *) opt;
766         struct in6_addr prefix_buf, *prefix;
767         unsigned int pref;
768         unsigned long lifetime;
769         struct rt6_info *rt;
770
771         if (len < sizeof(struct route_info)) {
772                 return -EINVAL;
773         }
774
775         /* Sanity check for prefix_len and length */
776         if (rinfo->length > 3) {
777                 return -EINVAL;
778         } else if (rinfo->prefix_len > 128) {
779                 return -EINVAL;
780         } else if (rinfo->prefix_len > 64) {
781                 if (rinfo->length < 2) {
782                         return -EINVAL;
783                 }
784         } else if (rinfo->prefix_len > 0) {
785                 if (rinfo->length < 1) {
786                         return -EINVAL;
787                 }
788         }
789
790         pref = rinfo->route_pref;
791         if (pref == ICMPV6_ROUTER_PREF_INVALID)
792                 return -EINVAL;
793
794         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
795
796         if (rinfo->length == 3)
797                 prefix = (struct in6_addr *)rinfo->prefix;
798         else {
799                 /* this function is safe */
800                 ipv6_addr_prefix(&prefix_buf,
801                                  (struct in6_addr *)rinfo->prefix,
802                                  rinfo->prefix_len);
803                 prefix = &prefix_buf;
804         }
805
806         if (rinfo->prefix_len == 0)
807                 rt = rt6_get_dflt_router(gwaddr, dev);
808         else
809                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
810                                         gwaddr, dev->ifindex);
811
812         if (rt && !lifetime) {
813                 ip6_del_rt(rt);
814                 rt = NULL;
815         }
816
817         if (!rt && lifetime)
818                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
819                                         pref);
820         else if (rt)
821                 rt->rt6i_flags = RTF_ROUTEINFO |
822                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
823
824         if (rt) {
825                 if (!addrconf_finite_timeout(lifetime))
826                         rt6_clean_expires(rt);
827                 else
828                         rt6_set_expires(rt, jiffies + HZ * lifetime);
829
830                 ip6_rt_put(rt);
831         }
832         return 0;
833 }
834 #endif
835
836 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
837                                         struct in6_addr *saddr)
838 {
839         struct fib6_node *pn;
840         while (1) {
841                 if (fn->fn_flags & RTN_TL_ROOT)
842                         return NULL;
843                 pn = fn->parent;
844                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
845                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
846                 else
847                         fn = pn;
848                 if (fn->fn_flags & RTN_RTINFO)
849                         return fn;
850         }
851 }
852
853 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
854                                              struct fib6_table *table,
855                                              struct flowi6 *fl6, int flags)
856 {
857         struct fib6_node *fn;
858         struct rt6_info *rt;
859
860         read_lock_bh(&table->tb6_lock);
861         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
862 restart:
863         rt = fn->leaf;
864         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
865         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
866                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
867         if (rt == net->ipv6.ip6_null_entry) {
868                 fn = fib6_backtrack(fn, &fl6->saddr);
869                 if (fn)
870                         goto restart;
871         }
872         dst_use(&rt->dst, jiffies);
873         read_unlock_bh(&table->tb6_lock);
874         return rt;
875
876 }
877
878 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
879                                     int flags)
880 {
881         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
882 }
883 EXPORT_SYMBOL_GPL(ip6_route_lookup);
884
885 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
886                             const struct in6_addr *saddr, int oif, int strict)
887 {
888         struct flowi6 fl6 = {
889                 .flowi6_oif = oif,
890                 .daddr = *daddr,
891         };
892         struct dst_entry *dst;
893         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
894
895         if (saddr) {
896                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
897                 flags |= RT6_LOOKUP_F_HAS_SADDR;
898         }
899
900         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
901         if (dst->error == 0)
902                 return (struct rt6_info *) dst;
903
904         dst_release(dst);
905
906         return NULL;
907 }
908 EXPORT_SYMBOL(rt6_lookup);
909
910 /* ip6_ins_rt is called with FREE table->tb6_lock.
911    It takes new route entry, the addition fails by any reason the
912    route is freed. In any case, if caller does not hold it, it may
913    be destroyed.
914  */
915
916 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
917                         struct mx6_config *mxc)
918 {
919         int err;
920         struct fib6_table *table;
921
922         table = rt->rt6i_table;
923         write_lock_bh(&table->tb6_lock);
924         err = fib6_add(&table->tb6_root, rt, info, mxc);
925         write_unlock_bh(&table->tb6_lock);
926
927         return err;
928 }
929
930 int ip6_ins_rt(struct rt6_info *rt)
931 {
932         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
933         struct mx6_config mxc = { .mx = NULL, };
934
935         return __ip6_ins_rt(rt, &info, &mxc);
936 }
937
938 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
939                                            const struct in6_addr *daddr,
940                                            const struct in6_addr *saddr)
941 {
942         struct rt6_info *rt;
943
944         /*
945          *      Clone the route.
946          */
947
948         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
949                 ort = (struct rt6_info *)ort->dst.from;
950
951         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
952
953         if (!rt)
954                 return NULL;
955
956         ip6_rt_copy_init(rt, ort);
957         rt->rt6i_flags |= RTF_CACHE;
958         rt->rt6i_metric = 0;
959         rt->dst.flags |= DST_HOST;
960         rt->rt6i_dst.addr = *daddr;
961         rt->rt6i_dst.plen = 128;
962
963         if (!rt6_is_gw_or_nonexthop(ort)) {
964                 if (ort->rt6i_dst.plen != 128 &&
965                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
966                         rt->rt6i_flags |= RTF_ANYCAST;
967 #ifdef CONFIG_IPV6_SUBTREES
968                 if (rt->rt6i_src.plen && saddr) {
969                         rt->rt6i_src.addr = *saddr;
970                         rt->rt6i_src.plen = 128;
971                 }
972 #endif
973         }
974
975         return rt;
976 }
977
978 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
979 {
980         struct rt6_info *pcpu_rt;
981
982         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
983                                   rt->dst.dev, rt->dst.flags);
984
985         if (!pcpu_rt)
986                 return NULL;
987         ip6_rt_copy_init(pcpu_rt, rt);
988         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
989         pcpu_rt->rt6i_flags |= RTF_PCPU;
990         return pcpu_rt;
991 }
992
993 /* It should be called with read_lock_bh(&tb6_lock) acquired */
994 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
995 {
996         struct rt6_info *pcpu_rt, **p;
997
998         p = this_cpu_ptr(rt->rt6i_pcpu);
999         pcpu_rt = *p;
1000
1001         if (pcpu_rt) {
1002                 dst_hold(&pcpu_rt->dst);
1003                 rt6_dst_from_metrics_check(pcpu_rt);
1004         }
1005         return pcpu_rt;
1006 }
1007
1008 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1009 {
1010         struct rt6_info *pcpu_rt, *prev, **p;
1011
1012         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1013         if (!pcpu_rt) {
1014                 struct net *net = dev_net(rt->dst.dev);
1015
1016                 pcpu_rt = net->ipv6.ip6_null_entry;
1017                 goto done;
1018         }
1019
1020         p = this_cpu_ptr(rt->rt6i_pcpu);
1021         prev = cmpxchg(p, NULL, pcpu_rt);
1022         if (prev) {
1023                 /* If someone did it before us, return prev instead */
1024                 dst_destroy(&pcpu_rt->dst);
1025                 pcpu_rt = prev;
1026         }
1027
1028 done:
1029         dst_hold(&pcpu_rt->dst);
1030         rt6_dst_from_metrics_check(pcpu_rt);
1031         return pcpu_rt;
1032 }
1033
1034 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
1035                                       struct flowi6 *fl6, int flags)
1036 {
1037         struct fib6_node *fn, *saved_fn;
1038         struct rt6_info *rt;
1039         int strict = 0;
1040
1041         strict |= flags & RT6_LOOKUP_F_IFACE;
1042         if (net->ipv6.devconf_all->forwarding == 0)
1043                 strict |= RT6_LOOKUP_F_REACHABLE;
1044
1045         read_lock_bh(&table->tb6_lock);
1046
1047         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1048         saved_fn = fn;
1049
1050 redo_rt6_select:
1051         rt = rt6_select(fn, oif, strict);
1052         if (rt->rt6i_nsiblings)
1053                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1054         if (rt == net->ipv6.ip6_null_entry) {
1055                 fn = fib6_backtrack(fn, &fl6->saddr);
1056                 if (fn)
1057                         goto redo_rt6_select;
1058                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1059                         /* also consider unreachable route */
1060                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1061                         fn = saved_fn;
1062                         goto redo_rt6_select;
1063                 }
1064         }
1065
1066
1067         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1068                 dst_use(&rt->dst, jiffies);
1069                 read_unlock_bh(&table->tb6_lock);
1070
1071                 rt6_dst_from_metrics_check(rt);
1072                 return rt;
1073         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1074                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1075                 /* Create a RTF_CACHE clone which will not be
1076                  * owned by the fib6 tree.  It is for the special case where
1077                  * the daddr in the skb during the neighbor look-up is different
1078                  * from the fl6->daddr used to look-up route here.
1079                  */
1080
1081                 struct rt6_info *uncached_rt;
1082
1083                 dst_use(&rt->dst, jiffies);
1084                 read_unlock_bh(&table->tb6_lock);
1085
1086                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1087                 dst_release(&rt->dst);
1088
1089                 if (uncached_rt)
1090                         rt6_uncached_list_add(uncached_rt);
1091                 else
1092                         uncached_rt = net->ipv6.ip6_null_entry;
1093
1094                 dst_hold(&uncached_rt->dst);
1095                 return uncached_rt;
1096
1097         } else {
1098                 /* Get a percpu copy */
1099
1100                 struct rt6_info *pcpu_rt;
1101
1102                 rt->dst.lastuse = jiffies;
1103                 rt->dst.__use++;
1104                 pcpu_rt = rt6_get_pcpu_route(rt);
1105
1106                 if (!pcpu_rt)
1107                         pcpu_rt = rt6_make_pcpu_route(rt);
1108
1109                 read_unlock_bh(&table->tb6_lock);
1110                 return pcpu_rt;
1111         }
1112 }
1113
1114 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1115                                             struct flowi6 *fl6, int flags)
1116 {
1117         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1118 }
1119
1120 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1121                                                 struct net_device *dev,
1122                                                 struct flowi6 *fl6, int flags)
1123 {
1124         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1125                 flags |= RT6_LOOKUP_F_IFACE;
1126
1127         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1128 }
1129
1130 void ip6_route_input(struct sk_buff *skb)
1131 {
1132         const struct ipv6hdr *iph = ipv6_hdr(skb);
1133         struct net *net = dev_net(skb->dev);
1134         int flags = RT6_LOOKUP_F_HAS_SADDR;
1135         struct flowi6 fl6 = {
1136                 .flowi6_iif = skb->dev->ifindex,
1137                 .daddr = iph->daddr,
1138                 .saddr = iph->saddr,
1139                 .flowlabel = ip6_flowinfo(iph),
1140                 .flowi6_mark = skb->mark,
1141                 .flowi6_proto = iph->nexthdr,
1142         };
1143
1144         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1145 }
1146
1147 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1148                                              struct flowi6 *fl6, int flags)
1149 {
1150         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1151 }
1152
1153 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
1154                                     struct flowi6 *fl6)
1155 {
1156         int flags = 0;
1157
1158         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1159
1160         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
1161                 flags |= RT6_LOOKUP_F_IFACE;
1162
1163         if (!ipv6_addr_any(&fl6->saddr))
1164                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1165         else if (sk)
1166                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1167
1168         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1169 }
1170 EXPORT_SYMBOL(ip6_route_output);
1171
1172 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1173 {
1174         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1175         struct dst_entry *new = NULL;
1176
1177         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1178         if (rt) {
1179                 new = &rt->dst;
1180
1181                 memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
1182
1183                 new->__use = 1;
1184                 new->input = dst_discard;
1185                 new->output = dst_discard_sk;
1186
1187                 if (dst_metrics_read_only(&ort->dst))
1188                         new->_metrics = ort->dst._metrics;
1189                 else
1190                         dst_copy_metrics(new, &ort->dst);
1191                 rt->rt6i_idev = ort->rt6i_idev;
1192                 if (rt->rt6i_idev)
1193                         in6_dev_hold(rt->rt6i_idev);
1194
1195                 rt->rt6i_gateway = ort->rt6i_gateway;
1196                 rt->rt6i_flags = ort->rt6i_flags;
1197                 rt->rt6i_metric = 0;
1198
1199                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1200 #ifdef CONFIG_IPV6_SUBTREES
1201                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1202 #endif
1203
1204                 dst_free(new);
1205         }
1206
1207         dst_release(dst_orig);
1208         return new ? new : ERR_PTR(-ENOMEM);
1209 }
1210
1211 /*
1212  *      Destination cache support functions
1213  */
1214
1215 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1216 {
1217         if (rt->dst.from &&
1218             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1219                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1220 }
1221
1222 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1223 {
1224         if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1225                 return NULL;
1226
1227         if (rt6_check_expired(rt))
1228                 return NULL;
1229
1230         return &rt->dst;
1231 }
1232
1233 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1234 {
1235         if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1236             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1237                 return &rt->dst;
1238         else
1239                 return NULL;
1240 }
1241
1242 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1243 {
1244         struct rt6_info *rt;
1245
1246         rt = (struct rt6_info *) dst;
1247
1248         /* All IPV6 dsts are created with ->obsolete set to the value
1249          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1250          * into this function always.
1251          */
1252
1253         rt6_dst_from_metrics_check(rt);
1254
1255         if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
1256                 return rt6_dst_from_check(rt, cookie);
1257         else
1258                 return rt6_check(rt, cookie);
1259 }
1260
1261 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1262 {
1263         struct rt6_info *rt = (struct rt6_info *) dst;
1264
1265         if (rt) {
1266                 if (rt->rt6i_flags & RTF_CACHE) {
1267                         if (rt6_check_expired(rt)) {
1268                                 ip6_del_rt(rt);
1269                                 dst = NULL;
1270                         }
1271                 } else {
1272                         dst_release(dst);
1273                         dst = NULL;
1274                 }
1275         }
1276         return dst;
1277 }
1278
1279 static void ip6_link_failure(struct sk_buff *skb)
1280 {
1281         struct rt6_info *rt;
1282
1283         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1284
1285         rt = (struct rt6_info *) skb_dst(skb);
1286         if (rt) {
1287                 if (rt->rt6i_flags & RTF_CACHE) {
1288                         dst_hold(&rt->dst);
1289                         if (ip6_del_rt(rt))
1290                                 dst_free(&rt->dst);
1291                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1292                         rt->rt6i_node->fn_sernum = -1;
1293                 }
1294         }
1295 }
1296
1297 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1298 {
1299         struct net *net = dev_net(rt->dst.dev);
1300
1301         rt->rt6i_flags |= RTF_MODIFIED;
1302         rt->rt6i_pmtu = mtu;
1303         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1304 }
1305
1306 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1307                                  const struct ipv6hdr *iph, u32 mtu)
1308 {
1309         struct rt6_info *rt6 = (struct rt6_info *)dst;
1310
1311         if (rt6->rt6i_flags & RTF_LOCAL)
1312                 return;
1313
1314         dst_confirm(dst);
1315         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1316         if (mtu >= dst_mtu(dst))
1317                 return;
1318
1319         if (rt6->rt6i_flags & RTF_CACHE) {
1320                 rt6_do_update_pmtu(rt6, mtu);
1321         } else {
1322                 const struct in6_addr *daddr, *saddr;
1323                 struct rt6_info *nrt6;
1324
1325                 if (iph) {
1326                         daddr = &iph->daddr;
1327                         saddr = &iph->saddr;
1328                 } else if (sk) {
1329                         daddr = &sk->sk_v6_daddr;
1330                         saddr = &inet6_sk(sk)->saddr;
1331                 } else {
1332                         return;
1333                 }
1334                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1335                 if (nrt6) {
1336                         rt6_do_update_pmtu(nrt6, mtu);
1337
1338                         /* ip6_ins_rt(nrt6) will bump the
1339                          * rt6->rt6i_node->fn_sernum
1340                          * which will fail the next rt6_check() and
1341                          * invalidate the sk->sk_dst_cache.
1342                          */
1343                         ip6_ins_rt(nrt6);
1344                 }
1345         }
1346 }
1347
1348 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1349                                struct sk_buff *skb, u32 mtu)
1350 {
1351         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1352 }
1353
1354 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1355                      int oif, u32 mark)
1356 {
1357         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1358         struct dst_entry *dst;
1359         struct flowi6 fl6;
1360
1361         memset(&fl6, 0, sizeof(fl6));
1362         fl6.flowi6_oif = oif;
1363         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1364         fl6.daddr = iph->daddr;
1365         fl6.saddr = iph->saddr;
1366         fl6.flowlabel = ip6_flowinfo(iph);
1367
1368         dst = ip6_route_output(net, NULL, &fl6);
1369         if (!dst->error)
1370                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1371         dst_release(dst);
1372 }
1373 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1374
1375 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1376 {
1377         ip6_update_pmtu(skb, sock_net(sk), mtu,
1378                         sk->sk_bound_dev_if, sk->sk_mark);
1379 }
1380 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1381
1382 /* Handle redirects */
1383 struct ip6rd_flowi {
1384         struct flowi6 fl6;
1385         struct in6_addr gateway;
1386 };
1387
1388 static struct rt6_info *__ip6_route_redirect(struct net *net,
1389                                              struct fib6_table *table,
1390                                              struct flowi6 *fl6,
1391                                              int flags)
1392 {
1393         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1394         struct rt6_info *rt;
1395         struct fib6_node *fn;
1396
1397         /* Get the "current" route for this destination and
1398          * check if the redirect has come from approriate router.
1399          *
1400          * RFC 4861 specifies that redirects should only be
1401          * accepted if they come from the nexthop to the target.
1402          * Due to the way the routes are chosen, this notion
1403          * is a bit fuzzy and one might need to check all possible
1404          * routes.
1405          */
1406
1407         read_lock_bh(&table->tb6_lock);
1408         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1409 restart:
1410         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1411                 if (rt6_check_expired(rt))
1412                         continue;
1413                 if (rt->dst.error)
1414                         break;
1415                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1416                         continue;
1417                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1418                         continue;
1419                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1420                         continue;
1421                 break;
1422         }
1423
1424         if (!rt)
1425                 rt = net->ipv6.ip6_null_entry;
1426         else if (rt->dst.error) {
1427                 rt = net->ipv6.ip6_null_entry;
1428                 goto out;
1429         }
1430
1431         if (rt == net->ipv6.ip6_null_entry) {
1432                 fn = fib6_backtrack(fn, &fl6->saddr);
1433                 if (fn)
1434                         goto restart;
1435         }
1436
1437 out:
1438         dst_hold(&rt->dst);
1439
1440         read_unlock_bh(&table->tb6_lock);
1441
1442         return rt;
1443 };
1444
1445 static struct dst_entry *ip6_route_redirect(struct net *net,
1446                                         const struct flowi6 *fl6,
1447                                         const struct in6_addr *gateway)
1448 {
1449         int flags = RT6_LOOKUP_F_HAS_SADDR;
1450         struct ip6rd_flowi rdfl;
1451
1452         rdfl.fl6 = *fl6;
1453         rdfl.gateway = *gateway;
1454
1455         return fib6_rule_lookup(net, &rdfl.fl6,
1456                                 flags, __ip6_route_redirect);
1457 }
1458
1459 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1460 {
1461         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1462         struct dst_entry *dst;
1463         struct flowi6 fl6;
1464
1465         memset(&fl6, 0, sizeof(fl6));
1466         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1467         fl6.flowi6_oif = oif;
1468         fl6.flowi6_mark = mark;
1469         fl6.daddr = iph->daddr;
1470         fl6.saddr = iph->saddr;
1471         fl6.flowlabel = ip6_flowinfo(iph);
1472
1473         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1474         rt6_do_redirect(dst, NULL, skb);
1475         dst_release(dst);
1476 }
1477 EXPORT_SYMBOL_GPL(ip6_redirect);
1478
1479 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1480                             u32 mark)
1481 {
1482         const struct ipv6hdr *iph = ipv6_hdr(skb);
1483         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1484         struct dst_entry *dst;
1485         struct flowi6 fl6;
1486
1487         memset(&fl6, 0, sizeof(fl6));
1488         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1489         fl6.flowi6_oif = oif;
1490         fl6.flowi6_mark = mark;
1491         fl6.daddr = msg->dest;
1492         fl6.saddr = iph->daddr;
1493
1494         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1495         rt6_do_redirect(dst, NULL, skb);
1496         dst_release(dst);
1497 }
1498
1499 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1500 {
1501         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1502 }
1503 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1504
1505 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1506 {
1507         struct net_device *dev = dst->dev;
1508         unsigned int mtu = dst_mtu(dst);
1509         struct net *net = dev_net(dev);
1510
1511         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1512
1513         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1514                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1515
1516         /*
1517          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1518          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1519          * IPV6_MAXPLEN is also valid and means: "any MSS,
1520          * rely only on pmtu discovery"
1521          */
1522         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1523                 mtu = IPV6_MAXPLEN;
1524         return mtu;
1525 }
1526
1527 static unsigned int ip6_mtu(const struct dst_entry *dst)
1528 {
1529         const struct rt6_info *rt = (const struct rt6_info *)dst;
1530         unsigned int mtu = rt->rt6i_pmtu;
1531         struct inet6_dev *idev;
1532
1533         if (mtu)
1534                 goto out;
1535
1536         mtu = dst_metric_raw(dst, RTAX_MTU);
1537         if (mtu)
1538                 goto out;
1539
1540         mtu = IPV6_MIN_MTU;
1541
1542         rcu_read_lock();
1543         idev = __in6_dev_get(dst->dev);
1544         if (idev)
1545                 mtu = idev->cnf.mtu6;
1546         rcu_read_unlock();
1547
1548 out:
1549         return min_t(unsigned int, mtu, IP6_MAX_MTU);
1550 }
1551
/*
 * Singly linked list (through dst->next) of dsts created by
 * icmp6_dst_alloc(); entries are reaped by icmp6_dst_gc() and
 * icmp6_clean_all().
 */
static struct dst_entry *icmp6_dst_gc_list;
/* Serialises every access to icmp6_dst_gc_list. */
static DEFINE_SPINLOCK(icmp6_dst_lock);
1554
1555 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1556                                   struct flowi6 *fl6)
1557 {
1558         struct dst_entry *dst;
1559         struct rt6_info *rt;
1560         struct inet6_dev *idev = in6_dev_get(dev);
1561         struct net *net = dev_net(dev);
1562
1563         if (unlikely(!idev))
1564                 return ERR_PTR(-ENODEV);
1565
1566         rt = ip6_dst_alloc(net, dev, 0);
1567         if (unlikely(!rt)) {
1568                 in6_dev_put(idev);
1569                 dst = ERR_PTR(-ENOMEM);
1570                 goto out;
1571         }
1572
1573         rt->dst.flags |= DST_HOST;
1574         rt->dst.output  = ip6_output;
1575         atomic_set(&rt->dst.__refcnt, 1);
1576         rt->rt6i_gateway  = fl6->daddr;
1577         rt->rt6i_dst.addr = fl6->daddr;
1578         rt->rt6i_dst.plen = 128;
1579         rt->rt6i_idev     = idev;
1580         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1581
1582         spin_lock_bh(&icmp6_dst_lock);
1583         rt->dst.next = icmp6_dst_gc_list;
1584         icmp6_dst_gc_list = &rt->dst;
1585         spin_unlock_bh(&icmp6_dst_lock);
1586
1587         fib6_force_start_gc(net);
1588
1589         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1590
1591 out:
1592         return dst;
1593 }
1594
1595 int icmp6_dst_gc(void)
1596 {
1597         struct dst_entry *dst, **pprev;
1598         int more = 0;
1599
1600         spin_lock_bh(&icmp6_dst_lock);
1601         pprev = &icmp6_dst_gc_list;
1602
1603         while ((dst = *pprev) != NULL) {
1604                 if (!atomic_read(&dst->__refcnt)) {
1605                         *pprev = dst->next;
1606                         dst_free(dst);
1607                 } else {
1608                         pprev = &dst->next;
1609                         ++more;
1610                 }
1611         }
1612
1613         spin_unlock_bh(&icmp6_dst_lock);
1614
1615         return more;
1616 }
1617
1618 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1619                             void *arg)
1620 {
1621         struct dst_entry *dst, **pprev;
1622
1623         spin_lock_bh(&icmp6_dst_lock);
1624         pprev = &icmp6_dst_gc_list;
1625         while ((dst = *pprev) != NULL) {
1626                 struct rt6_info *rt = (struct rt6_info *) dst;
1627                 if (func(rt, arg)) {
1628                         *pprev = dst->next;
1629                         dst_free(dst);
1630                 } else {
1631                         pprev = &dst->next;
1632                 }
1633         }
1634         spin_unlock_bh(&icmp6_dst_lock);
1635 }
1636
1637 static int ip6_dst_gc(struct dst_ops *ops)
1638 {
1639         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1640         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1641         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1642         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1643         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1644         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1645         int entries;
1646
1647         entries = dst_entries_get_fast(ops);
1648         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1649             entries <= rt_max_size)
1650                 goto out;
1651
1652         net->ipv6.ip6_rt_gc_expire++;
1653         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1654         entries = dst_entries_get_slow(ops);
1655         if (entries < ops->gc_thresh)
1656                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1657 out:
1658         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1659         return entries > rt_max_size;
1660 }
1661
1662 static int ip6_convert_metrics(struct mx6_config *mxc,
1663                                const struct fib6_config *cfg)
1664 {
1665         struct nlattr *nla;
1666         int remaining;
1667         u32 *mp;
1668
1669         if (!cfg->fc_mx)
1670                 return 0;
1671
1672         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1673         if (unlikely(!mp))
1674                 return -ENOMEM;
1675
1676         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1677                 int type = nla_type(nla);
1678
1679                 if (type) {
1680                         u32 val;
1681
1682                         if (unlikely(type > RTAX_MAX))
1683                                 goto err;
1684                         if (type == RTAX_CC_ALGO) {
1685                                 char tmp[TCP_CA_NAME_MAX];
1686
1687                                 nla_strlcpy(tmp, nla, sizeof(tmp));
1688                                 val = tcp_ca_get_key_by_name(tmp);
1689                                 if (val == TCP_CA_UNSPEC)
1690                                         goto err;
1691                         } else {
1692                                 val = nla_get_u32(nla);
1693                         }
1694
1695                         mp[type - 1] = val;
1696                         __set_bit(type - 1, mxc->mx_valid);
1697                 }
1698         }
1699
1700         mxc->mx = mp;
1701
1702         return 0;
1703  err:
1704         kfree(mp);
1705         return -EINVAL;
1706 }
1707
1708 int ip6_route_add(struct fib6_config *cfg)
1709 {
1710         int err;
1711         struct net *net = cfg->fc_nlinfo.nl_net;
1712         struct rt6_info *rt = NULL;
1713         struct net_device *dev = NULL;
1714         struct inet6_dev *idev = NULL;
1715         struct fib6_table *table;
1716         struct mx6_config mxc = { .mx = NULL, };
1717         int addr_type;
1718
1719         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1720                 return -EINVAL;
1721 #ifndef CONFIG_IPV6_SUBTREES
1722         if (cfg->fc_src_len)
1723                 return -EINVAL;
1724 #endif
1725         if (cfg->fc_ifindex) {
1726                 err = -ENODEV;
1727                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1728                 if (!dev)
1729                         goto out;
1730                 idev = in6_dev_get(dev);
1731                 if (!idev)
1732                         goto out;
1733         }
1734
1735         if (cfg->fc_metric == 0)
1736                 cfg->fc_metric = IP6_RT_PRIO_USER;
1737
1738         err = -ENOBUFS;
1739         if (cfg->fc_nlinfo.nlh &&
1740             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1741                 table = fib6_get_table(net, cfg->fc_table);
1742                 if (!table) {
1743                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1744                         table = fib6_new_table(net, cfg->fc_table);
1745                 }
1746         } else {
1747                 table = fib6_new_table(net, cfg->fc_table);
1748         }
1749
1750         if (!table)
1751                 goto out;
1752
1753         rt = ip6_dst_alloc(net, NULL,
1754                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1755
1756         if (!rt) {
1757                 err = -ENOMEM;
1758                 goto out;
1759         }
1760
1761         if (cfg->fc_flags & RTF_EXPIRES)
1762                 rt6_set_expires(rt, jiffies +
1763                                 clock_t_to_jiffies(cfg->fc_expires));
1764         else
1765                 rt6_clean_expires(rt);
1766
1767         if (cfg->fc_protocol == RTPROT_UNSPEC)
1768                 cfg->fc_protocol = RTPROT_BOOT;
1769         rt->rt6i_protocol = cfg->fc_protocol;
1770
1771         addr_type = ipv6_addr_type(&cfg->fc_dst);
1772
1773         if (addr_type & IPV6_ADDR_MULTICAST)
1774                 rt->dst.input = ip6_mc_input;
1775         else if (cfg->fc_flags & RTF_LOCAL)
1776                 rt->dst.input = ip6_input;
1777         else
1778                 rt->dst.input = ip6_forward;
1779
1780         rt->dst.output = ip6_output;
1781
1782         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1783         rt->rt6i_dst.plen = cfg->fc_dst_len;
1784         if (rt->rt6i_dst.plen == 128)
1785                 rt->dst.flags |= DST_HOST;
1786
1787 #ifdef CONFIG_IPV6_SUBTREES
1788         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1789         rt->rt6i_src.plen = cfg->fc_src_len;
1790 #endif
1791
1792         rt->rt6i_metric = cfg->fc_metric;
1793
1794         /* We cannot add true routes via loopback here,
1795            they would result in kernel looping; promote them to reject routes
1796          */
1797         if ((cfg->fc_flags & RTF_REJECT) ||
1798             (dev && (dev->flags & IFF_LOOPBACK) &&
1799              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1800              !(cfg->fc_flags & RTF_LOCAL))) {
1801                 /* hold loopback dev/idev if we haven't done so. */
1802                 if (dev != net->loopback_dev) {
1803                         if (dev) {
1804                                 dev_put(dev);
1805                                 in6_dev_put(idev);
1806                         }
1807                         dev = net->loopback_dev;
1808                         dev_hold(dev);
1809                         idev = in6_dev_get(dev);
1810                         if (!idev) {
1811                                 err = -ENODEV;
1812                                 goto out;
1813                         }
1814                 }
1815                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1816                 switch (cfg->fc_type) {
1817                 case RTN_BLACKHOLE:
1818                         rt->dst.error = -EINVAL;
1819                         rt->dst.output = dst_discard_sk;
1820                         rt->dst.input = dst_discard;
1821                         break;
1822                 case RTN_PROHIBIT:
1823                         rt->dst.error = -EACCES;
1824                         rt->dst.output = ip6_pkt_prohibit_out;
1825                         rt->dst.input = ip6_pkt_prohibit;
1826                         break;
1827                 case RTN_THROW:
1828                 default:
1829                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1830                                         : -ENETUNREACH;
1831                         rt->dst.output = ip6_pkt_discard_out;
1832                         rt->dst.input = ip6_pkt_discard;
1833                         break;
1834                 }
1835                 goto install_route;
1836         }
1837
1838         if (cfg->fc_flags & RTF_GATEWAY) {
1839                 const struct in6_addr *gw_addr;
1840                 int gwa_type;
1841
1842                 gw_addr = &cfg->fc_gateway;
1843                 gwa_type = ipv6_addr_type(gw_addr);
1844
1845                 /* if gw_addr is local we will fail to detect this in case
1846                  * address is still TENTATIVE (DAD in progress). rt6_lookup()
1847                  * will return already-added prefix route via interface that
1848                  * prefix route was assigned to, which might be non-loopback.
1849                  */
1850                 err = -EINVAL;
1851                 if (ipv6_chk_addr_and_flags(net, gw_addr,
1852                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
1853                                             dev : NULL, 0, 0))
1854                         goto out;
1855
1856                 rt->rt6i_gateway = *gw_addr;
1857
1858                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1859                         struct rt6_info *grt;
1860
1861                         /* IPv6 strictly inhibits using not link-local
1862                            addresses as nexthop address.
1863                            Otherwise, router will not able to send redirects.
1864                            It is very good, but in some (rare!) circumstances
1865                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1866                            some exceptions. --ANK
1867                          */
1868                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1869                                 goto out;
1870
1871                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1872
1873                         err = -EHOSTUNREACH;
1874                         if (!grt)
1875                                 goto out;
1876                         if (dev) {
1877                                 if (dev != grt->dst.dev) {
1878                                         ip6_rt_put(grt);
1879                                         goto out;
1880                                 }
1881                         } else {
1882                                 dev = grt->dst.dev;
1883                                 idev = grt->rt6i_idev;
1884                                 dev_hold(dev);
1885                                 in6_dev_hold(grt->rt6i_idev);
1886                         }
1887                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1888                                 err = 0;
1889                         ip6_rt_put(grt);
1890
1891                         if (err)
1892                                 goto out;
1893                 }
1894                 err = -EINVAL;
1895                 if (!dev || (dev->flags & IFF_LOOPBACK))
1896                         goto out;
1897         }
1898
1899         err = -ENODEV;
1900         if (!dev)
1901                 goto out;
1902
1903         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1904                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1905                         err = -EINVAL;
1906                         goto out;
1907                 }
1908                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1909                 rt->rt6i_prefsrc.plen = 128;
1910         } else
1911                 rt->rt6i_prefsrc.plen = 0;
1912
1913         rt->rt6i_flags = cfg->fc_flags;
1914
1915 install_route:
1916         rt->dst.dev = dev;
1917         rt->rt6i_idev = idev;
1918         rt->rt6i_table = table;
1919
1920         cfg->fc_nlinfo.nl_net = dev_net(dev);
1921
1922         err = ip6_convert_metrics(&mxc, cfg);
1923         if (err)
1924                 goto out;
1925
1926         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
1927
1928         kfree(mxc.mx);
1929         return err;
1930 out:
1931         if (dev)
1932                 dev_put(dev);
1933         if (idev)
1934                 in6_dev_put(idev);
1935         if (rt)
1936                 dst_free(&rt->dst);
1937         return err;
1938 }
1939
1940 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1941 {
1942         int err;
1943         struct fib6_table *table;
1944         struct net *net = dev_net(rt->dst.dev);
1945
1946         if (rt == net->ipv6.ip6_null_entry) {
1947                 err = -ENOENT;
1948                 goto out;
1949         }
1950
1951         table = rt->rt6i_table;
1952         write_lock_bh(&table->tb6_lock);
1953         err = fib6_del(rt, info);
1954         write_unlock_bh(&table->tb6_lock);
1955
1956 out:
1957         ip6_rt_put(rt);
1958         return err;
1959 }
1960
1961 int ip6_del_rt(struct rt6_info *rt)
1962 {
1963         struct nl_info info = {
1964                 .nl_net = dev_net(rt->dst.dev),
1965         };
1966         return __ip6_del_rt(rt, &info);
1967 }
1968
1969 static int ip6_route_del(struct fib6_config *cfg)
1970 {
1971         struct fib6_table *table;
1972         struct fib6_node *fn;
1973         struct rt6_info *rt;
1974         int err = -ESRCH;
1975
1976         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1977         if (!table)
1978                 return err;
1979
1980         read_lock_bh(&table->tb6_lock);
1981
1982         fn = fib6_locate(&table->tb6_root,
1983                          &cfg->fc_dst, cfg->fc_dst_len,
1984                          &cfg->fc_src, cfg->fc_src_len);
1985
1986         if (fn) {
1987                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1988                         if ((rt->rt6i_flags & RTF_CACHE) &&
1989                             !(cfg->fc_flags & RTF_CACHE))
1990                                 continue;
1991                         if (cfg->fc_ifindex &&
1992                             (!rt->dst.dev ||
1993                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1994                                 continue;
1995                         if (cfg->fc_flags & RTF_GATEWAY &&
1996                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1997                                 continue;
1998                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1999                                 continue;
2000                         dst_hold(&rt->dst);
2001                         read_unlock_bh(&table->tb6_lock);
2002
2003                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2004                 }
2005         }
2006         read_unlock_bh(&table->tb6_lock);
2007
2008         return err;
2009 }
2010
2011 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2012 {
2013         struct net *net = dev_net(skb->dev);
2014         struct netevent_redirect netevent;
2015         struct rt6_info *rt, *nrt = NULL;
2016         struct ndisc_options ndopts;
2017         struct inet6_dev *in6_dev;
2018         struct neighbour *neigh;
2019         struct rd_msg *msg;
2020         int optlen, on_link;
2021         u8 *lladdr;
2022
2023         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2024         optlen -= sizeof(*msg);
2025
2026         if (optlen < 0) {
2027                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2028                 return;
2029         }
2030
2031         msg = (struct rd_msg *)icmp6_hdr(skb);
2032
2033         if (ipv6_addr_is_multicast(&msg->dest)) {
2034                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2035                 return;
2036         }
2037
2038         on_link = 0;
2039         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2040                 on_link = 1;
2041         } else if (ipv6_addr_type(&msg->target) !=
2042                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2043                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2044                 return;
2045         }
2046
2047         in6_dev = __in6_dev_get(skb->dev);
2048         if (!in6_dev)
2049                 return;
2050         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2051                 return;
2052
2053         /* RFC2461 8.1:
2054          *      The IP source address of the Redirect MUST be the same as the current
2055          *      first-hop router for the specified ICMP Destination Address.
2056          */
2057
2058         if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
2059                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2060                 return;
2061         }
2062
2063         lladdr = NULL;
2064         if (ndopts.nd_opts_tgt_lladdr) {
2065                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2066                                              skb->dev);
2067                 if (!lladdr) {
2068                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2069                         return;
2070                 }
2071         }
2072
2073         rt = (struct rt6_info *) dst;
2074         if (rt == net->ipv6.ip6_null_entry) {
2075                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2076                 return;
2077         }
2078
2079         /* Redirect received -> path was valid.
2080          * Look, redirects are sent only in response to data packets,
2081          * so that this nexthop apparently is reachable. --ANK
2082          */
2083         dst_confirm(&rt->dst);
2084
2085         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2086         if (!neigh)
2087                 return;
2088
2089         /*
2090          *      We have finally decided to accept it.
2091          */
2092
2093         neigh_update(neigh, lladdr, NUD_STALE,
2094                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2095                      NEIGH_UPDATE_F_OVERRIDE|
2096                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2097                                      NEIGH_UPDATE_F_ISROUTER))
2098                      );
2099
2100         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2101         if (!nrt)
2102                 goto out;
2103
2104         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2105         if (on_link)
2106                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2107
2108         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2109
2110         if (ip6_ins_rt(nrt))
2111                 goto out;
2112
2113         netevent.old = &rt->dst;
2114         netevent.new = &nrt->dst;
2115         netevent.daddr = &msg->dest;
2116         netevent.neigh = neigh;
2117         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2118
2119         if (rt->rt6i_flags & RTF_CACHE) {
2120                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2121                 ip6_del_rt(rt);
2122         }
2123
2124 out:
2125         neigh_release(neigh);
2126 }
2127
2128 /*
2129  *      Misc support functions
2130  */
2131
2132 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2133 {
2134         BUG_ON(from->dst.from);
2135
2136         rt->rt6i_flags &= ~RTF_EXPIRES;
2137         dst_hold(&from->dst);
2138         rt->dst.from = &from->dst;
2139         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2140 }
2141
2142 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2143 {
2144         rt->dst.input = ort->dst.input;
2145         rt->dst.output = ort->dst.output;
2146         rt->rt6i_dst = ort->rt6i_dst;
2147         rt->dst.error = ort->dst.error;
2148         rt->rt6i_idev = ort->rt6i_idev;
2149         if (rt->rt6i_idev)
2150                 in6_dev_hold(rt->rt6i_idev);
2151         rt->dst.lastuse = jiffies;
2152         rt->rt6i_gateway = ort->rt6i_gateway;
2153         rt->rt6i_flags = ort->rt6i_flags;
2154         rt6_set_from(rt, ort);
2155         rt->rt6i_metric = ort->rt6i_metric;
2156 #ifdef CONFIG_IPV6_SUBTREES
2157         rt->rt6i_src = ort->rt6i_src;
2158 #endif
2159         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2160         rt->rt6i_table = ort->rt6i_table;
2161 }
2162
2163 #ifdef CONFIG_IPV6_ROUTE_INFO
2164 static struct rt6_info *rt6_get_route_info(struct net *net,
2165                                            const struct in6_addr *prefix, int prefixlen,
2166                                            const struct in6_addr *gwaddr, int ifindex)
2167 {
2168         struct fib6_node *fn;
2169         struct rt6_info *rt = NULL;
2170         struct fib6_table *table;
2171
2172         table = fib6_get_table(net, RT6_TABLE_INFO);
2173         if (!table)
2174                 return NULL;
2175
2176         read_lock_bh(&table->tb6_lock);
2177         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2178         if (!fn)
2179                 goto out;
2180
2181         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2182                 if (rt->dst.dev->ifindex != ifindex)
2183                         continue;
2184                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2185                         continue;
2186                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2187                         continue;
2188                 dst_hold(&rt->dst);
2189                 break;
2190         }
2191 out:
2192         read_unlock_bh(&table->tb6_lock);
2193         return rt;
2194 }
2195
2196 static struct rt6_info *rt6_add_route_info(struct net *net,
2197                                            const struct in6_addr *prefix, int prefixlen,
2198                                            const struct in6_addr *gwaddr, int ifindex,
2199                                            unsigned int pref)
2200 {
2201         struct fib6_config cfg = {
2202                 .fc_table       = RT6_TABLE_INFO,
2203                 .fc_metric      = IP6_RT_PRIO_USER,
2204                 .fc_ifindex     = ifindex,
2205                 .fc_dst_len     = prefixlen,
2206                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2207                                   RTF_UP | RTF_PREF(pref),
2208                 .fc_nlinfo.portid = 0,
2209                 .fc_nlinfo.nlh = NULL,
2210                 .fc_nlinfo.nl_net = net,
2211         };
2212
2213         cfg.fc_dst = *prefix;
2214         cfg.fc_gateway = *gwaddr;
2215
2216         /* We should treat it as a default route if prefix length is 0. */
2217         if (!prefixlen)
2218                 cfg.fc_flags |= RTF_DEFAULT;
2219
2220         ip6_route_add(&cfg);
2221
2222         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
2223 }
2224 #endif
2225
2226 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2227 {
2228         struct rt6_info *rt;
2229         struct fib6_table *table;
2230
2231         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2232         if (!table)
2233                 return NULL;
2234
2235         read_lock_bh(&table->tb6_lock);
2236         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2237                 if (dev == rt->dst.dev &&
2238                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2239                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2240                         break;
2241         }
2242         if (rt)
2243                 dst_hold(&rt->dst);
2244         read_unlock_bh(&table->tb6_lock);
2245         return rt;
2246 }
2247
2248 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2249                                      struct net_device *dev,
2250                                      unsigned int pref)
2251 {
2252         struct fib6_config cfg = {
2253                 .fc_table       = RT6_TABLE_DFLT,
2254                 .fc_metric      = IP6_RT_PRIO_USER,
2255                 .fc_ifindex     = dev->ifindex,
2256                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2257                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2258                 .fc_nlinfo.portid = 0,
2259                 .fc_nlinfo.nlh = NULL,
2260                 .fc_nlinfo.nl_net = dev_net(dev),
2261         };
2262
2263         cfg.fc_gateway = *gwaddr;
2264
2265         ip6_route_add(&cfg);
2266
2267         return rt6_get_dflt_router(gwaddr, dev);
2268 }
2269
2270 void rt6_purge_dflt_routers(struct net *net)
2271 {
2272         struct rt6_info *rt;
2273         struct fib6_table *table;
2274
2275         /* NOTE: Keep consistent with rt6_get_dflt_router */
2276         table = fib6_get_table(net, RT6_TABLE_DFLT);
2277         if (!table)
2278                 return;
2279
2280 restart:
2281         read_lock_bh(&table->tb6_lock);
2282         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2283                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2284                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2285                         dst_hold(&rt->dst);
2286                         read_unlock_bh(&table->tb6_lock);
2287                         ip6_del_rt(rt);
2288                         goto restart;
2289                 }
2290         }
2291         read_unlock_bh(&table->tb6_lock);
2292 }
2293
2294 static void rtmsg_to_fib6_config(struct net *net,
2295                                  struct in6_rtmsg *rtmsg,
2296                                  struct fib6_config *cfg)
2297 {
2298         memset(cfg, 0, sizeof(*cfg));
2299
2300         cfg->fc_table = RT6_TABLE_MAIN;
2301         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2302         cfg->fc_metric = rtmsg->rtmsg_metric;
2303         cfg->fc_expires = rtmsg->rtmsg_info;
2304         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2305         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2306         cfg->fc_flags = rtmsg->rtmsg_flags;
2307
2308         cfg->fc_nlinfo.nl_net = net;
2309
2310         cfg->fc_dst = rtmsg->rtmsg_dst;
2311         cfg->fc_src = rtmsg->rtmsg_src;
2312         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2313 }
2314
2315 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2316 {
2317         struct fib6_config cfg;
2318         struct in6_rtmsg rtmsg;
2319         int err;
2320
2321         switch (cmd) {
2322         case SIOCADDRT:         /* Add a route */
2323         case SIOCDELRT:         /* Delete a route */
2324                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2325                         return -EPERM;
2326                 err = copy_from_user(&rtmsg, arg,
2327                                      sizeof(struct in6_rtmsg));
2328                 if (err)
2329                         return -EFAULT;
2330
2331                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2332
2333                 rtnl_lock();
2334                 switch (cmd) {
2335                 case SIOCADDRT:
2336                         err = ip6_route_add(&cfg);
2337                         break;
2338                 case SIOCDELRT:
2339                         err = ip6_route_del(&cfg);
2340                         break;
2341                 default:
2342                         err = -EINVAL;
2343                 }
2344                 rtnl_unlock();
2345
2346                 return err;
2347         }
2348
2349         return -EINVAL;
2350 }
2351
2352 /*
2353  *      Drop the packet on the floor
2354  */
2355
2356 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2357 {
2358         int type;
2359         struct dst_entry *dst = skb_dst(skb);
2360         switch (ipstats_mib_noroutes) {
2361         case IPSTATS_MIB_INNOROUTES:
2362                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2363                 if (type == IPV6_ADDR_ANY) {
2364                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2365                                       IPSTATS_MIB_INADDRERRORS);
2366                         break;
2367                 }
2368                 /* FALLTHROUGH */
2369         case IPSTATS_MIB_OUTNOROUTES:
2370                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2371                               ipstats_mib_noroutes);
2372                 break;
2373         }
2374         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2375         kfree_skb(skb);
2376         return 0;
2377 }
2378
2379 static int ip6_pkt_discard(struct sk_buff *skb)
2380 {
2381         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2382 }
2383
2384 static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
2385 {
2386         skb->dev = skb_dst(skb)->dev;
2387         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2388 }
2389
2390 static int ip6_pkt_prohibit(struct sk_buff *skb)
2391 {
2392         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2393 }
2394
2395 static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
2396 {
2397         skb->dev = skb_dst(skb)->dev;
2398         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2399 }
2400
2401 /*
2402  *      Allocate a dst for local (unicast / anycast) address.
2403  */
2404
2405 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2406                                     const struct in6_addr *addr,
2407                                     bool anycast)
2408 {
2409         struct net *net = dev_net(idev->dev);
2410         struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
2411                                             DST_NOCOUNT);
2412         if (!rt)
2413                 return ERR_PTR(-ENOMEM);
2414
2415         in6_dev_hold(idev);
2416
2417         rt->dst.flags |= DST_HOST;
2418         rt->dst.input = ip6_input;
2419         rt->dst.output = ip6_output;
2420         rt->rt6i_idev = idev;
2421
2422         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2423         if (anycast)
2424                 rt->rt6i_flags |= RTF_ANYCAST;
2425         else
2426                 rt->rt6i_flags |= RTF_LOCAL;
2427
2428         rt->rt6i_gateway  = *addr;
2429         rt->rt6i_dst.addr = *addr;
2430         rt->rt6i_dst.plen = 128;
2431         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2432
2433         atomic_set(&rt->dst.__refcnt, 1);
2434
2435         return rt;
2436 }
2437
2438 int ip6_route_get_saddr(struct net *net,
2439                         struct rt6_info *rt,
2440                         const struct in6_addr *daddr,
2441                         unsigned int prefs,
2442                         struct in6_addr *saddr)
2443 {
2444         struct inet6_dev *idev =
2445                 rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2446         int err = 0;
2447         if (rt && rt->rt6i_prefsrc.plen)
2448                 *saddr = rt->rt6i_prefsrc.addr;
2449         else
2450                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2451                                          daddr, prefs, saddr);
2452         return err;
2453 }
2454
/*
 * Argument block for fib6_remove_prefsrc(): identifies the deleted address
 * whose use as a preferred source has to be removed from route entries.
 */
struct arg_dev_net_ip {
	struct net_device *dev;	/* only routes on this device; NULL matches any */
	struct net *net;	/* namespace; its null entry is left untouched */
	struct in6_addr *addr;	/* address compared against rt6i_prefsrc.addr */
};
2461
2462 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2463 {
2464         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2465         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2466         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2467
2468         if (((void *)rt->dst.dev == dev || !dev) &&
2469             rt != net->ipv6.ip6_null_entry &&
2470             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2471                 /* remove prefsrc entry */
2472                 rt->rt6i_prefsrc.plen = 0;
2473         }
2474         return 0;
2475 }
2476
2477 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2478 {
2479         struct net *net = dev_net(ifp->idev->dev);
2480         struct arg_dev_net_ip adni = {
2481                 .dev = ifp->idev->dev,
2482                 .net = net,
2483                 .addr = &ifp->addr,
2484         };
2485         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2486 }
2487
2488 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2489 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2490
2491 /* Remove routers and update dst entries when gateway turn into host. */
2492 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2493 {
2494         struct in6_addr *gateway = (struct in6_addr *)arg;
2495
2496         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2497              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2498              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2499                 return -1;
2500         }
2501         return 0;
2502 }
2503
2504 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2505 {
2506         fib6_clean_all(net, fib6_clean_tohost, gateway);
2507 }
2508
/* Argument block for fib6_ifdown(): which routes to select. */
struct arg_dev_net {
	struct net_device *dev;	/* only routes on this device; NULL matches any */
	struct net *net;	/* namespace; its null entry is never selected */
};
2513
2514 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2515 {
2516         const struct arg_dev_net *adn = arg;
2517         const struct net_device *dev = adn->dev;
2518
2519         if ((rt->dst.dev == dev || !dev) &&
2520             rt != adn->net->ipv6.ip6_null_entry)
2521                 return -1;
2522
2523         return 0;
2524 }
2525
2526 void rt6_ifdown(struct net *net, struct net_device *dev)
2527 {
2528         struct arg_dev_net adn = {
2529                 .dev = dev,
2530                 .net = net,
2531         };
2532
2533         fib6_clean_all(net, fib6_ifdown, &adn);
2534         icmp6_clean_all(fib6_ifdown, &adn);
2535         rt6_uncached_list_flush_dev(net, dev);
2536 }
2537
/* Argument block for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose routes are updated */
	unsigned int mtu;	/* MTU value to apply to those routes */
};
2542
2543 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2544 {
2545         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2546         struct inet6_dev *idev;
2547
2548         /* In IPv6 pmtu discovery is not optional,
2549            so that RTAX_MTU lock cannot disable it.
2550            We still use this lock to block changes
2551            caused by addrconf/ndisc.
2552         */
2553
2554         idev = __in6_dev_get(arg->dev);
2555         if (!idev)
2556                 return 0;
2557
2558         /* For administrative MTU increase, there is no way to discover
2559            IPv6 PMTU increase, so PMTU increase should be updated here.
2560            Since RFC 1981 doesn't include administrative MTU increase
2561            update PMTU increase is a MUST. (i.e. jumbo frame)
2562          */
2563         /*
2564            If new MTU is less than route PMTU, this new MTU will be the
2565            lowest MTU in the path, update the route PMTU to reflect PMTU
2566            decreases; if new MTU is greater than route PMTU, and the
2567            old MTU is the lowest MTU in the path, update the route PMTU
2568            to reflect the increase. In this case if the other nodes' MTU
2569            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2570            PMTU discouvery.
2571          */
2572         if (rt->dst.dev == arg->dev &&
2573             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2574                 if (rt->rt6i_flags & RTF_CACHE) {
2575                         /* For RTF_CACHE with rt6i_pmtu == 0
2576                          * (i.e. a redirected route),
2577                          * the metrics of its rt->dst.from has already
2578                          * been updated.
2579                          */
2580                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2581                                 rt->rt6i_pmtu = arg->mtu;
2582                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2583                            (dst_mtu(&rt->dst) < arg->mtu &&
2584                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2585                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2586                 }
2587         }
2588         return 0;
2589 }
2590
2591 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2592 {
2593         struct rt6_mtu_change_arg arg = {
2594                 .dev = dev,
2595                 .mtu = mtu,
2596         };
2597
2598         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2599 }
2600
2601 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2602         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2603         [RTA_OIF]               = { .type = NLA_U32 },
2604         [RTA_IIF]               = { .type = NLA_U32 },
2605         [RTA_PRIORITY]          = { .type = NLA_U32 },
2606         [RTA_METRICS]           = { .type = NLA_NESTED },
2607         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2608         [RTA_PREF]              = { .type = NLA_U8 },
2609 };
2610
2611 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2612                               struct fib6_config *cfg)
2613 {
2614         struct rtmsg *rtm;
2615         struct nlattr *tb[RTA_MAX+1];
2616         unsigned int pref;
2617         int err;
2618
2619         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2620         if (err < 0)
2621                 goto errout;
2622
2623         err = -EINVAL;
2624         rtm = nlmsg_data(nlh);
2625         memset(cfg, 0, sizeof(*cfg));
2626
2627         cfg->fc_table = rtm->rtm_table;
2628         cfg->fc_dst_len = rtm->rtm_dst_len;
2629         cfg->fc_src_len = rtm->rtm_src_len;
2630         cfg->fc_flags = RTF_UP;
2631         cfg->fc_protocol = rtm->rtm_protocol;
2632         cfg->fc_type = rtm->rtm_type;
2633
2634         if (rtm->rtm_type == RTN_UNREACHABLE ||
2635             rtm->rtm_type == RTN_BLACKHOLE ||
2636             rtm->rtm_type == RTN_PROHIBIT ||
2637             rtm->rtm_type == RTN_THROW)
2638                 cfg->fc_flags |= RTF_REJECT;
2639
2640         if (rtm->rtm_type == RTN_LOCAL)
2641                 cfg->fc_flags |= RTF_LOCAL;
2642
2643         if (rtm->rtm_flags & RTM_F_CLONED)
2644                 cfg->fc_flags |= RTF_CACHE;
2645
2646         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2647         cfg->fc_nlinfo.nlh = nlh;
2648         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2649
2650         if (tb[RTA_GATEWAY]) {
2651                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2652                 cfg->fc_flags |= RTF_GATEWAY;
2653         }
2654
2655         if (tb[RTA_DST]) {
2656                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2657
2658                 if (nla_len(tb[RTA_DST]) < plen)
2659                         goto errout;
2660
2661                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2662         }
2663
2664         if (tb[RTA_SRC]) {
2665                 int plen = (rtm->rtm_src_len + 7) >> 3;
2666
2667                 if (nla_len(tb[RTA_SRC]) < plen)
2668                         goto errout;
2669
2670                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2671         }
2672
2673         if (tb[RTA_PREFSRC])
2674                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2675
2676         if (tb[RTA_OIF])
2677                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2678
2679         if (tb[RTA_PRIORITY])
2680                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2681
2682         if (tb[RTA_METRICS]) {
2683                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2684                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2685         }
2686
2687         if (tb[RTA_TABLE])
2688                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2689
2690         if (tb[RTA_MULTIPATH]) {
2691                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2692                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2693         }
2694
2695         if (tb[RTA_PREF]) {
2696                 pref = nla_get_u8(tb[RTA_PREF]);
2697                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2698                     pref != ICMPV6_ROUTER_PREF_HIGH)
2699                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
2700                 cfg->fc_flags |= RTF_PREF(pref);
2701         }
2702
2703         err = 0;
2704 errout:
2705         return err;
2706 }
2707
2708 static int ip6_route_multipath(struct fib6_config *cfg, int add)
2709 {
2710         struct fib6_config r_cfg;
2711         struct rtnexthop *rtnh;
2712         int remaining;
2713         int attrlen;
2714         int err = 0, last_err = 0;
2715
2716         remaining = cfg->fc_mp_len;
2717 beginning:
2718         rtnh = (struct rtnexthop *)cfg->fc_mp;
2719
2720         /* Parse a Multipath Entry */
2721         while (rtnh_ok(rtnh, remaining)) {
2722                 memcpy(&r_cfg, cfg, sizeof(*cfg));
2723                 if (rtnh->rtnh_ifindex)
2724                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2725
2726                 attrlen = rtnh_attrlen(rtnh);
2727                 if (attrlen > 0) {
2728                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2729
2730                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2731                         if (nla) {
2732                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
2733                                 r_cfg.fc_flags |= RTF_GATEWAY;
2734                         }
2735                 }
2736                 err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
2737                 if (err) {
2738                         last_err = err;
2739                         /* If we are trying to remove a route, do not stop the
2740                          * loop when ip6_route_del() fails (because next hop is
2741                          * already gone), we should try to remove all next hops.
2742                          */
2743                         if (add) {
2744                                 /* If add fails, we should try to delete all
2745                                  * next hops that have been already added.
2746                                  */
2747                                 add = 0;
2748                                 remaining = cfg->fc_mp_len - remaining;
2749                                 goto beginning;
2750                         }
2751                 }
2752                 /* Because each route is added like a single route we remove
2753                  * these flags after the first nexthop: if there is a collision,
2754                  * we have already failed to add the first nexthop:
2755                  * fib6_add_rt2node() has rejected it; when replacing, old
2756                  * nexthops have been replaced by first new, the rest should
2757                  * be added to it.
2758                  */
2759                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
2760                                                      NLM_F_REPLACE);
2761                 rtnh = rtnh_next(rtnh, &remaining);
2762         }
2763
2764         return last_err;
2765 }
2766
2767 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2768 {
2769         struct fib6_config cfg;
2770         int err;
2771
2772         err = rtm_to_fib6_config(skb, nlh, &cfg);
2773         if (err < 0)
2774                 return err;
2775
2776         if (cfg.fc_mp)
2777                 return ip6_route_multipath(&cfg, 0);
2778         else
2779                 return ip6_route_del(&cfg);
2780 }
2781
2782 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2783 {
2784         struct fib6_config cfg;
2785         int err;
2786
2787         err = rtm_to_fib6_config(skb, nlh, &cfg);
2788         if (err < 0)
2789                 return err;
2790
2791         if (cfg.fc_mp)
2792                 return ip6_route_multipath(&cfg, 1);
2793         else
2794                 return ip6_route_add(&cfg);
2795 }
2796
2797 static inline size_t rt6_nlmsg_size(void)
2798 {
2799         return NLMSG_ALIGN(sizeof(struct rtmsg))
2800                + nla_total_size(16) /* RTA_SRC */
2801                + nla_total_size(16) /* RTA_DST */
2802                + nla_total_size(16) /* RTA_GATEWAY */
2803                + nla_total_size(16) /* RTA_PREFSRC */
2804                + nla_total_size(4) /* RTA_TABLE */
2805                + nla_total_size(4) /* RTA_IIF */
2806                + nla_total_size(4) /* RTA_OIF */
2807                + nla_total_size(4) /* RTA_PRIORITY */
2808                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2809                + nla_total_size(sizeof(struct rta_cacheinfo))
2810                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
2811                + nla_total_size(1); /* RTA_PREF */
2812 }
2813
2814 static int rt6_fill_node(struct net *net,
2815                          struct sk_buff *skb, struct rt6_info *rt,
2816                          struct in6_addr *dst, struct in6_addr *src,
2817                          int iif, int type, u32 portid, u32 seq,
2818                          int prefix, int nowait, unsigned int flags)
2819 {
2820         u32 metrics[RTAX_MAX];
2821         struct rtmsg *rtm;
2822         struct nlmsghdr *nlh;
2823         long expires;
2824         u32 table;
2825
2826         if (prefix) {   /* user wants prefix routes only */
2827                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2828                         /* success since this is not a prefix route */
2829                         return 1;
2830                 }
2831         }
2832
2833         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
2834         if (!nlh)
2835                 return -EMSGSIZE;
2836
2837         rtm = nlmsg_data(nlh);
2838         rtm->rtm_family = AF_INET6;
2839         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2840         rtm->rtm_src_len = rt->rt6i_src.plen;
2841         rtm->rtm_tos = 0;
2842         if (rt->rt6i_table)
2843                 table = rt->rt6i_table->tb6_id;
2844         else
2845                 table = RT6_TABLE_UNSPEC;
2846         rtm->rtm_table = table;
2847         if (nla_put_u32(skb, RTA_TABLE, table))
2848                 goto nla_put_failure;
2849         if (rt->rt6i_flags & RTF_REJECT) {
2850                 switch (rt->dst.error) {
2851                 case -EINVAL:
2852                         rtm->rtm_type = RTN_BLACKHOLE;
2853                         break;
2854                 case -EACCES:
2855                         rtm->rtm_type = RTN_PROHIBIT;
2856                         break;
2857                 case -EAGAIN:
2858                         rtm->rtm_type = RTN_THROW;
2859                         break;
2860                 default:
2861                         rtm->rtm_type = RTN_UNREACHABLE;
2862                         break;
2863                 }
2864         }
2865         else if (rt->rt6i_flags & RTF_LOCAL)
2866                 rtm->rtm_type = RTN_LOCAL;
2867         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2868                 rtm->rtm_type = RTN_LOCAL;
2869         else
2870                 rtm->rtm_type = RTN_UNICAST;
2871         rtm->rtm_flags = 0;
2872         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2873         rtm->rtm_protocol = rt->rt6i_protocol;
2874         if (rt->rt6i_flags & RTF_DYNAMIC)
2875                 rtm->rtm_protocol = RTPROT_REDIRECT;
2876         else if (rt->rt6i_flags & RTF_ADDRCONF) {
2877                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
2878                         rtm->rtm_protocol = RTPROT_RA;
2879                 else
2880                         rtm->rtm_protocol = RTPROT_KERNEL;
2881         }
2882
2883         if (rt->rt6i_flags & RTF_CACHE)
2884                 rtm->rtm_flags |= RTM_F_CLONED;
2885
2886         if (dst) {
2887                 if (nla_put_in6_addr(skb, RTA_DST, dst))
2888                         goto nla_put_failure;
2889                 rtm->rtm_dst_len = 128;
2890         } else if (rtm->rtm_dst_len)
2891                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
2892                         goto nla_put_failure;
2893 #ifdef CONFIG_IPV6_SUBTREES
2894         if (src) {
2895                 if (nla_put_in6_addr(skb, RTA_SRC, src))
2896                         goto nla_put_failure;
2897                 rtm->rtm_src_len = 128;
2898         } else if (rtm->rtm_src_len &&
2899                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
2900                 goto nla_put_failure;
2901 #endif
2902         if (iif) {
2903 #ifdef CONFIG_IPV6_MROUTE
2904                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2905                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2906                         if (err <= 0) {
2907                                 if (!nowait) {
2908                                         if (err == 0)
2909                                                 return 0;
2910                                         goto nla_put_failure;
2911                                 } else {
2912                                         if (err == -EMSGSIZE)
2913                                                 goto nla_put_failure;
2914                                 }
2915                         }
2916                 } else
2917 #endif
2918                         if (nla_put_u32(skb, RTA_IIF, iif))
2919                                 goto nla_put_failure;
2920         } else if (dst) {
2921                 struct in6_addr saddr_buf;
2922                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2923                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2924                         goto nla_put_failure;
2925         }
2926
2927         if (rt->rt6i_prefsrc.plen) {
2928                 struct in6_addr saddr_buf;
2929                 saddr_buf = rt->rt6i_prefsrc.addr;
2930                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2931                         goto nla_put_failure;
2932         }
2933
2934         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2935         if (rt->rt6i_pmtu)
2936                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
2937         if (rtnetlink_put_metrics(skb, metrics) < 0)
2938                 goto nla_put_failure;
2939
2940         if (rt->rt6i_flags & RTF_GATEWAY) {
2941                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
2942                         goto nla_put_failure;
2943         }
2944
2945         if (rt->dst.dev &&
2946             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2947                 goto nla_put_failure;
2948         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2949                 goto nla_put_failure;
2950
2951         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
2952
2953         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
2954                 goto nla_put_failure;
2955
2956         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
2957                 goto nla_put_failure;
2958
2959         nlmsg_end(skb, nlh);
2960         return 0;
2961
2962 nla_put_failure:
2963         nlmsg_cancel(skb, nlh);
2964         return -EMSGSIZE;
2965 }
2966
2967 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2968 {
2969         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2970         int prefix;
2971
2972         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2973                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2974                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2975         } else
2976                 prefix = 0;
2977
2978         return rt6_fill_node(arg->net,
2979                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2980                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
2981                      prefix, 0, NLM_F_MULTI);
2982 }
2983
2984 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2985 {
2986         struct net *net = sock_net(in_skb->sk);
2987         struct nlattr *tb[RTA_MAX+1];
2988         struct rt6_info *rt;
2989         struct sk_buff *skb;
2990         struct rtmsg *rtm;
2991         struct flowi6 fl6;
2992         int err, iif = 0, oif = 0;
2993
2994         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2995         if (err < 0)
2996                 goto errout;
2997
2998         err = -EINVAL;
2999         memset(&fl6, 0, sizeof(fl6));
3000
3001         if (tb[RTA_SRC]) {
3002                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3003                         goto errout;
3004
3005                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3006         }
3007
3008         if (tb[RTA_DST]) {
3009                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3010                         goto errout;
3011
3012                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3013         }
3014
3015         if (tb[RTA_IIF])
3016                 iif = nla_get_u32(tb[RTA_IIF]);
3017
3018         if (tb[RTA_OIF])
3019                 oif = nla_get_u32(tb[RTA_OIF]);
3020
3021         if (tb[RTA_MARK])
3022                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3023
3024         if (iif) {
3025                 struct net_device *dev;
3026                 int flags = 0;
3027
3028                 dev = __dev_get_by_index(net, iif);
3029                 if (!dev) {
3030                         err = -ENODEV;
3031                         goto errout;
3032                 }
3033
3034                 fl6.flowi6_iif = iif;
3035
3036                 if (!ipv6_addr_any(&fl6.saddr))
3037                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3038
3039                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3040                                                                flags);
3041         } else {
3042                 fl6.flowi6_oif = oif;
3043
3044                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3045         }
3046
3047         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3048         if (!skb) {
3049                 ip6_rt_put(rt);
3050                 err = -ENOBUFS;
3051                 goto errout;
3052         }
3053
3054         /* Reserve room for dummy headers, this skb can pass
3055            through good chunk of routing engine.
3056          */
3057         skb_reset_mac_header(skb);
3058         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3059
3060         skb_dst_set(skb, &rt->dst);
3061
3062         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3063                             RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3064                             nlh->nlmsg_seq, 0, 0, 0);
3065         if (err < 0) {
3066                 kfree_skb(skb);
3067                 goto errout;
3068         }
3069
3070         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3071 errout:
3072         return err;
3073 }
3074
3075 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
3076 {
3077         struct sk_buff *skb;
3078         struct net *net = info->nl_net;
3079         u32 seq;
3080         int err;
3081
3082         err = -ENOBUFS;
3083         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3084
3085         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
3086         if (!skb)
3087                 goto errout;
3088
3089         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3090                                 event, info->portid, seq, 0, 0, 0);
3091         if (err < 0) {
3092                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3093                 WARN_ON(err == -EMSGSIZE);
3094                 kfree_skb(skb);
3095                 goto errout;
3096         }
3097         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3098                     info->nlh, gfp_any());
3099         return;
3100 errout:
3101         if (err < 0)
3102                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3103 }
3104
3105 static int ip6_route_dev_notify(struct notifier_block *this,
3106                                 unsigned long event, void *ptr)
3107 {
3108         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3109         struct net *net = dev_net(dev);
3110
3111         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3112                 net->ipv6.ip6_null_entry->dst.dev = dev;
3113                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3114 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3115                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3116                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3117                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3118                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3119 #endif
3120         }
3121
3122         return NOTIFY_OK;
3123 }
3124
3125 /*
3126  *      /proc
3127  */
3128
3129 #ifdef CONFIG_PROC_FS
3130
3131 static const struct file_operations ipv6_route_proc_fops = {
3132         .owner          = THIS_MODULE,
3133         .open           = ipv6_route_open,
3134         .read           = seq_read,
3135         .llseek         = seq_lseek,
3136         .release        = seq_release_net,
3137 };
3138
3139 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3140 {
3141         struct net *net = (struct net *)seq->private;
3142         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3143                    net->ipv6.rt6_stats->fib_nodes,
3144                    net->ipv6.rt6_stats->fib_route_nodes,
3145                    net->ipv6.rt6_stats->fib_rt_alloc,
3146                    net->ipv6.rt6_stats->fib_rt_entries,
3147                    net->ipv6.rt6_stats->fib_rt_cache,
3148                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3149                    net->ipv6.rt6_stats->fib_discarded_routes);
3150
3151         return 0;
3152 }
3153
3154 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3155 {
3156         return single_open_net(inode, file, rt6_stats_seq_show);
3157 }
3158
3159 static const struct file_operations rt6_stats_seq_fops = {
3160         .owner   = THIS_MODULE,
3161         .open    = rt6_stats_seq_open,
3162         .read    = seq_read,
3163         .llseek  = seq_lseek,
3164         .release = single_release_net,
3165 };
3166 #endif  /* CONFIG_PROC_FS */
3167
3168 #ifdef CONFIG_SYSCTL
3169
3170 static
3171 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3172                               void __user *buffer, size_t *lenp, loff_t *ppos)
3173 {
3174         struct net *net;
3175         int delay;
3176         if (!write)
3177                 return -EINVAL;
3178
3179         net = (struct net *)ctl->extra1;
3180         delay = net->ipv6.sysctl.flush_delay;
3181         proc_dointvec(ctl, write, buffer, lenp, ppos);
3182         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3183         return 0;
3184 }
3185
3186 struct ctl_table ipv6_route_table_template[] = {
3187         {
3188                 .procname       =       "flush",
3189                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3190                 .maxlen         =       sizeof(int),
3191                 .mode           =       0200,
3192                 .proc_handler   =       ipv6_sysctl_rtcache_flush
3193         },
3194         {
3195                 .procname       =       "gc_thresh",
3196                 .data           =       &ip6_dst_ops_template.gc_thresh,
3197                 .maxlen         =       sizeof(int),
3198                 .mode           =       0644,
3199                 .proc_handler   =       proc_dointvec,
3200         },
3201         {
3202                 .procname       =       "max_size",
3203                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3204                 .maxlen         =       sizeof(int),
3205                 .mode           =       0644,
3206                 .proc_handler   =       proc_dointvec,
3207         },
3208         {
3209                 .procname       =       "gc_min_interval",
3210                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3211                 .maxlen         =       sizeof(int),
3212                 .mode           =       0644,
3213                 .proc_handler   =       proc_dointvec_jiffies,
3214         },
3215         {
3216                 .procname       =       "gc_timeout",
3217                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3218                 .maxlen         =       sizeof(int),
3219                 .mode           =       0644,
3220                 .proc_handler   =       proc_dointvec_jiffies,
3221         },
3222         {
3223                 .procname       =       "gc_interval",
3224                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3225                 .maxlen         =       sizeof(int),
3226                 .mode           =       0644,
3227                 .proc_handler   =       proc_dointvec_jiffies,
3228         },
3229         {
3230                 .procname       =       "gc_elasticity",
3231                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3232                 .maxlen         =       sizeof(int),
3233                 .mode           =       0644,
3234                 .proc_handler   =       proc_dointvec,
3235         },
3236         {
3237                 .procname       =       "mtu_expires",
3238                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3239                 .maxlen         =       sizeof(int),
3240                 .mode           =       0644,
3241                 .proc_handler   =       proc_dointvec_jiffies,
3242         },
3243         {
3244                 .procname       =       "min_adv_mss",
3245                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3246                 .maxlen         =       sizeof(int),
3247                 .mode           =       0644,
3248                 .proc_handler   =       proc_dointvec,
3249         },
3250         {
3251                 .procname       =       "gc_min_interval_ms",
3252                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3253                 .maxlen         =       sizeof(int),
3254                 .mode           =       0644,
3255                 .proc_handler   =       proc_dointvec_ms_jiffies,
3256         },
3257         { }
3258 };
3259
3260 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3261 {
3262         struct ctl_table *table;
3263
3264         table = kmemdup(ipv6_route_table_template,
3265                         sizeof(ipv6_route_table_template),
3266                         GFP_KERNEL);
3267
3268         if (table) {
3269                 table[0].data = &net->ipv6.sysctl.flush_delay;
3270                 table[0].extra1 = net;
3271                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3272                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3273                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3274                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3275                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3276                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3277                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3278                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3279                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3280
3281                 /* Don't export sysctls to unprivileged users */
3282                 if (net->user_ns != &init_user_ns)
3283                         table[0].procname = NULL;
3284         }
3285
3286         return table;
3287 }
3288 #endif
3289
3290 static int __net_init ip6_route_net_init(struct net *net)
3291 {
3292         int ret = -ENOMEM;
3293
3294         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3295                sizeof(net->ipv6.ip6_dst_ops));
3296
3297         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3298                 goto out_ip6_dst_ops;
3299
3300         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3301                                            sizeof(*net->ipv6.ip6_null_entry),
3302                                            GFP_KERNEL);
3303         if (!net->ipv6.ip6_null_entry)
3304                 goto out_ip6_dst_entries;
3305         net->ipv6.ip6_null_entry->dst.path =
3306                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3307         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3308         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3309                          ip6_template_metrics, true);
3310
3311 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3312         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3313                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3314                                                GFP_KERNEL);
3315         if (!net->ipv6.ip6_prohibit_entry)
3316                 goto out_ip6_null_entry;
3317         net->ipv6.ip6_prohibit_entry->dst.path =
3318                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3319         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3320         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3321                          ip6_template_metrics, true);
3322
3323         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3324                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3325                                                GFP_KERNEL);
3326         if (!net->ipv6.ip6_blk_hole_entry)
3327                 goto out_ip6_prohibit_entry;
3328         net->ipv6.ip6_blk_hole_entry->dst.path =
3329                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3330         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3331         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3332                          ip6_template_metrics, true);
3333 #endif
3334
3335         net->ipv6.sysctl.flush_delay = 0;
3336         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3337         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3338         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3339         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3340         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3341         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3342         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3343
3344         net->ipv6.ip6_rt_gc_expire = 30*HZ;
3345
3346         ret = 0;
3347 out:
3348         return ret;
3349
3350 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3351 out_ip6_prohibit_entry:
3352         kfree(net->ipv6.ip6_prohibit_entry);
3353 out_ip6_null_entry:
3354         kfree(net->ipv6.ip6_null_entry);
3355 #endif
3356 out_ip6_dst_entries:
3357         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3358 out_ip6_dst_ops:
3359         goto out;
3360 }
3361
3362 static void __net_exit ip6_route_net_exit(struct net *net)
3363 {
3364         kfree(net->ipv6.ip6_null_entry);
3365 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3366         kfree(net->ipv6.ip6_prohibit_entry);
3367         kfree(net->ipv6.ip6_blk_hole_entry);
3368 #endif
3369         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3370 }
3371
3372 static int __net_init ip6_route_net_init_late(struct net *net)
3373 {
3374 #ifdef CONFIG_PROC_FS
3375         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3376         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3377 #endif
3378         return 0;
3379 }
3380
3381 static void __net_exit ip6_route_net_exit_late(struct net *net)
3382 {
3383 #ifdef CONFIG_PROC_FS
3384         remove_proc_entry("ipv6_route", net->proc_net);
3385         remove_proc_entry("rt6_stats", net->proc_net);
3386 #endif
3387 }
3388
3389 static struct pernet_operations ip6_route_net_ops = {
3390         .init = ip6_route_net_init,
3391         .exit = ip6_route_net_exit,
3392 };
3393
3394 static int __net_init ipv6_inetpeer_init(struct net *net)
3395 {
3396         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3397
3398         if (!bp)
3399                 return -ENOMEM;
3400         inet_peer_base_init(bp);
3401         net->ipv6.peers = bp;
3402         return 0;
3403 }
3404
3405 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3406 {
3407         struct inet_peer_base *bp = net->ipv6.peers;
3408
3409         net->ipv6.peers = NULL;
3410         inetpeer_invalidate_tree(bp);
3411         kfree(bp);
3412 }
3413
3414 static struct pernet_operations ipv6_inetpeer_ops = {
3415         .init   =       ipv6_inetpeer_init,
3416         .exit   =       ipv6_inetpeer_exit,
3417 };
3418
3419 static struct pernet_operations ip6_route_net_late_ops = {
3420         .init = ip6_route_net_init_late,
3421         .exit = ip6_route_net_exit_late,
3422 };
3423
3424 static struct notifier_block ip6_route_dev_notifier = {
3425         .notifier_call = ip6_route_dev_notify,
3426         .priority = 0,
3427 };
3428
3429 int __init ip6_route_init(void)
3430 {
3431         int ret;
3432         int cpu;
3433
3434         ret = -ENOMEM;
3435         ip6_dst_ops_template.kmem_cachep =
3436                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3437                                   SLAB_HWCACHE_ALIGN, NULL);
3438         if (!ip6_dst_ops_template.kmem_cachep)
3439                 goto out;
3440
3441         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3442         if (ret)
3443                 goto out_kmem_cache;
3444
3445         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3446         if (ret)
3447                 goto out_dst_entries;
3448
3449         ret = register_pernet_subsys(&ip6_route_net_ops);
3450         if (ret)
3451                 goto out_register_inetpeer;
3452
3453         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3454
3455         /* Registering of the loopback is done before this portion of code,
3456          * the loopback reference in rt6_info will not be taken, do it
3457          * manually for init_net */
3458         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3459         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3460   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3461         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3462         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3463         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3464         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3465   #endif
3466         ret = fib6_init();
3467         if (ret)
3468                 goto out_register_subsys;
3469
3470         ret = xfrm6_init();
3471         if (ret)
3472                 goto out_fib6_init;
3473
3474         ret = fib6_rules_init();
3475         if (ret)
3476                 goto xfrm6_init;
3477
3478         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3479         if (ret)
3480                 goto fib6_rules_init;
3481
3482         ret = -ENOBUFS;
3483         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3484             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3485             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3486                 goto out_register_late_subsys;
3487
3488         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3489         if (ret)
3490                 goto out_register_late_subsys;
3491
3492         for_each_possible_cpu(cpu) {
3493                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3494
3495                 INIT_LIST_HEAD(&ul->head);
3496                 spin_lock_init(&ul->lock);
3497         }
3498
3499 out:
3500         return ret;
3501
3502 out_register_late_subsys:
3503         unregister_pernet_subsys(&ip6_route_net_late_ops);
3504 fib6_rules_init:
3505         fib6_rules_cleanup();
3506 xfrm6_init:
3507         xfrm6_fini();
3508 out_fib6_init:
3509         fib6_gc_cleanup();
3510 out_register_subsys:
3511         unregister_pernet_subsys(&ip6_route_net_ops);
3512 out_register_inetpeer:
3513         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3514 out_dst_entries:
3515         dst_entries_destroy(&ip6_dst_blackhole_ops);
3516 out_kmem_cache:
3517         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3518         goto out;
3519 }
3520
3521 void ip6_route_cleanup(void)
3522 {
3523         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3524         unregister_pernet_subsys(&ip6_route_net_late_ops);
3525         fib6_rules_cleanup();
3526         xfrm6_fini();
3527         fib6_gc_cleanup();
3528         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3529         unregister_pernet_subsys(&ip6_route_net_ops);
3530         dst_entries_destroy(&ip6_dst_blackhole_ops);
3531         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3532 }