ipv6: Fix a potential deadlock when creating pcpu rt
net/ipv6/route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60 #include <net/nexthop.h>
61
62 #include <asm/uaccess.h>
63
64 #ifdef CONFIG_SYSCTL
65 #include <linux/sysctl.h>
66 #endif
67
68 enum rt6_nud_state {
69         RT6_NUD_FAIL_HARD = -3,
70         RT6_NUD_FAIL_PROBE = -2,
71         RT6_NUD_FAIL_DO_RR = -1,
72         RT6_NUD_SUCCEED = 1
73 };
74
75 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
76 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int      ip6_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
87 static int              ip6_pkt_prohibit(struct sk_buff *skb);
88 static int              ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
89 static void             ip6_link_failure(struct sk_buff *skb);
90 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
91                                            struct sk_buff *skb, u32 mtu);
92 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
93                                         struct sk_buff *skb);
94 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
95 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
96
97 #ifdef CONFIG_IPV6_ROUTE_INFO
98 static struct rt6_info *rt6_add_route_info(struct net *net,
99                                            const struct in6_addr *prefix, int prefixlen,
100                                            const struct in6_addr *gwaddr, int ifindex,
101                                            unsigned int pref);
102 static struct rt6_info *rt6_get_route_info(struct net *net,
103                                            const struct in6_addr *prefix, int prefixlen,
104                                            const struct in6_addr *gwaddr, int ifindex);
105 #endif
106
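/* Per-cpu list of uncached (DST_NOCACHE) routes.  These routes are not
 * owned by the fib6 tree, so when a device goes away they have to be
 * found here and re-pointed at the loopback device; see
 * rt6_uncached_list_flush_dev().
 */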
107 struct uncached_list {
108         spinlock_t              lock;
109         struct list_head        head;
110 };
111
112 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
113
114 static void rt6_uncached_list_add(struct rt6_info *rt)
115 {
116         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
117
118         rt->dst.flags |= DST_NOCACHE;
119         rt->rt6i_uncached_list = ul;
120
121         spin_lock_bh(&ul->lock);
122         list_add_tail(&rt->rt6i_uncached, &ul->head);
123         spin_unlock_bh(&ul->lock);
124 }
125
126 static void rt6_uncached_list_del(struct rt6_info *rt)
127 {
128         if (!list_empty(&rt->rt6i_uncached)) {
129                 struct uncached_list *ul = rt->rt6i_uncached_list;
130
131                 spin_lock_bh(&ul->lock);
132                 list_del(&rt->rt6i_uncached);
133                 spin_unlock_bh(&ul->lock);
134         }
135 }
136
137 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
138 {
139         struct net_device *loopback_dev = net->loopback_dev;
140         int cpu;
141
142         for_each_possible_cpu(cpu) {
143                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
144                 struct rt6_info *rt;
145
146                 spin_lock_bh(&ul->lock);
147                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
148                         struct inet6_dev *rt_idev = rt->rt6i_idev;
149                         struct net_device *rt_dev = rt->dst.dev;
150
151                         if (rt_idev && (rt_idev->dev == dev || !dev) &&
152                             rt_idev->dev != loopback_dev) {
153                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
154                                 in6_dev_put(rt_idev);
155                         }
156
157                         if (rt_dev && (rt_dev == dev || !dev) &&
158                             rt_dev != loopback_dev) {
159                                 rt->dst.dev = loopback_dev;
160                                 dev_hold(rt->dst.dev);
161                                 dev_put(rt_dev);
162                         }
163                 }
164                 spin_unlock_bh(&ul->lock);
165         }
166 }
167
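/* Metrics handling: an RTF_PCPU clone writes straight into the metrics of
 * the route it was cloned from (dst.from), an RTF_CACHE clone never
 * copies-on-write, and everything else uses the generic COW helper.
 */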
168 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
169 {
170         return dst_metrics_write_ptr(rt->dst.from);
171 }
172
173 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
174 {
175         struct rt6_info *rt = (struct rt6_info *)dst;
176
177         if (rt->rt6i_flags & RTF_PCPU)
178                 return rt6_pcpu_cow_metrics(rt);
179         else if (rt->rt6i_flags & RTF_CACHE)
180                 return NULL;
181         else
182                 return dst_cow_metrics_generic(dst, old);
183 }
184
185 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
186                                              struct sk_buff *skb,
187                                              const void *daddr)
188 {
189         struct in6_addr *p = &rt->rt6i_gateway;
190
191         if (!ipv6_addr_any(p))
192                 return (const void *) p;
193         else if (skb)
194                 return &ipv6_hdr(skb)->daddr;
195         return daddr;
196 }
197
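/* Resolve (or create) the neighbour entry for this route, preferring the
 * configured gateway, then the packet's destination, then the caller's
 * destination address.
 */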
198 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
199                                           struct sk_buff *skb,
200                                           const void *daddr)
201 {
202         struct rt6_info *rt = (struct rt6_info *) dst;
203         struct neighbour *n;
204
205         daddr = choose_neigh_daddr(rt, skb, daddr);
206         n = __ipv6_neigh_lookup(dst->dev, daddr);
207         if (n)
208                 return n;
209         return neigh_create(&nd_tbl, daddr, dst->dev);
210 }
211
212 static struct dst_ops ip6_dst_ops_template = {
213         .family                 =       AF_INET6,
214         .gc                     =       ip6_dst_gc,
215         .gc_thresh              =       1024,
216         .check                  =       ip6_dst_check,
217         .default_advmss         =       ip6_default_advmss,
218         .mtu                    =       ip6_mtu,
219         .cow_metrics            =       ipv6_cow_metrics,
220         .destroy                =       ip6_dst_destroy,
221         .ifdown                 =       ip6_dst_ifdown,
222         .negative_advice        =       ip6_negative_advice,
223         .link_failure           =       ip6_link_failure,
224         .update_pmtu            =       ip6_rt_update_pmtu,
225         .redirect               =       rt6_do_redirect,
226         .local_out              =       __ip6_local_out,
227         .neigh_lookup           =       ip6_neigh_lookup,
228 };
229
230 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
231 {
232         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
233
234         return mtu ? : dst->dev->mtu;
235 }
236
237 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
238                                          struct sk_buff *skb, u32 mtu)
239 {
240 }
241
242 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
243                                       struct sk_buff *skb)
244 {
245 }
246
247 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
248                                          unsigned long old)
249 {
250         return NULL;
251 }
252
253 static struct dst_ops ip6_dst_blackhole_ops = {
254         .family                 =       AF_INET6,
255         .destroy                =       ip6_dst_destroy,
256         .check                  =       ip6_dst_check,
257         .mtu                    =       ip6_blackhole_mtu,
258         .default_advmss         =       ip6_default_advmss,
259         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
260         .redirect               =       ip6_rt_blackhole_redirect,
261         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
262         .neigh_lookup           =       ip6_neigh_lookup,
263 };
264
265 static const u32 ip6_template_metrics[RTAX_MAX] = {
266         [RTAX_HOPLIMIT - 1] = 0,
267 };
268
269 static const struct rt6_info ip6_null_entry_template = {
270         .dst = {
271                 .__refcnt       = ATOMIC_INIT(1),
272                 .__use          = 1,
273                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
274                 .error          = -ENETUNREACH,
275                 .input          = ip6_pkt_discard,
276                 .output         = ip6_pkt_discard_out,
277         },
278         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
279         .rt6i_protocol  = RTPROT_KERNEL,
280         .rt6i_metric    = ~(u32) 0,
281         .rt6i_ref       = ATOMIC_INIT(1),
282 };
283
284 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
285
286 static const struct rt6_info ip6_prohibit_entry_template = {
287         .dst = {
288                 .__refcnt       = ATOMIC_INIT(1),
289                 .__use          = 1,
290                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
291                 .error          = -EACCES,
292                 .input          = ip6_pkt_prohibit,
293                 .output         = ip6_pkt_prohibit_out,
294         },
295         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
296         .rt6i_protocol  = RTPROT_KERNEL,
297         .rt6i_metric    = ~(u32) 0,
298         .rt6i_ref       = ATOMIC_INIT(1),
299 };
300
301 static const struct rt6_info ip6_blk_hole_entry_template = {
302         .dst = {
303                 .__refcnt       = ATOMIC_INIT(1),
304                 .__use          = 1,
305                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
306                 .error          = -EINVAL,
307                 .input          = dst_discard,
308                 .output         = dst_discard_sk,
309         },
310         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
311         .rt6i_protocol  = RTPROT_KERNEL,
312         .rt6i_metric    = ~(u32) 0,
313         .rt6i_ref       = ATOMIC_INIT(1),
314 };
315
316 #endif
317
318 /* allocate dst with ip6_dst_ops */
319 static struct rt6_info *__ip6_dst_alloc(struct net *net,
320                                         struct net_device *dev,
321                                         int flags)
322 {
323         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
324                                         0, DST_OBSOLETE_FORCE_CHK, flags);
325
326         if (rt) {
327                 struct dst_entry *dst = &rt->dst;
328
329                 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
330                 INIT_LIST_HEAD(&rt->rt6i_siblings);
331                 INIT_LIST_HEAD(&rt->rt6i_uncached);
332         }
333         return rt;
334 }
335
336 static struct rt6_info *ip6_dst_alloc(struct net *net,
337                                       struct net_device *dev,
338                                       int flags)
339 {
340         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
341
342         if (rt) {
343                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
344                 if (rt->rt6i_pcpu) {
345                         int cpu;
346
347                         for_each_possible_cpu(cpu) {
348                                 struct rt6_info **p;
349
350                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
351                                 /* no one shares rt */
352                                 *p =  NULL;
353                         }
354                 } else {
355                         dst_destroy((struct dst_entry *)rt);
356                         return NULL;
357                 }
358         }
359
360         return rt;
361 }
362
363 static void ip6_dst_destroy(struct dst_entry *dst)
364 {
365         struct rt6_info *rt = (struct rt6_info *)dst;
366         struct dst_entry *from = dst->from;
367         struct inet6_dev *idev;
368
369         dst_destroy_metrics_generic(dst);
370         free_percpu(rt->rt6i_pcpu);
371         rt6_uncached_list_del(rt);
372
373         idev = rt->rt6i_idev;
374         if (idev) {
375                 rt->rt6i_idev = NULL;
376                 in6_dev_put(idev);
377         }
378
379         dst->from = NULL;
380         dst_release(from);
381 }
382
383 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
384                            int how)
385 {
386         struct rt6_info *rt = (struct rt6_info *)dst;
387         struct inet6_dev *idev = rt->rt6i_idev;
388         struct net_device *loopback_dev =
389                 dev_net(dev)->loopback_dev;
390
391         if (dev != loopback_dev) {
392                 if (idev && idev->dev == dev) {
393                         struct inet6_dev *loopback_idev =
394                                 in6_dev_get(loopback_dev);
395                         if (loopback_idev) {
396                                 rt->rt6i_idev = loopback_idev;
397                                 in6_dev_put(idev);
398                         }
399                 }
400         }
401 }
402
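/* A route has expired if RTF_EXPIRES says so or, for a clone, if the
 * route it was copied from (dst.from) has expired.
 */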
403 static bool rt6_check_expired(const struct rt6_info *rt)
404 {
405         if (rt->rt6i_flags & RTF_EXPIRES) {
406                 if (time_after(jiffies, rt->dst.expires))
407                         return true;
408         } else if (rt->dst.from) {
409                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
410         }
411         return false;
412 }
413
414 /* Multipath route selection:
415  *   Hash-based function using the packet header and flow label.
416  * Adapted from fib_info_hashfn().
417  */
418 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
419                                const struct flowi6 *fl6)
420 {
421         unsigned int val = fl6->flowi6_proto;
422
423         val ^= ipv6_addr_hash(&fl6->daddr);
424         val ^= ipv6_addr_hash(&fl6->saddr);
425
426         /* Works only if this is not encapsulated */
427         switch (fl6->flowi6_proto) {
428         case IPPROTO_UDP:
429         case IPPROTO_TCP:
430         case IPPROTO_SCTP:
431                 val ^= (__force u16)fl6->fl6_sport;
432                 val ^= (__force u16)fl6->fl6_dport;
433                 break;
434
435         case IPPROTO_ICMPV6:
436                 val ^= (__force u16)fl6->fl6_icmp_type;
437                 val ^= (__force u16)fl6->fl6_icmp_code;
438                 break;
439         }
440         /* RFC 6438 recommends using the flow label */
441         val ^= (__force u32)fl6->flowlabel;
442
443         /* Perhaps we need to tune this function? */
444         val = val ^ (val >> 7) ^ (val >> 12);
445         return val % candidate_count;
446 }
447
448 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
449                                              struct flowi6 *fl6, int oif,
450                                              int strict)
451 {
452         struct rt6_info *sibling, *next_sibling;
453         int route_chosen;
454
455         route_chosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
456         /* Don't change the route if route_chosen == 0
457          * (the sibling list does not include ourselves)
458          */
459         if (route_chosen)
460                 list_for_each_entry_safe(sibling, next_sibling,
461                                 &match->rt6i_siblings, rt6i_siblings) {
462                         route_chosen--;
463                         if (route_chosen == 0) {
464                                 if (rt6_score_route(sibling, oif, strict) < 0)
465                                         break;
466                                 match = sibling;
467                                 break;
468                         }
469                 }
470         return match;
471 }
472
473 /*
474  *      Route lookup.  The matching table's tb6_lock is assumed to be held.
475  */
476
477 static inline struct rt6_info *rt6_device_match(struct net *net,
478                                                     struct rt6_info *rt,
479                                                     const struct in6_addr *saddr,
480                                                     int oif,
481                                                     int flags)
482 {
483         struct rt6_info *local = NULL;
484         struct rt6_info *sprt;
485
486         if (!oif && ipv6_addr_any(saddr))
487                 goto out;
488
489         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
490                 struct net_device *dev = sprt->dst.dev;
491
492                 if (oif) {
493                         if (dev->ifindex == oif)
494                                 return sprt;
495                         if (dev->flags & IFF_LOOPBACK) {
496                                 if (!sprt->rt6i_idev ||
497                                     sprt->rt6i_idev->dev->ifindex != oif) {
498                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
499                                                 continue;
500                                         if (local && (!oif ||
501                                                       local->rt6i_idev->dev->ifindex == oif))
502                                                 continue;
503                                 }
504                                 local = sprt;
505                         }
506                 } else {
507                         if (ipv6_chk_addr(net, saddr, dev,
508                                           flags & RT6_LOOKUP_F_IFACE))
509                                 return sprt;
510                 }
511         }
512
513         if (oif) {
514                 if (local)
515                         return local;
516
517                 if (flags & RT6_LOOKUP_F_IFACE)
518                         return net->ipv6.ip6_null_entry;
519         }
520 out:
521         return rt;
522 }
523
524 #ifdef CONFIG_IPV6_ROUTER_PREF
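/* Deferred neighbour-solicitation work: rt6_probe() queues one of these so
 * the actual NS transmission happens later in process context.
 */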
525 struct __rt6_probe_work {
526         struct work_struct work;
527         struct in6_addr target;
528         struct net_device *dev;
529 };
530
531 static void rt6_probe_deferred(struct work_struct *w)
532 {
533         struct in6_addr mcaddr;
534         struct __rt6_probe_work *work =
535                 container_of(w, struct __rt6_probe_work, work);
536
537         addrconf_addr_solict_mult(&work->target, &mcaddr);
538         ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
539         dev_put(work->dev);
540         kfree(work);
541 }
542
543 static void rt6_probe(struct rt6_info *rt)
544 {
545         struct neighbour *neigh;
546         /*
547          * Router Reachability Probing: check whether the gateway
548          * of this route is actually reachable before we keep
549          * preferring it.
550          *
551          * Router Reachability Probes MUST be rate-limited
552          * to no more than one per minute.
553          */
554         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
555                 return;
556         rcu_read_lock_bh();
557         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
558         if (neigh) {
559                 write_lock(&neigh->lock);
560                 if (neigh->nud_state & NUD_VALID)
561                         goto out;
562         }
563
564         if (!neigh ||
565             time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
566                 struct __rt6_probe_work *work;
567
568                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
569
570                 if (neigh && work)
571                         __neigh_set_probe_once(neigh);
572
573                 if (neigh)
574                         write_unlock(&neigh->lock);
575
576                 if (work) {
577                         INIT_WORK(&work->work, rt6_probe_deferred);
578                         work->target = rt->rt6i_gateway;
579                         dev_hold(rt->dst.dev);
580                         work->dev = rt->dst.dev;
581                         schedule_work(&work->work);
582                 }
583         } else {
584 out:
585                 write_unlock(&neigh->lock);
586         }
587         rcu_read_unlock_bh();
588 }
589 #else
590 static inline void rt6_probe(struct rt6_info *rt)
591 {
592 }
593 #endif
594
595 /*
596  * Default Router Selection (RFC 4861, section 6.3.6)
597  */
598 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
599 {
600         struct net_device *dev = rt->dst.dev;
601         if (!oif || dev->ifindex == oif)
602                 return 2;
603         if ((dev->flags & IFF_LOOPBACK) &&
604             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
605                 return 1;
606         return 0;
607 }
608
609 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
610 {
611         struct neighbour *neigh;
612         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
613
614         if (rt->rt6i_flags & RTF_NONEXTHOP ||
615             !(rt->rt6i_flags & RTF_GATEWAY))
616                 return RT6_NUD_SUCCEED;
617
618         rcu_read_lock_bh();
619         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
620         if (neigh) {
621                 read_lock(&neigh->lock);
622                 if (neigh->nud_state & NUD_VALID)
623                         ret = RT6_NUD_SUCCEED;
624 #ifdef CONFIG_IPV6_ROUTER_PREF
625                 else if (!(neigh->nud_state & NUD_FAILED))
626                         ret = RT6_NUD_SUCCEED;
627                 else
628                         ret = RT6_NUD_FAIL_PROBE;
629 #endif
630                 read_unlock(&neigh->lock);
631         } else {
632                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
633                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
634         }
635         rcu_read_unlock_bh();
636
637         return ret;
638 }
639
640 static int rt6_score_route(struct rt6_info *rt, int oif,
641                            int strict)
642 {
643         int m;
644
645         m = rt6_check_dev(rt, oif);
646         if (!m && (strict & RT6_LOOKUP_F_IFACE))
647                 return RT6_NUD_FAIL_HARD;
648 #ifdef CONFIG_IPV6_ROUTER_PREF
649         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
650 #endif
651         if (strict & RT6_LOOKUP_F_REACHABLE) {
652                 int n = rt6_check_neigh(rt);
653                 if (n < 0)
654                         return n;
655         }
656         return m;
657 }
658
659 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
660                                    int *mpri, struct rt6_info *match,
661                                    bool *do_rr)
662 {
663         int m;
664         bool match_do_rr = false;
665
666         if (rt6_check_expired(rt))
667                 goto out;
668
669         m = rt6_score_route(rt, oif, strict);
670         if (m == RT6_NUD_FAIL_DO_RR) {
671                 match_do_rr = true;
672                 m = 0; /* lowest valid score */
673         } else if (m == RT6_NUD_FAIL_HARD) {
674                 goto out;
675         }
676
677         if (strict & RT6_LOOKUP_F_REACHABLE)
678                 rt6_probe(rt);
679
680         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
681         if (m > *mpri) {
682                 *do_rr = match_do_rr;
683                 *mpri = m;
684                 match = rt;
685         }
686 out:
687         return match;
688 }
689
690 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
691                                      struct rt6_info *rr_head,
692                                      u32 metric, int oif, int strict,
693                                      bool *do_rr)
694 {
695         struct rt6_info *rt, *match, *cont;
696         int mpri = -1;
697
698         match = NULL;
699         cont = NULL;
700         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
701                 if (rt->rt6i_metric != metric) {
702                         cont = rt;
703                         break;
704                 }
705
706                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
707         }
708
709         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
710                 if (rt->rt6i_metric != metric) {
711                         cont = rt;
712                         break;
713                 }
714
715                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
716         }
717
718         if (match || !cont)
719                 return match;
720
721         for (rt = cont; rt; rt = rt->dst.rt6_next)
722                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
723
724         return match;
725 }
726
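/* Pick the best route among the equal-metric siblings of this node,
 * starting the scan at fn->rr_ptr; when round-robin is requested (do_rr),
 * advance rr_ptr so the next lookup starts at the following sibling.
 */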
727 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
728 {
729         struct rt6_info *match, *rt0;
730         struct net *net;
731         bool do_rr = false;
732
733         rt0 = fn->rr_ptr;
734         if (!rt0)
735                 fn->rr_ptr = rt0 = fn->leaf;
736
737         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
738                              &do_rr);
739
740         if (do_rr) {
741                 struct rt6_info *next = rt0->dst.rt6_next;
742
743                 /* no entries matched; do round-robin */
744                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
745                         next = fn->leaf;
746
747                 if (next != rt0)
748                         fn->rr_ptr = next;
749         }
750
751         net = dev_net(rt0->dst.dev);
752         return match ? match : net->ipv6.ip6_null_entry;
753 }
754
755 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
756 {
757         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
758 }
759
760 #ifdef CONFIG_IPV6_ROUTE_INFO
761 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
762                   const struct in6_addr *gwaddr)
763 {
764         struct net *net = dev_net(dev);
765         struct route_info *rinfo = (struct route_info *) opt;
766         struct in6_addr prefix_buf, *prefix;
767         unsigned int pref;
768         unsigned long lifetime;
769         struct rt6_info *rt;
770
771         if (len < sizeof(struct route_info)) {
772                 return -EINVAL;
773         }
774
775         /* Sanity check for prefix_len and length */
776         if (rinfo->length > 3) {
777                 return -EINVAL;
778         } else if (rinfo->prefix_len > 128) {
779                 return -EINVAL;
780         } else if (rinfo->prefix_len > 64) {
781                 if (rinfo->length < 2) {
782                         return -EINVAL;
783                 }
784         } else if (rinfo->prefix_len > 0) {
785                 if (rinfo->length < 1) {
786                         return -EINVAL;
787                 }
788         }
789
790         pref = rinfo->route_pref;
791         if (pref == ICMPV6_ROUTER_PREF_INVALID)
792                 return -EINVAL;
793
794         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
795
796         if (rinfo->length == 3)
797                 prefix = (struct in6_addr *)rinfo->prefix;
798         else {
799                 /* this call is safe: prefix_len was validated above */
800                 ipv6_addr_prefix(&prefix_buf,
801                                  (struct in6_addr *)rinfo->prefix,
802                                  rinfo->prefix_len);
803                 prefix = &prefix_buf;
804         }
805
806         if (rinfo->prefix_len == 0)
807                 rt = rt6_get_dflt_router(gwaddr, dev);
808         else
809                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
810                                         gwaddr, dev->ifindex);
811
812         if (rt && !lifetime) {
813                 ip6_del_rt(rt);
814                 rt = NULL;
815         }
816
817         if (!rt && lifetime)
818                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
819                                         pref);
820         else if (rt)
821                 rt->rt6i_flags = RTF_ROUTEINFO |
822                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
823
824         if (rt) {
825                 if (!addrconf_finite_timeout(lifetime))
826                         rt6_clean_expires(rt);
827                 else
828                         rt6_set_expires(rt, jiffies + HZ * lifetime);
829
830                 ip6_rt_put(rt);
831         }
832         return 0;
833 }
834 #endif
835
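/* Back out of a failed lookup: climb towards the tree root, descending
 * into a parent's source-routed subtree when there is one, until a node
 * carrying route info (RTN_RTINFO) is found or the root is reached.
 */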
836 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
837                                         struct in6_addr *saddr)
838 {
839         struct fib6_node *pn;
840         while (1) {
841                 if (fn->fn_flags & RTN_TL_ROOT)
842                         return NULL;
843                 pn = fn->parent;
844                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
845                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
846                 else
847                         fn = pn;
848                 if (fn->fn_flags & RTN_RTINFO)
849                         return fn;
850         }
851 }
852
853 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
854                                              struct fib6_table *table,
855                                              struct flowi6 *fl6, int flags)
856 {
857         struct fib6_node *fn;
858         struct rt6_info *rt;
859
860         read_lock_bh(&table->tb6_lock);
861         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
862 restart:
863         rt = fn->leaf;
864         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
865         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
866                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
867         if (rt == net->ipv6.ip6_null_entry) {
868                 fn = fib6_backtrack(fn, &fl6->saddr);
869                 if (fn)
870                         goto restart;
871         }
872         dst_use(&rt->dst, jiffies);
873         read_unlock_bh(&table->tb6_lock);
874         return rt;
875
876 }
877
878 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
879                                     int flags)
880 {
881         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
882 }
883 EXPORT_SYMBOL_GPL(ip6_route_lookup);
884
885 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
886                             const struct in6_addr *saddr, int oif, int strict)
887 {
888         struct flowi6 fl6 = {
889                 .flowi6_oif = oif,
890                 .daddr = *daddr,
891         };
892         struct dst_entry *dst;
893         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
894
895         if (saddr) {
896                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
897                 flags |= RT6_LOOKUP_F_HAS_SADDR;
898         }
899
900         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
901         if (dst->error == 0)
902                 return (struct rt6_info *) dst;
903
904         dst_release(dst);
905
906         return NULL;
907 }
908 EXPORT_SYMBOL(rt6_lookup);
909
910 /* ip6_ins_rt is called with table->tb6_lock NOT held (it takes it itself).
911    It takes a new route entry; if the addition fails for any reason, the
912    route is freed. In any case, if the caller does not hold a reference,
913    the route may be destroyed.
914  */
915
916 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
917                         struct mx6_config *mxc)
918 {
919         int err;
920         struct fib6_table *table;
921
922         table = rt->rt6i_table;
923         write_lock_bh(&table->tb6_lock);
924         err = fib6_add(&table->tb6_root, rt, info, mxc);
925         write_unlock_bh(&table->tb6_lock);
926
927         return err;
928 }
929
930 int ip6_ins_rt(struct rt6_info *rt)
931 {
932         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
933         struct mx6_config mxc = { .mx = NULL, };
934
935         return __ip6_ins_rt(rt, &info, &mxc);
936 }
937
938 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
939                                            const struct in6_addr *daddr,
940                                            const struct in6_addr *saddr)
941 {
942         struct rt6_info *rt;
943
944         /*
945          *      Clone the route.
946          */
947
948         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
949                 ort = (struct rt6_info *)ort->dst.from;
950
951         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
952
953         if (!rt)
954                 return NULL;
955
956         ip6_rt_copy_init(rt, ort);
957         rt->rt6i_flags |= RTF_CACHE;
958         rt->rt6i_metric = 0;
959         rt->dst.flags |= DST_HOST;
960         rt->rt6i_dst.addr = *daddr;
961         rt->rt6i_dst.plen = 128;
962
963         if (!rt6_is_gw_or_nonexthop(ort)) {
964                 if (ort->rt6i_dst.plen != 128 &&
965                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
966                         rt->rt6i_flags |= RTF_ANYCAST;
967 #ifdef CONFIG_IPV6_SUBTREES
968                 if (rt->rt6i_src.plen && saddr) {
969                         rt->rt6i_src.addr = *saddr;
970                         rt->rt6i_src.plen = 128;
971                 }
972 #endif
973         }
974
975         return rt;
976 }
977
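/* Allocate a per-cpu clone of rt; the clone carries RTF_PCPU and inherits
 * the parent's fields via ip6_rt_copy_init().
 */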
978 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
979 {
980         struct rt6_info *pcpu_rt;
981
982         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
983                                   rt->dst.dev, rt->dst.flags);
984
985         if (!pcpu_rt)
986                 return NULL;
987         ip6_rt_copy_init(pcpu_rt, rt);
988         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
989         pcpu_rt->rt6i_flags |= RTF_PCPU;
990         return pcpu_rt;
991 }
992
993 /* It should be called with read_lock_bh(&tb6_lock) acquired */
994 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
995 {
996         struct rt6_info *pcpu_rt, **p;
997
998         p = this_cpu_ptr(rt->rt6i_pcpu);
999         pcpu_rt = *p;
1000
1001         if (pcpu_rt) {
1002                 dst_hold(&pcpu_rt->dst);
1003                 rt6_dst_from_metrics_check(pcpu_rt);
1004         }
1005         return pcpu_rt;
1006 }
1007
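/* Called without the table lock held: ip6_rt_pcpu_alloc() may trigger
 * ip6_dst_gc(), which takes the write lock, so the clone is allocated
 * first and the read lock is only taken to publish it with cmpxchg().
 */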
1008 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1009 {
1010         struct fib6_table *table = rt->rt6i_table;
1011         struct rt6_info *pcpu_rt, *prev, **p;
1012
1013         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1014         if (!pcpu_rt) {
1015                 struct net *net = dev_net(rt->dst.dev);
1016
1017                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1018                 return net->ipv6.ip6_null_entry;
1019         }
1020
1021         read_lock_bh(&table->tb6_lock);
1022         if (rt->rt6i_pcpu) {
1023                 p = this_cpu_ptr(rt->rt6i_pcpu);
1024                 prev = cmpxchg(p, NULL, pcpu_rt);
1025                 if (prev) {
1026                         /* If someone did it before us, return prev instead */
1027                         dst_destroy(&pcpu_rt->dst);
1028                         pcpu_rt = prev;
1029                 }
1030         } else {
1031                 /* rt has been removed from the fib6 tree
1032                  * before we have a chance to acquire the read_lock.
1033                  * In this case, don't bother to create a pcpu rt
1034                  * since rt is going away anyway.  The next
1035                  * dst_check() will trigger a re-lookup.
1036                  */
1037                 dst_destroy(&pcpu_rt->dst);
1038                 pcpu_rt = rt;
1039         }
1040         dst_hold(&pcpu_rt->dst);
1041         rt6_dst_from_metrics_check(pcpu_rt);
1042         read_unlock_bh(&table->tb6_lock);
1043         return pcpu_rt;
1044 }
1045
1046 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
1047                                       struct flowi6 *fl6, int flags)
1048 {
1049         struct fib6_node *fn, *saved_fn;
1050         struct rt6_info *rt;
1051         int strict = 0;
1052
1053         strict |= flags & RT6_LOOKUP_F_IFACE;
1054         if (net->ipv6.devconf_all->forwarding == 0)
1055                 strict |= RT6_LOOKUP_F_REACHABLE;
1056
1057         read_lock_bh(&table->tb6_lock);
1058
1059         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1060         saved_fn = fn;
1061
1062 redo_rt6_select:
1063         rt = rt6_select(fn, oif, strict);
1064         if (rt->rt6i_nsiblings)
1065                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1066         if (rt == net->ipv6.ip6_null_entry) {
1067                 fn = fib6_backtrack(fn, &fl6->saddr);
1068                 if (fn)
1069                         goto redo_rt6_select;
1070                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1071                         /* also consider unreachable route */
1072                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1073                         fn = saved_fn;
1074                         goto redo_rt6_select;
1075                 }
1076         }
1077
1078
1079         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1080                 dst_use(&rt->dst, jiffies);
1081                 read_unlock_bh(&table->tb6_lock);
1082
1083                 rt6_dst_from_metrics_check(rt);
1084                 return rt;
1085         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1086                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1087                 /* Create a RTF_CACHE clone which will not be
1088                  * owned by the fib6 tree.  It is for the special case where
1089                  * the daddr in the skb during the neighbor look-up is different
1090                  * from the fl6->daddr used to look up the route here.
1091                  */
1092
1093                 struct rt6_info *uncached_rt;
1094
1095                 dst_use(&rt->dst, jiffies);
1096                 read_unlock_bh(&table->tb6_lock);
1097
1098                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1099                 dst_release(&rt->dst);
1100
1101                 if (uncached_rt)
1102                         rt6_uncached_list_add(uncached_rt);
1103                 else
1104                         uncached_rt = net->ipv6.ip6_null_entry;
1105
1106                 dst_hold(&uncached_rt->dst);
1107                 return uncached_rt;
1108
1109         } else {
1110                 /* Get a percpu copy */
1111
1112                 struct rt6_info *pcpu_rt;
1113
1114                 rt->dst.lastuse = jiffies;
1115                 rt->dst.__use++;
1116                 pcpu_rt = rt6_get_pcpu_route(rt);
1117
1118                 if (pcpu_rt) {
1119                         read_unlock_bh(&table->tb6_lock);
1120                 } else {
1121                         /* We have to do the read_unlock first
1122                          * because rt6_make_pcpu_route() may trigger
1123                          * ip6_dst_gc() which will take the write_lock.
1124                          */
1125                         dst_hold(&rt->dst);
1126                         read_unlock_bh(&table->tb6_lock);
1127                         pcpu_rt = rt6_make_pcpu_route(rt);
1128                         dst_release(&rt->dst);
1129                 }
1130
1131                 return pcpu_rt;
1132
1133         }
1134 }
1135
1136 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1137                                             struct flowi6 *fl6, int flags)
1138 {
1139         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1140 }
1141
1142 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1143                                                 struct net_device *dev,
1144                                                 struct flowi6 *fl6, int flags)
1145 {
1146         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1147                 flags |= RT6_LOOKUP_F_IFACE;
1148
1149         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1150 }
1151
1152 void ip6_route_input(struct sk_buff *skb)
1153 {
1154         const struct ipv6hdr *iph = ipv6_hdr(skb);
1155         struct net *net = dev_net(skb->dev);
1156         int flags = RT6_LOOKUP_F_HAS_SADDR;
1157         struct flowi6 fl6 = {
1158                 .flowi6_iif = skb->dev->ifindex,
1159                 .daddr = iph->daddr,
1160                 .saddr = iph->saddr,
1161                 .flowlabel = ip6_flowinfo(iph),
1162                 .flowi6_mark = skb->mark,
1163                 .flowi6_proto = iph->nexthdr,
1164         };
1165
1166         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1167 }
1168
1169 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1170                                              struct flowi6 *fl6, int flags)
1171 {
1172         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1173 }
1174
1175 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
1176                                     struct flowi6 *fl6)
1177 {
1178         int flags = 0;
1179
1180         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1181
1182         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
1183                 flags |= RT6_LOOKUP_F_IFACE;
1184
1185         if (!ipv6_addr_any(&fl6->saddr))
1186                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1187         else if (sk)
1188                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1189
1190         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1191 }
1192 EXPORT_SYMBOL(ip6_route_output);
1193
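/* Clone dst_orig into a "blackhole" copy backed by ip6_dst_blackhole_ops:
 * it keeps the original's metrics, device and addresses but silently
 * discards every packet sent through it.
 */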
1194 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1195 {
1196         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1197         struct dst_entry *new = NULL;
1198
1199         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1200         if (rt) {
1201                 new = &rt->dst;
1202
1203                 memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
1204
1205                 new->__use = 1;
1206                 new->input = dst_discard;
1207                 new->output = dst_discard_sk;
1208
1209                 if (dst_metrics_read_only(&ort->dst))
1210                         new->_metrics = ort->dst._metrics;
1211                 else
1212                         dst_copy_metrics(new, &ort->dst);
1213                 rt->rt6i_idev = ort->rt6i_idev;
1214                 if (rt->rt6i_idev)
1215                         in6_dev_hold(rt->rt6i_idev);
1216
1217                 rt->rt6i_gateway = ort->rt6i_gateway;
1218                 rt->rt6i_flags = ort->rt6i_flags;
1219                 rt->rt6i_metric = 0;
1220
1221                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1222 #ifdef CONFIG_IPV6_SUBTREES
1223                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1224 #endif
1225
1226                 dst_free(new);
1227         }
1228
1229         dst_release(dst_orig);
1230         return new ? new : ERR_PTR(-ENOMEM);
1231 }
1232
1233 /*
1234  *      Destination cache support functions
1235  */
1236
1237 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1238 {
1239         if (rt->dst.from &&
1240             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1241                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1242 }
1243
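/* A cached dst is only valid while its fib6 node still exists and the
 * node's serial number matches the cookie handed out at lookup time;
 * a bumped sernum or an expired route forces a re-lookup.
 */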
1244 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1245 {
1246         if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1247                 return NULL;
1248
1249         if (rt6_check_expired(rt))
1250                 return NULL;
1251
1252         return &rt->dst;
1253 }
1254
1255 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1256 {
1257         if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1258             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1259                 return &rt->dst;
1260         else
1261                 return NULL;
1262 }
1263
1264 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1265 {
1266         struct rt6_info *rt;
1267
1268         rt = (struct rt6_info *) dst;
1269
1270         /* All IPV6 dsts are created with ->obsolete set to the value
1271          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1272          * into this function always.
1273          */
1274
1275         rt6_dst_from_metrics_check(rt);
1276
1277         if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
1278                 return rt6_dst_from_check(rt, cookie);
1279         else
1280                 return rt6_check(rt, cookie);
1281 }
1282
1283 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1284 {
1285         struct rt6_info *rt = (struct rt6_info *) dst;
1286
1287         if (rt) {
1288                 if (rt->rt6i_flags & RTF_CACHE) {
1289                         if (rt6_check_expired(rt)) {
1290                                 ip6_del_rt(rt);
1291                                 dst = NULL;
1292                         }
1293                 } else {
1294                         dst_release(dst);
1295                         dst = NULL;
1296                 }
1297         }
1298         return dst;
1299 }
1300
1301 static void ip6_link_failure(struct sk_buff *skb)
1302 {
1303         struct rt6_info *rt;
1304
1305         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1306
1307         rt = (struct rt6_info *) skb_dst(skb);
1308         if (rt) {
1309                 if (rt->rt6i_flags & RTF_CACHE) {
1310                         dst_hold(&rt->dst);
1311                         if (ip6_del_rt(rt))
1312                                 dst_free(&rt->dst);
1313                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1314                         rt->rt6i_node->fn_sernum = -1;
1315                 }
1316         }
1317 }
1318
1319 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1320 {
1321         struct net *net = dev_net(rt->dst.dev);
1322
1323         rt->rt6i_flags |= RTF_MODIFIED;
1324         rt->rt6i_pmtu = mtu;
1325         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1326 }
1327
1328 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1329                                  const struct ipv6hdr *iph, u32 mtu)
1330 {
1331         struct rt6_info *rt6 = (struct rt6_info *)dst;
1332
1333         if (rt6->rt6i_flags & RTF_LOCAL)
1334                 return;
1335
1336         dst_confirm(dst);
1337         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1338         if (mtu >= dst_mtu(dst))
1339                 return;
1340
1341         if (rt6->rt6i_flags & RTF_CACHE) {
1342                 rt6_do_update_pmtu(rt6, mtu);
1343         } else {
1344                 const struct in6_addr *daddr, *saddr;
1345                 struct rt6_info *nrt6;
1346
1347                 if (iph) {
1348                         daddr = &iph->daddr;
1349                         saddr = &iph->saddr;
1350                 } else if (sk) {
1351                         daddr = &sk->sk_v6_daddr;
1352                         saddr = &inet6_sk(sk)->saddr;
1353                 } else {
1354                         return;
1355                 }
1356                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1357                 if (nrt6) {
1358                         rt6_do_update_pmtu(nrt6, mtu);
1359
1360                         /* ip6_ins_rt(nrt6) will bump the
1361                          * rt6->rt6i_node->fn_sernum
1362                          * which will fail the next rt6_check() and
1363                          * invalidate the sk->sk_dst_cache.
1364                          */
1365                         ip6_ins_rt(nrt6);
1366                 }
1367         }
1368 }
1369
1370 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1371                                struct sk_buff *skb, u32 mtu)
1372 {
1373         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1374 }
1375
1376 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1377                      int oif, u32 mark)
1378 {
1379         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1380         struct dst_entry *dst;
1381         struct flowi6 fl6;
1382
1383         memset(&fl6, 0, sizeof(fl6));
1384         fl6.flowi6_oif = oif;
1385         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1386         fl6.daddr = iph->daddr;
1387         fl6.saddr = iph->saddr;
1388         fl6.flowlabel = ip6_flowinfo(iph);
1389
1390         dst = ip6_route_output(net, NULL, &fl6);
1391         if (!dst->error)
1392                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1393         dst_release(dst);
1394 }
1395 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1396
1397 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1398 {
1399         ip6_update_pmtu(skb, sock_net(sk), mtu,
1400                         sk->sk_bound_dev_if, sk->sk_mark);
1401 }
1402 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1403
1404 /* Handle redirects */
1405 struct ip6rd_flowi {
1406         struct flowi6 fl6;
1407         struct in6_addr gateway;
1408 };
1409
1410 static struct rt6_info *__ip6_route_redirect(struct net *net,
1411                                              struct fib6_table *table,
1412                                              struct flowi6 *fl6,
1413                                              int flags)
1414 {
1415         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1416         struct rt6_info *rt;
1417         struct fib6_node *fn;
1418
1419         /* Get the "current" route for this destination and
1420          * check if the redirect has come from an appropriate router.
1421          *
1422          * RFC 4861 specifies that redirects should only be
1423          * accepted if they come from the nexthop to the target.
1424          * Due to the way the routes are chosen, this notion
1425          * is a bit fuzzy and one might need to check all possible
1426          * routes.
1427          */
1428
1429         read_lock_bh(&table->tb6_lock);
1430         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1431 restart:
1432         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1433                 if (rt6_check_expired(rt))
1434                         continue;
1435                 if (rt->dst.error)
1436                         break;
1437                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1438                         continue;
1439                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1440                         continue;
1441                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1442                         continue;
1443                 break;
1444         }
1445
1446         if (!rt)
1447                 rt = net->ipv6.ip6_null_entry;
1448         else if (rt->dst.error) {
1449                 rt = net->ipv6.ip6_null_entry;
1450                 goto out;
1451         }
1452
1453         if (rt == net->ipv6.ip6_null_entry) {
1454                 fn = fib6_backtrack(fn, &fl6->saddr);
1455                 if (fn)
1456                         goto restart;
1457         }
1458
1459 out:
1460         dst_hold(&rt->dst);
1461
1462         read_unlock_bh(&table->tb6_lock);
1463
1464         return rt;
1465 }
1466
1467 static struct dst_entry *ip6_route_redirect(struct net *net,
1468                                         const struct flowi6 *fl6,
1469                                         const struct in6_addr *gateway)
1470 {
1471         int flags = RT6_LOOKUP_F_HAS_SADDR;
1472         struct ip6rd_flowi rdfl;
1473
1474         rdfl.fl6 = *fl6;
1475         rdfl.gateway = *gateway;
1476
1477         return fib6_rule_lookup(net, &rdfl.fl6,
1478                                 flags, __ip6_route_redirect);
1479 }
1480
1481 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1482 {
1483         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1484         struct dst_entry *dst;
1485         struct flowi6 fl6;
1486
1487         memset(&fl6, 0, sizeof(fl6));
1488         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1489         fl6.flowi6_oif = oif;
1490         fl6.flowi6_mark = mark;
1491         fl6.daddr = iph->daddr;
1492         fl6.saddr = iph->saddr;
1493         fl6.flowlabel = ip6_flowinfo(iph);
1494
1495         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1496         rt6_do_redirect(dst, NULL, skb);
1497         dst_release(dst);
1498 }
1499 EXPORT_SYMBOL_GPL(ip6_redirect);
1500
1501 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1502                             u32 mark)
1503 {
1504         const struct ipv6hdr *iph = ipv6_hdr(skb);
1505         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1506         struct dst_entry *dst;
1507         struct flowi6 fl6;
1508
1509         memset(&fl6, 0, sizeof(fl6));
1510         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1511         fl6.flowi6_oif = oif;
1512         fl6.flowi6_mark = mark;
1513         fl6.daddr = msg->dest;
1514         fl6.saddr = iph->daddr;
1515
1516         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1517         rt6_do_redirect(dst, NULL, skb);
1518         dst_release(dst);
1519 }
1520
1521 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1522 {
1523         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1524 }
1525 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1526
1527 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1528 {
1529         struct net_device *dev = dst->dev;
1530         unsigned int mtu = dst_mtu(dst);
1531         struct net *net = dev_net(dev);
1532
1533         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1534
1535         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1536                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1537
1538         /*
1539          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1540          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1541          * IPV6_MAXPLEN is also valid and means: "any MSS,
1542          * rely only on pmtu discovery"
1543          */
1544         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1545                 mtu = IPV6_MAXPLEN;
1546         return mtu;
1547 }
1548
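/*
 * Effective MTU for a route: prefer the cached path MTU (rt6i_pmtu),
 * then the RTAX_MTU metric, then the device's mtu6 (falling back to
 * IPV6_MIN_MTU), and cap the result at IP6_MAX_MTU.
 */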
1549 static unsigned int ip6_mtu(const struct dst_entry *dst)
1550 {
1551         const struct rt6_info *rt = (const struct rt6_info *)dst;
1552         unsigned int mtu = rt->rt6i_pmtu;
1553         struct inet6_dev *idev;
1554
1555         if (mtu)
1556                 goto out;
1557
1558         mtu = dst_metric_raw(dst, RTAX_MTU);
1559         if (mtu)
1560                 goto out;
1561
1562         mtu = IPV6_MIN_MTU;
1563
1564         rcu_read_lock();
1565         idev = __in6_dev_get(dst->dev);
1566         if (idev)
1567                 mtu = idev->cnf.mtu6;
1568         rcu_read_unlock();
1569
1570 out:
1571         return min_t(unsigned int, mtu, IP6_MAX_MTU);
1572 }
1573
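/*
 * dst entries allocated by icmp6_dst_alloc() are never inserted into
 * the FIB.  They are chained on icmp6_dst_gc_list under icmp6_dst_lock
 * and reclaimed by icmp6_dst_gc() once unreferenced, or dropped by
 * icmp6_clean_all() when the cleanup callback selects them.
 */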
1574 static struct dst_entry *icmp6_dst_gc_list;
1575 static DEFINE_SPINLOCK(icmp6_dst_lock);
1576
1577 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1578                                   struct flowi6 *fl6)
1579 {
1580         struct dst_entry *dst;
1581         struct rt6_info *rt;
1582         struct inet6_dev *idev = in6_dev_get(dev);
1583         struct net *net = dev_net(dev);
1584
1585         if (unlikely(!idev))
1586                 return ERR_PTR(-ENODEV);
1587
1588         rt = ip6_dst_alloc(net, dev, 0);
1589         if (unlikely(!rt)) {
1590                 in6_dev_put(idev);
1591                 dst = ERR_PTR(-ENOMEM);
1592                 goto out;
1593         }
1594
1595         rt->dst.flags |= DST_HOST;
1596         rt->dst.output  = ip6_output;
1597         atomic_set(&rt->dst.__refcnt, 1);
1598         rt->rt6i_gateway  = fl6->daddr;
1599         rt->rt6i_dst.addr = fl6->daddr;
1600         rt->rt6i_dst.plen = 128;
1601         rt->rt6i_idev     = idev;
1602         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1603
1604         spin_lock_bh(&icmp6_dst_lock);
1605         rt->dst.next = icmp6_dst_gc_list;
1606         icmp6_dst_gc_list = &rt->dst;
1607         spin_unlock_bh(&icmp6_dst_lock);
1608
1609         fib6_force_start_gc(net);
1610
1611         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1612
1613 out:
1614         return dst;
1615 }
1616
1617 int icmp6_dst_gc(void)
1618 {
1619         struct dst_entry *dst, **pprev;
1620         int more = 0;
1621
1622         spin_lock_bh(&icmp6_dst_lock);
1623         pprev = &icmp6_dst_gc_list;
1624
1625         while ((dst = *pprev) != NULL) {
1626                 if (!atomic_read(&dst->__refcnt)) {
1627                         *pprev = dst->next;
1628                         dst_free(dst);
1629                 } else {
1630                         pprev = &dst->next;
1631                         ++more;
1632                 }
1633         }
1634
1635         spin_unlock_bh(&icmp6_dst_lock);
1636
1637         return more;
1638 }
1639
1640 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1641                             void *arg)
1642 {
1643         struct dst_entry *dst, **pprev;
1644
1645         spin_lock_bh(&icmp6_dst_lock);
1646         pprev = &icmp6_dst_gc_list;
1647         while ((dst = *pprev) != NULL) {
1648                 struct rt6_info *rt = (struct rt6_info *) dst;
1649                 if (func(rt, arg)) {
1650                         *pprev = dst->next;
1651                         dst_free(dst);
1652                 } else {
1653                         pprev = &dst->next;
1654                 }
1655         }
1656         spin_unlock_bh(&icmp6_dst_lock);
1657 }
1658
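/*
 * dst_ops garbage-collection hook.  Nothing is done while the previous
 * run is younger than ip6_rt_gc_min_interval and the entry count stays
 * within ip6_rt_max_size; otherwise fib6_run_gc() is invoked with an
 * ip6_rt_gc_expire value that grows on each call and is reduced by
 * expire >> ip6_rt_gc_elasticity on the way out.
 */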
1659 static int ip6_dst_gc(struct dst_ops *ops)
1660 {
1661         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1662         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1663         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1664         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1665         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1666         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1667         int entries;
1668
1669         entries = dst_entries_get_fast(ops);
1670         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1671             entries <= rt_max_size)
1672                 goto out;
1673
1674         net->ipv6.ip6_rt_gc_expire++;
1675         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1676         entries = dst_entries_get_slow(ops);
1677         if (entries < ops->gc_thresh)
1678                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1679 out:
1680         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1681         return entries > rt_max_size;
1682 }
1683
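/*
 * Convert the RTA_METRICS attributes in cfg->fc_mx into the RTAX_MAX
 * sized array carried by mx6_config.  RTAX_CC_ALGO holds a congestion
 * control name and is mapped to its key via tcp_ca_get_key_by_name();
 * a type above RTAX_MAX or an unknown algorithm aborts with -EINVAL.
 */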
1684 static int ip6_convert_metrics(struct mx6_config *mxc,
1685                                const struct fib6_config *cfg)
1686 {
1687         struct nlattr *nla;
1688         int remaining;
1689         u32 *mp;
1690
1691         if (!cfg->fc_mx)
1692                 return 0;
1693
1694         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1695         if (unlikely(!mp))
1696                 return -ENOMEM;
1697
1698         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1699                 int type = nla_type(nla);
1700
1701                 if (type) {
1702                         u32 val;
1703
1704                         if (unlikely(type > RTAX_MAX))
1705                                 goto err;
1706                         if (type == RTAX_CC_ALGO) {
1707                                 char tmp[TCP_CA_NAME_MAX];
1708
1709                                 nla_strlcpy(tmp, nla, sizeof(tmp));
1710                                 val = tcp_ca_get_key_by_name(tmp);
1711                                 if (val == TCP_CA_UNSPEC)
1712                                         goto err;
1713                         } else {
1714                                 val = nla_get_u32(nla);
1715                         }
1716
1717                         mp[type - 1] = val;
1718                         __set_bit(type - 1, mxc->mx_valid);
1719                 }
1720         }
1721
1722         mxc->mx = mp;
1723
1724         return 0;
1725  err:
1726         kfree(mp);
1727         return -EINVAL;
1728 }
1729
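/*
 * Install a route described by fib6_config: validate the prefix
 * lengths, resolve the output device and routing table, allocate the
 * rt6_info, turn loopback/reject configurations into reject routes,
 * validate any gateway, and insert the entry with its converted
 * metrics via __ip6_ins_rt().
 */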
1730 int ip6_route_add(struct fib6_config *cfg)
1731 {
1732         int err;
1733         struct net *net = cfg->fc_nlinfo.nl_net;
1734         struct rt6_info *rt = NULL;
1735         struct net_device *dev = NULL;
1736         struct inet6_dev *idev = NULL;
1737         struct fib6_table *table;
1738         struct mx6_config mxc = { .mx = NULL, };
1739         int addr_type;
1740
1741         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1742                 return -EINVAL;
1743 #ifndef CONFIG_IPV6_SUBTREES
1744         if (cfg->fc_src_len)
1745                 return -EINVAL;
1746 #endif
1747         if (cfg->fc_ifindex) {
1748                 err = -ENODEV;
1749                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1750                 if (!dev)
1751                         goto out;
1752                 idev = in6_dev_get(dev);
1753                 if (!idev)
1754                         goto out;
1755         }
1756
1757         if (cfg->fc_metric == 0)
1758                 cfg->fc_metric = IP6_RT_PRIO_USER;
1759
1760         err = -ENOBUFS;
1761         if (cfg->fc_nlinfo.nlh &&
1762             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1763                 table = fib6_get_table(net, cfg->fc_table);
1764                 if (!table) {
1765                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1766                         table = fib6_new_table(net, cfg->fc_table);
1767                 }
1768         } else {
1769                 table = fib6_new_table(net, cfg->fc_table);
1770         }
1771
1772         if (!table)
1773                 goto out;
1774
1775         rt = ip6_dst_alloc(net, NULL,
1776                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1777
1778         if (!rt) {
1779                 err = -ENOMEM;
1780                 goto out;
1781         }
1782
1783         if (cfg->fc_flags & RTF_EXPIRES)
1784                 rt6_set_expires(rt, jiffies +
1785                                 clock_t_to_jiffies(cfg->fc_expires));
1786         else
1787                 rt6_clean_expires(rt);
1788
1789         if (cfg->fc_protocol == RTPROT_UNSPEC)
1790                 cfg->fc_protocol = RTPROT_BOOT;
1791         rt->rt6i_protocol = cfg->fc_protocol;
1792
1793         addr_type = ipv6_addr_type(&cfg->fc_dst);
1794
1795         if (addr_type & IPV6_ADDR_MULTICAST)
1796                 rt->dst.input = ip6_mc_input;
1797         else if (cfg->fc_flags & RTF_LOCAL)
1798                 rt->dst.input = ip6_input;
1799         else
1800                 rt->dst.input = ip6_forward;
1801
1802         rt->dst.output = ip6_output;
1803
1804         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1805         rt->rt6i_dst.plen = cfg->fc_dst_len;
1806         if (rt->rt6i_dst.plen == 128)
1807                 rt->dst.flags |= DST_HOST;
1808
1809 #ifdef CONFIG_IPV6_SUBTREES
1810         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1811         rt->rt6i_src.plen = cfg->fc_src_len;
1812 #endif
1813
1814         rt->rt6i_metric = cfg->fc_metric;
1815
1816         /* We cannot add true routes via loopback here;
1817            they would result in kernel looping, so promote them to reject routes.
1818          */
1819         if ((cfg->fc_flags & RTF_REJECT) ||
1820             (dev && (dev->flags & IFF_LOOPBACK) &&
1821              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1822              !(cfg->fc_flags & RTF_LOCAL))) {
1823                 /* hold loopback dev/idev if we haven't done so. */
1824                 if (dev != net->loopback_dev) {
1825                         if (dev) {
1826                                 dev_put(dev);
1827                                 in6_dev_put(idev);
1828                         }
1829                         dev = net->loopback_dev;
1830                         dev_hold(dev);
1831                         idev = in6_dev_get(dev);
1832                         if (!idev) {
1833                                 err = -ENODEV;
1834                                 goto out;
1835                         }
1836                 }
1837                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1838                 switch (cfg->fc_type) {
1839                 case RTN_BLACKHOLE:
1840                         rt->dst.error = -EINVAL;
1841                         rt->dst.output = dst_discard_sk;
1842                         rt->dst.input = dst_discard;
1843                         break;
1844                 case RTN_PROHIBIT:
1845                         rt->dst.error = -EACCES;
1846                         rt->dst.output = ip6_pkt_prohibit_out;
1847                         rt->dst.input = ip6_pkt_prohibit;
1848                         break;
1849                 case RTN_THROW:
1850                 default:
1851                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1852                                         : -ENETUNREACH;
1853                         rt->dst.output = ip6_pkt_discard_out;
1854                         rt->dst.input = ip6_pkt_discard;
1855                         break;
1856                 }
1857                 goto install_route;
1858         }
1859
1860         if (cfg->fc_flags & RTF_GATEWAY) {
1861                 const struct in6_addr *gw_addr;
1862                 int gwa_type;
1863
1864                 gw_addr = &cfg->fc_gateway;
1865                 gwa_type = ipv6_addr_type(gw_addr);
1866
1867                 /* if gw_addr is local we will fail to detect this in case
1868                  * the address is still TENTATIVE (DAD in progress). rt6_lookup()
1869                  * will return the already-added prefix route via the interface
1870                  * that the prefix route was assigned to, which might be non-loopback.
1871                  */
1872                 err = -EINVAL;
1873                 if (ipv6_chk_addr_and_flags(net, gw_addr,
1874                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
1875                                             dev : NULL, 0, 0))
1876                         goto out;
1877
1878                 rt->rt6i_gateway = *gw_addr;
1879
1880                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1881                         struct rt6_info *grt;
1882
1883                         /* IPv6 strictly inhibits using non-link-local
1884                            addresses as the nexthop address.
1885                            Otherwise, the router will not be able to send redirects.
1886                            It is very good, but in some (rare!) circumstances
1887                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1888                            some exceptions. --ANK
1889                          */
1890                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1891                                 goto out;
1892
1893                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1894
1895                         err = -EHOSTUNREACH;
1896                         if (!grt)
1897                                 goto out;
1898                         if (dev) {
1899                                 if (dev != grt->dst.dev) {
1900                                         ip6_rt_put(grt);
1901                                         goto out;
1902                                 }
1903                         } else {
1904                                 dev = grt->dst.dev;
1905                                 idev = grt->rt6i_idev;
1906                                 dev_hold(dev);
1907                                 in6_dev_hold(grt->rt6i_idev);
1908                         }
1909                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1910                                 err = 0;
1911                         ip6_rt_put(grt);
1912
1913                         if (err)
1914                                 goto out;
1915                 }
1916                 err = -EINVAL;
1917                 if (!dev || (dev->flags & IFF_LOOPBACK))
1918                         goto out;
1919         }
1920
1921         err = -ENODEV;
1922         if (!dev)
1923                 goto out;
1924
1925         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1926                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1927                         err = -EINVAL;
1928                         goto out;
1929                 }
1930                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1931                 rt->rt6i_prefsrc.plen = 128;
1932         } else
1933                 rt->rt6i_prefsrc.plen = 0;
1934
1935         rt->rt6i_flags = cfg->fc_flags;
1936
1937 install_route:
1938         rt->dst.dev = dev;
1939         rt->rt6i_idev = idev;
1940         rt->rt6i_table = table;
1941
1942         cfg->fc_nlinfo.nl_net = dev_net(dev);
1943
1944         err = ip6_convert_metrics(&mxc, cfg);
1945         if (err)
1946                 goto out;
1947
1948         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
1949
1950         kfree(mxc.mx);
1951         return err;
1952 out:
1953         if (dev)
1954                 dev_put(dev);
1955         if (idev)
1956                 in6_dev_put(idev);
1957         if (rt)
1958                 dst_free(&rt->dst);
1959         return err;
1960 }
1961
1962 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1963 {
1964         int err;
1965         struct fib6_table *table;
1966         struct net *net = dev_net(rt->dst.dev);
1967
1968         if (rt == net->ipv6.ip6_null_entry) {
1969                 err = -ENOENT;
1970                 goto out;
1971         }
1972
1973         table = rt->rt6i_table;
1974         write_lock_bh(&table->tb6_lock);
1975         err = fib6_del(rt, info);
1976         write_unlock_bh(&table->tb6_lock);
1977
1978 out:
1979         ip6_rt_put(rt);
1980         return err;
1981 }
1982
1983 int ip6_del_rt(struct rt6_info *rt)
1984 {
1985         struct nl_info info = {
1986                 .nl_net = dev_net(rt->dst.dev),
1987         };
1988         return __ip6_del_rt(rt, &info);
1989 }
1990
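/*
 * Delete the first route under the located fib6 node that matches the
 * requested ifindex, gateway and metric.  RTF_CACHE clones are only
 * considered when the request itself carries RTF_CACHE.
 */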
1991 static int ip6_route_del(struct fib6_config *cfg)
1992 {
1993         struct fib6_table *table;
1994         struct fib6_node *fn;
1995         struct rt6_info *rt;
1996         int err = -ESRCH;
1997
1998         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1999         if (!table)
2000                 return err;
2001
2002         read_lock_bh(&table->tb6_lock);
2003
2004         fn = fib6_locate(&table->tb6_root,
2005                          &cfg->fc_dst, cfg->fc_dst_len,
2006                          &cfg->fc_src, cfg->fc_src_len);
2007
2008         if (fn) {
2009                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2010                         if ((rt->rt6i_flags & RTF_CACHE) &&
2011                             !(cfg->fc_flags & RTF_CACHE))
2012                                 continue;
2013                         if (cfg->fc_ifindex &&
2014                             (!rt->dst.dev ||
2015                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2016                                 continue;
2017                         if (cfg->fc_flags & RTF_GATEWAY &&
2018                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2019                                 continue;
2020                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2021                                 continue;
2022                         dst_hold(&rt->dst);
2023                         read_unlock_bh(&table->tb6_lock);
2024
2025                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2026                 }
2027         }
2028         read_unlock_bh(&table->tb6_lock);
2029
2030         return err;
2031 }
2032
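/*
 * Process a received ICMPv6 redirect: validate the message and its ND
 * options, confirm the sender is our current first hop, update the
 * neighbour entry for the target and install an RTF_CACHE clone that
 * points at the new gateway (RTF_GATEWAY is cleared for on-link
 * targets).
 */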
2033 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2034 {
2035         struct net *net = dev_net(skb->dev);
2036         struct netevent_redirect netevent;
2037         struct rt6_info *rt, *nrt = NULL;
2038         struct ndisc_options ndopts;
2039         struct inet6_dev *in6_dev;
2040         struct neighbour *neigh;
2041         struct rd_msg *msg;
2042         int optlen, on_link;
2043         u8 *lladdr;
2044
2045         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2046         optlen -= sizeof(*msg);
2047
2048         if (optlen < 0) {
2049                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2050                 return;
2051         }
2052
2053         msg = (struct rd_msg *)icmp6_hdr(skb);
2054
2055         if (ipv6_addr_is_multicast(&msg->dest)) {
2056                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2057                 return;
2058         }
2059
2060         on_link = 0;
2061         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2062                 on_link = 1;
2063         } else if (ipv6_addr_type(&msg->target) !=
2064                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2065                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2066                 return;
2067         }
2068
2069         in6_dev = __in6_dev_get(skb->dev);
2070         if (!in6_dev)
2071                 return;
2072         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2073                 return;
2074
2075         /* RFC2461 8.1:
2076          *      The IP source address of the Redirect MUST be the same as the current
2077          *      first-hop router for the specified ICMP Destination Address.
2078          */
2079
2080         if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
2081                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2082                 return;
2083         }
2084
2085         lladdr = NULL;
2086         if (ndopts.nd_opts_tgt_lladdr) {
2087                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2088                                              skb->dev);
2089                 if (!lladdr) {
2090                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2091                         return;
2092                 }
2093         }
2094
2095         rt = (struct rt6_info *) dst;
2096         if (rt == net->ipv6.ip6_null_entry) {
2097                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2098                 return;
2099         }
2100
2101         /* Redirect received -> path was valid.
2102          * Look, redirects are sent only in response to data packets,
2103          * so this nexthop is apparently reachable. --ANK
2104          */
2105         dst_confirm(&rt->dst);
2106
2107         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2108         if (!neigh)
2109                 return;
2110
2111         /*
2112          *      We have finally decided to accept it.
2113          */
2114
2115         neigh_update(neigh, lladdr, NUD_STALE,
2116                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2117                      NEIGH_UPDATE_F_OVERRIDE|
2118                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2119                                      NEIGH_UPDATE_F_ISROUTER))
2120                      );
2121
2122         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2123         if (!nrt)
2124                 goto out;
2125
2126         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2127         if (on_link)
2128                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2129
2130         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2131
2132         if (ip6_ins_rt(nrt))
2133                 goto out;
2134
2135         netevent.old = &rt->dst;
2136         netevent.new = &nrt->dst;
2137         netevent.daddr = &msg->dest;
2138         netevent.neigh = neigh;
2139         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2140
2141         if (rt->rt6i_flags & RTF_CACHE) {
2142                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2143                 ip6_del_rt(rt);
2144         }
2145
2146 out:
2147         neigh_release(neigh);
2148 }
2149
2150 /*
2151  *      Misc support functions
2152  */
2153
2154 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2155 {
2156         BUG_ON(from->dst.from);
2157
2158         rt->rt6i_flags &= ~RTF_EXPIRES;
2159         dst_hold(&from->dst);
2160         rt->dst.from = &from->dst;
2161         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2162 }
2163
2164 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2165 {
2166         rt->dst.input = ort->dst.input;
2167         rt->dst.output = ort->dst.output;
2168         rt->rt6i_dst = ort->rt6i_dst;
2169         rt->dst.error = ort->dst.error;
2170         rt->rt6i_idev = ort->rt6i_idev;
2171         if (rt->rt6i_idev)
2172                 in6_dev_hold(rt->rt6i_idev);
2173         rt->dst.lastuse = jiffies;
2174         rt->rt6i_gateway = ort->rt6i_gateway;
2175         rt->rt6i_flags = ort->rt6i_flags;
2176         rt6_set_from(rt, ort);
2177         rt->rt6i_metric = ort->rt6i_metric;
2178 #ifdef CONFIG_IPV6_SUBTREES
2179         rt->rt6i_src = ort->rt6i_src;
2180 #endif
2181         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2182         rt->rt6i_table = ort->rt6i_table;
2183 }
2184
2185 #ifdef CONFIG_IPV6_ROUTE_INFO
2186 static struct rt6_info *rt6_get_route_info(struct net *net,
2187                                            const struct in6_addr *prefix, int prefixlen,
2188                                            const struct in6_addr *gwaddr, int ifindex)
2189 {
2190         struct fib6_node *fn;
2191         struct rt6_info *rt = NULL;
2192         struct fib6_table *table;
2193
2194         table = fib6_get_table(net, RT6_TABLE_INFO);
2195         if (!table)
2196                 return NULL;
2197
2198         read_lock_bh(&table->tb6_lock);
2199         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2200         if (!fn)
2201                 goto out;
2202
2203         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2204                 if (rt->dst.dev->ifindex != ifindex)
2205                         continue;
2206                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2207                         continue;
2208                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2209                         continue;
2210                 dst_hold(&rt->dst);
2211                 break;
2212         }
2213 out:
2214         read_unlock_bh(&table->tb6_lock);
2215         return rt;
2216 }
2217
2218 static struct rt6_info *rt6_add_route_info(struct net *net,
2219                                            const struct in6_addr *prefix, int prefixlen,
2220                                            const struct in6_addr *gwaddr, int ifindex,
2221                                            unsigned int pref)
2222 {
2223         struct fib6_config cfg = {
2224                 .fc_table       = RT6_TABLE_INFO,
2225                 .fc_metric      = IP6_RT_PRIO_USER,
2226                 .fc_ifindex     = ifindex,
2227                 .fc_dst_len     = prefixlen,
2228                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2229                                   RTF_UP | RTF_PREF(pref),
2230                 .fc_nlinfo.portid = 0,
2231                 .fc_nlinfo.nlh = NULL,
2232                 .fc_nlinfo.nl_net = net,
2233         };
2234
2235         cfg.fc_dst = *prefix;
2236         cfg.fc_gateway = *gwaddr;
2237
2238         /* We should treat it as a default route if prefix length is 0. */
2239         if (!prefixlen)
2240                 cfg.fc_flags |= RTF_DEFAULT;
2241
2242         ip6_route_add(&cfg);
2243
2244         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
2245 }
2246 #endif
2247
2248 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2249 {
2250         struct rt6_info *rt;
2251         struct fib6_table *table;
2252
2253         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2254         if (!table)
2255                 return NULL;
2256
2257         read_lock_bh(&table->tb6_lock);
2258         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2259                 if (dev == rt->dst.dev &&
2260                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2261                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2262                         break;
2263         }
2264         if (rt)
2265                 dst_hold(&rt->dst);
2266         read_unlock_bh(&table->tb6_lock);
2267         return rt;
2268 }
2269
2270 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2271                                      struct net_device *dev,
2272                                      unsigned int pref)
2273 {
2274         struct fib6_config cfg = {
2275                 .fc_table       = RT6_TABLE_DFLT,
2276                 .fc_metric      = IP6_RT_PRIO_USER,
2277                 .fc_ifindex     = dev->ifindex,
2278                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2279                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2280                 .fc_nlinfo.portid = 0,
2281                 .fc_nlinfo.nlh = NULL,
2282                 .fc_nlinfo.nl_net = dev_net(dev),
2283         };
2284
2285         cfg.fc_gateway = *gwaddr;
2286
2287         ip6_route_add(&cfg);
2288
2289         return rt6_get_dflt_router(gwaddr, dev);
2290 }
2291
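/*
 * Remove every RA-learned default router, except on interfaces running
 * with accept_ra == 2.  The table lock must be dropped around
 * ip6_del_rt(), so the walk restarts from the beginning after each
 * deletion.
 */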
2292 void rt6_purge_dflt_routers(struct net *net)
2293 {
2294         struct rt6_info *rt;
2295         struct fib6_table *table;
2296
2297         /* NOTE: Keep consistent with rt6_get_dflt_router */
2298         table = fib6_get_table(net, RT6_TABLE_DFLT);
2299         if (!table)
2300                 return;
2301
2302 restart:
2303         read_lock_bh(&table->tb6_lock);
2304         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2305                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2306                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2307                         dst_hold(&rt->dst);
2308                         read_unlock_bh(&table->tb6_lock);
2309                         ip6_del_rt(rt);
2310                         goto restart;
2311                 }
2312         }
2313         read_unlock_bh(&table->tb6_lock);
2314 }
2315
2316 static void rtmsg_to_fib6_config(struct net *net,
2317                                  struct in6_rtmsg *rtmsg,
2318                                  struct fib6_config *cfg)
2319 {
2320         memset(cfg, 0, sizeof(*cfg));
2321
2322         cfg->fc_table = RT6_TABLE_MAIN;
2323         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2324         cfg->fc_metric = rtmsg->rtmsg_metric;
2325         cfg->fc_expires = rtmsg->rtmsg_info;
2326         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2327         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2328         cfg->fc_flags = rtmsg->rtmsg_flags;
2329
2330         cfg->fc_nlinfo.nl_net = net;
2331
2332         cfg->fc_dst = rtmsg->rtmsg_dst;
2333         cfg->fc_src = rtmsg->rtmsg_src;
2334         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2335 }
2336
2337 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2338 {
2339         struct fib6_config cfg;
2340         struct in6_rtmsg rtmsg;
2341         int err;
2342
2343         switch (cmd) {
2344         case SIOCADDRT:         /* Add a route */
2345         case SIOCDELRT:         /* Delete a route */
2346                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2347                         return -EPERM;
2348                 err = copy_from_user(&rtmsg, arg,
2349                                      sizeof(struct in6_rtmsg));
2350                 if (err)
2351                         return -EFAULT;
2352
2353                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2354
2355                 rtnl_lock();
2356                 switch (cmd) {
2357                 case SIOCADDRT:
2358                         err = ip6_route_add(&cfg);
2359                         break;
2360                 case SIOCDELRT:
2361                         err = ip6_route_del(&cfg);
2362                         break;
2363                 default:
2364                         err = -EINVAL;
2365                 }
2366                 rtnl_unlock();
2367
2368                 return err;
2369         }
2370
2371         return -EINVAL;
2372 }
2373
2374 /*
2375  *      Drop the packet on the floor
2376  */
2377
2378 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2379 {
2380         int type;
2381         struct dst_entry *dst = skb_dst(skb);
2382         switch (ipstats_mib_noroutes) {
2383         case IPSTATS_MIB_INNOROUTES:
2384                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2385                 if (type == IPV6_ADDR_ANY) {
2386                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2387                                       IPSTATS_MIB_INADDRERRORS);
2388                         break;
2389                 }
2390                 /* FALLTHROUGH */
2391         case IPSTATS_MIB_OUTNOROUTES:
2392                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2393                               ipstats_mib_noroutes);
2394                 break;
2395         }
2396         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2397         kfree_skb(skb);
2398         return 0;
2399 }
2400
2401 static int ip6_pkt_discard(struct sk_buff *skb)
2402 {
2403         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2404 }
2405
2406 static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
2407 {
2408         skb->dev = skb_dst(skb)->dev;
2409         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2410 }
2411
2412 static int ip6_pkt_prohibit(struct sk_buff *skb)
2413 {
2414         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2415 }
2416
2417 static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
2418 {
2419         skb->dev = skb_dst(skb)->dev;
2420         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2421 }
2422
2423 /*
2424  *      Allocate a dst for local (unicast / anycast) address.
2425  */
2426
2427 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2428                                     const struct in6_addr *addr,
2429                                     bool anycast)
2430 {
2431         struct net *net = dev_net(idev->dev);
2432         struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
2433                                             DST_NOCOUNT);
2434         if (!rt)
2435                 return ERR_PTR(-ENOMEM);
2436
2437         in6_dev_hold(idev);
2438
2439         rt->dst.flags |= DST_HOST;
2440         rt->dst.input = ip6_input;
2441         rt->dst.output = ip6_output;
2442         rt->rt6i_idev = idev;
2443
2444         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2445         if (anycast)
2446                 rt->rt6i_flags |= RTF_ANYCAST;
2447         else
2448                 rt->rt6i_flags |= RTF_LOCAL;
2449
2450         rt->rt6i_gateway  = *addr;
2451         rt->rt6i_dst.addr = *addr;
2452         rt->rt6i_dst.plen = 128;
2453         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2454
2455         atomic_set(&rt->dst.__refcnt, 1);
2456
2457         return rt;
2458 }
2459
2460 int ip6_route_get_saddr(struct net *net,
2461                         struct rt6_info *rt,
2462                         const struct in6_addr *daddr,
2463                         unsigned int prefs,
2464                         struct in6_addr *saddr)
2465 {
2466         struct inet6_dev *idev =
2467                 rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2468         int err = 0;
2469         if (rt && rt->rt6i_prefsrc.plen)
2470                 *saddr = rt->rt6i_prefsrc.addr;
2471         else
2472                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2473                                          daddr, prefs, saddr);
2474         return err;
2475 }
2476
2477 /* remove deleted ip from prefsrc entries */
2478 struct arg_dev_net_ip {
2479         struct net_device *dev;
2480         struct net *net;
2481         struct in6_addr *addr;
2482 };
2483
2484 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2485 {
2486         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2487         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2488         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2489
2490         if (((void *)rt->dst.dev == dev || !dev) &&
2491             rt != net->ipv6.ip6_null_entry &&
2492             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2493                 /* remove prefsrc entry */
2494                 rt->rt6i_prefsrc.plen = 0;
2495         }
2496         return 0;
2497 }
2498
2499 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2500 {
2501         struct net *net = dev_net(ifp->idev->dev);
2502         struct arg_dev_net_ip adni = {
2503                 .dev = ifp->idev->dev,
2504                 .net = net,
2505                 .addr = &ifp->addr,
2506         };
2507         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2508 }
2509
2510 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2511 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2512
2513 /* Remove routers and update dst entries when a gateway turns into a host. */
2514 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2515 {
2516         struct in6_addr *gateway = (struct in6_addr *)arg;
2517
2518         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2519              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2520              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2521                 return -1;
2522         }
2523         return 0;
2524 }
2525
2526 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2527 {
2528         fib6_clean_all(net, fib6_clean_tohost, gateway);
2529 }
2530
2531 struct arg_dev_net {
2532         struct net_device *dev;
2533         struct net *net;
2534 };
2535
2536 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2537 {
2538         const struct arg_dev_net *adn = arg;
2539         const struct net_device *dev = adn->dev;
2540
2541         if ((rt->dst.dev == dev || !dev) &&
2542             rt != adn->net->ipv6.ip6_null_entry)
2543                 return -1;
2544
2545         return 0;
2546 }
2547
2548 void rt6_ifdown(struct net *net, struct net_device *dev)
2549 {
2550         struct arg_dev_net adn = {
2551                 .dev = dev,
2552                 .net = net,
2553         };
2554
2555         fib6_clean_all(net, fib6_ifdown, &adn);
2556         icmp6_clean_all(fib6_ifdown, &adn);
2557         rt6_uncached_list_flush_dev(net, dev);
2558 }
2559
2560 struct rt6_mtu_change_arg {
2561         struct net_device *dev;
2562         unsigned int mtu;
2563 };
2564
2565 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2566 {
2567         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2568         struct inet6_dev *idev;
2569
2570         /* In IPv6, pmtu discovery is not optional,
2571            so the RTAX_MTU lock cannot disable it.
2572            We still use this lock to block changes
2573            caused by addrconf/ndisc.
2574         */
2575
2576         idev = __in6_dev_get(arg->dev);
2577         if (!idev)
2578                 return 0;
2579
2580         /* For an administrative MTU increase, there is no way to discover
2581            an IPv6 PMTU increase, so the PMTU should be updated here.
2582            Since RFC 1981 doesn't cover administrative MTU increases,
2583            updating the PMTU on increase is a MUST. (i.e. jumbo frame)
2584          */
2585         /*
2586            If the new MTU is less than the route PMTU, this new MTU will be the
2587            lowest MTU in the path; update the route PMTU to reflect the
2588            decrease. If the new MTU is greater than the route PMTU, and the
2589            old MTU is the lowest MTU in the path, update the route PMTU
2590            to reflect the increase. In this case, if another node on the path
2591            still has a lower MTU, its PACKET TOO BIG message will trigger
2592            PMTU discovery again.
2593          */
2594         if (rt->dst.dev == arg->dev &&
2595             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2596                 if (rt->rt6i_flags & RTF_CACHE) {
2597                         /* For RTF_CACHE with rt6i_pmtu == 0
2598                          * (i.e. a redirected route),
2599                          * the metrics of its rt->dst.from have already
2600                          * been updated.
2601                          */
2602                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2603                                 rt->rt6i_pmtu = arg->mtu;
2604                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2605                            (dst_mtu(&rt->dst) < arg->mtu &&
2606                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2607                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2608                 }
2609         }
2610         return 0;
2611 }
2612
2613 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2614 {
2615         struct rt6_mtu_change_arg arg = {
2616                 .dev = dev,
2617                 .mtu = mtu,
2618         };
2619
2620         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2621 }
2622
2623 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2624         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2625         [RTA_OIF]               = { .type = NLA_U32 },
2626         [RTA_IIF]               = { .type = NLA_U32 },
2627         [RTA_PRIORITY]          = { .type = NLA_U32 },
2628         [RTA_METRICS]           = { .type = NLA_NESTED },
2629         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2630         [RTA_PREF]              = { .type = NLA_U8 },
2631 };
2632
2633 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2634                               struct fib6_config *cfg)
2635 {
2636         struct rtmsg *rtm;
2637         struct nlattr *tb[RTA_MAX+1];
2638         unsigned int pref;
2639         int err;
2640
2641         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2642         if (err < 0)
2643                 goto errout;
2644
2645         err = -EINVAL;
2646         rtm = nlmsg_data(nlh);
2647         memset(cfg, 0, sizeof(*cfg));
2648
2649         cfg->fc_table = rtm->rtm_table;
2650         cfg->fc_dst_len = rtm->rtm_dst_len;
2651         cfg->fc_src_len = rtm->rtm_src_len;
2652         cfg->fc_flags = RTF_UP;
2653         cfg->fc_protocol = rtm->rtm_protocol;
2654         cfg->fc_type = rtm->rtm_type;
2655
2656         if (rtm->rtm_type == RTN_UNREACHABLE ||
2657             rtm->rtm_type == RTN_BLACKHOLE ||
2658             rtm->rtm_type == RTN_PROHIBIT ||
2659             rtm->rtm_type == RTN_THROW)
2660                 cfg->fc_flags |= RTF_REJECT;
2661
2662         if (rtm->rtm_type == RTN_LOCAL)
2663                 cfg->fc_flags |= RTF_LOCAL;
2664
2665         if (rtm->rtm_flags & RTM_F_CLONED)
2666                 cfg->fc_flags |= RTF_CACHE;
2667
2668         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2669         cfg->fc_nlinfo.nlh = nlh;
2670         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2671
2672         if (tb[RTA_GATEWAY]) {
2673                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2674                 cfg->fc_flags |= RTF_GATEWAY;
2675         }
2676
2677         if (tb[RTA_DST]) {
2678                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2679
2680                 if (nla_len(tb[RTA_DST]) < plen)
2681                         goto errout;
2682
2683                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2684         }
2685
2686         if (tb[RTA_SRC]) {
2687                 int plen = (rtm->rtm_src_len + 7) >> 3;
2688
2689                 if (nla_len(tb[RTA_SRC]) < plen)
2690                         goto errout;
2691
2692                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2693         }
2694
2695         if (tb[RTA_PREFSRC])
2696                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2697
2698         if (tb[RTA_OIF])
2699                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2700
2701         if (tb[RTA_PRIORITY])
2702                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2703
2704         if (tb[RTA_METRICS]) {
2705                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2706                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2707         }
2708
2709         if (tb[RTA_TABLE])
2710                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2711
2712         if (tb[RTA_MULTIPATH]) {
2713                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2714                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2715         }
2716
2717         if (tb[RTA_PREF]) {
2718                 pref = nla_get_u8(tb[RTA_PREF]);
2719                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2720                     pref != ICMPV6_ROUTER_PREF_HIGH)
2721                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
2722                 cfg->fc_flags |= RTF_PREF(pref);
2723         }
2724
2725         err = 0;
2726 errout:
2727         return err;
2728 }
2729
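/*
 * Add or delete one route per RTA_MULTIPATH nexthop.  If an add fails
 * part-way through, the loop switches to delete mode and re-walks the
 * nexthops that were already inserted; delete failures never stop the
 * loop, so every nexthop gets a removal attempt.
 */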
2730 static int ip6_route_multipath(struct fib6_config *cfg, int add)
2731 {
2732         struct fib6_config r_cfg;
2733         struct rtnexthop *rtnh;
2734         int remaining;
2735         int attrlen;
2736         int err = 0, last_err = 0;
2737
2738         remaining = cfg->fc_mp_len;
2739 beginning:
2740         rtnh = (struct rtnexthop *)cfg->fc_mp;
2741
2742         /* Parse a Multipath Entry */
2743         while (rtnh_ok(rtnh, remaining)) {
2744                 memcpy(&r_cfg, cfg, sizeof(*cfg));
2745                 if (rtnh->rtnh_ifindex)
2746                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2747
2748                 attrlen = rtnh_attrlen(rtnh);
2749                 if (attrlen > 0) {
2750                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2751
2752                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2753                         if (nla) {
2754                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
2755                                 r_cfg.fc_flags |= RTF_GATEWAY;
2756                         }
2757                 }
2758                 err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
2759                 if (err) {
2760                         last_err = err;
2761                         /* If we are trying to remove a route, do not stop the
2762                          * loop when ip6_route_del() fails (because the next hop is
2763                          * already gone); we should try to remove all next hops.
2764                          */
2765                         if (add) {
2766                                 /* If add fails, we should try to delete all
2767                                  * next hops that have already been added.
2768                                  */
2769                                 add = 0;
2770                                 remaining = cfg->fc_mp_len - remaining;
2771                                 goto beginning;
2772                         }
2773                 }
2774                 /* Because each route is added like a single route, we remove
2775                  * these flags after the first nexthop: if there is a collision,
2776                  * we have already failed to add the first nexthop, since
2777                  * fib6_add_rt2node() has rejected it; when replacing, the old
2778                  * nexthops have been replaced by the first new one, and the
2779                  * rest should be added alongside it.
2780                  */
2781                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
2782                                                      NLM_F_REPLACE);
2783                 rtnh = rtnh_next(rtnh, &remaining);
2784         }
2785
2786         return last_err;
2787 }
2788
2789 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2790 {
2791         struct fib6_config cfg;
2792         int err;
2793
2794         err = rtm_to_fib6_config(skb, nlh, &cfg);
2795         if (err < 0)
2796                 return err;
2797
2798         if (cfg.fc_mp)
2799                 return ip6_route_multipath(&cfg, 0);
2800         else
2801                 return ip6_route_del(&cfg);
2802 }
2803
2804 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2805 {
2806         struct fib6_config cfg;
2807         int err;
2808
2809         err = rtm_to_fib6_config(skb, nlh, &cfg);
2810         if (err < 0)
2811                 return err;
2812
2813         if (cfg.fc_mp)
2814                 return ip6_route_multipath(&cfg, 1);
2815         else
2816                 return ip6_route_add(&cfg);
2817 }
2818
2819 static inline size_t rt6_nlmsg_size(void)
2820 {
2821         return NLMSG_ALIGN(sizeof(struct rtmsg))
2822                + nla_total_size(16) /* RTA_SRC */
2823                + nla_total_size(16) /* RTA_DST */
2824                + nla_total_size(16) /* RTA_GATEWAY */
2825                + nla_total_size(16) /* RTA_PREFSRC */
2826                + nla_total_size(4) /* RTA_TABLE */
2827                + nla_total_size(4) /* RTA_IIF */
2828                + nla_total_size(4) /* RTA_OIF */
2829                + nla_total_size(4) /* RTA_PRIORITY */
2830                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2831                + nla_total_size(sizeof(struct rta_cacheinfo))
2832                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
2833                + nla_total_size(1); /* RTA_PREF */
2834 }
2835
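/*
 * Fill one rtnetlink message for @rt.  With @prefix set, only
 * RTF_PREFIX_RT routes are emitted (anything else returns success
 * without output); reject routes map dst.error back to the matching
 * RTN_* type, and a cached rt6i_pmtu overrides the RTAX_MTU metric.
 */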
2836 static int rt6_fill_node(struct net *net,
2837                          struct sk_buff *skb, struct rt6_info *rt,
2838                          struct in6_addr *dst, struct in6_addr *src,
2839                          int iif, int type, u32 portid, u32 seq,
2840                          int prefix, int nowait, unsigned int flags)
2841 {
2842         u32 metrics[RTAX_MAX];
2843         struct rtmsg *rtm;
2844         struct nlmsghdr *nlh;
2845         long expires;
2846         u32 table;
2847
2848         if (prefix) {   /* user wants prefix routes only */
2849                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2850                         /* success since this is not a prefix route */
2851                         return 1;
2852                 }
2853         }
2854
2855         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
2856         if (!nlh)
2857                 return -EMSGSIZE;
2858
2859         rtm = nlmsg_data(nlh);
2860         rtm->rtm_family = AF_INET6;
2861         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2862         rtm->rtm_src_len = rt->rt6i_src.plen;
2863         rtm->rtm_tos = 0;
2864         if (rt->rt6i_table)
2865                 table = rt->rt6i_table->tb6_id;
2866         else
2867                 table = RT6_TABLE_UNSPEC;
2868         rtm->rtm_table = table;
2869         if (nla_put_u32(skb, RTA_TABLE, table))
2870                 goto nla_put_failure;
2871         if (rt->rt6i_flags & RTF_REJECT) {
2872                 switch (rt->dst.error) {
2873                 case -EINVAL:
2874                         rtm->rtm_type = RTN_BLACKHOLE;
2875                         break;
2876                 case -EACCES:
2877                         rtm->rtm_type = RTN_PROHIBIT;
2878                         break;
2879                 case -EAGAIN:
2880                         rtm->rtm_type = RTN_THROW;
2881                         break;
2882                 default:
2883                         rtm->rtm_type = RTN_UNREACHABLE;
2884                         break;
2885                 }
2886         }
2887         else if (rt->rt6i_flags & RTF_LOCAL)
2888                 rtm->rtm_type = RTN_LOCAL;
2889         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2890                 rtm->rtm_type = RTN_LOCAL;
2891         else
2892                 rtm->rtm_type = RTN_UNICAST;
2893         rtm->rtm_flags = 0;
2894         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2895         rtm->rtm_protocol = rt->rt6i_protocol;
2896         if (rt->rt6i_flags & RTF_DYNAMIC)
2897                 rtm->rtm_protocol = RTPROT_REDIRECT;
2898         else if (rt->rt6i_flags & RTF_ADDRCONF) {
2899                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
2900                         rtm->rtm_protocol = RTPROT_RA;
2901                 else
2902                         rtm->rtm_protocol = RTPROT_KERNEL;
2903         }
2904
2905         if (rt->rt6i_flags & RTF_CACHE)
2906                 rtm->rtm_flags |= RTM_F_CLONED;
2907
2908         if (dst) {
2909                 if (nla_put_in6_addr(skb, RTA_DST, dst))
2910                         goto nla_put_failure;
2911                 rtm->rtm_dst_len = 128;
2912         } else if (rtm->rtm_dst_len)
2913                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
2914                         goto nla_put_failure;
2915 #ifdef CONFIG_IPV6_SUBTREES
2916         if (src) {
2917                 if (nla_put_in6_addr(skb, RTA_SRC, src))
2918                         goto nla_put_failure;
2919                 rtm->rtm_src_len = 128;
2920         } else if (rtm->rtm_src_len &&
2921                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
2922                 goto nla_put_failure;
2923 #endif
2924         if (iif) {
2925 #ifdef CONFIG_IPV6_MROUTE
2926                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2927                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2928                         if (err <= 0) {
2929                                 if (!nowait) {
2930                                         if (err == 0)
2931                                                 return 0;
2932                                         goto nla_put_failure;
2933                                 } else {
2934                                         if (err == -EMSGSIZE)
2935                                                 goto nla_put_failure;
2936                                 }
2937                         }
2938                 } else
2939 #endif
2940                         if (nla_put_u32(skb, RTA_IIF, iif))
2941                                 goto nla_put_failure;
2942         } else if (dst) {
2943                 struct in6_addr saddr_buf;
2944                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2945                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2946                         goto nla_put_failure;
2947         }
2948
2949         if (rt->rt6i_prefsrc.plen) {
2950                 struct in6_addr saddr_buf;
2951                 saddr_buf = rt->rt6i_prefsrc.addr;
2952                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2953                         goto nla_put_failure;
2954         }
2955
2956         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2957         if (rt->rt6i_pmtu)
2958                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
2959         if (rtnetlink_put_metrics(skb, metrics) < 0)
2960                 goto nla_put_failure;
2961
2962         if (rt->rt6i_flags & RTF_GATEWAY) {
2963                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
2964                         goto nla_put_failure;
2965         }
2966
2967         if (rt->dst.dev &&
2968             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2969                 goto nla_put_failure;
2970         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2971                 goto nla_put_failure;
2972
2973         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
2974
2975         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
2976                 goto nla_put_failure;
2977
2978         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
2979                 goto nla_put_failure;
2980
2981         nlmsg_end(skb, nlh);
2982         return 0;
2983
2984 nla_put_failure:
2985         nlmsg_cancel(skb, nlh);
2986         return -EMSGSIZE;
2987 }
2988
2989 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2990 {
2991         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2992         int prefix;
2993
2994         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2995                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2996                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2997         } else
2998                 prefix = 0;
2999
3000         return rt6_fill_node(arg->net,
3001                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3002                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3003                      prefix, 0, NLM_F_MULTI);
3004 }
3005
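/*
 * RTM_GETROUTE handler: build a flow from the request attributes,
 * resolve it with ip6_route_input_lookup() when RTA_IIF is given or
 * ip6_route_output() otherwise, and unicast the result back to the
 * requester through rt6_fill_node().
 */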
3006 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3007 {
3008         struct net *net = sock_net(in_skb->sk);
3009         struct nlattr *tb[RTA_MAX+1];
3010         struct rt6_info *rt;
3011         struct sk_buff *skb;
3012         struct rtmsg *rtm;
3013         struct flowi6 fl6;
3014         int err, iif = 0, oif = 0;
3015
3016         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
3017         if (err < 0)
3018                 goto errout;
3019
3020         err = -EINVAL;
3021         memset(&fl6, 0, sizeof(fl6));
3022
3023         if (tb[RTA_SRC]) {
3024                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3025                         goto errout;
3026
3027                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3028         }
3029
3030         if (tb[RTA_DST]) {
3031                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3032                         goto errout;
3033
3034                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3035         }
3036
3037         if (tb[RTA_IIF])
3038                 iif = nla_get_u32(tb[RTA_IIF]);
3039
3040         if (tb[RTA_OIF])
3041                 oif = nla_get_u32(tb[RTA_OIF]);
3042
3043         if (tb[RTA_MARK])
3044                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3045
3046         if (iif) {
3047                 struct net_device *dev;
3048                 int flags = 0;
3049
3050                 dev = __dev_get_by_index(net, iif);
3051                 if (!dev) {
3052                         err = -ENODEV;
3053                         goto errout;
3054                 }
3055
3056                 fl6.flowi6_iif = iif;
3057
3058                 if (!ipv6_addr_any(&fl6.saddr))
3059                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3060
3061                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3062                                                                flags);
3063         } else {
3064                 fl6.flowi6_oif = oif;
3065
3066                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3067         }
3068
3069         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3070         if (!skb) {
3071                 ip6_rt_put(rt);
3072                 err = -ENOBUFS;
3073                 goto errout;
3074         }
3075
3076         /* Reserve room for dummy headers; this skb can pass
3077            through a good chunk of the routing engine.
3078          */
3079         skb_reset_mac_header(skb);
3080         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3081
3082         skb_dst_set(skb, &rt->dst);
3083
3084         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3085                             RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3086                             nlh->nlmsg_seq, 0, 0, 0);
3087         if (err < 0) {
3088                 kfree_skb(skb);
3089                 goto errout;
3090         }
3091
3092         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3093 errout:
3094         return err;
3095 }
3096
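/* Notify RTNLGRP_IPV6_ROUTE listeners about a route change; @event is
 * typically RTM_NEWROUTE or RTM_DELROUTE.  Allocation or fill failures
 * are reported via rtnl_set_sk_err().
 */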
3097 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
3098 {
3099         struct sk_buff *skb;
3100         struct net *net = info->nl_net;
3101         u32 seq;
3102         int err;
3103
3104         err = -ENOBUFS;
3105         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3106
3107         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
3108         if (!skb)
3109                 goto errout;
3110
3111         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3112                                 event, info->portid, seq, 0, 0, 0);
3113         if (err < 0) {
3114                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3115                 WARN_ON(err == -EMSGSIZE);
3116                 kfree_skb(skb);
3117                 goto errout;
3118         }
3119         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3120                     info->nlh, gfp_any());
3121         return;
3122 errout:
3123         if (err < 0)
3124                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3125 }
3126
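/* Netdevice notifier: once the loopback device registers, point the
 * per-netns null (and, with CONFIG_IPV6_MULTIPLE_TABLES, prohibit and
 * blackhole) template routes at it and take an idev reference.
 */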
3127 static int ip6_route_dev_notify(struct notifier_block *this,
3128                                 unsigned long event, void *ptr)
3129 {
3130         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3131         struct net *net = dev_net(dev);
3132
3133         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3134                 net->ipv6.ip6_null_entry->dst.dev = dev;
3135                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3136 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3137                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3138                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3139                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3140                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3141 #endif
3142         }
3143
3144         return NOTIFY_OK;
3145 }
3146
3147 /*
3148  *      /proc
3149  */
3150
3151 #ifdef CONFIG_PROC_FS
3152
3153 static const struct file_operations ipv6_route_proc_fops = {
3154         .owner          = THIS_MODULE,
3155         .open           = ipv6_route_open,
3156         .read           = seq_read,
3157         .llseek         = seq_lseek,
3158         .release        = seq_release_net,
3159 };
3160
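/* /proc/net/rt6_stats: one line of hex counters covering FIB nodes, route
 * nodes, allocated routes, route entries, cache entries, dst entries in
 * use and discarded routes.
 */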
3161 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3162 {
3163         struct net *net = (struct net *)seq->private;
3164         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3165                    net->ipv6.rt6_stats->fib_nodes,
3166                    net->ipv6.rt6_stats->fib_route_nodes,
3167                    net->ipv6.rt6_stats->fib_rt_alloc,
3168                    net->ipv6.rt6_stats->fib_rt_entries,
3169                    net->ipv6.rt6_stats->fib_rt_cache,
3170                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3171                    net->ipv6.rt6_stats->fib_discarded_routes);
3172
3173         return 0;
3174 }
3175
3176 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3177 {
3178         return single_open_net(inode, file, rt6_stats_seq_show);
3179 }
3180
3181 static const struct file_operations rt6_stats_seq_fops = {
3182         .owner   = THIS_MODULE,
3183         .open    = rt6_stats_seq_open,
3184         .read    = seq_read,
3185         .llseek  = seq_lseek,
3186         .release = single_release_net,
3187 };
3188 #endif  /* CONFIG_PROC_FS */
3189
3190 #ifdef CONFIG_SYSCTL
3191
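/* Write-only handler for the net.ipv6.route.flush sysctl
 * (/proc/sys/net/ipv6/route/flush): a write kicks off fib6_run_gc() using
 * the flush_delay value that was in effect before the write.
 */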
3192 static
3193 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3194                               void __user *buffer, size_t *lenp, loff_t *ppos)
3195 {
3196         struct net *net;
3197         int delay;
3198         if (!write)
3199                 return -EINVAL;
3200
3201         net = (struct net *)ctl->extra1;
3202         delay = net->ipv6.sysctl.flush_delay;
3203         proc_dointvec(ctl, write, buffer, lenp, ppos);
3204         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3205         return 0;
3206 }
3207
3208 struct ctl_table ipv6_route_table_template[] = {
3209         {
3210                 .procname       =       "flush",
3211                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3212                 .maxlen         =       sizeof(int),
3213                 .mode           =       0200,
3214                 .proc_handler   =       ipv6_sysctl_rtcache_flush
3215         },
3216         {
3217                 .procname       =       "gc_thresh",
3218                 .data           =       &ip6_dst_ops_template.gc_thresh,
3219                 .maxlen         =       sizeof(int),
3220                 .mode           =       0644,
3221                 .proc_handler   =       proc_dointvec,
3222         },
3223         {
3224                 .procname       =       "max_size",
3225                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3226                 .maxlen         =       sizeof(int),
3227                 .mode           =       0644,
3228                 .proc_handler   =       proc_dointvec,
3229         },
3230         {
3231                 .procname       =       "gc_min_interval",
3232                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3233                 .maxlen         =       sizeof(int),
3234                 .mode           =       0644,
3235                 .proc_handler   =       proc_dointvec_jiffies,
3236         },
3237         {
3238                 .procname       =       "gc_timeout",
3239                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3240                 .maxlen         =       sizeof(int),
3241                 .mode           =       0644,
3242                 .proc_handler   =       proc_dointvec_jiffies,
3243         },
3244         {
3245                 .procname       =       "gc_interval",
3246                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3247                 .maxlen         =       sizeof(int),
3248                 .mode           =       0644,
3249                 .proc_handler   =       proc_dointvec_jiffies,
3250         },
3251         {
3252                 .procname       =       "gc_elasticity",
3253                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3254                 .maxlen         =       sizeof(int),
3255                 .mode           =       0644,
3256                 .proc_handler   =       proc_dointvec,
3257         },
3258         {
3259                 .procname       =       "mtu_expires",
3260                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3261                 .maxlen         =       sizeof(int),
3262                 .mode           =       0644,
3263                 .proc_handler   =       proc_dointvec_jiffies,
3264         },
3265         {
3266                 .procname       =       "min_adv_mss",
3267                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3268                 .maxlen         =       sizeof(int),
3269                 .mode           =       0644,
3270                 .proc_handler   =       proc_dointvec,
3271         },
3272         {
3273                 .procname       =       "gc_min_interval_ms",
3274                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3275                 .maxlen         =       sizeof(int),
3276                 .mode           =       0644,
3277                 .proc_handler   =       proc_dointvec_ms_jiffies,
3278         },
3279         { }
3280 };
3281
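/* Duplicate the sysctl template for a new netns and repoint each entry's
 * ->data at that netns' fields.  The "flush" entry is hidden from
 * non-init user namespaces.
 */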
3282 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3283 {
3284         struct ctl_table *table;
3285
3286         table = kmemdup(ipv6_route_table_template,
3287                         sizeof(ipv6_route_table_template),
3288                         GFP_KERNEL);
3289
3290         if (table) {
3291                 table[0].data = &net->ipv6.sysctl.flush_delay;
3292                 table[0].extra1 = net;
3293                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3294                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3295                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3296                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3297                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3298                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3299                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3300                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3301                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3302
3303                 /* Don't export sysctls to unprivileged users */
3304                 if (net->user_ns != &init_user_ns)
3305                         table[0].procname = NULL;
3306         }
3307
3308         return table;
3309 }
3310 #endif
3311
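/* Per-netns init: copy the dst_ops template, allocate the null (and, with
 * CONFIG_IPV6_MULTIPLE_TABLES, prohibit and blackhole) routes from their
 * templates and seed the route sysctl defaults.
 */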
3312 static int __net_init ip6_route_net_init(struct net *net)
3313 {
3314         int ret = -ENOMEM;
3315
3316         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3317                sizeof(net->ipv6.ip6_dst_ops));
3318
3319         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3320                 goto out_ip6_dst_ops;
3321
3322         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3323                                            sizeof(*net->ipv6.ip6_null_entry),
3324                                            GFP_KERNEL);
3325         if (!net->ipv6.ip6_null_entry)
3326                 goto out_ip6_dst_entries;
3327         net->ipv6.ip6_null_entry->dst.path =
3328                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3329         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3330         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3331                          ip6_template_metrics, true);
3332
3333 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3334         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3335                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3336                                                GFP_KERNEL);
3337         if (!net->ipv6.ip6_prohibit_entry)
3338                 goto out_ip6_null_entry;
3339         net->ipv6.ip6_prohibit_entry->dst.path =
3340                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3341         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3342         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3343                          ip6_template_metrics, true);
3344
3345         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3346                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3347                                                GFP_KERNEL);
3348         if (!net->ipv6.ip6_blk_hole_entry)
3349                 goto out_ip6_prohibit_entry;
3350         net->ipv6.ip6_blk_hole_entry->dst.path =
3351                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3352         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3353         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3354                          ip6_template_metrics, true);
3355 #endif
3356
3357         net->ipv6.sysctl.flush_delay = 0;
3358         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3359         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3360         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3361         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3362         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3363         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3364         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3365
3366         net->ipv6.ip6_rt_gc_expire = 30*HZ;
3367
3368         ret = 0;
3369 out:
3370         return ret;
3371
3372 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3373 out_ip6_prohibit_entry:
3374         kfree(net->ipv6.ip6_prohibit_entry);
3375 out_ip6_null_entry:
3376         kfree(net->ipv6.ip6_null_entry);
3377 #endif
3378 out_ip6_dst_entries:
3379         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3380 out_ip6_dst_ops:
3381         goto out;
3382 }
3383
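/* Per-netns teardown: free the template routes and the dst entry counter. */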
3384 static void __net_exit ip6_route_net_exit(struct net *net)
3385 {
3386         kfree(net->ipv6.ip6_null_entry);
3387 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3388         kfree(net->ipv6.ip6_prohibit_entry);
3389         kfree(net->ipv6.ip6_blk_hole_entry);
3390 #endif
3391         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3392 }
3393
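/* Late per-netns init: create the /proc/net/ipv6_route and
 * /proc/net/rt6_stats entries.
 */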
3394 static int __net_init ip6_route_net_init_late(struct net *net)
3395 {
3396 #ifdef CONFIG_PROC_FS
3397         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3398         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3399 #endif
3400         return 0;
3401 }
3402
3403 static void __net_exit ip6_route_net_exit_late(struct net *net)
3404 {
3405 #ifdef CONFIG_PROC_FS
3406         remove_proc_entry("ipv6_route", net->proc_net);
3407         remove_proc_entry("rt6_stats", net->proc_net);
3408 #endif
3409 }
3410
3411 static struct pernet_operations ip6_route_net_ops = {
3412         .init = ip6_route_net_init,
3413         .exit = ip6_route_net_exit,
3414 };
3415
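/* Per-netns inetpeer base used for IPv6 peer state (allocated here, torn
 * down in ipv6_inetpeer_exit()).
 */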
3416 static int __net_init ipv6_inetpeer_init(struct net *net)
3417 {
3418         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3419
3420         if (!bp)
3421                 return -ENOMEM;
3422         inet_peer_base_init(bp);
3423         net->ipv6.peers = bp;
3424         return 0;
3425 }
3426
3427 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3428 {
3429         struct inet_peer_base *bp = net->ipv6.peers;
3430
3431         net->ipv6.peers = NULL;
3432         inetpeer_invalidate_tree(bp);
3433         kfree(bp);
3434 }
3435
3436 static struct pernet_operations ipv6_inetpeer_ops = {
3437         .init   =       ipv6_inetpeer_init,
3438         .exit   =       ipv6_inetpeer_exit,
3439 };
3440
3441 static struct pernet_operations ip6_route_net_late_ops = {
3442         .init = ip6_route_net_init_late,
3443         .exit = ip6_route_net_exit_late,
3444 };
3445
3446 static struct notifier_block ip6_route_dev_notifier = {
3447         .notifier_call = ip6_route_dev_notify,
3448         .priority = 0,
3449 };
3450
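/* Module init: create the rt6_info dst cache, register the per-netns
 * operations, fib6/xfrm6/fib6-rules, the RTM_*ROUTE rtnetlink handlers
 * and the netdevice notifier, and initialise the per-cpu uncached route
 * lists.  Errors unwind through the labels below.
 */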
3451 int __init ip6_route_init(void)
3452 {
3453         int ret;
3454         int cpu;
3455
3456         ret = -ENOMEM;
3457         ip6_dst_ops_template.kmem_cachep =
3458                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3459                                   SLAB_HWCACHE_ALIGN, NULL);
3460         if (!ip6_dst_ops_template.kmem_cachep)
3461                 goto out;
3462
3463         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3464         if (ret)
3465                 goto out_kmem_cache;
3466
3467         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3468         if (ret)
3469                 goto out_dst_entries;
3470
3471         ret = register_pernet_subsys(&ip6_route_net_ops);
3472         if (ret)
3473                 goto out_register_inetpeer;
3474
3475         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3476
3477         /* The loopback device is registered before this portion of code
3478          * runs, so the loopback reference in rt6_info is not taken there;
3479          * take it manually for init_net. */
3480         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3481         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3482 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3483         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3484         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3485         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3486         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3487 #endif
3488         ret = fib6_init();
3489         if (ret)
3490                 goto out_register_subsys;
3491
3492         ret = xfrm6_init();
3493         if (ret)
3494                 goto out_fib6_init;
3495
3496         ret = fib6_rules_init();
3497         if (ret)
3498                 goto xfrm6_init;
3499
3500         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3501         if (ret)
3502                 goto fib6_rules_init;
3503
3504         ret = -ENOBUFS;
3505         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3506             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3507             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3508                 goto out_register_late_subsys;
3509
3510         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3511         if (ret)
3512                 goto out_register_late_subsys;
3513
3514         for_each_possible_cpu(cpu) {
3515                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3516
3517                 INIT_LIST_HEAD(&ul->head);
3518                 spin_lock_init(&ul->lock);
3519         }
3520
3521 out:
3522         return ret;
3523
3524 out_register_late_subsys:
3525         unregister_pernet_subsys(&ip6_route_net_late_ops);
3526 fib6_rules_init:
3527         fib6_rules_cleanup();
3528 xfrm6_init:
3529         xfrm6_fini();
3530 out_fib6_init:
3531         fib6_gc_cleanup();
3532 out_register_subsys:
3533         unregister_pernet_subsys(&ip6_route_net_ops);
3534 out_register_inetpeer:
3535         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3536 out_dst_entries:
3537         dst_entries_destroy(&ip6_dst_blackhole_ops);
3538 out_kmem_cache:
3539         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3540         goto out;
3541 }
3542
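/* Tear down everything registered by ip6_route_init(). */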
3543 void ip6_route_cleanup(void)
3544 {
3545         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3546         unregister_pernet_subsys(&ip6_route_net_late_ops);
3547         fib6_rules_cleanup();
3548         xfrm6_fini();
3549         fib6_gc_cleanup();
3550         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3551         unregister_pernet_subsys(&ip6_route_net_ops);
3552         dst_entries_destroy(&ip6_dst_blackhole_ops);
3553         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3554 }