1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/xfrm.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
60 #include <net/nexthop.h>
61
62 #include <asm/uaccess.h>
63
64 #ifdef CONFIG_SYSCTL
65 #include <linux/sysctl.h>
66 #endif
67
68 enum rt6_nud_state {
69         RT6_NUD_FAIL_HARD = -3,
70         RT6_NUD_FAIL_PROBE = -2,
71         RT6_NUD_FAIL_DO_RR = -1,
72         RT6_NUD_SUCCEED = 1
73 };
74
75 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
76 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
77 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
78 static unsigned int      ip6_mtu(const struct dst_entry *dst);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb);
87 static int              ip6_pkt_prohibit(struct sk_buff *skb);
88 static int              ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb);
89 static void             ip6_link_failure(struct sk_buff *skb);
90 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
91                                            struct sk_buff *skb, u32 mtu);
92 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
93                                         struct sk_buff *skb);
94 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
95 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
96
97 #ifdef CONFIG_IPV6_ROUTE_INFO
98 static struct rt6_info *rt6_add_route_info(struct net *net,
99                                            const struct in6_addr *prefix, int prefixlen,
100                                            const struct in6_addr *gwaddr, int ifindex,
101                                            unsigned int pref);
102 static struct rt6_info *rt6_get_route_info(struct net *net,
103                                            const struct in6_addr *prefix, int prefixlen,
104                                            const struct in6_addr *gwaddr, int ifindex);
105 #endif
106
107 struct uncached_list {
108         spinlock_t              lock;
109         struct list_head        head;
110 };
111
112 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
113
114 static void rt6_uncached_list_add(struct rt6_info *rt)
115 {
116         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
117
118         rt->dst.flags |= DST_NOCACHE;
119         rt->rt6i_uncached_list = ul;
120
121         spin_lock_bh(&ul->lock);
122         list_add_tail(&rt->rt6i_uncached, &ul->head);
123         spin_unlock_bh(&ul->lock);
124 }
125
126 static void rt6_uncached_list_del(struct rt6_info *rt)
127 {
128         if (!list_empty(&rt->rt6i_uncached)) {
129                 struct uncached_list *ul = rt->rt6i_uncached_list;
130
131                 spin_lock_bh(&ul->lock);
132                 list_del(&rt->rt6i_uncached);
133                 spin_unlock_bh(&ul->lock);
134         }
135 }
136
137 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
138 {
139         struct net_device *loopback_dev = net->loopback_dev;
140         int cpu;
141
142         for_each_possible_cpu(cpu) {
143                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
144                 struct rt6_info *rt;
145
146                 spin_lock_bh(&ul->lock);
147                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
148                         struct inet6_dev *rt_idev = rt->rt6i_idev;
149                         struct net_device *rt_dev = rt->dst.dev;
150
151                         if (rt_idev && (rt_idev->dev == dev || !dev) &&
152                             rt_idev->dev != loopback_dev) {
153                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
154                                 in6_dev_put(rt_idev);
155                         }
156
157                         if (rt_dev && (rt_dev == dev || !dev) &&
158                             rt_dev != loopback_dev) {
159                                 rt->dst.dev = loopback_dev;
160                                 dev_hold(rt->dst.dev);
161                                 dev_put(rt_dev);
162                         }
163                 }
164                 spin_unlock_bh(&ul->lock);
165         }
166 }
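/*
 * A minimal sketch of how the uncached list is meant to be used,
 * assuming the caller has just built a route that the fib6 tree will
 * not own (for example an RTF_CACHE clone made for a KNOWN_NH lookup):
 *
 *      rt = ip6_rt_cache_alloc(ort, daddr, NULL);
 *      if (rt)
 *              rt6_uncached_list_add(rt);
 *
 * rt6_uncached_list_del() is invoked from ip6_dst_destroy(), and
 * rt6_uncached_list_flush_dev() re-points any surviving entries at the
 * loopback device when their original device is unregistered.
 */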
167
168 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
169 {
170         return dst_metrics_write_ptr(rt->dst.from);
171 }
172
173 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
174 {
175         struct rt6_info *rt = (struct rt6_info *)dst;
176
177         if (rt->rt6i_flags & RTF_PCPU)
178                 return rt6_pcpu_cow_metrics(rt);
179         else if (rt->rt6i_flags & RTF_CACHE)
180                 return NULL;
181         else
182                 return dst_cow_metrics_generic(dst, old);
183 }
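/*
 * Metric copy-on-write policy, summarised from the branches above:
 *
 *      RTF_PCPU  - write through to the parent route's metrics (dst.from)
 *      RTF_CACHE - NULL, i.e. cached clones keep read-only metrics
 *      otherwise - dst_cow_metrics_generic(), the usual copy-on-write
 */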
184
185 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
186                                              struct sk_buff *skb,
187                                              const void *daddr)
188 {
189         struct in6_addr *p = &rt->rt6i_gateway;
190
191         if (!ipv6_addr_any(p))
192                 return (const void *) p;
193         else if (skb)
194                 return &ipv6_hdr(skb)->daddr;
195         return daddr;
196 }
197
198 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
199                                           struct sk_buff *skb,
200                                           const void *daddr)
201 {
202         struct rt6_info *rt = (struct rt6_info *) dst;
203         struct neighbour *n;
204
205         daddr = choose_neigh_daddr(rt, skb, daddr);
206         n = __ipv6_neigh_lookup(dst->dev, daddr);
207         if (n)
208                 return n;
209         return neigh_create(&nd_tbl, daddr, dst->dev);
210 }
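/*
 * Put differently, the neighbour key chosen above is, in order of
 * preference: the route's gateway when one is configured, else the
 * destination address of the skb being transmitted, else the daddr
 * supplied by the caller; a missing cache entry falls back to
 * neigh_create() on the ND table.
 */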
211
212 static struct dst_ops ip6_dst_ops_template = {
213         .family                 =       AF_INET6,
214         .gc                     =       ip6_dst_gc,
215         .gc_thresh              =       1024,
216         .check                  =       ip6_dst_check,
217         .default_advmss         =       ip6_default_advmss,
218         .mtu                    =       ip6_mtu,
219         .cow_metrics            =       ipv6_cow_metrics,
220         .destroy                =       ip6_dst_destroy,
221         .ifdown                 =       ip6_dst_ifdown,
222         .negative_advice        =       ip6_negative_advice,
223         .link_failure           =       ip6_link_failure,
224         .update_pmtu            =       ip6_rt_update_pmtu,
225         .redirect               =       rt6_do_redirect,
226         .local_out              =       __ip6_local_out,
227         .neigh_lookup           =       ip6_neigh_lookup,
228 };
229
230 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
231 {
232         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
233
234         return mtu ? : dst->dev->mtu;
235 }
236
237 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
238                                          struct sk_buff *skb, u32 mtu)
239 {
240 }
241
242 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
243                                       struct sk_buff *skb)
244 {
245 }
246
247 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
248                                          unsigned long old)
249 {
250         return NULL;
251 }
252
253 static struct dst_ops ip6_dst_blackhole_ops = {
254         .family                 =       AF_INET6,
255         .destroy                =       ip6_dst_destroy,
256         .check                  =       ip6_dst_check,
257         .mtu                    =       ip6_blackhole_mtu,
258         .default_advmss         =       ip6_default_advmss,
259         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
260         .redirect               =       ip6_rt_blackhole_redirect,
261         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
262         .neigh_lookup           =       ip6_neigh_lookup,
263 };
264
265 static const u32 ip6_template_metrics[RTAX_MAX] = {
266         [RTAX_HOPLIMIT - 1] = 0,
267 };
268
269 static const struct rt6_info ip6_null_entry_template = {
270         .dst = {
271                 .__refcnt       = ATOMIC_INIT(1),
272                 .__use          = 1,
273                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
274                 .error          = -ENETUNREACH,
275                 .input          = ip6_pkt_discard,
276                 .output         = ip6_pkt_discard_out,
277         },
278         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
279         .rt6i_protocol  = RTPROT_KERNEL,
280         .rt6i_metric    = ~(u32) 0,
281         .rt6i_ref       = ATOMIC_INIT(1),
282 };
283
284 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
285
286 static const struct rt6_info ip6_prohibit_entry_template = {
287         .dst = {
288                 .__refcnt       = ATOMIC_INIT(1),
289                 .__use          = 1,
290                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
291                 .error          = -EACCES,
292                 .input          = ip6_pkt_prohibit,
293                 .output         = ip6_pkt_prohibit_out,
294         },
295         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
296         .rt6i_protocol  = RTPROT_KERNEL,
297         .rt6i_metric    = ~(u32) 0,
298         .rt6i_ref       = ATOMIC_INIT(1),
299 };
300
301 static const struct rt6_info ip6_blk_hole_entry_template = {
302         .dst = {
303                 .__refcnt       = ATOMIC_INIT(1),
304                 .__use          = 1,
305                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
306                 .error          = -EINVAL,
307                 .input          = dst_discard,
308                 .output         = dst_discard_sk,
309         },
310         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
311         .rt6i_protocol  = RTPROT_KERNEL,
312         .rt6i_metric    = ~(u32) 0,
313         .rt6i_ref       = ATOMIC_INIT(1),
314 };
315
316 #endif
317
318 /* allocate dst with ip6_dst_ops */
319 static struct rt6_info *__ip6_dst_alloc(struct net *net,
320                                         struct net_device *dev,
321                                         int flags,
322                                         struct fib6_table *table)
323 {
324         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
325                                         0, DST_OBSOLETE_FORCE_CHK, flags);
326
327         if (rt) {
328                 struct dst_entry *dst = &rt->dst;
329
330                 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
331                 INIT_LIST_HEAD(&rt->rt6i_siblings);
332                 INIT_LIST_HEAD(&rt->rt6i_uncached);
333         }
334         return rt;
335 }
336
337 static struct rt6_info *ip6_dst_alloc(struct net *net,
338                                       struct net_device *dev,
339                                       int flags,
340                                       struct fib6_table *table)
341 {
342         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags, table);
343
344         if (rt) {
345                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
346                 if (rt->rt6i_pcpu) {
347                         int cpu;
348
349                         for_each_possible_cpu(cpu) {
350                                 struct rt6_info **p;
351
352                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
353                                 /* no one shares rt */
354                                 *p =  NULL;
355                         }
356                 } else {
357                         dst_destroy((struct dst_entry *)rt);
358                         return NULL;
359                 }
360         }
361
362         return rt;
363 }
364
365 static void ip6_dst_destroy(struct dst_entry *dst)
366 {
367         struct rt6_info *rt = (struct rt6_info *)dst;
368         struct dst_entry *from = dst->from;
369         struct inet6_dev *idev;
370
371         dst_destroy_metrics_generic(dst);
372
373         if (rt->rt6i_pcpu)
374                 free_percpu(rt->rt6i_pcpu);
375
376         rt6_uncached_list_del(rt);
377
378         idev = rt->rt6i_idev;
379         if (idev) {
380                 rt->rt6i_idev = NULL;
381                 in6_dev_put(idev);
382         }
383
384         dst->from = NULL;
385         dst_release(from);
386 }
387
388 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
389                            int how)
390 {
391         struct rt6_info *rt = (struct rt6_info *)dst;
392         struct inet6_dev *idev = rt->rt6i_idev;
393         struct net_device *loopback_dev =
394                 dev_net(dev)->loopback_dev;
395
396         if (dev != loopback_dev) {
397                 if (idev && idev->dev == dev) {
398                         struct inet6_dev *loopback_idev =
399                                 in6_dev_get(loopback_dev);
400                         if (loopback_idev) {
401                                 rt->rt6i_idev = loopback_idev;
402                                 in6_dev_put(idev);
403                         }
404                 }
405         }
406 }
407
408 static bool rt6_check_expired(const struct rt6_info *rt)
409 {
410         if (rt->rt6i_flags & RTF_EXPIRES) {
411                 if (time_after(jiffies, rt->dst.expires))
412                         return true;
413         } else if (rt->dst.from) {
414                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
415         }
416         return false;
417 }
418
419 /* Multipath route selection:
420  *   Hash-based function using the packet header and flow label.
421  * Adapted from fib_info_hashfn()
422  */
423 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
424                                const struct flowi6 *fl6)
425 {
426         unsigned int val = fl6->flowi6_proto;
427
428         val ^= ipv6_addr_hash(&fl6->daddr);
429         val ^= ipv6_addr_hash(&fl6->saddr);
430
431         /* Works only if this is not encapsulated */
432         switch (fl6->flowi6_proto) {
433         case IPPROTO_UDP:
434         case IPPROTO_TCP:
435         case IPPROTO_SCTP:
436                 val ^= (__force u16)fl6->fl6_sport;
437                 val ^= (__force u16)fl6->fl6_dport;
438                 break;
439
440         case IPPROTO_ICMPV6:
441                 val ^= (__force u16)fl6->fl6_icmp_type;
442                 val ^= (__force u16)fl6->fl6_icmp_code;
443                 break;
444         }
445         /* RFC 6438 recommends using the flow label */
446         val ^= (__force u32)fl6->flowlabel;
447
448         /* Perhaps this function needs further tuning? */
449         val = val ^ (val >> 7) ^ (val >> 12);
450         return val % candidate_count;
451 }
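/*
 * A worked example of the selection (hypothetical flow, for
 * illustration only): with two siblings in addition to the matched
 * route, candidate_count is 3.  For a TCP flow the hash xors the
 * protocol, the address hashes, both ports and the flow label, folds
 * the result with
 *
 *      val = val ^ (val >> 7) ^ (val >> 12);
 *
 * and returns val % 3, so packets of one flow consistently pick the
 * same next hop while distinct flows spread across all three.
 */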
452
453 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
454                                              struct flowi6 *fl6, int oif,
455                                              int strict)
456 {
457         struct rt6_info *sibling, *next_sibling;
458         int route_choosen;
459
460         route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
461         /* Don't change the route if route_choosen == 0
462          * (the siblings list does not include ourself)
463          */
464         if (route_choosen)
465                 list_for_each_entry_safe(sibling, next_sibling,
466                                 &match->rt6i_siblings, rt6i_siblings) {
467                         route_choosen--;
468                         if (route_choosen == 0) {
469                                 if (rt6_score_route(sibling, oif, strict) < 0)
470                                         break;
471                                 match = sibling;
472                                 break;
473                         }
474                 }
475         return match;
476 }
477
478 /*
479  *      Route lookup. Any table->tb6_lock is implied.
480  */
481
482 static inline struct rt6_info *rt6_device_match(struct net *net,
483                                                     struct rt6_info *rt,
484                                                     const struct in6_addr *saddr,
485                                                     int oif,
486                                                     int flags)
487 {
488         struct rt6_info *local = NULL;
489         struct rt6_info *sprt;
490
491         if (!oif && ipv6_addr_any(saddr))
492                 goto out;
493
494         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
495                 struct net_device *dev = sprt->dst.dev;
496
497                 if (oif) {
498                         if (dev->ifindex == oif)
499                                 return sprt;
500                         if (dev->flags & IFF_LOOPBACK) {
501                                 if (!sprt->rt6i_idev ||
502                                     sprt->rt6i_idev->dev->ifindex != oif) {
503                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
504                                                 continue;
505                                         if (local && (!oif ||
506                                                       local->rt6i_idev->dev->ifindex == oif))
507                                                 continue;
508                                 }
509                                 local = sprt;
510                         }
511                 } else {
512                         if (ipv6_chk_addr(net, saddr, dev,
513                                           flags & RT6_LOOKUP_F_IFACE))
514                                 return sprt;
515                 }
516         }
517
518         if (oif) {
519                 if (local)
520                         return local;
521
522                 if (flags & RT6_LOOKUP_F_IFACE)
523                         return net->ipv6.ip6_null_entry;
524         }
525 out:
526         return rt;
527 }
528
529 #ifdef CONFIG_IPV6_ROUTER_PREF
530 struct __rt6_probe_work {
531         struct work_struct work;
532         struct in6_addr target;
533         struct net_device *dev;
534 };
535
536 static void rt6_probe_deferred(struct work_struct *w)
537 {
538         struct in6_addr mcaddr;
539         struct __rt6_probe_work *work =
540                 container_of(w, struct __rt6_probe_work, work);
541
542         addrconf_addr_solict_mult(&work->target, &mcaddr);
543         ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL);
544         dev_put(work->dev);
545         kfree(work);
546 }
547
548 static void rt6_probe(struct rt6_info *rt)
549 {
550         struct neighbour *neigh;
551         /*
552          * This does not seem appropriate for now; however,
553          * we need to check whether it really is;
554          * aka Router Reachability Probing.
555          *
556          * Router Reachability Probe MUST be rate-limited
557          * to no more than one per minute.
558          */
559         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
560                 return;
561         rcu_read_lock_bh();
562         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
563         if (neigh) {
564                 write_lock(&neigh->lock);
565                 if (neigh->nud_state & NUD_VALID)
566                         goto out;
567         }
568
569         if (!neigh ||
570             time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
571                 struct __rt6_probe_work *work;
572
573                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
574
575                 if (neigh && work)
576                         __neigh_set_probe_once(neigh);
577
578                 if (neigh)
579                         write_unlock(&neigh->lock);
580
581                 if (work) {
582                         INIT_WORK(&work->work, rt6_probe_deferred);
583                         work->target = rt->rt6i_gateway;
584                         dev_hold(rt->dst.dev);
585                         work->dev = rt->dst.dev;
586                         schedule_work(&work->work);
587                 }
588         } else {
589 out:
590                 write_unlock(&neigh->lock);
591         }
592         rcu_read_unlock_bh();
593 }
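/*
 * Probe pacing in short: a neighbour already in a NUD_VALID state is
 * left alone, and otherwise a solicitation is only scheduled once
 *
 *      time_after(jiffies, neigh->updated + idev->cnf.rtr_probe_interval)
 *
 * becomes true (or no neighbour entry exists yet).  The NS itself is
 * sent from rt6_probe_deferred() on a workqueue, so this path never
 * sleeps under the rcu_read_lock_bh() taken above.
 */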
594 #else
595 static inline void rt6_probe(struct rt6_info *rt)
596 {
597 }
598 #endif
599
600 /*
601  * Default Router Selection (RFC 2461 6.3.6)
602  */
603 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
604 {
605         struct net_device *dev = rt->dst.dev;
606         if (!oif || dev->ifindex == oif)
607                 return 2;
608         if ((dev->flags & IFF_LOOPBACK) &&
609             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
610                 return 1;
611         return 0;
612 }
613
614 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
615 {
616         struct neighbour *neigh;
617         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
618
619         if (rt->rt6i_flags & RTF_NONEXTHOP ||
620             !(rt->rt6i_flags & RTF_GATEWAY))
621                 return RT6_NUD_SUCCEED;
622
623         rcu_read_lock_bh();
624         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
625         if (neigh) {
626                 read_lock(&neigh->lock);
627                 if (neigh->nud_state & NUD_VALID)
628                         ret = RT6_NUD_SUCCEED;
629 #ifdef CONFIG_IPV6_ROUTER_PREF
630                 else if (!(neigh->nud_state & NUD_FAILED))
631                         ret = RT6_NUD_SUCCEED;
632                 else
633                         ret = RT6_NUD_FAIL_PROBE;
634 #endif
635                 read_unlock(&neigh->lock);
636         } else {
637                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
638                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
639         }
640         rcu_read_unlock_bh();
641
642         return ret;
643 }
644
645 static int rt6_score_route(struct rt6_info *rt, int oif,
646                            int strict)
647 {
648         int m;
649
650         m = rt6_check_dev(rt, oif);
651         if (!m && (strict & RT6_LOOKUP_F_IFACE))
652                 return RT6_NUD_FAIL_HARD;
653 #ifdef CONFIG_IPV6_ROUTER_PREF
654         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
655 #endif
656         if (strict & RT6_LOOKUP_F_REACHABLE) {
657                 int n = rt6_check_neigh(rt);
658                 if (n < 0)
659                         return n;
660         }
661         return m;
662 }
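/*
 * How a score is composed, roughly: rt6_check_dev() contributes 2 for
 * an exact oif match (1 for a loopback route whose idev matches oif),
 * the decoded router preference is or-ed in shifted left by two bits
 * when CONFIG_IPV6_ROUTER_PREF is set, and a negative value from
 * rt6_check_neigh() under RT6_LOOKUP_F_REACHABLE is returned as-is so
 * the caller can treat it as a failure code rather than a score.
 */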
663
664 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
665                                    int *mpri, struct rt6_info *match,
666                                    bool *do_rr)
667 {
668         int m;
669         bool match_do_rr = false;
670
671         if (rt6_check_expired(rt))
672                 goto out;
673
674         m = rt6_score_route(rt, oif, strict);
675         if (m == RT6_NUD_FAIL_DO_RR) {
676                 match_do_rr = true;
677                 m = 0; /* lowest valid score */
678         } else if (m == RT6_NUD_FAIL_HARD) {
679                 goto out;
680         }
681
682         if (strict & RT6_LOOKUP_F_REACHABLE)
683                 rt6_probe(rt);
684
685         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
686         if (m > *mpri) {
687                 *do_rr = match_do_rr;
688                 *mpri = m;
689                 match = rt;
690         }
691 out:
692         return match;
693 }
694
695 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
696                                      struct rt6_info *rr_head,
697                                      u32 metric, int oif, int strict,
698                                      bool *do_rr)
699 {
700         struct rt6_info *rt, *match, *cont;
701         int mpri = -1;
702
703         match = NULL;
704         cont = NULL;
705         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
706                 if (rt->rt6i_metric != metric) {
707                         cont = rt;
708                         break;
709                 }
710
711                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
712         }
713
714         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
715                 if (rt->rt6i_metric != metric) {
716                         cont = rt;
717                         break;
718                 }
719
720                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
721         }
722
723         if (match || !cont)
724                 return match;
725
726         for (rt = cont; rt; rt = rt->dst.rt6_next)
727                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
728
729         return match;
730 }
731
732 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
733 {
734         struct rt6_info *match, *rt0;
735         struct net *net;
736         bool do_rr = false;
737
738         rt0 = fn->rr_ptr;
739         if (!rt0)
740                 fn->rr_ptr = rt0 = fn->leaf;
741
742         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
743                              &do_rr);
744
745         if (do_rr) {
746                 struct rt6_info *next = rt0->dst.rt6_next;
747
748                 /* no entries matched; do round-robin */
749                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
750                         next = fn->leaf;
751
752                 if (next != rt0)
753                         fn->rr_ptr = next;
754         }
755
756         net = dev_net(rt0->dst.dev);
757         return match ? match : net->ipv6.ip6_null_entry;
758 }
759
760 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
761 {
762         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
763 }
764
765 #ifdef CONFIG_IPV6_ROUTE_INFO
766 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
767                   const struct in6_addr *gwaddr)
768 {
769         struct net *net = dev_net(dev);
770         struct route_info *rinfo = (struct route_info *) opt;
771         struct in6_addr prefix_buf, *prefix;
772         unsigned int pref;
773         unsigned long lifetime;
774         struct rt6_info *rt;
775
776         if (len < sizeof(struct route_info)) {
777                 return -EINVAL;
778         }
779
780         /* Sanity check for prefix_len and length */
781         if (rinfo->length > 3) {
782                 return -EINVAL;
783         } else if (rinfo->prefix_len > 128) {
784                 return -EINVAL;
785         } else if (rinfo->prefix_len > 64) {
786                 if (rinfo->length < 2) {
787                         return -EINVAL;
788                 }
789         } else if (rinfo->prefix_len > 0) {
790                 if (rinfo->length < 1) {
791                         return -EINVAL;
792                 }
793         }
794
795         pref = rinfo->route_pref;
796         if (pref == ICMPV6_ROUTER_PREF_INVALID)
797                 return -EINVAL;
798
799         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
800
801         if (rinfo->length == 3)
802                 prefix = (struct in6_addr *)rinfo->prefix;
803         else {
804                 /* this function is safe */
805                 ipv6_addr_prefix(&prefix_buf,
806                                  (struct in6_addr *)rinfo->prefix,
807                                  rinfo->prefix_len);
808                 prefix = &prefix_buf;
809         }
810
811         if (rinfo->prefix_len == 0)
812                 rt = rt6_get_dflt_router(gwaddr, dev);
813         else
814                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
815                                         gwaddr, dev->ifindex);
816
817         if (rt && !lifetime) {
818                 ip6_del_rt(rt);
819                 rt = NULL;
820         }
821
822         if (!rt && lifetime)
823                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
824                                         pref);
825         else if (rt)
826                 rt->rt6i_flags = RTF_ROUTEINFO |
827                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
828
829         if (rt) {
830                 if (!addrconf_finite_timeout(lifetime))
831                         rt6_clean_expires(rt);
832                 else
833                         rt6_set_expires(rt, jiffies + HZ * lifetime);
834
835                 ip6_rt_put(rt);
836         }
837         return 0;
838 }
839 #endif
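/*
 * For reference, the Length field tested above is in units of 8 octets
 * (RFC 4191 Route Information Option): 1 carries no prefix bytes, 2
 * carries the first 64 bits of the prefix and 3 carries all 128 bits,
 * which is why only length == 3 lets rinfo->prefix be used directly
 * without the ipv6_addr_prefix() masking into prefix_buf.
 */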
840
841 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
842                                         struct in6_addr *saddr)
843 {
844         struct fib6_node *pn;
845         while (1) {
846                 if (fn->fn_flags & RTN_TL_ROOT)
847                         return NULL;
848                 pn = fn->parent;
849                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
850                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
851                 else
852                         fn = pn;
853                 if (fn->fn_flags & RTN_RTINFO)
854                         return fn;
855         }
856 }
857
858 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
859                                              struct fib6_table *table,
860                                              struct flowi6 *fl6, int flags)
861 {
862         struct fib6_node *fn;
863         struct rt6_info *rt;
864
865         read_lock_bh(&table->tb6_lock);
866         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
867 restart:
868         rt = fn->leaf;
869         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
870         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
871                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
872         if (rt == net->ipv6.ip6_null_entry) {
873                 fn = fib6_backtrack(fn, &fl6->saddr);
874                 if (fn)
875                         goto restart;
876         }
877         dst_use(&rt->dst, jiffies);
878         read_unlock_bh(&table->tb6_lock);
879         return rt;
880
881 }
882
883 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
884                                     int flags)
885 {
886         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
887 }
888 EXPORT_SYMBOL_GPL(ip6_route_lookup);
889
890 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
891                             const struct in6_addr *saddr, int oif, int strict)
892 {
893         struct flowi6 fl6 = {
894                 .flowi6_oif = oif,
895                 .daddr = *daddr,
896         };
897         struct dst_entry *dst;
898         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
899
900         if (saddr) {
901                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
902                 flags |= RT6_LOOKUP_F_HAS_SADDR;
903         }
904
905         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
906         if (dst->error == 0)
907                 return (struct rt6_info *) dst;
908
909         dst_release(dst);
910
911         return NULL;
912 }
913 EXPORT_SYMBOL(rt6_lookup);
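/*
 * A minimal usage sketch for rt6_lookup() (the caller's net and daddr
 * variables are hypothetical, error handling trimmed):
 *
 *      struct rt6_info *rt;
 *
 *      rt = rt6_lookup(net, &daddr, NULL, 0, 0);
 *      if (rt) {
 *              ... use rt->dst.dev, rt->rt6i_gateway ...
 *              ip6_rt_put(rt);
 *      }
 *
 * A non-NULL result carries a reference taken by the lookup, so the
 * caller must release it with ip6_rt_put() when done.
 */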
914
915 /* ip6_ins_rt is called with table->tb6_lock NOT held (i.e. free).
916    It takes a new route entry; if the addition fails for any reason,
917    the route is freed. In any case, if the caller does not hold a
918    reference to it, it may be destroyed.
919  */
920
921 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
922                         struct mx6_config *mxc)
923 {
924         int err;
925         struct fib6_table *table;
926
927         table = rt->rt6i_table;
928         write_lock_bh(&table->tb6_lock);
929         err = fib6_add(&table->tb6_root, rt, info, mxc);
930         write_unlock_bh(&table->tb6_lock);
931
932         return err;
933 }
934
935 int ip6_ins_rt(struct rt6_info *rt)
936 {
937         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
938         struct mx6_config mxc = { .mx = NULL, };
939
940         return __ip6_ins_rt(rt, &info, &mxc);
941 }
942
943 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
944                                            const struct in6_addr *daddr,
945                                            const struct in6_addr *saddr)
946 {
947         struct rt6_info *rt;
948
949         /*
950          *      Clone the route.
951          */
952
953         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
954                 ort = (struct rt6_info *)ort->dst.from;
955
956         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev,
957                              0, ort->rt6i_table);
958
959         if (!rt)
960                 return NULL;
961
962         ip6_rt_copy_init(rt, ort);
963         rt->rt6i_flags |= RTF_CACHE;
964         rt->rt6i_metric = 0;
965         rt->dst.flags |= DST_HOST;
966         rt->rt6i_dst.addr = *daddr;
967         rt->rt6i_dst.plen = 128;
968
969         if (!rt6_is_gw_or_nonexthop(ort)) {
970                 if (ort->rt6i_dst.plen != 128 &&
971                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
972                         rt->rt6i_flags |= RTF_ANYCAST;
973 #ifdef CONFIG_IPV6_SUBTREES
974                 if (rt->rt6i_src.plen && saddr) {
975                         rt->rt6i_src.addr = *saddr;
976                         rt->rt6i_src.plen = 128;
977                 }
978 #endif
979         }
980
981         return rt;
982 }
983
984 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
985 {
986         struct rt6_info *pcpu_rt;
987
988         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
989                                   rt->dst.dev, rt->dst.flags,
990                                   rt->rt6i_table);
991
992         if (!pcpu_rt)
993                 return NULL;
994         ip6_rt_copy_init(pcpu_rt, rt);
995         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
996         pcpu_rt->rt6i_flags |= RTF_PCPU;
997         return pcpu_rt;
998 }
999
1000 /* It should be called with read_lock_bh(&tb6_lock) acquired */
1001 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1002 {
1003         struct rt6_info *pcpu_rt, *prev, **p;
1004
1005         p = this_cpu_ptr(rt->rt6i_pcpu);
1006         pcpu_rt = *p;
1007
1008         if (pcpu_rt)
1009                 goto done;
1010
1011         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1012         if (!pcpu_rt) {
1013                 struct net *net = dev_net(rt->dst.dev);
1014
1015                 pcpu_rt = net->ipv6.ip6_null_entry;
1016                 goto done;
1017         }
1018
1019         prev = cmpxchg(p, NULL, pcpu_rt);
1020         if (prev) {
1021                 /* If someone did it before us, return prev instead */
1022                 dst_destroy(&pcpu_rt->dst);
1023                 pcpu_rt = prev;
1024         }
1025
1026 done:
1027         dst_hold(&pcpu_rt->dst);
1028         rt6_dst_from_metrics_check(pcpu_rt);
1029         return pcpu_rt;
1030 }
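/*
 * The per-cpu slot above is claimed without a lock: a candidate clone
 * is allocated, then
 *
 *      prev = cmpxchg(p, NULL, pcpu_rt);
 *
 * and if another path installed an entry first (prev != NULL) the
 * fresh candidate is destroyed and the existing one is reused.  Either
 * way the route returned has dst_hold() taken on it.
 */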
1031
1032 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
1033                                       struct flowi6 *fl6, int flags)
1034 {
1035         struct fib6_node *fn, *saved_fn;
1036         struct rt6_info *rt;
1037         int strict = 0;
1038
1039         strict |= flags & RT6_LOOKUP_F_IFACE;
1040         if (net->ipv6.devconf_all->forwarding == 0)
1041                 strict |= RT6_LOOKUP_F_REACHABLE;
1042
1043         read_lock_bh(&table->tb6_lock);
1044
1045         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1046         saved_fn = fn;
1047
1048 redo_rt6_select:
1049         rt = rt6_select(fn, oif, strict);
1050         if (rt->rt6i_nsiblings)
1051                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1052         if (rt == net->ipv6.ip6_null_entry) {
1053                 fn = fib6_backtrack(fn, &fl6->saddr);
1054                 if (fn)
1055                         goto redo_rt6_select;
1056                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1057                         /* also consider unreachable route */
1058                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1059                         fn = saved_fn;
1060                         goto redo_rt6_select;
1061                 }
1062         }
1063
1064
1065         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1066                 dst_use(&rt->dst, jiffies);
1067                 read_unlock_bh(&table->tb6_lock);
1068
1069                 rt6_dst_from_metrics_check(rt);
1070                 return rt;
1071         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1072                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1073                 /* Create an RTF_CACHE clone which will not be
1074                  * owned by the fib6 tree.  It is for the special case where
1075                  * the daddr in the skb during the neighbor look-up is different
1076                  * from the fl6->daddr used to look up the route here.
1077                  */
1078
1079                 struct rt6_info *uncached_rt;
1080
1081                 dst_use(&rt->dst, jiffies);
1082                 read_unlock_bh(&table->tb6_lock);
1083
1084                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1085                 dst_release(&rt->dst);
1086
1087                 if (uncached_rt)
1088                         rt6_uncached_list_add(uncached_rt);
1089                 else
1090                         uncached_rt = net->ipv6.ip6_null_entry;
1091
1092                 dst_hold(&uncached_rt->dst);
1093                 return uncached_rt;
1094
1095         } else {
1096                 /* Get a percpu copy */
1097
1098                 struct rt6_info *pcpu_rt;
1099
1100                 rt->dst.lastuse = jiffies;
1101                 rt->dst.__use++;
1102                 pcpu_rt = rt6_get_pcpu_route(rt);
1103                 read_unlock_bh(&table->tb6_lock);
1104
1105                 return pcpu_rt;
1106         }
1107 }
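/*
 * To summarise the three exits of ip6_pol_route(): the null entry and
 * existing RTF_CACHE clones are returned directly under dst_use(); a
 * FLOWI_FLAG_KNOWN_NH lookup on a route without a gateway gets a
 * one-off RTF_CACHE clone that is tracked on the uncached list instead
 * of in the fib6 tree; every other hit is served from the route's
 * per-cpu copy via rt6_get_pcpu_route().
 */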
1108
1109 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1110                                             struct flowi6 *fl6, int flags)
1111 {
1112         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1113 }
1114
1115 static struct dst_entry *ip6_route_input_lookup(struct net *net,
1116                                                 struct net_device *dev,
1117                                                 struct flowi6 *fl6, int flags)
1118 {
1119         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1120                 flags |= RT6_LOOKUP_F_IFACE;
1121
1122         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1123 }
1124
1125 void ip6_route_input(struct sk_buff *skb)
1126 {
1127         const struct ipv6hdr *iph = ipv6_hdr(skb);
1128         struct net *net = dev_net(skb->dev);
1129         int flags = RT6_LOOKUP_F_HAS_SADDR;
1130         struct flowi6 fl6 = {
1131                 .flowi6_iif = skb->dev->ifindex,
1132                 .daddr = iph->daddr,
1133                 .saddr = iph->saddr,
1134                 .flowlabel = ip6_flowinfo(iph),
1135                 .flowi6_mark = skb->mark,
1136                 .flowi6_proto = iph->nexthdr,
1137         };
1138
1139         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1140 }
1141
1142 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1143                                              struct flowi6 *fl6, int flags)
1144 {
1145         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1146 }
1147
1148 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
1149                                     struct flowi6 *fl6)
1150 {
1151         int flags = 0;
1152
1153         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1154
1155         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
1156                 flags |= RT6_LOOKUP_F_IFACE;
1157
1158         if (!ipv6_addr_any(&fl6->saddr))
1159                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1160         else if (sk)
1161                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1162
1163         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1164 }
1165 EXPORT_SYMBOL(ip6_route_output);
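/*
 * A minimal caller sketch for ip6_route_output() (net, sk, daddr and
 * oif stand for the caller's own state; illustration only):
 *
 *      struct flowi6 fl6 = {
 *              .daddr          = *daddr,
 *              .flowi6_oif     = oif,
 *      };
 *      struct dst_entry *dst;
 *      int err;
 *
 *      dst = ip6_route_output(net, sk, &fl6);
 *      err = dst->error;
 *      if (err) {
 *              dst_release(dst);
 *              return err;
 *      }
 *
 * The function never returns NULL; failures are reported through
 * dst->error, which is why the sketch checks it before using dst.
 */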
1166
1167 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1168 {
1169         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1170         struct dst_entry *new = NULL;
1171
1172         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1173         if (rt) {
1174                 new = &rt->dst;
1175
1176                 memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
1177
1178                 new->__use = 1;
1179                 new->input = dst_discard;
1180                 new->output = dst_discard_sk;
1181
1182                 if (dst_metrics_read_only(&ort->dst))
1183                         new->_metrics = ort->dst._metrics;
1184                 else
1185                         dst_copy_metrics(new, &ort->dst);
1186                 rt->rt6i_idev = ort->rt6i_idev;
1187                 if (rt->rt6i_idev)
1188                         in6_dev_hold(rt->rt6i_idev);
1189
1190                 rt->rt6i_gateway = ort->rt6i_gateway;
1191                 rt->rt6i_flags = ort->rt6i_flags;
1192                 rt->rt6i_metric = 0;
1193
1194                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1195 #ifdef CONFIG_IPV6_SUBTREES
1196                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1197 #endif
1198
1199                 dst_free(new);
1200         }
1201
1202         dst_release(dst_orig);
1203         return new ? new : ERR_PTR(-ENOMEM);
1204 }
1205
1206 /*
1207  *      Destination cache support functions
1208  */
1209
1210 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1211 {
1212         if (rt->dst.from &&
1213             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1214                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1215 }
1216
1217 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1218 {
1219         if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
1220                 return NULL;
1221
1222         if (rt6_check_expired(rt))
1223                 return NULL;
1224
1225         return &rt->dst;
1226 }
1227
1228 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1229 {
1230         if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1231             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1232                 return &rt->dst;
1233         else
1234                 return NULL;
1235 }
1236
1237 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1238 {
1239         struct rt6_info *rt;
1240
1241         rt = (struct rt6_info *) dst;
1242
1243         /* All IPV6 dsts are created with ->obsolete set to the value
1244          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1245          * into this function always.
1246          */
1247
1248         rt6_dst_from_metrics_check(rt);
1249
1250         if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
1251                 return rt6_dst_from_check(rt, cookie);
1252         else
1253                 return rt6_check(rt, cookie);
1254 }
1255
1256 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1257 {
1258         struct rt6_info *rt = (struct rt6_info *) dst;
1259
1260         if (rt) {
1261                 if (rt->rt6i_flags & RTF_CACHE) {
1262                         if (rt6_check_expired(rt)) {
1263                                 ip6_del_rt(rt);
1264                                 dst = NULL;
1265                         }
1266                 } else {
1267                         dst_release(dst);
1268                         dst = NULL;
1269                 }
1270         }
1271         return dst;
1272 }
1273
1274 static void ip6_link_failure(struct sk_buff *skb)
1275 {
1276         struct rt6_info *rt;
1277
1278         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1279
1280         rt = (struct rt6_info *) skb_dst(skb);
1281         if (rt) {
1282                 if (rt->rt6i_flags & RTF_CACHE) {
1283                         dst_hold(&rt->dst);
1284                         if (ip6_del_rt(rt))
1285                                 dst_free(&rt->dst);
1286                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
1287                         rt->rt6i_node->fn_sernum = -1;
1288                 }
1289         }
1290 }
1291
1292 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1293 {
1294         struct net *net = dev_net(rt->dst.dev);
1295
1296         rt->rt6i_flags |= RTF_MODIFIED;
1297         rt->rt6i_pmtu = mtu;
1298         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1299 }
1300
1301 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1302                                  const struct ipv6hdr *iph, u32 mtu)
1303 {
1304         struct rt6_info *rt6 = (struct rt6_info *)dst;
1305
1306         if (rt6->rt6i_flags & RTF_LOCAL)
1307                 return;
1308
1309         dst_confirm(dst);
1310         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1311         if (mtu >= dst_mtu(dst))
1312                 return;
1313
1314         if (rt6->rt6i_flags & RTF_CACHE) {
1315                 rt6_do_update_pmtu(rt6, mtu);
1316         } else {
1317                 const struct in6_addr *daddr, *saddr;
1318                 struct rt6_info *nrt6;
1319
1320                 if (iph) {
1321                         daddr = &iph->daddr;
1322                         saddr = &iph->saddr;
1323                 } else if (sk) {
1324                         daddr = &sk->sk_v6_daddr;
1325                         saddr = &inet6_sk(sk)->saddr;
1326                 } else {
1327                         return;
1328                 }
1329                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1330                 if (nrt6) {
1331                         rt6_do_update_pmtu(nrt6, mtu);
1332
1333                         /* ip6_ins_rt(nrt6) will bump the
1334                          * rt6->rt6i_node->fn_sernum
1335                          * which will fail the next rt6_check() and
1336                          * invalidate the sk->sk_dst_cache.
1337                          */
1338                         ip6_ins_rt(nrt6);
1339                 }
1340         }
1341 }
1342
1343 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1344                                struct sk_buff *skb, u32 mtu)
1345 {
1346         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1347 }
1348
1349 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1350                      int oif, u32 mark)
1351 {
1352         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1353         struct dst_entry *dst;
1354         struct flowi6 fl6;
1355
1356         memset(&fl6, 0, sizeof(fl6));
1357         fl6.flowi6_oif = oif;
1358         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1359         fl6.daddr = iph->daddr;
1360         fl6.saddr = iph->saddr;
1361         fl6.flowlabel = ip6_flowinfo(iph);
1362
1363         dst = ip6_route_output(net, NULL, &fl6);
1364         if (!dst->error)
1365                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1366         dst_release(dst);
1367 }
1368 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
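/*
 * Typical use, as a sketch: an error handler that has just parsed an
 * ICMPV6_PKT_TOOBIG message can propagate the new MTU with
 *
 *      ip6_update_pmtu(skb, net, info, 0, 0);
 *
 * where info is the handler's __be32 MTU value straight from the ICMP
 * header; it is passed in network byte order, hence the ntohl() above.
 */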
1369
1370 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1371 {
1372         ip6_update_pmtu(skb, sock_net(sk), mtu,
1373                         sk->sk_bound_dev_if, sk->sk_mark);
1374 }
1375 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1376
1377 /* Handle redirects */
1378 struct ip6rd_flowi {
1379         struct flowi6 fl6;
1380         struct in6_addr gateway;
1381 };
1382
1383 static struct rt6_info *__ip6_route_redirect(struct net *net,
1384                                              struct fib6_table *table,
1385                                              struct flowi6 *fl6,
1386                                              int flags)
1387 {
1388         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1389         struct rt6_info *rt;
1390         struct fib6_node *fn;
1391
1392         /* Get the "current" route for this destination and
1393          * check if the redirect has come from an appropriate router.
1394          *
1395          * RFC 4861 specifies that redirects should only be
1396          * accepted if they come from the nexthop to the target.
1397          * Due to the way the routes are chosen, this notion
1398          * is a bit fuzzy and one might need to check all possible
1399          * routes.
1400          */
1401
1402         read_lock_bh(&table->tb6_lock);
1403         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1404 restart:
1405         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1406                 if (rt6_check_expired(rt))
1407                         continue;
1408                 if (rt->dst.error)
1409                         break;
1410                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1411                         continue;
1412                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1413                         continue;
1414                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1415                         continue;
1416                 break;
1417         }
1418
1419         if (!rt)
1420                 rt = net->ipv6.ip6_null_entry;
1421         else if (rt->dst.error) {
1422                 rt = net->ipv6.ip6_null_entry;
1423                 goto out;
1424         }
1425
1426         if (rt == net->ipv6.ip6_null_entry) {
1427                 fn = fib6_backtrack(fn, &fl6->saddr);
1428                 if (fn)
1429                         goto restart;
1430         }
1431
1432 out:
1433         dst_hold(&rt->dst);
1434
1435         read_unlock_bh(&table->tb6_lock);
1436
1437         return rt;
1438 };
1439
1440 static struct dst_entry *ip6_route_redirect(struct net *net,
1441                                         const struct flowi6 *fl6,
1442                                         const struct in6_addr *gateway)
1443 {
1444         int flags = RT6_LOOKUP_F_HAS_SADDR;
1445         struct ip6rd_flowi rdfl;
1446
1447         rdfl.fl6 = *fl6;
1448         rdfl.gateway = *gateway;
1449
1450         return fib6_rule_lookup(net, &rdfl.fl6,
1451                                 flags, __ip6_route_redirect);
1452 }
1453
1454 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1455 {
1456         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1457         struct dst_entry *dst;
1458         struct flowi6 fl6;
1459
1460         memset(&fl6, 0, sizeof(fl6));
1461         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1462         fl6.flowi6_oif = oif;
1463         fl6.flowi6_mark = mark;
1464         fl6.daddr = iph->daddr;
1465         fl6.saddr = iph->saddr;
1466         fl6.flowlabel = ip6_flowinfo(iph);
1467
1468         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1469         rt6_do_redirect(dst, NULL, skb);
1470         dst_release(dst);
1471 }
1472 EXPORT_SYMBOL_GPL(ip6_redirect);
1473
1474 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1475                             u32 mark)
1476 {
1477         const struct ipv6hdr *iph = ipv6_hdr(skb);
1478         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1479         struct dst_entry *dst;
1480         struct flowi6 fl6;
1481
1482         memset(&fl6, 0, sizeof(fl6));
1483         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1484         fl6.flowi6_oif = oif;
1485         fl6.flowi6_mark = mark;
1486         fl6.daddr = msg->dest;
1487         fl6.saddr = iph->daddr;
1488
1489         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1490         rt6_do_redirect(dst, NULL, skb);
1491         dst_release(dst);
1492 }
1493
1494 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1495 {
1496         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1497 }
1498 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1499
1500 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1501 {
1502         struct net_device *dev = dst->dev;
1503         unsigned int mtu = dst_mtu(dst);
1504         struct net *net = dev_net(dev);
1505
1506         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1507
1508         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1509                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1510
1511         /*
1512          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1513          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1514          * IPV6_MAXPLEN is also valid and means: "any MSS,
1515          * rely only on pmtu discovery"
1516          */
1517         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1518                 mtu = IPV6_MAXPLEN;
1519         return mtu;
1520 }
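/*
 * A worked advmss example (illustrative numbers): with a path MTU of
 * 1500 the advertised MSS becomes 1500 - 40 (ipv6hdr) - 20 (tcphdr) =
 * 1440; it is then raised to ip6_rt_min_advmss if smaller, and MTUs
 * large enough to exceed IPV6_MAXPLEN minus the TCP header collapse to
 * IPV6_MAXPLEN, i.e. "any MSS, rely on PMTU discovery".
 */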
1521
1522 static unsigned int ip6_mtu(const struct dst_entry *dst)
1523 {
1524         const struct rt6_info *rt = (const struct rt6_info *)dst;
1525         unsigned int mtu = rt->rt6i_pmtu;
1526         struct inet6_dev *idev;
1527
1528         if (mtu)
1529                 goto out;
1530
1531         mtu = dst_metric_raw(dst, RTAX_MTU);
1532         if (mtu)
1533                 goto out;
1534
1535         mtu = IPV6_MIN_MTU;
1536
1537         rcu_read_lock();
1538         idev = __in6_dev_get(dst->dev);
1539         if (idev)
1540                 mtu = idev->cnf.mtu6;
1541         rcu_read_unlock();
1542
1543 out:
1544         return min_t(unsigned int, mtu, IP6_MAX_MTU);
1545 }
1546
1547 static struct dst_entry *icmp6_dst_gc_list;
1548 static DEFINE_SPINLOCK(icmp6_dst_lock);
1549
1550 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1551                                   struct flowi6 *fl6)
1552 {
1553         struct dst_entry *dst;
1554         struct rt6_info *rt;
1555         struct inet6_dev *idev = in6_dev_get(dev);
1556         struct net *net = dev_net(dev);
1557
1558         if (unlikely(!idev))
1559                 return ERR_PTR(-ENODEV);
1560
1561         rt = ip6_dst_alloc(net, dev, 0, NULL);
1562         if (unlikely(!rt)) {
1563                 in6_dev_put(idev);
1564                 dst = ERR_PTR(-ENOMEM);
1565                 goto out;
1566         }
1567
1568         rt->dst.flags |= DST_HOST;
1569         rt->dst.output  = ip6_output;
1570         atomic_set(&rt->dst.__refcnt, 1);
1571         rt->rt6i_gateway  = fl6->daddr;
1572         rt->rt6i_dst.addr = fl6->daddr;
1573         rt->rt6i_dst.plen = 128;
1574         rt->rt6i_idev     = idev;
1575         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1576
1577         spin_lock_bh(&icmp6_dst_lock);
1578         rt->dst.next = icmp6_dst_gc_list;
1579         icmp6_dst_gc_list = &rt->dst;
1580         spin_unlock_bh(&icmp6_dst_lock);
1581
1582         fib6_force_start_gc(net);
1583
1584         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1585
1586 out:
1587         return dst;
1588 }
1589
1590 int icmp6_dst_gc(void)
1591 {
1592         struct dst_entry *dst, **pprev;
1593         int more = 0;
1594
1595         spin_lock_bh(&icmp6_dst_lock);
1596         pprev = &icmp6_dst_gc_list;
1597
1598         while ((dst = *pprev) != NULL) {
1599                 if (!atomic_read(&dst->__refcnt)) {
1600                         *pprev = dst->next;
1601                         dst_free(dst);
1602                 } else {
1603                         pprev = &dst->next;
1604                         ++more;
1605                 }
1606         }
1607
1608         spin_unlock_bh(&icmp6_dst_lock);
1609
1610         return more;
1611 }
1612
1613 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1614                             void *arg)
1615 {
1616         struct dst_entry *dst, **pprev;
1617
1618         spin_lock_bh(&icmp6_dst_lock);
1619         pprev = &icmp6_dst_gc_list;
1620         while ((dst = *pprev) != NULL) {
1621                 struct rt6_info *rt = (struct rt6_info *) dst;
1622                 if (func(rt, arg)) {
1623                         *pprev = dst->next;
1624                         dst_free(dst);
1625                 } else {
1626                         pprev = &dst->next;
1627                 }
1628         }
1629         spin_unlock_bh(&icmp6_dst_lock);
1630 }
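
/*
 * icmp6_dst_gc() and icmp6_clean_all() above both walk the icmp6 dst list
 * through a pointer-to-pointer ("pprev"), so an entry can be unlinked in
 * place without tracking a separate previous node.  The same idiom on a toy,
 * self-contained list type (nothing below is kernel API; it assumes
 * <stdlib.h> for free()):
 *
 *        struct node { int val; struct node *next; };
 *
 *        static void remove_matching(struct node **head, int val)
 *        {
 *                struct node **pprev = head, *n;
 *
 *                while ((n = *pprev) != NULL) {
 *                        if (n->val == val) {
 *                                *pprev = n->next;       // unlink; pprev stays put
 *                                free(n);
 *                        } else {
 *                                pprev = &n->next;       // keep entry, advance
 *                        }
 *                }
 *        }
 */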
1631
1632 static int ip6_dst_gc(struct dst_ops *ops)
1633 {
1634         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1635         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1636         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1637         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1638         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1639         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1640         int entries;
1641
1642         entries = dst_entries_get_fast(ops);
1643         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1644             entries <= rt_max_size)
1645                 goto out;
1646
1647         net->ipv6.ip6_rt_gc_expire++;
1648         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1649         entries = dst_entries_get_slow(ops);
1650         if (entries < ops->gc_thresh)
1651                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1652 out:
1653         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1654         return entries > rt_max_size;
1655 }
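
/*
 * ip6_rt_gc_expire above behaves as an adaptive GC interval.  On a pass where
 * garbage collection actually runs, it is bumped by one, reset to half the
 * gc_timeout once the table drops back under gc_thresh, and then decays
 * geometrically by (expire >> elasticity).  Just that arithmetic, as a
 * self-contained helper with the sysctls passed in as plain parameters:
 *
 *        static int gc_expire_step(int expire, int entries, int gc_thresh,
 *                                  int gc_timeout, int elasticity)
 *        {
 *                expire++;                               // under pressure: collect harder
 *                if (entries < gc_thresh)
 *                        expire = gc_timeout >> 1;       // back under threshold: relax
 *                return expire - (expire >> elasticity); // geometric decay
 *        }
 */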
1656
1657 static int ip6_convert_metrics(struct mx6_config *mxc,
1658                                const struct fib6_config *cfg)
1659 {
1660         struct nlattr *nla;
1661         int remaining;
1662         u32 *mp;
1663
1664         if (!cfg->fc_mx)
1665                 return 0;
1666
1667         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1668         if (unlikely(!mp))
1669                 return -ENOMEM;
1670
1671         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1672                 int type = nla_type(nla);
1673
1674                 if (type) {
1675                         u32 val;
1676
1677                         if (unlikely(type > RTAX_MAX))
1678                                 goto err;
1679                         if (type == RTAX_CC_ALGO) {
1680                                 char tmp[TCP_CA_NAME_MAX];
1681
1682                                 nla_strlcpy(tmp, nla, sizeof(tmp));
1683                                 val = tcp_ca_get_key_by_name(tmp);
1684                                 if (val == TCP_CA_UNSPEC)
1685                                         goto err;
1686                         } else {
1687                                 val = nla_get_u32(nla);
1688                         }
1689
1690                         mp[type - 1] = val;
1691                         __set_bit(type - 1, mxc->mx_valid);
1692                 }
1693         }
1694
1695         mxc->mx = mp;
1696
1697         return 0;
1698  err:
1699         kfree(mp);
1700         return -EINVAL;
1701 }
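
/*
 * ip6_convert_metrics() above flattens the RTA_METRICS nest into a fixed
 * RTAX_MAX-sized array indexed by (type - 1) plus a validity bitmap, so later
 * code never has to re-walk netlink attributes.  The same fill pattern on
 * plain (type, value) pairs, self-contained and without netlink; MAX_METRICS
 * and the bitmap word are stand-ins for RTAX_MAX and mx_valid:
 *
 *        #define MAX_METRICS 16
 *
 *        struct metrics {
 *                unsigned int val[MAX_METRICS];
 *                unsigned long valid;            // bit i set => val[i] is meaningful
 *        };
 *
 *        static int set_metric(struct metrics *m, unsigned int type, unsigned int v)
 *        {
 *                if (type == 0 || type > MAX_METRICS)
 *                        return -1;              // unknown attribute type, like -EINVAL
 *                m->val[type - 1] = v;
 *                m->valid |= 1UL << (type - 1);
 *                return 0;
 *        }
 */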
1702
1703 int ip6_route_add(struct fib6_config *cfg)
1704 {
1705         int err;
1706         struct net *net = cfg->fc_nlinfo.nl_net;
1707         struct rt6_info *rt = NULL;
1708         struct net_device *dev = NULL;
1709         struct inet6_dev *idev = NULL;
1710         struct fib6_table *table;
1711         struct mx6_config mxc = { .mx = NULL, };
1712         int addr_type;
1713
1714         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1715                 return -EINVAL;
1716 #ifndef CONFIG_IPV6_SUBTREES
1717         if (cfg->fc_src_len)
1718                 return -EINVAL;
1719 #endif
1720         if (cfg->fc_ifindex) {
1721                 err = -ENODEV;
1722                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1723                 if (!dev)
1724                         goto out;
1725                 idev = in6_dev_get(dev);
1726                 if (!idev)
1727                         goto out;
1728         }
1729
1730         if (cfg->fc_metric == 0)
1731                 cfg->fc_metric = IP6_RT_PRIO_USER;
1732
1733         err = -ENOBUFS;
1734         if (cfg->fc_nlinfo.nlh &&
1735             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1736                 table = fib6_get_table(net, cfg->fc_table);
1737                 if (!table) {
1738                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1739                         table = fib6_new_table(net, cfg->fc_table);
1740                 }
1741         } else {
1742                 table = fib6_new_table(net, cfg->fc_table);
1743         }
1744
1745         if (!table)
1746                 goto out;
1747
1748         rt = ip6_dst_alloc(net, NULL, (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT, table);
1749
1750         if (!rt) {
1751                 err = -ENOMEM;
1752                 goto out;
1753         }
1754
1755         if (cfg->fc_flags & RTF_EXPIRES)
1756                 rt6_set_expires(rt, jiffies +
1757                                 clock_t_to_jiffies(cfg->fc_expires));
1758         else
1759                 rt6_clean_expires(rt);
1760
1761         if (cfg->fc_protocol == RTPROT_UNSPEC)
1762                 cfg->fc_protocol = RTPROT_BOOT;
1763         rt->rt6i_protocol = cfg->fc_protocol;
1764
1765         addr_type = ipv6_addr_type(&cfg->fc_dst);
1766
1767         if (addr_type & IPV6_ADDR_MULTICAST)
1768                 rt->dst.input = ip6_mc_input;
1769         else if (cfg->fc_flags & RTF_LOCAL)
1770                 rt->dst.input = ip6_input;
1771         else
1772                 rt->dst.input = ip6_forward;
1773
1774         rt->dst.output = ip6_output;
1775
1776         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1777         rt->rt6i_dst.plen = cfg->fc_dst_len;
1778         if (rt->rt6i_dst.plen == 128)
1779                 rt->dst.flags |= DST_HOST;
1780
1781 #ifdef CONFIG_IPV6_SUBTREES
1782         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1783         rt->rt6i_src.plen = cfg->fc_src_len;
1784 #endif
1785
1786         rt->rt6i_metric = cfg->fc_metric;
1787
1788         /* We cannot add true routes via loopback here;
1789            they would result in kernel looping.  Promote them to reject routes.
1790          */
1791         if ((cfg->fc_flags & RTF_REJECT) ||
1792             (dev && (dev->flags & IFF_LOOPBACK) &&
1793              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1794              !(cfg->fc_flags & RTF_LOCAL))) {
1795                 /* hold loopback dev/idev if we haven't done so. */
1796                 if (dev != net->loopback_dev) {
1797                         if (dev) {
1798                                 dev_put(dev);
1799                                 in6_dev_put(idev);
1800                         }
1801                         dev = net->loopback_dev;
1802                         dev_hold(dev);
1803                         idev = in6_dev_get(dev);
1804                         if (!idev) {
1805                                 err = -ENODEV;
1806                                 goto out;
1807                         }
1808                 }
1809                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1810                 switch (cfg->fc_type) {
1811                 case RTN_BLACKHOLE:
1812                         rt->dst.error = -EINVAL;
1813                         rt->dst.output = dst_discard_sk;
1814                         rt->dst.input = dst_discard;
1815                         break;
1816                 case RTN_PROHIBIT:
1817                         rt->dst.error = -EACCES;
1818                         rt->dst.output = ip6_pkt_prohibit_out;
1819                         rt->dst.input = ip6_pkt_prohibit;
1820                         break;
1821                 case RTN_THROW:
1822                 default:
1823                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1824                                         : -ENETUNREACH;
1825                         rt->dst.output = ip6_pkt_discard_out;
1826                         rt->dst.input = ip6_pkt_discard;
1827                         break;
1828                 }
1829                 goto install_route;
1830         }
1831
1832         if (cfg->fc_flags & RTF_GATEWAY) {
1833                 const struct in6_addr *gw_addr;
1834                 int gwa_type;
1835
1836                 gw_addr = &cfg->fc_gateway;
1837
1838                 /* If gw_addr is local we will fail to detect this here when the
1839                  * address is still TENTATIVE (DAD in progress).  rt6_lookup() will
1840                  * then return the already-added prefix route via the interface the
1841                  * prefix route was assigned to, which might not be loopback.
1842                  */
1843                 err = -EINVAL;
1844                 if (ipv6_chk_addr_and_flags(net, gw_addr, NULL, 0, 0))
1845                         goto out;
1846
1847                 rt->rt6i_gateway = *gw_addr;
1848                 gwa_type = ipv6_addr_type(gw_addr);
1849
1850                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1851                         struct rt6_info *grt;
1852
1853                         /* IPv6 strictly prohibits using non-link-local
1854                            addresses as the nexthop address.
1855                            Otherwise, the router will not be able to send redirects.
1856                            That is a good rule, but in some (rare!) circumstances
1857                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1858                            some exceptions. --ANK
1859                          */
1860                         if (!(gwa_type & IPV6_ADDR_UNICAST))
1861                                 goto out;
1862
1863                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1864
1865                         err = -EHOSTUNREACH;
1866                         if (!grt)
1867                                 goto out;
1868                         if (dev) {
1869                                 if (dev != grt->dst.dev) {
1870                                         ip6_rt_put(grt);
1871                                         goto out;
1872                                 }
1873                         } else {
1874                                 dev = grt->dst.dev;
1875                                 idev = grt->rt6i_idev;
1876                                 dev_hold(dev);
1877                                 in6_dev_hold(grt->rt6i_idev);
1878                         }
1879                         if (!(grt->rt6i_flags & RTF_GATEWAY))
1880                                 err = 0;
1881                         ip6_rt_put(grt);
1882
1883                         if (err)
1884                                 goto out;
1885                 }
1886                 err = -EINVAL;
1887                 if (!dev || (dev->flags & IFF_LOOPBACK))
1888                         goto out;
1889         }
1890
1891         err = -ENODEV;
1892         if (!dev)
1893                 goto out;
1894
1895         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1896                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1897                         err = -EINVAL;
1898                         goto out;
1899                 }
1900                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1901                 rt->rt6i_prefsrc.plen = 128;
1902         } else
1903                 rt->rt6i_prefsrc.plen = 0;
1904
1905         rt->rt6i_flags = cfg->fc_flags;
1906
1907 install_route:
1908         rt->dst.dev = dev;
1909         rt->rt6i_idev = idev;
1910         rt->rt6i_table = table;
1911
1912         cfg->fc_nlinfo.nl_net = dev_net(dev);
1913
1914         err = ip6_convert_metrics(&mxc, cfg);
1915         if (err)
1916                 goto out;
1917
1918         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
1919
1920         kfree(mxc.mx);
1921         return err;
1922 out:
1923         if (dev)
1924                 dev_put(dev);
1925         if (idev)
1926                 in6_dev_put(idev);
1927         if (rt)
1928                 dst_free(&rt->dst);
1929         return err;
1930 }
1931
1932 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1933 {
1934         int err;
1935         struct fib6_table *table;
1936         struct net *net = dev_net(rt->dst.dev);
1937
1938         if (rt == net->ipv6.ip6_null_entry) {
1939                 err = -ENOENT;
1940                 goto out;
1941         }
1942
1943         table = rt->rt6i_table;
1944         write_lock_bh(&table->tb6_lock);
1945         err = fib6_del(rt, info);
1946         write_unlock_bh(&table->tb6_lock);
1947
1948 out:
1949         ip6_rt_put(rt);
1950         return err;
1951 }
1952
1953 int ip6_del_rt(struct rt6_info *rt)
1954 {
1955         struct nl_info info = {
1956                 .nl_net = dev_net(rt->dst.dev),
1957         };
1958         return __ip6_del_rt(rt, &info);
1959 }
1960
1961 static int ip6_route_del(struct fib6_config *cfg)
1962 {
1963         struct fib6_table *table;
1964         struct fib6_node *fn;
1965         struct rt6_info *rt;
1966         int err = -ESRCH;
1967
1968         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1969         if (!table)
1970                 return err;
1971
1972         read_lock_bh(&table->tb6_lock);
1973
1974         fn = fib6_locate(&table->tb6_root,
1975                          &cfg->fc_dst, cfg->fc_dst_len,
1976                          &cfg->fc_src, cfg->fc_src_len);
1977
1978         if (fn) {
1979                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1980                         if ((rt->rt6i_flags & RTF_CACHE) &&
1981                             !(cfg->fc_flags & RTF_CACHE))
1982                                 continue;
1983                         if (cfg->fc_ifindex &&
1984                             (!rt->dst.dev ||
1985                              rt->dst.dev->ifindex != cfg->fc_ifindex))
1986                                 continue;
1987                         if (cfg->fc_flags & RTF_GATEWAY &&
1988                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1989                                 continue;
1990                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1991                                 continue;
1992                         dst_hold(&rt->dst);
1993                         read_unlock_bh(&table->tb6_lock);
1994
1995                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1996                 }
1997         }
1998         read_unlock_bh(&table->tb6_lock);
1999
2000         return err;
2001 }
2002
2003 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2004 {
2005         struct net *net = dev_net(skb->dev);
2006         struct netevent_redirect netevent;
2007         struct rt6_info *rt, *nrt = NULL;
2008         struct ndisc_options ndopts;
2009         struct inet6_dev *in6_dev;
2010         struct neighbour *neigh;
2011         struct rd_msg *msg;
2012         int optlen, on_link;
2013         u8 *lladdr;
2014
2015         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2016         optlen -= sizeof(*msg);
2017
2018         if (optlen < 0) {
2019                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2020                 return;
2021         }
2022
2023         msg = (struct rd_msg *)icmp6_hdr(skb);
2024
2025         if (ipv6_addr_is_multicast(&msg->dest)) {
2026                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2027                 return;
2028         }
2029
2030         on_link = 0;
2031         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2032                 on_link = 1;
2033         } else if (ipv6_addr_type(&msg->target) !=
2034                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2035                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2036                 return;
2037         }
2038
2039         in6_dev = __in6_dev_get(skb->dev);
2040         if (!in6_dev)
2041                 return;
2042         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2043                 return;
2044
2045         /* RFC2461 8.1:
2046          *      The IP source address of the Redirect MUST be the same as the current
2047          *      first-hop router for the specified ICMP Destination Address.
2048          */
2049
2050         if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
2051                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2052                 return;
2053         }
2054
2055         lladdr = NULL;
2056         if (ndopts.nd_opts_tgt_lladdr) {
2057                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2058                                              skb->dev);
2059                 if (!lladdr) {
2060                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2061                         return;
2062                 }
2063         }
2064
2065         rt = (struct rt6_info *) dst;
2066         if (rt == net->ipv6.ip6_null_entry) {
2067                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2068                 return;
2069         }
2070
2071         /* Redirect received -> path was valid.
2072          * Look, redirects are sent only in response to data packets,
2073          * so this nexthop is apparently reachable. --ANK
2074          */
2075         dst_confirm(&rt->dst);
2076
2077         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2078         if (!neigh)
2079                 return;
2080
2081         /*
2082          *      We have finally decided to accept it.
2083          */
2084
2085         neigh_update(neigh, lladdr, NUD_STALE,
2086                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2087                      NEIGH_UPDATE_F_OVERRIDE|
2088                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2089                                      NEIGH_UPDATE_F_ISROUTER))
2090                      );
2091
2092         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2093         if (!nrt)
2094                 goto out;
2095
2096         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2097         if (on_link)
2098                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2099
2100         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2101
2102         if (ip6_ins_rt(nrt))
2103                 goto out;
2104
2105         netevent.old = &rt->dst;
2106         netevent.new = &nrt->dst;
2107         netevent.daddr = &msg->dest;
2108         netevent.neigh = neigh;
2109         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2110
2111         if (rt->rt6i_flags & RTF_CACHE) {
2112                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2113                 ip6_del_rt(rt);
2114         }
2115
2116 out:
2117         neigh_release(neigh);
2118 }
2119
2120 /*
2121  *      Misc support functions
2122  */
2123
2124 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2125 {
2126         BUG_ON(from->dst.from);
2127
2128         rt->rt6i_flags &= ~RTF_EXPIRES;
2129         dst_hold(&from->dst);
2130         rt->dst.from = &from->dst;
2131         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2132 }
2133
2134 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2135 {
2136         rt->dst.input = ort->dst.input;
2137         rt->dst.output = ort->dst.output;
2138         rt->rt6i_dst = ort->rt6i_dst;
2139         rt->dst.error = ort->dst.error;
2140         rt->rt6i_idev = ort->rt6i_idev;
2141         if (rt->rt6i_idev)
2142                 in6_dev_hold(rt->rt6i_idev);
2143         rt->dst.lastuse = jiffies;
2144         rt->rt6i_gateway = ort->rt6i_gateway;
2145         rt->rt6i_flags = ort->rt6i_flags;
2146         rt6_set_from(rt, ort);
2147         rt->rt6i_metric = ort->rt6i_metric;
2148 #ifdef CONFIG_IPV6_SUBTREES
2149         rt->rt6i_src = ort->rt6i_src;
2150 #endif
2151         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2152         rt->rt6i_table = ort->rt6i_table;
2153 }
2154
2155 #ifdef CONFIG_IPV6_ROUTE_INFO
2156 static struct rt6_info *rt6_get_route_info(struct net *net,
2157                                            const struct in6_addr *prefix, int prefixlen,
2158                                            const struct in6_addr *gwaddr, int ifindex)
2159 {
2160         struct fib6_node *fn;
2161         struct rt6_info *rt = NULL;
2162         struct fib6_table *table;
2163
2164         table = fib6_get_table(net, RT6_TABLE_INFO);
2165         if (!table)
2166                 return NULL;
2167
2168         read_lock_bh(&table->tb6_lock);
2169         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2170         if (!fn)
2171                 goto out;
2172
2173         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2174                 if (rt->dst.dev->ifindex != ifindex)
2175                         continue;
2176                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2177                         continue;
2178                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2179                         continue;
2180                 dst_hold(&rt->dst);
2181                 break;
2182         }
2183 out:
2184         read_unlock_bh(&table->tb6_lock);
2185         return rt;
2186 }
2187
2188 static struct rt6_info *rt6_add_route_info(struct net *net,
2189                                            const struct in6_addr *prefix, int prefixlen,
2190                                            const struct in6_addr *gwaddr, int ifindex,
2191                                            unsigned int pref)
2192 {
2193         struct fib6_config cfg = {
2194                 .fc_table       = RT6_TABLE_INFO,
2195                 .fc_metric      = IP6_RT_PRIO_USER,
2196                 .fc_ifindex     = ifindex,
2197                 .fc_dst_len     = prefixlen,
2198                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2199                                   RTF_UP | RTF_PREF(pref),
2200                 .fc_nlinfo.portid = 0,
2201                 .fc_nlinfo.nlh = NULL,
2202                 .fc_nlinfo.nl_net = net,
2203         };
2204
2205         cfg.fc_dst = *prefix;
2206         cfg.fc_gateway = *gwaddr;
2207
2208         /* We should treat it as a default route if prefix length is 0. */
2209         if (!prefixlen)
2210                 cfg.fc_flags |= RTF_DEFAULT;
2211
2212         ip6_route_add(&cfg);
2213
2214         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
2215 }
2216 #endif
2217
2218 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2219 {
2220         struct rt6_info *rt;
2221         struct fib6_table *table;
2222
2223         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
2224         if (!table)
2225                 return NULL;
2226
2227         read_lock_bh(&table->tb6_lock);
2228         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2229                 if (dev == rt->dst.dev &&
2230                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2231                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2232                         break;
2233         }
2234         if (rt)
2235                 dst_hold(&rt->dst);
2236         read_unlock_bh(&table->tb6_lock);
2237         return rt;
2238 }
2239
2240 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2241                                      struct net_device *dev,
2242                                      unsigned int pref)
2243 {
2244         struct fib6_config cfg = {
2245                 .fc_table       = RT6_TABLE_DFLT,
2246                 .fc_metric      = IP6_RT_PRIO_USER,
2247                 .fc_ifindex     = dev->ifindex,
2248                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2249                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2250                 .fc_nlinfo.portid = 0,
2251                 .fc_nlinfo.nlh = NULL,
2252                 .fc_nlinfo.nl_net = dev_net(dev),
2253         };
2254
2255         cfg.fc_gateway = *gwaddr;
2256
2257         ip6_route_add(&cfg);
2258
2259         return rt6_get_dflt_router(gwaddr, dev);
2260 }
2261
2262 void rt6_purge_dflt_routers(struct net *net)
2263 {
2264         struct rt6_info *rt;
2265         struct fib6_table *table;
2266
2267         /* NOTE: Keep consistent with rt6_get_dflt_router */
2268         table = fib6_get_table(net, RT6_TABLE_DFLT);
2269         if (!table)
2270                 return;
2271
2272 restart:
2273         read_lock_bh(&table->tb6_lock);
2274         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2275                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2276                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2277                         dst_hold(&rt->dst);
2278                         read_unlock_bh(&table->tb6_lock);
2279                         ip6_del_rt(rt);
2280                         goto restart;
2281                 }
2282         }
2283         read_unlock_bh(&table->tb6_lock);
2284 }
2285
2286 static void rtmsg_to_fib6_config(struct net *net,
2287                                  struct in6_rtmsg *rtmsg,
2288                                  struct fib6_config *cfg)
2289 {
2290         memset(cfg, 0, sizeof(*cfg));
2291
2292         cfg->fc_table = RT6_TABLE_MAIN;
2293         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2294         cfg->fc_metric = rtmsg->rtmsg_metric;
2295         cfg->fc_expires = rtmsg->rtmsg_info;
2296         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2297         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2298         cfg->fc_flags = rtmsg->rtmsg_flags;
2299
2300         cfg->fc_nlinfo.nl_net = net;
2301
2302         cfg->fc_dst = rtmsg->rtmsg_dst;
2303         cfg->fc_src = rtmsg->rtmsg_src;
2304         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2305 }
2306
2307 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2308 {
2309         struct fib6_config cfg;
2310         struct in6_rtmsg rtmsg;
2311         int err;
2312
2313         switch (cmd) {
2314         case SIOCADDRT:         /* Add a route */
2315         case SIOCDELRT:         /* Delete a route */
2316                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2317                         return -EPERM;
2318                 err = copy_from_user(&rtmsg, arg,
2319                                      sizeof(struct in6_rtmsg));
2320                 if (err)
2321                         return -EFAULT;
2322
2323                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2324
2325                 rtnl_lock();
2326                 switch (cmd) {
2327                 case SIOCADDRT:
2328                         err = ip6_route_add(&cfg);
2329                         break;
2330                 case SIOCDELRT:
2331                         err = ip6_route_del(&cfg);
2332                         break;
2333                 default:
2334                         err = -EINVAL;
2335                 }
2336                 rtnl_unlock();
2337
2338                 return err;
2339         }
2340
2341         return -EINVAL;
2342 }
2343
2344 /*
2345  *      Drop the packet on the floor
2346  */
2347
2348 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2349 {
2350         int type;
2351         struct dst_entry *dst = skb_dst(skb);
2352         switch (ipstats_mib_noroutes) {
2353         case IPSTATS_MIB_INNOROUTES:
2354                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2355                 if (type == IPV6_ADDR_ANY) {
2356                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2357                                       IPSTATS_MIB_INADDRERRORS);
2358                         break;
2359                 }
2360                 /* FALLTHROUGH */
2361         case IPSTATS_MIB_OUTNOROUTES:
2362                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2363                               ipstats_mib_noroutes);
2364                 break;
2365         }
2366         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2367         kfree_skb(skb);
2368         return 0;
2369 }
2370
2371 static int ip6_pkt_discard(struct sk_buff *skb)
2372 {
2373         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2374 }
2375
2376 static int ip6_pkt_discard_out(struct sock *sk, struct sk_buff *skb)
2377 {
2378         skb->dev = skb_dst(skb)->dev;
2379         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2380 }
2381
2382 static int ip6_pkt_prohibit(struct sk_buff *skb)
2383 {
2384         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2385 }
2386
2387 static int ip6_pkt_prohibit_out(struct sock *sk, struct sk_buff *skb)
2388 {
2389         skb->dev = skb_dst(skb)->dev;
2390         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2391 }
2392
2393 /*
2394  *      Allocate a dst for local (unicast / anycast) address.
2395  */
2396
2397 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2398                                     const struct in6_addr *addr,
2399                                     bool anycast)
2400 {
2401         struct net *net = dev_net(idev->dev);
2402         struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev,
2403                                             DST_NOCOUNT, NULL);
2404         if (!rt)
2405                 return ERR_PTR(-ENOMEM);
2406
2407         in6_dev_hold(idev);
2408
2409         rt->dst.flags |= DST_HOST;
2410         rt->dst.input = ip6_input;
2411         rt->dst.output = ip6_output;
2412         rt->rt6i_idev = idev;
2413
2414         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2415         if (anycast)
2416                 rt->rt6i_flags |= RTF_ANYCAST;
2417         else
2418                 rt->rt6i_flags |= RTF_LOCAL;
2419
2420         rt->rt6i_gateway  = *addr;
2421         rt->rt6i_dst.addr = *addr;
2422         rt->rt6i_dst.plen = 128;
2423         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2424
2425         atomic_set(&rt->dst.__refcnt, 1);
2426
2427         return rt;
2428 }
2429
2430 int ip6_route_get_saddr(struct net *net,
2431                         struct rt6_info *rt,
2432                         const struct in6_addr *daddr,
2433                         unsigned int prefs,
2434                         struct in6_addr *saddr)
2435 {
2436         struct inet6_dev *idev =
2437                 rt ? ip6_dst_idev((struct dst_entry *)rt) : NULL;
2438         int err = 0;
2439         if (rt && rt->rt6i_prefsrc.plen)
2440                 *saddr = rt->rt6i_prefsrc.addr;
2441         else
2442                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2443                                          daddr, prefs, saddr);
2444         return err;
2445 }
2446
2447 /* remove deleted ip from prefsrc entries */
2448 struct arg_dev_net_ip {
2449         struct net_device *dev;
2450         struct net *net;
2451         struct in6_addr *addr;
2452 };
2453
2454 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2455 {
2456         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2457         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2458         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2459
2460         if (((void *)rt->dst.dev == dev || !dev) &&
2461             rt != net->ipv6.ip6_null_entry &&
2462             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2463                 /* remove prefsrc entry */
2464                 rt->rt6i_prefsrc.plen = 0;
2465         }
2466         return 0;
2467 }
2468
2469 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2470 {
2471         struct net *net = dev_net(ifp->idev->dev);
2472         struct arg_dev_net_ip adni = {
2473                 .dev = ifp->idev->dev,
2474                 .net = net,
2475                 .addr = &ifp->addr,
2476         };
2477         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2478 }
2479
2480 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2481 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2482
2483 /* Remove routers and update dst entries when gateway turn into host. */
2484 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2485 {
2486         struct in6_addr *gateway = (struct in6_addr *)arg;
2487
2488         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2489              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2490              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2491                 return -1;
2492         }
2493         return 0;
2494 }
2495
2496 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2497 {
2498         fib6_clean_all(net, fib6_clean_tohost, gateway);
2499 }
2500
2501 struct arg_dev_net {
2502         struct net_device *dev;
2503         struct net *net;
2504 };
2505
2506 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2507 {
2508         const struct arg_dev_net *adn = arg;
2509         const struct net_device *dev = adn->dev;
2510
2511         if ((rt->dst.dev == dev || !dev) &&
2512             rt != adn->net->ipv6.ip6_null_entry)
2513                 return -1;
2514
2515         return 0;
2516 }
2517
2518 void rt6_ifdown(struct net *net, struct net_device *dev)
2519 {
2520         struct arg_dev_net adn = {
2521                 .dev = dev,
2522                 .net = net,
2523         };
2524
2525         fib6_clean_all(net, fib6_ifdown, &adn);
2526         icmp6_clean_all(fib6_ifdown, &adn);
2527         rt6_uncached_list_flush_dev(net, dev);
2528 }
2529
2530 struct rt6_mtu_change_arg {
2531         struct net_device *dev;
2532         unsigned int mtu;
2533 };
2534
2535 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2536 {
2537         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2538         struct inet6_dev *idev;
2539
2540         /* In IPv6, PMTU discovery is not optional,
2541            so an RTAX_MTU lock cannot disable it.
2542            We still use this lock to block changes
2543            caused by addrconf/ndisc.
2544         */
2545
2546         idev = __in6_dev_get(arg->dev);
2547         if (!idev)
2548                 return 0;
2549
2550         /* For an administrative MTU increase there is no way to discover
2551            an IPv6 PMTU increase, so the PMTU must be updated here.
2552            Since RFC 1981 doesn't cover administrative MTU increases,
2553            updating the PMTU here is a MUST (e.g. for jumbo frames).
2554          */
2555         /*
2556            If the new MTU is less than the route PMTU, the new MTU will be
2557            the lowest MTU in the path; update the route PMTU to reflect the
2558            decrease.  If the new MTU is greater than the route PMTU, and the
2559            old MTU was the lowest MTU in the path, update the route PMTU to
2560            reflect the increase.  In this case, if another node's MTU is now
2561            the lowest in the path, a Packet Too Big message will trigger PMTU
2562            discovery again.
2563          */
2564         if (rt->dst.dev == arg->dev &&
2565             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2566                 if (rt->rt6i_flags & RTF_CACHE) {
2567                         /* For RTF_CACHE with rt6i_pmtu == 0
2568                          * (i.e. a redirected route),
2569                          * the metrics of its rt->dst.from has already
2570                          * been updated.
2571                          */
2572                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2573                                 rt->rt6i_pmtu = arg->mtu;
2574                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2575                            (dst_mtu(&rt->dst) < arg->mtu &&
2576                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2577                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2578                 }
2579         }
2580         return 0;
2581 }
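
/*
 * For a non-cached route above, RTAX_MTU is rewritten when the new device MTU
 * is a decrease, or when it is an increase and the route MTU equalled the
 * interface MTU (i.e. the route was only ever limited by the device).  The
 * decision on its own, with route_mtu, new_mtu and dev_mtu corresponding to
 * dst_mtu(&rt->dst), arg->mtu and idev->cnf.mtu6:
 *
 *        static int should_update_mtu(unsigned int route_mtu, unsigned int new_mtu,
 *                                     unsigned int dev_mtu)
 *        {
 *                return route_mtu >= new_mtu ||          // decrease: always take it
 *                       (route_mtu < new_mtu &&          // increase: only if the
 *                        route_mtu == dev_mtu);          // device was the limit
 *        }
 */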
2582
2583 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2584 {
2585         struct rt6_mtu_change_arg arg = {
2586                 .dev = dev,
2587                 .mtu = mtu,
2588         };
2589
2590         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2591 }
2592
2593 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2594         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2595         [RTA_OIF]               = { .type = NLA_U32 },
2596         [RTA_IIF]               = { .type = NLA_U32 },
2597         [RTA_PRIORITY]          = { .type = NLA_U32 },
2598         [RTA_METRICS]           = { .type = NLA_NESTED },
2599         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2600         [RTA_PREF]              = { .type = NLA_U8 },
2601 };
2602
2603 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2604                               struct fib6_config *cfg)
2605 {
2606         struct rtmsg *rtm;
2607         struct nlattr *tb[RTA_MAX+1];
2608         unsigned int pref;
2609         int err;
2610
2611         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2612         if (err < 0)
2613                 goto errout;
2614
2615         err = -EINVAL;
2616         rtm = nlmsg_data(nlh);
2617         memset(cfg, 0, sizeof(*cfg));
2618
2619         cfg->fc_table = rtm->rtm_table;
2620         cfg->fc_dst_len = rtm->rtm_dst_len;
2621         cfg->fc_src_len = rtm->rtm_src_len;
2622         cfg->fc_flags = RTF_UP;
2623         cfg->fc_protocol = rtm->rtm_protocol;
2624         cfg->fc_type = rtm->rtm_type;
2625
2626         if (rtm->rtm_type == RTN_UNREACHABLE ||
2627             rtm->rtm_type == RTN_BLACKHOLE ||
2628             rtm->rtm_type == RTN_PROHIBIT ||
2629             rtm->rtm_type == RTN_THROW)
2630                 cfg->fc_flags |= RTF_REJECT;
2631
2632         if (rtm->rtm_type == RTN_LOCAL)
2633                 cfg->fc_flags |= RTF_LOCAL;
2634
2635         if (rtm->rtm_flags & RTM_F_CLONED)
2636                 cfg->fc_flags |= RTF_CACHE;
2637
2638         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2639         cfg->fc_nlinfo.nlh = nlh;
2640         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2641
2642         if (tb[RTA_GATEWAY]) {
2643                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2644                 cfg->fc_flags |= RTF_GATEWAY;
2645         }
2646
2647         if (tb[RTA_DST]) {
2648                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2649
2650                 if (nla_len(tb[RTA_DST]) < plen)
2651                         goto errout;
2652
2653                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2654         }
2655
2656         if (tb[RTA_SRC]) {
2657                 int plen = (rtm->rtm_src_len + 7) >> 3;
2658
2659                 if (nla_len(tb[RTA_SRC]) < plen)
2660                         goto errout;
2661
2662                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2663         }
2664
2665         if (tb[RTA_PREFSRC])
2666                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2667
2668         if (tb[RTA_OIF])
2669                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2670
2671         if (tb[RTA_PRIORITY])
2672                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2673
2674         if (tb[RTA_METRICS]) {
2675                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2676                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2677         }
2678
2679         if (tb[RTA_TABLE])
2680                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2681
2682         if (tb[RTA_MULTIPATH]) {
2683                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2684                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2685         }
2686
2687         if (tb[RTA_PREF]) {
2688                 pref = nla_get_u8(tb[RTA_PREF]);
2689                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2690                     pref != ICMPV6_ROUTER_PREF_HIGH)
2691                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
2692                 cfg->fc_flags |= RTF_PREF(pref);
2693         }
2694
2695         err = 0;
2696 errout:
2697         return err;
2698 }
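
/*
 * RTA_DST and RTA_SRC above are copied with (rtm_*_len + 7) >> 3 bytes: the
 * attribute only has to carry enough octets to cover the prefix length.  A
 * self-contained check of that rounding (the test values are illustrative):
 *
 *        #include <assert.h>
 *
 *        static unsigned int prefix_bytes(unsigned int plen)
 *        {
 *                return (plen + 7) >> 3;         // round bits up to whole octets
 *        }
 *
 *        int main(void)
 *        {
 *                assert(prefix_bytes(0)   == 0);
 *                assert(prefix_bytes(1)   == 1);
 *                assert(prefix_bytes(64)  == 8);
 *                assert(prefix_bytes(65)  == 9);
 *                assert(prefix_bytes(128) == 16);
 *                return 0;
 *        }
 */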
2699
2700 static int ip6_route_multipath(struct fib6_config *cfg, int add)
2701 {
2702         struct fib6_config r_cfg;
2703         struct rtnexthop *rtnh;
2704         int remaining;
2705         int attrlen;
2706         int err = 0, last_err = 0;
2707
2708         remaining = cfg->fc_mp_len;
2709 beginning:
2710         rtnh = (struct rtnexthop *)cfg->fc_mp;
2711
2712         /* Parse a Multipath Entry */
2713         while (rtnh_ok(rtnh, remaining)) {
2714                 memcpy(&r_cfg, cfg, sizeof(*cfg));
2715                 if (rtnh->rtnh_ifindex)
2716                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
2717
2718                 attrlen = rtnh_attrlen(rtnh);
2719                 if (attrlen > 0) {
2720                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
2721
2722                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
2723                         if (nla) {
2724                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
2725                                 r_cfg.fc_flags |= RTF_GATEWAY;
2726                         }
2727                 }
2728                 err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
2729                 if (err) {
2730                         last_err = err;
2731                         /* If we are trying to remove a route, do not stop the
2732                          * loop when ip6_route_del() fails (because the next hop is
2733                          * already gone); we should try to remove all next hops.
2734                          */
2735                         if (add) {
2736                                 /* If add fails, we should try to delete all
2737                                  * next hops that have been already added.
2738                                  */
2739                                 add = 0;
2740                                 remaining = cfg->fc_mp_len - remaining;
2741                                 goto beginning;
2742                         }
2743                 }
2744                 /* Because each route is added like a single route, we remove
2745                  * these flags after the first nexthop: if there is a collision,
2746                  * we have already failed to add the first nexthop
2747                  * (fib6_add_rt2node() has rejected it); when replacing, the old
2748                  * nexthops have been replaced by the first new one, and the rest
2749                  * should be added to it.
2750                  */
2751                 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
2752                                                      NLM_F_REPLACE);
2753                 rtnh = rtnh_next(rtnh, &remaining);
2754         }
2755
2756         return last_err;
2757 }
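
/*
 * The effect of the "goto beginning" rollback above is add-all-or-undo: when
 * adding a nexthop fails, the already-added ones are deleted again.  The
 * kernel achieves this by setting "remaining" to fc_mp_len minus the bytes
 * not yet consumed, so the second pass revisits exactly the nexthops that
 * were added.  A simplified sketch of the same contract over an array, with
 * do_add()/do_del() as placeholder helpers (not kernel API):
 *
 *        int do_add(int val);            // stands in for ip6_route_add()
 *        int do_del(int val);            // stands in for ip6_route_del()
 *
 *        static int add_all_or_rollback(const int *vals, int n)
 *        {
 *                int i, err;
 *
 *                for (i = 0; i < n; i++) {
 *                        err = do_add(vals[i]);
 *                        if (err) {
 *                                while (i-- > 0)         // undo what already succeeded
 *                                        do_del(vals[i]);
 *                                return err;
 *                        }
 *                }
 *                return 0;
 *        }
 */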
2758
2759 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2760 {
2761         struct fib6_config cfg;
2762         int err;
2763
2764         err = rtm_to_fib6_config(skb, nlh, &cfg);
2765         if (err < 0)
2766                 return err;
2767
2768         if (cfg.fc_mp)
2769                 return ip6_route_multipath(&cfg, 0);
2770         else
2771                 return ip6_route_del(&cfg);
2772 }
2773
2774 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
2775 {
2776         struct fib6_config cfg;
2777         int err;
2778
2779         err = rtm_to_fib6_config(skb, nlh, &cfg);
2780         if (err < 0)
2781                 return err;
2782
2783         if (cfg.fc_mp)
2784                 return ip6_route_multipath(&cfg, 1);
2785         else
2786                 return ip6_route_add(&cfg);
2787 }
2788
2789 static inline size_t rt6_nlmsg_size(void)
2790 {
2791         return NLMSG_ALIGN(sizeof(struct rtmsg))
2792                + nla_total_size(16) /* RTA_SRC */
2793                + nla_total_size(16) /* RTA_DST */
2794                + nla_total_size(16) /* RTA_GATEWAY */
2795                + nla_total_size(16) /* RTA_PREFSRC */
2796                + nla_total_size(4) /* RTA_TABLE */
2797                + nla_total_size(4) /* RTA_IIF */
2798                + nla_total_size(4) /* RTA_OIF */
2799                + nla_total_size(4) /* RTA_PRIORITY */
2800                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2801                + nla_total_size(sizeof(struct rta_cacheinfo))
2802                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
2803                + nla_total_size(1); /* RTA_PREF */
2804 }
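
/*
 * rt6_nlmsg_size() above is a worst-case estimate: each candidate attribute
 * contributes nla_total_size(payload), i.e. the 4-byte attribute header plus
 * the payload, rounded up to a 4-byte boundary.  A self-contained sketch of
 * that per-attribute accounting (the payload list below is illustrative, not
 * the exact set used above):
 *
 *        static unsigned int nla_total(unsigned int payload)
 *        {
 *                return (4 + payload + 3) & ~3u; // header + payload, 4-byte aligned
 *        }
 *
 *        static unsigned int msg_size_estimate(void)
 *        {
 *                const unsigned int payloads[] = { 16, 16, 16, 16, 4, 4, 4, 4, 1 };
 *                unsigned int i, sz = 0;
 *
 *                for (i = 0; i < sizeof(payloads) / sizeof(payloads[0]); i++)
 *                        sz += nla_total(payloads[i]);
 *                return sz;
 *        }
 */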
2805
2806 static int rt6_fill_node(struct net *net,
2807                          struct sk_buff *skb, struct rt6_info *rt,
2808                          struct in6_addr *dst, struct in6_addr *src,
2809                          int iif, int type, u32 portid, u32 seq,
2810                          int prefix, int nowait, unsigned int flags)
2811 {
2812         u32 metrics[RTAX_MAX];
2813         struct rtmsg *rtm;
2814         struct nlmsghdr *nlh;
2815         long expires;
2816         u32 table;
2817
2818         if (prefix) {   /* user wants prefix routes only */
2819                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2820                         /* success since this is not a prefix route */
2821                         return 1;
2822                 }
2823         }
2824
2825         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
2826         if (!nlh)
2827                 return -EMSGSIZE;
2828
2829         rtm = nlmsg_data(nlh);
2830         rtm->rtm_family = AF_INET6;
2831         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2832         rtm->rtm_src_len = rt->rt6i_src.plen;
2833         rtm->rtm_tos = 0;
2834         if (rt->rt6i_table)
2835                 table = rt->rt6i_table->tb6_id;
2836         else
2837                 table = RT6_TABLE_UNSPEC;
2838         rtm->rtm_table = table;
2839         if (nla_put_u32(skb, RTA_TABLE, table))
2840                 goto nla_put_failure;
2841         if (rt->rt6i_flags & RTF_REJECT) {
2842                 switch (rt->dst.error) {
2843                 case -EINVAL:
2844                         rtm->rtm_type = RTN_BLACKHOLE;
2845                         break;
2846                 case -EACCES:
2847                         rtm->rtm_type = RTN_PROHIBIT;
2848                         break;
2849                 case -EAGAIN:
2850                         rtm->rtm_type = RTN_THROW;
2851                         break;
2852                 default:
2853                         rtm->rtm_type = RTN_UNREACHABLE;
2854                         break;
2855                 }
2856         }
2857         else if (rt->rt6i_flags & RTF_LOCAL)
2858                 rtm->rtm_type = RTN_LOCAL;
2859         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2860                 rtm->rtm_type = RTN_LOCAL;
2861         else
2862                 rtm->rtm_type = RTN_UNICAST;
2863         rtm->rtm_flags = 0;
2864         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2865         rtm->rtm_protocol = rt->rt6i_protocol;
2866         if (rt->rt6i_flags & RTF_DYNAMIC)
2867                 rtm->rtm_protocol = RTPROT_REDIRECT;
2868         else if (rt->rt6i_flags & RTF_ADDRCONF) {
2869                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
2870                         rtm->rtm_protocol = RTPROT_RA;
2871                 else
2872                         rtm->rtm_protocol = RTPROT_KERNEL;
2873         }
2874
2875         if (rt->rt6i_flags & RTF_CACHE)
2876                 rtm->rtm_flags |= RTM_F_CLONED;
2877
2878         if (dst) {
2879                 if (nla_put_in6_addr(skb, RTA_DST, dst))
2880                         goto nla_put_failure;
2881                 rtm->rtm_dst_len = 128;
2882         } else if (rtm->rtm_dst_len)
2883                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
2884                         goto nla_put_failure;
2885 #ifdef CONFIG_IPV6_SUBTREES
2886         if (src) {
2887                 if (nla_put_in6_addr(skb, RTA_SRC, src))
2888                         goto nla_put_failure;
2889                 rtm->rtm_src_len = 128;
2890         } else if (rtm->rtm_src_len &&
2891                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
2892                 goto nla_put_failure;
2893 #endif
2894         if (iif) {
2895 #ifdef CONFIG_IPV6_MROUTE
2896                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2897                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2898                         if (err <= 0) {
2899                                 if (!nowait) {
2900                                         if (err == 0)
2901                                                 return 0;
2902                                         goto nla_put_failure;
2903                                 } else {
2904                                         if (err == -EMSGSIZE)
2905                                                 goto nla_put_failure;
2906                                 }
2907                         }
2908                 } else
2909 #endif
2910                         if (nla_put_u32(skb, RTA_IIF, iif))
2911                                 goto nla_put_failure;
2912         } else if (dst) {
2913                 struct in6_addr saddr_buf;
2914                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2915                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2916                         goto nla_put_failure;
2917         }
2918
2919         if (rt->rt6i_prefsrc.plen) {
2920                 struct in6_addr saddr_buf;
2921                 saddr_buf = rt->rt6i_prefsrc.addr;
2922                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
2923                         goto nla_put_failure;
2924         }
2925
2926         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2927         if (rt->rt6i_pmtu)
2928                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
2929         if (rtnetlink_put_metrics(skb, metrics) < 0)
2930                 goto nla_put_failure;
2931
2932         if (rt->rt6i_flags & RTF_GATEWAY) {
2933                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
2934                         goto nla_put_failure;
2935         }
2936
2937         if (rt->dst.dev &&
2938             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2939                 goto nla_put_failure;
2940         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2941                 goto nla_put_failure;
2942
2943         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
2944
2945         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
2946                 goto nla_put_failure;
2947
2948         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
2949                 goto nla_put_failure;
2950
2951         nlmsg_end(skb, nlh);
2952         return 0;
2953
2954 nla_put_failure:
2955         nlmsg_cancel(skb, nlh);
2956         return -EMSGSIZE;
2957 }
2958
2959 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2960 {
2961         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2962         int prefix;
2963
2964         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2965                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2966                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2967         } else
2968                 prefix = 0;
2969
2970         return rt6_fill_node(arg->net,
2971                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2972                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
2973                      prefix, 0, NLM_F_MULTI);
2974 }
2975
2976 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2977 {
2978         struct net *net = sock_net(in_skb->sk);
2979         struct nlattr *tb[RTA_MAX+1];
2980         struct rt6_info *rt;
2981         struct sk_buff *skb;
2982         struct rtmsg *rtm;
2983         struct flowi6 fl6;
2984         int err, iif = 0, oif = 0;
2985
2986         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2987         if (err < 0)
2988                 goto errout;
2989
2990         err = -EINVAL;
2991         memset(&fl6, 0, sizeof(fl6));
2992
2993         if (tb[RTA_SRC]) {
2994                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2995                         goto errout;
2996
2997                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2998         }
2999
3000         if (tb[RTA_DST]) {
3001                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3002                         goto errout;
3003
3004                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3005         }
3006
3007         if (tb[RTA_IIF])
3008                 iif = nla_get_u32(tb[RTA_IIF]);
3009
3010         if (tb[RTA_OIF])
3011                 oif = nla_get_u32(tb[RTA_OIF]);
3012
3013         if (tb[RTA_MARK])
3014                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3015
3016         if (iif) {
3017                 struct net_device *dev;
3018                 int flags = 0;
3019
3020                 dev = __dev_get_by_index(net, iif);
3021                 if (!dev) {
3022                         err = -ENODEV;
3023                         goto errout;
3024                 }
3025
3026                 fl6.flowi6_iif = iif;
3027
3028                 if (!ipv6_addr_any(&fl6.saddr))
3029                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3030
3031                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3032                                                                flags);
3033         } else {
3034                 fl6.flowi6_oif = oif;
3035
3036                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3037         }
3038
3039         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3040         if (!skb) {
3041                 ip6_rt_put(rt);
3042                 err = -ENOBUFS;
3043                 goto errout;
3044         }
3045
3046         /* Reserve room for dummy headers; this skb can pass
3047          * through a good chunk of the routing engine.
3048          */
3049         skb_reset_mac_header(skb);
3050         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3051
3052         skb_dst_set(skb, &rt->dst);
3053
3054         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3055                             RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3056                             nlh->nlmsg_seq, 0, 0, 0);
3057         if (err < 0) {
3058                 kfree_skb(skb);
3059                 goto errout;
3060         }
3061
3062         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3063 errout:
3064         return err;
3065 }
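
/* For illustration only: a minimal user-space sketch (not part of this file)
 * of the kind of request the handler above serves.  Error handling is
 * omitted and the helper name is made up; iproute2's "ip -6 route get"
 * performs the equivalent query.
 *
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <arpa/inet.h>
 *	#include <sys/socket.h>
 *	#include <linux/netlink.h>
 *	#include <linux/rtnetlink.h>
 *
 *	static int ipv6_route_get(const char *dst_str)
 *	{
 *		struct {
 *			struct nlmsghdr nlh;
 *			struct rtmsg rtm;
 *			char buf[64];
 *		} req;
 *		struct rtattr *rta;
 *		struct in6_addr dst;
 *		int fd;
 *
 *		if (inet_pton(AF_INET6, dst_str, &dst) != 1)
 *			return -1;
 *		fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
 *		if (fd < 0)
 *			return -1;
 *
 *		memset(&req, 0, sizeof(req));
 *		req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
 *		req.nlh.nlmsg_type = RTM_GETROUTE;
 *		req.nlh.nlmsg_flags = NLM_F_REQUEST;
 *		req.rtm.rtm_family = AF_INET6;
 *
 *		// Append an RTA_DST attribute holding the destination address.
 *		rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
 *		rta->rta_type = RTA_DST;
 *		rta->rta_len = RTA_LENGTH(sizeof(dst));
 *		memcpy(RTA_DATA(rta), &dst, sizeof(dst));
 *		req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + rta->rta_len;
 *
 *		if (send(fd, &req, req.nlh.nlmsg_len, 0) < 0)
 *			return -1;
 *		// The RTM_NEWROUTE reply built by rt6_fill_node() arrives via recv().
 *		close(fd);
 *		return 0;
 *	}
 */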
3066
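/* Broadcast a route change (RTM_NEWROUTE/RTM_DELROUTE) to RTNLGRP_IPV6_ROUTE
 * listeners.  On allocation or fill failure the error is recorded with
 * rtnl_set_sk_err() so interested sockets can notice the lost event and
 * resynchronize with a fresh dump.
 */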
3067 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
3068 {
3069         struct sk_buff *skb;
3070         struct net *net = info->nl_net;
3071         u32 seq;
3072         int err;
3073
3074         err = -ENOBUFS;
3075         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3076
3077         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
3078         if (!skb)
3079                 goto errout;
3080
3081         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3082                                 event, info->portid, seq, 0, 0, 0);
3083         if (err < 0) {
3084                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3085                 WARN_ON(err == -EMSGSIZE);
3086                 kfree_skb(skb);
3087                 goto errout;
3088         }
3089         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3090                     info->nlh, gfp_any());
3091         return;
3092 errout:
3093         if (err < 0)
3094                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3095 }
3096
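/* When the loopback device registers in a namespace, point the special
 * null (and, with CONFIG_IPV6_MULTIPLE_TABLES, prohibit/blackhole) routes
 * at it so those template entries always have a valid device and idev.
 */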
3097 static int ip6_route_dev_notify(struct notifier_block *this,
3098                                 unsigned long event, void *ptr)
3099 {
3100         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3101         struct net *net = dev_net(dev);
3102
3103         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3104                 net->ipv6.ip6_null_entry->dst.dev = dev;
3105                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3106 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3107                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3108                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3109                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3110                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3111 #endif
3112         }
3113
3114         return NOTIFY_OK;
3115 }
3116
3117 /*
3118  *      /proc
3119  */
3120
3121 #ifdef CONFIG_PROC_FS
3122
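/* Backs /proc/net/ipv6_route; ipv6_route_open() provides the seq_file
 * iteration over the FIB.
 */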
3123 static const struct file_operations ipv6_route_proc_fops = {
3124         .owner          = THIS_MODULE,
3125         .open           = ipv6_route_open,
3126         .read           = seq_read,
3127         .llseek         = seq_lseek,
3128         .release        = seq_release_net,
3129 };
3130
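/* /proc/net/rt6_stats: seven hex fields, in order - fib nodes, route nodes,
 * allocated routes, route entries, cached routes, dst entries in use, and
 * discarded routes.
 */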
3131 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3132 {
3133         struct net *net = (struct net *)seq->private;
3134         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3135                    net->ipv6.rt6_stats->fib_nodes,
3136                    net->ipv6.rt6_stats->fib_route_nodes,
3137                    net->ipv6.rt6_stats->fib_rt_alloc,
3138                    net->ipv6.rt6_stats->fib_rt_entries,
3139                    net->ipv6.rt6_stats->fib_rt_cache,
3140                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3141                    net->ipv6.rt6_stats->fib_discarded_routes);
3142
3143         return 0;
3144 }
3145
3146 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3147 {
3148         return single_open_net(inode, file, rt6_stats_seq_show);
3149 }
3150
3151 static const struct file_operations rt6_stats_seq_fops = {
3152         .owner   = THIS_MODULE,
3153         .open    = rt6_stats_seq_open,
3154         .read    = seq_read,
3155         .llseek  = seq_lseek,
3156         .release = single_release_net,
3157 };
3158 #endif  /* CONFIG_PROC_FS */
3159
3160 #ifdef CONFIG_SYSCTL
3161
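/* Handler for the write-only "flush" sysctl: store the new value, then run
 * fib6_run_gc() using the delay that was configured before this write; a
 * positive old delay forces a gc run with that timeout, otherwise a
 * best-effort run with the default gc interval is attempted.
 */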
3162 static
3163 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3164                               void __user *buffer, size_t *lenp, loff_t *ppos)
3165 {
3166         struct net *net;
3167         int delay;
3168         if (!write)
3169                 return -EINVAL;
3170
3171         net = (struct net *)ctl->extra1;
3172         delay = net->ipv6.sysctl.flush_delay;
3173         proc_dointvec(ctl, write, buffer, lenp, ppos);
3174         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3175         return 0;
3176 }
3177
3178 struct ctl_table ipv6_route_table_template[] = {
3179         {
3180                 .procname       =       "flush",
3181                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3182                 .maxlen         =       sizeof(int),
3183                 .mode           =       0200,
3184                 .proc_handler   =       ipv6_sysctl_rtcache_flush
3185         },
3186         {
3187                 .procname       =       "gc_thresh",
3188                 .data           =       &ip6_dst_ops_template.gc_thresh,
3189                 .maxlen         =       sizeof(int),
3190                 .mode           =       0644,
3191                 .proc_handler   =       proc_dointvec,
3192         },
3193         {
3194                 .procname       =       "max_size",
3195                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3196                 .maxlen         =       sizeof(int),
3197                 .mode           =       0644,
3198                 .proc_handler   =       proc_dointvec,
3199         },
3200         {
3201                 .procname       =       "gc_min_interval",
3202                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3203                 .maxlen         =       sizeof(int),
3204                 .mode           =       0644,
3205                 .proc_handler   =       proc_dointvec_jiffies,
3206         },
3207         {
3208                 .procname       =       "gc_timeout",
3209                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3210                 .maxlen         =       sizeof(int),
3211                 .mode           =       0644,
3212                 .proc_handler   =       proc_dointvec_jiffies,
3213         },
3214         {
3215                 .procname       =       "gc_interval",
3216                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3217                 .maxlen         =       sizeof(int),
3218                 .mode           =       0644,
3219                 .proc_handler   =       proc_dointvec_jiffies,
3220         },
3221         {
3222                 .procname       =       "gc_elasticity",
3223                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3224                 .maxlen         =       sizeof(int),
3225                 .mode           =       0644,
3226                 .proc_handler   =       proc_dointvec,
3227         },
3228         {
3229                 .procname       =       "mtu_expires",
3230                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3231                 .maxlen         =       sizeof(int),
3232                 .mode           =       0644,
3233                 .proc_handler   =       proc_dointvec_jiffies,
3234         },
3235         {
3236                 .procname       =       "min_adv_mss",
3237                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3238                 .maxlen         =       sizeof(int),
3239                 .mode           =       0644,
3240                 .proc_handler   =       proc_dointvec,
3241         },
3242         {
3243                 .procname       =       "gc_min_interval_ms",
3244                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3245                 .maxlen         =       sizeof(int),
3246                 .mode           =       0644,
3247                 .proc_handler   =       proc_dointvec_ms_jiffies,
3248         },
3249         { }
3250 };
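
/* These entries appear under /proc/sys/net/ipv6/route/ (sysctl
 * "net.ipv6.route.*"); for example, "sysctl -w net.ipv6.route.flush=1"
 * exercises ipv6_sysctl_rtcache_flush() above.
 */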
3251
3252 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3253 {
3254         struct ctl_table *table;
3255
3256         table = kmemdup(ipv6_route_table_template,
3257                         sizeof(ipv6_route_table_template),
3258                         GFP_KERNEL);
3259
3260         if (table) {
3261                 table[0].data = &net->ipv6.sysctl.flush_delay;
3262                 table[0].extra1 = net;
3263                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3264                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3265                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3266                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3267                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3268                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3269                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3270                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3271                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3272
3273                 /* Don't export sysctls to unprivileged users */
3274                 if (net->user_ns != &init_user_ns)
3275                         table[0].procname = NULL;
3276         }
3277
3278         return table;
3279 }
3280 #endif
3281
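/* Per-namespace constructor: clone the dst_ops template, allocate the
 * namespace's null (and, if multiple tables are configured, prohibit and
 * blackhole) route entries from the templates, and seed the default
 * route-cache sysctl values.
 */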
3282 static int __net_init ip6_route_net_init(struct net *net)
3283 {
3284         int ret = -ENOMEM;
3285
3286         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3287                sizeof(net->ipv6.ip6_dst_ops));
3288
3289         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3290                 goto out_ip6_dst_ops;
3291
3292         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3293                                            sizeof(*net->ipv6.ip6_null_entry),
3294                                            GFP_KERNEL);
3295         if (!net->ipv6.ip6_null_entry)
3296                 goto out_ip6_dst_entries;
3297         net->ipv6.ip6_null_entry->dst.path =
3298                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3299         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3300         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3301                          ip6_template_metrics, true);
3302
3303 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3304         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3305                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3306                                                GFP_KERNEL);
3307         if (!net->ipv6.ip6_prohibit_entry)
3308                 goto out_ip6_null_entry;
3309         net->ipv6.ip6_prohibit_entry->dst.path =
3310                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3311         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3312         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3313                          ip6_template_metrics, true);
3314
3315         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3316                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3317                                                GFP_KERNEL);
3318         if (!net->ipv6.ip6_blk_hole_entry)
3319                 goto out_ip6_prohibit_entry;
3320         net->ipv6.ip6_blk_hole_entry->dst.path =
3321                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3322         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3323         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3324                          ip6_template_metrics, true);
3325 #endif
3326
3327         net->ipv6.sysctl.flush_delay = 0;
3328         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3329         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3330         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3331         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3332         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3333         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
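        /* smallest advertised MSS: IPV6_MIN_MTU (1280) minus 40 bytes of
         * IPv6 header and 20 bytes of TCP header, i.e. 1220
         */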
3334         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3335
3336         net->ipv6.ip6_rt_gc_expire = 30*HZ;
3337
3338         ret = 0;
3339 out:
3340         return ret;
3341
3342 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3343 out_ip6_prohibit_entry:
3344         kfree(net->ipv6.ip6_prohibit_entry);
3345 out_ip6_null_entry:
3346         kfree(net->ipv6.ip6_null_entry);
3347 #endif
3348 out_ip6_dst_entries:
3349         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3350 out_ip6_dst_ops:
3351         goto out;
3352 }
3353
3354 static void __net_exit ip6_route_net_exit(struct net *net)
3355 {
3356         kfree(net->ipv6.ip6_null_entry);
3357 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3358         kfree(net->ipv6.ip6_prohibit_entry);
3359         kfree(net->ipv6.ip6_blk_hole_entry);
3360 #endif
3361         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3362 }
3363
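/* The proc entries live in a separate, later pernet operation, created only
 * after the earlier per-namespace routing init hooks have run.
 */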
3364 static int __net_init ip6_route_net_init_late(struct net *net)
3365 {
3366 #ifdef CONFIG_PROC_FS
3367         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3368         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3369 #endif
3370         return 0;
3371 }
3372
3373 static void __net_exit ip6_route_net_exit_late(struct net *net)
3374 {
3375 #ifdef CONFIG_PROC_FS
3376         remove_proc_entry("ipv6_route", net->proc_net);
3377         remove_proc_entry("rt6_stats", net->proc_net);
3378 #endif
3379 }
3380
3381 static struct pernet_operations ip6_route_net_ops = {
3382         .init = ip6_route_net_init,
3383         .exit = ip6_route_net_exit,
3384 };
3385
3386 static int __net_init ipv6_inetpeer_init(struct net *net)
3387 {
3388         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3389
3390         if (!bp)
3391                 return -ENOMEM;
3392         inet_peer_base_init(bp);
3393         net->ipv6.peers = bp;
3394         return 0;
3395 }
3396
3397 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3398 {
3399         struct inet_peer_base *bp = net->ipv6.peers;
3400
3401         net->ipv6.peers = NULL;
3402         inetpeer_invalidate_tree(bp);
3403         kfree(bp);
3404 }
3405
3406 static struct pernet_operations ipv6_inetpeer_ops = {
3407         .init   =       ipv6_inetpeer_init,
3408         .exit   =       ipv6_inetpeer_exit,
3409 };
3410
3411 static struct pernet_operations ip6_route_net_late_ops = {
3412         .init = ip6_route_net_init_late,
3413         .exit = ip6_route_net_exit_late,
3414 };
3415
3416 static struct notifier_block ip6_route_dev_notifier = {
3417         .notifier_call = ip6_route_dev_notify,
3418         .priority = 0,
3419 };
3420
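/* Boot-time initialization, in order: create the rt6_info dst cache, set up
 * the blackhole dst counters, register the inetpeer and route pernet ops,
 * wire the init_net template routes to the loopback device, bring up
 * fib6/xfrm6/fib6-rules, register the late pernet ops, the rtnetlink
 * handlers and the netdev notifier, and finally initialize the per-cpu
 * uncached route lists.
 */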
3421 int __init ip6_route_init(void)
3422 {
3423         int ret;
3424         int cpu;
3425
3426         ret = -ENOMEM;
3427         ip6_dst_ops_template.kmem_cachep =
3428                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3429                                   SLAB_HWCACHE_ALIGN, NULL);
3430         if (!ip6_dst_ops_template.kmem_cachep)
3431                 goto out;
3432
3433         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3434         if (ret)
3435                 goto out_kmem_cache;
3436
3437         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3438         if (ret)
3439                 goto out_dst_entries;
3440
3441         ret = register_pernet_subsys(&ip6_route_net_ops);
3442         if (ret)
3443                 goto out_register_inetpeer;
3444
3445         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3446
3447         /* The loopback device is registered before this code runs, so the
3448          * loopback reference in rt6_info is not taken automatically; take
3449          * it manually for init_net. */
3450         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3451         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3452 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3453         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3454         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3455         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3456         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3457 #endif
3458         ret = fib6_init();
3459         if (ret)
3460                 goto out_register_subsys;
3461
3462         ret = xfrm6_init();
3463         if (ret)
3464                 goto out_fib6_init;
3465
3466         ret = fib6_rules_init();
3467         if (ret)
3468                 goto xfrm6_init;
3469
3470         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3471         if (ret)
3472                 goto fib6_rules_init;
3473
3474         ret = -ENOBUFS;
3475         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3476             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3477             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3478                 goto out_register_late_subsys;
3479
3480         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3481         if (ret)
3482                 goto out_register_late_subsys;
3483
3484         for_each_possible_cpu(cpu) {
3485                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3486
3487                 INIT_LIST_HEAD(&ul->head);
3488                 spin_lock_init(&ul->lock);
3489         }
3490
3491 out:
3492         return ret;
3493
3494 out_register_late_subsys:
3495         unregister_pernet_subsys(&ip6_route_net_late_ops);
3496 fib6_rules_init:
3497         fib6_rules_cleanup();
3498 xfrm6_init:
3499         xfrm6_fini();
3500 out_fib6_init:
3501         fib6_gc_cleanup();
3502 out_register_subsys:
3503         unregister_pernet_subsys(&ip6_route_net_ops);
3504 out_register_inetpeer:
3505         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3506 out_dst_entries:
3507         dst_entries_destroy(&ip6_dst_blackhole_ops);
3508 out_kmem_cache:
3509         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3510         goto out;
3511 }
3512
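/* Module/exit-time teardown: undo ip6_route_init() in reverse order. */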
3513 void ip6_route_cleanup(void)
3514 {
3515         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3516         unregister_pernet_subsys(&ip6_route_net_late_ops);
3517         fib6_rules_cleanup();
3518         xfrm6_fini();
3519         fib6_gc_cleanup();
3520         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3521         unregister_pernet_subsys(&ip6_route_net_ops);
3522         dst_entries_destroy(&ip6_dst_blackhole_ops);
3523         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3524 }