/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely
 *                                      with BSD, though our system is still
 *                                      very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
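/* RT_FL_TOS() keeps only the TOS bits that matter for route lookup
 * (IPTOS_RT_MASK) plus the legacy RTO_ONLINK flag, which callers fold
 * into flowi4_tos to restrict the lookup to link-scope (on-link) routes.
 */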

/* The IPv4 datagram length is stored in a 16-bit field (tot_len). */
#define IP_MAX_MTU      0xFFFF

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
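/* The table above is indexed by the four TOS bits, i.e.
 * ip_tos2prio[IPTOS_TOS(tos) >> 1], as done by rt_tos2priority() in
 * <net/route.h>.  Example: tos 0x10 (IPTOS_LOWDELAY) gives index 8,
 * which maps the packet to the TC_PRIO_INTERACTIVE queueing priority.
 */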

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}
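/* Since the removal of the IPv4 routing cache (v3.6), /proc/net/rt_cache
 * contains no entries: the seq handlers above emit only the legacy header
 * line, kept so existing tools that parse the file do not break.
 */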

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions, and
 * even then we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that stays unique for a reasonable period of time.
 * But a broken packet identifier is better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static DEFINE_SPINLOCK(ip_fb_id_lock);
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct net *net = dev_net(dst->dev);
        struct inet_peer *peer;

        peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
        if (peer) {
                iph->id = htons(inet_getid(peer, more));
                inet_putpeer(peer);
                return;
        }

        ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
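/* In the common case the inet_peer entry for the destination provides a
 * monotonically increasing, per-destination IP ID stream via inet_getid().
 * ip_select_fb_ident() is only the out-of-memory fallback: it chains
 * secure_ip_id() hashes of the destination, which is hard to predict but
 * gives no uniqueness guarantee, as the comment above notes.
 */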

static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                rt_free(rt);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                rt_free(rt);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        u32 hval;

        hval = (__force u32) daddr;
        hval ^= (hval >> 11) ^ (hval >> 22);

        return hval & (FNHE_HASH_SIZE - 1);
}
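/* With FNHE_HASH_SIZE of 1 << 11 (its value in ip_fib.h), the two shifts
 * above fold all 32 bits of the destination address into the low 11
 * before masking, so every address bit influences the bucket index of
 * the per-nexthop exception table.
 */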

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        unsigned int i;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = nh->nh_exceptions;
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                nh->nh_exceptions = hash;
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = max(1UL, expires);
                }
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
        return;
}
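/* Exception chains stay short: once a bucket's chain grows past
 * FNHE_RECLAIM_DEPTH, fnhe_oldest() evicts the least recently stamped
 * entry (flushing its cached routes) and its slot is reused, so a burst
 * of redirects or PMTU updates cannot grow the table without bound.
 */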

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (n) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
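/* With the defaults above the timeline looks like this: after the k-th
 * redirect, the next one is delayed by ip_rt_redirect_load << k jiffies
 * (40ms, 80ms, 160ms, ... at HZ/50 per token).  After ip_rt_redirect_number
 * (9) unanswered redirects we go quiet, and only a silence of
 * ip_rt_redirect_silence ((HZ/50) << 10, about 20.5s) resets rate_tokens.
 */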

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;
        /* Too many ignored redirects; do not send anything.
         * Set rate_last to the time of the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        /* IP on this device is disabled; in_dev can be NULL here. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}
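/* The peer state above implements a classic token bucket: tokens refill
 * at one per jiffy up to ip_rt_error_burst (5 * HZ) and each ICMP error
 * costs ip_rt_error_cost (HZ) tokens, so a given source can trigger at
 * most about one error per second sustained, with bursts of up to five.
 */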

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (dst->dev->mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}
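/* Learned PMTU values are clamped from below by ip_rt_min_pmtu
 * (512 + 20 + 20 = 552 by default) to blunt forged "frag needed" ICMPs,
 * stored as a nexthop exception, and expire after ip_rt_mtu_expires
 * (10 minutes); the halfway check above merely avoids refreshing an
 * exception that is still comfortably fresh.
 */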

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *dst;
        bool new = false;

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        rt = (struct rtable *) __sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !rt) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!__sk_dst_check(sk, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        dst = dst_check(&rt->dst, 0);
        if (!dst) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                __sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
 * We do not cache the source address of the outgoing interface, because
 * it is used only by the IP RR, TS and SRR options, so it is out of the
 * fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}
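/* The 40 bytes subtracted above are the minimal IPv4 + TCP headers
 * (20 + 20), so a standard 1500-byte Ethernet MTU advertises an MSS of
 * 1460; ip_rt_min_advmss (256) keeps a bogus tiny device MTU from
 * advertising a uselessly small value.
 */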

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        return min_t(unsigned int, mtu, IP_MAX_MTU);
}
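/* MTU precedence: an unexpired learned PMTU wins, then an explicit
 * RTAX_MTU route metric, then the device MTU; a locked metric on a
 * gatewayed route is conservatively clamped to the classic 576-byte
 * minimum, and everything is capped at IP_MAX_MTU (64K - 1).
 */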

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = nh->nh_exceptions;
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (!(rt->dst.flags & DST_NOCACHE)) {
                        rcu_assign_pointer(*porig, rt);
                        if (orig)
                                rt_free(orig);
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}
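/* rt_cache_route() installs the new route locklessly: cmpxchg() only
 * succeeds if the slot still holds the value we read, so if another CPU
 * raced in first we return false and the caller falls back to marking
 * the route DST_NOCACHE instead of caching it.
 */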

static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        spin_lock_bh(&rt_uncached_lock);
        list_add_tail(&rt->rt_uncached, &rt_uncached_list);
        spin_unlock_bh(&rt_uncached_lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                spin_lock_bh(&rt_uncached_lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&rt_uncached_lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        if (!list_empty(&rt_uncached_list)) {
                struct net *net = dev_net(dev);
                struct rtable *rt;

                spin_lock_bh(&rt_uncached_lock);
                list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&rt_uncached_lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in the nexthop exception
                         * or FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set that
                         * bit here.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
                                   bool nopolicy, bool noxfrm, bool will_cache)
{
        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                         (nopolicy ? DST_NOPOLICY : 0) |
                         (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        u32 itag = 0;
        int err;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(saddr))
                        goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto e_err;
        }
        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;

        rth->rt_genid   = rt_genid_ipv4(dev_net(dev));
        rth->rt_flags   = RTCF_MULTICAST;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_is_input= 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        if (our) {
                rth->dst.input= ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;

e_nobufs:
        return -ENOBUFS;
e_inval:
        return -EINVAL;
e_err:
        return err;
}


static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 * RFC 1812 recommendation: if the source is martian,
                 * the only hint we can give is the MAC header.
                 */
                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
                        &daddr, &saddr, dev->name);
                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
                        print_hex_dump(KERN_WARNING, "ll header: ",
                                       DUMP_PREFIX_OFFSET, 16, 1,
                                       skb_mac_header(skb),
                                       dev->hard_header_len, true);
                }
        }
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos)
{
        struct fib_nh_exception *fnhe;
        struct rtable *rth;
        int err;
        struct in_device *out_dev;
        unsigned int flags = 0;
        bool do_cache;
        u32 itag;

        /* get a working reference to the output device */
        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
        if (out_dev == NULL) {
                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
                return -EINVAL;
        }

        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, in_dev, &itag);
        if (err < 0) {
                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                                         saddr);

                goto cleanup;
        }

        do_cache = res->fi && !itag;
        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
                flags |= RTCF_DOREDIRECT;
                do_cache = false;
        }

        if (skb->protocol != htons(ETH_P_IP)) {
1559                 /* Not IP (i.e. ARP). Do not create a route if it is
1560                  * invalid for proxy arp. DNAT routes are always valid.
1561                  *
1562                  * The proxy arp feature has been extended to allow ARP
1563                  * replies back out the same interface, to support
1564                  * Private VLAN switch technologies. See arp.c.
1565                  */
1566                 if (out_dev == in_dev &&
1567                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1568                         err = -EINVAL;
1569                         goto cleanup;
1570                 }
1571         }
1572
1573         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1574         if (do_cache) {
1575                 if (fnhe != NULL)
1576                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1577                 else
1578                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1579
1580                 if (rt_cache_valid(rth)) {
1581                         skb_dst_set_noref(skb, &rth->dst);
1582                         goto out;
1583                 }
1584         }
1585
1586         rth = rt_dst_alloc(out_dev->dev,
1587                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1588                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1589         if (!rth) {
1590                 err = -ENOBUFS;
1591                 goto cleanup;
1592         }
1593
1594         rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
1595         rth->rt_flags = flags;
1596         rth->rt_type = res->type;
1597         rth->rt_is_input = 1;
1598         rth->rt_iif     = 0;
1599         rth->rt_pmtu    = 0;
1600         rth->rt_gateway = 0;
1601         rth->rt_uses_gateway = 0;
1602         INIT_LIST_HEAD(&rth->rt_uncached);
1603
1604         rth->dst.input = ip_forward;
1605         rth->dst.output = ip_output;
1606
1607         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1608         skb_dst_set(skb, &rth->dst);
1609 out:
1610         err = 0;
1611  cleanup:
1612         return err;
1613 }
1614
1615 static int ip_mkroute_input(struct sk_buff *skb,
1616                             struct fib_result *res,
1617                             const struct flowi4 *fl4,
1618                             struct in_device *in_dev,
1619                             __be32 daddr, __be32 saddr, u32 tos)
1620 {
1621 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1622         if (res->fi && res->fi->fib_nhs > 1)
1623                 fib_select_multipath(res);
1624 #endif
1625
1626         /* create a routing cache entry */
1627         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1628 }
1629
1630 /*
1631  *      NOTE. We drop all packets that have a local source
1632  *      address, because every properly looped-back packet must
1633  *      already have the correct destination attached by the output routine.
1634  *
1635  *      This approach solves two big problems:
1636  *      1. Non-simplex devices are handled properly.
1637  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1638  *      Called with rcu_read_lock().
1639  */
1640
1641 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1642                                u8 tos, struct net_device *dev)
1643 {
1644         struct fib_result res;
1645         struct in_device *in_dev = __in_dev_get_rcu(dev);
1646         struct flowi4   fl4;
1647         unsigned int    flags = 0;
1648         u32             itag = 0;
1649         struct rtable   *rth;
1650         int             err = -EINVAL;
1651         struct net    *net = dev_net(dev);
1652         bool do_cache;
1653
1654         /* IP on this device is disabled. */
1655
1656         if (!in_dev)
1657                 goto out;
1658
1659         /* Check for the weirdest martians, which cannot be detected
1660            by fib_lookup.
1661          */
1662
1663         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1664                 goto martian_source;
1665
1666         res.fi = NULL;
1667         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1668                 goto brd_input;
1669
1670         /* Accept zero addresses only for limited broadcast;
1671          * I do not even know whether to fix this or not. Waiting for complaints :-)
1672          */
1673         if (ipv4_is_zeronet(saddr))
1674                 goto martian_source;
1675
1676         if (ipv4_is_zeronet(daddr))
1677                 goto martian_destination;
1678
1679         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1680          * calling it at most once and only when daddr and/or saddr is a loopback address.
1681          */
1682         if (ipv4_is_loopback(daddr)) {
1683                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1684                         goto martian_destination;
1685         } else if (ipv4_is_loopback(saddr)) {
1686                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1687                         goto martian_source;
1688         }
1689
1690         /*
1691          *      Now we are ready to route packet.
1692          */
1693         fl4.flowi4_oif = 0;
1694         fl4.flowi4_iif = dev->ifindex;
1695         fl4.flowi4_mark = skb->mark;
1696         fl4.flowi4_tos = tos;
1697         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1698         fl4.daddr = daddr;
1699         fl4.saddr = saddr;
1700         err = fib_lookup(net, &fl4, &res);
1701         if (err != 0)
1702                 goto no_route;
1703
1704         RT_CACHE_STAT_INC(in_slow_tot);
1705
1706         if (res.type == RTN_BROADCAST)
1707                 goto brd_input;
1708
1709         if (res.type == RTN_LOCAL) {
1710                 err = fib_validate_source(skb, saddr, daddr, tos,
1711                                           LOOPBACK_IFINDEX,
1712                                           dev, in_dev, &itag);
1713                 if (err < 0)
1714                         goto martian_source_keep_err;
1715                 goto local_input;
1716         }
1717
1718         if (!IN_DEV_FORWARD(in_dev))
1719                 goto no_route;
1720         if (res.type != RTN_UNICAST)
1721                 goto martian_destination;
1722
1723         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1724 out:    return err;
1725
1726 brd_input:
1727         if (skb->protocol != htons(ETH_P_IP))
1728                 goto e_inval;
1729
1730         if (!ipv4_is_zeronet(saddr)) {
1731                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1732                                           in_dev, &itag);
1733                 if (err < 0)
1734                         goto martian_source_keep_err;
1735         }
1736         flags |= RTCF_BROADCAST;
1737         res.type = RTN_BROADCAST;
1738         RT_CACHE_STAT_INC(in_brd);
1739
1740 local_input:
1741         do_cache = false;
1742         if (res.fi) {
1743                 if (!itag) {
1744                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1745                         if (rt_cache_valid(rth)) {
1746                                 skb_dst_set_noref(skb, &rth->dst);
1747                                 err = 0;
1748                                 goto out;
1749                         }
1750                         do_cache = true;
1751                 }
1752         }
1753
1754         rth = rt_dst_alloc(net->loopback_dev,
1755                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1756         if (!rth)
1757                 goto e_nobufs;
1758
1759         rth->dst.input = ip_local_deliver;
1760         rth->dst.output = ip_rt_bug;
1761 #ifdef CONFIG_IP_ROUTE_CLASSID
1762         rth->dst.tclassid = itag;
1763 #endif
1764
1765         rth->rt_genid = rt_genid_ipv4(net);
1766         rth->rt_flags   = flags | RTCF_LOCAL;
1767         rth->rt_type    = res.type;
1768         rth->rt_is_input = 1;
1769         rth->rt_iif     = 0;
1770         rth->rt_pmtu    = 0;
1771         rth->rt_gateway = 0;
1772         rth->rt_uses_gateway = 0;
1773         INIT_LIST_HEAD(&rth->rt_uncached);
1774         if (res.type == RTN_UNREACHABLE) {
1775                 rth->dst.input = ip_error;
1776                 rth->dst.error = -err;
1777                 rth->rt_flags   &= ~RTCF_LOCAL;
1778         }
1779         if (do_cache)
1780                 rt_cache_route(&FIB_RES_NH(res), rth);
1781         skb_dst_set(skb, &rth->dst);
1782         err = 0;
1783         goto out;
1784
1785 no_route:
1786         RT_CACHE_STAT_INC(in_no_route);
1787         res.type = RTN_UNREACHABLE;
1788         if (err == -ESRCH)
1789                 err = -ENETUNREACH;
1790         goto local_input;
1791
1792         /*
1793          *      Do not cache martian addresses: they should be logged (RFC1812)
1794          */
1795 martian_destination:
1796         RT_CACHE_STAT_INC(in_martian_dst);
1797 #ifdef CONFIG_IP_ROUTE_VERBOSE
1798         if (IN_DEV_LOG_MARTIANS(in_dev))
1799                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1800                                      &daddr, &saddr, dev->name);
1801 #endif
1802
1803 e_inval:
1804         err = -EINVAL;
1805         goto out;
1806
1807 e_nobufs:
1808         err = -ENOBUFS;
1809         goto out;
1810
1811 martian_source:
1812         err = -EINVAL;
1813 martian_source_keep_err:
1814         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1815         goto out;
1816 }
1817
1818 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1819                          u8 tos, struct net_device *dev)
1820 {
1821         int res;
1822
1823         rcu_read_lock();
1824
1825         /* Multicast recognition logic was moved from the route cache to here.
1826            The problem was that too many Ethernet cards have broken/missing
1827            hardware multicast filters :-( As a result, a host on a multicast
1828            network acquires a lot of useless route cache entries, e.g. for
1829            SDR messages from all over the world. Now we try to get rid of them.
1830            Really, provided the software IP multicast filter is organized
1831            reasonably (at least, hashed), this does not cause a slowdown
1832            compared with route cache reject entries.
1833            Note that multicast routers are not affected, because a
1834            route cache entry is created eventually.
1835          */
1836         if (ipv4_is_multicast(daddr)) {
1837                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1838
1839                 if (in_dev) {
1840                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1841                                                   ip_hdr(skb)->protocol);
1842                         if (our
1843 #ifdef CONFIG_IP_MROUTE
1844                                 ||
1845                             (!ipv4_is_local_multicast(daddr) &&
1846                              IN_DEV_MFORWARD(in_dev))
1847 #endif
1848                            ) {
1849                                 int err = ip_route_input_mc(skb, daddr, saddr,
1850                                                             tos, dev, our);
1851                                 rcu_read_unlock();
1852                                 return err;
1853                         }
1854                 }
1855                 rcu_read_unlock();
1856                 return -EINVAL;
1857         }
1858         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1859         rcu_read_unlock();
1860         return res;
1861 }
1862 EXPORT_SYMBOL(ip_route_input_noref);
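/*
 * Sketch of a typical caller (illustrative only; the real entry point
 * on the receive path is ip_rcv_finish()): resolve the input route
 * for a freshly received skb and hand it on via dst_input(), which
 * invokes the rth->dst.input hook chosen above:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err;
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, skb->dev);
 *	if (err)
 *		goto drop;
 *	return dst_input(skb);
 *
 * "noref" means the cached dst may be attached with
 * skb_dst_set_noref(), i.e. without taking a refcount, so the skb
 * must be consumed before the caller leaves its rcu-protected context.
 */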
1863
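/*
 * The output path caches one dst per (nexthop, cpu) in
 * nh_pcpu_rth_output, again overridden by a matching fnhe exception.
 * FLOWI_FLAG_KNOWN_NH lookups that do not resolve to an on-link
 * gateway bypass the cache entirely (see the "add:" label below).
 */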
1864 /* called with rcu_read_lock() */
1865 static struct rtable *__mkroute_output(const struct fib_result *res,
1866                                        const struct flowi4 *fl4, int orig_oif,
1867                                        struct net_device *dev_out,
1868                                        unsigned int flags)
1869 {
1870         struct fib_info *fi = res->fi;
1871         struct fib_nh_exception *fnhe;
1872         struct in_device *in_dev;
1873         u16 type = res->type;
1874         struct rtable *rth;
1875         bool do_cache;
1876
1877         in_dev = __in_dev_get_rcu(dev_out);
1878         if (!in_dev)
1879                 return ERR_PTR(-EINVAL);
1880
1881         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1882                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1883                         return ERR_PTR(-EINVAL);
1884
1885         if (ipv4_is_lbcast(fl4->daddr))
1886                 type = RTN_BROADCAST;
1887         else if (ipv4_is_multicast(fl4->daddr))
1888                 type = RTN_MULTICAST;
1889         else if (ipv4_is_zeronet(fl4->daddr))
1890                 return ERR_PTR(-EINVAL);
1891
1892         if (dev_out->flags & IFF_LOOPBACK)
1893                 flags |= RTCF_LOCAL;
1894
1895         do_cache = true;
1896         if (type == RTN_BROADCAST) {
1897                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1898                 fi = NULL;
1899         } else if (type == RTN_MULTICAST) {
1900                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1901                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1902                                      fl4->flowi4_proto))
1903                         flags &= ~RTCF_LOCAL;
1904                 else
1905                         do_cache = false;
1906                 /* If a multicast route does not exist, use
1907                  * the default one, but do not gateway in this case.
1908                  * Yes, it is a hack.
1909                  */
1910                 if (fi && res->prefixlen < 4)
1911                         fi = NULL;
1912         }
1913
1914         fnhe = NULL;
1915         do_cache &= fi != NULL;
1916         if (do_cache) {
1917                 struct rtable __rcu **prth;
1918                 struct fib_nh *nh = &FIB_RES_NH(*res);
1919
1920                 fnhe = find_exception(nh, fl4->daddr);
1921                 if (fnhe)
1922                         prth = &fnhe->fnhe_rth_output;
1923                 else {
1924                         if (unlikely(fl4->flowi4_flags &
1925                                      FLOWI_FLAG_KNOWN_NH &&
1926                                      !(nh->nh_gw &&
1927                                        nh->nh_scope == RT_SCOPE_LINK))) {
1928                                 do_cache = false;
1929                                 goto add;
1930                         }
1931                         prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1932                 }
1933                 rth = rcu_dereference(*prth);
1934                 if (rt_cache_valid(rth)) {
1935                         dst_hold(&rth->dst);
1936                         return rth;
1937                 }
1938         }
1939
1940 add:
1941         rth = rt_dst_alloc(dev_out,
1942                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1943                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1944                            do_cache);
1945         if (!rth)
1946                 return ERR_PTR(-ENOBUFS);
1947
1948         rth->dst.output = ip_output;
1949
1950         rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
1951         rth->rt_flags   = flags;
1952         rth->rt_type    = type;
1953         rth->rt_is_input = 0;
1954         rth->rt_iif     = orig_oif ? : 0;
1955         rth->rt_pmtu    = 0;
1956         rth->rt_gateway = 0;
1957         rth->rt_uses_gateway = 0;
1958         INIT_LIST_HEAD(&rth->rt_uncached);
1959
1960         RT_CACHE_STAT_INC(out_slow_tot);
1961
1962         if (flags & RTCF_LOCAL)
1963                 rth->dst.input = ip_local_deliver;
1964         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1965                 if (flags & RTCF_LOCAL &&
1966                     !(dev_out->flags & IFF_LOOPBACK)) {
1967                         rth->dst.output = ip_mc_output;
1968                         RT_CACHE_STAT_INC(out_slow_mc);
1969                 }
1970 #ifdef CONFIG_IP_MROUTE
1971                 if (type == RTN_MULTICAST) {
1972                         if (IN_DEV_MFORWARD(in_dev) &&
1973                             !ipv4_is_local_multicast(fl4->daddr)) {
1974                                 rth->dst.input = ip_mr_input;
1975                                 rth->dst.output = ip_mc_output;
1976                         }
1977                 }
1978 #endif
1979         }
1980
1981         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1982
1983         return rth;
1984 }
1985
1986 /*
1987  * Major route resolver routine.
1988  */
1989
1990 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1991 {
1992         struct net_device *dev_out = NULL;
1993         __u8 tos = RT_FL_TOS(fl4);
1994         unsigned int flags = 0;
1995         struct fib_result res;
1996         struct rtable *rth;
1997         int orig_oif;
1998
1999         res.tclassid    = 0;
2000         res.fi          = NULL;
2001         res.table       = NULL;
2002
2003         orig_oif = fl4->flowi4_oif;
2004
2005         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2006         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2007         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2008                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2009
2010         rcu_read_lock();
2011         if (fl4->saddr) {
2012                 rth = ERR_PTR(-EINVAL);
2013                 if (ipv4_is_multicast(fl4->saddr) ||
2014                     ipv4_is_lbcast(fl4->saddr) ||
2015                     ipv4_is_zeronet(fl4->saddr))
2016                         goto out;
2017
2018                 /* I removed the check for oif == dev_out->oif here.
2019                    It was wrong for two reasons:
2020                    1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2021                       is assigned to multiple interfaces.
2022                    2. Moreover, we are allowed to send packets with the saddr
2023                       of another iface. --ANK
2024                  */
2025
2026                 if (fl4->flowi4_oif == 0 &&
2027                     (ipv4_is_multicast(fl4->daddr) ||
2028                      ipv4_is_lbcast(fl4->daddr))) {
2029                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2030                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2031                         if (dev_out == NULL)
2032                                 goto out;
2033
2034                         /* Special hack: the user can direct multicasts
2035                            and limited broadcast via the necessary interface
2036                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2037                            This hack is not just for fun, it allows
2038                            vic, vat and friends to work.
2039                            They bind a socket to loopback, set ttl to zero
2040                            and expect that it will work.
2041                            From the viewpoint of the routing cache they are broken,
2042                            because we are not allowed to build a multicast path
2043                            with a loopback source addr (look, the routing cache
2044                            cannot know that ttl is zero, so the packet
2045                            will not leave this host and the route looks valid).
2046                            Luckily, this hack is a good workaround.
2047                          */
2048
2049                         fl4->flowi4_oif = dev_out->ifindex;
2050                         goto make_route;
2051                 }
2052
2053                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2054                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2055                         if (!__ip_dev_find(net, fl4->saddr, false))
2056                                 goto out;
2057                 }
2058         }
2059
2060
2061         if (fl4->flowi4_oif) {
2062                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2063                 rth = ERR_PTR(-ENODEV);
2064                 if (dev_out == NULL)
2065                         goto out;
2066
2067                 /* RACE: Check return value of inet_select_addr instead. */
2068                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2069                         rth = ERR_PTR(-ENETUNREACH);
2070                         goto out;
2071                 }
2072                 if (ipv4_is_local_multicast(fl4->daddr) ||
2073                     ipv4_is_lbcast(fl4->daddr)) {
2074                         if (!fl4->saddr)
2075                                 fl4->saddr = inet_select_addr(dev_out, 0,
2076                                                               RT_SCOPE_LINK);
2077                         goto make_route;
2078                 }
2079                 if (!fl4->saddr) {
2080                         if (ipv4_is_multicast(fl4->daddr))
2081                                 fl4->saddr = inet_select_addr(dev_out, 0,
2082                                                               fl4->flowi4_scope);
2083                         else if (!fl4->daddr)
2084                                 fl4->saddr = inet_select_addr(dev_out, 0,
2085                                                               RT_SCOPE_HOST);
2086                 }
2087         }
2088
2089         if (!fl4->daddr) {
2090                 fl4->daddr = fl4->saddr;
2091                 if (!fl4->daddr)
2092                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2093                 dev_out = net->loopback_dev;
2094                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2095                 res.type = RTN_LOCAL;
2096                 flags |= RTCF_LOCAL;
2097                 goto make_route;
2098         }
2099
2100         if (fib_lookup(net, fl4, &res)) {
2101                 res.fi = NULL;
2102                 res.table = NULL;
2103                 if (fl4->flowi4_oif) {
2104                         /* Apparently, the routing tables are wrong. Assume
2105                            that the destination is on-link.
2106
2107                            WHY? DW.
2108                            Because we are allowed to send to an iface
2109                            even if it has NO routes and NO assigned
2110                            addresses. When oif is specified, the routing
2111                            tables are looked up with only one purpose:
2112                            to catch whether the destination is gatewayed,
2113                            rather than direct. Moreover, if MSG_DONTROUTE is
2114                            set, we send the packet, ignoring both the routing
2115                            tables and the ifaddr state. --ANK
2116
2117
2118                            We could make this work even when oif is unknown,
2119                            as IPv6 likely does, but we do not.
2120                          */
2121
2122                         if (fl4->saddr == 0)
2123                                 fl4->saddr = inet_select_addr(dev_out, 0,
2124                                                               RT_SCOPE_LINK);
2125                         res.type = RTN_UNICAST;
2126                         goto make_route;
2127                 }
2128                 rth = ERR_PTR(-ENETUNREACH);
2129                 goto out;
2130         }
2131
2132         if (res.type == RTN_LOCAL) {
2133                 if (!fl4->saddr) {
2134                         if (res.fi->fib_prefsrc)
2135                                 fl4->saddr = res.fi->fib_prefsrc;
2136                         else
2137                                 fl4->saddr = fl4->daddr;
2138                 }
2139                 dev_out = net->loopback_dev;
2140                 fl4->flowi4_oif = dev_out->ifindex;
2141                 flags |= RTCF_LOCAL;
2142                 goto make_route;
2143         }
2144
2145 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2146         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2147                 fib_select_multipath(&res);
2148         else
2149 #endif
2150         if (!res.prefixlen &&
2151             res.table->tb_num_default > 1 &&
2152             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2153                 fib_select_default(&res);
2154
2155         if (!fl4->saddr)
2156                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2157
2158         dev_out = FIB_RES_DEV(res);
2159         fl4->flowi4_oif = dev_out->ifindex;
2160
2161
2162 make_route:
2163         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2164
2165 out:
2166         rcu_read_unlock();
2167         return rth;
2168 }
2169 EXPORT_SYMBOL_GPL(__ip_route_output_key);
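/*
 * __ip_route_output_key() consults only the FIB; it performs no xfrm
 * (IPsec) policy lookup. Most callers want the ip_route_output_key()
 * wrapper from <net/route.h>, or ip_route_output_flow() below, which
 * adds the xfrm_lookup() step whenever a protocol is set in the flow.
 */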
2170
2171 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2172 {
2173         return NULL;
2174 }
2175
2176 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2177 {
2178         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2179
2180         return mtu ? : dst->dev->mtu;
2181 }
2182
2183 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2184                                           struct sk_buff *skb, u32 mtu)
2185 {
2186 }
2187
2188 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2189                                        struct sk_buff *skb)
2190 {
2191 }
2192
2193 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2194                                           unsigned long old)
2195 {
2196         return NULL;
2197 }
2198
2199 static struct dst_ops ipv4_dst_blackhole_ops = {
2200         .family                 =       AF_INET,
2201         .protocol               =       cpu_to_be16(ETH_P_IP),
2202         .check                  =       ipv4_blackhole_dst_check,
2203         .mtu                    =       ipv4_blackhole_mtu,
2204         .default_advmss         =       ipv4_default_advmss,
2205         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2206         .redirect               =       ipv4_rt_blackhole_redirect,
2207         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2208         .neigh_lookup           =       ipv4_neigh_lookup,
2209 };
2210
2211 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2212 {
2213         struct rtable *ort = (struct rtable *) dst_orig;
2214         struct rtable *rt;
2215
2216         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2217         if (rt) {
2218                 struct dst_entry *new = &rt->dst;
2219
2220                 new->__use = 1;
2221                 new->input = dst_discard;
2222                 new->output = dst_discard;
2223
2224                 new->dev = ort->dst.dev;
2225                 if (new->dev)
2226                         dev_hold(new->dev);
2227
2228                 rt->rt_is_input = ort->rt_is_input;
2229                 rt->rt_iif = ort->rt_iif;
2230                 rt->rt_pmtu = ort->rt_pmtu;
2231
2232                 rt->rt_genid = rt_genid_ipv4(net);
2233                 rt->rt_flags = ort->rt_flags;
2234                 rt->rt_type = ort->rt_type;
2235                 rt->rt_gateway = ort->rt_gateway;
2236                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2237
2238                 INIT_LIST_HEAD(&rt->rt_uncached);
2239
2240                 dst_free(new);
2241         }
2242
2243         dst_release(dst_orig);
2244
2245         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2246 }
2247
2248 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2249                                     struct sock *sk)
2250 {
2251         struct rtable *rt = __ip_route_output_key(net, flp4);
2252
2253         if (IS_ERR(rt))
2254                 return rt;
2255
2256         if (flp4->flowi4_proto)
2257                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2258                                                    flowi4_to_flowi(flp4),
2259                                                    sk, 0);
2260
2261         return rt;
2262 }
2263 EXPORT_SYMBOL_GPL(ip_route_output_flow);
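/*
 * Sketch of a typical output lookup (illustrative; oif, tos, the
 * addresses and ports are placeholders supplied by the caller), using
 * the flowi4_init_output() helper from <net/flow.h> to fill the key:
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	flowi4_init_output(&fl4, oif, sk->sk_mark, RT_TOS(tos),
 *			   RT_SCOPE_UNIVERSE, IPPROTO_UDP,
 *			   inet_sk_flowi_flags(sk),
 *			   daddr, saddr, dport, sport);
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *
 * On success the caller owns a reference on &rt->dst and releases it
 * with ip_rt_put() when done.
 */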
2264
2265 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2266                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2267                         u32 seq, int event, int nowait, unsigned int flags)
2268 {
2269         struct rtable *rt = skb_rtable(skb);
2270         struct rtmsg *r;
2271         struct nlmsghdr *nlh;
2272         unsigned long expires = 0;
2273         u32 error;
2274         u32 metrics[RTAX_MAX];
2275
2276         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2277         if (nlh == NULL)
2278                 return -EMSGSIZE;
2279
2280         r = nlmsg_data(nlh);
2281         r->rtm_family    = AF_INET;
2282         r->rtm_dst_len  = 32;
2283         r->rtm_src_len  = 0;
2284         r->rtm_tos      = fl4->flowi4_tos;
2285         r->rtm_table    = RT_TABLE_MAIN;
2286         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2287                 goto nla_put_failure;
2288         r->rtm_type     = rt->rt_type;
2289         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2290         r->rtm_protocol = RTPROT_UNSPEC;
2291         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2292         if (rt->rt_flags & RTCF_NOTIFY)
2293                 r->rtm_flags |= RTM_F_NOTIFY;
2294
2295         if (nla_put_be32(skb, RTA_DST, dst))
2296                 goto nla_put_failure;
2297         if (src) {
2298                 r->rtm_src_len = 32;
2299                 if (nla_put_be32(skb, RTA_SRC, src))
2300                         goto nla_put_failure;
2301         }
2302         if (rt->dst.dev &&
2303             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2304                 goto nla_put_failure;
2305 #ifdef CONFIG_IP_ROUTE_CLASSID
2306         if (rt->dst.tclassid &&
2307             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2308                 goto nla_put_failure;
2309 #endif
2310         if (!rt_is_input_route(rt) &&
2311             fl4->saddr != src) {
2312                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2313                         goto nla_put_failure;
2314         }
2315         if (rt->rt_uses_gateway &&
2316             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2317                 goto nla_put_failure;
2318
2319         expires = rt->dst.expires;
2320         if (expires) {
2321                 unsigned long now = jiffies;
2322
2323                 if (time_before(now, expires))
2324                         expires -= now;
2325                 else
2326                         expires = 0;
2327         }
2328
2329         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2330         if (rt->rt_pmtu && expires)
2331                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2332         if (rtnetlink_put_metrics(skb, metrics) < 0)
2333                 goto nla_put_failure;
2334
2335         if (fl4->flowi4_mark &&
2336             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2337                 goto nla_put_failure;
2338
2339         error = rt->dst.error;
2340
2341         if (rt_is_input_route(rt)) {
2342 #ifdef CONFIG_IP_MROUTE
2343                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2344                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2345                         int err = ipmr_get_route(net, skb,
2346                                                  fl4->saddr, fl4->daddr,
2347                                                  r, nowait);
2348                         if (err <= 0) {
2349                                 if (!nowait) {
2350                                         if (err == 0)
2351                                                 return 0;
2352                                         goto nla_put_failure;
2353                                 } else {
2354                                         if (err == -EMSGSIZE)
2355                                                 goto nla_put_failure;
2356                                         error = err;
2357                                 }
2358                         }
2359                 } else
2360 #endif
2361                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2362                                 goto nla_put_failure;
2363         }
2364
2365         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2366                 goto nla_put_failure;
2367
2368         return nlmsg_end(skb, nlh);
2369
2370 nla_put_failure:
2371         nlmsg_cancel(skb, nlh);
2372         return -EMSGSIZE;
2373 }
2374
2375 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2376 {
2377         struct net *net = sock_net(in_skb->sk);
2378         struct rtmsg *rtm;
2379         struct nlattr *tb[RTA_MAX+1];
2380         struct rtable *rt = NULL;
2381         struct flowi4 fl4;
2382         __be32 dst = 0;
2383         __be32 src = 0;
2384         u32 iif;
2385         int err;
2386         int mark;
2387         struct sk_buff *skb;
2388
2389         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2390         if (err < 0)
2391                 goto errout;
2392
2393         rtm = nlmsg_data(nlh);
2394
2395         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2396         if (skb == NULL) {
2397                 err = -ENOBUFS;
2398                 goto errout;
2399         }
2400
2401         /* Reserve room for dummy headers; this skb can pass
2402            through a good chunk of the routing engine.
2403          */
2404         skb_reset_mac_header(skb);
2405         skb_reset_network_header(skb);
2406
2407         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2408         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2409         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2410
2411         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2412         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2413         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2414         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2415
2416         memset(&fl4, 0, sizeof(fl4));
2417         fl4.daddr = dst;
2418         fl4.saddr = src;
2419         fl4.flowi4_tos = rtm->rtm_tos;
2420         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2421         fl4.flowi4_mark = mark;
2422
2423         if (iif) {
2424                 struct net_device *dev;
2425
2426                 dev = __dev_get_by_index(net, iif);
2427                 if (dev == NULL) {
2428                         err = -ENODEV;
2429                         goto errout_free;
2430                 }
2431
2432                 skb->protocol   = htons(ETH_P_IP);
2433                 skb->dev        = dev;
2434                 skb->mark       = mark;
2435                 local_bh_disable();
2436                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2437                 local_bh_enable();
2438
2439                 rt = skb_rtable(skb);
2440                 if (err == 0 && rt->dst.error)
2441                         err = -rt->dst.error;
2442         } else {
2443                 rt = ip_route_output_key(net, &fl4);
2444
2445                 err = 0;
2446                 if (IS_ERR(rt))
2447                         err = PTR_ERR(rt);
2448         }
2449
2450         if (err)
2451                 goto errout_free;
2452
2453         skb_dst_set(skb, &rt->dst);
2454         if (rtm->rtm_flags & RTM_F_NOTIFY)
2455                 rt->rt_flags |= RTCF_NOTIFY;
2456
2457         err = rt_fill_info(net, dst, src, &fl4, skb,
2458                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2459                            RTM_NEWROUTE, 0, 0);
2460         if (err <= 0)
2461                 goto errout_free;
2462
2463         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2464 errout:
2465         return err;
2466
2467 errout_free:
2468         kfree_skb(skb);
2469         goto errout;
2470 }
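/*
 * This is the kernel side of "ip route get": iproute2 sends an
 * RTM_GETROUTE request whose RTA_DST/RTA_SRC/RTA_IIF/RTA_OIF/RTA_MARK
 * attributes map onto the fields parsed above, e.g. (illustrative):
 *
 *	ip route get 10.1.2.3 from 10.0.0.1 iif eth0 mark 7
 *
 * The reply is the RTM_NEWROUTE message built by rt_fill_info().
 */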
2471
2472 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2473 {
2474         return skb->len;
2475 }
2476
2477 void ip_rt_multicast_event(struct in_device *in_dev)
2478 {
2479         rt_cache_flush(dev_net(in_dev->dev));
2480 }
2481
2482 #ifdef CONFIG_SYSCTL
2483 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2484 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
2485 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2486 static int ip_rt_gc_elasticity __read_mostly    = 8;
2487
2488 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2489                                         void __user *buffer,
2490                                         size_t *lenp, loff_t *ppos)
2491 {
2492         struct net *net = (struct net *)__ctl->extra1;
2493
2494         if (write) {
2495                 rt_cache_flush(net);
2496                 fnhe_genid_bump(net);
2497                 return 0;
2498         }
2499
2500         return -EINVAL;
2501 }
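/*
 * The handler above backs the write-only (mode 0200) file
 * /proc/sys/net/ipv4/route/flush registered via
 * ipv4_route_flush_table below. Writing anything to it, e.g.
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * retires all cached routes by bumping the rt and fnhe generation
 * counters; attempts to read it return EINVAL.
 */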
2502
2503 static struct ctl_table ipv4_route_table[] = {
2504         {
2505                 .procname       = "gc_thresh",
2506                 .data           = &ipv4_dst_ops.gc_thresh,
2507                 .maxlen         = sizeof(int),
2508                 .mode           = 0644,
2509                 .proc_handler   = proc_dointvec,
2510         },
2511         {
2512                 .procname       = "max_size",
2513                 .data           = &ip_rt_max_size,
2514                 .maxlen         = sizeof(int),
2515                 .mode           = 0644,
2516                 .proc_handler   = proc_dointvec,
2517         },
2518         {
2519                 /* Deprecated. Use gc_min_interval_ms */
2520
2521                 .procname       = "gc_min_interval",
2522                 .data           = &ip_rt_gc_min_interval,
2523                 .maxlen         = sizeof(int),
2524                 .mode           = 0644,
2525                 .proc_handler   = proc_dointvec_jiffies,
2526         },
2527         {
2528                 .procname       = "gc_min_interval_ms",
2529                 .data           = &ip_rt_gc_min_interval,
2530                 .maxlen         = sizeof(int),
2531                 .mode           = 0644,
2532                 .proc_handler   = proc_dointvec_ms_jiffies,
2533         },
2534         {
2535                 .procname       = "gc_timeout",
2536                 .data           = &ip_rt_gc_timeout,
2537                 .maxlen         = sizeof(int),
2538                 .mode           = 0644,
2539                 .proc_handler   = proc_dointvec_jiffies,
2540         },
2541         {
2542                 .procname       = "gc_interval",
2543                 .data           = &ip_rt_gc_interval,
2544                 .maxlen         = sizeof(int),
2545                 .mode           = 0644,
2546                 .proc_handler   = proc_dointvec_jiffies,
2547         },
2548         {
2549                 .procname       = "redirect_load",
2550                 .data           = &ip_rt_redirect_load,
2551                 .maxlen         = sizeof(int),
2552                 .mode           = 0644,
2553                 .proc_handler   = proc_dointvec,
2554         },
2555         {
2556                 .procname       = "redirect_number",
2557                 .data           = &ip_rt_redirect_number,
2558                 .maxlen         = sizeof(int),
2559                 .mode           = 0644,
2560                 .proc_handler   = proc_dointvec,
2561         },
2562         {
2563                 .procname       = "redirect_silence",
2564                 .data           = &ip_rt_redirect_silence,
2565                 .maxlen         = sizeof(int),
2566                 .mode           = 0644,
2567                 .proc_handler   = proc_dointvec,
2568         },
2569         {
2570                 .procname       = "error_cost",
2571                 .data           = &ip_rt_error_cost,
2572                 .maxlen         = sizeof(int),
2573                 .mode           = 0644,
2574                 .proc_handler   = proc_dointvec,
2575         },
2576         {
2577                 .procname       = "error_burst",
2578                 .data           = &ip_rt_error_burst,
2579                 .maxlen         = sizeof(int),
2580                 .mode           = 0644,
2581                 .proc_handler   = proc_dointvec,
2582         },
2583         {
2584                 .procname       = "gc_elasticity",
2585                 .data           = &ip_rt_gc_elasticity,
2586                 .maxlen         = sizeof(int),
2587                 .mode           = 0644,
2588                 .proc_handler   = proc_dointvec,
2589         },
2590         {
2591                 .procname       = "mtu_expires",
2592                 .data           = &ip_rt_mtu_expires,
2593                 .maxlen         = sizeof(int),
2594                 .mode           = 0644,
2595                 .proc_handler   = proc_dointvec_jiffies,
2596         },
2597         {
2598                 .procname       = "min_pmtu",
2599                 .data           = &ip_rt_min_pmtu,
2600                 .maxlen         = sizeof(int),
2601                 .mode           = 0644,
2602                 .proc_handler   = proc_dointvec,
2603         },
2604         {
2605                 .procname       = "min_adv_mss",
2606                 .data           = &ip_rt_min_advmss,
2607                 .maxlen         = sizeof(int),
2608                 .mode           = 0644,
2609                 .proc_handler   = proc_dointvec,
2610         },
2611         { }
2612 };
2613
2614 static struct ctl_table ipv4_route_flush_table[] = {
2615         {
2616                 .procname       = "flush",
2617                 .maxlen         = sizeof(int),
2618                 .mode           = 0200,
2619                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2620         },
2621         { },
2622 };
2623
2624 static __net_init int sysctl_route_net_init(struct net *net)
2625 {
2626         struct ctl_table *tbl;
2627
2628         tbl = ipv4_route_flush_table;
2629         if (!net_eq(net, &init_net)) {
2630                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2631                 if (tbl == NULL)
2632                         goto err_dup;
2633
2634                 /* Don't export sysctls to unprivileged users */
2635                 if (net->user_ns != &init_user_ns)
2636                         tbl[0].procname = NULL;
2637         }
2638         tbl[0].extra1 = net;
2639
2640         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2641         if (net->ipv4.route_hdr == NULL)
2642                 goto err_reg;
2643         return 0;
2644
2645 err_reg:
2646         if (tbl != ipv4_route_flush_table)
2647                 kfree(tbl);
2648 err_dup:
2649         return -ENOMEM;
2650 }
2651
2652 static __net_exit void sysctl_route_net_exit(struct net *net)
2653 {
2654         struct ctl_table *tbl;
2655
2656         tbl = net->ipv4.route_hdr->ctl_table_arg;
2657         unregister_net_sysctl_table(net->ipv4.route_hdr);
2658         BUG_ON(tbl == ipv4_route_flush_table);
2659         kfree(tbl);
2660 }
2661
2662 static __net_initdata struct pernet_operations sysctl_route_ops = {
2663         .init = sysctl_route_net_init,
2664         .exit = sysctl_route_net_exit,
2665 };
2666 #endif
2667
2668 static __net_init int rt_genid_init(struct net *net)
2669 {
2670         atomic_set(&net->ipv4.rt_genid, 0);
2671         atomic_set(&net->fnhe_genid, 0);
2672         get_random_bytes(&net->ipv4.dev_addr_genid,
2673                          sizeof(net->ipv4.dev_addr_genid));
2674         return 0;
2675 }
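/*
 * rt_genid is the lazy invalidation scheme behind every rt_genid
 * assignment seen earlier in this file: rt_cache_flush() bumps the
 * per-netns counter, and rt_is_expired() (defined earlier in this
 * file) compares a cached rtable's rt_genid against the current
 * value, so a flush retires every cached dst without walking them.
 */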
2676
2677 static __net_initdata struct pernet_operations rt_genid_ops = {
2678         .init = rt_genid_init,
2679 };
2680
2681 static int __net_init ipv4_inetpeer_init(struct net *net)
2682 {
2683         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2684
2685         if (!bp)
2686                 return -ENOMEM;
2687         inet_peer_base_init(bp);
2688         net->ipv4.peers = bp;
2689         return 0;
2690 }
2691
2692 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2693 {
2694         struct inet_peer_base *bp = net->ipv4.peers;
2695
2696         net->ipv4.peers = NULL;
2697         inetpeer_invalidate_tree(bp);
2698         kfree(bp);
2699 }
2700
2701 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2702         .init   =       ipv4_inetpeer_init,
2703         .exit   =       ipv4_inetpeer_exit,
2704 };
2705
2706 #ifdef CONFIG_IP_ROUTE_CLASSID
2707 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2708 #endif /* CONFIG_IP_ROUTE_CLASSID */
2709
2710 int __init ip_rt_init(void)
2711 {
2712         int rc = 0;
2713
2714 #ifdef CONFIG_IP_ROUTE_CLASSID
2715         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2716         if (!ip_rt_acct)
2717                 panic("IP: failed to allocate ip_rt_acct\n");
2718 #endif
2719
2720         ipv4_dst_ops.kmem_cachep =
2721                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2722                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2723
2724         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2725
2726         if (dst_entries_init(&ipv4_dst_ops) < 0)
2727                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2728
2729         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2730                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2731
2732         ipv4_dst_ops.gc_thresh = ~0;
2733         ip_rt_max_size = INT_MAX;
2734
2735         devinet_init();
2736         ip_fib_init();
2737
2738         if (ip_rt_proc_init())
2739                 pr_err("Unable to create route proc files\n");
2740 #ifdef CONFIG_XFRM
2741         xfrm_init();
2742         xfrm4_init();
2743 #endif
2744         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2745
2746 #ifdef CONFIG_SYSCTL
2747         register_pernet_subsys(&sysctl_route_ops);
2748 #endif
2749         register_pernet_subsys(&rt_genid_ops);
2750         register_pernet_subsys(&ipv4_inetpeer_ops);
2751         return rc;
2752 }
2753
2754 #ifdef CONFIG_SYSCTL
2755 /*
2756  * We really need to sanitize the damn ipv4 init order, then all
2757  * this nonsense will go away.
2758  */
2759 void __init ip_static_sysctl_init(void)
2760 {
2761         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2762 }
2763 #endif