Merge tag 'hwmon-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/groeck...
[linux-drm-fsl-dcu.git] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
111
/* TOS bits of a flow key masked down to the routing bits plus RTO_ONLINK. */
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (RTO_ONLINK | IPTOS_RT_MASK))

/* The IPv4 datagram length lives in a 16-bit header field (tot_len). */
#define IP_MAX_MTU	0xFFFF

#define RT_GC_TIMEOUT	(300 * HZ)
119
120 static int ip_rt_max_size;
121 static int ip_rt_redirect_number __read_mostly  = 9;
122 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
123 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
124 static int ip_rt_error_cost __read_mostly       = HZ;
125 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
126 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
127 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
128 static int ip_rt_min_advmss __read_mostly       = 256;
129
130 /*
131  *      Interface to generic destination cache.
132  */
133
134 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
135 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
136 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
137 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
138 static void              ipv4_link_failure(struct sk_buff *skb);
139 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
140                                            struct sk_buff *skb, u32 mtu);
141 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
142                                         struct sk_buff *skb);
143 static void             ipv4_dst_destroy(struct dst_entry *dst);
144
/* dst_ops.ifdown callback: deliberately a no-op for IPv4 routes. */
static void ipv4_dst_ifdown(struct dst_entry *dst,
			    struct net_device *dev, int how)
{
	/* nothing to do */
}
149
150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
151 {
152         WARN_ON(1);
153         return NULL;
154 }
155
/* Defined further down; declared here so ipv4_dst_ops can reference it. */
static struct neighbour *
ipv4_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb,
		  const void *daddr);
159
160 static struct dst_ops ipv4_dst_ops = {
161         .family =               AF_INET,
162         .protocol =             cpu_to_be16(ETH_P_IP),
163         .check =                ipv4_dst_check,
164         .default_advmss =       ipv4_default_advmss,
165         .mtu =                  ipv4_mtu,
166         .cow_metrics =          ipv4_cow_metrics,
167         .destroy =              ipv4_dst_destroy,
168         .ifdown =               ipv4_dst_ifdown,
169         .negative_advice =      ipv4_negative_advice,
170         .link_failure =         ipv4_link_failure,
171         .update_pmtu =          ip_rt_update_pmtu,
172         .redirect =             ip_do_redirect,
173         .local_out =            __ip_local_out,
174         .neigh_lookup =         ipv4_neigh_lookup,
175 };
176
177 #define ECN_OR_COST(class)      TC_PRIO_##class
178
179 const __u8 ip_tos2prio[16] = {
180         TC_PRIO_BESTEFFORT,
181         ECN_OR_COST(BESTEFFORT),
182         TC_PRIO_BESTEFFORT,
183         ECN_OR_COST(BESTEFFORT),
184         TC_PRIO_BULK,
185         ECN_OR_COST(BULK),
186         TC_PRIO_BULK,
187         ECN_OR_COST(BULK),
188         TC_PRIO_INTERACTIVE,
189         ECN_OR_COST(INTERACTIVE),
190         TC_PRIO_INTERACTIVE,
191         ECN_OR_COST(INTERACTIVE),
192         TC_PRIO_INTERACTIVE_BULK,
193         ECN_OR_COST(INTERACTIVE_BULK),
194         TC_PRIO_INTERACTIVE_BULK,
195         ECN_OR_COST(INTERACTIVE_BULK)
196 };
197 EXPORT_SYMBOL(ip_tos2prio);
198
199 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
200 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
201
202 #ifdef CONFIG_PROC_FS
203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204 {
205         if (*pos)
206                 return NULL;
207         return SEQ_START_TOKEN;
208 }
209
210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211 {
212         ++*pos;
213         return NULL;
214 }
215
/* seq_file stop callback: nothing was acquired in start, so nothing to undo. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	/* empty on purpose */
}
219
220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
221 {
222         if (v == SEQ_START_TOKEN)
223                 seq_printf(seq, "%-127s\n",
224                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
225                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
226                            "HHUptod\tSpecDst");
227         return 0;
228 }
229
230 static const struct seq_operations rt_cache_seq_ops = {
231         .start  = rt_cache_seq_start,
232         .next   = rt_cache_seq_next,
233         .stop   = rt_cache_seq_stop,
234         .show   = rt_cache_seq_show,
235 };
236
237 static int rt_cache_seq_open(struct inode *inode, struct file *file)
238 {
239         return seq_open(file, &rt_cache_seq_ops);
240 }
241
242 static const struct file_operations rt_cache_seq_fops = {
243         .owner   = THIS_MODULE,
244         .open    = rt_cache_seq_open,
245         .read    = seq_read,
246         .llseek  = seq_lseek,
247         .release = seq_release,
248 };
249
250
251 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
252 {
253         int cpu;
254
255         if (*pos == 0)
256                 return SEQ_START_TOKEN;
257
258         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
259                 if (!cpu_possible(cpu))
260                         continue;
261                 *pos = cpu+1;
262                 return &per_cpu(rt_cache_stat, cpu);
263         }
264         return NULL;
265 }
266
267 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
268 {
269         int cpu;
270
271         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
272                 if (!cpu_possible(cpu))
273                         continue;
274                 *pos = cpu+1;
275                 return &per_cpu(rt_cache_stat, cpu);
276         }
277         return NULL;
278
279 }
280
/* seq_file stop callback: start/next hold no resources, so this is empty. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
	/* empty on purpose */
}
285
286 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
287 {
288         struct rt_cache_stat *st = v;
289
290         if (v == SEQ_START_TOKEN) {
291                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
292                 return 0;
293         }
294
295         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
296                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
297                    dst_entries_get_slow(&ipv4_dst_ops),
298                    0, /* st->in_hit */
299                    st->in_slow_tot,
300                    st->in_slow_mc,
301                    st->in_no_route,
302                    st->in_brd,
303                    st->in_martian_dst,
304                    st->in_martian_src,
305
306                    0, /* st->out_hit */
307                    st->out_slow_tot,
308                    st->out_slow_mc,
309
310                    0, /* st->gc_total */
311                    0, /* st->gc_ignored */
312                    0, /* st->gc_goal_miss */
313                    0, /* st->gc_dst_overflow */
314                    0, /* st->in_hlist_search */
315                    0  /* st->out_hlist_search */
316                 );
317         return 0;
318 }
319
320 static const struct seq_operations rt_cpu_seq_ops = {
321         .start  = rt_cpu_seq_start,
322         .next   = rt_cpu_seq_next,
323         .stop   = rt_cpu_seq_stop,
324         .show   = rt_cpu_seq_show,
325 };
326
327
328 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
329 {
330         return seq_open(file, &rt_cpu_seq_ops);
331 }
332
333 static const struct file_operations rt_cpu_seq_fops = {
334         .owner   = THIS_MODULE,
335         .open    = rt_cpu_seq_open,
336         .read    = seq_read,
337         .llseek  = seq_lseek,
338         .release = seq_release,
339 };
340
341 #ifdef CONFIG_IP_ROUTE_CLASSID
342 static int rt_acct_proc_show(struct seq_file *m, void *v)
343 {
344         struct ip_rt_acct *dst, *src;
345         unsigned int i, j;
346
347         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
348         if (!dst)
349                 return -ENOMEM;
350
351         for_each_possible_cpu(i) {
352                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
353                 for (j = 0; j < 256; j++) {
354                         dst[j].o_bytes   += src[j].o_bytes;
355                         dst[j].o_packets += src[j].o_packets;
356                         dst[j].i_bytes   += src[j].i_bytes;
357                         dst[j].i_packets += src[j].i_packets;
358                 }
359         }
360
361         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
362         kfree(dst);
363         return 0;
364 }
365
366 static int rt_acct_proc_open(struct inode *inode, struct file *file)
367 {
368         return single_open(file, rt_acct_proc_show, NULL);
369 }
370
371 static const struct file_operations rt_acct_proc_fops = {
372         .owner          = THIS_MODULE,
373         .open           = rt_acct_proc_open,
374         .read           = seq_read,
375         .llseek         = seq_lseek,
376         .release        = single_release,
377 };
378 #endif
379
380 static int __net_init ip_rt_do_proc_init(struct net *net)
381 {
382         struct proc_dir_entry *pde;
383
384         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
385                           &rt_cache_seq_fops);
386         if (!pde)
387                 goto err1;
388
389         pde = proc_create("rt_cache", S_IRUGO,
390                           net->proc_net_stat, &rt_cpu_seq_fops);
391         if (!pde)
392                 goto err2;
393
394 #ifdef CONFIG_IP_ROUTE_CLASSID
395         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
396         if (!pde)
397                 goto err3;
398 #endif
399         return 0;
400
401 #ifdef CONFIG_IP_ROUTE_CLASSID
402 err3:
403         remove_proc_entry("rt_cache", net->proc_net_stat);
404 #endif
405 err2:
406         remove_proc_entry("rt_cache", net->proc_net);
407 err1:
408         return -ENOMEM;
409 }
410
411 static void __net_exit ip_rt_do_proc_exit(struct net *net)
412 {
413         remove_proc_entry("rt_cache", net->proc_net_stat);
414         remove_proc_entry("rt_cache", net->proc_net);
415 #ifdef CONFIG_IP_ROUTE_CLASSID
416         remove_proc_entry("rt_acct", net->proc_net);
417 #endif
418 }
419
420 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
421         .init = ip_rt_do_proc_init,
422         .exit = ip_rt_do_proc_exit,
423 };
424
425 static int __init ip_rt_proc_init(void)
426 {
427         return register_pernet_subsys(&ip_rt_proc_ops);
428 }
429
430 #else
/* Stub used when CONFIG_PROC_FS is off: nothing to register, report success. */
static inline int ip_rt_proc_init(void)
{
	const int ok = 0;

	return ok;
}
435 #endif /* CONFIG_PROC_FS */
436
437 static inline bool rt_is_expired(const struct rtable *rth)
438 {
439         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
440 }
441
/*
 * Flush cached routes of @net by bumping its IPv4 generation id; routes
 * carrying the old id then fail the rt_is_expired() comparison.
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
446
447 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
448                                            struct sk_buff *skb,
449                                            const void *daddr)
450 {
451         struct net_device *dev = dst->dev;
452         const __be32 *pkey = daddr;
453         const struct rtable *rt;
454         struct neighbour *n;
455
456         rt = (const struct rtable *) dst;
457         if (rt->rt_gateway)
458                 pkey = (const __be32 *) &rt->rt_gateway;
459         else if (skb)
460                 pkey = &ip_hdr(skb)->daddr;
461
462         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
463         if (n)
464                 return n;
465         return neigh_create(&arp_tbl, pkey, dev);
466 }
467
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However,
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that stays unique for a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
475 static void ip_select_fb_ident(struct iphdr *iph)
476 {
477         static DEFINE_SPINLOCK(ip_fb_id_lock);
478         static u32 ip_fallback_id;
479         u32 salt;
480
481         spin_lock_bh(&ip_fb_id_lock);
482         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
483         iph->id = htons(salt & 0xFFFF);
484         ip_fallback_id = salt;
485         spin_unlock_bh(&ip_fb_id_lock);
486 }
487
488 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
489 {
490         struct net *net = dev_net(dst->dev);
491         struct inet_peer *peer;
492
493         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
494         if (peer) {
495                 iph->id = htons(inet_getid(peer, more));
496                 inet_putpeer(peer);
497                 return;
498         }
499
500         ip_select_fb_ident(iph);
501 }
502 EXPORT_SYMBOL(__ip_select_ident);
503
504 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
505                              const struct iphdr *iph,
506                              int oif, u8 tos,
507                              u8 prot, u32 mark, int flow_flags)
508 {
509         if (sk) {
510                 const struct inet_sock *inet = inet_sk(sk);
511
512                 oif = sk->sk_bound_dev_if;
513                 mark = sk->sk_mark;
514                 tos = RT_CONN_FLAGS(sk);
515                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
516         }
517         flowi4_init_output(fl4, oif, mark, tos,
518                            RT_SCOPE_UNIVERSE, prot,
519                            flow_flags,
520                            iph->daddr, iph->saddr, 0, 0);
521 }
522
523 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
524                                const struct sock *sk)
525 {
526         const struct iphdr *iph = ip_hdr(skb);
527         int oif = skb->dev->ifindex;
528         u8 tos = RT_TOS(iph->tos);
529         u8 prot = iph->protocol;
530         u32 mark = skb->mark;
531
532         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
533 }
534
535 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
536 {
537         const struct inet_sock *inet = inet_sk(sk);
538         const struct ip_options_rcu *inet_opt;
539         __be32 daddr = inet->inet_daddr;
540
541         rcu_read_lock();
542         inet_opt = rcu_dereference(inet->inet_opt);
543         if (inet_opt && inet_opt->opt.srr)
544                 daddr = inet_opt->opt.faddr;
545         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
546                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
547                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
548                            inet_sk_flowi_flags(sk),
549                            daddr, inet->inet_saddr, 0, 0);
550         rcu_read_unlock();
551 }
552
/* Build a flow key from @skb when one is supplied, else from @sk alone. */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}
	build_skb_flow_key(fl4, skb, sk);
}
561
562 static inline void rt_free(struct rtable *rt)
563 {
564         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
565 }
566
567 static DEFINE_SPINLOCK(fnhe_lock);
568
569 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
570 {
571         struct rtable *rt;
572
573         rt = rcu_dereference(fnhe->fnhe_rth_input);
574         if (rt) {
575                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
576                 rt_free(rt);
577         }
578         rt = rcu_dereference(fnhe->fnhe_rth_output);
579         if (rt) {
580                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
581                 rt_free(rt);
582         }
583 }
584
585 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
586 {
587         struct fib_nh_exception *fnhe, *oldest;
588
589         oldest = rcu_dereference(hash->chain);
590         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
591              fnhe = rcu_dereference(fnhe->fnhe_next)) {
592                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
593                         oldest = fnhe;
594         }
595         fnhe_flush_routes(oldest);
596         return oldest;
597 }
598
599 static inline u32 fnhe_hashfun(__be32 daddr)
600 {
601         u32 hval;
602
603         hval = (__force u32) daddr;
604         hval ^= (hval >> 11) ^ (hval >> 22);
605
606         return hval & (FNHE_HASH_SIZE - 1);
607 }
608
609 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
610 {
611         rt->rt_pmtu = fnhe->fnhe_pmtu;
612         rt->dst.expires = fnhe->fnhe_expires;
613
614         if (fnhe->fnhe_gw) {
615                 rt->rt_flags |= RTCF_REDIRECTED;
616                 rt->rt_gateway = fnhe->fnhe_gw;
617                 rt->rt_uses_gateway = 1;
618         }
619 }
620
621 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
622                                   u32 pmtu, unsigned long expires)
623 {
624         struct fnhe_hash_bucket *hash;
625         struct fib_nh_exception *fnhe;
626         struct rtable *rt;
627         unsigned int i;
628         int depth;
629         u32 hval = fnhe_hashfun(daddr);
630
631         spin_lock_bh(&fnhe_lock);
632
633         hash = nh->nh_exceptions;
634         if (!hash) {
635                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
636                 if (!hash)
637                         goto out_unlock;
638                 nh->nh_exceptions = hash;
639         }
640
641         hash += hval;
642
643         depth = 0;
644         for (fnhe = rcu_dereference(hash->chain); fnhe;
645              fnhe = rcu_dereference(fnhe->fnhe_next)) {
646                 if (fnhe->fnhe_daddr == daddr)
647                         break;
648                 depth++;
649         }
650
651         if (fnhe) {
652                 if (gw)
653                         fnhe->fnhe_gw = gw;
654                 if (pmtu) {
655                         fnhe->fnhe_pmtu = pmtu;
656                         fnhe->fnhe_expires = max(1UL, expires);
657                 }
658                 /* Update all cached dsts too */
659                 rt = rcu_dereference(fnhe->fnhe_rth_input);
660                 if (rt)
661                         fill_route_from_fnhe(rt, fnhe);
662                 rt = rcu_dereference(fnhe->fnhe_rth_output);
663                 if (rt)
664                         fill_route_from_fnhe(rt, fnhe);
665         } else {
666                 if (depth > FNHE_RECLAIM_DEPTH)
667                         fnhe = fnhe_oldest(hash);
668                 else {
669                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
670                         if (!fnhe)
671                                 goto out_unlock;
672
673                         fnhe->fnhe_next = hash->chain;
674                         rcu_assign_pointer(hash->chain, fnhe);
675                 }
676                 fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
677                 fnhe->fnhe_daddr = daddr;
678                 fnhe->fnhe_gw = gw;
679                 fnhe->fnhe_pmtu = pmtu;
680                 fnhe->fnhe_expires = expires;
681
682                 /* Exception created; mark the cached routes for the nexthop
683                  * stale, so anyone caching it rechecks if this exception
684                  * applies to them.
685                  */
686                 rt = rcu_dereference(nh->nh_rth_input);
687                 if (rt)
688                         rt->dst.obsolete = DST_OBSOLETE_KILL;
689
690                 for_each_possible_cpu(i) {
691                         struct rtable __rcu **prt;
692                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
693                         rt = rcu_dereference(*prt);
694                         if (rt)
695                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
696                 }
697         }
698
699         fnhe->fnhe_stamp = jiffies;
700
701 out_unlock:
702         spin_unlock_bh(&fnhe_lock);
703         return;
704 }
705
706 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
707                              bool kill_route)
708 {
709         __be32 new_gw = icmp_hdr(skb)->un.gateway;
710         __be32 old_gw = ip_hdr(skb)->saddr;
711         struct net_device *dev = skb->dev;
712         struct in_device *in_dev;
713         struct fib_result res;
714         struct neighbour *n;
715         struct net *net;
716
717         switch (icmp_hdr(skb)->code & 7) {
718         case ICMP_REDIR_NET:
719         case ICMP_REDIR_NETTOS:
720         case ICMP_REDIR_HOST:
721         case ICMP_REDIR_HOSTTOS:
722                 break;
723
724         default:
725                 return;
726         }
727
728         if (rt->rt_gateway != old_gw)
729                 return;
730
731         in_dev = __in_dev_get_rcu(dev);
732         if (!in_dev)
733                 return;
734
735         net = dev_net(dev);
736         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
737             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
738             ipv4_is_zeronet(new_gw))
739                 goto reject_redirect;
740
741         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
742                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
743                         goto reject_redirect;
744                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
745                         goto reject_redirect;
746         } else {
747                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
748                         goto reject_redirect;
749         }
750
751         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
752         if (n) {
753                 if (!(n->nud_state & NUD_VALID)) {
754                         neigh_event_send(n, NULL);
755                 } else {
756                         if (fib_lookup(net, fl4, &res) == 0) {
757                                 struct fib_nh *nh = &FIB_RES_NH(res);
758
759                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
760                                                       0, 0);
761                         }
762                         if (kill_route)
763                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
764                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
765                 }
766                 neigh_release(n);
767         }
768         return;
769
770 reject_redirect:
771 #ifdef CONFIG_IP_ROUTE_VERBOSE
772         if (IN_DEV_LOG_MARTIANS(in_dev)) {
773                 const struct iphdr *iph = (const struct iphdr *) skb->data;
774                 __be32 daddr = iph->daddr;
775                 __be32 saddr = iph->saddr;
776
777                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
778                                      "  Advised path = %pI4 -> %pI4\n",
779                                      &old_gw, dev->name, &new_gw,
780                                      &saddr, &daddr);
781         }
782 #endif
783         ;
784 }
785
786 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
787 {
788         struct rtable *rt;
789         struct flowi4 fl4;
790         const struct iphdr *iph = (const struct iphdr *) skb->data;
791         int oif = skb->dev->ifindex;
792         u8 tos = RT_TOS(iph->tos);
793         u8 prot = iph->protocol;
794         u32 mark = skb->mark;
795
796         rt = (struct rtable *) dst;
797
798         __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
799         __ip_do_redirect(rt, skb, &fl4, true);
800 }
801
802 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
803 {
804         struct rtable *rt = (struct rtable *)dst;
805         struct dst_entry *ret = dst;
806
807         if (rt) {
808                 if (dst->obsolete > 0) {
809                         ip_rt_put(rt);
810                         ret = NULL;
811                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
812                            rt->dst.expires) {
813                         ip_rt_put(rt);
814                         ret = NULL;
815                 }
816         }
817         return ret;
818 }
819
820 /*
821  * Algorithm:
822  *      1. The first ip_rt_redirect_number redirects are sent
823  *         with exponential backoff, then we stop sending them at all,
824  *         assuming that the host ignores our redirects.
825  *      2. If we did not see packets requiring redirects
826  *         during ip_rt_redirect_silence, we assume that the host
827  *         forgot redirected route and start to send redirects again.
828  *
829  * This algorithm is much cheaper and more intelligent than dumb load limiting
830  * in icmp.c.
831  *
832  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
833  * and "frag. need" (breaks PMTU discovery) in icmp.c.
834  */
835
836 void ip_rt_send_redirect(struct sk_buff *skb)
837 {
838         struct rtable *rt = skb_rtable(skb);
839         struct in_device *in_dev;
840         struct inet_peer *peer;
841         struct net *net;
842         int log_martians;
843
844         rcu_read_lock();
845         in_dev = __in_dev_get_rcu(rt->dst.dev);
846         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
847                 rcu_read_unlock();
848                 return;
849         }
850         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
851         rcu_read_unlock();
852
853         net = dev_net(rt->dst.dev);
854         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
855         if (!peer) {
856                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
857                           rt_nexthop(rt, ip_hdr(skb)->daddr));
858                 return;
859         }
860
861         /* No redirected packets during ip_rt_redirect_silence;
862          * reset the algorithm.
863          */
864         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
865                 peer->rate_tokens = 0;
866
867         /* Too many ignored redirects; do not send anything
868          * set dst.rate_last to the last seen redirected packet.
869          */
870         if (peer->rate_tokens >= ip_rt_redirect_number) {
871                 peer->rate_last = jiffies;
872                 goto out_put_peer;
873         }
874
875         /* Check for load limit; set rate_last to the latest sent
876          * redirect.
877          */
878         if (peer->rate_tokens == 0 ||
879             time_after(jiffies,
880                        (peer->rate_last +
881                         (ip_rt_redirect_load << peer->rate_tokens)))) {
882                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
883
884                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
885                 peer->rate_last = jiffies;
886                 ++peer->rate_tokens;
887 #ifdef CONFIG_IP_ROUTE_VERBOSE
888                 if (log_martians &&
889                     peer->rate_tokens == ip_rt_redirect_number)
890                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
891                                              &ip_hdr(skb)->saddr, inet_iif(skb),
892                                              &ip_hdr(skb)->daddr, &gw);
893 #endif
894         }
895 out_put_peer:
896         inet_putpeer(peer);
897 }
898
899 static int ip_error(struct sk_buff *skb)
900 {
901         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
902         struct rtable *rt = skb_rtable(skb);
903         struct inet_peer *peer;
904         unsigned long now;
905         struct net *net;
906         bool send;
907         int code;
908
909         net = dev_net(rt->dst.dev);
910         if (!IN_DEV_FORWARD(in_dev)) {
911                 switch (rt->dst.error) {
912                 case EHOSTUNREACH:
913                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
914                         break;
915
916                 case ENETUNREACH:
917                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
918                         break;
919                 }
920                 goto out;
921         }
922
923         switch (rt->dst.error) {
924         case EINVAL:
925         default:
926                 goto out;
927         case EHOSTUNREACH:
928                 code = ICMP_HOST_UNREACH;
929                 break;
930         case ENETUNREACH:
931                 code = ICMP_NET_UNREACH;
932                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
933                 break;
934         case EACCES:
935                 code = ICMP_PKT_FILTERED;
936                 break;
937         }
938
939         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
940
941         send = true;
942         if (peer) {
943                 now = jiffies;
944                 peer->rate_tokens += now - peer->rate_last;
945                 if (peer->rate_tokens > ip_rt_error_burst)
946                         peer->rate_tokens = ip_rt_error_burst;
947                 peer->rate_last = now;
948                 if (peer->rate_tokens >= ip_rt_error_cost)
949                         peer->rate_tokens -= ip_rt_error_cost;
950                 else
951                         send = false;
952                 inet_putpeer(peer);
953         }
954         if (send)
955                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
956
957 out:    kfree_skb(skb);
958         return 0;
959 }
960
961 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
962 {
963         struct dst_entry *dst = &rt->dst;
964         struct fib_result res;
965
966         if (dst_metric_locked(dst, RTAX_MTU))
967                 return;
968
969         if (dst->dev->mtu < mtu)
970                 return;
971
972         if (mtu < ip_rt_min_pmtu)
973                 mtu = ip_rt_min_pmtu;
974
975         if (rt->rt_pmtu == mtu &&
976             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
977                 return;
978
979         rcu_read_lock();
980         if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
981                 struct fib_nh *nh = &FIB_RES_NH(res);
982
983                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
984                                       jiffies + ip_rt_mtu_expires);
985         }
986         rcu_read_unlock();
987 }
988
989 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
990                               struct sk_buff *skb, u32 mtu)
991 {
992         struct rtable *rt = (struct rtable *) dst;
993         struct flowi4 fl4;
994
995         ip_rt_build_flow_key(&fl4, sk, skb);
996         __ip_rt_update_pmtu(rt, &fl4, mtu);
997 }
998
999 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1000                       int oif, u32 mark, u8 protocol, int flow_flags)
1001 {
1002         const struct iphdr *iph = (const struct iphdr *) skb->data;
1003         struct flowi4 fl4;
1004         struct rtable *rt;
1005
1006         __build_flow_key(&fl4, NULL, iph, oif,
1007                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1008         rt = __ip_route_output_key(net, &fl4);
1009         if (!IS_ERR(rt)) {
1010                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1011                 ip_rt_put(rt);
1012         }
1013 }
1014 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1015
1016 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1017 {
1018         const struct iphdr *iph = (const struct iphdr *) skb->data;
1019         struct flowi4 fl4;
1020         struct rtable *rt;
1021
1022         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1023         rt = __ip_route_output_key(sock_net(sk), &fl4);
1024         if (!IS_ERR(rt)) {
1025                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1026                 ip_rt_put(rt);
1027         }
1028 }
1029
1030 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1031 {
1032         const struct iphdr *iph = (const struct iphdr *) skb->data;
1033         struct flowi4 fl4;
1034         struct rtable *rt;
1035         struct dst_entry *dst;
1036         bool new = false;
1037
1038         bh_lock_sock(sk);
1039
1040         if (!ip_sk_accept_pmtu(sk))
1041                 goto out;
1042
1043         rt = (struct rtable *) __sk_dst_get(sk);
1044
1045         if (sock_owned_by_user(sk) || !rt) {
1046                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1047                 goto out;
1048         }
1049
1050         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1051
1052         if (!__sk_dst_check(sk, 0)) {
1053                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1054                 if (IS_ERR(rt))
1055                         goto out;
1056
1057                 new = true;
1058         }
1059
1060         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1061
1062         dst = dst_check(&rt->dst, 0);
1063         if (!dst) {
1064                 if (new)
1065                         dst_release(&rt->dst);
1066
1067                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1068                 if (IS_ERR(rt))
1069                         goto out;
1070
1071                 new = true;
1072         }
1073
1074         if (new)
1075                 __sk_dst_set(sk, &rt->dst);
1076
1077 out:
1078         bh_unlock_sock(sk);
1079 }
1080 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1081
1082 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1083                    int oif, u32 mark, u8 protocol, int flow_flags)
1084 {
1085         const struct iphdr *iph = (const struct iphdr *) skb->data;
1086         struct flowi4 fl4;
1087         struct rtable *rt;
1088
1089         __build_flow_key(&fl4, NULL, iph, oif,
1090                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1091         rt = __ip_route_output_key(net, &fl4);
1092         if (!IS_ERR(rt)) {
1093                 __ip_do_redirect(rt, skb, &fl4, false);
1094                 ip_rt_put(rt);
1095         }
1096 }
1097 EXPORT_SYMBOL_GPL(ipv4_redirect);
1098
1099 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1100 {
1101         const struct iphdr *iph = (const struct iphdr *) skb->data;
1102         struct flowi4 fl4;
1103         struct rtable *rt;
1104
1105         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1106         rt = __ip_route_output_key(sock_net(sk), &fl4);
1107         if (!IS_ERR(rt)) {
1108                 __ip_do_redirect(rt, skb, &fl4, false);
1109                 ip_rt_put(rt);
1110         }
1111 }
1112 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1113
1114 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1115 {
1116         struct rtable *rt = (struct rtable *) dst;
1117
1118         /* All IPV4 dsts are created with ->obsolete set to the value
1119          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1120          * into this function always.
1121          *
1122          * When a PMTU/redirect information update invalidates a route,
1123          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1124          * DST_OBSOLETE_DEAD by dst_free().
1125          */
1126         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1127                 return NULL;
1128         return dst;
1129 }
1130
1131 static void ipv4_link_failure(struct sk_buff *skb)
1132 {
1133         struct rtable *rt;
1134
1135         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1136
1137         rt = skb_rtable(skb);
1138         if (rt)
1139                 dst_set_expires(&rt->dst, 0);
1140 }
1141
1142 static int ip_rt_bug(struct sk_buff *skb)
1143 {
1144         pr_debug("%s: %pI4 -> %pI4, %s\n",
1145                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1146                  skb->dev ? skb->dev->name : "?");
1147         kfree_skb(skb);
1148         WARN_ON(1);
1149         return 0;
1150 }
1151
1152 /*
1153    We do not cache source address of outgoing interface,
1154    because it is used only by IP RR, TS and SRR options,
1155    so that it out of fast path.
1156
1157    BTW remember: "addr" is allowed to be not aligned
1158    in IP options!
1159  */
1160
1161 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1162 {
1163         __be32 src;
1164
1165         if (rt_is_output_route(rt))
1166                 src = ip_hdr(skb)->saddr;
1167         else {
1168                 struct fib_result res;
1169                 struct flowi4 fl4;
1170                 struct iphdr *iph;
1171
1172                 iph = ip_hdr(skb);
1173
1174                 memset(&fl4, 0, sizeof(fl4));
1175                 fl4.daddr = iph->daddr;
1176                 fl4.saddr = iph->saddr;
1177                 fl4.flowi4_tos = RT_TOS(iph->tos);
1178                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1179                 fl4.flowi4_iif = skb->dev->ifindex;
1180                 fl4.flowi4_mark = skb->mark;
1181
1182                 rcu_read_lock();
1183                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1184                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1185                 else
1186                         src = inet_select_addr(rt->dst.dev,
1187                                                rt_nexthop(rt, iph->daddr),
1188                                                RT_SCOPE_UNIVERSE);
1189                 rcu_read_unlock();
1190         }
1191         memcpy(addr, &src, 4);
1192 }
1193
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Fill in the route's tclassid from @tag, one 16-bit half at a time:
 * a half is taken from @tag only if it is still zero in the route.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((rt->dst.tclassid & low_half) == 0)
		rt->dst.tclassid |= tag & low_half;

	if ((rt->dst.tclassid & high_half) == 0)
		rt->dst.tclassid |= tag & high_half;
}
#endif
1203
1204 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1205 {
1206         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1207
1208         if (advmss == 0) {
1209                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1210                                ip_rt_min_advmss);
1211                 if (advmss > 65535 - 40)
1212                         advmss = 65535 - 40;
1213         }
1214         return advmss;
1215 }
1216
1217 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1218 {
1219         const struct rtable *rt = (const struct rtable *) dst;
1220         unsigned int mtu = rt->rt_pmtu;
1221
1222         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1223                 mtu = dst_metric_raw(dst, RTAX_MTU);
1224
1225         if (mtu)
1226                 return mtu;
1227
1228         mtu = dst->dev->mtu;
1229
1230         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1231                 if (rt->rt_uses_gateway && mtu > 576)
1232                         mtu = 576;
1233         }
1234
1235         return min_t(unsigned int, mtu, IP_MAX_MTU);
1236 }
1237
1238 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1239 {
1240         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1241         struct fib_nh_exception *fnhe;
1242         u32 hval;
1243
1244         if (!hash)
1245                 return NULL;
1246
1247         hval = fnhe_hashfun(daddr);
1248
1249         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1250              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1251                 if (fnhe->fnhe_daddr == daddr)
1252                         return fnhe;
1253         }
1254         return NULL;
1255 }
1256
1257 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1258                               __be32 daddr)
1259 {
1260         bool ret = false;
1261
1262         spin_lock_bh(&fnhe_lock);
1263
1264         if (daddr == fnhe->fnhe_daddr) {
1265                 struct rtable __rcu **porig;
1266                 struct rtable *orig;
1267                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1268
1269                 if (rt_is_input_route(rt))
1270                         porig = &fnhe->fnhe_rth_input;
1271                 else
1272                         porig = &fnhe->fnhe_rth_output;
1273                 orig = rcu_dereference(*porig);
1274
1275                 if (fnhe->fnhe_genid != genid) {
1276                         fnhe->fnhe_genid = genid;
1277                         fnhe->fnhe_gw = 0;
1278                         fnhe->fnhe_pmtu = 0;
1279                         fnhe->fnhe_expires = 0;
1280                         fnhe_flush_routes(fnhe);
1281                         orig = NULL;
1282                 }
1283                 fill_route_from_fnhe(rt, fnhe);
1284                 if (!rt->rt_gateway)
1285                         rt->rt_gateway = daddr;
1286
1287                 if (!(rt->dst.flags & DST_NOCACHE)) {
1288                         rcu_assign_pointer(*porig, rt);
1289                         if (orig)
1290                                 rt_free(orig);
1291                         ret = true;
1292                 }
1293
1294                 fnhe->fnhe_stamp = jiffies;
1295         }
1296         spin_unlock_bh(&fnhe_lock);
1297
1298         return ret;
1299 }
1300
1301 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1302 {
1303         struct rtable *orig, *prev, **p;
1304         bool ret = true;
1305
1306         if (rt_is_input_route(rt)) {
1307                 p = (struct rtable **)&nh->nh_rth_input;
1308         } else {
1309                 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1310         }
1311         orig = *p;
1312
1313         prev = cmpxchg(p, orig, rt);
1314         if (prev == orig) {
1315                 if (orig)
1316                         rt_free(orig);
1317         } else
1318                 ret = false;
1319
1320         return ret;
1321 }
1322
1323 static DEFINE_SPINLOCK(rt_uncached_lock);
1324 static LIST_HEAD(rt_uncached_list);
1325
1326 static void rt_add_uncached_list(struct rtable *rt)
1327 {
1328         spin_lock_bh(&rt_uncached_lock);
1329         list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1330         spin_unlock_bh(&rt_uncached_lock);
1331 }
1332
1333 static void ipv4_dst_destroy(struct dst_entry *dst)
1334 {
1335         struct rtable *rt = (struct rtable *) dst;
1336
1337         if (!list_empty(&rt->rt_uncached)) {
1338                 spin_lock_bh(&rt_uncached_lock);
1339                 list_del(&rt->rt_uncached);
1340                 spin_unlock_bh(&rt_uncached_lock);
1341         }
1342 }
1343
1344 void rt_flush_dev(struct net_device *dev)
1345 {
1346         if (!list_empty(&rt_uncached_list)) {
1347                 struct net *net = dev_net(dev);
1348                 struct rtable *rt;
1349
1350                 spin_lock_bh(&rt_uncached_lock);
1351                 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1352                         if (rt->dst.dev != dev)
1353                                 continue;
1354                         rt->dst.dev = net->loopback_dev;
1355                         dev_hold(rt->dst.dev);
1356                         dev_put(dev);
1357                 }
1358                 spin_unlock_bh(&rt_uncached_lock);
1359         }
1360 }
1361
1362 static bool rt_cache_valid(const struct rtable *rt)
1363 {
1364         return  rt &&
1365                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1366                 !rt_is_expired(rt);
1367 }
1368
1369 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1370                            const struct fib_result *res,
1371                            struct fib_nh_exception *fnhe,
1372                            struct fib_info *fi, u16 type, u32 itag)
1373 {
1374         bool cached = false;
1375
1376         if (fi) {
1377                 struct fib_nh *nh = &FIB_RES_NH(*res);
1378
1379                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1380                         rt->rt_gateway = nh->nh_gw;
1381                         rt->rt_uses_gateway = 1;
1382                 }
1383                 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1384 #ifdef CONFIG_IP_ROUTE_CLASSID
1385                 rt->dst.tclassid = nh->nh_tclassid;
1386 #endif
1387                 if (unlikely(fnhe))
1388                         cached = rt_bind_exception(rt, fnhe, daddr);
1389                 else if (!(rt->dst.flags & DST_NOCACHE))
1390                         cached = rt_cache_route(nh, rt);
1391                 if (unlikely(!cached)) {
1392                         /* Routes we intend to cache in nexthop exception or
1393                          * FIB nexthop have the DST_NOCACHE bit clear.
1394                          * However, if we are unsuccessful at storing this
1395                          * route into the cache we really need to set it.
1396                          */
1397                         rt->dst.flags |= DST_NOCACHE;
1398                         if (!rt->rt_gateway)
1399                                 rt->rt_gateway = daddr;
1400                         rt_add_uncached_list(rt);
1401                 }
1402         } else
1403                 rt_add_uncached_list(rt);
1404
1405 #ifdef CONFIG_IP_ROUTE_CLASSID
1406 #ifdef CONFIG_IP_MULTIPLE_TABLES
1407         set_class_tag(rt, res->tclassid);
1408 #endif
1409         set_class_tag(rt, itag);
1410 #endif
1411 }
1412
1413 static struct rtable *rt_dst_alloc(struct net_device *dev,
1414                                    bool nopolicy, bool noxfrm, bool will_cache)
1415 {
1416         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1417                          (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1418                          (nopolicy ? DST_NOPOLICY : 0) |
1419                          (noxfrm ? DST_NOXFRM : 0));
1420 }
1421
1422 /* called in rcu_read_lock() section */
1423 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1424                                 u8 tos, struct net_device *dev, int our)
1425 {
1426         struct rtable *rth;
1427         struct in_device *in_dev = __in_dev_get_rcu(dev);
1428         u32 itag = 0;
1429         int err;
1430
1431         /* Primary sanity checks. */
1432
1433         if (in_dev == NULL)
1434                 return -EINVAL;
1435
1436         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1437             skb->protocol != htons(ETH_P_IP))
1438                 goto e_inval;
1439
1440         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1441                 if (ipv4_is_loopback(saddr))
1442                         goto e_inval;
1443
1444         if (ipv4_is_zeronet(saddr)) {
1445                 if (!ipv4_is_local_multicast(daddr))
1446                         goto e_inval;
1447         } else {
1448                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1449                                           in_dev, &itag);
1450                 if (err < 0)
1451                         goto e_err;
1452         }
1453         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1454                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1455         if (!rth)
1456                 goto e_nobufs;
1457
1458 #ifdef CONFIG_IP_ROUTE_CLASSID
1459         rth->dst.tclassid = itag;
1460 #endif
1461         rth->dst.output = ip_rt_bug;
1462
1463         rth->rt_genid   = rt_genid_ipv4(dev_net(dev));
1464         rth->rt_flags   = RTCF_MULTICAST;
1465         rth->rt_type    = RTN_MULTICAST;
1466         rth->rt_is_input= 1;
1467         rth->rt_iif     = 0;
1468         rth->rt_pmtu    = 0;
1469         rth->rt_gateway = 0;
1470         rth->rt_uses_gateway = 0;
1471         INIT_LIST_HEAD(&rth->rt_uncached);
1472         if (our) {
1473                 rth->dst.input= ip_local_deliver;
1474                 rth->rt_flags |= RTCF_LOCAL;
1475         }
1476
1477 #ifdef CONFIG_IP_MROUTE
1478         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1479                 rth->dst.input = ip_mr_input;
1480 #endif
1481         RT_CACHE_STAT_INC(in_slow_mc);
1482
1483         skb_dst_set(skb, &rth->dst);
1484         return 0;
1485
1486 e_nobufs:
1487         return -ENOBUFS;
1488 e_inval:
1489         return -EINVAL;
1490 e_err:
1491         return err;
1492 }
1493
1494
1495 static void ip_handle_martian_source(struct net_device *dev,
1496                                      struct in_device *in_dev,
1497                                      struct sk_buff *skb,
1498                                      __be32 daddr,
1499                                      __be32 saddr)
1500 {
1501         RT_CACHE_STAT_INC(in_martian_src);
1502 #ifdef CONFIG_IP_ROUTE_VERBOSE
1503         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1504                 /*
1505                  *      RFC1812 recommendation, if source is martian,
1506                  *      the only hint is MAC header.
1507                  */
1508                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1509                         &daddr, &saddr, dev->name);
1510                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1511                         print_hex_dump(KERN_WARNING, "ll header: ",
1512                                        DUMP_PREFIX_OFFSET, 16, 1,
1513                                        skb_mac_header(skb),
1514                                        dev->hard_header_len, true);
1515                 }
1516         }
1517 #endif
1518 }
1519
1520 /* called in rcu_read_lock() section */
1521 static int __mkroute_input(struct sk_buff *skb,
1522                            const struct fib_result *res,
1523                            struct in_device *in_dev,
1524                            __be32 daddr, __be32 saddr, u32 tos)
1525 {
1526         struct fib_nh_exception *fnhe;
1527         struct rtable *rth;
1528         int err;
1529         struct in_device *out_dev;
1530         unsigned int flags = 0;
1531         bool do_cache;
1532         u32 itag;
1533
1534         /* get a working reference to the output device */
1535         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1536         if (out_dev == NULL) {
1537                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1538                 return -EINVAL;
1539         }
1540
1541         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1542                                   in_dev->dev, in_dev, &itag);
1543         if (err < 0) {
1544                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1545                                          saddr);
1546
1547                 goto cleanup;
1548         }
1549
1550         do_cache = res->fi && !itag;
1551         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1552             (IN_DEV_SHARED_MEDIA(out_dev) ||
1553              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
1554                 flags |= RTCF_DOREDIRECT;
1555                 do_cache = false;
1556         }
1557
1558         if (skb->protocol != htons(ETH_P_IP)) {
1559                 /* Not IP (i.e. ARP). Do not create route, if it is
1560                  * invalid for proxy arp. DNAT routes are always valid.
1561                  *
1562                  * Proxy arp feature have been extended to allow, ARP
1563                  * replies back to the same interface, to support
1564                  * Private VLAN switch technologies. See arp.c.
1565                  */
1566                 if (out_dev == in_dev &&
1567                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1568                         err = -EINVAL;
1569                         goto cleanup;
1570                 }
1571         }
1572
1573         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1574         if (do_cache) {
1575                 if (fnhe != NULL)
1576                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1577                 else
1578                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1579
1580                 if (rt_cache_valid(rth)) {
1581                         skb_dst_set_noref(skb, &rth->dst);
1582                         goto out;
1583                 }
1584         }
1585
1586         rth = rt_dst_alloc(out_dev->dev,
1587                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1588                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1589         if (!rth) {
1590                 err = -ENOBUFS;
1591                 goto cleanup;
1592         }
1593
1594         rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev));
1595         rth->rt_flags = flags;
1596         rth->rt_type = res->type;
1597         rth->rt_is_input = 1;
1598         rth->rt_iif     = 0;
1599         rth->rt_pmtu    = 0;
1600         rth->rt_gateway = 0;
1601         rth->rt_uses_gateway = 0;
1602         INIT_LIST_HEAD(&rth->rt_uncached);
1603
1604         rth->dst.input = ip_forward;
1605         rth->dst.output = ip_output;
1606
1607         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1608         skb_dst_set(skb, &rth->dst);
1609 out:
1610         err = 0;
1611  cleanup:
1612         return err;
1613 }
1614
1615 static int ip_mkroute_input(struct sk_buff *skb,
1616                             struct fib_result *res,
1617                             const struct flowi4 *fl4,
1618                             struct in_device *in_dev,
1619                             __be32 daddr, __be32 saddr, u32 tos)
1620 {
1621 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1622         if (res->fi && res->fi->fib_nhs > 1)
1623                 fib_select_multipath(res);
1624 #endif
1625
1626         /* create a routing cache entry */
1627         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1628 }
1629
1630 /*
1631  *      NOTE. We drop all the packets that has local source
1632  *      addresses, because every properly looped back packet
1633  *      must have correct destination already attached by output routine.
1634  *
1635  *      Such approach solves two big problems:
1636  *      1. Not simplex devices are handled properly.
1637  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1638  *      called with rcu_read_lock()
1639  */
1640
1641 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1642                                u8 tos, struct net_device *dev)
1643 {
1644         struct fib_result res;
1645         struct in_device *in_dev = __in_dev_get_rcu(dev);
1646         struct flowi4   fl4;
1647         unsigned int    flags = 0;
1648         u32             itag = 0;
1649         struct rtable   *rth;
1650         int             err = -EINVAL;
1651         struct net    *net = dev_net(dev);
1652         bool do_cache;
1653
1654         /* IP on this device is disabled. */
1655
1656         if (!in_dev)
1657                 goto out;
1658
1659         /* Check for the most weird martians, which can be not detected
1660            by fib_lookup.
1661          */
1662
1663         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1664                 goto martian_source;
1665
1666         res.fi = NULL;
1667         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1668                 goto brd_input;
1669
1670         /* Accept zero addresses only to limited broadcast;
1671          * I even do not know to fix it or not. Waiting for complains :-)
1672          */
1673         if (ipv4_is_zeronet(saddr))
1674                 goto martian_source;
1675
1676         if (ipv4_is_zeronet(daddr))
1677                 goto martian_destination;
1678
1679         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1680          * and call it once if daddr or/and saddr are loopback addresses
1681          */
1682         if (ipv4_is_loopback(daddr)) {
1683                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1684                         goto martian_destination;
1685         } else if (ipv4_is_loopback(saddr)) {
1686                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1687                         goto martian_source;
1688         }
1689
1690         /*
1691          *      Now we are ready to route packet.
1692          */
1693         fl4.flowi4_oif = 0;
1694         fl4.flowi4_iif = dev->ifindex;
1695         fl4.flowi4_mark = skb->mark;
1696         fl4.flowi4_tos = tos;
1697         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1698         fl4.daddr = daddr;
1699         fl4.saddr = saddr;
1700         err = fib_lookup(net, &fl4, &res);
1701         if (err != 0)
1702                 goto no_route;
1703
1704         RT_CACHE_STAT_INC(in_slow_tot);
1705
1706         if (res.type == RTN_BROADCAST)
1707                 goto brd_input;
1708
1709         if (res.type == RTN_LOCAL) {
1710                 err = fib_validate_source(skb, saddr, daddr, tos,
1711                                           LOOPBACK_IFINDEX,
1712                                           dev, in_dev, &itag);
1713                 if (err < 0)
1714                         goto martian_source_keep_err;
1715                 goto local_input;
1716         }
1717
1718         if (!IN_DEV_FORWARD(in_dev))
1719                 goto no_route;
1720         if (res.type != RTN_UNICAST)
1721                 goto martian_destination;
1722
1723         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1724 out:    return err;
1725
1726 brd_input:
1727         if (skb->protocol != htons(ETH_P_IP))
1728                 goto e_inval;
1729
1730         if (!ipv4_is_zeronet(saddr)) {
1731                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1732                                           in_dev, &itag);
1733                 if (err < 0)
1734                         goto martian_source_keep_err;
1735         }
1736         flags |= RTCF_BROADCAST;
1737         res.type = RTN_BROADCAST;
1738         RT_CACHE_STAT_INC(in_brd);
1739
1740 local_input:
1741         do_cache = false;
1742         if (res.fi) {
1743                 if (!itag) {
1744                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1745                         if (rt_cache_valid(rth)) {
1746                                 skb_dst_set_noref(skb, &rth->dst);
1747                                 err = 0;
1748                                 goto out;
1749                         }
1750                         do_cache = true;
1751                 }
1752         }
1753
1754         rth = rt_dst_alloc(net->loopback_dev,
1755                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1756         if (!rth)
1757                 goto e_nobufs;
1758
1759         rth->dst.input= ip_local_deliver;
1760         rth->dst.output= ip_rt_bug;
1761 #ifdef CONFIG_IP_ROUTE_CLASSID
1762         rth->dst.tclassid = itag;
1763 #endif
1764
1765         rth->rt_genid = rt_genid_ipv4(net);
1766         rth->rt_flags   = flags|RTCF_LOCAL;
1767         rth->rt_type    = res.type;
1768         rth->rt_is_input = 1;
1769         rth->rt_iif     = 0;
1770         rth->rt_pmtu    = 0;
1771         rth->rt_gateway = 0;
1772         rth->rt_uses_gateway = 0;
1773         INIT_LIST_HEAD(&rth->rt_uncached);
1774         if (res.type == RTN_UNREACHABLE) {
1775                 rth->dst.input= ip_error;
1776                 rth->dst.error= -err;
1777                 rth->rt_flags   &= ~RTCF_LOCAL;
1778         }
1779         if (do_cache) {
1780                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1781                         rth->dst.flags |= DST_NOCACHE;
1782                         rt_add_uncached_list(rth);
1783                 }
1784         }
1785         skb_dst_set(skb, &rth->dst);
1786         err = 0;
1787         goto out;
1788
1789 no_route:
1790         RT_CACHE_STAT_INC(in_no_route);
1791         res.type = RTN_UNREACHABLE;
1792         if (err == -ESRCH)
1793                 err = -ENETUNREACH;
1794         goto local_input;
1795
1796         /*
1797          *      Do not cache martian addresses: they should be logged (RFC1812)
1798          */
1799 martian_destination:
1800         RT_CACHE_STAT_INC(in_martian_dst);
1801 #ifdef CONFIG_IP_ROUTE_VERBOSE
1802         if (IN_DEV_LOG_MARTIANS(in_dev))
1803                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1804                                      &daddr, &saddr, dev->name);
1805 #endif
1806
1807 e_inval:
1808         err = -EINVAL;
1809         goto out;
1810
1811 e_nobufs:
1812         err = -ENOBUFS;
1813         goto out;
1814
1815 martian_source:
1816         err = -EINVAL;
1817 martian_source_keep_err:
1818         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1819         goto out;
1820 }
1821
1822 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1823                          u8 tos, struct net_device *dev)
1824 {
1825         int res;
1826
1827         rcu_read_lock();
1828
1829         /* Multicast recognition logic is moved from route cache to here.
1830            The problem was that too many Ethernet cards have broken/missing
1831            hardware multicast filters :-( As result the host on multicasting
1832            network acquires a lot of useless route cache entries, sort of
1833            SDR messages from all the world. Now we try to get rid of them.
1834            Really, provided software IP multicast filter is organized
1835            reasonably (at least, hashed), it does not result in a slowdown
1836            comparing with route cache reject entries.
1837            Note, that multicast routers are not affected, because
1838            route cache entry is created eventually.
1839          */
1840         if (ipv4_is_multicast(daddr)) {
1841                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1842
1843                 if (in_dev) {
1844                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1845                                                   ip_hdr(skb)->protocol);
1846                         if (our
1847 #ifdef CONFIG_IP_MROUTE
1848                                 ||
1849                             (!ipv4_is_local_multicast(daddr) &&
1850                              IN_DEV_MFORWARD(in_dev))
1851 #endif
1852                            ) {
1853                                 int res = ip_route_input_mc(skb, daddr, saddr,
1854                                                             tos, dev, our);
1855                                 rcu_read_unlock();
1856                                 return res;
1857                         }
1858                 }
1859                 rcu_read_unlock();
1860                 return -EINVAL;
1861         }
1862         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1863         rcu_read_unlock();
1864         return res;
1865 }
1866 EXPORT_SYMBOL(ip_route_input_noref);
1867
1868 /* called with rcu_read_lock() */
/*
 * Build the output route for an already resolved flow, or hand back a
 * still-valid cached one.
 *
 * res:      FIB lookup result; res->fi, res->type and res->prefixlen are
 *           read here, the nexthop is taken via FIB_RES_NH(), and res is
 *           also passed on to rt_set_nexthop().
 * fl4:      flow being routed (saddr, daddr, flowi4_proto and flowi4_flags
 *           are consulted).
 * orig_oif: output interface index stored into rt_iif of a new entry.
 * dev_out:  device the route sends through.
 * flags:    initial RTCF_* flags; further flags are OR-ed in below.
 *
 * Returns a cached rtable (after dst_hold()), a newly allocated rtable,
 * or ERR_PTR(-EINVAL) / ERR_PTR(-ENOBUFS).
 */
1869 static struct rtable *__mkroute_output(const struct fib_result *res,
1870                                        const struct flowi4 *fl4, int orig_oif,
1871                                        struct net_device *dev_out,
1872                                        unsigned int flags)
1873 {
1874         struct fib_info *fi = res->fi;
1875         struct fib_nh_exception *fnhe;
1876         struct in_device *in_dev;
1877         u16 type = res->type;
1878         struct rtable *rth;
1879         bool do_cache;
1880
             /* The output device must have IPv4 configuration attached. */
1881         in_dev = __in_dev_get_rcu(dev_out);
1882         if (!in_dev)
1883                 return ERR_PTR(-EINVAL);
1884
             /* Unless route_localnet is enabled on the device, a loopback
              * source address is only accepted on a loopback device.
              */
1885         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1886                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1887                         return ERR_PTR(-EINVAL);
1888
             /* The destination address class overrides the FIB result type;
              * a zeronet destination is rejected outright.
              */
1889         if (ipv4_is_lbcast(fl4->daddr))
1890                 type = RTN_BROADCAST;
1891         else if (ipv4_is_multicast(fl4->daddr))
1892                 type = RTN_MULTICAST;
1893         else if (ipv4_is_zeronet(fl4->daddr))
1894                 return ERR_PTR(-EINVAL);
1895
1896         if (dev_out->flags & IFF_LOOPBACK)
1897                 flags |= RTCF_LOCAL;
1898
1899         do_cache = true;
1900         if (type == RTN_BROADCAST) {
                     /* Broadcast routes never use the FIB info (fi), which
                      * also disables caching further down.
                      */
1901                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1902                 fi = NULL;
1903         } else if (type == RTN_MULTICAST) {
                     /* RTCF_LOCAL is kept only when ip_check_mc_rcu() accepts
                      * the group on this device; such routes are not cached.
                      */
1904                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1905                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1906                                      fl4->flowi4_proto))
1907                         flags &= ~RTCF_LOCAL;
1908                 else
1909                         do_cache = false;
1910                 /* If multicast route do not exist use
1911                  * default one, but do not gateway in this case.
1912                  * Yes, it is hack.
1913                  */
1914                 if (fi && res->prefixlen < 4)
1915                         fi = NULL;
1916         }
1917
             /* Caching needs a FIB info to hang the cached route on. */
1918         fnhe = NULL;
1919         do_cache &= fi != NULL;
1920         if (do_cache) {
1921                 struct rtable __rcu **prth;
1922                 struct fib_nh *nh = &FIB_RES_NH(*res);
1923
                     /* Prefer the slot of a nexthop exception for this
                      * destination; otherwise use the per-CPU output slot
                      * of the nexthop.
                      */
1924                 fnhe = find_exception(nh, fl4->daddr);
1925                 if (fnhe)
1926                         prth = &fnhe->fnhe_rth_output;
1927                 else {
                             /* With FLOWI_FLAG_KNOWN_NH set, the shared slot
                              * is used only when the nexthop has a gateway
                              * and link scope; otherwise build an uncached
                              * route.
                              */
1928                         if (unlikely(fl4->flowi4_flags &
1929                                      FLOWI_FLAG_KNOWN_NH &&
1930                                      !(nh->nh_gw &&
1931                                        nh->nh_scope == RT_SCOPE_LINK))) {
1932                                 do_cache = false;
1933                                 goto add;
1934                         }
1935                         prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1936                 }
                     /* Cache hit: take a reference and return it. */
1937                 rth = rcu_dereference(*prth);
1938                 if (rt_cache_valid(rth)) {
1939                         dst_hold(&rth->dst);
1940                         return rth;
1941                 }
1942         }
1943
1944 add:
             /* Cache miss or caching disabled: allocate a fresh entry. */
1945         rth = rt_dst_alloc(dev_out,
1946                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1947                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1948                            do_cache);
1949         if (!rth)
1950                 return ERR_PTR(-ENOBUFS);
1951
1952         rth->dst.output = ip_output;
1953
1954         rth->rt_genid = rt_genid_ipv4(dev_net(dev_out));
1955         rth->rt_flags   = flags;
1956         rth->rt_type    = type;
1957         rth->rt_is_input = 0;
1958         rth->rt_iif     = orig_oif ? : 0;
1959         rth->rt_pmtu    = 0;
1960         rth->rt_gateway = 0;
1961         rth->rt_uses_gateway = 0;
1962         INIT_LIST_HEAD(&rth->rt_uncached);
1963
1964         RT_CACHE_STAT_INC(out_slow_tot);
1965
             /* Pick input/output handlers according to the final flags. */
1966         if (flags & RTCF_LOCAL)
1967                 rth->dst.input = ip_local_deliver;
1968         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1969                 if (flags & RTCF_LOCAL &&
1970                     !(dev_out->flags & IFF_LOOPBACK)) {
1971                         rth->dst.output = ip_mc_output;
1972                         RT_CACHE_STAT_INC(out_slow_mc);
1973                 }
1974 #ifdef CONFIG_IP_MROUTE
1975                 if (type == RTN_MULTICAST) {
1976                         if (IN_DEV_MFORWARD(in_dev) &&
1977                             !ipv4_is_local_multicast(fl4->daddr)) {
1978                                 rth->dst.input = ip_mr_input;
1979                                 rth->dst.output = ip_mc_output;
1980                         }
1981                 }
1982 #endif
1983         }
1984
             /* NOTE(review): rt_set_nexthop() is defined outside this view;
              * presumably it fills in the gateway and stores the route in
              * the cache slot chosen above - confirm.
              */
1985         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1986
1987         return rth;
1988 }
1989
1990 /*
1991  * Major route resolver routine.
1992  */
1993
/*
 * Resolve an output route for the flow described by fl4.
 *
 * net: network namespace used for device and FIB lookups.
 * fl4: flow key; it is updated in place (flowi4_iif, flowi4_tos,
 *      flowi4_scope, and, depending on the path taken, flowi4_oif, saddr
 *      and daddr).
 *
 * The lookup runs under rcu_read_lock().  Returns the rtable produced by
 * __mkroute_output(), or ERR_PTR(-EINVAL), ERR_PTR(-ENODEV) or
 * ERR_PTR(-ENETUNREACH) when an earlier check fails.
 */
1994 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1995 {
1996         struct net_device *dev_out = NULL;
1997         __u8 tos = RT_FL_TOS(fl4);
1998         unsigned int flags = 0;
1999         struct fib_result res;
2000         struct rtable *rth;
2001         int orig_oif;
2002
2003         res.tclassid    = 0;
2004         res.fi          = NULL;
2005         res.table       = NULL;
2006
             /* Remember the caller's oif; flowi4_oif may be rewritten below. */
2007         orig_oif = fl4->flowi4_oif;
2008
             /* Scope is link-only when RTO_ONLINK is set in the TOS value. */
2009         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2010         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2011         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2012                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2013
2014         rcu_read_lock();
2015         if (fl4->saddr) {
                     /* A multicast, limited-broadcast or zeronet source
                      * address is rejected with -EINVAL.
                      */
2016                 rth = ERR_PTR(-EINVAL);
2017                 if (ipv4_is_multicast(fl4->saddr) ||
2018                     ipv4_is_lbcast(fl4->saddr) ||
2019                     ipv4_is_zeronet(fl4->saddr))
2020                         goto out;
2021
2022                 /* I removed check for oif == dev_out->oif here.
2023                    It was wrong for two reasons:
2024                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2025                       is assigned to multiple interfaces.
2026                    2. Moreover, we are allowed to send packets with saddr
2027                       of another iface. --ANK
2028                  */
2029
2030                 if (fl4->flowi4_oif == 0 &&
2031                     (ipv4_is_multicast(fl4->daddr) ||
2032                      ipv4_is_lbcast(fl4->daddr))) {
2033                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2034                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2035                         if (dev_out == NULL)
2036                                 goto out;
2037
2038                         /* Special hack: user can direct multicasts
2039                            and limited broadcast via necessary interface
2040                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2041                            This hack is not just for fun, it allows
2042                            vic,vat and friends to work.
2043                            They bind socket to loopback, set ttl to zero
2044                            and expect that it will work.
2045                            From the viewpoint of routing cache they are broken,
2046                            because we are not allowed to build multicast path
2047                            with loopback source addr (look, routing cache
2048                            cannot know, that ttl is zero, so that packet
2049                            will not leave this host and route is valid).
2050                            Luckily, this hack is good workaround.
2051                          */
2052
2053                         fl4->flowi4_oif = dev_out->ifindex;
2054                         goto make_route;
2055                 }
2056
                     /* Without FLOWI_FLAG_ANYSRC the source address must be
                      * found by __ip_dev_find(), else -EINVAL is returned.
                      */
2057                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2058                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2059                         if (!__ip_dev_find(net, fl4->saddr, false))
2060                                 goto out;
2061                 }
2062         }
2063
2064
             /* An explicit output interface was requested (or set above). */
2065         if (fl4->flowi4_oif) {
2066                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2067                 rth = ERR_PTR(-ENODEV);
2068                 if (dev_out == NULL)
2069                         goto out;
2070
2071                 /* RACE: Check return value of inet_select_addr instead. */
2072                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2073                         rth = ERR_PTR(-ENETUNREACH);
2074                         goto out;
2075                 }
                     /* Local multicast and limited broadcast skip the FIB
                      * lookup; a missing source is chosen with link scope.
                      */
2076                 if (ipv4_is_local_multicast(fl4->daddr) ||
2077                     ipv4_is_lbcast(fl4->daddr)) {
2078                         if (!fl4->saddr)
2079                                 fl4->saddr = inet_select_addr(dev_out, 0,
2080                                                               RT_SCOPE_LINK);
2081                         goto make_route;
2082                 }
2083                 if (!fl4->saddr) {
2084                         if (ipv4_is_multicast(fl4->daddr))
2085                                 fl4->saddr = inet_select_addr(dev_out, 0,
2086                                                               fl4->flowi4_scope);
2087                         else if (!fl4->daddr)
2088                                 fl4->saddr = inet_select_addr(dev_out, 0,
2089                                                               RT_SCOPE_HOST);
2090                 }
2091         }
2092
             /* No destination: route to ourselves via the loopback device,
              * using saddr as daddr, or INADDR_LOOPBACK for both if neither
              * is set.
              */
2093         if (!fl4->daddr) {
2094                 fl4->daddr = fl4->saddr;
2095                 if (!fl4->daddr)
2096                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2097                 dev_out = net->loopback_dev;
2098                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2099                 res.type = RTN_LOCAL;
2100                 flags |= RTCF_LOCAL;
2101                 goto make_route;
2102         }
2103
             /* A non-zero return from fib_lookup() means no route was found. */
2104         if (fib_lookup(net, fl4, &res)) {
2105                 res.fi = NULL;
2106                 res.table = NULL;
2107                 if (fl4->flowi4_oif) {
2108                         /* Apparently, routing tables are wrong. Assume,
2109                            that the destination is on link.
2110
2111                            WHY? DW.
2112                            Because we are allowed to send to iface
2113                            even if it has NO routes and NO assigned
2114                            addresses. When oif is specified, routing
2115                            tables are looked up with only one purpose:
2116                            to catch if destination is gatewayed, rather than
2117                            direct. Moreover, if MSG_DONTROUTE is set,
2118                            we send packet, ignoring both routing tables
2119                            and ifaddr state. --ANK
2120
2121
2122                            We could make it even if oif is unknown,
2123                            likely IPv6, but we do not.
2124                          */
2125
2126                         if (fl4->saddr == 0)
2127                                 fl4->saddr = inet_select_addr(dev_out, 0,
2128                                                               RT_SCOPE_LINK);
2129                         res.type = RTN_UNICAST;
2130                         goto make_route;
2131                 }
2132                 rth = ERR_PTR(-ENETUNREACH);
2133                 goto out;
2134         }
2135
             /* Destination is a local address: send through loopback, taking
              * the source from the route's preferred source or from daddr.
              */
2136         if (res.type == RTN_LOCAL) {
2137                 if (!fl4->saddr) {
2138                         if (res.fi->fib_prefsrc)
2139                                 fl4->saddr = res.fi->fib_prefsrc;
2140                         else
2141                                 fl4->saddr = fl4->daddr;
2142                 }
2143                 dev_out = net->loopback_dev;
2144                 fl4->flowi4_oif = dev_out->ifindex;
2145                 flags |= RTCF_LOCAL;
2146                 goto make_route;
2147         }
2148
             /* With no oif pinned, choose among multiple nexthops or among
              * several default routes.  Note that the `else` inside the
              * #ifdef binds to the `if` that follows the #endif.
              */
2149 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2150         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2151                 fib_select_multipath(&res);
2152         else
2153 #endif
2154         if (!res.prefixlen &&
2155             res.table->tb_num_default > 1 &&
2156             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2157                 fib_select_default(&res);
2158
2159         if (!fl4->saddr)
2160                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2161
2162         dev_out = FIB_RES_DEV(res);
2163         fl4->flowi4_oif = dev_out->ifindex;
2164
2165
2166 make_route:
             /* Every successful path ends here with dev_out and fl4 set. */
2167         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2168
2169 out:
2170         rcu_read_unlock();
2171         return rth;
2172 }
2173 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2174
/*
 * .check hook of ipv4_dst_blackhole_ops: ignores both arguments and
 * always returns NULL.
 */
2175 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2176 {
2177         return NULL;
2178 }
2179
2180 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2181 {
2182         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2183
2184         return mtu ? : dst->dev->mtu;
2185 }
2186
/*
 * .update_pmtu hook of ipv4_dst_blackhole_ops: intentionally empty, so a
 * path-MTU update on a blackhole route does nothing.
 */
2187 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2188                                           struct sk_buff *skb, u32 mtu)
2189 {
2190 }
2191
/*
 * .redirect hook of ipv4_dst_blackhole_ops: intentionally empty, so a
 * redirect on a blackhole route does nothing.
 */
2192 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2193                                        struct sk_buff *skb)
2194 {
2195 }
2196
/*
 * .cow_metrics hook of ipv4_dst_blackhole_ops: ignores its arguments and
 * always returns NULL.
 */
2197 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2198                                           unsigned long old)
2199 {
2200         return NULL;
2201 }
2202
2202
/*
 * dst_ops used for the entries allocated by ipv4_blackhole_route().
 * The check, update_pmtu, redirect and cow_metrics hooks point at the
 * blackhole stubs defined above (they return NULL or do nothing); mtu uses
 * ipv4_blackhole_mtu().  default_advmss and neigh_lookup refer to helpers
 * defined elsewhere in this file.
 */
2203 static struct dst_ops ipv4_dst_blackhole_ops = {
2204         .family                 =       AF_INET,
2205         .protocol               =       cpu_to_be16(ETH_P_IP),
2206         .check                  =       ipv4_blackhole_dst_check,
2207         .mtu                    =       ipv4_blackhole_mtu,
2208         .default_advmss         =       ipv4_default_advmss,
2209         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2210         .redirect               =       ipv4_rt_blackhole_redirect,
2211         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2212         .neigh_lookup           =       ipv4_neigh_lookup,
2213 };
2214
2215 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2216 {
2217         struct rtable *ort = (struct rtable *) dst_orig;
2218         struct rtable *rt;
2219
2220         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2221         if (rt) {
2222                 struct dst_entry *new = &rt->dst;
2223
2224                 new->__use = 1;
2225                 new->input = dst_discard;
2226                 new->output = dst_discard;
2227
2228                 new->dev = ort->dst.dev;
2229                 if (new->dev)
2230                         dev_hold(new->dev);
2231
2232                 rt->rt_is_input = ort->rt_is_input;
2233                 rt->rt_iif = ort->rt_iif;
2234                 rt->rt_pmtu = ort->rt_pmtu;
2235
2236                 rt->rt_genid = rt_genid_ipv4(net);
2237                 rt->rt_flags = ort->rt_flags;
2238                 rt->rt_type = ort->rt_type;
2239                 rt->rt_gateway = ort->rt_gateway;
2240                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2241
2242                 INIT_LIST_HEAD(&rt->rt_uncached);
2243
2244                 dst_free(new);
2245         }
2246
2247         dst_release(dst_orig);
2248
2249         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2250 }
2251
2252 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2253                                     struct sock *sk)
2254 {
2255         struct rtable *rt = __ip_route_output_key(net, flp4);
2256
2257         if (IS_ERR(rt))
2258                 return rt;
2259
2260         if (flp4->flowi4_proto)
2261                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2262                                                    flowi4_to_flowi(flp4),
2263                                                    sk, 0);
2264
2265         return rt;
2266 }
2267 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2268
/*
 * Fill a netlink route message describing the route attached to skb.
 *
 * net:    namespace, used for the multicast-forwarding check.
 * dst:    destination address reported in RTA_DST.
 * src:    source address reported in RTA_SRC when non-zero.
 * fl4:    flow that produced the route (tos, saddr, daddr and mark are read).
 * skb:    message buffer; the route described is skb_rtable(skb).
 * portid, seq, event, flags: passed to nlmsg_put() for the message header.
 * nowait: passed to ipmr_get_route() and selects how its failure is handled.
 *
 * Returns the value of nlmsg_end() on success, -EMSGSIZE when the header or
 * an attribute could not be added (the partial message is cancelled first),
 * or 0 when ipmr_get_route() returned 0 and nowait is not set.
 */
2269 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2270                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2271                         u32 seq, int event, int nowait, unsigned int flags)
2272 {
2273         struct rtable *rt = skb_rtable(skb);
2274         struct rtmsg *r;
2275         struct nlmsghdr *nlh;
2276         unsigned long expires = 0;
2277         u32 error;
2278         u32 metrics[RTAX_MAX];
2279
2280         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2281         if (nlh == NULL)
2282                 return -EMSGSIZE;
2283
             /* Fixed rtmsg header: always a /32 destination in the main table. */
2284         r = nlmsg_data(nlh);
2285         r->rtm_family    = AF_INET;
2286         r->rtm_dst_len  = 32;
2287         r->rtm_src_len  = 0;
2288         r->rtm_tos      = fl4->flowi4_tos;
2289         r->rtm_table    = RT_TABLE_MAIN;
2290         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2291                 goto nla_put_failure;
2292         r->rtm_type     = rt->rt_type;
2293         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2294         r->rtm_protocol = RTPROT_UNSPEC;
             /* Only the upper 16 bits of rt_flags are copied into rtm_flags. */
2295         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2296         if (rt->rt_flags & RTCF_NOTIFY)
2297                 r->rtm_flags |= RTM_F_NOTIFY;
2298
2299         if (nla_put_be32(skb, RTA_DST, dst))
2300                 goto nla_put_failure;
2301         if (src) {
2302                 r->rtm_src_len = 32;
2303                 if (nla_put_be32(skb, RTA_SRC, src))
2304                         goto nla_put_failure;
2305         }
2306         if (rt->dst.dev &&
2307             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2308                 goto nla_put_failure;
2309 #ifdef CONFIG_IP_ROUTE_CLASSID
2310         if (rt->dst.tclassid &&
2311             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2312                 goto nla_put_failure;
2313 #endif
             /* For output routes, report the chosen source address when it
              * differs from the one that was requested.
              */
2314         if (!rt_is_input_route(rt) &&
2315             fl4->saddr != src) {
2316                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2317                         goto nla_put_failure;
2318         }
2319         if (rt->rt_uses_gateway &&
2320             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2321                 goto nla_put_failure;
2322
             /* Turn the absolute expiry time into time remaining from now
              * (0 if already past or never set).
              */
2323         expires = rt->dst.expires;
2324         if (expires) {
2325                 unsigned long now = jiffies;
2326
2327                 if (time_before(now, expires))
2328                         expires -= now;
2329                 else
2330                         expires = 0;
2331         }
2332
             /* Report a copy of the metrics; a learned path MTU that has not
              * expired overrides the RTAX_MTU slot of that copy.
              */
2333         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2334         if (rt->rt_pmtu && expires)
2335                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2336         if (rtnetlink_put_metrics(skb, metrics) < 0)
2337                 goto nla_put_failure;
2338
2339         if (fl4->flowi4_mark &&
2340             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2341                 goto nla_put_failure;
2342
2343         error = rt->dst.error;
2344
             /* Input routes report either multicast-routing details (with
              * CONFIG_IP_MROUTE, for non-local multicast destinations while
              * MC_FORWARDING is enabled) or the input interface.
              */
2345         if (rt_is_input_route(rt)) {
2346 #ifdef CONFIG_IP_MROUTE
2347                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2348                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2349                         int err = ipmr_get_route(net, skb,
2350                                                  fl4->saddr, fl4->daddr,
2351                                                  r, nowait);
2352                         if (err <= 0) {
2353                                 if (!nowait) {
2354                                         if (err == 0)
2355                                                 return 0;
2356                                         goto nla_put_failure;
2357                                 } else {
2358                                         if (err == -EMSGSIZE)
2359                                                 goto nla_put_failure;
2360                                         error = err;
2361                                 }
2362                         }
2363                 } else
2364 #endif
2365                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2366                                 goto nla_put_failure;
2367         }
2368
2369         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2370                 goto nla_put_failure;
2371
2372         return nlmsg_end(skb, nlh);
2373
2374 nla_put_failure:
             /* Undo the partially built message before reporting failure. */
2375         nlmsg_cancel(skb, nlh);
2376         return -EMSGSIZE;
2377 }
2378
/*
 * Handle a netlink route query: resolve the route described by the request
 * attributes and send the answer back to the requester.
 *
 * in_skb: request buffer; its socket supplies the namespace and its netlink
 *         control block supplies the reply port id.
 * nlh:    request header; attributes are parsed against rtm_ipv4_policy.
 *
 * With RTA_IIF present the lookup goes through ip_route_input() as if a
 * packet had arrived on that device; otherwise ip_route_output_key() is used.
 * Returns the result of rtnl_unicast() on success, or the first error (or a
 * non-positive rt_fill_info() result) encountered before that.
 */
2379 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2380 {
2381         struct net *net = sock_net(in_skb->sk);
2382         struct rtmsg *rtm;
2383         struct nlattr *tb[RTA_MAX+1];
2384         struct rtable *rt = NULL;
2385         struct flowi4 fl4;
2386         __be32 dst = 0;
2387         __be32 src = 0;
2388         u32 iif;
2389         int err;
2390         int mark;
2391         struct sk_buff *skb;
2392
             /* No skb exists yet, so a parse failure returns directly. */
2393         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2394         if (err < 0)
2395                 goto errout;
2396
2397         rtm = nlmsg_data(nlh);
2398
             /* This skb is both the dummy packet routed below and the buffer
              * the reply is built in.
              */
2399         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2400         if (skb == NULL) {
2401                 err = -ENOBUFS;
2402                 goto errout;
2403         }
2404
2405         /* Reserve room for dummy headers, this skb can pass
2406            through good chunk of routing engine.
2407          */
2408         skb_reset_mac_header(skb);
2409         skb_reset_network_header(skb);
2410
2411         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2412         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2413         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2414
             /* Absent attributes default to 0. */
2415         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2416         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2417         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2418         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2419
2420         memset(&fl4, 0, sizeof(fl4));
2421         fl4.daddr = dst;
2422         fl4.saddr = src;
2423         fl4.flowi4_tos = rtm->rtm_tos;
2424         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2425         fl4.flowi4_mark = mark;
2426
2427         if (iif) {
                     /* Input lookup: pretend the packet arrived on iif. */
2428                 struct net_device *dev;
2429
2430                 dev = __dev_get_by_index(net, iif);
2431                 if (dev == NULL) {
2432                         err = -ENODEV;
2433                         goto errout_free;
2434                 }
2435
2436                 skb->protocol   = htons(ETH_P_IP);
2437                 skb->dev        = dev;
2438                 skb->mark       = mark;
2439                 local_bh_disable();
2440                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2441                 local_bh_enable();
2442
                     /* A route that carries an error is reported as failure. */
2443                 rt = skb_rtable(skb);
2444                 if (err == 0 && rt->dst.error)
2445                         err = -rt->dst.error;
2446         } else {
                     /* Output lookup for a locally originated flow. */
2447                 rt = ip_route_output_key(net, &fl4);
2448
2449                 err = 0;
2450                 if (IS_ERR(rt))
2451                         err = PTR_ERR(rt);
2452         }
2453
2454         if (err)
2455                 goto errout_free;
2456
2457         skb_dst_set(skb, &rt->dst);
2458         if (rtm->rtm_flags & RTM_F_NOTIFY)
2459                 rt->rt_flags |= RTCF_NOTIFY;
2460
             /* rt_fill_info() returns a positive value only on success. */
2461         err = rt_fill_info(net, dst, src, &fl4, skb,
2462                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2463                            RTM_NEWROUTE, 0, 0);
2464         if (err <= 0)
2465                 goto errout_free;
2466
2467         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2468 errout:
2469         return err;
2470
2471 errout_free:
             /* Failure after the skb was allocated: free it, then return err. */
2472         kfree_skb(skb);
2473         goto errout;
2474 }
2475
/*
 * Route dump callback stub: adds nothing to skb and ignores cb, simply
 * returning the current length of skb.
 */
2476 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2477 {
2478         return skb->len;
2479 }
2480
/*
 * Multicast event hook for in_dev: calls rt_cache_flush() for the network
 * namespace that owns the device behind in_dev.
 */
2481 void ip_rt_multicast_event(struct in_device *in_dev)
2482 {
2483         rt_cache_flush(dev_net(in_dev->dev));
2484 }
2485
2486 #ifdef CONFIG_SYSCTL
/*
 * Backing variables for route sysctl entries.  In ipv4_route_table below,
 * ip_rt_gc_timeout backs "gc_timeout", ip_rt_gc_interval backs
 * "gc_interval", and ip_rt_gc_min_interval backs both "gc_min_interval"
 * and "gc_min_interval_ms".  The time values are initialised in units of
 * HZ (jiffies).
 */
2487 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2488 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2489 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2490 static int ip_rt_gc_elasticity __read_mostly    = 8;
2491
2492 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2493                                         void __user *buffer,
2494                                         size_t *lenp, loff_t *ppos)
2495 {
2496         struct net *net = (struct net *)__ctl->extra1;
2497
2498         if (write) {
2499                 rt_cache_flush(net);
2500                 fnhe_genid_bump(net);
2501                 return 0;
2502         }
2503
2504         return -EINVAL;
2505 }
2506
2507 static struct ctl_table ipv4_route_table[] = {
2508         {
2509                 .procname       = "gc_thresh",
2510                 .data           = &ipv4_dst_ops.gc_thresh,
2511                 .maxlen         = sizeof(int),
2512                 .mode           = 0644,
2513                 .proc_handler   = proc_dointvec,
2514         },
2515         {
2516                 .procname       = "max_size",
2517                 .data           = &ip_rt_max_size,
2518                 .maxlen         = sizeof(int),
2519                 .mode           = 0644,
2520                 .proc_handler   = proc_dointvec,
2521         },
2522         {
2523                 /*  Deprecated. Use gc_min_interval_ms */
2524
2525                 .procname       = "gc_min_interval",
2526                 .data           = &ip_rt_gc_min_interval,
2527                 .maxlen         = sizeof(int),
2528                 .mode           = 0644,
2529                 .proc_handler   = proc_dointvec_jiffies,
2530         },
2531         {
2532                 .procname       = "gc_min_interval_ms",
2533                 .data           = &ip_rt_gc_min_interval,
2534                 .maxlen         = sizeof(int),
2535                 .mode           = 0644,
2536                 .proc_handler   = proc_dointvec_ms_jiffies,
2537         },
2538         {
2539                 .procname       = "gc_timeout",
2540                 .data           = &ip_rt_gc_timeout,
2541                 .maxlen         = sizeof(int),
2542                 .mode           = 0644,
2543                 .proc_handler   = proc_dointvec_jiffies,
2544         },
2545         {
2546                 .procname       = "gc_interval",
2547                 .data           = &ip_rt_gc_interval,
2548                 .maxlen         = sizeof(int),
2549                 .mode           = 0644,
2550                 .proc_handler   = proc_dointvec_jiffies,
2551         },
2552         {
2553                 .procname       = "redirect_load",
2554                 .data           = &ip_rt_redirect_load,
2555                 .maxlen         = sizeof(int),
2556                 .mode           = 0644,
2557                 .proc_handler   = proc_dointvec,
2558         },
2559         {
2560                 .procname       = "redirect_number",
2561                 .data           = &ip_rt_redirect_number,
2562                 .maxlen         = sizeof(int),
2563                 .mode           = 0644,
2564                 .proc_handler   = proc_dointvec,
2565         },
2566         {
2567                 .procname       = "redirect_silence",
2568                 .data           = &ip_rt_redirect_silence,
2569                 .maxlen         = sizeof(int),
2570                 .mode           = 0644,
2571                 .proc_handler   = proc_dointvec,
2572         },
2573         {
2574                 .procname       = "error_cost",
2575                 .data           = &ip_rt_error_cost,
2576                 .maxlen         = sizeof(int),
2577                 .mode           = 0644,
2578                 .proc_handler   = proc_dointvec,
2579         },
2580         {
2581                 .procname       = "error_burst",
2582                 .data           = &ip_rt_error_burst,
2583                 .maxlen         = sizeof(int),
2584                 .mode           = 0644,
2585                 .proc_handler   = proc_dointvec,
2586         },
2587         {
2588                 .procname       = "gc_elasticity",
2589                 .data           = &ip_rt_gc_elasticity,
2590                 .maxlen         = sizeof(int),
2591                 .mode           = 0644,
2592                 .proc_handler   = proc_dointvec,
2593         },
2594         {
2595                 .procname       = "mtu_expires",
2596                 .data           = &ip_rt_mtu_expires,
2597                 .maxlen         = sizeof(int),
2598                 .mode           = 0644,
2599                 .proc_handler   = proc_dointvec_jiffies,
2600         },
2601         {
2602                 .procname       = "min_pmtu",
2603                 .data           = &ip_rt_min_pmtu,
2604                 .maxlen         = sizeof(int),
2605                 .mode           = 0644,
2606                 .proc_handler   = proc_dointvec,
2607         },
2608         {
2609                 .procname       = "min_adv_mss",
2610                 .data           = &ip_rt_min_advmss,
2611                 .maxlen         = sizeof(int),
2612                 .mode           = 0644,
2613                 .proc_handler   = proc_dointvec,
2614         },
2615         { }
2616 };
2617
2618 static struct ctl_table ipv4_route_flush_table[] = {
2619         {
2620                 .procname       = "flush",
2621                 .maxlen         = sizeof(int),
2622                 .mode           = 0200,
2623                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2624         },
2625         { },
2626 };
2627
2628 static __net_init int sysctl_route_net_init(struct net *net)
2629 {
2630         struct ctl_table *tbl;
2631
2632         tbl = ipv4_route_flush_table;
2633         if (!net_eq(net, &init_net)) {
2634                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2635                 if (tbl == NULL)
2636                         goto err_dup;
2637
2638                 /* Don't export sysctls to unprivileged users */
2639                 if (net->user_ns != &init_user_ns)
2640                         tbl[0].procname = NULL;
2641         }
2642         tbl[0].extra1 = net;
2643
2644         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2645         if (net->ipv4.route_hdr == NULL)
2646                 goto err_reg;
2647         return 0;
2648
2649 err_reg:
2650         if (tbl != ipv4_route_flush_table)
2651                 kfree(tbl);
2652 err_dup:
2653         return -ENOMEM;
2654 }
2655
2656 static __net_exit void sysctl_route_net_exit(struct net *net)
2657 {
2658         struct ctl_table *tbl;
2659
2660         tbl = net->ipv4.route_hdr->ctl_table_arg;
2661         unregister_net_sysctl_table(net->ipv4.route_hdr);
2662         BUG_ON(tbl == ipv4_route_flush_table);
2663         kfree(tbl);
2664 }
2665
2666 static __net_initdata struct pernet_operations sysctl_route_ops = {
2667         .init = sysctl_route_net_init,
2668         .exit = sysctl_route_net_exit,
2669 };
2670 #endif
2671
2672 static __net_init int rt_genid_init(struct net *net)
2673 {
2674         atomic_set(&net->ipv4.rt_genid, 0);
2675         atomic_set(&net->fnhe_genid, 0);
2676         get_random_bytes(&net->ipv4.dev_addr_genid,
2677                          sizeof(net->ipv4.dev_addr_genid));
2678         return 0;
2679 }
2680
2681 static __net_initdata struct pernet_operations rt_genid_ops = {
2682         .init = rt_genid_init,
2683 };
2684
2685 static int __net_init ipv4_inetpeer_init(struct net *net)
2686 {
2687         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2688
2689         if (!bp)
2690                 return -ENOMEM;
2691         inet_peer_base_init(bp);
2692         net->ipv4.peers = bp;
2693         return 0;
2694 }
2695
2696 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2697 {
2698         struct inet_peer_base *bp = net->ipv4.peers;
2699
2700         net->ipv4.peers = NULL;
2701         inetpeer_invalidate_tree(bp);
2702         kfree(bp);
2703 }
2704
2705 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2706         .init   =       ipv4_inetpeer_init,
2707         .exit   =       ipv4_inetpeer_exit,
2708 };
2709
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU route accounting area; allocated by ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2713
2714 int __init ip_rt_init(void)
2715 {
2716         int rc = 0;
2717
2718 #ifdef CONFIG_IP_ROUTE_CLASSID
2719         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2720         if (!ip_rt_acct)
2721                 panic("IP: failed to allocate ip_rt_acct\n");
2722 #endif
2723
2724         ipv4_dst_ops.kmem_cachep =
2725                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2726                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2727
2728         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2729
2730         if (dst_entries_init(&ipv4_dst_ops) < 0)
2731                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2732
2733         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2734                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2735
2736         ipv4_dst_ops.gc_thresh = ~0;
2737         ip_rt_max_size = INT_MAX;
2738
2739         devinet_init();
2740         ip_fib_init();
2741
2742         if (ip_rt_proc_init())
2743                 pr_err("Unable to create route proc files\n");
2744 #ifdef CONFIG_XFRM
2745         xfrm_init();
2746         xfrm4_init();
2747 #endif
2748         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2749
2750 #ifdef CONFIG_SYSCTL
2751         register_pernet_subsys(&sysctl_route_ops);
2752 #endif
2753         register_pernet_subsys(&rt_genid_ops);
2754         register_pernet_subsys(&ipv4_inetpeer_ops);
2755         return rc;
2756 }
2757
2758 #ifdef CONFIG_SYSCTL
2759 /*
2760  * We really need to sanitize the damn ipv4 init order, then all
2761  * this nonsense will go away.
2762  */
2763 void __init ip_static_sysctl_init(void)
2764 {
2765         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2766 }
2767 #endif