net/ipv4/netfilter/ip_conntrack_core.c
1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell
6  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 as
10  * published by the Free Software Foundation.
11  *
12  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13  *      - new API and handling of conntrack/nat helpers
14  *      - now capable of multiple expectations for one master
15  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16  *      - add usage/reference counts to ip_conntrack_expect
17  *      - export ip_conntrack[_expect]_{find_get,put} functions
18  * */
19
20 #include <linux/types.h>
21 #include <linux/icmp.h>
22 #include <linux/ip.h>
23 #include <linux/netfilter.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/module.h>
26 #include <linux/skbuff.h>
27 #include <linux/proc_fs.h>
28 #include <linux/vmalloc.h>
29 #include <net/checksum.h>
30 #include <net/ip.h>
31 #include <linux/stddef.h>
32 #include <linux/sysctl.h>
33 #include <linux/slab.h>
34 #include <linux/random.h>
35 #include <linux/jhash.h>
36 #include <linux/err.h>
37 #include <linux/percpu.h>
38 #include <linux/moduleparam.h>
39 #include <linux/notifier.h>
40
41 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
42    registrations, and conntrack timers. */
43 #include <linux/netfilter_ipv4/ip_conntrack.h>
44 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
45 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
46 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
47
48 #define IP_CONNTRACK_VERSION    "2.4"
49
50 #if 0
51 #define DEBUGP printk
52 #else
53 #define DEBUGP(format, args...)
54 #endif
55
56 DEFINE_RWLOCK(ip_conntrack_lock);
57
58 /* ip_conntrack_standalone needs this */
59 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
60
61 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
62 LIST_HEAD(ip_conntrack_expect_list);
63 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO] __read_mostly;
64 static LIST_HEAD(helpers);
65 unsigned int ip_conntrack_htable_size __read_mostly = 0;
66 int ip_conntrack_max __read_mostly;
67 struct list_head *ip_conntrack_hash __read_mostly;
68 static struct kmem_cache *ip_conntrack_cachep __read_mostly;
69 static struct kmem_cache *ip_conntrack_expect_cachep __read_mostly;
70 struct ip_conntrack ip_conntrack_untracked;
71 unsigned int ip_ct_log_invalid __read_mostly;
72 static LIST_HEAD(unconfirmed);
73 static int ip_conntrack_vmalloc __read_mostly;
74
75 static unsigned int ip_conntrack_next_id;
76 static unsigned int ip_conntrack_expect_next_id;
77 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
78 ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
79 ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
80
81 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
82
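/* How the cache works (summary added for clarity): each CPU caches at most
 * one conntrack together with a bitmask of pending events.  Paths that
 * generate an event call ip_conntrack_event_cache() (an inline in the
 * ip_conntrack header), which ORs the event bit into the per-CPU entry;
 * the accumulated bits are pushed through ip_conntrack_chain either when a
 * different conntrack needs the slot (__ip_ct_event_cache_init) or
 * explicitly via ip_ct_deliver_cached_events(). */
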
83 /* deliver cached events and clear cache entry - must be called with locally
84  * disabled softirqs */
85 static inline void
86 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
87 {
88         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
89         if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
90                 atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
91                                     ecache->ct);
92         ecache->events = 0;
93         ip_conntrack_put(ecache->ct);
94         ecache->ct = NULL;
95 }
96
97 /* Deliver all cached events for a particular conntrack. This is called
98  * by code prior to async packet handling or freeing the skb */
99 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
100 {
101         struct ip_conntrack_ecache *ecache;
102
103         local_bh_disable();
104         ecache = &__get_cpu_var(ip_conntrack_ecache);
105         if (ecache->ct == ct)
106                 __ip_ct_deliver_cached_events(ecache);
107         local_bh_enable();
108 }
109
110 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
111 {
112         struct ip_conntrack_ecache *ecache;
113
114         /* take care of delivering potentially old events */
115         ecache = &__get_cpu_var(ip_conntrack_ecache);
116         BUG_ON(ecache->ct == ct);
117         if (ecache->ct)
118                 __ip_ct_deliver_cached_events(ecache);
119         /* initialize for this conntrack/packet */
120         ecache->ct = ct;
121         nf_conntrack_get(&ct->ct_general);
122 }
123
124 /* flush the event cache - touches other CPUs' data and must not be called while
125  * packets are still passing through the code */
126 static void ip_ct_event_cache_flush(void)
127 {
128         struct ip_conntrack_ecache *ecache;
129         int cpu;
130
131         for_each_possible_cpu(cpu) {
132                 ecache = &per_cpu(ip_conntrack_ecache, cpu);
133                 if (ecache->ct)
134                         ip_conntrack_put(ecache->ct);
135         }
136 }
137 #else
138 static inline void ip_ct_event_cache_flush(void) {}
139 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
140
141 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
142
143 static int ip_conntrack_hash_rnd_initted;
144 static unsigned int ip_conntrack_hash_rnd;
145
146 static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
147                             unsigned int size, unsigned int rnd)
148 {
149         return (jhash_3words((__force u32)tuple->src.ip,
150                              ((__force u32)tuple->dst.ip ^ tuple->dst.protonum),
151                              (tuple->src.u.all | (tuple->dst.u.all << 16)),
152                              rnd) % size);
153 }
154
155 static u_int32_t
156 hash_conntrack(const struct ip_conntrack_tuple *tuple)
157 {
158         return __hash_conntrack(tuple, ip_conntrack_htable_size,
159                                 ip_conntrack_hash_rnd);
160 }
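
/* Note added for clarity: every confirmed conntrack occupies two hash
 * buckets, one keyed by its ORIGINAL-direction tuple and one by its
 * REPLY-direction tuple (see __ip_conntrack_hash_insert() below), so
 * lookups succeed regardless of which direction a packet travels. */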
161
162 int
163 ip_ct_get_tuple(const struct iphdr *iph,
164                 const struct sk_buff *skb,
165                 unsigned int dataoff,
166                 struct ip_conntrack_tuple *tuple,
167                 const struct ip_conntrack_protocol *protocol)
168 {
169         /* Should never happen */
170         if (iph->frag_off & htons(IP_OFFSET)) {
171                 printk("ip_conntrack_core: Frag of proto %u.\n",
172                        iph->protocol);
173                 return 0;
174         }
175
176         tuple->src.ip = iph->saddr;
177         tuple->dst.ip = iph->daddr;
178         tuple->dst.protonum = iph->protocol;
179         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
180
181         return protocol->pkt_to_tuple(skb, dataoff, tuple);
182 }
183
184 int
185 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
186                    const struct ip_conntrack_tuple *orig,
187                    const struct ip_conntrack_protocol *protocol)
188 {
189         inverse->src.ip = orig->dst.ip;
190         inverse->dst.ip = orig->src.ip;
191         inverse->dst.protonum = orig->dst.protonum;
192         inverse->dst.dir = !orig->dst.dir;
193
194         return protocol->invert_tuple(inverse, orig);
195 }
196
197
198 /* ip_conntrack_expect helper functions */
199 void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
200 {
201         IP_NF_ASSERT(!timer_pending(&exp->timeout));
202         list_del(&exp->list);
203         CONNTRACK_STAT_INC(expect_delete);
204         exp->master->expecting--;
205         ip_conntrack_expect_put(exp);
206 }
207
208 static void expectation_timed_out(unsigned long ul_expect)
209 {
210         struct ip_conntrack_expect *exp = (void *)ul_expect;
211
212         write_lock_bh(&ip_conntrack_lock);
213         ip_ct_unlink_expect(exp);
214         write_unlock_bh(&ip_conntrack_lock);
215         ip_conntrack_expect_put(exp);
216 }
217
218 struct ip_conntrack_expect *
219 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
220 {
221         struct ip_conntrack_expect *i;
222
223         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
224                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask))
225                         return i;
226         }
227         return NULL;
228 }
229
230 /* Just find an expectation corresponding to a tuple. */
231 struct ip_conntrack_expect *
232 ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
233 {
234         struct ip_conntrack_expect *i;
235
236         read_lock_bh(&ip_conntrack_lock);
237         i = __ip_conntrack_expect_find(tuple);
238         if (i)
239                 atomic_inc(&i->use);
240         read_unlock_bh(&ip_conntrack_lock);
241
242         return i;
243 }
244
245 /* If an expectation for this connection is found, it gets deleted from
246  * the global list, then returned. */
247 static struct ip_conntrack_expect *
248 find_expectation(const struct ip_conntrack_tuple *tuple)
249 {
250         struct ip_conntrack_expect *i;
251
252         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
253                 /* If master is not in hash table yet (ie. packet hasn't left
254                    this machine yet), how can other end know about expected?
255                    Hence these are not the droids you are looking for (if
256                    master ct never got confirmed, we'd hold a reference to it
257                    and weird things would happen to future packets). */
258                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
259                     && is_confirmed(i->master)) {
260                         if (i->flags & IP_CT_EXPECT_PERMANENT) {
261                                 atomic_inc(&i->use);
262                                 return i;
263                         } else if (del_timer(&i->timeout)) {
264                                 ip_ct_unlink_expect(i);
265                                 return i;
266                         }
267                 }
268         }
269         return NULL;
270 }
271
272 /* delete all expectations for this conntrack */
273 void ip_ct_remove_expectations(struct ip_conntrack *ct)
274 {
275         struct ip_conntrack_expect *i, *tmp;
276
277         /* Optimization: most connections never expect any others. */
278         if (ct->expecting == 0)
279                 return;
280
281         list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
282                 if (i->master == ct && del_timer(&i->timeout)) {
283                         ip_ct_unlink_expect(i);
284                         ip_conntrack_expect_put(i);
285                 }
286         }
287 }
288
289 static void
290 clean_from_lists(struct ip_conntrack *ct)
291 {
292         DEBUGP("clean_from_lists(%p)\n", ct);
293         list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
294         list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);
295
296         /* Destroy all pending expectations */
297         ip_ct_remove_expectations(ct);
298 }
299
300 static void
301 destroy_conntrack(struct nf_conntrack *nfct)
302 {
303         struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
304         struct ip_conntrack_protocol *proto;
305         struct ip_conntrack_helper *helper;
306
307         DEBUGP("destroy_conntrack(%p)\n", ct);
308         IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
309         IP_NF_ASSERT(!timer_pending(&ct->timeout));
310
311         ip_conntrack_event(IPCT_DESTROY, ct);
312         set_bit(IPS_DYING_BIT, &ct->status);
313
314         helper = ct->helper;
315         if (helper && helper->destroy)
316                 helper->destroy(ct);
317
318         /* To make sure we don't get any weird locking issues here:
319          * destroy_conntrack() MUST NOT be called with a write lock
320          * to ip_conntrack_lock!!! -HW */
321         proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
322         if (proto && proto->destroy)
323                 proto->destroy(ct);
324
325         if (ip_conntrack_destroyed)
326                 ip_conntrack_destroyed(ct);
327
328         write_lock_bh(&ip_conntrack_lock);
329         /* Expectations will have been removed in clean_from_lists,
330          * except TFTP can create an expectation on the first packet,
331          * before connection is in the list, so we need to clean here,
332          * too. */
333         ip_ct_remove_expectations(ct);
334
335         /* We overload first tuple to link into unconfirmed list. */
336         if (!is_confirmed(ct)) {
337                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
338                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
339         }
340
341         CONNTRACK_STAT_INC(delete);
342         write_unlock_bh(&ip_conntrack_lock);
343
344         if (ct->master)
345                 ip_conntrack_put(ct->master);
346
347         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
348         ip_conntrack_free(ct);
349 }
350
351 static void death_by_timeout(unsigned long ul_conntrack)
352 {
353         struct ip_conntrack *ct = (void *)ul_conntrack;
354
355         write_lock_bh(&ip_conntrack_lock);
356         /* Inside lock so preempt is disabled on module removal path.
357          * Otherwise we can get spurious warnings. */
358         CONNTRACK_STAT_INC(delete_list);
359         clean_from_lists(ct);
360         write_unlock_bh(&ip_conntrack_lock);
361         ip_conntrack_put(ct);
362 }
363
364 struct ip_conntrack_tuple_hash *
365 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
366                     const struct ip_conntrack *ignored_conntrack)
367 {
368         struct ip_conntrack_tuple_hash *h;
369         unsigned int hash = hash_conntrack(tuple);
370
371         list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
372                 if (tuplehash_to_ctrack(h) != ignored_conntrack &&
373                     ip_ct_tuple_equal(tuple, &h->tuple)) {
374                         CONNTRACK_STAT_INC(found);
375                         return h;
376                 }
377                 CONNTRACK_STAT_INC(searched);
378         }
379
380         return NULL;
381 }
382
383 /* Find a connection corresponding to a tuple. */
384 struct ip_conntrack_tuple_hash *
385 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
386                       const struct ip_conntrack *ignored_conntrack)
387 {
388         struct ip_conntrack_tuple_hash *h;
389
390         read_lock_bh(&ip_conntrack_lock);
391         h = __ip_conntrack_find(tuple, ignored_conntrack);
392         if (h)
393                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
394         read_unlock_bh(&ip_conntrack_lock);
395
396         return h;
397 }
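
/* Note added for clarity: ip_conntrack_find_get() takes a reference on the
 * entry it returns; callers must drop it again with ip_conntrack_put() when
 * they are done.  getorigdst() further down in this file is a typical
 * example of the find_get/put pattern. */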
398
399 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
400                                         unsigned int hash,
401                                         unsigned int repl_hash)
402 {
403         ct->id = ++ip_conntrack_next_id;
404         list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
405                  &ip_conntrack_hash[hash]);
406         list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
407                  &ip_conntrack_hash[repl_hash]);
408 }
409
410 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
411 {
412         unsigned int hash, repl_hash;
413
414         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
415         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
416
417         write_lock_bh(&ip_conntrack_lock);
418         __ip_conntrack_hash_insert(ct, hash, repl_hash);
419         write_unlock_bh(&ip_conntrack_lock);
420 }
421
422 /* Confirm a connection given skb; places it in hash table */
423 int
424 __ip_conntrack_confirm(struct sk_buff **pskb)
425 {
426         unsigned int hash, repl_hash;
427         struct ip_conntrack_tuple_hash *h;
428         struct ip_conntrack *ct;
429         enum ip_conntrack_info ctinfo;
430
431         ct = ip_conntrack_get(*pskb, &ctinfo);
432
433         /* ipt_REJECT uses ip_conntrack_attach to attach related
434            ICMP/TCP RST packets in the other direction.  The actual packet
435            which created the connection will be IP_CT_NEW, or IP_CT_RELATED
436            for an expected connection. */
437         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
438                 return NF_ACCEPT;
439
440         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
441         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
442
443         /* We're not in hash table, and we refuse to set up related
444            connections for unconfirmed conns.  But packet copies and
445            REJECT will give spurious warnings here. */
446         /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
447
448         /* No external references means no one else could have
449            confirmed us. */
450         IP_NF_ASSERT(!is_confirmed(ct));
451         DEBUGP("Confirming conntrack %p\n", ct);
452
453         write_lock_bh(&ip_conntrack_lock);
454
455         /* See if there's one in the list already, including reverse:
456            NAT could have grabbed it without realizing, since we're
457            not in the hash.  If there is, we lost the race. */
458         list_for_each_entry(h, &ip_conntrack_hash[hash], list)
459                 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
460                                       &h->tuple))
461                         goto out;
462         list_for_each_entry(h, &ip_conntrack_hash[repl_hash], list)
463                 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
464                                       &h->tuple))
465                         goto out;
466
467         /* Remove from unconfirmed list */
468         list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
469
470         __ip_conntrack_hash_insert(ct, hash, repl_hash);
471         /* Timer relative to confirmation time, not original
472            setting time, otherwise we'd get timer wrap in
473            weird delay cases. */
474         ct->timeout.expires += jiffies;
475         add_timer(&ct->timeout);
476         atomic_inc(&ct->ct_general.use);
477         set_bit(IPS_CONFIRMED_BIT, &ct->status);
478         CONNTRACK_STAT_INC(insert);
479         write_unlock_bh(&ip_conntrack_lock);
480         if (ct->helper)
481                 ip_conntrack_event_cache(IPCT_HELPER, *pskb);
482 #ifdef CONFIG_IP_NF_NAT_NEEDED
483         if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
484             test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
485                 ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
486 #endif
487         ip_conntrack_event_cache(master_ct(ct) ?
488                                  IPCT_RELATED : IPCT_NEW, *pskb);
489
490         return NF_ACCEPT;
491
492 out:
493         CONNTRACK_STAT_INC(insert_failed);
494         write_unlock_bh(&ip_conntrack_lock);
495         return NF_DROP;
496 }
497
498 /* Returns true if a connection corresponds to the tuple (required
499    for NAT). */
500 int
501 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
502                          const struct ip_conntrack *ignored_conntrack)
503 {
504         struct ip_conntrack_tuple_hash *h;
505
506         read_lock_bh(&ip_conntrack_lock);
507         h = __ip_conntrack_find(tuple, ignored_conntrack);
508         read_unlock_bh(&ip_conntrack_lock);
509
510         return h != NULL;
511 }
512
513 /* There's a small race here where we may free a just-assured
514    connection.  Too bad: we're in trouble anyway. */
515 static int early_drop(struct list_head *chain)
516 {
517         /* Traverse backwards: gives us oldest, which is roughly LRU */
518         struct ip_conntrack_tuple_hash *h;
519         struct ip_conntrack *ct = NULL, *tmp;
520         int dropped = 0;
521
522         read_lock_bh(&ip_conntrack_lock);
523         list_for_each_entry_reverse(h, chain, list) {
524                 tmp = tuplehash_to_ctrack(h);
525                 if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) {
526                         ct = tmp;
527                         atomic_inc(&ct->ct_general.use);
528                         break;
529                 }
530         }
531         read_unlock_bh(&ip_conntrack_lock);
532
533         if (!ct)
534                 return dropped;
535
536         if (del_timer(&ct->timeout)) {
537                 death_by_timeout((unsigned long)ct);
538                 dropped = 1;
539                 CONNTRACK_STAT_INC(early_drop);
540         }
541         ip_conntrack_put(ct);
542         return dropped;
543 }
544
545 static struct ip_conntrack_helper *
546 __ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
547 {
548         struct ip_conntrack_helper *h;
549
550         list_for_each_entry(h, &helpers, list) {
551                 if (ip_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask))
552                         return h;
553         }
554         return NULL;
555 }
556
557 struct ip_conntrack_helper *
558 ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
559 {
560         struct ip_conntrack_helper *helper;
561
562         /* need ip_conntrack_lock to assure that helper exists until
563          * try_module_get() is called */
564         read_lock_bh(&ip_conntrack_lock);
565
566         helper = __ip_conntrack_helper_find(tuple);
567         if (helper) {
568                 /* need to increase module usage count to assure helper will
569                  * not go away while the caller is e.g. busy putting a
570                  * conntrack in the hash that uses the helper */
571                 if (!try_module_get(helper->me))
572                         helper = NULL;
573         }
574
575         read_unlock_bh(&ip_conntrack_lock);
576
577         return helper;
578 }
579
580 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
581 {
582         module_put(helper->me);
583 }
584
585 struct ip_conntrack_protocol *
586 __ip_conntrack_proto_find(u_int8_t protocol)
587 {
588         return ip_ct_protos[protocol];
589 }
590
591 /* this is guaranteed to always return a valid protocol helper, since
592  * it falls back to generic_protocol */
593 struct ip_conntrack_protocol *
594 ip_conntrack_proto_find_get(u_int8_t protocol)
595 {
596         struct ip_conntrack_protocol *p;
597
598         preempt_disable();
599         p = __ip_conntrack_proto_find(protocol);
600         if (p) {
601                 if (!try_module_get(p->me))
602                         p = &ip_conntrack_generic_protocol;
603         }
604         preempt_enable();
605
606         return p;
607 }
608
609 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
610 {
611         module_put(p->me);
612 }
613
614 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
615                                         struct ip_conntrack_tuple *repl)
616 {
617         struct ip_conntrack *conntrack;
618
619         if (!ip_conntrack_hash_rnd_initted) {
620                 get_random_bytes(&ip_conntrack_hash_rnd, 4);
621                 ip_conntrack_hash_rnd_initted = 1;
622         }
623
624         /* We don't want any race condition at early drop stage */
625         atomic_inc(&ip_conntrack_count);
626
627         if (ip_conntrack_max
628             && atomic_read(&ip_conntrack_count) > ip_conntrack_max) {
629                 unsigned int hash = hash_conntrack(orig);
630                 /* Try dropping from this hash chain. */
631                 if (!early_drop(&ip_conntrack_hash[hash])) {
632                         atomic_dec(&ip_conntrack_count);
633                         if (net_ratelimit())
634                                 printk(KERN_WARNING
635                                        "ip_conntrack: table full, dropping"
636                                        " packet.\n");
637                         return ERR_PTR(-ENOMEM);
638                 }
639         }
640
641         conntrack = kmem_cache_zalloc(ip_conntrack_cachep, GFP_ATOMIC);
642         if (!conntrack) {
643                 DEBUGP("Can't allocate conntrack.\n");
644                 atomic_dec(&ip_conntrack_count);
645                 return ERR_PTR(-ENOMEM);
646         }
647
648         atomic_set(&conntrack->ct_general.use, 1);
649         conntrack->ct_general.destroy = destroy_conntrack;
650         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
651         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
652         /* Don't set timer yet: wait for confirmation */
653         init_timer(&conntrack->timeout);
654         conntrack->timeout.data = (unsigned long)conntrack;
655         conntrack->timeout.function = death_by_timeout;
656
657         return conntrack;
658 }
659
660 void
661 ip_conntrack_free(struct ip_conntrack *conntrack)
662 {
663         atomic_dec(&ip_conntrack_count);
664         kmem_cache_free(ip_conntrack_cachep, conntrack);
665 }
666
667 /* Allocate a new conntrack: we return -ENOMEM if classification
668  * failed due to stress.   Otherwise it really is unclassifiable */
669 static struct ip_conntrack_tuple_hash *
670 init_conntrack(struct ip_conntrack_tuple *tuple,
671                struct ip_conntrack_protocol *protocol,
672                struct sk_buff *skb)
673 {
674         struct ip_conntrack *conntrack;
675         struct ip_conntrack_tuple repl_tuple;
676         struct ip_conntrack_expect *exp;
677
678         if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
679                 DEBUGP("Can't invert tuple.\n");
680                 return NULL;
681         }
682
683         conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
684         if (conntrack == NULL || IS_ERR(conntrack))
685                 return (struct ip_conntrack_tuple_hash *)conntrack;
686
687         if (!protocol->new(conntrack, skb)) {
688                 ip_conntrack_free(conntrack);
689                 return NULL;
690         }
691
692         write_lock_bh(&ip_conntrack_lock);
693         exp = find_expectation(tuple);
694
695         if (exp) {
696                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
697                         conntrack, exp);
698                 /* Welcome, Mr. Bond.  We've been expecting you... */
699                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
700                 conntrack->master = exp->master;
701 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
702                 conntrack->mark = exp->master->mark;
703 #endif
704 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
705     defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
706                 /* this is ugly, but there is no other place to put it */
707                 conntrack->nat.masq_index = exp->master->nat.masq_index;
708 #endif
709 #ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
710                 conntrack->secmark = exp->master->secmark;
711 #endif
712                 nf_conntrack_get(&conntrack->master->ct_general);
713                 CONNTRACK_STAT_INC(expect_new);
714         } else {
715                 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
716
717                 CONNTRACK_STAT_INC(new);
718         }
719
720         /* Overload tuple linked list to put us in unconfirmed list. */
721         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
722
723         write_unlock_bh(&ip_conntrack_lock);
724
725         if (exp) {
726                 if (exp->expectfn)
727                         exp->expectfn(conntrack, exp);
728                 ip_conntrack_expect_put(exp);
729         }
730
731         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
732 }
733
734 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
735 static inline struct ip_conntrack *
736 resolve_normal_ct(struct sk_buff *skb,
737                   struct ip_conntrack_protocol *proto,
738                   int *set_reply,
739                   unsigned int hooknum,
740                   enum ip_conntrack_info *ctinfo)
741 {
742         struct ip_conntrack_tuple tuple;
743         struct ip_conntrack_tuple_hash *h;
744         struct ip_conntrack *ct;
745
746         IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
747
748         if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
749                                 &tuple, proto))
750                 return NULL;
751
752         /* look for tuple match */
753         h = ip_conntrack_find_get(&tuple, NULL);
754         if (!h) {
755                 h = init_conntrack(&tuple, proto, skb);
756                 if (!h)
757                         return NULL;
758                 if (IS_ERR(h))
759                         return (void *)h;
760         }
761         ct = tuplehash_to_ctrack(h);
762
763         /* It exists; we have (non-exclusive) reference. */
764         if (DIRECTION(h) == IP_CT_DIR_REPLY) {
765                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
766                 /* Please set the reply bit if this packet is OK */
767                 *set_reply = 1;
768         } else {
769                 /* Once we've had two way comms, always ESTABLISHED. */
770                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
771                         DEBUGP("ip_conntrack_in: normal packet for %p\n",
772                                ct);
773                         *ctinfo = IP_CT_ESTABLISHED;
774                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
775                         DEBUGP("ip_conntrack_in: related packet for %p\n",
776                                ct);
777                         *ctinfo = IP_CT_RELATED;
778                 } else {
779                         DEBUGP("ip_conntrack_in: new packet for %p\n",
780                                ct);
781                         *ctinfo = IP_CT_NEW;
782                 }
783                 *set_reply = 0;
784         }
785         skb->nfct = &ct->ct_general;
786         skb->nfctinfo = *ctinfo;
787         return ct;
788 }
789
790 /* Netfilter hook itself. */
791 unsigned int ip_conntrack_in(unsigned int hooknum,
792                              struct sk_buff **pskb,
793                              const struct net_device *in,
794                              const struct net_device *out,
795                              int (*okfn)(struct sk_buff *))
796 {
797         struct ip_conntrack *ct;
798         enum ip_conntrack_info ctinfo;
799         struct ip_conntrack_protocol *proto;
800         int set_reply = 0;
801         int ret;
802
803         /* Previously seen (loopback or untracked)?  Ignore. */
804         if ((*pskb)->nfct) {
805                 CONNTRACK_STAT_INC(ignore);
806                 return NF_ACCEPT;
807         }
808
809         /* Should never happen */
810         if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
811                 if (net_ratelimit()) {
812                         printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
813                                (*pskb)->nh.iph->protocol, hooknum);
814                 }
815                 return NF_DROP;
816         }
817
818 /* Doesn't cover locally-generated broadcast, so not worth it. */
819 #if 0
820         /* Ignore broadcast: no `connection'. */
821         if ((*pskb)->pkt_type == PACKET_BROADCAST) {
822                 printk("Broadcast packet!\n");
823                 return NF_ACCEPT;
824         } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
825                    == htonl(0x000000FF)) {
826                 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
827                        NIPQUAD((*pskb)->nh.iph->saddr),
828                        NIPQUAD((*pskb)->nh.iph->daddr),
829                        (*pskb)->sk, (*pskb)->pkt_type);
830         }
831 #endif
832
833         proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
834
835         /* It may be a special packet (error, unclean...); the
836          * inverse of the return code tells the netfilter
837          * core what to do with the packet. */
838         if (proto->error != NULL
839             && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
840                 CONNTRACK_STAT_INC(error);
841                 CONNTRACK_STAT_INC(invalid);
842                 return -ret;
843         }
844
845         if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
846                 /* Not valid part of a connection */
847                 CONNTRACK_STAT_INC(invalid);
848                 return NF_ACCEPT;
849         }
850
851         if (IS_ERR(ct)) {
852                 /* Too stressed to deal. */
853                 CONNTRACK_STAT_INC(drop);
854                 return NF_DROP;
855         }
856
857         IP_NF_ASSERT((*pskb)->nfct);
858
859         ret = proto->packet(ct, *pskb, ctinfo);
860         if (ret < 0) {
861                 /* Invalid: the inverse of the return code tells
862                  * the netfilter core what to do */
863                 nf_conntrack_put((*pskb)->nfct);
864                 (*pskb)->nfct = NULL;
865                 CONNTRACK_STAT_INC(invalid);
866                 return -ret;
867         }
868
869         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
870                 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
871
872         return ret;
873 }
874
875 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
876                    const struct ip_conntrack_tuple *orig)
877 {
878         return ip_ct_invert_tuple(inverse, orig,
879                                   __ip_conntrack_proto_find(orig->dst.protonum));
880 }
881
882 /* Would two expected things clash? */
883 static inline int expect_clash(const struct ip_conntrack_expect *a,
884                                const struct ip_conntrack_expect *b)
885 {
886         /* Part covered by intersection of masks must be unequal,
887            otherwise they clash */
888         struct ip_conntrack_tuple intersect_mask
889                 = { { a->mask.src.ip & b->mask.src.ip,
890                       { a->mask.src.u.all & b->mask.src.u.all } },
891                     { a->mask.dst.ip & b->mask.dst.ip,
892                       { a->mask.dst.u.all & b->mask.dst.u.all },
893                       a->mask.dst.protonum & b->mask.dst.protonum } };
894
895         return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
896 }
897
898 static inline int expect_matches(const struct ip_conntrack_expect *a,
899                                  const struct ip_conntrack_expect *b)
900 {
901         return a->master == b->master
902                 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
903                 && ip_ct_tuple_equal(&a->mask, &b->mask);
904 }
905
906 /* Generally a bad idea to call this: could have matched already. */
907 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
908 {
909         struct ip_conntrack_expect *i;
910
911         write_lock_bh(&ip_conntrack_lock);
912         /* choose the oldest expectation to evict */
913         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
914                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
915                         ip_ct_unlink_expect(i);
916                         write_unlock_bh(&ip_conntrack_lock);
917                         ip_conntrack_expect_put(i);
918                         return;
919                 }
920         }
921         write_unlock_bh(&ip_conntrack_lock);
922 }
923
924 /* We don't take a reference on the master conntrack for non-fulfilled
925  * expectations: during conntrack destruction, the expectations are
926  * always killed before the conntrack itself. */
927 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
928 {
929         struct ip_conntrack_expect *new;
930
931         new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
932         if (!new) {
933                 DEBUGP("expect_related: OOM allocating expect\n");
934                 return NULL;
935         }
936         new->master = me;
937         atomic_set(&new->use, 1);
938         return new;
939 }
940
941 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
942 {
943         if (atomic_dec_and_test(&exp->use))
944                 kmem_cache_free(ip_conntrack_expect_cachep, exp);
945 }
946
947 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
948 {
949         atomic_inc(&exp->use);
950         exp->master->expecting++;
951         list_add(&exp->list, &ip_conntrack_expect_list);
952
953         init_timer(&exp->timeout);
954         exp->timeout.data = (unsigned long)exp;
955         exp->timeout.function = expectation_timed_out;
956         exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
957         add_timer(&exp->timeout);
958
959         exp->id = ++ip_conntrack_expect_next_id;
960         atomic_inc(&exp->use);
961         CONNTRACK_STAT_INC(expect_create);
962 }
963
964 /* Race with expectations being used means we could have none to find; OK. */
965 static void evict_oldest_expect(struct ip_conntrack *master)
966 {
967         struct ip_conntrack_expect *i;
968
969         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
970                 if (i->master == master) {
971                         if (del_timer(&i->timeout)) {
972                                 ip_ct_unlink_expect(i);
973                                 ip_conntrack_expect_put(i);
974                         }
975                         break;
976                 }
977         }
978 }
979
980 static inline int refresh_timer(struct ip_conntrack_expect *i)
981 {
982         if (!del_timer(&i->timeout))
983                 return 0;
984
985         i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
986         add_timer(&i->timeout);
987         return 1;
988 }
989
990 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
991 {
992         struct ip_conntrack_expect *i;
993         int ret;
994
995         DEBUGP("ip_conntrack_expect_related %p\n", expect);
996         DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
997         DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
998
999         write_lock_bh(&ip_conntrack_lock);
1000         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1001                 if (expect_matches(i, expect)) {
1002                         /* Refresh timer: if it's dying, ignore.. */
1003                         if (refresh_timer(i)) {
1004                                 ret = 0;
1005                                 goto out;
1006                         }
1007                 } else if (expect_clash(i, expect)) {
1008                         ret = -EBUSY;
1009                         goto out;
1010                 }
1011         }
1012
1013         /* Will we be over the limit? */
1014         if (expect->master->helper->max_expected &&
1015             expect->master->expecting >= expect->master->helper->max_expected)
1016                 evict_oldest_expect(expect->master);
1017
1018         ip_conntrack_expect_insert(expect);
1019         ip_conntrack_expect_event(IPEXP_NEW, expect);
1020         ret = 0;
1021 out:
1022         write_unlock_bh(&ip_conntrack_lock);
1023         return ret;
1024 }
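
/* Illustrative sketch (not code from this file): a conntrack helper that has
 * parsed an upcoming data connection out of the control stream typically
 * drives the expectation API above like this.  Here "ct" is the control
 * connection's conntrack and "port" is a hypothetical port number learned
 * from the payload; a real helper (e.g. the FTP helper) fills the tuple and
 * mask completely.
 *
 *      struct ip_conntrack_expect *exp;
 *
 *      exp = ip_conntrack_expect_alloc(ct);
 *      if (exp == NULL)
 *              return NF_DROP;
 *      exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
 *      exp->tuple.dst.u.tcp.port = htons(port);
 *      exp->mask.src.ip = htonl(0xFFFFFFFF);
 *      exp->mask.src.u.tcp.port = 0;           // any source port
 *      exp->mask.dst.ip = htonl(0xFFFFFFFF);
 *      exp->mask.dst.u.tcp.port = htons(0xFFFF);
 *      exp->mask.dst.protonum = 0xFF;
 *      exp->expectfn = NULL;
 *      exp->flags = 0;
 *      ret = ip_conntrack_expect_related(exp); // may return -EBUSY on a clash
 *      ip_conntrack_expect_put(exp);
 */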
1025
1026 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1027    implicitly racy: see __ip_conntrack_confirm */
1028 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1029                               const struct ip_conntrack_tuple *newreply)
1030 {
1031         write_lock_bh(&ip_conntrack_lock);
1032         /* Should be unconfirmed, so not in hash table yet */
1033         IP_NF_ASSERT(!is_confirmed(conntrack));
1034
1035         DEBUGP("Altering reply tuple of %p to ", conntrack);
1036         DUMP_TUPLE(newreply);
1037
1038         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1039         if (!conntrack->master && conntrack->expecting == 0)
1040                 conntrack->helper = __ip_conntrack_helper_find(newreply);
1041         write_unlock_bh(&ip_conntrack_lock);
1042 }
1043
1044 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1045 {
1046         BUG_ON(me->timeout == 0);
1047         write_lock_bh(&ip_conntrack_lock);
1048         list_add(&me->list, &helpers);
1049         write_unlock_bh(&ip_conntrack_lock);
1050
1051         return 0;
1052 }
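
/* Illustrative sketch (not code from this file): a helper module describes
 * the control connections it wants to see with a tuple/mask pair and
 * registers itself from its init function, roughly as below.  The names
 * prefixed "my_" and the port 1234 are hypothetical; see the in-tree FTP
 * helper for a real example.
 *
 *      static struct ip_conntrack_helper my_helper = {
 *              .name           = "my-proto",
 *              .me             = THIS_MODULE,
 *              .max_expected   = 1,
 *              .timeout        = 5 * 60,       // seconds
 *              .tuple          = { .src = { .u = { .tcp = { .port = __constant_htons(1234) } } },
 *                                  .dst = { .protonum = IPPROTO_TCP } },
 *              .mask           = { .src = { .u = { .tcp = { .port = __constant_htons(0xFFFF) } } },
 *                                  .dst = { .protonum = 0xFF } },
 *              .help           = my_help,
 *      };
 *
 *      ip_conntrack_helper_register(&my_helper);
 */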
1053
1054 struct ip_conntrack_helper *
1055 __ip_conntrack_helper_find_byname(const char *name)
1056 {
1057         struct ip_conntrack_helper *h;
1058
1059         list_for_each_entry(h, &helpers, list) {
1060                 if (!strcmp(h->name, name))
1061                         return h;
1062         }
1063
1064         return NULL;
1065 }
1066
1067 static inline void unhelp(struct ip_conntrack_tuple_hash *i,
1068                           const struct ip_conntrack_helper *me)
1069 {
1070         if (tuplehash_to_ctrack(i)->helper == me) {
1071                 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1072                 tuplehash_to_ctrack(i)->helper = NULL;
1073         }
1074 }
1075
1076 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1077 {
1078         unsigned int i;
1079         struct ip_conntrack_tuple_hash *h;
1080         struct ip_conntrack_expect *exp, *tmp;
1081
1082         /* Need write lock here, to delete helper. */
1083         write_lock_bh(&ip_conntrack_lock);
1084         list_del(&me->list);
1085
1086         /* Get rid of expectations */
1087         list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1088                 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1089                         ip_ct_unlink_expect(exp);
1090                         ip_conntrack_expect_put(exp);
1091                 }
1092         }
1093         /* For all remaining conntracks (unconfirmed and hashed), set helper to NULL. */
1094         list_for_each_entry(h, &unconfirmed, list)
1095                 unhelp(h, me);
1096         for (i = 0; i < ip_conntrack_htable_size; i++) {
1097                 list_for_each_entry(h, &ip_conntrack_hash[i], list)
1098                         unhelp(h, me);
1099         }
1100         write_unlock_bh(&ip_conntrack_lock);
1101
1102         /* Someone could still be looking at the helper in a bh. */
1103         synchronize_net();
1104 }
1105
1106 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1107 void __ip_ct_refresh_acct(struct ip_conntrack *ct,
1108                         enum ip_conntrack_info ctinfo,
1109                         const struct sk_buff *skb,
1110                         unsigned long extra_jiffies,
1111                         int do_acct)
1112 {
1113         int event = 0;
1114
1115         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1116         IP_NF_ASSERT(skb);
1117
1118         write_lock_bh(&ip_conntrack_lock);
1119
1120         /* Only update if this is not a fixed timeout */
1121         if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1122                 write_unlock_bh(&ip_conntrack_lock);
1123                 return;
1124         }
1125
1126         /* If not in hash table, timer will not be active yet */
1127         if (!is_confirmed(ct)) {
1128                 ct->timeout.expires = extra_jiffies;
1129                 event = IPCT_REFRESH;
1130         } else {
1131                 /* Need del_timer for race avoidance (may already be dying). */
1132                 if (del_timer(&ct->timeout)) {
1133                         ct->timeout.expires = jiffies + extra_jiffies;
1134                         add_timer(&ct->timeout);
1135                         event = IPCT_REFRESH;
1136                 }
1137         }
1138
1139 #ifdef CONFIG_IP_NF_CT_ACCT
1140         if (do_acct) {
1141                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1142                 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1143                                                 ntohs(skb->nh.iph->tot_len);
1144                 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1145                     || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1146                         event |= IPCT_COUNTER_FILLING;
1147         }
1148 #endif
1149
1150         write_unlock_bh(&ip_conntrack_lock);
1151
1152         /* must be unlocked when calling event cache */
1153         if (event)
1154                 ip_conntrack_event_cache(event, skb);
1155 }
1156
1157 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1158     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1159 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1160  * in ip_conntrack_core, since we don't want the protocols to autoload
1161  * or depend on ctnetlink */
1162 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1163                                const struct ip_conntrack_tuple *tuple)
1164 {
1165         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(__be16),
1166                 &tuple->src.u.tcp.port);
1167         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(__be16),
1168                 &tuple->dst.u.tcp.port);
1169         return 0;
1170
1171 nfattr_failure:
1172         return -1;
1173 }
1174
1175 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1176                                struct ip_conntrack_tuple *t)
1177 {
1178         if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1179                 return -EINVAL;
1180
1181         t->src.u.tcp.port =
1182                 *(__be16 *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1183         t->dst.u.tcp.port =
1184                 *(__be16 *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1185
1186         return 0;
1187 }
1188 #endif
1189
1190 /* Returns new sk_buff, or NULL */
1191 struct sk_buff *
1192 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1193 {
1194         skb_orphan(skb);
1195
1196         local_bh_disable();
1197         skb = ip_defrag(skb, user);
1198         local_bh_enable();
1199
1200         if (skb)
1201                 ip_send_check(skb->nh.iph);
1202         return skb;
1203 }
1204
1205 /* Used by ipt_REJECT. */
1206 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1207 {
1208         struct ip_conntrack *ct;
1209         enum ip_conntrack_info ctinfo;
1210
1211         /* This ICMP is in reverse direction to the packet which caused it */
1212         ct = ip_conntrack_get(skb, &ctinfo);
1213
1214         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1215                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1216         else
1217                 ctinfo = IP_CT_RELATED;
1218
1219         /* Attach to new skbuff, and increment count */
1220         nskb->nfct = &ct->ct_general;
1221         nskb->nfctinfo = ctinfo;
1222         nf_conntrack_get(nskb->nfct);
1223 }
1224
1225 /* Bring out ya dead! */
1226 static struct ip_conntrack *
1227 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1228                 void *data, unsigned int *bucket)
1229 {
1230         struct ip_conntrack_tuple_hash *h;
1231         struct ip_conntrack *ct;
1232
1233         write_lock_bh(&ip_conntrack_lock);
1234         for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1235                 list_for_each_entry(h, &ip_conntrack_hash[*bucket], list) {
1236                         ct = tuplehash_to_ctrack(h);
1237                         if (iter(ct, data))
1238                                 goto found;
1239                 }
1240         }
1241         list_for_each_entry(h, &unconfirmed, list) {
1242                 ct = tuplehash_to_ctrack(h);
1243                 if (iter(ct, data))
1244                         goto found;
1245         }
1246         write_unlock_bh(&ip_conntrack_lock);
1247         return NULL;
1248
1249 found:
1250         atomic_inc(&ct->ct_general.use);
1251         write_unlock_bh(&ip_conntrack_lock);
1252         return ct;
1253 }
1254
1255 void
1256 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1257 {
1258         struct ip_conntrack *ct;
1259         unsigned int bucket = 0;
1260
1261         while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
1262                 /* Time to push up daisies... */
1263                 if (del_timer(&ct->timeout))
1264                         death_by_timeout((unsigned long)ct);
1265                 /* ... else the timer will get him soon. */
1266
1267                 ip_conntrack_put(ct);
1268         }
1269 }
1270
1271 /* Fast function for those who don't want to parse /proc (and I don't
1272    blame them). */
1273 /* Reversing the socket's dst/src point of view gives us the reply
1274    mapping. */
1275 static int
1276 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1277 {
1278         struct inet_sock *inet = inet_sk(sk);
1279         struct ip_conntrack_tuple_hash *h;
1280         struct ip_conntrack_tuple tuple;
1281
1282         IP_CT_TUPLE_U_BLANK(&tuple);
1283         tuple.src.ip = inet->rcv_saddr;
1284         tuple.src.u.tcp.port = inet->sport;
1285         tuple.dst.ip = inet->daddr;
1286         tuple.dst.u.tcp.port = inet->dport;
1287         tuple.dst.protonum = IPPROTO_TCP;
1288
1289         /* We only do TCP at the moment: is there a better way? */
1290         if (strcmp(sk->sk_prot->name, "TCP")) {
1291                 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1292                 return -ENOPROTOOPT;
1293         }
1294
1295         if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1296                 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1297                        *len, sizeof(struct sockaddr_in));
1298                 return -EINVAL;
1299         }
1300
1301         h = ip_conntrack_find_get(&tuple, NULL);
1302         if (h) {
1303                 struct sockaddr_in sin;
1304                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1305
1306                 sin.sin_family = AF_INET;
1307                 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1308                         .tuple.dst.u.tcp.port;
1309                 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1310                         .tuple.dst.ip;
1311                 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
1312
1313                 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1314                        NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1315                 ip_conntrack_put(ct);
1316                 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1317                         return -EFAULT;
1318                 else
1319                         return 0;
1320         }
1321         DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1322                NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1323                NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1324         return -ENOENT;
1325 }
1326
1327 static struct nf_sockopt_ops so_getorigdst = {
1328         .pf             = PF_INET,
1329         .get_optmin     = SO_ORIGINAL_DST,
1330         .get_optmax     = SO_ORIGINAL_DST+1,
1331         .get            = &getorigdst,
1332 };
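
/* Illustrative userspace sketch (not part of this file): a transparent proxy
 * that accepted a REDIRECTed connection can recover the pre-NAT destination
 * through this socket option.  Assumes SO_ORIGINAL_DST from
 * <linux/netfilter_ipv4.h>; "fd" is the accepted socket.
 *
 *      struct sockaddr_in orig;
 *      socklen_t len = sizeof(orig);
 *
 *      if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &orig, &len) == 0)
 *              printf("original destination %s:%u\n",
 *                     inet_ntoa(orig.sin_addr), ntohs(orig.sin_port));
 */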
1333
1334 static int kill_all(struct ip_conntrack *i, void *data)
1335 {
1336         return 1;
1337 }
1338
1339 void ip_conntrack_flush(void)
1340 {
1341         ip_ct_iterate_cleanup(kill_all, NULL);
1342 }
1343
1344 static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
1345 {
1346         if (vmalloced)
1347                 vfree(hash);
1348         else
1349                 free_pages((unsigned long)hash,
1350                            get_order(sizeof(struct list_head) * size));
1351 }
1352
1353 /* Mishearing the voices in his head, our hero wonders how he's
1354    supposed to kill the mall. */
1355 void ip_conntrack_cleanup(void)
1356 {
1357         ip_ct_attach = NULL;
1358
1359         /* This makes sure all current packets have passed through
1360            netfilter framework.  Roll on, two-stage module
1361            delete... */
1362         synchronize_net();
1363
1364         ip_ct_event_cache_flush();
1365  i_see_dead_people:
1366         ip_conntrack_flush();
1367         if (atomic_read(&ip_conntrack_count) != 0) {
1368                 schedule();
1369                 goto i_see_dead_people;
1370         }
1371         /* wait until all references to ip_conntrack_untracked are dropped */
1372         while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1373                 schedule();
1374
1375         kmem_cache_destroy(ip_conntrack_cachep);
1376         kmem_cache_destroy(ip_conntrack_expect_cachep);
1377         free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1378                             ip_conntrack_htable_size);
1379         nf_unregister_sockopt(&so_getorigdst);
1380 }
1381
1382 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1383 {
1384         struct list_head *hash;
1385         unsigned int i;
1386
1387         *vmalloced = 0;
1388         hash = (void*)__get_free_pages(GFP_KERNEL,
1389                                        get_order(sizeof(struct list_head)
1390                                                  * size));
1391         if (!hash) {
1392                 *vmalloced = 1;
1393                 printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n");
1394                 hash = vmalloc(sizeof(struct list_head) * size);
1395         }
1396
1397         if (hash)
1398                 for (i = 0; i < size; i++)
1399                         INIT_LIST_HEAD(&hash[i]);
1400
1401         return hash;
1402 }
1403
1404 static int set_hashsize(const char *val, struct kernel_param *kp)
1405 {
1406         int i, bucket, hashsize, vmalloced;
1407         int old_vmalloced, old_size;
1408         int rnd;
1409         struct list_head *hash, *old_hash;
1410         struct ip_conntrack_tuple_hash *h;
1411
1412         /* On boot, we can set this without any fancy locking. */
1413         if (!ip_conntrack_htable_size)
1414                 return param_set_int(val, kp);
1415
1416         hashsize = simple_strtol(val, NULL, 0);
1417         if (!hashsize)
1418                 return -EINVAL;
1419
1420         hash = alloc_hashtable(hashsize, &vmalloced);
1421         if (!hash)
1422                 return -ENOMEM;
1423
1424         /* We have to rehash for the new table anyway, so we can also
1425          * use a new random seed. */
1426         get_random_bytes(&rnd, 4);
1427
1428         write_lock_bh(&ip_conntrack_lock);
1429         for (i = 0; i < ip_conntrack_htable_size; i++) {
1430                 while (!list_empty(&ip_conntrack_hash[i])) {
1431                         h = list_entry(ip_conntrack_hash[i].next,
1432                                        struct ip_conntrack_tuple_hash, list);
1433                         list_del(&h->list);
1434                         bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1435                         list_add_tail(&h->list, &hash[bucket]);
1436                 }
1437         }
1438         old_size = ip_conntrack_htable_size;
1439         old_vmalloced = ip_conntrack_vmalloc;
1440         old_hash = ip_conntrack_hash;
1441
1442         ip_conntrack_htable_size = hashsize;
1443         ip_conntrack_vmalloc = vmalloced;
1444         ip_conntrack_hash = hash;
1445         ip_conntrack_hash_rnd = rnd;
1446         write_unlock_bh(&ip_conntrack_lock);
1447
1448         free_conntrack_hash(old_hash, old_vmalloced, old_size);
1449         return 0;
1450 }
1451
1452 module_param_call(hashsize, set_hashsize, param_get_uint,
1453                   &ip_conntrack_htable_size, 0600);
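
/* Note added for clarity: because set_hashsize() rehashes live entries, the
 * "hashsize" parameter can be given at load time (e.g.
 * "modprobe ip_conntrack hashsize=16384") or changed at runtime by root via
 * /sys/module/ip_conntrack/parameters/hashsize (mode 0600 above); the value
 * 16384 is only an example. */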
1454
1455 int __init ip_conntrack_init(void)
1456 {
1457         unsigned int i;
1458         int ret;
1459
1460         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1461          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1462         if (!ip_conntrack_htable_size) {
1463                 ip_conntrack_htable_size
1464                         = (((num_physpages << PAGE_SHIFT) / 16384)
1465                            / sizeof(struct list_head));
1466                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1467                         ip_conntrack_htable_size = 8192;
1468                 if (ip_conntrack_htable_size < 16)
1469                         ip_conntrack_htable_size = 16;
1470         }
1471         ip_conntrack_max = 8 * ip_conntrack_htable_size;
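        /* Worked example (added for clarity): with 8-byte list_heads on
         * i386, a 32 MB machine yields 32 MB / 16384 = 2048 bytes of table,
         * i.e. 256 buckets and ip_conntrack_max = 2048; a 512 MB machine
         * yields 4096 buckets and 32768 conntracks; >= 1 GB is capped at
         * 8192 buckets (65536 conntracks). */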
1472
1473         printk("ip_conntrack version %s (%u buckets, %d max)"
1474                " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1475                ip_conntrack_htable_size, ip_conntrack_max,
1476                sizeof(struct ip_conntrack));
1477
1478         ret = nf_register_sockopt(&so_getorigdst);
1479         if (ret != 0) {
1480                 printk(KERN_ERR "Unable to register netfilter socket option\n");
1481                 return ret;
1482         }
1483
1484         ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
1485                                             &ip_conntrack_vmalloc);
1486         if (!ip_conntrack_hash) {
1487                 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1488                 goto err_unreg_sockopt;
1489         }
1490
1491         ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1492                                                 sizeof(struct ip_conntrack), 0,
1493                                                 0, NULL, NULL);
1494         if (!ip_conntrack_cachep) {
1495                 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1496                 goto err_free_hash;
1497         }
1498
1499         ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1500                                         sizeof(struct ip_conntrack_expect),
1501                                         0, 0, NULL, NULL);
1502         if (!ip_conntrack_expect_cachep) {
1503                 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1504                 goto err_free_conntrack_slab;
1505         }
1506
1507         /* Don't NEED lock here, but good form anyway. */
1508         write_lock_bh(&ip_conntrack_lock);
1509         for (i = 0; i < MAX_IP_CT_PROTO; i++)
1510                 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1511         /* Sew in builtin protocols. */
1512         ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1513         ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1514         ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1515         write_unlock_bh(&ip_conntrack_lock);
1516
1517         /* For use by ipt_REJECT */
1518         ip_ct_attach = ip_conntrack_attach;
1519
1520         /* Set up fake conntrack:
1521             - to never be deleted, not in any hashes */
1522         atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1523         /*  - and make it look like a confirmed connection */
1524         set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1525
1526         return ret;
1527
1528 err_free_conntrack_slab:
1529         kmem_cache_destroy(ip_conntrack_cachep);
1530 err_free_hash:
1531         free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1532                             ip_conntrack_htable_size);
1533 err_unreg_sockopt:
1534         nf_unregister_sockopt(&so_getorigdst);
1535
1536         return -ENOMEM;
1537 }