net/packet/af_packet.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              PACKET - implements raw packet sockets.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *              Alan Cox        :       verify_area() now used correctly
 *              Alan Cox        :       new skbuff lists, look ma no backlogs!
 *              Alan Cox        :       tidied skbuff lists.
 *              Alan Cox        :       Now uses generic datagram routines I
 *                                      added. Also fixed the peek/read crash
 *                                      from all old Linux datagram code.
 *              Alan Cox        :       Uses the improved datagram code.
 *              Alan Cox        :       Added NULL's for socket options.
 *              Alan Cox        :       Re-commented the code.
 *              Alan Cox        :       Use new kernel side addressing
 *              Rob Janssen     :       Correct MTU usage.
 *              Dave Platt      :       Counter leaks caused by incorrect
 *                                      interrupt locking and some slightly
 *                                      dubious gcc output. Can you read
 *                                      compiler: it said _VOLATILE_
 *      Richard Kooijman        :       Timestamp fixes.
 *              Alan Cox        :       New buffers. Use sk->mac.raw.
 *              Alan Cox        :       sendmsg/recvmsg support.
 *              Alan Cox        :       Protocol setting support
 *      Alexey Kuznetsov        :       Untied from IPv4 stack.
 *      Cyrus Durgin            :       Fixed kerneld for kmod.
 *      Michal Ostrowski        :       Module initialization cleanup.
 *         Ulises Alonso        :       Frame number limit removal and
 *                                      packet_set_ring memory leak.
 *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
 *                                      The convention is that longer addresses
 *                                      will simply extend the hardware address
 *                                      byte arrays at the end of sockaddr_ll
 *                                      and packet_mreq.
 *              Johann Baudy    :       Added TX RING.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

/*
   Assumptions:
   - If a device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside
     of the device, but higher levels still should reserve
     dev->hard_header_len. Some devices are clever enough to reallocate the
     skb when the header does not fit into the reserved space (tunnels);
     others are not (PPP).
   - Packet sockets receive packets with the ll header pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the
                 ll header. PPP does this, which is wrong, because it
                 introduces asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Summary
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */

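/*
 * Illustration only (not part of the build): a minimal user-space sketch
 * of the two access modes discussed above, using standard packet(7)
 * calls. With SOCK_RAW the caller sees (and on transmit must build) the
 * ll header; with SOCK_DGRAM the device header is handled here in the
 * kernel:
 *
 *      int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *      // or: socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_IP));
 *      char buf[2048];
 *      ssize_t n = recv(fd, buf, sizeof(buf), 0);
 *      // SOCK_RAW:   buf[0] is the first byte of the ll header
 *      // SOCK_DGRAM: buf[0] is the first byte of the network header
 */
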
/* Private packet socket structures. */

struct packet_mclist {
        struct packet_mclist    *next;
        int                     ifindex;
        int                     count;
        unsigned short          type;
        unsigned short          alen;
        unsigned char           addr[MAX_ADDR_LEN];
};
/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
        int             mr_ifindex;
        unsigned short  mr_type;
        unsigned short  mr_alen;
        unsigned char   mr_address[MAX_ADDR_LEN];
};

static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
                int closing, int tx_ring);

struct packet_ring_buffer {
        char                    **pg_vec;
        unsigned int            head;
        unsigned int            frames_per_block;
        unsigned int            frame_size;
        unsigned int            frame_max;

        unsigned int            pg_vec_order;
        unsigned int            pg_vec_pages;
        unsigned int            pg_vec_len;

        atomic_t                pending;
};
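
/*
 * Ring geometry sketch, with assumed example values for orientation only:
 * tp_block_size = 4096, tp_frame_size = 2048 and tp_frame_nr = 8 give
 * frames_per_block = 4096 / 2048 = 2 and frame_max = 8 - 1 = 7.
 * packet_lookup_frame() below then maps a frame index i as
 *
 *      block  = i / frames_per_block;  // e.g. i = 5 -> block 2
 *      offset = i % frames_per_block;  // e.g. i = 5 -> second frame
 *      addr   = pg_vec[block] + offset * frame_size;
 *
 * 'head' walks these indices and wraps to 0 via packet_increment_head().
 */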

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);

static void packet_flush_mclist(struct sock *sk);

struct packet_sock {
        /* struct sock has to be the first member of packet_sock */
        struct sock             sk;
        struct tpacket_stats    stats;
        struct packet_ring_buffer       rx_ring;
        struct packet_ring_buffer       tx_ring;
        int                     copy_thresh;
        spinlock_t              bind_lock;
        struct mutex            pg_vec_lock;
        unsigned int            running:1,      /* prot_hook is attached */
                                auxdata:1,
                                origdev:1,
                                has_vnet_hdr:1;
        int                     ifindex;        /* bound device         */
        __be16                  num;
        struct packet_mclist    *mclist;
        atomic_t                mapped;
        enum tpacket_versions   tp_version;
        unsigned int            tp_hdrlen;
        unsigned int            tp_reserve;
        unsigned int            tp_loss:1;
        struct packet_type      prot_hook ____cacheline_aligned_in_smp;
};

struct packet_skb_cb {
        unsigned int origlen;
        union {
                struct sockaddr_pkt pkt;
                struct sockaddr_ll ll;
        } sa;
};

#define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_status = status;
                flush_dcache_page(virt_to_page(&h.h1->tp_status));
                break;
        case TPACKET_V2:
                h.h2->tp_status = status;
                flush_dcache_page(virt_to_page(&h.h2->tp_status));
                break;
        default:
                pr_err("TPACKET version not supported\n");
                BUG();
        }

        smp_wmb();
}
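
/*
 * The tp_status word written above is the producer/consumer handshake for
 * a ring frame: the kernel only fills frames it reads back as
 * TP_STATUS_KERNEL (rx) or TP_STATUS_SEND_REQUEST (tx), and user space is
 * expected to touch only frames marked TP_STATUS_USER / TP_STATUS_AVAILABLE.
 * A hedged sketch of the user-space side of the rx handshake (consume()
 * is a hypothetical helper):
 *
 *      struct tpacket_hdr *hdr = ring + i * frame_size;
 *      while (!(hdr->tp_status & TP_STATUS_USER))
 *              poll(&pfd, 1, -1);              // wait for the kernel
 *      consume(hdr);
 *      hdr->tp_status = TP_STATUS_KERNEL;      // hand the frame back
 *
 * The smp_wmb() above pairs with the smp_rmb() in __packet_get_status()
 * to order payload writes against the status word.
 */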

static int __packet_get_status(struct packet_sock *po, void *frame)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        smp_rmb();

        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                flush_dcache_page(virt_to_page(&h.h1->tp_status));
                return h.h1->tp_status;
        case TPACKET_V2:
                flush_dcache_page(virt_to_page(&h.h2->tp_status));
                return h.h2->tp_status;
        default:
                pr_err("TPACKET version not supported\n");
                BUG();
                return 0;
        }
}

static void *packet_lookup_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                unsigned int position,
                int status)
{
        unsigned int pg_vec_pos, frame_offset;
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        pg_vec_pos = position / rb->frames_per_block;
        frame_offset = position % rb->frames_per_block;

        h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);

        if (status != __packet_get_status(po, h.raw))
                return NULL;

        return h.raw;
}

static inline void *packet_current_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                int status)
{
        return packet_lookup_frame(po, rb, rb->head, status);
}

static inline void *packet_previous_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                int status)
{
        unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
        return packet_lookup_frame(po, rb, previous, status);
}

static inline void packet_increment_head(struct packet_ring_buffer *buff)
{
        buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

static inline struct packet_sock *pkt_sk(struct sock *sk)
{
        return (struct packet_sock *)sk;
}

static void packet_sock_destruct(struct sock *sk)
{
        skb_queue_purge(&sk->sk_error_queue);

        WARN_ON(atomic_read(&sk->sk_rmem_alloc));
        WARN_ON(atomic_read(&sk->sk_wmem_alloc));

        if (!sock_flag(sk, SOCK_DEAD)) {
                pr_err("Attempt to release alive packet socket: %p\n", sk);
                return;
        }

        sk_refcnt_debug_dec(sk);
}


static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
                           struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct sockaddr_pkt *spkt;

        /*
         *      When we registered the protocol we saved the socket in the data
         *      field for just this event.
         */

        sk = pt->af_packet_priv;

        /*
         *      Yank back the headers [hope the device set this
         *      right or kerboom...]
         *
         *      Incoming packets have the ll header pulled,
         *      push it back.
         *
         *      For outgoing ones skb->data == skb_mac_header(skb)
         *      so this procedure is a no-op.
         */

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto out;

        if (!net_eq(dev_net(dev), sock_net(sk)))
                goto out;

        skb = skb_share_check(skb, GFP_ATOMIC);
        if (skb == NULL)
                goto oom;

        /* drop any routing info */
        skb_dst_drop(skb);

        /* drop conntrack reference */
        nf_reset(skb);

        spkt = &PACKET_SKB_CB(skb)->sa.pkt;

        skb_push(skb, skb->data - skb_mac_header(skb));

        /*
         *      The SOCK_PACKET socket receives _all_ frames.
         */

        spkt->spkt_family = dev->type;
        strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
        spkt->spkt_protocol = skb->protocol;

        /*
         *      Charge the memory to the socket. This is done specifically
         *      to prevent sockets using all the memory up.
         */

        if (sock_queue_rcv_skb(sk, skb) == 0)
                return 0;

out:
        kfree_skb(skb);
oom:
        return 0;
}


/*
 *      Output a raw packet to a device layer. This bypasses all the other
 *      protocol layers and you must therefore supply it with a complete frame
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
                               struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
        struct sk_buff *skb = NULL;
        struct net_device *dev;
        __be16 proto = 0;
        int err;

        /*
         *      Get and verify the address.
         */

        if (saddr) {
                if (msg->msg_namelen < sizeof(struct sockaddr))
                        return -EINVAL;
                if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
                        proto = saddr->spkt_protocol;
        } else
                return -ENOTCONN;       /* SOCK_PACKET must be sent giving an address */

        /*
         *      Find the device first so we can size-check against it
         */

        saddr->spkt_device[13] = 0;
retry:
        rcu_read_lock();
        dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
        err = -ENODEV;
        if (dev == NULL)
                goto out_unlock;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_unlock;

        /*
         * You may not queue a frame bigger than the mtu. This is the lowest level
         * raw protocol and you must do your own fragmentation at this level.
         */

        err = -EMSGSIZE;
        if (len > dev->mtu + dev->hard_header_len)
                goto out_unlock;

        if (!skb) {
                size_t reserved = LL_RESERVED_SPACE(dev);
                unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

                rcu_read_unlock();
                skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
                if (skb == NULL)
                        return -ENOBUFS;
                /* FIXME: Save some space for broken drivers that write a hard
                 * header at transmission time by themselves. PPP is the notable
                 * one here. This should really be fixed at the driver level.
                 */
                skb_reserve(skb, reserved);
                skb_reset_network_header(skb);

                /* Try to align data part correctly */
                if (hhlen) {
                        skb->data -= hhlen;
                        skb->tail -= hhlen;
                        if (len < hhlen)
                                skb_reset_network_header(skb);
                }
                err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
                if (err)
                        goto out_free;
                goto retry;
        }


        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;
        err = sock_tx_timestamp(msg, sk, skb_tx(skb));
        if (err < 0)
                goto out_unlock;

        dev_queue_xmit(skb);
        rcu_read_unlock();
        return len;

out_unlock:
        rcu_read_unlock();
out_free:
        kfree_skb(skb);
        return err;
}
static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
                                      unsigned int res)
{
        struct sk_filter *filter;

        rcu_read_lock_bh();
        filter = rcu_dereference_bh(sk->sk_filter);
        if (filter != NULL)
                res = sk_run_filter(skb, filter->insns, filter->len);
        rcu_read_unlock_bh();

        return res;
}
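
/*
 * The filter run above is a classic BPF program attached from user space
 * with SO_ATTACH_FILTER; a zero return drops the packet, a smaller return
 * truncates the snapshot. A minimal accept-everything sketch (assuming
 * the <linux/filter.h> definitions):
 *
 *      struct sock_filter code[] = {
 *              { 0x06, 0, 0, 0x0000ffff },     // BPF_RET|BPF_K: accept
 *      };
 *      struct sock_fprog prog = { .len = 1, .filter = code };
 *      setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */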

/*
   This function does lazy skb cloning in the hope that most packets
   are discarded by BPF.

   Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
   and skb->cb are mangled. It works because (and as long as) packets
   falling here are owned by the current CPU. Output packets are cloned
   by dev_queue_xmit_nit(), input packets are processed by net_bh
   sequentially, so if we return the skb to its original state on exit,
   we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
                      struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct sockaddr_ll *sll;
        struct packet_sock *po;
        u8 *skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        if (!net_eq(dev_net(dev), sock_net(sk)))
                goto drop;

        skb->dev = dev;

        if (dev->header_ops) {
                /* The device has an explicit notion of ll header,
                   exported to higher levels.

                   Otherwise, the device hides the details of its frame
                   structure, so the corresponding packet header is
                   never delivered to the user.
                 */
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        snaplen = skb->len;

        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;
        if (snaplen > res)
                snaplen = res;

        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
            (unsigned)sk->sk_rcvbuf)
                goto drop_n_acct;

        if (skb_shared(skb)) {
                struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
                if (nskb == NULL)
                        goto drop_n_acct;

                if (skb_head != skb->data) {
                        skb->data = skb_head;
                        skb->len = skb_len;
                }
                kfree_skb(skb);
                skb = nskb;
        }

        BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
                     sizeof(skb->cb));

        sll = &PACKET_SKB_CB(skb)->sa.ll;
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(po->origdev))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

        PACKET_SKB_CB(skb)->origlen = skb->len;

        if (pskb_trim(skb, snaplen))
                goto drop_n_acct;

        skb_set_owner_r(skb, sk);
        skb->dev = NULL;
        skb_dst_drop(skb);

        /* drop conntrack reference */
        nf_reset(skb);

        spin_lock(&sk->sk_receive_queue.lock);
        po->stats.tp_packets++;
        skb->dropcount = atomic_read(&sk->sk_drops);
        __skb_queue_tail(&sk->sk_receive_queue, skb);
        spin_unlock(&sk->sk_receive_queue.lock);
        sk->sk_data_ready(sk, skb->len);
        return 0;

drop_n_acct:
        po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        consume_skb(skb);
        return 0;
}

static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
                       struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct packet_sock *po;
        struct sockaddr_ll *sll;
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;
        u8 *skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;
        unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
        unsigned short macoff, netoff, hdrlen;
        struct sk_buff *copy_skb = NULL;
        struct timeval tv;
        struct timespec ts;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        if (!net_eq(dev_net(dev), sock_net(sk)))
                goto drop;

        if (dev->header_ops) {
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        if (skb->ip_summed == CHECKSUM_PARTIAL)
                status |= TP_STATUS_CSUMNOTREADY;

        snaplen = skb->len;

        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;
        if (snaplen > res)
                snaplen = res;

        if (sk->sk_type == SOCK_DGRAM) {
                macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
                                  po->tp_reserve;
        } else {
                unsigned maclen = skb_network_offset(skb);
                netoff = TPACKET_ALIGN(po->tp_hdrlen +
                                       (maclen < 16 ? 16 : maclen)) +
                        po->tp_reserve;
                macoff = netoff - maclen;
        }

        if (macoff + snaplen > po->rx_ring.frame_size) {
                if (po->copy_thresh &&
                    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
                    (unsigned)sk->sk_rcvbuf) {
                        if (skb_shared(skb)) {
                                copy_skb = skb_clone(skb, GFP_ATOMIC);
                        } else {
                                copy_skb = skb_get(skb);
                                skb_head = skb->data;
                        }
                        if (copy_skb)
                                skb_set_owner_r(copy_skb, sk);
                }
                snaplen = po->rx_ring.frame_size - macoff;
                if ((int)snaplen < 0)
                        snaplen = 0;
        }

        spin_lock(&sk->sk_receive_queue.lock);
        h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
        if (!h.raw)
                goto ring_is_full;
        packet_increment_head(&po->rx_ring);
        po->stats.tp_packets++;
        if (copy_skb) {
                status |= TP_STATUS_COPY;
                __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
        }
        if (!po->stats.tp_drops)
                status &= ~TP_STATUS_LOSING;
        spin_unlock(&sk->sk_receive_queue.lock);

        skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_len = skb->len;
                h.h1->tp_snaplen = snaplen;
                h.h1->tp_mac = macoff;
                h.h1->tp_net = netoff;
                if (skb->tstamp.tv64)
                        tv = ktime_to_timeval(skb->tstamp);
                else
                        do_gettimeofday(&tv);
                h.h1->tp_sec = tv.tv_sec;
                h.h1->tp_usec = tv.tv_usec;
                hdrlen = sizeof(*h.h1);
                break;
        case TPACKET_V2:
                h.h2->tp_len = skb->len;
                h.h2->tp_snaplen = snaplen;
                h.h2->tp_mac = macoff;
                h.h2->tp_net = netoff;
                if (skb->tstamp.tv64)
                        ts = ktime_to_timespec(skb->tstamp);
                else
                        getnstimeofday(&ts);
                h.h2->tp_sec = ts.tv_sec;
                h.h2->tp_nsec = ts.tv_nsec;
                h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
                hdrlen = sizeof(*h.h2);
                break;
        default:
                BUG();
        }

        sll = h.raw + TPACKET_ALIGN(hdrlen);
        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(po->origdev))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        __packet_set_status(po, h.raw, status);
        smp_mb();
        {
                struct page *p_start, *p_end;
                u8 *h_end = h.raw + macoff + snaplen - 1;

                p_start = virt_to_page(h.raw);
                p_end = virt_to_page(h_end);
                while (p_start <= p_end) {
                        flush_dcache_page(p_start);
                        p_start++;
                }
        }

        sk->sk_data_ready(sk, 0);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        kfree_skb(skb);
        return 0;

ring_is_full:
        po->stats.tp_drops++;
        spin_unlock(&sk->sk_receive_queue.lock);

        sk->sk_data_ready(sk, 0);
        kfree_skb(copy_skb);
        goto drop_n_restore;
}
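
/*
 * tpacket_rcv() above fills frames of a ring that user space sets up and
 * maps roughly as in this sketch (standard PACKET_RX_RING usage, error
 * handling omitted; the sizes are example values):
 *
 *      struct tpacket_req req = {
 *              .tp_block_size = 4096,
 *              .tp_block_nr   = 64,
 *              .tp_frame_size = 2048,
 *              .tp_frame_nr   = 128,   // block_nr * frames per block
 *      };
 *      setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *      void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *                        PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * Each frame then starts with the tpacket{,2}_hdr written above, followed
 * by the sockaddr_ll at TPACKET_ALIGN(hdrlen) and packet data at
 * tp_mac/tp_net.
 */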

static void tpacket_destruct_skb(struct sk_buff *skb)
{
        struct packet_sock *po = pkt_sk(skb->sk);
        void *ph;

        BUG_ON(skb == NULL);

        if (likely(po->tx_ring.pg_vec)) {
                ph = skb_shinfo(skb)->destructor_arg;
                BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
                BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
                atomic_dec(&po->tx_ring.pending);
                __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
        }

        sock_wfree(skb);
}

static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
                void *frame, struct net_device *dev, int size_max,
                __be16 proto, unsigned char *addr)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } ph;
        int to_write, offset, len, tp_len, nr_frags, len_max;
        struct socket *sock = po->sk.sk_socket;
        struct page *page;
        void *data;
        int err;

        ph.raw = frame;

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = po->sk.sk_priority;
        skb->mark = po->sk.sk_mark;
        skb_shinfo(skb)->destructor_arg = ph.raw;

        switch (po->tp_version) {
        case TPACKET_V2:
                tp_len = ph.h2->tp_len;
                break;
        default:
                tp_len = ph.h1->tp_len;
                break;
        }
        if (unlikely(tp_len > size_max)) {
                pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
                return -EMSGSIZE;
        }

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb_reset_network_header(skb);

        data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
        to_write = tp_len;

        if (sock->type == SOCK_DGRAM) {
                err = dev_hard_header(skb, dev, ntohs(proto), addr,
                                NULL, tp_len);
                if (unlikely(err < 0))
                        return -EINVAL;
        } else if (dev->hard_header_len) {
                /* net device doesn't like empty head */
                if (unlikely(tp_len <= dev->hard_header_len)) {
                        pr_err("packet size is too short (%d < %d)\n",
                               tp_len, dev->hard_header_len);
                        return -EINVAL;
                }

                skb_push(skb, dev->hard_header_len);
                err = skb_store_bits(skb, 0, data,
                                dev->hard_header_len);
                if (unlikely(err))
                        return err;

                data += dev->hard_header_len;
                to_write -= dev->hard_header_len;
        }

        err = -EFAULT;
        page = virt_to_page(data);
        offset = offset_in_page(data);
        len_max = PAGE_SIZE - offset;
        len = ((to_write > len_max) ? len_max : to_write);

        skb->data_len = to_write;
        skb->len += to_write;
        skb->truesize += to_write;
        atomic_add(to_write, &po->sk.sk_wmem_alloc);

        while (likely(to_write)) {
                nr_frags = skb_shinfo(skb)->nr_frags;

                if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
                        pr_err("Packet exceed the number of skb frags(%lu)\n",
                               MAX_SKB_FRAGS);
                        return -EFAULT;
                }

                flush_dcache_page(page);
                get_page(page);
                skb_fill_page_desc(skb,
                                nr_frags,
                                page++, offset, len);
                to_write -= len;
                offset = 0;
                len_max = PAGE_SIZE;
                len = ((to_write > len_max) ? len_max : to_write);
        }

        return tp_len;
}
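
/*
 * Note on the frag loop above: the frame payload living in the ring is
 * not copied, it is attached to the skb page by page. For a hypothetical
 * data pointer ending in 0x2700 with 4096-byte pages, offset_in_page()
 * gives 0x700, so the first fragment covers 0x1000 - 0x700 = 0x900 bytes
 * and later fragments start at offset 0 of the following pages.
 * get_page() keeps each page alive until tpacket_destruct_skb() runs.
 */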

static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
        struct socket *sock;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
        int ifindex, err, reserve = 0;
        void *ph;
        struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
        int tp_len, size_max;
        unsigned char *addr;
        int len_sum = 0;
        int status = 0;

        sock = po->sk.sk_socket;

        mutex_lock(&po->pg_vec_lock);

        err = -EBUSY;
        if (saddr == NULL) {
                ifindex = po->ifindex;
                proto   = po->num;
                addr    = NULL;
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen
                                        + offsetof(struct sockaddr_ll,
                                                sll_addr)))
                        goto out;
                ifindex = saddr->sll_ifindex;
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
        }

        dev = dev_get_by_index(sock_net(&po->sk), ifindex);
        err = -ENXIO;
        if (unlikely(dev == NULL))
                goto out;

        reserve = dev->hard_header_len;

        err = -ENETDOWN;
        if (unlikely(!(dev->flags & IFF_UP)))
                goto out_put;

        size_max = po->tx_ring.frame_size
                - (po->tp_hdrlen - sizeof(struct sockaddr_ll));

        if (size_max > dev->mtu + reserve)
                size_max = dev->mtu + reserve;

        do {
                ph = packet_current_frame(po, &po->tx_ring,
                                TP_STATUS_SEND_REQUEST);

                if (unlikely(ph == NULL)) {
                        schedule();
                        continue;
                }

                status = TP_STATUS_SEND_REQUEST;
                skb = sock_alloc_send_skb(&po->sk,
                                LL_ALLOCATED_SPACE(dev)
                                + sizeof(struct sockaddr_ll),
                                0, &err);

                if (unlikely(skb == NULL))
                        goto out_status;

                tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
                                addr);

                if (unlikely(tp_len < 0)) {
                        if (po->tp_loss) {
                                __packet_set_status(po, ph,
                                                TP_STATUS_AVAILABLE);
                                packet_increment_head(&po->tx_ring);
                                kfree_skb(skb);
                                continue;
                        } else {
                                status = TP_STATUS_WRONG_FORMAT;
                                err = tp_len;
                                goto out_status;
                        }
                }

                skb->destructor = tpacket_destruct_skb;
                __packet_set_status(po, ph, TP_STATUS_SENDING);
                atomic_inc(&po->tx_ring.pending);

                status = TP_STATUS_SEND_REQUEST;
                err = dev_queue_xmit(skb);
                if (unlikely(err > 0)) {
                        err = net_xmit_errno(err);
                        if (err && __packet_get_status(po, ph) ==
                                   TP_STATUS_AVAILABLE) {
                                /* skb was destructed already */
                                skb = NULL;
                                goto out_status;
                        }
                        /*
                         * skb was dropped but not destructed yet;
                         * let's treat it like congestion or err < 0
                         */
                        err = 0;
                }
                packet_increment_head(&po->tx_ring);
                len_sum += tp_len;
        } while (likely((ph != NULL) ||
                        ((!(msg->msg_flags & MSG_DONTWAIT)) &&
                         (atomic_read(&po->tx_ring.pending))))
                );

        err = len_sum;
        goto out_put;

out_status:
        __packet_set_status(po, ph, status);
        kfree_skb(skb);
out_put:
        dev_put(dev);
out:
        mutex_unlock(&po->pg_vec_lock);
        return err;
}
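
/*
 * The user-space counterpart of tpacket_snd(), sketched under the same
 * PACKET_TX_RING conventions (hedged example; frame() and fill_payload()
 * are hypothetical helpers, error handling omitted):
 *
 *      setsockopt(fd, SOL_PACKET, PACKET_TX_RING, &req, sizeof(req));
 *      // mmap the ring as for PACKET_RX_RING, then per frame:
 *      struct tpacket_hdr *hdr = frame(i);
 *      fill_payload(hdr, pkt, pkt_len);
 *      hdr->tp_len = pkt_len;
 *      hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *      send(fd, NULL, 0, 0);   // kick the kernel; MSG_DONTWAIT avoids
 *                              // the pending-wait loop above
 */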

static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
                                               size_t reserve, size_t len,
                                               size_t linear, int noblock,
                                               int *err)
{
        struct sk_buff *skb;

        /* Under a page?  Don't bother with paged skb. */
        if (prepad + len < PAGE_SIZE || !linear)
                linear = len;

        skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
                                   err);
        if (!skb)
                return NULL;

        skb_reserve(skb, reserve);
        skb_put(skb, linear);
        skb->data_len = len - linear;
        skb->len += len - linear;

        return skb;
}

static int packet_snd(struct socket *sock,
                          struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
        unsigned char *addr;
        int ifindex, err, reserve = 0;
        struct virtio_net_hdr vnet_hdr = { 0 };
        int offset = 0;
        int vnet_hdr_len;
        struct packet_sock *po = pkt_sk(sk);
        unsigned short gso_type = 0;

        /*
         *      Get and verify the address.
         */

        if (saddr == NULL) {
                ifindex = po->ifindex;
                proto   = po->num;
                addr    = NULL;
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
                        goto out;
                ifindex = saddr->sll_ifindex;
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
        }


        dev = dev_get_by_index(sock_net(sk), ifindex);
        err = -ENXIO;
        if (dev == NULL)
                goto out_unlock;
        if (sock->type == SOCK_RAW)
                reserve = dev->hard_header_len;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_unlock;

        if (po->has_vnet_hdr) {
                vnet_hdr_len = sizeof(vnet_hdr);

                err = -EINVAL;
                if (len < vnet_hdr_len)
                        goto out_unlock;

                len -= vnet_hdr_len;

                err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
                                       vnet_hdr_len);
                if (err < 0)
                        goto out_unlock;

                if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
                    (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
                      vnet_hdr.hdr_len))
                        vnet_hdr.hdr_len = vnet_hdr.csum_start +
                                                 vnet_hdr.csum_offset + 2;

                err = -EINVAL;
                if (vnet_hdr.hdr_len > len)
                        goto out_unlock;

                if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
                        switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
                        case VIRTIO_NET_HDR_GSO_TCPV4:
                                gso_type = SKB_GSO_TCPV4;
                                break;
                        case VIRTIO_NET_HDR_GSO_TCPV6:
                                gso_type = SKB_GSO_TCPV6;
                                break;
                        case VIRTIO_NET_HDR_GSO_UDP:
                                gso_type = SKB_GSO_UDP;
                                break;
                        default:
                                goto out_unlock;
                        }

                        if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
                                gso_type |= SKB_GSO_TCP_ECN;

                        if (vnet_hdr.gso_size == 0)
                                goto out_unlock;

                }
        }

        err = -EMSGSIZE;
        if (!gso_type && (len > dev->mtu+reserve))
                goto out_unlock;

        err = -ENOBUFS;
        skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
                               LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
                               msg->msg_flags & MSG_DONTWAIT, &err);
        if (skb == NULL)
                goto out_unlock;

        skb_set_network_header(skb, reserve);

        err = -EINVAL;
        if (sock->type == SOCK_DGRAM &&
            (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
                goto out_free;

        /* Returns -EFAULT on error */
        err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
        if (err)
                goto out_free;
        err = sock_tx_timestamp(msg, sk, skb_tx(skb));
        if (err < 0)
                goto out_free;

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        if (po->has_vnet_hdr) {
                if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
                        if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
                                                  vnet_hdr.csum_offset)) {
                                err = -EINVAL;
                                goto out_free;
                        }
                }

                skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
                skb_shinfo(skb)->gso_type = gso_type;

                /* Header must be checked, and gso_segs computed. */
                skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
                skb_shinfo(skb)->gso_segs = 0;

                len += vnet_hdr_len;
        }

        /*
         *      Now send it
         */

        err = dev_queue_xmit(skb);
        if (err > 0 && (err = net_xmit_errno(err)) != 0)
                goto out_unlock;

        dev_put(dev);

        return len;

out_free:
        kfree_skb(skb);
out_unlock:
        if (dev)
                dev_put(dev);
out:
        return err;
}
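
/*
 * When PACKET_VNET_HDR is enabled (po->has_vnet_hdr), the buffer handed
 * to sendmsg() is expected to start with a struct virtio_net_hdr. A
 * hedged sketch of the layout only (ip_hlen is an assumed variable, not
 * a complete sender):
 *
 *      struct virtio_net_hdr vh = {
 *              .flags       = VIRTIO_NET_HDR_F_NEEDS_CSUM,
 *              .csum_start  = ETH_HLEN + ip_hlen,
 *              .csum_offset = offsetof(struct udphdr, check),
 *      };
 *      struct iovec iov[2] = { { &vh, sizeof(vh) }, { pkt, pkt_len } };
 *
 * packet_snd() strips and validates this header and translates it into
 * skb checksum/GSO state as above.
 */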

static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
                struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        if (po->tx_ring.pg_vec)
                return tpacket_snd(po, msg);
        else
                return packet_snd(sock, msg, len);
}

/*
 *      Close a PACKET socket. This is fairly simple. We immediately go
 *      to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po;
        struct net *net;
        struct tpacket_req req;

        if (!sk)
                return 0;

        net = sock_net(sk);
        po = pkt_sk(sk);

        spin_lock_bh(&net->packet.sklist_lock);
        sk_del_node_init_rcu(sk);
        sock_prot_inuse_add(net, sk->sk_prot, -1);
        spin_unlock_bh(&net->packet.sklist_lock);

        spin_lock(&po->bind_lock);
        if (po->running) {
                /*
                 * Remove from protocol table
                 */
                po->running = 0;
                po->num = 0;
                __dev_remove_pack(&po->prot_hook);
                __sock_put(sk);
        }
        spin_unlock(&po->bind_lock);

        packet_flush_mclist(sk);

        memset(&req, 0, sizeof(req));

        if (po->rx_ring.pg_vec)
                packet_set_ring(sk, &req, 1, 0);

        if (po->tx_ring.pg_vec)
                packet_set_ring(sk, &req, 1, 1);

        synchronize_net();
        /*
         *      Now the socket is dead. No more input will appear.
         */
        sock_orphan(sk);
        sock->sk = NULL;

        /* Purge queues */

        skb_queue_purge(&sk->sk_receive_queue);
        sk_refcnt_debug_release(sk);

        sock_put(sk);
        return 0;
}

/*
 *      Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
        struct packet_sock *po = pkt_sk(sk);
        /*
         *      Detach an existing hook if present.
         */

        lock_sock(sk);

        spin_lock(&po->bind_lock);
        if (po->running) {
                __sock_put(sk);
                po->running = 0;
                po->num = 0;
                spin_unlock(&po->bind_lock);
                dev_remove_pack(&po->prot_hook);
                spin_lock(&po->bind_lock);
        }

        po->num = protocol;
        po->prot_hook.type = protocol;
        po->prot_hook.dev = dev;

        po->ifindex = dev ? dev->ifindex : 0;

        if (protocol == 0)
                goto out_unlock;

        if (!dev || (dev->flags & IFF_UP)) {
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        } else {
                sk->sk_err = ENETDOWN;
                if (!sock_flag(sk, SOCK_DEAD))
                        sk->sk_error_report(sk);
        }

out_unlock:
        spin_unlock(&po->bind_lock);
        release_sock(sk);
        return 0;
}
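
/*
 * packet_do_bind() is reached from bind(2); the usual user-space form
 * (standard sockaddr_ll usage, "eth0" is only an example name):
 *
 *      struct sockaddr_ll sll = {
 *              .sll_family   = AF_PACKET,
 *              .sll_protocol = htons(ETH_P_ALL),
 *              .sll_ifindex  = if_nametoindex("eth0"),
 *      };
 *      bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 *
 * A zero protocol detaches the old hook without attaching a new one.
 */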

/*
 *      Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
                            int addr_len)
{
        struct sock *sk = sock->sk;
        char name[15];
        struct net_device *dev;
        int err = -ENODEV;

        /*
         *      Check legality
         */

        if (addr_len != sizeof(struct sockaddr))
                return -EINVAL;
        strlcpy(name, uaddr->sa_data, sizeof(name));

        dev = dev_get_by_name(sock_net(sk), name);
        if (dev) {
                err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
                dev_put(dev);
        }
        return err;
}

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
        struct sock *sk = sock->sk;
        struct net_device *dev = NULL;
        int err;


        /*
         *      Check legality
         */

        if (addr_len < sizeof(struct sockaddr_ll))
                return -EINVAL;
        if (sll->sll_family != AF_PACKET)
                return -EINVAL;

        if (sll->sll_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
                if (dev == NULL)
                        goto out;
        }
        err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
        if (dev)
                dev_put(dev);

out:
        return err;
}

static struct proto packet_proto = {
        .name     = "PACKET",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct packet_sock),
};

/*
 *      Create a packet of type SOCK_PACKET.
 */

static int packet_create(struct net *net, struct socket *sock, int protocol,
                         int kern)
{
        struct sock *sk;
        struct packet_sock *po;
        __be16 proto = (__force __be16)protocol; /* weird, but documented */
        int err;

        if (!capable(CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
            sock->type != SOCK_PACKET)
                return -ESOCKTNOSUPPORT;

        sock->state = SS_UNCONNECTED;

        err = -ENOBUFS;
        sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
        if (sk == NULL)
                goto out;

        sock->ops = &packet_ops;
        if (sock->type == SOCK_PACKET)
                sock->ops = &packet_ops_spkt;

        sock_init_data(sock, sk);

        po = pkt_sk(sk);
        sk->sk_family = PF_PACKET;
        po->num = proto;

        sk->sk_destruct = packet_sock_destruct;
        sk_refcnt_debug_inc(sk);

        /*
         *      Attach a protocol block
         */

        spin_lock_init(&po->bind_lock);
        mutex_init(&po->pg_vec_lock);
        po->prot_hook.func = packet_rcv;

        if (sock->type == SOCK_PACKET)
                po->prot_hook.func = packet_rcv_spkt;

        po->prot_hook.af_packet_priv = sk;

        if (proto) {
                po->prot_hook.type = proto;
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        }

        spin_lock_bh(&net->packet.sklist_lock);
        sk_add_node_rcu(sk, &net->packet.sklist);
        sock_prot_inuse_add(net, &packet_proto, 1);
        spin_unlock_bh(&net->packet.sklist_lock);

        return 0;
out:
        return err;
}

static int packet_recv_error(struct sock *sk, struct msghdr *msg, int len)
{
        struct sock_exterr_skb *serr;
        struct sk_buff *skb, *skb2;
        int copied, err;

        err = -EAGAIN;
        skb = skb_dequeue(&sk->sk_error_queue);
        if (skb == NULL)
                goto out;

        copied = skb->len;
        if (copied > len) {
                msg->msg_flags |= MSG_TRUNC;
                copied = len;
        }
        err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
        if (err)
                goto out_free_skb;

        sock_recv_timestamp(msg, sk, skb);

        serr = SKB_EXT_ERR(skb);
        put_cmsg(msg, SOL_PACKET, PACKET_TX_TIMESTAMP,
                 sizeof(serr->ee), &serr->ee);

        msg->msg_flags |= MSG_ERRQUEUE;
        err = copied;

        /* Reset and regenerate socket error */
        spin_lock_bh(&sk->sk_error_queue.lock);
        sk->sk_err = 0;
        if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
                sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
                spin_unlock_bh(&sk->sk_error_queue.lock);
                sk->sk_error_report(sk);
        } else
                spin_unlock_bh(&sk->sk_error_queue.lock);

out_free_skb:
        kfree_skb(skb);
out:
        return err;
}
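
/*
 * packet_recv_error() feeds the MSG_ERRQUEUE path used for TX timestamps.
 * A hedged sketch of the consumer side (assumes timestamping was enabled
 * with SO_TIMESTAMPING before sending; handle() is hypothetical):
 *
 *      char ctrl[256];
 *      struct msghdr m = { .msg_control = ctrl,
 *                          .msg_controllen = sizeof(ctrl) };
 *      if (recvmsg(fd, &m, MSG_ERRQUEUE) >= 0)
 *              for (cmsg = CMSG_FIRSTHDR(&m); cmsg;
 *                   cmsg = CMSG_NXTHDR(&m, cmsg))
 *                      if (cmsg->cmsg_level == SOL_PACKET &&
 *                          cmsg->cmsg_type == PACKET_TX_TIMESTAMP)
 *                              handle(CMSG_DATA(cmsg));
 */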
1543
1544 /*
1545  *      Pull a packet from our receive queue and hand it to the user.
1546  *      If necessary we block.
1547  */
1548
1549 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1550                           struct msghdr *msg, size_t len, int flags)
1551 {
1552         struct sock *sk = sock->sk;
1553         struct sk_buff *skb;
1554         int copied, err;
1555         struct sockaddr_ll *sll;
1556         int vnet_hdr_len = 0;
1557
1558         err = -EINVAL;
1559         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
1560                 goto out;
1561
1562 #if 0
1563         /* What error should we return now? EUNATTACH? */
1564         if (pkt_sk(sk)->ifindex < 0)
1565                 return -ENODEV;
1566 #endif
1567
1568         if (flags & MSG_ERRQUEUE) {
1569                 err = packet_recv_error(sk, msg, len);
1570                 goto out;
1571         }
1572
1573         /*
1574          *      Call the generic datagram receiver. This handles all sorts
1575          *      of horrible races and re-entrancy so we can forget about it
1576          *      in the protocol layers.
1577          *
1578          *      Now it will return ENETDOWN, if device have just gone down,
1579          *      but then it will block.
1580          */
1581
1582         skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1583
1584         /*
1585          *      An error occurred so return it. Because skb_recv_datagram()
1586          *      handles the blocking we don't see and worry about blocking
1587          *      retries.
1588          */
1589
1590         if (skb == NULL)
1591                 goto out;
1592
1593         if (pkt_sk(sk)->has_vnet_hdr) {
1594                 struct virtio_net_hdr vnet_hdr = { 0 };
1595
1596                 err = -EINVAL;
1597                 vnet_hdr_len = sizeof(vnet_hdr);
1598                 if ((len -= vnet_hdr_len) < 0)
1599                         goto out_free;
1600
1601                 if (skb_is_gso(skb)) {
1602                         struct skb_shared_info *sinfo = skb_shinfo(skb);
1603
1604                         /* This is a hint as to how much should be linear. */
1605                         vnet_hdr.hdr_len = skb_headlen(skb);
1606                         vnet_hdr.gso_size = sinfo->gso_size;
1607                         if (sinfo->gso_type & SKB_GSO_TCPV4)
1608                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1609                         else if (sinfo->gso_type & SKB_GSO_TCPV6)
1610                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1611                         else if (sinfo->gso_type & SKB_GSO_UDP)
1612                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
1613                         else if (sinfo->gso_type & SKB_GSO_FCOE)
1614                                 goto out_free;
1615                         else
1616                                 BUG();
1617                         if (sinfo->gso_type & SKB_GSO_TCP_ECN)
1618                                 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1619                 } else
1620                         vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
1621
1622                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1623                         vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1624                         vnet_hdr.csum_start = skb->csum_start -
1625                                                         skb_headroom(skb);
1626                         vnet_hdr.csum_offset = skb->csum_offset;
1627                 } /* else everything is zero */
1628
1629                 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
1630                                      vnet_hdr_len);
1631                 if (err < 0)
1632                         goto out_free;
1633         }
1634
1635         /*
1636          *      If the address length field is there to be filled in, we fill
1637          *      it in now.
1638          */
1639
1640         sll = &PACKET_SKB_CB(skb)->sa.ll;
1641         if (sock->type == SOCK_PACKET)
1642                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1643         else
1644                 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1645
1646         /*
1647          *      Any data beyond the buffer the caller supplied is lost. A
1648          *      user program that cares can ask the device for its MTU
1649          *      beforehand.
1649          */
1650
1651         copied = skb->len;
1652         if (copied > len) {
1653                 copied = len;
1654                 msg->msg_flags |= MSG_TRUNC;
1655         }
1656
1657         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1658         if (err)
1659                 goto out_free;
1660
1661         sock_recv_ts_and_drops(msg, sk, skb);
1662
1663         if (msg->msg_name)
1664                 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1665                        msg->msg_namelen);
1666
1667         if (pkt_sk(sk)->auxdata) {
1668                 struct tpacket_auxdata aux;
1669
1670                 aux.tp_status = TP_STATUS_USER;
1671                 if (skb->ip_summed == CHECKSUM_PARTIAL)
1672                         aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1673                 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1674                 aux.tp_snaplen = skb->len;
1675                 aux.tp_mac = 0;
1676                 aux.tp_net = skb_network_offset(skb);
1677                 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
1678
1679                 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1680         }
1681
1682         /*
1683          *      Free or return the buffer as appropriate. Again this
1684          *      hides all the races and re-entrancy issues from us.
1685          */
1686         err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1687
1688 out_free:
1689         skb_free_datagram(sk, skb);
1690 out:
1691         return err;
1692 }
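/*
 * Illustrative userspace sketch (not part of this file): the common
 * recvfrom() path served by packet_recvmsg(). With MSG_TRUNC the return
 * value is the packet's full length even when the buffer was smaller,
 * matching the skb->len case in the err computation above. Assumes the
 * caller has CAP_NET_RAW; error handling is elided.
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>	// ETH_P_ALL
 *	#include <arpa/inet.h>		// htons
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct sockaddr_ll from;
 *	socklen_t fromlen = sizeof(from);
 *	char buf[2048];
 *	ssize_t n = recvfrom(fd, buf, sizeof(buf), MSG_TRUNC,
 *			     (struct sockaddr *)&from, &fromlen);
 */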
1693
1694 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1695                                int *uaddr_len, int peer)
1696 {
1697         struct net_device *dev;
1698         struct sock *sk = sock->sk;
1699
1700         if (peer)
1701                 return -EOPNOTSUPP;
1702
1703         uaddr->sa_family = AF_PACKET;
1704         rcu_read_lock();
1705         dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
1706         if (dev)
1707                 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
1708         else
1709                 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
1710         rcu_read_unlock();
1711         *uaddr_len = sizeof(*uaddr);
1712
1713         return 0;
1714 }
1715
1716 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1717                           int *uaddr_len, int peer)
1718 {
1719         struct net_device *dev;
1720         struct sock *sk = sock->sk;
1721         struct packet_sock *po = pkt_sk(sk);
1722         DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1723
1724         if (peer)
1725                 return -EOPNOTSUPP;
1726
1727         sll->sll_family = AF_PACKET;
1728         sll->sll_ifindex = po->ifindex;
1729         sll->sll_protocol = po->num;
1730         rcu_read_lock();
1731         dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1732         if (dev) {
1733                 sll->sll_hatype = dev->type;
1734                 sll->sll_halen = dev->addr_len;
1735                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1736         } else {
1737                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1738                 sll->sll_halen = 0;
1739         }
1740         rcu_read_unlock();
1741         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1742
1743         return 0;
1744 }
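/*
 * Illustrative userspace sketch (not part of this file): getsockname()
 * lands in packet_getname() above; note that the reported length is
 * offsetof(struct sockaddr_ll, sll_addr) plus the hardware address
 * length, not sizeof(struct sockaddr_ll). Assumes "fd" is an AF_PACKET
 * socket; error handling is elided.
 *
 *	struct sockaddr_ll sll;
 *	socklen_t len = sizeof(sll);
 *
 *	getsockname(fd, (struct sockaddr *)&sll, &len);
 *	// sll.sll_ifindex, sll.sll_hatype and
 *	// sll.sll_addr[0..sll.sll_halen - 1] are now valid
 */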
1745
1746 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1747                          int what)
1748 {
1749         switch (i->type) {
1750         case PACKET_MR_MULTICAST:
1751                 if (i->alen != dev->addr_len)
1752                         return -EINVAL;
1753                 if (what > 0)
1754                         return dev_mc_add(dev, i->addr);
1755                 else
1756                         return dev_mc_del(dev, i->addr);
1757         case PACKET_MR_PROMISC:
1758                 return dev_set_promiscuity(dev, what);
1759         case PACKET_MR_ALLMULTI:
1760                 return dev_set_allmulti(dev, what);
1761         case PACKET_MR_UNICAST:
1762                 if (i->alen != dev->addr_len)
1763                         return -EINVAL;
1764                 if (what > 0)
1765                         return dev_uc_add(dev, i->addr);
1766                 else
1767                         return dev_uc_del(dev, i->addr);
1768         default:
1769                 break;
1770         }
1775         return 0;
1776 }
1777
1778 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1779 {
1780         for ( ; i; i = i->next) {
1781                 if (i->ifindex == dev->ifindex)
1782                         packet_dev_mc(dev, i, what);
1783         }
1784 }
1785
1786 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1787 {
1788         struct packet_sock *po = pkt_sk(sk);
1789         struct packet_mclist *ml, *i;
1790         struct net_device *dev;
1791         int err;
1792
1793         rtnl_lock();
1794
1795         err = -ENODEV;
1796         dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1797         if (!dev)
1798                 goto done;
1799
1800         err = -EINVAL;
1801         if (mreq->mr_alen > dev->addr_len)
1802                 goto done;
1803
1804         err = -ENOBUFS;
1805         i = kmalloc(sizeof(*i), GFP_KERNEL);
1806         if (i == NULL)
1807                 goto done;
1808
1809         err = 0;
1810         for (ml = po->mclist; ml; ml = ml->next) {
1811                 if (ml->ifindex == mreq->mr_ifindex &&
1812                     ml->type == mreq->mr_type &&
1813                     ml->alen == mreq->mr_alen &&
1814                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1815                         ml->count++;
1816                         /* Free the new element ... */
1817                         kfree(i);
1818                         goto done;
1819                 }
1820         }
1821
1822         i->type = mreq->mr_type;
1823         i->ifindex = mreq->mr_ifindex;
1824         i->alen = mreq->mr_alen;
1825         memcpy(i->addr, mreq->mr_address, i->alen);
1826         i->count = 1;
1827         i->next = po->mclist;
1828         po->mclist = i;
1829         err = packet_dev_mc(dev, i, 1);
1830         if (err) {
1831                 po->mclist = i->next;
1832                 kfree(i);
1833         }
1834
1835 done:
1836         rtnl_unlock();
1837         return err;
1838 }
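/*
 * Illustrative userspace sketch (not part of this file): the setsockopt()
 * call that reaches packet_mc_add() above. Memberships are refcounted per
 * socket (ml->count), so an entry added twice must be dropped twice.
 * Assumes "fd" and "ifindex" exist; error handling is elided.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *
 *	struct packet_mreq mreq;
 *
 *	memset(&mreq, 0, sizeof(mreq));
 *	mreq.mr_ifindex = ifindex;		// e.g. from if_nametoindex()
 *	mreq.mr_type = PACKET_MR_PROMISC;	// no address needed for this type
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 */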
1839
1840 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1841 {
1842         struct packet_mclist *ml, **mlp;
1843
1844         rtnl_lock();
1845
1846         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1847                 if (ml->ifindex == mreq->mr_ifindex &&
1848                     ml->type == mreq->mr_type &&
1849                     ml->alen == mreq->mr_alen &&
1850                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1851                         if (--ml->count == 0) {
1852                                 struct net_device *dev;
1853                                 *mlp = ml->next;
1854                                 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1855                                 if (dev)
1856                                         packet_dev_mc(dev, ml, -1);
1857                                 kfree(ml);
1858                         }
1859                         rtnl_unlock();
1860                         return 0;
1861                 }
1862         }
1863         rtnl_unlock();
1864         return -EADDRNOTAVAIL;
1865 }
1866
1867 static void packet_flush_mclist(struct sock *sk)
1868 {
1869         struct packet_sock *po = pkt_sk(sk);
1870         struct packet_mclist *ml;
1871
1872         if (!po->mclist)
1873                 return;
1874
1875         rtnl_lock();
1876         while ((ml = po->mclist) != NULL) {
1877                 struct net_device *dev;
1878
1879                 po->mclist = ml->next;
1880                 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1881                 if (dev != NULL)
1882                         packet_dev_mc(dev, ml, -1);
1883                 kfree(ml);
1884         }
1885         rtnl_unlock();
1886 }
1887
1888 static int
1889 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1890 {
1891         struct sock *sk = sock->sk;
1892         struct packet_sock *po = pkt_sk(sk);
1893         int ret;
1894
1895         if (level != SOL_PACKET)
1896                 return -ENOPROTOOPT;
1897
1898         switch (optname) {
1899         case PACKET_ADD_MEMBERSHIP:
1900         case PACKET_DROP_MEMBERSHIP:
1901         {
1902                 struct packet_mreq_max mreq;
1903                 int len = optlen;
1904                 memset(&mreq, 0, sizeof(mreq));
1905                 if (len < sizeof(struct packet_mreq))
1906                         return -EINVAL;
1907                 if (len > sizeof(mreq))
1908                         len = sizeof(mreq);
1909                 if (copy_from_user(&mreq, optval, len))
1910                         return -EFAULT;
1911                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1912                         return -EINVAL;
1913                 if (optname == PACKET_ADD_MEMBERSHIP)
1914                         ret = packet_mc_add(sk, &mreq);
1915                 else
1916                         ret = packet_mc_drop(sk, &mreq);
1917                 return ret;
1918         }
1919
1920         case PACKET_RX_RING:
1921         case PACKET_TX_RING:
1922         {
1923                 struct tpacket_req req;
1924
1925                 if (optlen < sizeof(req))
1926                         return -EINVAL;
1927                 if (pkt_sk(sk)->has_vnet_hdr)
1928                         return -EINVAL;
1929                 if (copy_from_user(&req, optval, sizeof(req)))
1930                         return -EFAULT;
1931                 return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1932         }
1933         case PACKET_COPY_THRESH:
1934         {
1935                 int val;
1936
1937                 if (optlen != sizeof(val))
1938                         return -EINVAL;
1939                 if (copy_from_user(&val, optval, sizeof(val)))
1940                         return -EFAULT;
1941
1942                 pkt_sk(sk)->copy_thresh = val;
1943                 return 0;
1944         }
1945         case PACKET_VERSION:
1946         {
1947                 int val;
1948
1949                 if (optlen != sizeof(val))
1950                         return -EINVAL;
1951                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1952                         return -EBUSY;
1953                 if (copy_from_user(&val, optval, sizeof(val)))
1954                         return -EFAULT;
1955                 switch (val) {
1956                 case TPACKET_V1:
1957                 case TPACKET_V2:
1958                         po->tp_version = val;
1959                         return 0;
1960                 default:
1961                         return -EINVAL;
1962                 }
1963         }
1964         case PACKET_RESERVE:
1965         {
1966                 unsigned int val;
1967
1968                 if (optlen != sizeof(val))
1969                         return -EINVAL;
1970                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1971                         return -EBUSY;
1972                 if (copy_from_user(&val, optval, sizeof(val)))
1973                         return -EFAULT;
1974                 po->tp_reserve = val;
1975                 return 0;
1976         }
1977         case PACKET_LOSS:
1978         {
1979                 unsigned int val;
1980
1981                 if (optlen != sizeof(val))
1982                         return -EINVAL;
1983                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1984                         return -EBUSY;
1985                 if (copy_from_user(&val, optval, sizeof(val)))
1986                         return -EFAULT;
1987                 po->tp_loss = !!val;
1988                 return 0;
1989         }
1990         case PACKET_AUXDATA:
1991         {
1992                 int val;
1993
1994                 if (optlen < sizeof(val))
1995                         return -EINVAL;
1996                 if (copy_from_user(&val, optval, sizeof(val)))
1997                         return -EFAULT;
1998
1999                 po->auxdata = !!val;
2000                 return 0;
2001         }
2002         case PACKET_ORIGDEV:
2003         {
2004                 int val;
2005
2006                 if (optlen < sizeof(val))
2007                         return -EINVAL;
2008                 if (copy_from_user(&val, optval, sizeof(val)))
2009                         return -EFAULT;
2010
2011                 po->origdev = !!val;
2012                 return 0;
2013         }
2014         case PACKET_VNET_HDR:
2015         {
2016                 int val;
2017
2018                 if (sock->type != SOCK_RAW)
2019                         return -EINVAL;
2020                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
2021                         return -EBUSY;
2022                 if (optlen < sizeof(val))
2023                         return -EINVAL;
2024                 if (copy_from_user(&val, optval, sizeof(val)))
2025                         return -EFAULT;
2026
2027                 po->has_vnet_hdr = !!val;
2028                 return 0;
2029         }
2030         default:
2031                 return -ENOPROTOOPT;
2032         }
2033 }
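/*
 * Illustrative userspace sketch (not part of this file): PACKET_VERSION
 * and PACKET_RESERVE return -EBUSY once a ring exists (see the pg_vec
 * checks above), so they must be set before PACKET_RX_RING/PACKET_TX_RING.
 * Assumes "fd" is an AF_PACKET socket; error handling is elided.
 *
 *	int version = TPACKET_V2;
 *	unsigned int reserve = 16;	// headroom before each frame's data
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &version, sizeof(version));
 *	setsockopt(fd, SOL_PACKET, PACKET_RESERVE, &reserve, sizeof(reserve));
 *	// ... only now request the ring with PACKET_RX_RING ...
 */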
2034
2035 static int packet_getsockopt(struct socket *sock, int level, int optname,
2036                              char __user *optval, int __user *optlen)
2037 {
2038         int len;
2039         int val;
2040         struct sock *sk = sock->sk;
2041         struct packet_sock *po = pkt_sk(sk);
2042         void *data;
2043         struct tpacket_stats st;
2044
2045         if (level != SOL_PACKET)
2046                 return -ENOPROTOOPT;
2047
2048         if (get_user(len, optlen))
2049                 return -EFAULT;
2050
2051         if (len < 0)
2052                 return -EINVAL;
2053
2054         switch (optname) {
2055         case PACKET_STATISTICS:
2056                 if (len > sizeof(struct tpacket_stats))
2057                         len = sizeof(struct tpacket_stats);
2058                 spin_lock_bh(&sk->sk_receive_queue.lock);
2059                 st = po->stats;
2060                 memset(&po->stats, 0, sizeof(st));
2061                 spin_unlock_bh(&sk->sk_receive_queue.lock);
2062                 st.tp_packets += st.tp_drops;
2063
2064                 data = &st;
2065                 break;
2066         case PACKET_AUXDATA:
2067                 if (len > sizeof(int))
2068                         len = sizeof(int);
2069                 val = po->auxdata;
2070
2071                 data = &val;
2072                 break;
2073         case PACKET_ORIGDEV:
2074                 if (len > sizeof(int))
2075                         len = sizeof(int);
2076                 val = po->origdev;
2077
2078                 data = &val;
2079                 break;
2080         case PACKET_VNET_HDR:
2081                 if (len > sizeof(int))
2082                         len = sizeof(int);
2083                 val = po->has_vnet_hdr;
2084
2085                 data = &val;
2086                 break;
2087         case PACKET_VERSION:
2088                 if (len > sizeof(int))
2089                         len = sizeof(int);
2090                 val = po->tp_version;
2091                 data = &val;
2092                 break;
2093         case PACKET_HDRLEN:
2094                 if (len > sizeof(int))
2095                         len = sizeof(int);
                     if (len < sizeof(int)) /* avoid a partially initialized val */
                             return -EINVAL;
2096                 if (copy_from_user(&val, optval, len))
2097                         return -EFAULT;
2098                 switch (val) {
2099                 case TPACKET_V1:
2100                         val = sizeof(struct tpacket_hdr);
2101                         break;
2102                 case TPACKET_V2:
2103                         val = sizeof(struct tpacket2_hdr);
2104                         break;
2105                 default:
2106                         return -EINVAL;
2107                 }
2108                 data = &val;
2109                 break;
2110         case PACKET_RESERVE:
2111                 if (len > sizeof(unsigned int))
2112                         len = sizeof(unsigned int);
2113                 val = po->tp_reserve;
2114                 data = &val;
2115                 break;
2116         case PACKET_LOSS:
2117                 if (len > sizeof(unsigned int))
2118                         len = sizeof(unsigned int);
2119                 val = po->tp_loss;
2120                 data = &val;
2121                 break;
2122         default:
2123                 return -ENOPROTOOPT;
2124         }
2125
2126         if (put_user(len, optlen))
2127                 return -EFAULT;
2128         if (copy_to_user(optval, data, len))
2129                 return -EFAULT;
2130         return 0;
2131 }
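/*
 * Illustrative userspace sketch (not part of this file): reading
 * PACKET_STATISTICS. Note the semantics implemented above: the counters
 * are zeroed on every read, and tp_packets already includes tp_drops.
 * Assumes "fd" is an AF_PACKET socket; error handling is elided.
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *
 *	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len);
 *	// st.tp_packets = packets seen (including drops), st.tp_drops = dropped
 */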
2132
2133
2134 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
2135 {
2136         struct sock *sk;
2137         struct hlist_node *node;
2138         struct net_device *dev = data;
2139         struct net *net = dev_net(dev);
2140
2141         rcu_read_lock();
2142         sk_for_each_rcu(sk, node, &net->packet.sklist) {
2143                 struct packet_sock *po = pkt_sk(sk);
2144
2145                 switch (msg) {
2146                 case NETDEV_UNREGISTER:
2147                         if (po->mclist)
2148                                 packet_dev_mclist(dev, po->mclist, -1);
2149                         /* fallthrough */
2150
2151                 case NETDEV_DOWN:
2152                         if (dev->ifindex == po->ifindex) {
2153                                 spin_lock(&po->bind_lock);
2154                                 if (po->running) {
2155                                         __dev_remove_pack(&po->prot_hook);
2156                                         __sock_put(sk);
2157                                         po->running = 0;
2158                                         sk->sk_err = ENETDOWN;
2159                                         if (!sock_flag(sk, SOCK_DEAD))
2160                                                 sk->sk_error_report(sk);
2161                                 }
2162                                 if (msg == NETDEV_UNREGISTER) {
2163                                         po->ifindex = -1;
2164                                         po->prot_hook.dev = NULL;
2165                                 }
2166                                 spin_unlock(&po->bind_lock);
2167                         }
2168                         break;
2169                 case NETDEV_UP:
2170                         if (dev->ifindex == po->ifindex) {
2171                                 spin_lock(&po->bind_lock);
2172                                 if (po->num && !po->running) {
2173                                         dev_add_pack(&po->prot_hook);
2174                                         sock_hold(sk);
2175                                         po->running = 1;
2176                                 }
2177                                 spin_unlock(&po->bind_lock);
2178                         }
2179                         break;
2180                 }
2181         }
2182         rcu_read_unlock();
2183         return NOTIFY_DONE;
2184 }
2185
2186
2187 static int packet_ioctl(struct socket *sock, unsigned int cmd,
2188                         unsigned long arg)
2189 {
2190         struct sock *sk = sock->sk;
2191
2192         switch (cmd) {
2193         case SIOCOUTQ:
2194         {
2195                 int amount = sk_wmem_alloc_get(sk);
2196
2197                 return put_user(amount, (int __user *)arg);
2198         }
2199         case SIOCINQ:
2200         {
2201                 struct sk_buff *skb;
2202                 int amount = 0;
2203
2204                 spin_lock_bh(&sk->sk_receive_queue.lock);
2205                 skb = skb_peek(&sk->sk_receive_queue);
2206                 if (skb)
2207                         amount = skb->len;
2208                 spin_unlock_bh(&sk->sk_receive_queue.lock);
2209                 return put_user(amount, (int __user *)arg);
2210         }
2211         case SIOCGSTAMP:
2212                 return sock_get_timestamp(sk, (struct timeval __user *)arg);
2213         case SIOCGSTAMPNS:
2214                 return sock_get_timestampns(sk, (struct timespec __user *)arg);
2215
2216 #ifdef CONFIG_INET
2217         case SIOCADDRT:
2218         case SIOCDELRT:
2219         case SIOCDARP:
2220         case SIOCGARP:
2221         case SIOCSARP:
2222         case SIOCGIFADDR:
2223         case SIOCSIFADDR:
2224         case SIOCGIFBRDADDR:
2225         case SIOCSIFBRDADDR:
2226         case SIOCGIFNETMASK:
2227         case SIOCSIFNETMASK:
2228         case SIOCGIFDSTADDR:
2229         case SIOCSIFDSTADDR:
2230         case SIOCSIFFLAGS:
2231                 return inet_dgram_ops.ioctl(sock, cmd, arg);
2232 #endif
2233
2234         default:
2235                 return -ENOIOCTLCMD;
2236         }
2237         return 0;
2238 }
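/*
 * Illustrative userspace sketch (not part of this file): for packet
 * sockets SIOCINQ reports the length of the packet at the head of the
 * receive queue (see the skb_peek() above), not the total number of
 * queued bytes. Assumes "fd" is an AF_PACKET socket; error handling is
 * elided.
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/sockios.h>	// SIOCINQ, SIOCOUTQ
 *
 *	int next_len;
 *
 *	ioctl(fd, SIOCINQ, &next_len);
 */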
2239
2240 static unsigned int packet_poll(struct file *file, struct socket *sock,
2241                                 poll_table *wait)
2242 {
2243         struct sock *sk = sock->sk;
2244         struct packet_sock *po = pkt_sk(sk);
2245         unsigned int mask = datagram_poll(file, sock, wait);
2246
2247         spin_lock_bh(&sk->sk_receive_queue.lock);
2248         if (po->rx_ring.pg_vec) {
2249                 if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2250                         mask |= POLLIN | POLLRDNORM;
2251         }
2252         spin_unlock_bh(&sk->sk_receive_queue.lock);
2253         spin_lock_bh(&sk->sk_write_queue.lock);
2254         if (po->tx_ring.pg_vec) {
2255                 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2256                         mask |= POLLOUT | POLLWRNORM;
2257         }
2258         spin_unlock_bh(&sk->sk_write_queue.lock);
2259         return mask;
2260 }
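/*
 * Illustrative userspace sketch (not part of this file): a poll() loop
 * against the mmap()ed RX ring, matching the TP_STATUS_KERNEL test in
 * packet_poll() above. Assumes TPACKET_V2 and that "hdr" points at the
 * current frame header in the ring; error handling is elided.
 *
 *	#include <poll.h>
 *	#include <linux/if_packet.h>
 *
 *	static void wait_for_frame(int fd, volatile struct tpacket2_hdr *hdr)
 *	{
 *		struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLRDNORM };
 *
 *		while (!(hdr->tp_status & TP_STATUS_USER))
 *			poll(&pfd, 1, -1);
 *		// ... consume the frame, then hand it back:
 *		// hdr->tp_status = TP_STATUS_KERNEL;
 *	}
 */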
2261
2262
2263 /* Dirty? Well, I still have not found a better way to account
2264  * for user mmaps.
2265  */
2266
2267 static void packet_mm_open(struct vm_area_struct *vma)
2268 {
2269         struct file *file = vma->vm_file;
2270         struct socket *sock = file->private_data;
2271         struct sock *sk = sock->sk;
2272
2273         if (sk)
2274                 atomic_inc(&pkt_sk(sk)->mapped);
2275 }
2276
2277 static void packet_mm_close(struct vm_area_struct *vma)
2278 {
2279         struct file *file = vma->vm_file;
2280         struct socket *sock = file->private_data;
2281         struct sock *sk = sock->sk;
2282
2283         if (sk)
2284                 atomic_dec(&pkt_sk(sk)->mapped);
2285 }
2286
2287 static const struct vm_operations_struct packet_mmap_ops = {
2288         .open   =       packet_mm_open,
2289         .close  =       packet_mm_close,
2290 };
2291
2292 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2293 {
2294         int i;
2295
2296         for (i = 0; i < len; i++) {
2297                 if (likely(pg_vec[i]))
2298                         free_pages((unsigned long) pg_vec[i], order);
2299         }
2300         kfree(pg_vec);
2301 }
2302
2303 static inline char *alloc_one_pg_vec_page(unsigned long order)
2304 {
2305         gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2306
2307         return (char *) __get_free_pages(gfp_flags, order);
2308 }
2309
2310 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2311 {
2312         unsigned int block_nr = req->tp_block_nr;
2313         char **pg_vec;
2314         int i;
2315
2316         pg_vec = kcalloc(block_nr, sizeof(char *), GFP_KERNEL);
2317         if (unlikely(!pg_vec))
2318                 goto out;
2319
2320         for (i = 0; i < block_nr; i++) {
2321                 pg_vec[i] = alloc_one_pg_vec_page(order);
2322                 if (unlikely(!pg_vec[i]))
2323                         goto out_free_pgvec;
2324         }
2325
2326 out:
2327         return pg_vec;
2328
2329 out_free_pgvec:
2330         free_pg_vec(pg_vec, order, block_nr);
2331         pg_vec = NULL;
2332         goto out;
2333 }
2334
2335 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2336                 int closing, int tx_ring)
2337 {
2338         char **pg_vec = NULL;
2339         struct packet_sock *po = pkt_sk(sk);
2340         int was_running, order = 0;
2341         struct packet_ring_buffer *rb;
2342         struct sk_buff_head *rb_queue;
2343         __be16 num;
2344         int err;
2345
2346         rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2347         rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2348
2349         err = -EBUSY;
2350         if (!closing) {
2351                 if (atomic_read(&po->mapped))
2352                         goto out;
2353                 if (atomic_read(&rb->pending))
2354                         goto out;
2355         }
2356
2357         if (req->tp_block_nr) {
2358                 /* Sanity tests and some calculations */
2359                 err = -EBUSY;
2360                 if (unlikely(rb->pg_vec))
2361                         goto out;
2362
2363                 switch (po->tp_version) {
2364                 case TPACKET_V1:
2365                         po->tp_hdrlen = TPACKET_HDRLEN;
2366                         break;
2367                 case TPACKET_V2:
2368                         po->tp_hdrlen = TPACKET2_HDRLEN;
2369                         break;
2370                 }
2371
2372                 err = -EINVAL;
2373                 if (unlikely((int)req->tp_block_size <= 0))
2374                         goto out;
2375                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2376                         goto out;
2377                 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2378                                         po->tp_reserve))
2379                         goto out;
2380                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2381                         goto out;
2382
2383                 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2384                 if (unlikely(rb->frames_per_block <= 0))
2385                         goto out;
2386                 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2387                                         req->tp_frame_nr))
2388                         goto out;
2389
2390                 err = -ENOMEM;
2391                 order = get_order(req->tp_block_size);
2392                 pg_vec = alloc_pg_vec(req, order);
2393                 if (unlikely(!pg_vec))
2394                         goto out;
2395         } else {
                     /* tp_block_nr == 0: no ring requested, nothing to allocate */
2398                 err = -EINVAL;
2399                 if (unlikely(req->tp_frame_nr))
2400                         goto out;
2401         }
2402
2403         lock_sock(sk);
2404
2405         /* Detach socket from network */
2406         spin_lock(&po->bind_lock);
2407         was_running = po->running;
2408         num = po->num;
2409         if (was_running) {
2410                 __dev_remove_pack(&po->prot_hook);
2411                 po->num = 0;
2412                 po->running = 0;
2413                 __sock_put(sk);
2414         }
2415         spin_unlock(&po->bind_lock);
2416
2417         synchronize_net();
2418
2419         err = -EBUSY;
2420         mutex_lock(&po->pg_vec_lock);
2421         if (closing || atomic_read(&po->mapped) == 0) {
2422                 err = 0;
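/* XC(a, b): store b in a and return a's previous value */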
2423 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2424                 spin_lock_bh(&rb_queue->lock);
2425                 pg_vec = XC(rb->pg_vec, pg_vec);
2426                 rb->frame_max = (req->tp_frame_nr - 1);
2427                 rb->head = 0;
2428                 rb->frame_size = req->tp_frame_size;
2429                 spin_unlock_bh(&rb_queue->lock);
2430
2431                 order = XC(rb->pg_vec_order, order);
2432                 req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2433
2434                 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2435                 po->prot_hook.func = (po->rx_ring.pg_vec) ?
2436                                                 tpacket_rcv : packet_rcv;
2437                 skb_queue_purge(rb_queue);
2438 #undef XC
2439                 if (atomic_read(&po->mapped))
2440                         pr_err("packet_mmap: vma is busy: %d\n",
2441                                atomic_read(&po->mapped));
2442         }
2443         mutex_unlock(&po->pg_vec_lock);
2444
2445         spin_lock(&po->bind_lock);
2446         if (was_running && !po->running) {
2447                 sock_hold(sk);
2448                 po->running = 1;
2449                 po->num = num;
2450                 dev_add_pack(&po->prot_hook);
2451         }
2452         spin_unlock(&po->bind_lock);
2453
2454         release_sock(sk);
2455
2456         if (pg_vec)
2457                 free_pg_vec(pg_vec, order, req->tp_block_nr);
2458 out:
2459         return err;
2460 }
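/*
 * Illustrative userspace sketch (not part of this file): a tpacket_req
 * that satisfies the sanity tests above: tp_block_size a multiple of
 * PAGE_SIZE, tp_frame_size a multiple of TPACKET_ALIGNMENT, and
 * tp_frame_nr equal to frames-per-block times tp_block_nr. Assumes "fd"
 * is an AF_PACKET socket; error handling is elided.
 *
 *	struct tpacket_req req = {
 *		.tp_block_size	= 1 << 16,	// 16 pages on 4K-page systems
 *		.tp_block_nr	= 64,
 *		.tp_frame_size	= 1 << 11,
 *		.tp_frame_nr	= ((1 << 16) / (1 << 11)) * 64,	// 2048 frames
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */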
2461
2462 static int packet_mmap(struct file *file, struct socket *sock,
2463                 struct vm_area_struct *vma)
2464 {
2465         struct sock *sk = sock->sk;
2466         struct packet_sock *po = pkt_sk(sk);
2467         unsigned long size, expected_size;
2468         struct packet_ring_buffer *rb;
2469         unsigned long start;
2470         int err = -EINVAL;
2471         int i;
2472
2473         if (vma->vm_pgoff)
2474                 return -EINVAL;
2475
2476         mutex_lock(&po->pg_vec_lock);
2477
2478         expected_size = 0;
2479         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2480                 if (rb->pg_vec) {
2481                         expected_size += rb->pg_vec_len
2482                                                 * rb->pg_vec_pages
2483                                                 * PAGE_SIZE;
2484                 }
2485         }
2486
2487         if (expected_size == 0)
2488                 goto out;
2489
2490         size = vma->vm_end - vma->vm_start;
2491         if (size != expected_size)
2492                 goto out;
2493
2494         start = vma->vm_start;
2495         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2496                 if (rb->pg_vec == NULL)
2497                         continue;
2498
2499                 for (i = 0; i < rb->pg_vec_len; i++) {
2500                         struct page *page = virt_to_page(rb->pg_vec[i]);
2501                         int pg_num;
2502
2503                         for (pg_num = 0; pg_num < rb->pg_vec_pages;
2504                                         pg_num++, page++) {
2505                                 err = vm_insert_page(vma, start, page);
2506                                 if (unlikely(err))
2507                                         goto out;
2508                                 start += PAGE_SIZE;
2509                         }
2510                 }
2511         }
2512
2513         atomic_inc(&po->mapped);
2514         vma->vm_ops = &packet_mmap_ops;
2515         err = 0;
2516
2517 out:
2518         mutex_unlock(&po->pg_vec_lock);
2519         return err;
2520 }
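/*
 * Illustrative userspace sketch (not part of this file): mapping the
 * ring(s). packet_mmap() above rejects a nonzero offset and requires the
 * length to equal the combined size of every configured ring (RX first,
 * then TX). Assumes "fd" and "req" from the PACKET_RX_RING example above;
 * error handling is elided.
 *
 *	#include <sys/mman.h>
 *
 *	size_t ring_len = (size_t)req.tp_block_size * req.tp_block_nr;
 *	void *ring = mmap(NULL, ring_len, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 */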
2521
2522 static const struct proto_ops packet_ops_spkt = {
2523         .family =       PF_PACKET,
2524         .owner =        THIS_MODULE,
2525         .release =      packet_release,
2526         .bind =         packet_bind_spkt,
2527         .connect =      sock_no_connect,
2528         .socketpair =   sock_no_socketpair,
2529         .accept =       sock_no_accept,
2530         .getname =      packet_getname_spkt,
2531         .poll =         datagram_poll,
2532         .ioctl =        packet_ioctl,
2533         .listen =       sock_no_listen,
2534         .shutdown =     sock_no_shutdown,
2535         .setsockopt =   sock_no_setsockopt,
2536         .getsockopt =   sock_no_getsockopt,
2537         .sendmsg =      packet_sendmsg_spkt,
2538         .recvmsg =      packet_recvmsg,
2539         .mmap =         sock_no_mmap,
2540         .sendpage =     sock_no_sendpage,
2541 };
2542
2543 static const struct proto_ops packet_ops = {
2544         .family =       PF_PACKET,
2545         .owner =        THIS_MODULE,
2546         .release =      packet_release,
2547         .bind =         packet_bind,
2548         .connect =      sock_no_connect,
2549         .socketpair =   sock_no_socketpair,
2550         .accept =       sock_no_accept,
2551         .getname =      packet_getname,
2552         .poll =         packet_poll,
2553         .ioctl =        packet_ioctl,
2554         .listen =       sock_no_listen,
2555         .shutdown =     sock_no_shutdown,
2556         .setsockopt =   packet_setsockopt,
2557         .getsockopt =   packet_getsockopt,
2558         .sendmsg =      packet_sendmsg,
2559         .recvmsg =      packet_recvmsg,
2560         .mmap =         packet_mmap,
2561         .sendpage =     sock_no_sendpage,
2562 };
2563
2564 static const struct net_proto_family packet_family_ops = {
2565         .family =       PF_PACKET,
2566         .create =       packet_create,
2567         .owner  =       THIS_MODULE,
2568 };
2569
2570 static struct notifier_block packet_netdev_notifier = {
2571         .notifier_call =        packet_notifier,
2572 };
2573
2574 #ifdef CONFIG_PROC_FS
2575
2576 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2577         __acquires(RCU)
2578 {
2579         struct net *net = seq_file_net(seq);
2580
2581         rcu_read_lock();
2582         return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
2583 }
2584
2585 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2586 {
2587         struct net *net = seq_file_net(seq);
2588         return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
2589 }
2590
2591 static void packet_seq_stop(struct seq_file *seq, void *v)
2592         __releases(RCU)
2593 {
2594         rcu_read_unlock();
2595 }
2596
2597 static int packet_seq_show(struct seq_file *seq, void *v)
2598 {
2599         if (v == SEQ_START_TOKEN)
2600                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2601         else {
2602                 struct sock *s = sk_entry(v);
2603                 const struct packet_sock *po = pkt_sk(s);
2604
2605                 seq_printf(seq,
2606                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2607                            s,
2608                            atomic_read(&s->sk_refcnt),
2609                            s->sk_type,
2610                            ntohs(po->num),
2611                            po->ifindex,
2612                            po->running,
2613                            atomic_read(&s->sk_rmem_alloc),
2614                            sock_i_uid(s),
2615                            sock_i_ino(s));
2616         }
2617
2618         return 0;
2619 }
2620
2621 static const struct seq_operations packet_seq_ops = {
2622         .start  = packet_seq_start,
2623         .next   = packet_seq_next,
2624         .stop   = packet_seq_stop,
2625         .show   = packet_seq_show,
2626 };
2627
2628 static int packet_seq_open(struct inode *inode, struct file *file)
2629 {
2630         return seq_open_net(inode, file, &packet_seq_ops,
2631                             sizeof(struct seq_net_private));
2632 }
2633
2634 static const struct file_operations packet_seq_fops = {
2635         .owner          = THIS_MODULE,
2636         .open           = packet_seq_open,
2637         .read           = seq_read,
2638         .llseek         = seq_lseek,
2639         .release        = seq_release_net,
2640 };
2641
2642 #endif
2643
2644 static int __net_init packet_net_init(struct net *net)
2645 {
2646         spin_lock_init(&net->packet.sklist_lock);
2647         INIT_HLIST_HEAD(&net->packet.sklist);
2648
2649         if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2650                 return -ENOMEM;
2651
2652         return 0;
2653 }
2654
2655 static void __net_exit packet_net_exit(struct net *net)
2656 {
2657         proc_net_remove(net, "packet");
2658 }
2659
2660 static struct pernet_operations packet_net_ops = {
2661         .init = packet_net_init,
2662         .exit = packet_net_exit,
2663 };
2664
2665
2666 static void __exit packet_exit(void)
2667 {
2668         unregister_netdevice_notifier(&packet_netdev_notifier);
2669         unregister_pernet_subsys(&packet_net_ops);
2670         sock_unregister(PF_PACKET);
2671         proto_unregister(&packet_proto);
2672 }
2673
2674 static int __init packet_init(void)
2675 {
2676         int rc = proto_register(&packet_proto, 0);
2677
2678         if (rc != 0)
2679                 goto out;
2680
2681         sock_register(&packet_family_ops);
2682         register_pernet_subsys(&packet_net_ops);
2683         register_netdevice_notifier(&packet_netdev_notifier);
2684 out:
2685         return rc;
2686 }
2687
2688 module_init(packet_init);
2689 module_exit(packet_exit);
2690 MODULE_LICENSE("GPL");
2691 MODULE_ALIAS_NETPROTO(PF_PACKET);