net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <net/mpls.h>
 122 #include <linux/ipv6.h>
 123 #include <linux/in.h>
 124 #include <linux/jhash.h>
 125 #include <linux/random.h>
 126 #include <trace/events/napi.h>
 127 #include <trace/events/net.h>
 128 #include <trace/events/skb.h>
 129 #include <linux/pci.h>
 130 #include <linux/inetdevice.h>
 131 #include <linux/cpu_rmap.h>
 132 #include <linux/static_key.h>
 133 #include <linux/hashtable.h>
 134 #include <linux/vmalloc.h>
 135 #include <linux/if_macvlan.h>
 136 #include <linux/errqueue.h>
 137 #include <linux/hrtimer.h>
 138
 139 #include "net-sysfs.h"
 140
 141 /* NOTE(review): presumably the cap on skbs held for GRO at once -- confirm at the use site. Instead of increasing this, you should create a hash table. */
 142 #define MAX_GRO_SKBS 8
 143
 144 /* Bound derived from MAX_HEADER plus 128. This should be increased if a protocol with a bigger head is added. */
 145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 146
 147 static DEFINE_SPINLOCK(ptype_lock);     /* taken around list updates of ptype_base[] and ptype_all */
 148 static DEFINE_SPINLOCK(offload_lock);   /* taken around list updates of offload_base */
 149 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; /* handlers, bucketed by ntohs(type) & PTYPE_HASH_MASK */
 150 struct list_head ptype_all __read_mostly;       /* Taps: handlers registered for ETH_P_ALL */
 151 static struct list_head offload_base __read_mostly; /* registered packet_offload entries */
 152 /* Forward declarations of file-local (static) functions. */
 153 static int netif_rx_internal(struct sk_buff *skb);
 154 static int call_netdevice_notifiers_info(unsigned long val,
 155                                          struct net_device *dev,
 156                                          struct netdev_notifier_info *info);
 157
 158 /*
 159  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 160  * semaphore.
 161  *
 162  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 163  *
 164  * Writers must hold the rtnl semaphore while they loop through the
 165  * dev_base_head list, and hold dev_base_lock for writing when they do the
 166  * actual updates.  This allows pure readers to access the list even
 167  * while a writer is preparing to update it.
 168  *
 169  * To put it another way, dev_base_lock is held for writing only to
 170  * protect against pure readers; the rtnl semaphore provides the
 171  * protection against other writers.
 172  *
 173  * See, for example usages, register_netdevice() and
 174  * unregister_netdevice(), which must be called with the rtnl
 175  * semaphore held.
      *
      * list_netdevice() and unlist_netdevice() in this file follow the
      * scheme: they assert RTNL and take dev_base_lock for writing around
      * the RCU list updates.
 176  */
 177 DEFINE_RWLOCK(dev_base_lock);
 178 EXPORT_SYMBOL(dev_base_lock);
 179
 180 /* protects napi_hash addition/deletion and napi_gen_id */
 181 static DEFINE_SPINLOCK(napi_hash_lock);
 182
 183 static unsigned int napi_gen_id;
 184 static DEFINE_HASHTABLE(napi_hash, 8);  /* second argument is the number of hash bits: 2^8 = 256 buckets */
 185
 186 static seqcount_t devnet_rename_seq;    /* sampled by netdev_get_name(), which redoes its name copy if this changed meanwhile */
 187
 188 static inline void dev_base_seq_inc(struct net *net)
 189 {
 190         while (++net->dev_base_seq == 0);
 191 }
 192
 193 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 194 {
 195         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 196
 197         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 198 }
 199
 200 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 201 {
 202         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 203 }
 204
 205 static inline void rps_lock(struct softnet_data *sd)
 206 {
 207 #ifdef CONFIG_RPS
 208         spin_lock(&sd->input_pkt_queue.lock);
 209 #endif
 210 }
 211
 212 static inline void rps_unlock(struct softnet_data *sd)
 213 {
 214 #ifdef CONFIG_RPS
 215         spin_unlock(&sd->input_pkt_queue.lock);
 216 #endif
 217 }
 218
 219 /* Device list insertion */
 220 static void list_netdevice(struct net_device *dev)
 221 {
 222         struct net *net = dev_net(dev);
 223
 224         ASSERT_RTNL();
 225
 226         write_lock_bh(&dev_base_lock);
 227         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 228         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 229         hlist_add_head_rcu(&dev->index_hlist,
 230                            dev_index_hash(net, dev->ifindex));
 231         write_unlock_bh(&dev_base_lock);
 232
 233         dev_base_seq_inc(net);
 234 }
 235
 236 /* Device list removal
 237  * caller must respect a RCU grace period before freeing/reusing dev
 238  */
 239 static void unlist_netdevice(struct net_device *dev)
 240 {
 241         ASSERT_RTNL();
 242
 243         /* Unlink dev from the device chain */
 244         write_lock_bh(&dev_base_lock);
 245         list_del_rcu(&dev->dev_list);
 246         hlist_del_rcu(&dev->name_hlist);
 247         hlist_del_rcu(&dev->index_hlist);
 248         write_unlock_bh(&dev_base_lock);
 249
 250         dev_base_seq_inc(dev_net(dev));
 251 }
 252
 253 /*
 254  *      Our notifier list (netdev_chain).
      *      NOTE(review): a raw notifier head carries no lock of its own;
      *      callers presumably serialise through RTNL -- confirm.
 255  */
 256
 257 static RAW_NOTIFIER_HEAD(netdev_chain);
 258
 259 /*
 260  *      Device drivers call our routines to queue packets here. We empty the
 261  *      queue in the local softnet handler.
      *      One struct softnet_data instance is defined per CPU.
 262  */
 263
 264 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 265 EXPORT_PER_CPU_SYMBOL(softnet_data);
 266
 267 #ifdef CONFIG_LOCKDEP
 268 /*
 269  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 270  * according to dev->type
      *
      * netdev_lock_pos() maps a device type to its index in this table; the
      * last entry (ARPHRD_NONE) doubles as the slot for any type not listed.
 271  */
 272 static const unsigned short netdev_lock_type[] =
 273         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 274          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 275          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 276          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 277          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 278          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 279          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 280          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 281          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 282          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 283          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 284          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 285          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 286          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 287          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 288 /* Lock-class names: entry i names the class for netdev_lock_type[i], so both tables must stay in the same order. */
 289 static const char *const netdev_lock_name[] =
 290         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 291          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 292          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 293          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 294          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 295          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 296          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 297          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 298          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 299          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 300          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 301          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 302          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 303          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 304          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 305 /* One lockdep key per table entry: one set for the xmit locks, one for the address-list locks. */
 306 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 307 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 308
 309 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 310 {
 311         int i;
 312
 313         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 314                 if (netdev_lock_type[i] == dev_type)
 315                         return i;
 316         /* the last key is used by default */
 317         return ARRAY_SIZE(netdev_lock_type) - 1;
 318 }
 319
 320 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 321                                                  unsigned short dev_type)
 322 {
 323         int i;
 324
 325         i = netdev_lock_pos(dev_type);
 326         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 327                                    netdev_lock_name[i]);
 328 }
 329
 330 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 331 {
 332         int i;
 333
 334         i = netdev_lock_pos(dev->type);
 335         lockdep_set_class_and_name(&dev->addr_list_lock,
 336                                    &netdev_addr_lock_key[i],
 337                                    netdev_lock_name[i]);
 338 }
 339 #else
 340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 341                                                  unsigned short dev_type)
 342 {
 343 }
 344 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 345 {
 346 }
 347 #endif
 348
 349 /*******************************************************************************
 350
 351                 Protocol management and registration routines
 352
 353 *******************************************************************************/
 354
 355 /*
 356  *      Add a protocol ID to the list. Now that the input handler is
 357  *      smarter we can dispense with all the messy stuff that used to be
 358  *      here.
 359  *
 360  *      BEWARE!!! Protocol handlers that mangle input packets
 361  *      MUST BE last in their hash buckets, and the check of protocol
 362  *      handlers MUST start from the promiscuous ptype_all chain in net_bh.
 363  *      This is true now; do not change it.
 364  *      Explanation: if a protocol handler that mangles packets were
 365  *      first on the list, it could not sense that the packet
 366  *      is cloned and should be copied-on-write, so it would
 367  *      change it and subsequent readers would get a broken packet.
 368  *                                                      --ANK (980803)
 369  */
 370
 371 static inline struct list_head *ptype_head(const struct packet_type *pt)
 372 {
 373         if (pt->type == htons(ETH_P_ALL))
 374                 return &ptype_all;
 375         else
 376                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 377 }
 378
 379 /**
 380  *      dev_add_pack - add packet handler
 381  *      @pt: packet type declaration
 382  *
 383  *      Add a protocol handler to the networking stack. The passed &packet_type
 384  *      is linked into kernel lists and may not be freed until it has been
 385  *      removed from the kernel lists.
 386  *
 387  *      This call does not sleep therefore it can not
 388  *      guarantee all CPU's that are in middle of receiving packets
 389  *      will see the new packet type (until the next received packet).
 390  */
 391
 392 void dev_add_pack(struct packet_type *pt)
 393 {
 394         struct list_head *head = ptype_head(pt);
 395
 396         spin_lock(&ptype_lock);
 397         list_add_rcu(&pt->list, head);
 398         spin_unlock(&ptype_lock);
 399 }
 400 EXPORT_SYMBOL(dev_add_pack);
 401
 402 /**
 403  *      __dev_remove_pack        - remove packet handler
 404  *      @pt: packet type declaration
 405  *
 406  *      Remove a protocol handler that was previously added to the kernel
 407  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 408  *      from the kernel lists and can be freed or reused once this function
 409  *      returns.
 410  *
 411  *      The packet type might still be in use by receivers
 412  *      and must not be freed until after all the CPU's have gone
 413  *      through a quiescent state.
 414  */
 415 void __dev_remove_pack(struct packet_type *pt)
 416 {
 417         struct list_head *head = ptype_head(pt);
 418         struct packet_type *pt1;
 419
 420         spin_lock(&ptype_lock);
 421
 422         list_for_each_entry(pt1, head, list) {
 423                 if (pt == pt1) {
 424                         list_del_rcu(&pt->list);
 425                         goto out;
 426                 }
 427         }
 428
 429         pr_warn("dev_remove_pack: %p not found\n", pt);
 430 out:
 431         spin_unlock(&ptype_lock);
 432 }
 433 EXPORT_SYMBOL(__dev_remove_pack);
 434
 435 /**
 436  *      dev_remove_pack  - remove packet handler
 437  *      @pt: packet type declaration
 438  *
 439  *      Remove a protocol handler that was previously added to the kernel
 440  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 441  *      from the kernel lists and can be freed or reused once this function
 442  *      returns.
 443  *
 444  *      This call sleeps to guarantee that no CPU is looking at the packet
 445  *      type after return.
 446  */
 447 void dev_remove_pack(struct packet_type *pt)
 448 {
 449         __dev_remove_pack(pt);
 450
 451         synchronize_net();
 452 }
 453 EXPORT_SYMBOL(dev_remove_pack);
 454
 455
 456 /**
 457  *      dev_add_offload - register offload handlers
 458  *      @po: protocol offload declaration
 459  *
 460  *      Add protocol offload handlers to the networking stack. The passed
 461  *      &proto_offload is linked into kernel lists and may not be freed until
 462  *      it has been removed from the kernel lists.
 463  *
 464  *      This call does not sleep therefore it can not
 465  *      guarantee all CPU's that are in middle of receiving packets
 466  *      will see the new offload handlers (until the next received packet).
 467  */
 468 void dev_add_offload(struct packet_offload *po)
 469 {
 470         struct list_head *head = &offload_base;
 471
 472         spin_lock(&offload_lock);
 473         list_add_rcu(&po->list, head);
 474         spin_unlock(&offload_lock);
 475 }
 476 EXPORT_SYMBOL(dev_add_offload);
 477
 478 /**
 479  *      __dev_remove_offload     - remove offload handler
 480  *      @po: packet offload declaration
 481  *
 482  *      Remove a protocol offload handler that was previously added to the
 483  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 484  *      is removed from the kernel lists and can be freed or reused once this
 485  *      function returns.
 486  *
 487  *      The packet type might still be in use by receivers
 488  *      and must not be freed until after all the CPU's have gone
 489  *      through a quiescent state.
 490  */
 491 static void __dev_remove_offload(struct packet_offload *po)
 492 {
 493         struct list_head *head = &offload_base;
 494         struct packet_offload *po1;
 495
 496         spin_lock(&offload_lock);
 497
 498         list_for_each_entry(po1, head, list) {
 499                 if (po == po1) {
 500                         list_del_rcu(&po->list);
 501                         goto out;
 502                 }
 503         }
 504
 505         pr_warn("dev_remove_offload: %p not found\n", po);
 506 out:
 507         spin_unlock(&offload_lock);
 508 }
 509
 510 /**
 511  *      dev_remove_offload       - remove packet offload handler
 512  *      @po: packet offload declaration
 513  *
 514  *      Remove a packet offload handler that was previously added to the kernel
 515  *      offload handlers by dev_add_offload(). The passed &offload_type is
 516  *      removed from the kernel lists and can be freed or reused once this
 517  *      function returns.
 518  *
 519  *      This call sleeps to guarantee that no CPU is looking at the packet
 520  *      type after return.
 521  */
 522 void dev_remove_offload(struct packet_offload *po)
 523 {
 524         __dev_remove_offload(po);
 525
 526         synchronize_net();
 527 }
 528 EXPORT_SYMBOL(dev_remove_offload);
 529
 530 /******************************************************************************
 531
 532                       Device Boot-time Settings Routines
 533
 534 *******************************************************************************/
 535
 536 /* Boot time configuration table; a slot whose name starts with '\0' or ' ' is treated as free */
 537 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 538
 539 /**
 540  *      netdev_boot_setup_add   - add new setup entry
 541  *      @name: name of the device
 542  *      @map: configured settings for the device
 543  *
 544  *      Adds new setup entry to the dev_boot_setup list.  The function
 545  *      returns 0 on error and 1 on success.  This is a generic routine to
 546  *      all netdevices.
 547  */
 548 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 549 {
 550         struct netdev_boot_setup *s;
 551         int i;
 552
 553         s = dev_boot_setup;
 554         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 555                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 556                         memset(s[i].name, 0, sizeof(s[i].name));
 557                         strlcpy(s[i].name, name, IFNAMSIZ);
 558                         memcpy(&s[i].map, map, sizeof(s[i].map));
 559                         break;
 560                 }
 561         }
 562
 563         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 564 }
 565
 566 /**
 567  *      netdev_boot_setup_check - check boot time settings
 568  *      @dev: the netdevice
 569  *
 570  *      Check boot time settings for the device.
 571  *      The found settings are set for the device to be used
 572  *      later in the device probing.
 573  *      Returns 0 if no settings found, 1 if they are.
 574  */
 575 int netdev_boot_setup_check(struct net_device *dev)
 576 {
 577         struct netdev_boot_setup *s = dev_boot_setup;
 578         int i;
 579
 580         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 581                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 582                     !strcmp(dev->name, s[i].name)) {
 583                         dev->irq        = s[i].map.irq;
 584                         dev->base_addr  = s[i].map.base_addr;
 585                         dev->mem_start  = s[i].map.mem_start;
 586                         dev->mem_end    = s[i].map.mem_end;
 587                         return 1;
 588                 }
 589         }
 590         return 0;
 591 }
 592 EXPORT_SYMBOL(netdev_boot_setup_check);
 593
 594
 595 /**
 596  *      netdev_boot_base        - get address from boot time settings
 597  *      @prefix: prefix for network device
 598  *      @unit: id for network device
 599  *
 600  *      Check boot time settings for the base address of device.
 601  *      The found settings are set for the device to be used
 602  *      later in the device probing.
 603  *      Returns 0 if no settings found.
 604  */
 605 unsigned long netdev_boot_base(const char *prefix, int unit)
 606 {
 607         const struct netdev_boot_setup *s = dev_boot_setup;
 608         char name[IFNAMSIZ];
 609         int i;
 610
 611         sprintf(name, "%s%d", prefix, unit);
 612
 613         /*
 614          * If device already registered then return base of 1
 615          * to indicate not to probe for this interface
 616          */
 617         if (__dev_get_by_name(&init_net, name))
 618                 return 1;
 619
 620         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 621                 if (!strcmp(name, s[i].name))
 622                         return s[i].map.base_addr;
 623         return 0;
 624 }
 625
 626 /*
 627  * Saves at boot time configured settings for any netdevice.
 628  */
 629 int __init netdev_boot_setup(char *str)
 630 {
 631         int ints[5];
 632         struct ifmap map;
 633
 634         str = get_options(str, ARRAY_SIZE(ints), ints);
 635         if (!str || !*str)
 636                 return 0;
 637
 638         /* Save settings */
 639         memset(&map, 0, sizeof(map));
 640         if (ints[0] > 0)
 641                 map.irq = ints[1];
 642         if (ints[0] > 1)
 643                 map.base_addr = ints[2];
 644         if (ints[0] > 2)
 645                 map.mem_start = ints[3];
 646         if (ints[0] > 3)
 647                 map.mem_end = ints[4];
 648
 649         /* Add new entry to the list */
 650         return netdev_boot_setup_add(str, &map);
 651 }
 652
 653 __setup("netdev=", netdev_boot_setup);
 654
 655 /*******************************************************************************
 656
 657                             Device Interface Subroutines
 658
 659 *******************************************************************************/
 660
 661 /**
 662  *      __dev_get_by_name       - find a device by its name
 663  *      @net: the applicable net namespace
 664  *      @name: name to find
 665  *
 666  *      Find an interface by name. Must be called under RTNL semaphore
 667  *      or @dev_base_lock. If the name is found a pointer to the device
 668  *      is returned. If the name is not found then %NULL is returned. The
 669  *      reference counters are not incremented so the caller must be
 670  *      careful with locks.
 671  */
 672
 673 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 674 {
 675         struct net_device *dev;
 676         struct hlist_head *head = dev_name_hash(net, name);
 677
 678         hlist_for_each_entry(dev, head, name_hlist)
 679                 if (!strncmp(dev->name, name, IFNAMSIZ))
 680                         return dev;
 681
 682         return NULL;
 683 }
 684 EXPORT_SYMBOL(__dev_get_by_name);
 685
 686 /**
 687  *      dev_get_by_name_rcu     - find a device by its name
 688  *      @net: the applicable net namespace
 689  *      @name: name to find
 690  *
 691  *      Find an interface by name.
 692  *      If the name is found a pointer to the device is returned.
 693  *      If the name is not found then %NULL is returned.
 694  *      The reference counters are not incremented so the caller must be
 695  *      careful with locks. The caller must hold RCU lock.
 696  */
 697
 698 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 699 {
 700         struct net_device *dev;
 701         struct hlist_head *head = dev_name_hash(net, name);
 702
 703         hlist_for_each_entry_rcu(dev, head, name_hlist)
 704                 if (!strncmp(dev->name, name, IFNAMSIZ))
 705                         return dev;
 706
 707         return NULL;
 708 }
 709 EXPORT_SYMBOL(dev_get_by_name_rcu);
 710
 711 /**
 712  *      dev_get_by_name         - find a device by its name
 713  *      @net: the applicable net namespace
 714  *      @name: name to find
 715  *
 716  *      Find an interface by name. This can be called from any
 717  *      context and does its own locking. The returned handle has
 718  *      the usage count incremented and the caller must use dev_put() to
 719  *      release it when it is no longer needed. %NULL is returned if no
 720  *      matching device is found.
 721  */
 722
 723 struct net_device *dev_get_by_name(struct net *net, const char *name)
 724 {
 725         struct net_device *dev;
 726
 727         rcu_read_lock();
 728         dev = dev_get_by_name_rcu(net, name);
 729         if (dev)
 730                 dev_hold(dev);
 731         rcu_read_unlock();
 732         return dev;
 733 }
 734 EXPORT_SYMBOL(dev_get_by_name);
 735
 736 /**
 737  *      __dev_get_by_index - find a device by its ifindex
 738  *      @net: the applicable net namespace
 739  *      @ifindex: index of device
 740  *
 741  *      Search for an interface by index. Returns %NULL if the device
 742  *      is not found or a pointer to the device. The device has not
 743  *      had its reference counter increased so the caller must be careful
 744  *      about locking. The caller must hold either the RTNL semaphore
 745  *      or @dev_base_lock.
 746  */
 747
 748 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 749 {
 750         struct net_device *dev;
 751         struct hlist_head *head = dev_index_hash(net, ifindex);
 752
 753         hlist_for_each_entry(dev, head, index_hlist)
 754                 if (dev->ifindex == ifindex)
 755                         return dev;
 756
 757         return NULL;
 758 }
 759 EXPORT_SYMBOL(__dev_get_by_index);
 760
 761 /**
 762  *      dev_get_by_index_rcu - find a device by its ifindex
 763  *      @net: the applicable net namespace
 764  *      @ifindex: index of device
 765  *
 766  *      Search for an interface by index. Returns %NULL if the device
 767  *      is not found or a pointer to the device. The device has not
 768  *      had its reference counter increased so the caller must be careful
 769  *      about locking. The caller must hold RCU lock.
 770  */
 771
 772 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 773 {
 774         struct net_device *dev;
 775         struct hlist_head *head = dev_index_hash(net, ifindex);
 776
 777         hlist_for_each_entry_rcu(dev, head, index_hlist)
 778                 if (dev->ifindex == ifindex)
 779                         return dev;
 780
 781         return NULL;
 782 }
 783 EXPORT_SYMBOL(dev_get_by_index_rcu);
 784
 785
 786 /**
 787  *      dev_get_by_index - find a device by its ifindex
 788  *      @net: the applicable net namespace
 789  *      @ifindex: index of device
 790  *
 791  *      Search for an interface by index. Returns NULL if the device
 792  *      is not found or a pointer to the device. The device returned has
 793  *      had a reference added and the pointer is safe until the user calls
 794  *      dev_put to indicate they have finished with it.
 795  */
 796
 797 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 798 {
 799         struct net_device *dev;
 800
 801         rcu_read_lock();
 802         dev = dev_get_by_index_rcu(net, ifindex);
 803         if (dev)
 804                 dev_hold(dev);
 805         rcu_read_unlock();
 806         return dev;
 807 }
 808 EXPORT_SYMBOL(dev_get_by_index);
 809
 810 /**
 811  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 812  *      @net: network namespace
 813  *      @name: a pointer to the buffer where the name will be stored.
 814  *      @ifindex: the ifindex of the interface to get the name from.
 815  *
 816  *      The use of raw_seqcount_begin() and cond_resched() before
 817  *      retrying is required as we want to give the writers a chance
 818  *      to complete when CONFIG_PREEMPT is not set.
 819  */
 820 int netdev_get_name(struct net *net, char *name, int ifindex)
 821 {
 822         struct net_device *dev;
 823         unsigned int seq;
 824
 825 retry:
 826         seq = raw_seqcount_begin(&devnet_rename_seq);
 827         rcu_read_lock();
 828         dev = dev_get_by_index_rcu(net, ifindex);
 829         if (!dev) {
 830                 rcu_read_unlock();
 831                 return -ENODEV;
 832         }
 833
 834         strcpy(name, dev->name);
 835         rcu_read_unlock();
 836         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 837                 cond_resched();
 838                 goto retry;
 839         }
 840
 841         return 0;
 842 }
 843
 844 /**
 845  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 846  *      @net: the applicable net namespace
 847  *      @type: media type of device
 848  *      @ha: hardware address
 849  *
 850  *      Search for an interface by MAC address. Returns NULL if the device
 851  *      is not found or a pointer to the device.
 852  *      The caller must hold RCU or RTNL.
 853  *      The returned device has not had its ref count increased
 854  *      and the caller must therefore be careful about locking
 855  *
 856  */
 857
 858 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 859                                        const char *ha)
 860 {
 861         struct net_device *dev;
 862
 863         for_each_netdev_rcu(net, dev)
 864                 if (dev->type == type &&
 865                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 866                         return dev;
 867
 868         return NULL;
 869 }
 870 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 871
 872 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 873 {
 874         struct net_device *dev;
 875
 876         ASSERT_RTNL();
 877         for_each_netdev(net, dev)
 878                 if (dev->type == type)
 879                         return dev;
 880
 881         return NULL;
 882 }
 883 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 884
 885 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 886 {
 887         struct net_device *dev, *ret = NULL;
 888
 889         rcu_read_lock();
 890         for_each_netdev_rcu(net, dev)
 891                 if (dev->type == type) {
 892                         dev_hold(dev);
 893                         ret = dev;
 894                         break;
 895                 }
 896         rcu_read_unlock();
 897         return ret;
 898 }
 899 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 900
 901 /**
 902  *      __dev_get_by_flags - find any device with given flags
 903  *      @net: the applicable net namespace
 904  *      @if_flags: IFF_* values
 905  *      @mask: bitmask of bits in if_flags to check
 906  *
 907  *      Search for any interface with the given flags. Returns NULL if a device
 908  *      is not found or a pointer to the device. Must be called inside
 909  *      rtnl_lock(), and result refcount is unchanged.
 910  */
 911
 912 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 913                                       unsigned short mask)
 914 {
 915         struct net_device *dev, *ret;
 916
 917         ASSERT_RTNL();
 918
 919         ret = NULL;
 920         for_each_netdev(net, dev) {
 921                 if (((dev->flags ^ if_flags) & mask) == 0) {
 922                         ret = dev;
 923                         break;
 924                 }
 925         }
 926         return ret;
 927 }
 928 EXPORT_SYMBOL(__dev_get_by_flags);
 929
 930 /**
 931  *      dev_valid_name - check if name is okay for network device
 932  *      @name: name string
 933  *
 934  *      Network device names need to be valid file names to
 935  *      to allow sysfs to work.  We also disallow any kind of
 936  *      whitespace.
 937  */
 938 bool dev_valid_name(const char *name)
 939 {
 940         if (*name == '\0')
 941                 return false;
 942         if (strlen(name) >= IFNAMSIZ)
 943                 return false;
 944         if (!strcmp(name, ".") || !strcmp(name, ".."))
 945                 return false;
 946
 947         while (*name) {
 948                 if (*name == '/' || isspace(*name))
 949                         return false;
 950                 name++;
 951         }
 952         return true;
 953 }
 954 EXPORT_SYMBOL(dev_valid_name);
 955
 956 /**
 957  *      __dev_alloc_name - allocate a name for a device
 958  *      @net: network namespace to allocate the device name in
 959  *      @name: name format string
 960  *      @buf:  scratch buffer and result name string
 961  *
 962  *      Passed a format string - eg "lt%d" it will try and find a suitable
 963  *      id. It scans list of devices to build up a free map, then chooses
 964  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 965  *      while allocating the name and adding the device in order to avoid
 966  *      duplicates.
 967  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 968  *      Returns the number of the unit assigned or a negative errno code.
 969  */
 970
 971 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 972 {
 973         int i = 0;
 974         const char *p;
 975         const int max_netdevices = 8*PAGE_SIZE;
 976         unsigned long *inuse;
 977         struct net_device *d;
 978
 979         p = strnchr(name, IFNAMSIZ-1, '%');
 980         if (p) {
 981                 /*
 982                  * Verify the string as this thing may have come from
 983                  * the user.  There must be either one "%d" and no other "%"
 984                  * characters.
 985                  */
 986                 if (p[1] != 'd' || strchr(p + 2, '%'))
 987                         return -EINVAL;
 988
 989                 /* Use one page as a bit array of possible slots */
 990                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 991                 if (!inuse)
 992                         return -ENOMEM;
 993
 994                 for_each_netdev(net, d) {
 995                         if (!sscanf(d->name, name, &i))
 996                                 continue;
 997                         if (i < 0 || i >= max_netdevices)
 998                                 continue;
 999
1000                         /*  avoid cases where sscanf is not exact inverse of printf */
1001                         snprintf(buf, IFNAMSIZ, name, i);
1002                         if (!strncmp(buf, d->name, IFNAMSIZ))
1003                                 set_bit(i, inuse);
1004                 }
1005
1006                 i = find_first_zero_bit(inuse, max_netdevices);
1007                 free_page((unsigned long) inuse);
1008         }
1009
1010         if (buf != name)
1011                 snprintf(buf, IFNAMSIZ, name, i);
1012         if (!__dev_get_by_name(net, buf))
1013                 return i;
1014
1015         /* It is possible to run out of possible slots
1016          * when the name is long and there isn't enough space left
1017          * for the digits, or if all bits are used.
1018          */
1019         return -ENFILE;
1020 }
1021
1022 /**
1023  *      dev_alloc_name - allocate a name for a device
1024  *      @dev: device
1025  *      @name: name format string
1026  *
1027  *      Passed a format string - eg "lt%d" it will try and find a suitable
1028  *      id. It scans list of devices to build up a free map, then chooses
1029  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1030  *      while allocating the name and adding the device in order to avoid
1031  *      duplicates.
1032  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1033  *      Returns the number of the unit assigned or a negative errno code.
1034  */
1035
1036 int dev_alloc_name(struct net_device *dev, const char *name)
1037 {
1038         char buf[IFNAMSIZ];
1039         struct net *net;
1040         int ret;
1041
1042         BUG_ON(!dev_net(dev));
1043         net = dev_net(dev);
1044         ret = __dev_alloc_name(net, name, buf);
1045         if (ret >= 0)
1046                 strlcpy(dev->name, buf, IFNAMSIZ);
1047         return ret;
1048 }
1049 EXPORT_SYMBOL(dev_alloc_name);
1050
1051 static int dev_alloc_name_ns(struct net *net,
1052                              struct net_device *dev,
1053                              const char *name)
1054 {
1055         char buf[IFNAMSIZ];
1056         int ret;
1057
1058         ret = __dev_alloc_name(net, name, buf);
1059         if (ret >= 0)
1060                 strlcpy(dev->name, buf, IFNAMSIZ);
1061         return ret;
1062 }
1063
1064 static int dev_get_valid_name(struct net *net,
1065                               struct net_device *dev,
1066                               const char *name)
1067 {
1068         BUG_ON(!net);
1069
1070         if (!dev_valid_name(name))
1071                 return -EINVAL;
1072
1073         if (strchr(name, '%'))
1074                 return dev_alloc_name_ns(net, dev, name);
1075         else if (__dev_get_by_name(net, name))
1076                 return -EEXIST;
1077         else if (dev->name != name)
1078                 strlcpy(dev->name, name, IFNAMSIZ);
1079
1080         return 0;
1081 }
1082
/**
 *      dev_change_name - change name of a device
 *      @dev: device
 *      @newname: new name or format string; the buffer must be at least
 *                IFNAMSIZ bytes long, because the notifier-failure path
 *                below copies IFNAMSIZ bytes out of it
 *
 *      Change name of a device, can pass format strings "eth%d".
 *      for wildcarding.
 *
 *      Asserts RTNL. Returns -EBUSY if the device is up, 0 if the name is
 *      unchanged, a negative errno from dev_get_valid_name() or
 *      device_rename(), or the errno of a NETDEV_CHANGENAME notifier that
 *      rejected the change (after the old name has been restored). On
 *      success the non-negative result of dev_get_valid_name() is returned.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
        unsigned char old_assign_type;
        char oldname[IFNAMSIZ];
        int err = 0;
        int ret;
        struct net *net;

        ASSERT_RTNL();
        BUG_ON(!dev_net(dev));

        net = dev_net(dev);
        if (dev->flags & IFF_UP)
                return -EBUSY;

        /*
         * Mark a rename in progress; netdev_get_name() retries its copy
         * when it observes devnet_rename_seq changing.
         */
        write_seqcount_begin(&devnet_rename_seq);

        /* Nothing to do if the name is already the requested one. */
        if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
                write_seqcount_end(&devnet_rename_seq);
                return 0;
        }

        /* Keep the current name so it can be restored on failure. */
        memcpy(oldname, dev->name, IFNAMSIZ);

        /* Validates newname and writes the final name into dev->name. */
        err = dev_get_valid_name(net, dev, newname);
        if (err < 0) {
                write_seqcount_end(&devnet_rename_seq);
                return err;
        }

        if (oldname[0] && !strchr(oldname, '%'))
                netdev_info(dev, "renamed from %s\n", oldname);

        old_assign_type = dev->name_assign_type;
        dev->name_assign_type = NET_NAME_RENAMED;

        /*
         * Reached a second time via the goto below when a notifier rejects
         * the change; by then dev->name holds the original name again and
         * oldname/old_assign_type describe the name being undone.
         */
rollback:
        ret = device_rename(&dev->dev, dev->name);
        if (ret) {
                memcpy(dev->name, oldname, IFNAMSIZ);
                dev->name_assign_type = old_assign_type;
                write_seqcount_end(&devnet_rename_seq);
                return ret;
        }

        write_seqcount_end(&devnet_rename_seq);

        netdev_adjacent_rename_links(dev, oldname);

        /*
         * Re-hash the device under its current name: unlink it from the
         * name hash, wait for an RCU grace period, then insert it into
         * the bucket for dev->name.
         */
        write_lock_bh(&dev_base_lock);
        hlist_del_rcu(&dev->name_hlist);
        write_unlock_bh(&dev_base_lock);

        synchronize_rcu();

        write_lock_bh(&dev_base_lock);
        hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
        write_unlock_bh(&dev_base_lock);

        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);

        if (ret) {
                /* err >= 0 after dev_alloc_name() or stores the first errno */
                if (err >= 0) {
                        /*
                         * First notifier failure: record it in err (which
                         * also prevents a second rollback), restore the
                         * original name and assign type, and swap the
                         * "old" values so the rename sequence above runs
                         * in reverse.
                         */
                        err = ret;
                        write_seqcount_begin(&devnet_rename_seq);
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        memcpy(oldname, newname, IFNAMSIZ);
                        dev->name_assign_type = old_assign_type;
                        old_assign_type = NET_NAME_RENAMED;
                        goto rollback;
                } else {
                        /* The rollback itself was rejected; just report it. */
                        pr_err("%s: name change rollback failed: %d\n",
                               dev->name, ret);
                }
        }

        return err;
}
1171
1172 /**
1173  *      dev_set_alias - change ifalias of a device
1174  *      @dev: device
1175  *      @alias: name up to IFALIASZ
1176  *      @len: limit of bytes to copy from info
1177  *
1178  *      Set ifalias for a device,
1179  */
1180 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1181 {
1182         char *new_ifalias;
1183
1184         ASSERT_RTNL();
1185
1186         if (len >= IFALIASZ)
1187                 return -EINVAL;
1188
1189         if (!len) {
1190                 kfree(dev->ifalias);
1191                 dev->ifalias = NULL;
1192                 return 0;
1193         }
1194
1195         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1196         if (!new_ifalias)
1197                 return -ENOMEM;
1198         dev->ifalias = new_ifalias;
1199
1200         strlcpy(dev->ifalias, alias, len+1);
1201         return len;
1202 }
1203
1204
1205 /**
1206  *      netdev_features_change - device changes features
1207  *      @dev: device to cause notification
1208  *
1209  *      Called to indicate a device has changed features.
1210  */
1211 void netdev_features_change(struct net_device *dev)
1212 {
1213         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1214 }
1215 EXPORT_SYMBOL(netdev_features_change);
1216
1217 /**
1218  *      netdev_state_change - device changes state
1219  *      @dev: device to cause notification
1220  *
1221  *      Called to indicate a device has changed state. This function calls
1222  *      the notifier chains for netdev_chain and sends a NEWLINK message
1223  *      to the routing socket.
1224  */
1225 void netdev_state_change(struct net_device *dev)
1226 {
1227         if (dev->flags & IFF_UP) {
1228                 struct netdev_notifier_change_info change_info;
1229
1230                 change_info.flags_changed = 0;
1231                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1232                                               &change_info.info);
1233                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1234         }
1235 }
1236 EXPORT_SYMBOL(netdev_state_change);
1237
1238 /**
1239  *      netdev_notify_peers - notify network peers about existence of @dev
1240  *      @dev: network device
1241  *
1242  * Generate traffic such that interested network peers are aware of
1243  * @dev, such as by generating a gratuitous ARP. This may be used when
1244  * a device wants to inform the rest of the network about some sort of
1245  * reconfiguration such as a failover event or virtual machine
1246  * migration.
1247  */
1248 void netdev_notify_peers(struct net_device *dev)
1249 {
1250         rtnl_lock();
1251         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1252         rtnl_unlock();
1253 }
1254 EXPORT_SYMBOL(netdev_notify_peers);
1255
1256 static int __dev_open(struct net_device *dev)
1257 {
1258         const struct net_device_ops *ops = dev->netdev_ops;
1259         int ret;
1260
1261         ASSERT_RTNL();
1262
1263         if (!netif_device_present(dev))
1264                 return -ENODEV;
1265
1266         /* Block netpoll from trying to do any rx path servicing.
1267          * If we don't do this there is a chance ndo_poll_controller
1268          * or ndo_poll may be running while we open the device
1269          */
1270         netpoll_poll_disable(dev);
1271
1272         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1273         ret = notifier_to_errno(ret);
1274         if (ret)
1275                 return ret;
1276
1277         set_bit(__LINK_STATE_START, &dev->state);
1278
1279         if (ops->ndo_validate_addr)
1280                 ret = ops->ndo_validate_addr(dev);
1281
1282         if (!ret && ops->ndo_open)
1283                 ret = ops->ndo_open(dev);
1284
1285         netpoll_poll_enable(dev);
1286
1287         if (ret)
1288                 clear_bit(__LINK_STATE_START, &dev->state);
1289         else {
1290                 dev->flags |= IFF_UP;
1291                 dev_set_rx_mode(dev);
1292                 dev_activate(dev);
1293                 add_device_randomness(dev->dev_addr, dev->addr_len);
1294         }
1295
1296         return ret;
1297 }
1298
1299 /**
1300  *      dev_open        - prepare an interface for use.
1301  *      @dev:   device to open
1302  *
1303  *      Takes a device from down to up state. The device's private open
1304  *      function is invoked and then the multicast lists are loaded. Finally
1305  *      the device is moved into the up state and a %NETDEV_UP message is
1306  *      sent to the netdev notifier chain.
1307  *
1308  *      Calling this function on an active interface is a nop. On a failure
1309  *      a negative errno code is returned.
1310  */
1311 int dev_open(struct net_device *dev)
1312 {
1313         int ret;
1314
1315         if (dev->flags & IFF_UP)
1316                 return 0;
1317
1318         ret = __dev_open(dev);
1319         if (ret < 0)
1320                 return ret;
1321
1322         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1323         call_netdevice_notifiers(NETDEV_UP, dev);
1324
1325         return ret;
1326 }
1327 EXPORT_SYMBOL(dev_open);
1328
1329 static int __dev_close_many(struct list_head *head)
1330 {
1331         struct net_device *dev;
1332
1333         ASSERT_RTNL();
1334         might_sleep();
1335
1336         list_for_each_entry(dev, head, close_list) {
1337                 /* Temporarily disable netpoll until the interface is down */
1338                 netpoll_poll_disable(dev);
1339
1340                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1341
1342                 clear_bit(__LINK_STATE_START, &dev->state);
1343
1344                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1345                  * can be even on different cpu. So just clear netif_running().
1346                  *
1347                  * dev->stop() will invoke napi_disable() on all of it's
1348                  * napi_struct instances on this device.
1349                  */
1350                 smp_mb__after_atomic(); /* Commit netif_running(). */
1351         }
1352
1353         dev_deactivate_many(head);
1354
1355         list_for_each_entry(dev, head, close_list) {
1356                 const struct net_device_ops *ops = dev->netdev_ops;
1357
1358                 /*
1359                  *      Call the device specific close. This cannot fail.
1360                  *      Only if device is UP
1361                  *
1362                  *      We allow it to be called even after a DETACH hot-plug
1363                  *      event.
1364                  */
1365                 if (ops->ndo_stop)
1366                         ops->ndo_stop(dev);
1367
1368                 dev->flags &= ~IFF_UP;
1369                 netpoll_poll_enable(dev);
1370         }
1371
1372         return 0;
1373 }
1374
1375 static int __dev_close(struct net_device *dev)
1376 {
1377         int retval;
1378         LIST_HEAD(single);
1379
1380         list_add(&dev->close_list, &single);
1381         retval = __dev_close_many(&single);
1382         list_del(&single);
1383
1384         return retval;
1385 }
1386
1387 static int dev_close_many(struct list_head *head)
1388 {
1389         struct net_device *dev, *tmp;
1390
1391         /* Remove the devices that don't need to be closed */
1392         list_for_each_entry_safe(dev, tmp, head, close_list)
1393                 if (!(dev->flags & IFF_UP))
1394                         list_del_init(&dev->close_list);
1395
1396         __dev_close_many(head);
1397
1398         list_for_each_entry_safe(dev, tmp, head, close_list) {
1399                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1400                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1401                 list_del_init(&dev->close_list);
1402         }
1403
1404         return 0;
1405 }
1406
1407 /**
1408  *      dev_close - shutdown an interface.
1409  *      @dev: device to shutdown
1410  *
1411  *      This function moves an active device into down state. A
1412  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1413  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1414  *      chain.
1415  */
1416 int dev_close(struct net_device *dev)
1417 {
1418         if (dev->flags & IFF_UP) {
1419                 LIST_HEAD(single);
1420
1421                 list_add(&dev->close_list, &single);
1422                 dev_close_many(&single);
1423                 list_del(&single);
1424         }
1425         return 0;
1426 }
1427 EXPORT_SYMBOL(dev_close);
1428
1429
1430 /**
1431  *      dev_disable_lro - disable Large Receive Offload on a device
1432  *      @dev: device
1433  *
1434  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1435  *      called under RTNL.  This is needed if received packets may be
1436  *      forwarded to another interface.
1437  */
1438 void dev_disable_lro(struct net_device *dev)
1439 {
1440         struct net_device *lower_dev;
1441         struct list_head *iter;
1442
1443         dev->wanted_features &= ~NETIF_F_LRO;
1444         netdev_update_features(dev);
1445
1446         if (unlikely(dev->features & NETIF_F_LRO))
1447                 netdev_WARN(dev, "failed to disable LRO!\n");
1448
1449         netdev_for_each_lower_dev(dev, lower_dev, iter)
1450                 dev_disable_lro(lower_dev);
1451 }
1452 EXPORT_SYMBOL(dev_disable_lro);
1453
1454 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1455                                    struct net_device *dev)
1456 {
1457         struct netdev_notifier_info info;
1458
1459         netdev_notifier_info_init(&info, dev);
1460         return nb->notifier_call(nb, val, &info);
1461 }
1462
/*
 * Starts out as 1. While it is non-zero, register_netdevice_notifier()
 * does not replay REGISTER/UP events to a newly registered notifier.
 * NOTE(review): presumably cleared once device init has run; the
 * assignment is not in this part of the file.
 */
static int dev_boot_phase = 1;
1464
/**
 *      register_netdevice_notifier - register a network notifier block
 *      @nb: notifier
 *
 *      Register a notifier to be called when network device events occur.
 *      The notifier passed is linked into the kernel structures and must
 *      not be reused until it has been unregistered. A negative errno code
 *      is returned on a failure.
 *
 *      When registered all registration and up events are replayed
 *      to the new notifier to allow device to have a race free
 *      view of the network device list.
 *
 *      If the notifier rejects a replayed %NETDEV_REGISTER, the devices
 *      already replayed are sent GOING_DOWN/DOWN (when up) and UNREGISTER,
 *      the notifier is removed from the chain again, and the notifier's
 *      error is returned. Takes and releases the RTNL lock itself.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
        struct net_device *dev;
        struct net_device *last;
        struct net *net;
        int err;

        rtnl_lock();
        err = raw_notifier_chain_register(&netdev_chain, nb);
        if (err)
                goto unlock;
        /* No replay while dev_boot_phase is still set. */
        if (dev_boot_phase)
                goto unlock;
        /* Replay REGISTER (and UP for running devices) to @nb only. */
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
                        err = notifier_to_errno(err);
                        if (err)
                                goto rollback;

                        if (!(dev->flags & IFF_UP))
                                continue;

                        /* The result of the replayed UP event is ignored. */
                        call_netdevice_notifier(nb, NETDEV_UP, dev);
                }
        }

unlock:
        rtnl_unlock();
        return err;

rollback:
        /*
         * @dev is the device whose REGISTER was rejected. Walk the lists
         * again in the same order and undo the replay for every device
         * seen before it.
         */
        last = dev;
        for_each_net(net) {
                for_each_netdev(net, dev) {
                        if (dev == last)
                                goto outroll;

                        if (dev->flags & IFF_UP) {
                                call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
                                                        dev);
                                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
                        }
                        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
                }
        }

outroll:
        /* Take @nb back off the chain; err still holds the failure code. */
        raw_notifier_chain_unregister(&netdev_chain, nb);
        goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
1531
1532 /**
1533  *      unregister_netdevice_notifier - unregister a network notifier block
1534  *      @nb: notifier
1535  *
1536  *      Unregister a notifier previously registered by
1537  *      register_netdevice_notifier(). The notifier is unlinked into the
1538  *      kernel structures and may then be reused. A negative errno code
1539  *      is returned on a failure.
1540  *
1541  *      After unregistering unregister and down device events are synthesized
1542  *      for all devices on the device list to the removed notifier to remove
1543  *      the need for special case cleanup code.
1544  */
1545
1546 int unregister_netdevice_notifier(struct notifier_block *nb)
1547 {
1548         struct net_device *dev;
1549         struct net *net;
1550         int err;
1551
1552         rtnl_lock();
1553         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1554         if (err)
1555                 goto unlock;
1556
1557         for_each_net(net) {
1558                 for_each_netdev(net, dev) {
1559                         if (dev->flags & IFF_UP) {
1560                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1561                                                         dev);
1562                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1563                         }
1564                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1565                 }
1566         }
1567 unlock:
1568         rtnl_unlock();
1569         return err;
1570 }
1571 EXPORT_SYMBOL(unregister_netdevice_notifier);
1572
1573 /**
1574  *      call_netdevice_notifiers_info - call all network notifier blocks
1575  *      @val: value passed unmodified to notifier function
1576  *      @dev: net_device pointer passed unmodified to notifier function
1577  *      @info: notifier information data
1578  *
1579  *      Call all network notifier blocks.  Parameters and return value
1580  *      are as for raw_notifier_call_chain().
1581  */
1582
1583 static int call_netdevice_notifiers_info(unsigned long val,
1584                                          struct net_device *dev,
1585                                          struct netdev_notifier_info *info)
1586 {
1587         ASSERT_RTNL();
1588         netdev_notifier_info_init(info, dev);
1589         return raw_notifier_call_chain(&netdev_chain, val, info);
1590 }
1591
1592 /**
1593  *      call_netdevice_notifiers - call all network notifier blocks
1594  *      @val: value passed unmodified to notifier function
1595  *      @dev: net_device pointer passed unmodified to notifier function
1596  *
1597  *      Call all network notifier blocks.  Parameters and return value
1598  *      are as for raw_notifier_call_chain().
1599  */
1600
1601 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1602 {
1603         struct netdev_notifier_info info;
1604
1605         return call_netdevice_notifiers_info(val, dev, &info);
1606 }
1607 EXPORT_SYMBOL(call_netdevice_notifiers);
1608
/*
 * Static key that gates packet timestamping: it is incremented by
 * net_enable_timestamp(), decremented by net_disable_timestamp() and
 * tested by net_timestamp_set() and net_timestamp_check().
 */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 *
 * This counts the decrements that were deferred; net_enable_timestamp()
 * takes the count and settles it.
 */
static atomic_t netstamp_needed_deferred;
#endif
1617
1618 void net_enable_timestamp(void)
1619 {
1620 #ifdef HAVE_JUMP_LABEL
1621         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1622
1623         if (deferred) {
1624                 while (--deferred)
1625                         static_key_slow_dec(&netstamp_needed);
1626                 return;
1627         }
1628 #endif
1629         static_key_slow_inc(&netstamp_needed);
1630 }
1631 EXPORT_SYMBOL(net_enable_timestamp);
1632
1633 void net_disable_timestamp(void)
1634 {
1635 #ifdef HAVE_JUMP_LABEL
1636         if (in_interrupt()) {
1637                 atomic_inc(&netstamp_needed_deferred);
1638                 return;
1639         }
1640 #endif
1641         static_key_slow_dec(&netstamp_needed);
1642 }
1643 EXPORT_SYMBOL(net_disable_timestamp);
1644
1645 static inline void net_timestamp_set(struct sk_buff *skb)
1646 {
1647         skb->tstamp.tv64 = 0;
1648         if (static_key_false(&netstamp_needed))
1649                 __net_timestamp(skb);
1650 }
1651
/*
 * Timestamp @SKB when the netstamp_needed key is enabled, @COND is true
 * and the skb does not carry a timestamp yet.
 *
 * Wrapped in do { } while (0) so the macro expands to a single statement
 * and is safe inside an unbraced if/else. Use it with a trailing
 * semicolon.
 */
#define net_timestamp_check(COND, SKB)                          \
        do {                                                    \
                if (static_key_false(&netstamp_needed)) {       \
                        if ((COND) && !(SKB)->tstamp.tv64)      \
                                __net_timestamp(SKB);           \
                }                                               \
        } while (0)
1657
1658 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1659 {
1660         unsigned int len;
1661
1662         if (!(dev->flags & IFF_UP))
1663                 return false;
1664
1665         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1666         if (skb->len <= len)
1667                 return true;
1668
1669         /* if TSO is enabled, we don't care about the length as the packet
1670          * could be forwarded without being segmented before
1671          */
1672         if (skb_is_gso(skb))
1673                 return true;
1674
1675         return false;
1676 }
1677 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1678
1679 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1680 {
1681         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1682                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1683                         atomic_long_inc(&dev->rx_dropped);
1684                         kfree_skb(skb);
1685                         return NET_RX_DROP;
1686                 }
1687         }
1688
1689         if (unlikely(!is_skb_forwardable(dev, skb))) {
1690                 atomic_long_inc(&dev->rx_dropped);
1691                 kfree_skb(skb);
1692                 return NET_RX_DROP;
1693         }
1694
1695         skb_scrub_packet(skb, true);
1696         skb->protocol = eth_type_trans(skb, dev);
1697         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1698
1699         return 0;
1700 }
1701 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1702
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *      NET_RX_SUCCESS  (no congestion)
 *      NET_RX_DROP     (packet was dropped, but freed)
 *
 * The skb is first prepared with __dev_forward_skb(); if that reports a
 * non-zero result it is returned as is, otherwise the skb is queued with
 * netif_rx_internal() and that result is returned.
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
        int rc = __dev_forward_skb(dev, skb);

        if (rc)
                return rc;

        return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
1726
1727 static inline int deliver_skb(struct sk_buff *skb,
1728                               struct packet_type *pt_prev,
1729                               struct net_device *orig_dev)
1730 {
1731         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1732                 return -ENOMEM;
1733         atomic_inc(&skb->users);
1734         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1735 }
1736
1737 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1738 {
1739         if (!ptype->af_packet_priv || !skb->sk)
1740                 return false;
1741
1742         if (ptype->id_match)
1743                 return ptype->id_match(ptype, skb->sk);
1744         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1745                 return true;
1746
1747         return false;
1748 }
1749
1750 /*
1751  *      Support routine. Sends outgoing frames to any network
1752  *      taps currently in use.
1753  */
1754
1755 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1756 {
1757         struct packet_type *ptype;
1758         struct sk_buff *skb2 = NULL;
1759         struct packet_type *pt_prev = NULL;
1760
1761         rcu_read_lock();
1762         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1763                 /* Never send packets back to the socket
1764                  * they originated from - MvS (miquels@drinkel.ow.org)
1765                  */
1766                 if ((ptype->dev == dev || !ptype->dev) &&
1767                     (!skb_loop_sk(ptype, skb))) {
1768                         if (pt_prev) {
1769                                 deliver_skb(skb2, pt_prev, skb->dev);
1770                                 pt_prev = ptype;
1771                                 continue;
1772                         }
1773
1774                         skb2 = skb_clone(skb, GFP_ATOMIC);
1775                         if (!skb2)
1776                                 break;
1777
1778                         net_timestamp_set(skb2);
1779
1780                         /* skb->nh should be correctly
1781                            set by sender, so that the second statement is
1782                            just protection against buggy protocols.
1783                          */
1784                         skb_reset_mac_header(skb2);
1785
1786                         if (skb_network_header(skb2) < skb2->data ||
1787                             skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1788                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1789                                                      ntohs(skb2->protocol),
1790                                                      dev->name);
1791                                 skb_reset_network_header(skb2);
1792                         }
1793
1794                         skb2->transport_header = skb2->network_header;
1795                         skb2->pkt_type = PACKET_OUTGOING;
1796                         pt_prev = ptype;
1797                 }
1798         }
1799         if (pt_prev)
1800                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1801         rcu_read_unlock();
1802 }
1803
1804 /**
1805  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1806  * @dev: Network device
1807  * @txq: number of queues available
1808  *
1809  * If real_num_tx_queues is changed the tc mappings may no longer be
1810  * valid. To resolve this verify the tc mapping remains valid and if
1811  * not NULL the mapping. With no priorities mapping to this
1812  * offset/count pair it will no longer be used. In the worst case TC0
1813  * is invalid nothing can be done so disable priority mappings. If is
1814  * expected that drivers will fix this mapping if they can before
1815  * calling netif_set_real_num_tx_queues.
1816  */
1817 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1818 {
1819         int i;
1820         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1821
1822         /* If TC0 is invalidated disable TC mapping */
1823         if (tc->offset + tc->count > txq) {
1824                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1825                 dev->num_tc = 0;
1826                 return;
1827         }
1828
1829         /* Invalidated prio to tc mappings set to TC0 */
1830         for (i = 1; i < TC_BITMASK + 1; i++) {
1831                 int q = netdev_get_prio_tc_map(dev, i);
1832
1833                 tc = &dev->tc_to_txq[q];
1834                 if (tc->offset + tc->count > txq) {
1835                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1836                                 i, q);
1837                         netdev_set_prio_tc_map(dev, i, 0);
1838                 }
1839         }
1840 }
1841
1842 #ifdef CONFIG_XPS
/* Mutex taken by the XPS map update paths in this file. */
static DEFINE_MUTEX(xps_map_mutex);

/* Fetch an RCU-protected XPS pointer from code holding xps_map_mutex. */
#define xmap_dereference(ptr)		\
	rcu_dereference_protected((ptr), lockdep_is_held(&xps_map_mutex))
1846
1847 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1848                                         int cpu, u16 index)
1849 {
1850         struct xps_map *map = NULL;
1851         int pos;
1852
1853         if (dev_maps)
1854                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1855
1856         for (pos = 0; map && pos < map->len; pos++) {
1857                 if (map->queues[pos] == index) {
1858                         if (map->len > 1) {
1859                                 map->queues[pos] = map->queues[--map->len];
1860                         } else {
1861                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1862                                 kfree_rcu(map, rcu);
1863                                 map = NULL;
1864                         }
1865                         break;
1866                 }
1867         }
1868
1869         return map;
1870 }
1871
1872 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1873 {
1874         struct xps_dev_maps *dev_maps;
1875         int cpu, i;
1876         bool active = false;
1877
1878         mutex_lock(&xps_map_mutex);
1879         dev_maps = xmap_dereference(dev->xps_maps);
1880
1881         if (!dev_maps)
1882                 goto out_no_maps;
1883
1884         for_each_possible_cpu(cpu) {
1885                 for (i = index; i < dev->num_tx_queues; i++) {
1886                         if (!remove_xps_queue(dev_maps, cpu, i))
1887                                 break;
1888                 }
1889                 if (i == dev->num_tx_queues)
1890                         active = true;
1891         }
1892
1893         if (!active) {
1894                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1895                 kfree_rcu(dev_maps, rcu);
1896         }
1897
1898         for (i = index; i < dev->num_tx_queues; i++)
1899                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1900                                              NUMA_NO_NODE);
1901
1902 out_no_maps:
1903         mutex_unlock(&xps_map_mutex);
1904 }
1905
1906 static struct xps_map *expand_xps_map(struct xps_map *map,
1907                                       int cpu, u16 index)
1908 {
1909         struct xps_map *new_map;
1910         int alloc_len = XPS_MIN_MAP_ALLOC;
1911         int i, pos;
1912
1913         for (pos = 0; map && pos < map->len; pos++) {
1914                 if (map->queues[pos] != index)
1915                         continue;
1916                 return map;
1917         }
1918
1919         /* Need to add queue to this CPU's existing map */
1920         if (map) {
1921                 if (pos < map->alloc_len)
1922                         return map;
1923
1924                 alloc_len = map->alloc_len * 2;
1925         }
1926
1927         /* Need to allocate new map to store queue on this CPU's map */
1928         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1929                                cpu_to_node(cpu));
1930         if (!new_map)
1931                 return NULL;
1932
1933         for (i = 0; i < pos; i++)
1934                 new_map->queues[i] = map->queues[i];
1935         new_map->alloc_len = alloc_len;
1936         new_map->len = pos;
1937
1938         return new_map;
1939 }
1940
1941 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1942                         u16 index)
1943 {
1944         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1945         struct xps_map *map, *new_map;
1946         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1947         int cpu, numa_node_id = -2;
1948         bool active = false;
1949
1950         mutex_lock(&xps_map_mutex);
1951
1952         dev_maps = xmap_dereference(dev->xps_maps);
1953
1954         /* allocate memory for queue storage */
1955         for_each_online_cpu(cpu) {
1956                 if (!cpumask_test_cpu(cpu, mask))
1957                         continue;
1958
1959                 if (!new_dev_maps)
1960                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1961                 if (!new_dev_maps) {
1962                         mutex_unlock(&xps_map_mutex);
1963                         return -ENOMEM;
1964                 }
1965
1966                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1967                                  NULL;
1968
1969                 map = expand_xps_map(map, cpu, index);
1970                 if (!map)
1971                         goto error;
1972
1973                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1974         }
1975
1976         if (!new_dev_maps)
1977                 goto out_no_new_maps;
1978
1979         for_each_possible_cpu(cpu) {
1980                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1981                         /* add queue to CPU maps */
1982                         int pos = 0;
1983
1984                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1985                         while ((pos < map->len) && (map->queues[pos] != index))
1986                                 pos++;
1987
1988                         if (pos == map->len)
1989                                 map->queues[map->len++] = index;
1990 #ifdef CONFIG_NUMA
1991                         if (numa_node_id == -2)
1992                                 numa_node_id = cpu_to_node(cpu);
1993                         else if (numa_node_id != cpu_to_node(cpu))
1994                                 numa_node_id = -1;
1995 #endif
1996                 } else if (dev_maps) {
1997                         /* fill in the new device map from the old device map */
1998                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1999                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2000                 }
2001
2002         }
2003
2004         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2005
2006         /* Cleanup old maps */
2007         if (dev_maps) {
2008                 for_each_possible_cpu(cpu) {
2009                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2010                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2011                         if (map && map != new_map)
2012                                 kfree_rcu(map, rcu);
2013                 }
2014
2015                 kfree_rcu(dev_maps, rcu);
2016         }
2017
2018         dev_maps = new_dev_maps;
2019         active = true;
2020
2021 out_no_new_maps:
2022         /* update Tx queue numa node */
2023         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2024                                      (numa_node_id >= 0) ? numa_node_id :
2025                                      NUMA_NO_NODE);
2026
2027         if (!dev_maps)
2028                 goto out_no_maps;
2029
2030         /* removes queue from unused CPUs */
2031         for_each_possible_cpu(cpu) {
2032                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2033                         continue;
2034
2035                 if (remove_xps_queue(dev_maps, cpu, index))
2036                         active = true;
2037         }
2038
2039         /* free map if not active */
2040         if (!active) {
2041                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2042                 kfree_rcu(dev_maps, rcu);
2043         }
2044
2045 out_no_maps:
2046         mutex_unlock(&xps_map_mutex);
2047
2048         return 0;
2049 error:
2050         /* remove any maps that we added */
2051         for_each_possible_cpu(cpu) {
2052                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2053                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2054                                  NULL;
2055                 if (new_map && new_map != map)
2056                         kfree(new_map);
2057         }
2058
2059         mutex_unlock(&xps_map_mutex);
2060
2061         kfree(new_dev_maps);
2062         return -ENOMEM;
2063 }
2064 EXPORT_SYMBOL(netif_set_xps_queue);
2065
2066 #endif
2067 /*
2068  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2069  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2070  */
2071 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2072 {
2073         int rc;
2074
2075         if (txq < 1 || txq > dev->num_tx_queues)
2076                 return -EINVAL;
2077
2078         if (dev->reg_state == NETREG_REGISTERED ||
2079             dev->reg_state == NETREG_UNREGISTERING) {
2080                 ASSERT_RTNL();
2081
2082                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2083                                                   txq);
2084                 if (rc)
2085                         return rc;
2086
2087                 if (dev->num_tc)
2088                         netif_setup_tc(dev, txq);
2089
2090                 if (txq < dev->real_num_tx_queues) {
2091                         qdisc_reset_all_tx_gt(dev, txq);
2092 #ifdef CONFIG_XPS
2093                         netif_reset_xps_queues_gt(dev, txq);
2094 #endif
2095                 }
2096         }
2097
2098         dev->real_num_tx_queues = txq;
2099         return 0;
2100 }
2101 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2102
2103 #ifdef CONFIG_SYSFS
2104 /**
2105  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2106  *      @dev: Network device
2107  *      @rxq: Actual number of RX queues
2108  *
2109  *      This must be called either with the rtnl_lock held or before
2110  *      registration of the net device.  Returns 0 on success, or a
2111  *      negative error code.  If called before registration, it always
2112  *      succeeds.
2113  */
2114 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2115 {
2116         int rc;
2117
2118         if (rxq < 1 || rxq > dev->num_rx_queues)
2119                 return -EINVAL;
2120
2121         if (dev->reg_state == NETREG_REGISTERED) {
2122                 ASSERT_RTNL();
2123
2124                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2125                                                   rxq);
2126                 if (rc)
2127                         return rc;
2128         }
2129
2130         dev->real_num_rx_queues = rxq;
2131         return 0;
2132 }
2133 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2134 #endif
2135
2136 /**
2137  * netif_get_num_default_rss_queues - default number of RSS queues
2138  *
2139  * This routine should set an upper limit on the number of RSS queues
2140  * used by default by multiqueue devices.
2141  */
2142 int netif_get_num_default_rss_queues(void)
2143 {
2144         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2145 }
2146 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2147
2148 static inline void __netif_reschedule(struct Qdisc *q)
2149 {
2150         struct softnet_data *sd;
2151         unsigned long flags;
2152
2153         local_irq_save(flags);
2154         sd = this_cpu_ptr(&softnet_data);
2155         q->next_sched = NULL;
2156         *sd->output_queue_tailp = q;
2157         sd->output_queue_tailp = &q->next_sched;
2158         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2159         local_irq_restore(flags);
2160 }
2161
2162 void __netif_schedule(struct Qdisc *q)
2163 {
2164         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2165                 __netif_reschedule(q);
2166 }
2167 EXPORT_SYMBOL(__netif_schedule);
2168
2169 struct dev_kfree_skb_cb {
2170         enum skb_free_reason reason;
2171 };
2172
2173 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2174 {
2175         return (struct dev_kfree_skb_cb *)skb->cb;
2176 }
2177
2178 void netif_schedule_queue(struct netdev_queue *txq)
2179 {
2180         rcu_read_lock();
2181         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2182                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2183
2184                 __netif_schedule(q);
2185         }
2186         rcu_read_unlock();
2187 }
2188 EXPORT_SYMBOL(netif_schedule_queue);
2189
2190 /**
2191  *      netif_wake_subqueue - allow sending packets on subqueue
2192  *      @dev: network device
2193  *      @queue_index: sub queue index
2194  *
2195  * Resume individual transmit queue of a device with multiple transmit queues.
2196  */
2197 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2198 {
2199         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2200
2201         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2202                 struct Qdisc *q;
2203
2204                 rcu_read_lock();
2205                 q = rcu_dereference(txq->qdisc);
2206                 __netif_schedule(q);
2207                 rcu_read_unlock();
2208         }
2209 }
2210 EXPORT_SYMBOL(netif_wake_subqueue);
2211
2212 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2213 {
2214         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2215                 struct Qdisc *q;
2216
2217                 rcu_read_lock();
2218                 q = rcu_dereference(dev_queue->qdisc);
2219                 __netif_schedule(q);
2220                 rcu_read_unlock();
2221         }
2222 }
2223 EXPORT_SYMBOL(netif_tx_wake_queue);
2224
2225 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2226 {
2227         unsigned long flags;
2228
2229         if (likely(atomic_read(&skb->users) == 1)) {
2230                 smp_rmb();
2231                 atomic_set(&skb->users, 0);
2232         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2233                 return;
2234         }
2235         get_kfree_skb_cb(skb)->reason = reason;
2236         local_irq_save(flags);
2237         skb->next = __this_cpu_read(softnet_data.completion_queue);
2238         __this_cpu_write(softnet_data.completion_queue, skb);
2239         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2240         local_irq_restore(flags);
2241 }
2242 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2243
2244 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2245 {
2246         if (in_irq() || irqs_disabled())
2247                 __dev_kfree_skb_irq(skb, reason);
2248         else
2249                 dev_kfree_skb(skb);
2250 }
2251 EXPORT_SYMBOL(__dev_kfree_skb_any);
2252
2253
2254 /**
2255  * netif_device_detach - mark device as removed
2256  * @dev: network device
2257  *
2258  * Mark device as removed from system and therefore no longer available.
2259  */
2260 void netif_device_detach(struct net_device *dev)
2261 {
2262         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2263             netif_running(dev)) {
2264                 netif_tx_stop_all_queues(dev);
2265         }
2266 }
2267 EXPORT_SYMBOL(netif_device_detach);
2268
2269 /**
2270  * netif_device_attach - mark device as attached
2271  * @dev: network device
2272  *
2273  * Mark device as attached from system and restart if needed.
2274  */
2275 void netif_device_attach(struct net_device *dev)
2276 {
2277         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2278             netif_running(dev)) {
2279                 netif_tx_wake_all_queues(dev);
2280                 __netdev_watchdog_up(dev);
2281         }
2282 }
2283 EXPORT_SYMBOL(netif_device_attach);
2284
2285 static void skb_warn_bad_offload(const struct sk_buff *skb)
2286 {
2287         static const netdev_features_t null_features = 0;
2288         struct net_device *dev = skb->dev;
2289         const char *driver = "";
2290
2291         if (!net_ratelimit())
2292                 return;
2293
2294         if (dev && dev->dev.parent)
2295                 driver = dev_driver_string(dev->dev.parent);
2296
2297         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2298              "gso_type=%d ip_summed=%d\n",
2299              driver, dev ? &dev->features : &null_features,
2300              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2301              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2302              skb_shinfo(skb)->gso_type, skb->ip_summed);
2303 }
2304
2305 /*
2306  * Invalidate hardware checksum when packet is to be mangled, and
2307  * complete checksum manually on outgoing path.
2308  */
2309 int skb_checksum_help(struct sk_buff *skb)
2310 {
2311         __wsum csum;
2312         int ret = 0, offset;
2313
2314         if (skb->ip_summed == CHECKSUM_COMPLETE)
2315                 goto out_set_summed;
2316
2317         if (unlikely(skb_shinfo(skb)->gso_size)) {
2318                 skb_warn_bad_offload(skb);
2319                 return -EINVAL;
2320         }
2321
2322         /* Before computing a checksum, we should make sure no frag could
2323          * be modified by an external entity : checksum could be wrong.
2324          */
2325         if (skb_has_shared_frag(skb)) {
2326                 ret = __skb_linearize(skb);
2327                 if (ret)
2328                         goto out;
2329         }
2330
2331         offset = skb_checksum_start_offset(skb);
2332         BUG_ON(offset >= skb_headlen(skb));
2333         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2334
2335         offset += skb->csum_offset;
2336         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2337
2338         if (skb_cloned(skb) &&
2339             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2340                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2341                 if (ret)
2342                         goto out;
2343         }
2344
2345         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2346 out_set_summed:
2347         skb->ip_summed = CHECKSUM_NONE;
2348 out:
2349         return ret;
2350 }
2351 EXPORT_SYMBOL(skb_checksum_help);
2352
2353 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2354 {
2355         __be16 type = skb->protocol;
2356
2357         /* Tunnel gso handlers can set protocol to ethernet. */
2358         if (type == htons(ETH_P_TEB)) {
2359                 struct ethhdr *eth;
2360
2361                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2362                         return 0;
2363
2364                 eth = (struct ethhdr *)skb_mac_header(skb);
2365                 type = eth->h_proto;
2366         }
2367
2368         return __vlan_get_protocol(skb, type, depth);
2369 }
2370
2371 /**
2372  *      skb_mac_gso_segment - mac layer segmentation handler.
2373  *      @skb: buffer to segment
2374  *      @features: features for the output path (see dev->features)
2375  */
2376 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2377                                     netdev_features_t features)
2378 {
2379         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2380         struct packet_offload *ptype;
2381         int vlan_depth = skb->mac_len;
2382         __be16 type = skb_network_protocol(skb, &vlan_depth);
2383
2384         if (unlikely(!type))
2385                 return ERR_PTR(-EINVAL);
2386
2387         __skb_pull(skb, vlan_depth);
2388
2389         rcu_read_lock();
2390         list_for_each_entry_rcu(ptype, &offload_base, list) {
2391                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2392                         segs = ptype->callbacks.gso_segment(skb, features);
2393                         break;
2394                 }
2395         }
2396         rcu_read_unlock();
2397
2398         __skb_push(skb, skb->data - skb_mac_header(skb));
2399
2400         return segs;
2401 }
2402 EXPORT_SYMBOL(skb_mac_gso_segment);
2403
2404
2405 /* openvswitch calls this on rx path, so we need a different check.
2406  */
2407 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2408 {
2409         if (tx_path)
2410                 return skb->ip_summed != CHECKSUM_PARTIAL;
2411         else
2412                 return skb->ip_summed == CHECKSUM_NONE;
2413 }
2414
2415 /**
2416  *      __skb_gso_segment - Perform segmentation on skb.
2417  *      @skb: buffer to segment
2418  *      @features: features for the output path (see dev->features)
2419  *      @tx_path: whether it is called in TX path
2420  *
2421  *      This function segments the given skb and returns a list of segments.
2422  *
2423  *      It may return NULL if the skb requires no segmentation.  This is
2424  *      only possible when GSO is used for verifying header integrity.
2425  */
2426 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2427                                   netdev_features_t features, bool tx_path)
2428 {
2429         if (unlikely(skb_needs_check(skb, tx_path))) {
2430                 int err;
2431
2432                 skb_warn_bad_offload(skb);
2433
2434                 err = skb_cow_head(skb, 0);
2435                 if (err < 0)
2436                         return ERR_PTR(err);
2437         }
2438
2439         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2440         SKB_GSO_CB(skb)->encap_level = 0;
2441
2442         skb_reset_mac_header(skb);
2443         skb_reset_mac_len(skb);
2444
2445         return skb_mac_gso_segment(skb, features);
2446 }
2447 EXPORT_SYMBOL(__skb_gso_segment);
2448
/* Take action when hardware reception checksum errors are detected:
 * log the device name (rate limited) and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	const char *name;

	if (!net_ratelimit())
		return;

	name = dev ? dev->name : "<unknown>";
	pr_err("%s: hw csum failure\n", name);
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2460
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when a frag of @skb lies in memory @dev cannot DMA from
 * (a highmem page without NETIF_F_HIGHDMA, or a page beyond the parent
 * device's dma_mask), 0 otherwise.  Always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *parent;
	int n;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (n = 0; n < skb_shinfo(skb)->nr_frags; n++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[n];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (!PCI_DMA_BUS_IS_PHYS)
		return 0;

	parent = dev->dev.parent;
	if (!parent)
		return 0;

	for (n = 0; n < skb_shinfo(skb)->nr_frags; n++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[n];
		dma_addr_t addr = page_to_phys(skb_frag_page(frag));

		if (!parent->dma_mask ||
		    addr + PAGE_SIZE - 1 > *parent->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2493
2494 /* If MPLS offload request, verify we are testing hardware MPLS features
2495  * instead of standard features for the netdev.
2496  */
2497 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2498 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2499                                            netdev_features_t features,
2500                                            __be16 type)
2501 {
2502         if (eth_p_mpls(type))
2503                 features &= skb->dev->mpls_features;
2504
2505         return features;
2506 }
2507 #else
2508 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2509                                            netdev_features_t features,
2510                                            __be16 type)
2511 {
2512         return features;
2513 }
2514 #endif
2515
2516 static netdev_features_t harmonize_features(struct sk_buff *skb,
2517         netdev_features_t features)
2518 {
2519         int tmp;
2520         __be16 type;
2521
2522         type = skb_network_protocol(skb, &tmp);
2523         features = net_mpls_features(skb, features, type);
2524
2525         if (skb->ip_summed != CHECKSUM_NONE &&
2526             !can_checksum_protocol(features, type)) {
2527                 features &= ~NETIF_F_ALL_CSUM;
2528         } else if (illegal_highdma(skb->dev, skb)) {
2529                 features &= ~NETIF_F_SG;
2530         }
2531
2532         return features;
2533 }
2534
2535 netdev_features_t netif_skb_features(struct sk_buff *skb)
2536 {
2537         struct net_device *dev = skb->dev;
2538         netdev_features_t features = dev->features;
2539         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2540         __be16 protocol = skb->protocol;
2541
2542         if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2543                 features &= ~NETIF_F_GSO_MASK;
2544
2545         /* If encapsulation offload request, verify we are testing
2546          * hardware encapsulation features instead of standard
2547          * features for the netdev
2548          */
2549         if (skb->encapsulation)
2550                 features &= dev->hw_enc_features;
2551
2552         if (!vlan_tx_tag_present(skb)) {
2553                 if (unlikely(protocol == htons(ETH_P_8021Q) ||
2554                              protocol == htons(ETH_P_8021AD))) {
2555                         struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2556                         protocol = veh->h_vlan_encapsulated_proto;
2557                 } else {
2558                         goto finalize;
2559                 }
2560         }
2561
2562         features = netdev_intersect_features(features,
2563                                              dev->vlan_features |
2564                                              NETIF_F_HW_VLAN_CTAG_TX |
2565                                              NETIF_F_HW_VLAN_STAG_TX);
2566
2567         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2568                 features = netdev_intersect_features(features,
2569                                                      NETIF_F_SG |
2570                                                      NETIF_F_HIGHDMA |
2571                                                      NETIF_F_FRAGLIST |
2572                                                      NETIF_F_GEN_CSUM |
2573                                                      NETIF_F_HW_VLAN_CTAG_TX |
2574                                                      NETIF_F_HW_VLAN_STAG_TX);
2575
2576 finalize:
2577         if (dev->netdev_ops->ndo_features_check)
2578                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2579                                                                 features);
2580
2581         return harmonize_features(skb, features);
2582 }
2583 EXPORT_SYMBOL(netif_skb_features);
2584
2585 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2586                     struct netdev_queue *txq, bool more)
2587 {
2588         unsigned int len;
2589         int rc;
2590
2591         if (!list_empty(&ptype_all))
2592                 dev_queue_xmit_nit(skb, dev);
2593
2594         len = skb->len;
2595         trace_net_dev_start_xmit(skb, dev);
2596         rc = netdev_start_xmit(skb, dev, txq, more);
2597         trace_net_dev_xmit(skb, rc, dev, len);
2598
2599         return rc;
2600 }
2601
2602 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2603                                     struct netdev_queue *txq, int *ret)
2604 {
2605         struct sk_buff *skb = first;
2606         int rc = NETDEV_TX_OK;
2607
2608         while (skb) {
2609                 struct sk_buff *next = skb->next;
2610
2611                 skb->next = NULL;
2612                 rc = xmit_one(skb, dev, txq, next != NULL);
2613                 if (unlikely(!dev_xmit_complete(rc))) {
2614                         skb->next = next;
2615                         goto out;
2616                 }
2617
2618                 skb = next;
2619                 if (netif_xmit_stopped(txq) && skb) {
2620                         rc = NETDEV_TX_BUSY;
2621                         break;
2622                 }
2623         }
2624
2625 out:
2626         *ret = rc;
2627         return skb;
2628 }
2629
2630 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2631                                           netdev_features_t features)
2632 {
2633         if (vlan_tx_tag_present(skb) &&
2634             !vlan_hw_offload_capable(features, skb->vlan_proto))
2635                 skb = __vlan_hwaccel_push_inside(skb);
2636         return skb;
2637 }
2638
2639 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2640 {
2641         netdev_features_t features;
2642
2643         if (skb->next)
2644                 return skb;
2645
2646         features = netif_skb_features(skb);
2647         skb = validate_xmit_vlan(skb, features);
2648         if (unlikely(!skb))
2649                 goto out_null;
2650
2651         if (netif_needs_gso(dev, skb, features)) {
2652                 struct sk_buff *segs;
2653
2654                 segs = skb_gso_segment(skb, features);
2655                 if (IS_ERR(segs)) {
2656                         goto out_kfree_skb;
2657                 } else if (segs) {
2658                         consume_skb(skb);
2659                         skb = segs;
2660                 }
2661         } else {
2662                 if (skb_needs_linearize(skb, features) &&
2663                     __skb_linearize(skb))
2664                         goto out_kfree_skb;
2665
2666                 /* If packet is not checksummed and device does not
2667                  * support checksumming for this protocol, complete
2668                  * checksumming here.
2669                  */
2670                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2671                         if (skb->encapsulation)
2672                                 skb_set_inner_transport_header(skb,
2673                                                                skb_checksum_start_offset(skb));
2674                         else
2675                                 skb_set_transport_header(skb,
2676                                                          skb_checksum_start_offset(skb));
2677                         if (!(features & NETIF_F_ALL_CSUM) &&
2678                             skb_checksum_help(skb))
2679                                 goto out_kfree_skb;
2680                 }
2681         }
2682
2683         return skb;
2684
2685 out_kfree_skb:
2686         kfree_skb(skb);
2687 out_null:
2688         return NULL;
2689 }
2690
2691 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2692 {
2693         struct sk_buff *next, *head = NULL, *tail;
2694
2695         for (; skb != NULL; skb = next) {
2696                 next = skb->next;
2697                 skb->next = NULL;
2698
2699                 /* in case skb wont be segmented, point to itself */
2700                 skb->prev = skb;
2701
2702                 skb = validate_xmit_skb(skb, dev);
2703                 if (!skb)
2704                         continue;
2705
2706                 if (!head)
2707                         head = skb;
2708                 else
2709                         tail->next = skb;
2710                 /* If skb was segmented, skb->prev points to
2711                  * the last segment. If not, it still contains skb.
2712                  */
2713                 tail = skb->prev;
2714         }
2715         return head;
2716 }
2717
2718 static void qdisc_pkt_len_init(struct sk_buff *skb)
2719 {
2720         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2721
2722         qdisc_skb_cb(skb)->pkt_len = skb->len;
2723
2724         /* To get more precise estimation of bytes sent on wire,
2725          * we add to pkt_len the headers size of all segments
2726          */
2727         if (shinfo->gso_size)  {
2728                 unsigned int hdr_len;
2729                 u16 gso_segs = shinfo->gso_segs;
2730
2731                 /* mac layer + network layer */
2732                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2733
2734                 /* + transport layer */
2735                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2736                         hdr_len += tcp_hdrlen(skb);
2737                 else
2738                         hdr_len += sizeof(struct udphdr);
2739
2740                 if (shinfo->gso_type & SKB_GSO_DODGY)
2741                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2742                                                 shinfo->gso_size);
2743
2744                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2745         }
2746 }
2747
/*
 * __dev_xmit_skb - hand @skb to qdisc @q, transmitting directly if possible
 * @skb: buffer to send
 * @q:   qdisc attached to @txq
 * @dev: device the skb goes out on
 * @txq: selected transmit queue
 *
 * Under the qdisc root lock one of three things happens:
 *   - the qdisc is deactivated: the skb is freed, NET_XMIT_DROP returned;
 *   - the qdisc can be bypassed, is empty and was not running: the skb is
 *     sent with sch_direct_xmit() and NET_XMIT_SUCCESS is returned;
 *   - otherwise the skb is enqueued and the qdisc is run if this context
 *     managed to become its runner; the masked enqueue result is returned.
 *
 * q->busylock is taken in addition when the qdisc was already running on
 * entry, and is dropped either before __qdisc_run() or at the very end.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                                 struct net_device *dev,
                                 struct netdev_queue *txq)
{
        spinlock_t *root_lock = qdisc_lock(q);
        bool contended;
        int rc;

        qdisc_pkt_len_init(skb);
        qdisc_calculate_pkt_len(skb, q);
        /*
         * Heuristic to force contended enqueues to serialize on a
         * separate lock before trying to get qdisc main lock.
         * This permits __QDISC___STATE_RUNNING owner to get the lock more
         * often and dequeue packets faster.
         */
        contended = qdisc_is_running(q);
        if (unlikely(contended))
                spin_lock(&q->busylock);

        spin_lock(root_lock);
        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
                kfree_skb(skb);
                rc = NET_XMIT_DROP;
        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
                   qdisc_run_begin(q)) {
                /*
                 * This is a work-conserving queue; there are no old skbs
                 * waiting to be sent out; and the qdisc is not running -
                 * xmit the skb directly.
                 */

                qdisc_bstats_update(q, skb);

                if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
                        /* Release busylock before the queue run; clearing
                         * 'contended' prevents a second unlock below.
                         */
                        if (unlikely(contended)) {
                                spin_unlock(&q->busylock);
                                contended = false;
                        }
                        __qdisc_run(q);
                } else
                        qdisc_run_end(q);

                rc = NET_XMIT_SUCCESS;
        } else {
                /* Only the NET_XMIT_MASK bits of the enqueue result are
                 * reported to the caller.
                 */
                rc = q->enqueue(skb, q) & NET_XMIT_MASK;
                if (qdisc_run_begin(q)) {
                        if (unlikely(contended)) {
                                spin_unlock(&q->busylock);
                                contended = false;
                        }
                        __qdisc_run(q);
                }
        }
        spin_unlock(root_lock);
        /* Still holding busylock if neither branch above released it */
        if (unlikely(contended))
                spin_unlock(&q->busylock);
        return rc;
}
2807
2808 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2809 static void skb_update_prio(struct sk_buff *skb)
2810 {
2811         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2812
2813         if (!skb->priority && skb->sk && map) {
2814                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2815
2816                 if (prioidx < map->priomap_len)
2817                         skb->priority = map->priomap[prioidx];
2818         }
2819 }
2820 #else
2821 #define skb_update_prio(skb)
2822 #endif
2823
/* Per-CPU counter that __dev_queue_xmit() increments around its call to
 * dev_hard_start_xmit() on the queue-less device path and decrements
 * afterwards.
 */
static DEFINE_PER_CPU(int, xmit_recursion);
/* __dev_queue_xmit() refuses to transmit once xmit_recursion exceeds this */
#define RECURSION_LIMIT 10
2826
2827 /**
2828  *      dev_loopback_xmit - loop back @skb
2829  *      @skb: buffer to transmit
2830  */
2831 int dev_loopback_xmit(struct sk_buff *skb)
2832 {
2833         skb_reset_mac_header(skb);
2834         __skb_pull(skb, skb_network_offset(skb));
2835         skb->pkt_type = PACKET_LOOPBACK;
2836         skb->ip_summed = CHECKSUM_UNNECESSARY;
2837         WARN_ON(!skb_dst(skb));
2838         skb_dst_force(skb);
2839         netif_rx_ni(skb);
2840         return 0;
2841 }
2842 EXPORT_SYMBOL(dev_loopback_xmit);
2843
/**
 *      __dev_queue_xmit - transmit a buffer
 *      @skb: buffer to transmit
 *      @accel_priv: private data used for L2 forwarding offload
 *
 *      Queue a buffer for transmission to a network device. The caller must
 *      have set the device and priority and built the buffer before calling
 *      this function. The function can be called from an interrupt.
 *
 *      A negative errno code is returned on a failure. A success does not
 *      guarantee the frame will be transmitted as it may be dropped due
 *      to congestion or traffic shaping.
 *
 *      In this function the failure codes are -ENOMEM (validate_xmit_skb()
 *      returned NULL on the queue-less path) and -ENETDOWN (every other
 *      drop on the queue-less path).
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 */
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
        struct net_device *dev = skb->dev;
        struct netdev_queue *txq;
        struct Qdisc *q;
        /* Stays -ENOMEM if validate_xmit_skb() fails further down */
        int rc = -ENOMEM;

        skb_reset_mac_header(skb);

        if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
                __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);

        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
         */
        rcu_read_lock_bh();

        skb_update_prio(skb);

        /* If device/qdisc don't need skb->dst, release it right now while
         * it's hot in this cpu cache.
         */
        if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
                skb_dst_drop(skb);
        else
                skb_dst_force(skb);

        txq = netdev_pick_tx(dev, skb, accel_priv);
        q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
        trace_net_dev_queue(skb);
        /* A qdisc with an enqueue operation takes over from here */
        if (q->enqueue) {
                rc = __dev_xmit_skb(skb, q, dev, txq);
                goto out;
        }

        /* The device has no queue. Common case for software devices:
           loopback, all the sorts of tunnels...

           Really, it is unlikely that netif_tx_lock protection is necessary
           here.  (f.e. loopback and IP tunnels are clean ignoring statistics
           counters.)
           However, it is possible, that they rely on protection
           made by us here.

           Check this and shot the lock. It is not prone from deadlocks.
           Either shot noqueue qdisc, it is even simpler 8)
         */
        if (dev->flags & IFF_UP) {
                int cpu = smp_processor_id(); /* ok because BHs are off */

                /* If this CPU already owns the tx lock we are being called
                 * from within our own transmit: treat as recursion (else
                 * branch below).
                 */
                if (txq->xmit_lock_owner != cpu) {

                        if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
                                goto recursion_alert;

                        skb = validate_xmit_skb(skb, dev);
                        if (!skb)
                                goto drop;

                        HARD_TX_LOCK(dev, txq, cpu);

                        if (!netif_xmit_stopped(txq)) {
                                /* Track nesting depth across the driver call */
                                __this_cpu_inc(xmit_recursion);
                                skb = dev_hard_start_xmit(skb, dev, txq, &rc);
                                __this_cpu_dec(xmit_recursion);
                                if (dev_xmit_complete(rc)) {
                                        HARD_TX_UNLOCK(dev, txq);
                                        goto out;
                                }
                        }
                        /* Queue stopped or transmit not completed: the
                         * skb (list) is dropped below.
                         */
                        HARD_TX_UNLOCK(dev, txq);
                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
                                             dev->name);
                } else {
                        /* Recursion is detected! It is possible,
                         * unfortunately
                         */
recursion_alert:
                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
                                             dev->name);
                }
        }

        /* Device down, queue stopped, transmit refused or recursion */
        rc = -ENETDOWN;
drop:
        rcu_read_unlock_bh();

        atomic_long_inc(&dev->tx_dropped);
        /* skb may be NULL here (validation failure) or a remaining list */
        kfree_skb_list(skb);
        return rc;
out:
        rcu_read_unlock_bh();
        return rc;
}
2968
/*
 * dev_queue_xmit - queue @skb for transmission on skb->dev
 *
 * Calls __dev_queue_xmit() with no L2 forwarding offload data
 * (accel_priv == NULL) and returns its result.
 */
int dev_queue_xmit(struct sk_buff *skb)
{
        return __dev_queue_xmit(skb, NULL);
}
EXPORT_SYMBOL(dev_queue_xmit);
2974
/*
 * dev_queue_xmit_accel - queue @skb for transmission, passing @accel_priv
 * through to __dev_queue_xmit(), which hands it to netdev_pick_tx().
 * Returns the result of __dev_queue_xmit().
 */
int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
        return __dev_queue_xmit(skb, accel_priv);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
2980
2981
2982 /*=======================================================================
2983                         Receiver routines
2984   =======================================================================*/
2985
/* Limit on the length of a per-CPU input_pkt_queue: enqueue_to_backlog()
 * drops a packet when the queue is already longer than this.
 */
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

/* Handed to net_timestamp_check() by netif_rx_internal(), i.e. before
 * the packet is put on a backlog queue.
 */
int netdev_tstamp_prequeue __read_mostly = 1;
/* NOTE(review): not referenced in this part of the file; presumably the
 * packet budget of one receive softirq run -- confirm at its user.
 */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
2992
/* Called with irq disabled.
 *
 * Appends @napi to the tail of @sd's poll_list and raises NET_RX_SOFTIRQ
 * using the irq-off variant of the raise helper.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
                                     struct napi_struct *napi)
{
        list_add_tail(&napi->poll_list, &sd->poll_list);
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
3000
3001 #ifdef CONFIG_RPS
3002
/* One global table that all flow-based protocols share.
 * get_rps_cpu() reads it under RCU, indexed by the masked flow hash, to
 * obtain the CPU wanted for a flow.
 */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);

/* Static key tested by netif_rx_internal(): only when it is enabled is
 * get_rps_cpu() consulted to pick the target CPU.
 */
struct static_key rps_needed __read_mostly;
3008
/*
 * set_rps_cpu - point a device flow entry at @next_cpu
 * @dev:      receiving device
 * @skb:      packet of the flow
 * @rflow:    flow table entry currently used for the flow
 * @next_cpu: CPU to steer to, or RPS_NO_CPU
 *
 * For a real CPU, last_qtail of the entry is set to that CPU's current
 * input_queue_head.  With CONFIG_RFS_ACCEL the driver is first asked,
 * through ndo_rx_flow_steer(), to move the flow to the hardware queue
 * that maps to @next_cpu; on success the entry of that queue's flow table
 * is used from then on.
 *
 * Returns the flow entry that was updated (either @rflow or the entry in
 * the new queue's table), with ->cpu set to @next_cpu.
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
            struct rps_dev_flow *rflow, u16 next_cpu)
{
        if (next_cpu != RPS_NO_CPU) {
#ifdef CONFIG_RFS_ACCEL
                struct netdev_rx_queue *rxqueue;
                struct rps_dev_flow_table *flow_table;
                struct rps_dev_flow *old_rflow;
                u32 flow_id;
                u16 rxq_index;
                int rc;

                /* Should we steer this flow to a different hardware queue? */
                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
                    !(dev->features & NETIF_F_NTUPLE))
                        goto out;
                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
                /* Already arriving on the queue that maps to next_cpu */
                if (rxq_index == skb_get_rx_queue(skb))
                        goto out;

                rxqueue = dev->_rx + rxq_index;
                flow_table = rcu_dereference(rxqueue->rps_flow_table);
                if (!flow_table)
                        goto out;
                flow_id = skb_get_hash(skb) & flow_table->mask;
                /* A non-negative return value is stored as the filter id */
                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
                                                        rxq_index, flow_id);
                if (rc < 0)
                        goto out;
                old_rflow = rflow;
                rflow = &flow_table->flows[flow_id];
                rflow->filter = rc;
                /* Same filter id on the old entry: mark the old one unused */
                if (old_rflow->filter == rflow->filter)
                        old_rflow->filter = RPS_NO_FILTER;
        out:
#endif
                rflow->last_qtail =
                        per_cpu(softnet_data, next_cpu).input_queue_head;
        }

        rflow->cpu = next_cpu;
        return rflow;
}
3053
3054 /*
3055  * get_rps_cpu is called from netif_receive_skb and returns the target
3056  * CPU from the RPS map of the receiving queue for a given skb.
3057  * rcu_read_lock must be held on entry.
3058  */
3059 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3060                        struct rps_dev_flow **rflowp)
3061 {
3062         struct netdev_rx_queue *rxqueue;
3063         struct rps_map *map;
3064         struct rps_dev_flow_table *flow_table;
3065         struct rps_sock_flow_table *sock_flow_table;
3066         int cpu = -1;
3067         u16 tcpu;
3068         u32 hash;
3069
3070         if (skb_rx_queue_recorded(skb)) {
3071                 u16 index = skb_get_rx_queue(skb);
3072                 if (unlikely(index >= dev->real_num_rx_queues)) {
3073                         WARN_ONCE(dev->real_num_rx_queues > 1,
3074                                   "%s received packet on queue %u, but number "
3075                                   "of RX queues is %u\n",
3076                                   dev->name, index, dev->real_num_rx_queues);
3077                         goto done;
3078                 }
3079                 rxqueue = dev->_rx + index;
3080         } else
3081                 rxqueue = dev->_rx;
3082
3083         map = rcu_dereference(rxqueue->rps_map);
3084         if (map) {
3085                 if (map->len == 1 &&
3086                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
3087                         tcpu = map->cpus[0];
3088                         if (cpu_online(tcpu))
3089                                 cpu = tcpu;
3090                         goto done;
3091                 }
3092         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3093                 goto done;
3094         }
3095
3096         skb_reset_network_header(skb);
3097         hash = skb_get_hash(skb);
3098         if (!hash)
3099                 goto done;
3100
3101         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3102         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3103         if (flow_table && sock_flow_table) {
3104                 u16 next_cpu;
3105                 struct rps_dev_flow *rflow;
3106
3107                 rflow = &flow_table->flows[hash & flow_table->mask];
3108                 tcpu = rflow->cpu;
3109
3110                 next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3111
3112                 /*
3113                  * If the desired CPU (where last recvmsg was done) is
3114                  * different from current CPU (one in the rx-queue flow
3115                  * table entry), switch if one of the following holds:
3116                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3117                  *   - Current CPU is offline.
3118                  *   - The current CPU's queue tail has advanced beyond the
3119                  *     last packet that was enqueued using this table entry.
3120                  *     This guarantees that all previous packets for the flow
3121                  *     have been dequeued, thus preserving in order delivery.
3122                  */
3123                 if (unlikely(tcpu != next_cpu) &&
3124                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3125                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3126                       rflow->last_qtail)) >= 0)) {
3127                         tcpu = next_cpu;
3128                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3129                 }
3130
3131                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3132                         *rflowp = rflow;
3133                         cpu = tcpu;
3134                         goto done;
3135                 }
3136         }
3137
3138         if (map) {
3139                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3140                 if (cpu_online(tcpu)) {
3141                         cpu = tcpu;
3142                         goto done;
3143                 }
3144         }
3145
3146 done:
3147         return cpu;
3148 }
3149
3150 #ifdef CONFIG_RFS_ACCEL
3151
3152 /**
3153  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3154  * @dev: Device on which the filter was set
3155  * @rxq_index: RX queue index
3156  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3157  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3158  *
3159  * Drivers that implement ndo_rx_flow_steer() should periodically call
3160  * this function for each installed filter and remove the filters for
3161  * which it returns %true.
3162  */
3163 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3164                          u32 flow_id, u16 filter_id)
3165 {
3166         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3167         struct rps_dev_flow_table *flow_table;
3168         struct rps_dev_flow *rflow;
3169         bool expire = true;
3170         int cpu;
3171
3172         rcu_read_lock();
3173         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3174         if (flow_table && flow_id <= flow_table->mask) {
3175                 rflow = &flow_table->flows[flow_id];
3176                 cpu = ACCESS_ONCE(rflow->cpu);
3177                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3178                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3179                            rflow->last_qtail) <
3180                      (int)(10 * flow_table->mask)))
3181                         expire = false;
3182         }
3183         rcu_read_unlock();
3184         return expire;
3185 }
3186 EXPORT_SYMBOL(rps_may_expire_flow);
3187
3188 #endif /* CONFIG_RFS_ACCEL */
3189
/* Called from hardirq (IPI) context.
 *
 * @data is the softnet_data to service: its backlog NAPI is put on its
 * poll list via ____napi_schedule() and received_rps is incremented.
 */
static void rps_trigger_softirq(void *data)
{
        struct softnet_data *sd = data;

        ____napi_schedule(sd, &sd->backlog);
        sd->received_rps++;
}
3198
3199 #endif /* CONFIG_RPS */
3200
/*
 * Check whether @sd belongs to another CPU.
 * If it does, link it at the head of this CPU's rps_ipi_list, raise
 * NET_RX_SOFTIRQ and return 1.
 * Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *mysd = this_cpu_ptr(&softnet_data);

        if (sd == mysd)
                return 0;

        sd->rps_ipi_next = mysd->rps_ipi_list;
        mysd->rps_ipi_list = sd;

        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        return 1;
#else
        return 0;
#endif /* CONFIG_RPS */
}
3221
#ifdef CONFIG_NET_FLOW_LIMIT
/* Defaults to 1 << 12 (4096).
 * NOTE(review): not referenced in this part of the file; presumably the
 * bucket count used when a flow-limit table is allocated -- confirm.
 */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
3225
3226 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3227 {
3228 #ifdef CONFIG_NET_FLOW_LIMIT
3229         struct sd_flow_limit *fl;
3230         struct softnet_data *sd;
3231         unsigned int old_flow, new_flow;
3232
3233         if (qlen < (netdev_max_backlog >> 1))
3234                 return false;
3235
3236         sd = this_cpu_ptr(&softnet_data);
3237
3238         rcu_read_lock();
3239         fl = rcu_dereference(sd->flow_limit);
3240         if (fl) {
3241                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3242                 old_flow = fl->history[fl->history_head];
3243                 fl->history[fl->history_head] = new_flow;
3244
3245                 fl->history_head++;
3246                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3247
3248                 if (likely(fl->buckets[old_flow]))
3249                         fl->buckets[old_flow]--;
3250
3251                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3252                         fl->count++;
3253                         rcu_read_unlock();
3254                         return true;
3255                 }
3256         }
3257         rcu_read_unlock();
3258 #endif
3259         return false;
3260 }
3261
3262 /*
3263  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3264  * queue (may be a remote CPU queue).
3265  */
3266 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3267                               unsigned int *qtail)
3268 {
3269         struct softnet_data *sd;
3270         unsigned long flags;
3271         unsigned int qlen;
3272
3273         sd = &per_cpu(softnet_data, cpu);
3274
3275         local_irq_save(flags);
3276
3277         rps_lock(sd);
3278         qlen = skb_queue_len(&sd->input_pkt_queue);
3279         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3280                 if (qlen) {
3281 enqueue:
3282                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3283                         input_queue_tail_incr_save(sd, qtail);
3284                         rps_unlock(sd);
3285                         local_irq_restore(flags);
3286                         return NET_RX_SUCCESS;
3287                 }
3288
3289                 /* Schedule NAPI for backlog device
3290                  * We can use non atomic operation since we own the queue lock
3291                  */
3292                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3293                         if (!rps_ipi_queued(sd))
3294                                 ____napi_schedule(sd, &sd->backlog);
3295                 }
3296                 goto enqueue;
3297         }
3298
3299         sd->dropped++;
3300         rps_unlock(sd);
3301
3302         local_irq_restore(flags);
3303
3304         atomic_long_inc(&skb->dev->rx_dropped);
3305         kfree_skb(skb);
3306         return NET_RX_DROP;
3307 }
3308
3309 static int netif_rx_internal(struct sk_buff *skb)
3310 {
3311         int ret;
3312
3313         net_timestamp_check(netdev_tstamp_prequeue, skb);
3314
3315         trace_netif_rx(skb);
3316 #ifdef CONFIG_RPS
3317         if (static_key_false(&rps_needed)) {
3318                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3319                 int cpu;
3320
3321                 preempt_disable();
3322                 rcu_read_lock();
3323
3324                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3325                 if (cpu < 0)
3326                         cpu = smp_processor_id();
3327
3328                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3329
3330                 rcu_read_unlock();
3331                 preempt_enable();
3332         } else
3333 #endif
3334         {
3335                 unsigned int qtail;
3336                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3337                 put_cpu();
3338         }
3339         return ret;
3340 }
3341
/**
 *      netif_rx        -       post buffer to the network code
 *      @skb: buffer to post
 *
 *      This function receives a packet from a device driver and queues it for
 *      the upper (protocol) levels to process.  The buffer is always taken
 *      over: it is either queued or freed.  It may be dropped during
 *      processing for congestion control or by the protocol layers.
 *
 *      return values:
 *      NET_RX_SUCCESS  (no congestion)
 *      NET_RX_DROP     (packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
        trace_netif_rx_entry(skb);

        return netif_rx_internal(skb);
}
EXPORT_SYMBOL(netif_rx);
3364
/*
 * netif_rx_ni - post buffer to the network code and run pending softirqs
 * @skb: buffer to post
 *
 * Same queueing as netif_rx(), but done with preemption disabled and
 * followed by do_softirq() when a softirq is pending on this CPU.
 * (Presumably meant for callers outside interrupt context -- confirm.)
 *
 * Returns the result of netif_rx_internal().
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int err;

        trace_netif_rx_ni_entry(skb);

        preempt_disable();
        err = netif_rx_internal(skb);
        if (local_softirq_pending())
                do_softirq();
        preempt_enable();

        return err;
}
EXPORT_SYMBOL(netif_rx_ni);
3380
3381 static void net_tx_action(struct softirq_action *h)
3382 {
3383         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3384
3385         if (sd->completion_queue) {
3386                 struct sk_buff *clist;
3387
3388                 local_irq_disable();
3389                 clist = sd->completion_queue;
3390                 sd->completion_queue = NULL;
3391                 local_irq_enable();
3392
3393                 while (clist) {
3394                         struct sk_buff *skb = clist;
3395                         clist = clist->next;
3396
3397                         WARN_ON(atomic_read(&skb->users));
3398                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3399                                 trace_consume_skb(skb);
3400                         else
3401                                 trace_kfree_skb(skb, net_tx_action);
3402                         __kfree_skb(skb);
3403                 }
3404         }
3405
3406         if (sd->output_queue) {
3407                 struct Qdisc *head;
3408
3409                 local_irq_disable();
3410                 head = sd->output_queue;
3411                 sd->output_queue = NULL;
3412                 sd->output_queue_tailp = &sd->output_queue;
3413                 local_irq_enable();
3414
3415                 while (head) {
3416                         struct Qdisc *q = head;
3417                         spinlock_t *root_lock;
3418
3419                         head = head->next_sched;
3420
3421                         root_lock = qdisc_lock(q);
3422                         if (spin_trylock(root_lock)) {
3423                                 smp_mb__before_atomic();
3424                                 clear_bit(__QDISC_STATE_SCHED,
3425                                           &q->state);
3426                                 qdisc_run(q);
3427                                 spin_unlock(root_lock);
3428                         } else {
3429                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3430                                               &q->state)) {
3431                                         __netif_reschedule(q);
3432                                 } else {
3433                                         smp_mb__before_atomic();
3434                                         clear_bit(__QDISC_STATE_SCHED,
3435                                                   &q->state);
3436                                 }
3437                         }
3438                 }
3439         }
3440 }
3441
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
/* Function-pointer hook defined here on behalf of ATM LANE. It is only
 * compiled when both the bridge and ATM LANE are enabled (built-in or
 * modular).
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
#endif
3449
3450 #ifdef CONFIG_NET_CLS_ACT
3451 /* TODO: Maybe we should just force sch_ingress to be compiled in
3452  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3453  * a compare and 2 stores extra right now if we dont have it on
3454  * but have CONFIG_NET_CLS_ACT
3455  * NOTE: This doesn't stop any functionality; if you dont have
3456  * the ingress scheduler, you just can't add policies on ingress.
3457  *
3458  */
3459 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3460 {
3461         struct net_device *dev = skb->dev;
3462         u32 ttl = G_TC_RTTL(skb->tc_verd);
3463         int result = TC_ACT_OK;
3464         struct Qdisc *q;
3465
3466         if (unlikely(MAX_RED_LOOP < ttl++)) {
3467                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3468                                      skb->skb_iif, dev->ifindex);
3469                 return TC_ACT_SHOT;
3470         }
3471
3472         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3473         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3474
3475         q = rcu_dereference(rxq->qdisc);
3476         if (q != &noop_qdisc) {
3477                 spin_lock(qdisc_lock(q));
3478                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3479                         result = qdisc_enqueue_root(skb, q);
3480                 spin_unlock(qdisc_lock(q));
3481         }
3482
3483         return result;
3484 }
3485
3486 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3487                                          struct packet_type **pt_prev,
3488                                          int *ret, struct net_device *orig_dev)
3489 {
3490         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3491
3492         if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3493                 goto out;
3494
3495         if (*pt_prev) {
3496                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3497                 *pt_prev = NULL;
3498         }
3499
3500         switch (ing_filter(skb, rxq)) {
3501         case TC_ACT_SHOT:
3502         case TC_ACT_STOLEN:
3503                 kfree_skb(skb);
3504                 return NULL;
3505         }
3506
3507 out:
3508         skb->tc_verd = 0;
3509         return skb;
3510 }
3511 #endif
3512
3513 /**
3514  *      netdev_rx_handler_register - register receive handler
3515  *      @dev: device to register a handler for
3516  *      @rx_handler: receive handler to register
3517  *      @rx_handler_data: data pointer that is used by rx handler
3518  *
3519  *      Register a receive handler for a device. This handler will then be
3520  *      called from __netif_receive_skb. A negative errno code is returned
3521  *      on a failure.
3522  *
3523  *      The caller must hold the rtnl_mutex.
3524  *
3525  *      For a general description of rx_handler, see enum rx_handler_result.
3526  */
3527 int netdev_rx_handler_register(struct net_device *dev,
3528                                rx_handler_func_t *rx_handler,
3529                                void *rx_handler_data)
3530 {
3531         ASSERT_RTNL();
3532
3533         if (dev->rx_handler)
3534                 return -EBUSY;
3535
3536         /* Note: rx_handler_data must be set before rx_handler */
3537         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3538         rcu_assign_pointer(dev->rx_handler, rx_handler);
3539
3540         return 0;
3541 }
3542 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3543
3544 /**
3545  *      netdev_rx_handler_unregister - unregister receive handler
3546  *      @dev: device to unregister a handler from
3547  *
3548  *      Unregister a receive handler from a device.
3549  *
3550  *      The caller must hold the rtnl_mutex.
3551  */
3552 void netdev_rx_handler_unregister(struct net_device *dev)
3553 {
3554
3555         ASSERT_RTNL();
3556         RCU_INIT_POINTER(dev->rx_handler, NULL);
3557         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3558          * section has a guarantee to see a non NULL rx_handler_data
3559          * as well.
3560          */
3561         synchronize_net();
3562         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3563 }
3564 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3565
3566 /*
3567  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3568  * the special handling of PFMEMALLOC skbs.
3569  */
3570 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3571 {
3572         switch (skb->protocol) {
3573         case htons(ETH_P_ARP):
3574         case htons(ETH_P_IP):
3575         case htons(ETH_P_IPV6):
3576         case htons(ETH_P_8021Q):
3577         case htons(ETH_P_8021AD):
3578                 return true;
3579         default:
3580                 return false;
3581         }
3582 }
3583
3584 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3585 {
3586         struct packet_type *ptype, *pt_prev;
3587         rx_handler_func_t *rx_handler;
3588         struct net_device *orig_dev;
3589         struct net_device *null_or_dev;
3590         bool deliver_exact = false;
3591         int ret = NET_RX_DROP;
3592         __be16 type;
3593
3594         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3595
3596         trace_netif_receive_skb(skb);
3597
3598         orig_dev = skb->dev;
3599
3600         skb_reset_network_header(skb);
3601         if (!skb_transport_header_was_set(skb))
3602                 skb_reset_transport_header(skb);
3603         skb_reset_mac_len(skb);
3604
3605         pt_prev = NULL;
3606
3607         rcu_read_lock();
3608
3609 another_round:
3610         skb->skb_iif = skb->dev->ifindex;
3611
3612         __this_cpu_inc(softnet_data.processed);
3613
3614         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3615             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3616                 skb = skb_vlan_untag(skb);
3617                 if (unlikely(!skb))
3618                         goto unlock;
3619         }
3620
3621 #ifdef CONFIG_NET_CLS_ACT
3622         if (skb->tc_verd & TC_NCLS) {
3623                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3624                 goto ncls;
3625         }
3626 #endif
3627
3628         if (pfmemalloc)
3629                 goto skip_taps;
3630
3631         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3632                 if (!ptype->dev || ptype->dev == skb->dev) {
3633                         if (pt_prev)
3634                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3635                         pt_prev = ptype;
3636                 }
3637         }
3638
3639 skip_taps:
3640 #ifdef CONFIG_NET_CLS_ACT
3641         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3642         if (!skb)
3643                 goto unlock;
3644 ncls:
3645 #endif
3646
3647         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3648                 goto drop;
3649
3650         if (vlan_tx_tag_present(skb)) {
3651                 if (pt_prev) {
3652                         ret = deliver_skb(skb, pt_prev, orig_dev);
3653                         pt_prev = NULL;
3654                 }
3655                 if (vlan_do_receive(&skb))
3656                         goto another_round;
3657                 else if (unlikely(!skb))
3658                         goto unlock;
3659         }
3660
3661         rx_handler = rcu_dereference(skb->dev->rx_handler);
3662         if (rx_handler) {
3663                 if (pt_prev) {
3664                         ret = deliver_skb(skb, pt_prev, orig_dev);
3665                         pt_prev = NULL;
3666                 }
3667                 switch (rx_handler(&skb)) {
3668                 case RX_HANDLER_CONSUMED:
3669                         ret = NET_RX_SUCCESS;
3670                         goto unlock;
3671                 case RX_HANDLER_ANOTHER:
3672                         goto another_round;
3673                 case RX_HANDLER_EXACT:
3674                         deliver_exact = true;
3675                 case RX_HANDLER_PASS:
3676                         break;
3677                 default:
3678                         BUG();
3679                 }
3680         }
3681
3682         if (unlikely(vlan_tx_tag_present(skb))) {
3683                 if (vlan_tx_tag_get_id(skb))
3684                         skb->pkt_type = PACKET_OTHERHOST;
3685                 /* Note: we might in the future use prio bits
3686                  * and set skb->priority like in vlan_do_receive()
3687                  * For the time being, just ignore Priority Code Point
3688                  */
3689                 skb->vlan_tci = 0;
3690         }
3691
3692         /* deliver only exact match when indicated */
3693         null_or_dev = deliver_exact ? skb->dev : NULL;
3694
3695         type = skb->protocol;
3696         list_for_each_entry_rcu(ptype,
3697                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3698                 if (ptype->type == type &&
3699                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3700                      ptype->dev == orig_dev)) {
3701                         if (pt_prev)
3702                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3703                         pt_prev = ptype;
3704                 }
3705         }
3706
3707         if (pt_prev) {
3708                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3709                         goto drop;
3710                 else
3711                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3712         } else {
3713 drop:
3714                 atomic_long_inc(&skb->dev->rx_dropped);
3715                 kfree_skb(skb);
3716                 /* Jamal, now you will not able to escape explaining
3717                  * me how you were going to use this. :-)
3718                  */
3719                 ret = NET_RX_DROP;
3720         }
3721
3722 unlock:
3723         rcu_read_unlock();
3724         return ret;
3725 }
3726
3727 static int __netif_receive_skb(struct sk_buff *skb)
3728 {
3729         int ret;
3730
3731         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3732                 unsigned long pflags = current->flags;
3733
3734                 /*
3735                  * PFMEMALLOC skbs are special, they should
3736                  * - be delivered to SOCK_MEMALLOC sockets only
3737                  * - stay away from userspace
3738                  * - have bounded memory usage
3739                  *
3740                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3741                  * context down to all allocation sites.
3742                  */
3743                 current->flags |= PF_MEMALLOC;
3744                 ret = __netif_receive_skb_core(skb, true);
3745                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3746         } else
3747                 ret = __netif_receive_skb_core(skb, false);
3748
3749         return ret;
3750 }
3751
3752 static int netif_receive_skb_internal(struct sk_buff *skb)
3753 {
3754         net_timestamp_check(netdev_tstamp_prequeue, skb);
3755
3756         if (skb_defer_rx_timestamp(skb))
3757                 return NET_RX_SUCCESS;
3758
3759 #ifdef CONFIG_RPS
3760         if (static_key_false(&rps_needed)) {
3761                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3762                 int cpu, ret;
3763
3764                 rcu_read_lock();
3765
3766                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3767
3768                 if (cpu >= 0) {
3769                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3770                         rcu_read_unlock();
3771                         return ret;
3772                 }
3773                 rcu_read_unlock();
3774         }
3775 #endif
3776         return __netif_receive_skb(skb);
3777 }
3778
/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	Main entry point for received data. The call itself always
 *	succeeds, although the buffer may still be dropped during
 *	processing, either for congestion control or by the protocol
 *	layers.
 *
 *	This function may only be called from softirq context and
 *	interrupts should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	int rc;

	trace_netif_receive_skb_entry(skb);

	rc = netif_receive_skb_internal(skb);

	return rc;
}
EXPORT_SYMBOL(netif_receive_skb);
3801
3802 /* Network device is going away, flush any packets still pending
3803  * Called with irqs disabled.
3804  */
3805 static void flush_backlog(void *arg)
3806 {
3807         struct net_device *dev = arg;
3808         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3809         struct sk_buff *skb, *tmp;
3810
3811         rps_lock(sd);
3812         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3813                 if (skb->dev == dev) {
3814                         __skb_unlink(skb, &sd->input_pkt_queue);
3815                         kfree_skb(skb);
3816                         input_queue_head_incr(sd);
3817                 }
3818         }
3819         rps_unlock(sd);
3820
3821         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3822                 if (skb->dev == dev) {
3823                         __skb_unlink(skb, &sd->process_queue);
3824                         kfree_skb(skb);
3825                         input_queue_head_incr(sd);
3826                 }
3827         }
3828 }
3829
3830 static int napi_gro_complete(struct sk_buff *skb)
3831 {
3832         struct packet_offload *ptype;
3833         __be16 type = skb->protocol;
3834         struct list_head *head = &offload_base;
3835         int err = -ENOENT;
3836
3837         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3838
3839         if (NAPI_GRO_CB(skb)->count == 1) {
3840                 skb_shinfo(skb)->gso_size = 0;
3841                 goto out;
3842         }
3843
3844         rcu_read_lock();
3845         list_for_each_entry_rcu(ptype, head, list) {
3846                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3847                         continue;
3848
3849                 err = ptype->callbacks.gro_complete(skb, 0);
3850                 break;
3851         }
3852         rcu_read_unlock();
3853
3854         if (err) {
3855                 WARN_ON(&ptype->list == head);
3856                 kfree_skb(skb);
3857                 return NET_RX_SUCCESS;
3858         }
3859
3860 out:
3861         return netif_receive_skb_internal(skb);
3862 }
3863
3864 /* napi->gro_list contains packets ordered by age.
3865  * youngest packets at the head of it.
3866  * Complete skbs in reverse order to reduce latencies.
3867  */
3868 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3869 {
3870         struct sk_buff *skb, *prev = NULL;
3871
3872         /* scan list and build reverse chain */
3873         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3874                 skb->prev = prev;
3875                 prev = skb;
3876         }
3877
3878         for (skb = prev; skb; skb = prev) {
3879                 skb->next = NULL;
3880
3881                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3882                         return;
3883
3884                 prev = skb->prev;
3885                 napi_gro_complete(skb);
3886                 napi->gro_count--;
3887         }
3888
3889         napi->gro_list = NULL;
3890 }
3891 EXPORT_SYMBOL(napi_gro_flush);
3892
3893 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3894 {
3895         struct sk_buff *p;
3896         unsigned int maclen = skb->dev->hard_header_len;
3897         u32 hash = skb_get_hash_raw(skb);
3898
3899         for (p = napi->gro_list; p; p = p->next) {
3900                 unsigned long diffs;
3901
3902                 NAPI_GRO_CB(p)->flush = 0;
3903
3904                 if (hash != skb_get_hash_raw(p)) {
3905                         NAPI_GRO_CB(p)->same_flow = 0;
3906                         continue;
3907                 }
3908
3909                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3910                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3911                 if (maclen == ETH_HLEN)
3912                         diffs |= compare_ether_header(skb_mac_header(p),
3913                                                       skb_mac_header(skb));
3914                 else if (!diffs)
3915                         diffs = memcmp(skb_mac_header(p),
3916                                        skb_mac_header(skb),
3917                                        maclen);
3918                 NAPI_GRO_CB(p)->same_flow = !diffs;
3919         }
3920 }
3921
3922 static void skb_gro_reset_offset(struct sk_buff *skb)
3923 {
3924         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3925         const skb_frag_t *frag0 = &pinfo->frags[0];
3926
3927         NAPI_GRO_CB(skb)->data_offset = 0;
3928         NAPI_GRO_CB(skb)->frag0 = NULL;
3929         NAPI_GRO_CB(skb)->frag0_len = 0;
3930
3931         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3932             pinfo->nr_frags &&
3933             !PageHighMem(skb_frag_page(frag0))) {
3934                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3935                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3936         }
3937 }
3938
3939 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3940 {
3941         struct skb_shared_info *pinfo = skb_shinfo(skb);
3942
3943         BUG_ON(skb->end - skb->tail < grow);
3944
3945         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3946
3947         skb->data_len -= grow;
3948         skb->tail += grow;
3949
3950         pinfo->frags[0].page_offset += grow;
3951         skb_frag_size_sub(&pinfo->frags[0], grow);
3952
3953         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3954                 skb_frag_unref(skb, 0);
3955                 memmove(pinfo->frags, pinfo->frags + 1,
3956                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
3957         }
3958 }
3959
3960 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3961 {
3962         struct sk_buff **pp = NULL;
3963         struct packet_offload *ptype;
3964         __be16 type = skb->protocol;
3965         struct list_head *head = &offload_base;
3966         int same_flow;
3967         enum gro_result ret;
3968         int grow;
3969
3970         if (!(skb->dev->features & NETIF_F_GRO))
3971                 goto normal;
3972
3973         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
3974                 goto normal;
3975
3976         gro_list_prepare(napi, skb);
3977
3978         rcu_read_lock();
3979         list_for_each_entry_rcu(ptype, head, list) {
3980                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3981                         continue;
3982
3983                 skb_set_network_header(skb, skb_gro_offset(skb));
3984                 skb_reset_mac_len(skb);
3985                 NAPI_GRO_CB(skb)->same_flow = 0;
3986                 NAPI_GRO_CB(skb)->flush = 0;
3987                 NAPI_GRO_CB(skb)->free = 0;
3988                 NAPI_GRO_CB(skb)->udp_mark = 0;
3989
3990                 /* Setup for GRO checksum validation */
3991                 switch (skb->ip_summed) {
3992                 case CHECKSUM_COMPLETE:
3993                         NAPI_GRO_CB(skb)->csum = skb->csum;
3994                         NAPI_GRO_CB(skb)->csum_valid = 1;
3995                         NAPI_GRO_CB(skb)->csum_cnt = 0;
3996                         break;
3997                 case CHECKSUM_UNNECESSARY:
3998                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
3999                         NAPI_GRO_CB(skb)->csum_valid = 0;
4000                         break;
4001                 default:
4002                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4003                         NAPI_GRO_CB(skb)->csum_valid = 0;
4004                 }
4005
4006                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4007                 break;
4008         }
4009         rcu_read_unlock();
4010
4011         if (&ptype->list == head)
4012                 goto normal;
4013
4014         same_flow = NAPI_GRO_CB(skb)->same_flow;
4015         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4016
4017         if (pp) {
4018                 struct sk_buff *nskb = *pp;
4019
4020                 *pp = nskb->next;
4021                 nskb->next = NULL;
4022                 napi_gro_complete(nskb);
4023                 napi->gro_count--;
4024         }
4025
4026         if (same_flow)
4027                 goto ok;
4028
4029         if (NAPI_GRO_CB(skb)->flush)
4030                 goto normal;
4031
4032         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4033                 struct sk_buff *nskb = napi->gro_list;
4034
4035                 /* locate the end of the list to select the 'oldest' flow */
4036                 while (nskb->next) {
4037                         pp = &nskb->next;
4038                         nskb = *pp;
4039                 }
4040                 *pp = NULL;
4041                 nskb->next = NULL;
4042                 napi_gro_complete(nskb);
4043         } else {
4044                 napi->gro_count++;
4045         }
4046         NAPI_GRO_CB(skb)->count = 1;
4047         NAPI_GRO_CB(skb)->age = jiffies;
4048         NAPI_GRO_CB(skb)->last = skb;
4049         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4050         skb->next = napi->gro_list;
4051         napi->gro_list = skb;
4052         ret = GRO_HELD;
4053
4054 pull:
4055         grow = skb_gro_offset(skb) - skb_headlen(skb);
4056         if (grow > 0)
4057                 gro_pull_from_frag0(skb, grow);
4058 ok:
4059         return ret;
4060
4061 normal:
4062         ret = GRO_NORMAL;
4063         goto pull;
4064 }
4065
4066 struct packet_offload *gro_find_receive_by_type(__be16 type)
4067 {
4068         struct list_head *offload_head = &offload_base;
4069         struct packet_offload *ptype;
4070
4071         list_for_each_entry_rcu(ptype, offload_head, list) {
4072                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4073                         continue;
4074                 return ptype;
4075         }
4076         return NULL;
4077 }
4078 EXPORT_SYMBOL(gro_find_receive_by_type);
4079
4080 struct packet_offload *gro_find_complete_by_type(__be16 type)
4081 {
4082         struct list_head *offload_head = &offload_base;
4083         struct packet_offload *ptype;
4084
4085         list_for_each_entry_rcu(ptype, offload_head, list) {
4086                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4087                         continue;
4088                 return ptype;
4089         }
4090         return NULL;
4091 }
4092 EXPORT_SYMBOL(gro_find_complete_by_type);
4093
4094 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4095 {
4096         switch (ret) {
4097         case GRO_NORMAL:
4098                 if (netif_receive_skb_internal(skb))
4099                         ret = GRO_DROP;
4100                 break;
4101
4102         case GRO_DROP:
4103                 kfree_skb(skb);
4104                 break;
4105
4106         case GRO_MERGED_FREE:
4107                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4108                         kmem_cache_free(skbuff_head_cache, skb);
4109                 else
4110                         __kfree_skb(skb);
4111                 break;
4112
4113         case GRO_HELD:
4114         case GRO_MERGED:
4115                 break;
4116         }
4117
4118         return ret;
4119 }
4120
4121 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4122 {
4123         trace_napi_gro_receive_entry(skb);
4124
4125         skb_gro_reset_offset(skb);
4126
4127         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4128 }
4129 EXPORT_SYMBOL(napi_gro_receive);
4130
4131 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4132 {
4133         if (unlikely(skb->pfmemalloc)) {
4134                 consume_skb(skb);
4135                 return;
4136         }
4137         __skb_pull(skb, skb_headlen(skb));
4138         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4139         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4140         skb->vlan_tci = 0;
4141         skb->dev = napi->dev;
4142         skb->skb_iif = 0;
4143         skb->encapsulation = 0;
4144         skb_shinfo(skb)->gso_type = 0;
4145         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4146
4147         napi->skb = skb;
4148 }
4149
4150 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4151 {
4152         struct sk_buff *skb = napi->skb;
4153
4154         if (!skb) {
4155                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4156                 napi->skb = skb;
4157         }
4158         return skb;
4159 }
4160 EXPORT_SYMBOL(napi_get_frags);
4161
4162 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4163                                       struct sk_buff *skb,
4164                                       gro_result_t ret)
4165 {
4166         switch (ret) {
4167         case GRO_NORMAL:
4168         case GRO_HELD:
4169                 __skb_push(skb, ETH_HLEN);
4170                 skb->protocol = eth_type_trans(skb, skb->dev);
4171                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4172                         ret = GRO_DROP;
4173                 break;
4174
4175         case GRO_DROP:
4176         case GRO_MERGED_FREE:
4177                 napi_reuse_skb(napi, skb);
4178                 break;
4179
4180         case GRO_MERGED:
4181                 break;
4182         }
4183
4184         return ret;
4185 }
4186
4187 /* Upper GRO stack assumes network header starts at gro_offset=0
4188  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4189  * We copy ethernet header into skb->data to have a common layout.
4190  */
4191 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4192 {
4193         struct sk_buff *skb = napi->skb;
4194         const struct ethhdr *eth;
4195         unsigned int hlen = sizeof(*eth);
4196
4197         napi->skb = NULL;
4198
4199         skb_reset_mac_header(skb);
4200         skb_gro_reset_offset(skb);
4201
4202         eth = skb_gro_header_fast(skb, 0);
4203         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4204                 eth = skb_gro_header_slow(skb, hlen, 0);
4205                 if (unlikely(!eth)) {
4206                         napi_reuse_skb(napi, skb);
4207                         return NULL;
4208                 }
4209         } else {
4210                 gro_pull_from_frag0(skb, hlen);
4211                 NAPI_GRO_CB(skb)->frag0 += hlen;
4212                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4213         }
4214         __skb_pull(skb, hlen);
4215
4216         /*
4217          * This works because the only protocols we care about don't require
4218          * special handling.
4219          * We'll fix it up properly in napi_frags_finish()
4220          */
4221         skb->protocol = eth->h_proto;
4222
4223         return skb;
4224 }
4225
4226 gro_result_t napi_gro_frags(struct napi_struct *napi)
4227 {
4228         struct sk_buff *skb = napi_frags_skb(napi);
4229
4230         if (!skb)
4231                 return GRO_DROP;
4232
4233         trace_napi_gro_frags_entry(skb);
4234
4235         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4236 }
4237 EXPORT_SYMBOL(napi_gro_frags);
4238
4239 /* Compute the checksum from gro_offset and return the folded value
4240  * after adding in any pseudo checksum.
4241  */
4242 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4243 {
4244         __wsum wsum;
4245         __sum16 sum;
4246
4247         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4248
4249         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4250         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4251         if (likely(!sum)) {
4252                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4253                     !skb->csum_complete_sw)
4254                         netdev_rx_csum_fault(skb->dev);
4255         }
4256
4257         NAPI_GRO_CB(skb)->csum = wsum;
4258         NAPI_GRO_CB(skb)->csum_valid = 1;
4259
4260         return sum;
4261 }
4262 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4263
/*
 * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
 * Note: called with local irqs disabled, but exits with local irqs
 * enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;
	struct softnet_data *next;

	if (!remsd) {
		local_irq_enable();
		return;
	}

	/* Take ownership of the list before interrupts come back on. */
	sd->rps_ipi_list = NULL;

	local_irq_enable();

	/* Send pending IPIs to kick RPS processing on remote cpus. */
	for (; remsd; remsd = next) {
		next = remsd->rps_ipi_next;

		if (cpu_online(remsd->cpu))
			smp_call_function_single_async(remsd->cpu,
						       &remsd->csd);
	}
#else
	local_irq_enable();
#endif
}
4291
/*
 * Report whether @sd has entries on its rps_ipi_list. Always false
 * when CONFIG_RPS is not enabled.
 */
static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
{
	bool waiting = false;

#ifdef CONFIG_RPS
	if (sd->rps_ipi_list)
		waiting = true;
#endif

	return waiting;
}
4300
/*
 * process_backlog - poll handler for the backlog napi of a softnet_data.
 * @napi: backlog napi instance, embedded in struct softnet_data
 * @quota: maximum number of packets to handle in this call
 *
 * Feeds packets from sd->process_queue to __netif_receive_skb(), refilling
 * that queue from sd->input_pkt_queue (under rps_lock()) whenever it runs
 * dry.  Returns the number of packets handled; returns as soon as @quota is
 * reached, otherwise when both queues are empty.  Local irqs are enabled on
 * every return path.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
        int work = 0;
        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

        /* Check if we have pending ipi, its better to send them now,
         * not waiting net_rx_action() end.
         */
        if (sd_has_rps_ipi_waiting(sd)) {
                /* The helper expects irqs off on entry and re-enables them. */
                local_irq_disable();
                net_rps_action_and_irq_enable(sd);
        }

        /* Refresh the backlog weight from weight_p on every poll. */
        napi->weight = weight_p;
        local_irq_disable();
        while (1) {
                struct sk_buff *skb;

                /* Dequeue with irqs off, deliver each packet with irqs on. */
                while ((skb = __skb_dequeue(&sd->process_queue))) {
                        local_irq_enable();
                        __netif_receive_skb(skb);
                        local_irq_disable();
                        input_queue_head_incr(sd);
                        if (++work >= quota) {
                                /* Quota used up: leave napi->state untouched. */
                                local_irq_enable();
                                return work;
                        }
                }

                rps_lock(sd);
                if (skb_queue_empty(&sd->input_pkt_queue)) {
                        /*
                         * Inline a custom version of __napi_complete().
                         * only current cpu owns and manipulates this napi,
                         * and NAPI_STATE_SCHED is the only possible flag set
                         * on backlog.
                         * We can use a plain write instead of clear_bit(),
                         * and we dont need an smp_mb() memory barrier.
                         */
                        napi->state = 0;
                        rps_unlock(sd);

                        break;
                }

                /* Move everything queued so far onto process_queue and loop. */
                skb_queue_splice_tail_init(&sd->input_pkt_queue,
                                           &sd->process_queue);
                rps_unlock(sd);
        }
        local_irq_enable();

        return work;
}
4354
4355 /**
4356  * __napi_schedule - schedule for receive
4357  * @n: entry to schedule
4358  *
4359  * The entry's receive function will be scheduled to run.
4360  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4361  */
4362 void __napi_schedule(struct napi_struct *n)
4363 {
4364         unsigned long flags;
4365
4366         local_irq_save(flags);
4367         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4368         local_irq_restore(flags);
4369 }
4370 EXPORT_SYMBOL(__napi_schedule);
4371
4372 /**
4373  * __napi_schedule_irqoff - schedule for receive
4374  * @n: entry to schedule
4375  *
4376  * Variant of __napi_schedule() assuming hard irqs are masked
4377  */
4378 void __napi_schedule_irqoff(struct napi_struct *n)
4379 {
4380         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4381 }
4382 EXPORT_SYMBOL(__napi_schedule_irqoff);
4383
/*
 * __napi_complete - take @n off its poll list and clear NAPI_STATE_SCHED.
 * @n: napi instance; NAPI_STATE_SCHED must be set, otherwise BUG_ON() fires
 *
 * Does no irq masking of its own; napi_complete_done() in this file wraps
 * the call in local_irq_save()/local_irq_restore().
 */
void __napi_complete(struct napi_struct *n)
{
        BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));

        list_del_init(&n->poll_list);
        /* Order the list removal before the SCHED bit is cleared. */
        smp_mb__before_atomic();
        clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);
4393
4394 void napi_complete_done(struct napi_struct *n, int work_done)
4395 {
4396         unsigned long flags;
4397
4398         /*
4399          * don't let napi dequeue from the cpu poll list
4400          * just in case its running on a different cpu
4401          */
4402         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4403                 return;
4404
4405         if (n->gro_list) {
4406                 unsigned long timeout = 0;
4407
4408                 if (work_done)
4409                         timeout = n->dev->gro_flush_timeout;
4410
4411                 if (timeout)
4412                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4413                                       HRTIMER_MODE_REL_PINNED);
4414                 else
4415                         napi_gro_flush(n, false);
4416         }
4417         if (likely(list_empty(&n->poll_list))) {
4418                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4419         } else {
4420                 /* If n->poll_list is not empty, we need to mask irqs */
4421                 local_irq_save(flags);
4422                 __napi_complete(n);
4423                 local_irq_restore(flags);
4424         }
4425 }
4426 EXPORT_SYMBOL(napi_complete_done);
4427
4428 /* must be called under rcu_read_lock(), as we dont take a reference */
4429 struct napi_struct *napi_by_id(unsigned int napi_id)
4430 {
4431         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4432         struct napi_struct *napi;
4433
4434         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4435                 if (napi->napi_id == napi_id)
4436                         return napi;
4437
4438         return NULL;
4439 }
4440 EXPORT_SYMBOL_GPL(napi_by_id);
4441
4442 void napi_hash_add(struct napi_struct *napi)
4443 {
4444         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4445
4446                 spin_lock(&napi_hash_lock);
4447
4448                 /* 0 is not a valid id, we also skip an id that is taken
4449                  * we expect both events to be extremely rare
4450                  */
4451                 napi->napi_id = 0;
4452                 while (!napi->napi_id) {
4453                         napi->napi_id = ++napi_gen_id;
4454                         if (napi_by_id(napi->napi_id))
4455                                 napi->napi_id = 0;
4456                 }
4457
4458                 hlist_add_head_rcu(&napi->napi_hash_node,
4459                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4460
4461                 spin_unlock(&napi_hash_lock);
4462         }
4463 }
4464 EXPORT_SYMBOL_GPL(napi_hash_add);
4465
4466 /* Warning : caller is responsible to make sure rcu grace period
4467  * is respected before freeing memory containing @napi
4468  */
4469 void napi_hash_del(struct napi_struct *napi)
4470 {
4471         spin_lock(&napi_hash_lock);
4472
4473         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4474                 hlist_del_rcu(&napi->napi_hash_node);
4475
4476         spin_unlock(&napi_hash_lock);
4477 }
4478 EXPORT_SYMBOL_GPL(napi_hash_del);
4479
4480 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4481 {
4482         struct napi_struct *napi;
4483
4484         napi = container_of(timer, struct napi_struct, timer);
4485         if (napi->gro_list)
4486                 napi_schedule(napi);
4487
4488         return HRTIMER_NORESTART;
4489 }
4490
4491 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4492                     int (*poll)(struct napi_struct *, int), int weight)
4493 {
4494         INIT_LIST_HEAD(&napi->poll_list);
4495         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4496         napi->timer.function = napi_watchdog;
4497         napi->gro_count = 0;
4498         napi->gro_list = NULL;
4499         napi->skb = NULL;
4500         napi->poll = poll;
4501         if (weight > NAPI_POLL_WEIGHT)
4502                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4503                             weight, dev->name);
4504         napi->weight = weight;
4505         list_add(&napi->dev_list, &dev->napi_list);
4506         napi->dev = dev;
4507 #ifdef CONFIG_NETPOLL
4508         spin_lock_init(&napi->poll_lock);
4509         napi->poll_owner = -1;
4510 #endif
4511         set_bit(NAPI_STATE_SCHED, &napi->state);
4512 }
4513 EXPORT_SYMBOL(netif_napi_add);
4514
4515 void napi_disable(struct napi_struct *n)
4516 {
4517         might_sleep();
4518         set_bit(NAPI_STATE_DISABLE, &n->state);
4519
4520         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4521                 msleep(1);
4522
4523         hrtimer_cancel(&n->timer);
4524
4525         clear_bit(NAPI_STATE_DISABLE, &n->state);
4526 }
4527 EXPORT_SYMBOL(napi_disable);
4528
4529 void netif_napi_del(struct napi_struct *napi)
4530 {
4531         list_del_init(&napi->dev_list);
4532         napi_free_frags(napi);
4533
4534         kfree_skb_list(napi->gro_list);
4535         napi->gro_list = NULL;
4536         napi->gro_count = 0;
4537 }
4538 EXPORT_SYMBOL(netif_napi_del);
4539
/*
 * napi_poll - run one ->poll() pass for @n.
 * @n: napi instance to poll; it is removed from its current poll list first
 * @repoll: list onto which @n is queued again when it consumed its whole
 *          weight and may stay scheduled
 *
 * Returns the amount of work reported by ->poll(), or 0 when
 * NAPI_STATE_SCHED was not set and ->poll() was therefore not called.
 */
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
        void *have;
        int work, weight;

        list_del_init(&n->poll_list);

        have = netpoll_poll_lock(n);

        weight = n->weight;

        /* This NAPI_STATE_SCHED test is for avoiding a race
         * with netpoll's poll_napi().  Only the entity which
         * obtains the lock and sees NAPI_STATE_SCHED set will
         * actually make the ->poll() call.  Therefore we avoid
         * accidentally calling ->poll() when NAPI is not scheduled.
         */
        work = 0;
        if (test_bit(NAPI_STATE_SCHED, &n->state)) {
                work = n->poll(n, weight);
                trace_napi_poll(n);
        }

        /* A poll callback must never report more than its weight. */
        WARN_ON_ONCE(work > weight);

        /* Weight not exhausted: nothing to requeue here. */
        if (likely(work < weight))
                goto out_unlock;

        /* Drivers must not modify the NAPI state if they
         * consume the entire weight.  In such cases this code
         * still "owns" the NAPI instance and therefore can
         * move the instance around on the list at-will.
         */
        if (unlikely(napi_disable_pending(n))) {
                /* A disable is pending: complete instead of requeueing. */
                napi_complete(n);
                goto out_unlock;
        }

        if (n->gro_list) {
                /* flush too old packets
                 * If HZ < 1000, flush all packets.
                 */
                napi_gro_flush(n, HZ >= 1000);
        }

        /* Some drivers may have called napi_schedule
         * prior to exhausting their budget.
         */
        if (unlikely(!list_empty(&n->poll_list))) {
                /* n->dev may be NULL; such an instance is reported as "backlog". */
                pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
                             n->dev ? n->dev->name : "backlog");
                goto out_unlock;
        }

        list_add_tail(&n->poll_list, repoll);

out_unlock:
        netpoll_poll_unlock(have);

        return work;
}
4601
/*
 * net_rx_action - poll the napi instances queued on this cpu's softnet_data.
 * @h: not used by this function
 *
 * Takes over sd->poll_list, polls entries through napi_poll() until the list
 * is empty, the packet budget (netdev_budget) is spent, or two jiffies have
 * passed.  Unfinished and newly queued instances are merged back onto
 * sd->poll_list, NET_RX_SOFTIRQ is raised again if that list is not empty,
 * and pending RPS IPI's are sent.
 */
static void net_rx_action(struct softirq_action *h)
{
        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
        unsigned long time_limit = jiffies + 2;
        int budget = netdev_budget;
        LIST_HEAD(list);
        LIST_HEAD(repoll);

        /* Grab the whole poll list with irqs off, then work on the private copy. */
        local_irq_disable();
        list_splice_init(&sd->poll_list, &list);
        local_irq_enable();

        for (;;) {
                struct napi_struct *n;

                if (list_empty(&list)) {
                        /* Nothing to requeue and no IPI to send: done. */
                        if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
                                return;
                        break;
                }

                n = list_first_entry(&list, struct napi_struct, poll_list);
                budget -= napi_poll(n, &repoll);

                /* If softirq window is exhausted then punt.
                 * Allow this to run for 2 jiffies since which will allow
                 * an average latency of 1.5/HZ.
                 */
                if (unlikely(budget <= 0 ||
                             time_after_eq(jiffies, time_limit))) {
                        sd->time_squeeze++;
                        break;
                }
        }

        local_irq_disable();

        /*
         * Rebuild sd->poll_list in this order: entries not yet polled,
         * entries queued on sd->poll_list while we were polling, then the
         * entries napi_poll() asked to repoll.
         */
        list_splice_tail_init(&sd->poll_list, &list);
        list_splice_tail(&repoll, &list);
        list_splice(&list, &sd->poll_list);
        if (!list_empty(&sd->poll_list))
                __raise_softirq_irqoff(NET_RX_SOFTIRQ);

        /* Sends pending RPS IPI's and re-enables local irqs. */
        net_rps_action_and_irq_enable(sd);
}
4647
/*
 * One entry of a net_device adjacency list (adj_list / all_adj_list,
 * upper or lower), describing a link to another device.
 */
struct netdev_adjacent {
        /* the adjacent device; a reference is held while the entry exists */
        struct net_device *dev;

        /* upper master flag, there can only be one master device per list */
        bool master;

        /* counter for the number of times this device was added to us */
        u16 ref_nr;

        /* private field for the users */
        void *private;

        /* linkage into the owning device's adjacency list */
        struct list_head list;
        /* used to free the entry through kfree_rcu() */
        struct rcu_head rcu;
};
4663
4664 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4665                                                  struct net_device *adj_dev,
4666                                                  struct list_head *adj_list)
4667 {
4668         struct netdev_adjacent *adj;
4669
4670         list_for_each_entry(adj, adj_list, list) {
4671                 if (adj->dev == adj_dev)
4672                         return adj;
4673         }
4674         return NULL;
4675 }
4676
4677 /**
4678  * netdev_has_upper_dev - Check if device is linked to an upper device
4679  * @dev: device
4680  * @upper_dev: upper device to check
4681  *
4682  * Find out if a device is linked to specified upper device and return true
4683  * in case it is. Note that this checks only immediate upper device,
4684  * not through a complete stack of devices. The caller must hold the RTNL lock.
4685  */
4686 bool netdev_has_upper_dev(struct net_device *dev,
4687                           struct net_device *upper_dev)
4688 {
4689         ASSERT_RTNL();
4690
4691         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4692 }
4693 EXPORT_SYMBOL(netdev_has_upper_dev);
4694
4695 /**
4696  * netdev_has_any_upper_dev - Check if device is linked to some device
4697  * @dev: device
4698  *
4699  * Find out if a device is linked to an upper device and return true in case
4700  * it is. The caller must hold the RTNL lock.
4701  */
4702 static bool netdev_has_any_upper_dev(struct net_device *dev)
4703 {
4704         ASSERT_RTNL();
4705
4706         return !list_empty(&dev->all_adj_list.upper);
4707 }
4708
4709 /**
4710  * netdev_master_upper_dev_get - Get master upper device
4711  * @dev: device
4712  *
4713  * Find a master upper device and return pointer to it or NULL in case
4714  * it's not there. The caller must hold the RTNL lock.
4715  */
4716 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4717 {
4718         struct netdev_adjacent *upper;
4719
4720         ASSERT_RTNL();
4721
4722         if (list_empty(&dev->adj_list.upper))
4723                 return NULL;
4724
4725         upper = list_first_entry(&dev->adj_list.upper,
4726                                  struct netdev_adjacent, list);
4727         if (likely(upper->master))
4728                 return upper->dev;
4729         return NULL;
4730 }
4731 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4732
4733 void *netdev_adjacent_get_private(struct list_head *adj_list)
4734 {
4735         struct netdev_adjacent *adj;
4736
4737         adj = list_entry(adj_list, struct netdev_adjacent, list);
4738
4739         return adj->private;
4740 }
4741 EXPORT_SYMBOL(netdev_adjacent_get_private);
4742
4743 /**
4744  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4745  * @dev: device
4746  * @iter: list_head ** of the current position
4747  *
4748  * Gets the next device from the dev's upper list, starting from iter
4749  * position. The caller must hold RCU read lock.
4750  */
4751 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4752                                                  struct list_head **iter)
4753 {
4754         struct netdev_adjacent *upper;
4755
4756         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4757
4758         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4759
4760         if (&upper->list == &dev->adj_list.upper)
4761                 return NULL;
4762
4763         *iter = &upper->list;
4764
4765         return upper->dev;
4766 }
4767 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4768
4769 /**
4770  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4771  * @dev: device
4772  * @iter: list_head ** of the current position
4773  *
4774  * Gets the next device from the dev's upper list, starting from iter
4775  * position. The caller must hold RCU read lock.
4776  */
4777 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4778                                                      struct list_head **iter)
4779 {
4780         struct netdev_adjacent *upper;
4781
4782         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4783
4784         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4785
4786         if (&upper->list == &dev->all_adj_list.upper)
4787                 return NULL;
4788
4789         *iter = &upper->list;
4790
4791         return upper->dev;
4792 }
4793 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4794
4795 /**
4796  * netdev_lower_get_next_private - Get the next ->private from the
4797  *                                 lower neighbour list
4798  * @dev: device
4799  * @iter: list_head ** of the current position
4800  *
4801  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4802  * list, starting from iter position. The caller must hold either hold the
4803  * RTNL lock or its own locking that guarantees that the neighbour lower
4804  * list will remain unchainged.
4805  */
4806 void *netdev_lower_get_next_private(struct net_device *dev,
4807                                     struct list_head **iter)
4808 {
4809         struct netdev_adjacent *lower;
4810
4811         lower = list_entry(*iter, struct netdev_adjacent, list);
4812
4813         if (&lower->list == &dev->adj_list.lower)
4814                 return NULL;
4815
4816         *iter = lower->list.next;
4817
4818         return lower->private;
4819 }
4820 EXPORT_SYMBOL(netdev_lower_get_next_private);
4821
4822 /**
4823  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4824  *                                     lower neighbour list, RCU
4825  *                                     variant
4826  * @dev: device
4827  * @iter: list_head ** of the current position
4828  *
4829  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4830  * list, starting from iter position. The caller must hold RCU read lock.
4831  */
4832 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4833                                         struct list_head **iter)
4834 {
4835         struct netdev_adjacent *lower;
4836
4837         WARN_ON_ONCE(!rcu_read_lock_held());
4838
4839         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4840
4841         if (&lower->list == &dev->adj_list.lower)
4842                 return NULL;
4843
4844         *iter = &lower->list;
4845
4846         return lower->private;
4847 }
4848 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4849
4850 /**
4851  * netdev_lower_get_next - Get the next device from the lower neighbour
4852  *                         list
4853  * @dev: device
4854  * @iter: list_head ** of the current position
4855  *
4856  * Gets the next netdev_adjacent from the dev's lower neighbour
4857  * list, starting from iter position. The caller must hold RTNL lock or
4858  * its own locking that guarantees that the neighbour lower
4859  * list will remain unchainged.
4860  */
4861 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4862 {
4863         struct netdev_adjacent *lower;
4864
4865         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4866
4867         if (&lower->list == &dev->adj_list.lower)
4868                 return NULL;
4869
4870         *iter = &lower->list;
4871
4872         return lower->dev;
4873 }
4874 EXPORT_SYMBOL(netdev_lower_get_next);
4875
4876 /**
4877  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4878  *                                     lower neighbour list, RCU
4879  *                                     variant
4880  * @dev: device
4881  *
4882  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4883  * list. The caller must hold RCU read lock.
4884  */
4885 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4886 {
4887         struct netdev_adjacent *lower;
4888
4889         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4890                         struct netdev_adjacent, list);
4891         if (lower)
4892                 return lower->private;
4893         return NULL;
4894 }
4895 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4896
4897 /**
4898  * netdev_master_upper_dev_get_rcu - Get master upper device
4899  * @dev: device
4900  *
4901  * Find a master upper device and return pointer to it or NULL in case
4902  * it's not there. The caller must hold the RCU read lock.
4903  */
4904 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4905 {
4906         struct netdev_adjacent *upper;
4907
4908         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4909                                        struct netdev_adjacent, list);
4910         if (upper && likely(upper->master))
4911                 return upper->dev;
4912         return NULL;
4913 }
4914 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4915
4916 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4917                               struct net_device *adj_dev,
4918                               struct list_head *dev_list)
4919 {
4920         char linkname[IFNAMSIZ+7];
4921         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4922                 "upper_%s" : "lower_%s", adj_dev->name);
4923         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4924                                  linkname);
4925 }
4926 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4927                                char *name,
4928                                struct list_head *dev_list)
4929 {
4930         char linkname[IFNAMSIZ+7];
4931         sprintf(linkname, dev_list == &dev->adj_list.upper ?
4932                 "upper_%s" : "lower_%s", name);
4933         sysfs_remove_link(&(dev->dev.kobj), linkname);
4934 }
4935
4936 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4937                                                  struct net_device *adj_dev,
4938                                                  struct list_head *dev_list)
4939 {
4940         return (dev_list == &dev->adj_list.upper ||
4941                 dev_list == &dev->adj_list.lower) &&
4942                 net_eq(dev_net(dev), dev_net(adj_dev));
4943 }
4944
4945 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4946                                         struct net_device *adj_dev,
4947                                         struct list_head *dev_list,
4948                                         void *private, bool master)
4949 {
4950         struct netdev_adjacent *adj;
4951         int ret;
4952
4953         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4954
4955         if (adj) {
4956                 adj->ref_nr++;
4957                 return 0;
4958         }
4959
4960         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4961         if (!adj)
4962                 return -ENOMEM;
4963
4964         adj->dev = adj_dev;
4965         adj->master = master;
4966         adj->ref_nr = 1;
4967         adj->private = private;
4968         dev_hold(adj_dev);
4969
4970         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4971                  adj_dev->name, dev->name, adj_dev->name);
4972
4973         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
4974                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4975                 if (ret)
4976                         goto free_adj;
4977         }
4978
4979         /* Ensure that master link is always the first item in list. */
4980         if (master) {
4981                 ret = sysfs_create_link(&(dev->dev.kobj),
4982                                         &(adj_dev->dev.kobj), "master");
4983                 if (ret)
4984                         goto remove_symlinks;
4985
4986                 list_add_rcu(&adj->list, dev_list);
4987         } else {
4988                 list_add_tail_rcu(&adj->list, dev_list);
4989         }
4990
4991         return 0;
4992
4993 remove_symlinks:
4994         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
4995                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
4996 free_adj:
4997         kfree(adj);
4998         dev_put(adj_dev);
4999
5000         return ret;
5001 }
5002
5003 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5004                                          struct net_device *adj_dev,
5005                                          struct list_head *dev_list)
5006 {
5007         struct netdev_adjacent *adj;
5008
5009         adj = __netdev_find_adj(dev, adj_dev, dev_list);
5010
5011         if (!adj) {
5012                 pr_err("tried to remove device %s from %s\n",
5013                        dev->name, adj_dev->name);
5014                 BUG();
5015         }
5016
5017         if (adj->ref_nr > 1) {
5018                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5019                          adj->ref_nr-1);
5020                 adj->ref_nr--;
5021                 return;
5022         }
5023
5024         if (adj->master)
5025                 sysfs_remove_link(&(dev->dev.kobj), "master");
5026
5027         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5028                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5029
5030         list_del_rcu(&adj->list);
5031         pr_debug("dev_put for %s, because link removed from %s to %s\n",
5032                  adj_dev->name, dev->name, adj_dev->name);
5033         dev_put(adj_dev);
5034         kfree_rcu(adj, rcu);
5035 }
5036
5037 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5038                                             struct net_device *upper_dev,
5039                                             struct list_head *up_list,
5040                                             struct list_head *down_list,
5041                                             void *private, bool master)
5042 {
5043         int ret;
5044
5045         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5046                                            master);
5047         if (ret)
5048                 return ret;
5049
5050         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5051                                            false);
5052         if (ret) {
5053                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5054                 return ret;
5055         }
5056
5057         return 0;
5058 }
5059
5060 static int __netdev_adjacent_dev_link(struct net_device *dev,
5061                                       struct net_device *upper_dev)
5062 {
5063         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5064                                                 &dev->all_adj_list.upper,
5065                                                 &upper_dev->all_adj_list.lower,
5066                                                 NULL, false);
5067 }
5068
/*
 * __netdev_adjacent_dev_unlink_lists - drop one reference of the link in
 * both directions: @upper_dev from @up_list of @dev, and @dev from
 * @down_list of @upper_dev.
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
                                               struct net_device *upper_dev,
                                               struct list_head *up_list,
                                               struct list_head *down_list)
{
        __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
        __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
5077
5078 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5079                                          struct net_device *upper_dev)
5080 {
5081         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5082                                            &dev->all_adj_list.upper,
5083                                            &upper_dev->all_adj_list.lower);
5084 }
5085
5086 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5087                                                 struct net_device *upper_dev,
5088                                                 void *private, bool master)
5089 {
5090         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5091
5092         if (ret)
5093                 return ret;
5094
5095         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5096                                                &dev->adj_list.upper,
5097                                                &upper_dev->adj_list.lower,
5098                                                private, master);
5099         if (ret) {
5100                 __netdev_adjacent_dev_unlink(dev, upper_dev);
5101                 return ret;
5102         }
5103
5104         return 0;
5105 }
5106
5107 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5108                                                    struct net_device *upper_dev)
5109 {
5110         __netdev_adjacent_dev_unlink(dev, upper_dev);
5111         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5112                                            &dev->adj_list.upper,
5113                                            &upper_dev->adj_list.lower);
5114 }
5115
/*
 * __netdev_upper_dev_link - link @dev below @upper_dev.
 * @dev: lower device
 * @upper_dev: device to become an upper device of @dev
 * @master: link @upper_dev as the master of @dev
 * @private: stored in the neighbour adjacency entries
 *
 * Links the two devices as neighbours and then links every device on
 * @dev's all_adj_list.lower (and @dev itself) with every device on
 * @upper_dev's all_adj_list.upper (and @upper_dev itself).  On success
 * NETDEV_CHANGEUPPER is sent for @dev.  The caller must hold the RTNL lock.
 *
 * Returns 0 on success, -EBUSY if @dev == @upper_dev, if @dev is already an
 * upper device of @upper_dev, or if @master is set and @dev already has a
 * master, -EEXIST if the link already exists, or the error from a failed
 * link step (in which case the links made so far are rolled back).
 */
static int __netdev_upper_dev_link(struct net_device *dev,
                                   struct net_device *upper_dev, bool master,
                                   void *private)
{
        struct netdev_adjacent *i, *j, *to_i, *to_j;
        int ret = 0;

        ASSERT_RTNL();

        if (dev == upper_dev)
                return -EBUSY;

        /* To prevent loops, check if dev is not upper device to upper_dev. */
        if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
                return -EBUSY;

        if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
                return -EEXIST;

        if (master && netdev_master_upper_dev_get(dev))
                return -EBUSY;

        ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
                                                   master);
        if (ret)
                return ret;

        /* Now that we linked these devs, make all the upper_dev's
         * all_adj_list.upper visible to every dev's all_adj_list.lower and
         * vice versa, and don't forget the devices themselves. All of these
         * links are non-neighbours.
         */
        list_for_each_entry(i, &dev->all_adj_list.lower, list) {
                list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
                        pr_debug("Interlinking %s with %s, non-neighbour\n",
                                 i->dev->name, j->dev->name);
                        ret = __netdev_adjacent_dev_link(i->dev, j->dev);
                        if (ret)
                                goto rollback_mesh;
                }
        }

        /* add dev to every upper_dev's upper device */
        list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
                pr_debug("linking %s's upper device %s with %s\n",
                         upper_dev->name, i->dev->name, dev->name);
                ret = __netdev_adjacent_dev_link(dev, i->dev);
                if (ret)
                        goto rollback_upper_mesh;
        }

        /* add upper_dev to every dev's lower device */
        list_for_each_entry(i, &dev->all_adj_list.lower, list) {
                pr_debug("linking %s's lower device %s with %s\n", dev->name,
                         i->dev->name, upper_dev->name);
                ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
                if (ret)
                        goto rollback_lower_mesh;
        }

        call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
        return 0;

        /*
         * Rollback: each label undoes one linking loop above, stopping at
         * the entry (to_i / to_j) on which that loop failed.  When a label
         * is reached by falling through from the one before it, the
         * iterators have been reset to NULL so the whole loop is undone.
         */
rollback_lower_mesh:
        to_i = i;
        list_for_each_entry(i, &dev->all_adj_list.lower, list) {
                if (i == to_i)
                        break;
                __netdev_adjacent_dev_unlink(i->dev, upper_dev);
        }

        i = NULL;

rollback_upper_mesh:
        to_i = i;
        list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
                if (i == to_i)
                        break;
                __netdev_adjacent_dev_unlink(dev, i->dev);
        }

        i = j = NULL;

rollback_mesh:
        to_i = i;
        to_j = j;
        list_for_each_entry(i, &dev->all_adj_list.lower, list) {
                list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
                        if (i == to_i && j == to_j)
                                break;
                        __netdev_adjacent_dev_unlink(i->dev, j->dev);
                }
                if (i == to_i)
                        break;
        }

        /* Finally drop the neighbour link created at the start. */
        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

        return ret;
}
5216
/**
 * netdev_upper_dev_link - Add a link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Adds a non-master link, carrying no private data, to a device which is
 * upper to this one. The caller must hold the RTNL lock. On a failure a
 * negative errno code is returned. On success the reference counts are
 * adjusted and the function returns zero.
 */
int netdev_upper_dev_link(struct net_device *dev,
                          struct net_device *upper_dev)
{
        /* master = false, private = NULL */
        return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
}
EXPORT_SYMBOL(netdev_upper_dev_link);
5233
/**
 * netdev_master_upper_dev_link - Add a master link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Adds a link to device which is upper to this one. In this case, only
 * one master upper device can be linked, although other non-master devices
 * might be linked as well. No private data is attached to the link.
 * The caller must hold the RTNL lock.
 * On a failure a negative errno code is returned. On success the reference
 * counts are adjusted and the function returns zero.
 */
int netdev_master_upper_dev_link(struct net_device *dev,
                                 struct net_device *upper_dev)
{
        /* master = true, private = NULL */
        return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);
5251
/**
 * netdev_master_upper_dev_link_private - Add a master link with private data
 * @dev: device
 * @upper_dev: new upper device
 * @private: opaque pointer handed on to __netdev_upper_dev_link()
 *
 * Same as netdev_master_upper_dev_link(), except that @private is passed
 * along instead of NULL. Returns the result of __netdev_upper_dev_link().
 */
int netdev_master_upper_dev_link_private(struct net_device *dev,
                                         struct net_device *upper_dev,
                                         void *private)
{
        return __netdev_upper_dev_link(dev, upper_dev, true, private);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5259
5260 /**
5261  * netdev_upper_dev_unlink - Removes a link to upper device
5262  * @dev: device
5263  * @upper_dev: new upper device
5264  *
5265  * Removes a link to device which is upper to this one. The caller must hold
5266  * the RTNL lock.
5267  */
5268 void netdev_upper_dev_unlink(struct net_device *dev,
5269                              struct net_device *upper_dev)
5270 {
5271         struct netdev_adjacent *i, *j;
5272         ASSERT_RTNL();
5273
5274         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5275
5276         /* Here is the tricky part. We must remove all dev's lower
5277          * devices from all upper_dev's upper devices and vice
5278          * versa, to maintain the graph relationship.
5279          */
5280         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5281                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5282                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
5283
5284         /* remove also the devices itself from lower/upper device
5285          * list
5286          */
5287         list_for_each_entry(i, &dev->all_adj_list.lower, list)
5288                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
5289
5290         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5291                 __netdev_adjacent_dev_unlink(dev, i->dev);
5292
5293         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5294 }
5295 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5296
5297 static void netdev_adjacent_add_links(struct net_device *dev)
5298 {
5299         struct netdev_adjacent *iter;
5300
5301         struct net *net = dev_net(dev);
5302
5303         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5304                 if (!net_eq(net,dev_net(iter->dev)))
5305                         continue;
5306                 netdev_adjacent_sysfs_add(iter->dev, dev,
5307                                           &iter->dev->adj_list.lower);
5308                 netdev_adjacent_sysfs_add(dev, iter->dev,
5309                                           &dev->adj_list.upper);
5310         }
5311
5312         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5313                 if (!net_eq(net,dev_net(iter->dev)))
5314                         continue;
5315                 netdev_adjacent_sysfs_add(iter->dev, dev,
5316                                           &iter->dev->adj_list.upper);
5317                 netdev_adjacent_sysfs_add(dev, iter->dev,
5318                                           &dev->adj_list.lower);
5319         }
5320 }
5321
5322 static void netdev_adjacent_del_links(struct net_device *dev)
5323 {
5324         struct netdev_adjacent *iter;
5325
5326         struct net *net = dev_net(dev);
5327
5328         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5329                 if (!net_eq(net,dev_net(iter->dev)))
5330                         continue;
5331                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5332                                           &iter->dev->adj_list.lower);
5333                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5334                                           &dev->adj_list.upper);
5335         }
5336
5337         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5338                 if (!net_eq(net,dev_net(iter->dev)))
5339                         continue;
5340                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5341                                           &iter->dev->adj_list.upper);
5342                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5343                                           &dev->adj_list.lower);
5344         }
5345 }
5346
5347 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5348 {
5349         struct netdev_adjacent *iter;
5350
5351         struct net *net = dev_net(dev);
5352
5353         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5354                 if (!net_eq(net,dev_net(iter->dev)))
5355                         continue;
5356                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5357                                           &iter->dev->adj_list.lower);
5358                 netdev_adjacent_sysfs_add(iter->dev, dev,
5359                                           &iter->dev->adj_list.lower);
5360         }
5361
5362         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5363                 if (!net_eq(net,dev_net(iter->dev)))
5364                         continue;
5365                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5366                                           &iter->dev->adj_list.upper);
5367                 netdev_adjacent_sysfs_add(iter->dev, dev,
5368                                           &iter->dev->adj_list.upper);
5369         }
5370 }
5371
5372 void *netdev_lower_dev_get_private(struct net_device *dev,
5373                                    struct net_device *lower_dev)
5374 {
5375         struct netdev_adjacent *lower;
5376
5377         if (!lower_dev)
5378                 return NULL;
5379         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5380         if (!lower)
5381                 return NULL;
5382
5383         return lower->private;
5384 }
5385 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5386
5387
5388 int dev_get_nest_level(struct net_device *dev,
5389                        bool (*type_check)(struct net_device *dev))
5390 {
5391         struct net_device *lower = NULL;
5392         struct list_head *iter;
5393         int max_nest = -1;
5394         int nest;
5395
5396         ASSERT_RTNL();
5397
5398         netdev_for_each_lower_dev(dev, lower, iter) {
5399                 nest = dev_get_nest_level(lower, type_check);
5400                 if (max_nest < nest)
5401                         max_nest = nest;
5402         }
5403
5404         if (type_check(dev))
5405                 max_nest++;
5406
5407         return max_nest;
5408 }
5409 EXPORT_SYMBOL(dev_get_nest_level);
5410
5411 static void dev_change_rx_flags(struct net_device *dev, int flags)
5412 {
5413         const struct net_device_ops *ops = dev->netdev_ops;
5414
5415         if (ops->ndo_change_rx_flags)
5416                 ops->ndo_change_rx_flags(dev, flags);
5417 }
5418
5419 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5420 {
5421         unsigned int old_flags = dev->flags;
5422         kuid_t uid;
5423         kgid_t gid;
5424
5425         ASSERT_RTNL();
5426
5427         dev->flags |= IFF_PROMISC;
5428         dev->promiscuity += inc;
5429         if (dev->promiscuity == 0) {
5430                 /*
5431                  * Avoid overflow.
5432                  * If inc causes overflow, untouch promisc and return error.
5433                  */
5434                 if (inc < 0)
5435                         dev->flags &= ~IFF_PROMISC;
5436                 else {
5437                         dev->promiscuity -= inc;
5438                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5439                                 dev->name);
5440                         return -EOVERFLOW;
5441                 }
5442         }
5443         if (dev->flags != old_flags) {
5444                 pr_info("device %s %s promiscuous mode\n",
5445                         dev->name,
5446                         dev->flags & IFF_PROMISC ? "entered" : "left");
5447                 if (audit_enabled) {
5448                         current_uid_gid(&uid, &gid);
5449                         audit_log(current->audit_context, GFP_ATOMIC,
5450                                 AUDIT_ANOM_PROMISCUOUS,
5451                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5452                                 dev->name, (dev->flags & IFF_PROMISC),
5453                                 (old_flags & IFF_PROMISC),
5454                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5455                                 from_kuid(&init_user_ns, uid),
5456                                 from_kgid(&init_user_ns, gid),
5457                                 audit_get_sessionid(current));
5458                 }
5459
5460                 dev_change_rx_flags(dev, IFF_PROMISC);
5461         }
5462         if (notify)
5463                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5464         return 0;
5465 }
5466
5467 /**
5468  *      dev_set_promiscuity     - update promiscuity count on a device
5469  *      @dev: device
5470  *      @inc: modifier
5471  *
5472  *      Add or remove promiscuity from a device. While the count in the device
5473  *      remains above zero the interface remains promiscuous. Once it hits zero
5474  *      the device reverts back to normal filtering operation. A negative inc
5475  *      value is used to drop promiscuity on the device.
5476  *      Return 0 if successful or a negative errno code on error.
5477  */
5478 int dev_set_promiscuity(struct net_device *dev, int inc)
5479 {
5480         unsigned int old_flags = dev->flags;
5481         int err;
5482
5483         err = __dev_set_promiscuity(dev, inc, true);
5484         if (err < 0)
5485                 return err;
5486         if (dev->flags != old_flags)
5487                 dev_set_rx_mode(dev);
5488         return err;
5489 }
5490 EXPORT_SYMBOL(dev_set_promiscuity);
5491
5492 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5493 {
5494         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5495
5496         ASSERT_RTNL();
5497
5498         dev->flags |= IFF_ALLMULTI;
5499         dev->allmulti += inc;
5500         if (dev->allmulti == 0) {
5501                 /*
5502                  * Avoid overflow.
5503                  * If inc causes overflow, untouch allmulti and return error.
5504                  */
5505                 if (inc < 0)
5506                         dev->flags &= ~IFF_ALLMULTI;
5507                 else {
5508                         dev->allmulti -= inc;
5509                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5510                                 dev->name);
5511                         return -EOVERFLOW;
5512                 }
5513         }
5514         if (dev->flags ^ old_flags) {
5515                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5516                 dev_set_rx_mode(dev);
5517                 if (notify)
5518                         __dev_notify_flags(dev, old_flags,
5519                                            dev->gflags ^ old_gflags);
5520         }
5521         return 0;
5522 }
5523
/**
 *      dev_set_allmulti        - update allmulti count on a device
 *      @dev: device
 *      @inc: modifier
 *
 *      Add or remove reception of all multicast frames to a device. While the
 *      count in the device remains above zero the interface keeps receiving
 *      all multicast frames. Once it hits zero the device reverts back to
 *      normal filtering operation. A negative @inc value is used to drop the
 *      counter when releasing a resource needing all multicasts.
 *      Return 0 if successful or a negative errno code on error.
 */

int dev_set_allmulti(struct net_device *dev, int inc)
{
        /* notify = true */
        return __dev_set_allmulti(dev, inc, true);
}
EXPORT_SYMBOL(dev_set_allmulti);
5542
5543 /*
5544  *      Upload unicast and multicast address lists to device and
5545  *      configure RX filtering. When the device doesn't support unicast
5546  *      filtering it is put in promiscuous mode while unicast addresses
5547  *      are present.
5548  */
5549 void __dev_set_rx_mode(struct net_device *dev)
5550 {
5551         const struct net_device_ops *ops = dev->netdev_ops;
5552
5553         /* dev_open will call this function so the list will stay sane. */
5554         if (!(dev->flags&IFF_UP))
5555                 return;
5556
5557         if (!netif_device_present(dev))
5558                 return;
5559
5560         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5561                 /* Unicast addresses changes may only happen under the rtnl,
5562                  * therefore calling __dev_set_promiscuity here is safe.
5563                  */
5564                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5565                         __dev_set_promiscuity(dev, 1, false);
5566                         dev->uc_promisc = true;
5567                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5568                         __dev_set_promiscuity(dev, -1, false);
5569                         dev->uc_promisc = false;
5570                 }
5571         }
5572
5573         if (ops->ndo_set_rx_mode)
5574                 ops->ndo_set_rx_mode(dev);
5575 }
5576
/* Run __dev_set_rx_mode() on @dev between netif_addr_lock_bh() and
 * netif_addr_unlock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);
        __dev_set_rx_mode(dev);
        netif_addr_unlock_bh(dev);
}
5583
5584 /**
5585  *      dev_get_flags - get flags reported to userspace
5586  *      @dev: device
5587  *
5588  *      Get the combination of flag bits exported through APIs to userspace.
5589  */
5590 unsigned int dev_get_flags(const struct net_device *dev)
5591 {
5592         unsigned int flags;
5593
5594         flags = (dev->flags & ~(IFF_PROMISC |
5595                                 IFF_ALLMULTI |
5596                                 IFF_RUNNING |
5597                                 IFF_LOWER_UP |
5598                                 IFF_DORMANT)) |
5599                 (dev->gflags & (IFF_PROMISC |
5600                                 IFF_ALLMULTI));
5601
5602         if (netif_running(dev)) {
5603                 if (netif_oper_up(dev))
5604                         flags |= IFF_RUNNING;
5605                 if (netif_carrier_ok(dev))
5606                         flags |= IFF_LOWER_UP;
5607                 if (netif_dormant(dev))
5608                         flags |= IFF_DORMANT;
5609         }
5610
5611         return flags;
5612 }
5613 EXPORT_SYMBOL(dev_get_flags);
5614
5615 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5616 {
5617         unsigned int old_flags = dev->flags;
5618         int ret;
5619
5620         ASSERT_RTNL();
5621
5622         /*
5623          *      Set the flags on our device.
5624          */
5625
5626         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5627                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5628                                IFF_AUTOMEDIA)) |
5629                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5630                                     IFF_ALLMULTI));
5631
5632         /*
5633          *      Load in the correct multicast list now the flags have changed.
5634          */
5635
5636         if ((old_flags ^ flags) & IFF_MULTICAST)
5637                 dev_change_rx_flags(dev, IFF_MULTICAST);
5638
5639         dev_set_rx_mode(dev);
5640
5641         /*
5642          *      Have we downed the interface. We handle IFF_UP ourselves
5643          *      according to user attempts to set it, rather than blindly
5644          *      setting it.
5645          */
5646
5647         ret = 0;
5648         if ((old_flags ^ flags) & IFF_UP)
5649                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5650
5651         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5652                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5653                 unsigned int old_flags = dev->flags;
5654
5655                 dev->gflags ^= IFF_PROMISC;
5656
5657                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5658                         if (dev->flags != old_flags)
5659                                 dev_set_rx_mode(dev);
5660         }
5661
5662         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5663            is important. Some (broken) drivers set IFF_PROMISC, when
5664            IFF_ALLMULTI is requested not asking us and not reporting.
5665          */
5666         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5667                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5668
5669                 dev->gflags ^= IFF_ALLMULTI;
5670                 __dev_set_allmulti(dev, inc, false);
5671         }
5672
5673         return ret;
5674 }
5675
5676 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5677                         unsigned int gchanges)
5678 {
5679         unsigned int changes = dev->flags ^ old_flags;
5680
5681         if (gchanges)
5682                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5683
5684         if (changes & IFF_UP) {
5685                 if (dev->flags & IFF_UP)
5686                         call_netdevice_notifiers(NETDEV_UP, dev);
5687                 else
5688                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5689         }
5690
5691         if (dev->flags & IFF_UP &&
5692             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5693                 struct netdev_notifier_change_info change_info;
5694
5695                 change_info.flags_changed = changes;
5696                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5697                                               &change_info.info);
5698         }
5699 }
5700
5701 /**
5702  *      dev_change_flags - change device settings
5703  *      @dev: device
5704  *      @flags: device state flags
5705  *
5706  *      Change settings on device based state flags. The flags are
5707  *      in the userspace exported format.
5708  */
5709 int dev_change_flags(struct net_device *dev, unsigned int flags)
5710 {
5711         int ret;
5712         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5713
5714         ret = __dev_change_flags(dev, flags);
5715         if (ret < 0)
5716                 return ret;
5717
5718         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5719         __dev_notify_flags(dev, old_flags, changes);
5720         return ret;
5721 }
5722 EXPORT_SYMBOL(dev_change_flags);
5723
5724 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5725 {
5726         const struct net_device_ops *ops = dev->netdev_ops;
5727
5728         if (ops->ndo_change_mtu)
5729                 return ops->ndo_change_mtu(dev, new_mtu);
5730
5731         dev->mtu = new_mtu;
5732         return 0;
5733 }
5734
5735 /**
5736  *      dev_set_mtu - Change maximum transfer unit
5737  *      @dev: device
5738  *      @new_mtu: new transfer unit
5739  *
5740  *      Change the maximum transfer size of the network device.
5741  */
5742 int dev_set_mtu(struct net_device *dev, int new_mtu)
5743 {
5744         int err, orig_mtu;
5745
5746         if (new_mtu == dev->mtu)
5747                 return 0;
5748
5749         /*      MTU must be positive.    */
5750         if (new_mtu < 0)
5751                 return -EINVAL;
5752
5753         if (!netif_device_present(dev))
5754                 return -ENODEV;
5755
5756         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5757         err = notifier_to_errno(err);
5758         if (err)
5759                 return err;
5760
5761         orig_mtu = dev->mtu;
5762         err = __dev_set_mtu(dev, new_mtu);
5763
5764         if (!err) {
5765                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5766                 err = notifier_to_errno(err);
5767                 if (err) {
5768                         /* setting mtu back and notifying everyone again,
5769                          * so that they have a chance to revert changes.
5770                          */
5771                         __dev_set_mtu(dev, orig_mtu);
5772                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5773                 }
5774         }
5775         return err;
5776 }
5777 EXPORT_SYMBOL(dev_set_mtu);
5778
/**
 *      dev_set_group - Change group this device belongs to
 *      @dev: device
 *      @new_group: group this device should belong to
 *
 *      Stores @new_group in dev->group. No validation is performed and
 *      no notification is sent.
 */
void dev_set_group(struct net_device *dev, int new_group)
{
        dev->group = new_group;
}
EXPORT_SYMBOL(dev_set_group);
5789
5790 /**
5791  *      dev_set_mac_address - Change Media Access Control Address
5792  *      @dev: device
5793  *      @sa: new address
5794  *
5795  *      Change the hardware (MAC) address of the device
5796  */
5797 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5798 {
5799         const struct net_device_ops *ops = dev->netdev_ops;
5800         int err;
5801
5802         if (!ops->ndo_set_mac_address)
5803                 return -EOPNOTSUPP;
5804         if (sa->sa_family != dev->type)
5805                 return -EINVAL;
5806         if (!netif_device_present(dev))
5807                 return -ENODEV;
5808         err = ops->ndo_set_mac_address(dev, sa);
5809         if (err)
5810                 return err;
5811         dev->addr_assign_type = NET_ADDR_SET;
5812         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5813         add_device_randomness(dev->dev_addr, dev->addr_len);
5814         return 0;
5815 }
5816 EXPORT_SYMBOL(dev_set_mac_address);
5817
5818 /**
5819  *      dev_change_carrier - Change device carrier
5820  *      @dev: device
5821  *      @new_carrier: new value
5822  *
5823  *      Change device carrier
5824  */
5825 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5826 {
5827         const struct net_device_ops *ops = dev->netdev_ops;
5828
5829         if (!ops->ndo_change_carrier)
5830                 return -EOPNOTSUPP;
5831         if (!netif_device_present(dev))
5832                 return -ENODEV;
5833         return ops->ndo_change_carrier(dev, new_carrier);
5834 }
5835 EXPORT_SYMBOL(dev_change_carrier);
5836
5837 /**
5838  *      dev_get_phys_port_id - Get device physical port ID
5839  *      @dev: device
5840  *      @ppid: port ID
5841  *
5842  *      Get device physical port ID
5843  */
5844 int dev_get_phys_port_id(struct net_device *dev,
5845                          struct netdev_phys_item_id *ppid)
5846 {
5847         const struct net_device_ops *ops = dev->netdev_ops;
5848
5849         if (!ops->ndo_get_phys_port_id)
5850                 return -EOPNOTSUPP;
5851         return ops->ndo_get_phys_port_id(dev, ppid);
5852 }
5853 EXPORT_SYMBOL(dev_get_phys_port_id);
5854
5855 /**
5856  *      dev_new_index   -       allocate an ifindex
5857  *      @net: the applicable net namespace
5858  *
5859  *      Returns a suitable unique value for a new device interface
5860  *      number.  The caller must hold the rtnl semaphore or the
5861  *      dev_base_lock to be sure it remains unique.
5862  */
5863 static int dev_new_index(struct net *net)
5864 {
5865         int ifindex = net->ifindex;
5866         for (;;) {
5867                 if (++ifindex <= 0)
5868                         ifindex = 1;
5869                 if (!__dev_get_by_index(net, ifindex))
5870                         return net->ifindex = ifindex;
5871         }
5872 }
5873
/* Delayed registration/unregistration.
 *
 * net_todo_list collects the devices queued by net_set_todo(), linked
 * through their todo_list member.
 */
static LIST_HEAD(net_todo_list);
/* NOTE(review): presumably used to wait for pending unregistrations to
 * finish; no user is visible in this part of the file - confirm.
 */
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5877
5878 static void net_set_todo(struct net_device *dev)
5879 {
5880         list_add_tail(&dev->todo_list, &net_todo_list);
5881         dev_net(dev)->dev_unreg_count++;
5882 }
5883
5884 static void rollback_registered_many(struct list_head *head)
5885 {
5886         struct net_device *dev, *tmp;
5887         LIST_HEAD(close_head);
5888
5889         BUG_ON(dev_boot_phase);
5890         ASSERT_RTNL();
5891
5892         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5893                 /* Some devices call without registering
5894                  * for initialization unwind. Remove those
5895                  * devices and proceed with the remaining.
5896                  */
5897                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5898                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5899                                  dev->name, dev);
5900
5901                         WARN_ON(1);
5902                         list_del(&dev->unreg_list);
5903                         continue;
5904                 }
5905                 dev->dismantle = true;
5906                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5907         }
5908
5909         /* If device is running, close it first. */
5910         list_for_each_entry(dev, head, unreg_list)
5911                 list_add_tail(&dev->close_list, &close_head);
5912         dev_close_many(&close_head);
5913
5914         list_for_each_entry(dev, head, unreg_list) {
5915                 /* And unlink it from device chain. */
5916                 unlist_netdevice(dev);
5917
5918                 dev->reg_state = NETREG_UNREGISTERING;
5919         }
5920
5921         synchronize_net();
5922
5923         list_for_each_entry(dev, head, unreg_list) {
5924                 struct sk_buff *skb = NULL;
5925
5926                 /* Shutdown queueing discipline. */
5927                 dev_shutdown(dev);
5928
5929
5930                 /* Notify protocols, that we are about to destroy
5931                    this device. They should clean all the things.
5932                 */
5933                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5934
5935                 if (!dev->rtnl_link_ops ||
5936                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5937                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
5938                                                      GFP_KERNEL);
5939
5940                 /*
5941                  *      Flush the unicast and multicast chains
5942                  */
5943                 dev_uc_flush(dev);
5944                 dev_mc_flush(dev);
5945
5946                 if (dev->netdev_ops->ndo_uninit)
5947                         dev->netdev_ops->ndo_uninit(dev);
5948
5949                 if (skb)
5950                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
5951
5952                 /* Notifier chain MUST detach us all upper devices. */
5953                 WARN_ON(netdev_has_any_upper_dev(dev));
5954
5955                 /* Remove entries from kobject tree */
5956                 netdev_unregister_kobject(dev);
5957 #ifdef CONFIG_XPS
5958                 /* Remove XPS queueing entries */
5959                 netif_reset_xps_queues_gt(dev, 0);
5960 #endif
5961         }
5962
5963         synchronize_net();
5964
5965         list_for_each_entry(dev, head, unreg_list)
5966                 dev_put(dev);
5967 }
5968
/* Single-device form of rollback_registered_many(): @dev is placed on a
 * temporary on-stack list through its unreg_list member and that list is
 * handed to rollback_registered_many().
 */
static void rollback_registered(struct net_device *dev)
{
        LIST_HEAD(single);

        list_add(&dev->unreg_list, &single);
        rollback_registered_many(&single);
        /* Unhook the on-stack list head again before it goes out of scope. */
        list_del(&single);
}
5977
5978 static netdev_features_t netdev_fix_features(struct net_device *dev,
5979         netdev_features_t features)
5980 {
5981         /* Fix illegal checksum combinations */
5982         if ((features & NETIF_F_HW_CSUM) &&
5983             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5984                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5985                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5986         }
5987
5988         /* TSO requires that SG is present as well. */
5989         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5990                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5991                 features &= ~NETIF_F_ALL_TSO;
5992         }
5993
5994         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5995                                         !(features & NETIF_F_IP_CSUM)) {
5996                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5997                 features &= ~NETIF_F_TSO;
5998                 features &= ~NETIF_F_TSO_ECN;
5999         }
6000
6001         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6002                                          !(features & NETIF_F_IPV6_CSUM)) {
6003                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6004                 features &= ~NETIF_F_TSO6;
6005         }
6006
6007         /* TSO ECN requires that TSO is present as well. */
6008         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6009                 features &= ~NETIF_F_TSO_ECN;
6010
6011         /* Software GSO depends on SG. */
6012         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6013                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6014                 features &= ~NETIF_F_GSO;
6015         }
6016
6017         /* UFO needs SG and checksumming */
6018         if (features & NETIF_F_UFO) {
6019                 /* maybe split UFO into V4 and V6? */
6020                 if (!((features & NETIF_F_GEN_CSUM) ||
6021                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6022                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6023                         netdev_dbg(dev,
6024                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6025                         features &= ~NETIF_F_UFO;
6026                 }
6027
6028                 if (!(features & NETIF_F_SG)) {
6029                         netdev_dbg(dev,
6030                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6031                         features &= ~NETIF_F_UFO;
6032                 }
6033         }
6034
6035 #ifdef CONFIG_NET_RX_BUSY_POLL
6036         if (dev->netdev_ops->ndo_busy_poll)
6037                 features |= NETIF_F_BUSY_POLL;
6038         else
6039 #endif
6040                 features &= ~NETIF_F_BUSY_POLL;
6041
6042         return features;
6043 }
6044
6045 int __netdev_update_features(struct net_device *dev)
6046 {
6047         netdev_features_t features;
6048         int err = 0;
6049
6050         ASSERT_RTNL();
6051
6052         features = netdev_get_wanted_features(dev);
6053
6054         if (dev->netdev_ops->ndo_fix_features)
6055                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6056
6057         /* driver might be less strict about feature dependencies */
6058         features = netdev_fix_features(dev, features);
6059
6060         if (dev->features == features)
6061                 return 0;
6062
6063         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6064                 &dev->features, &features);
6065
6066         if (dev->netdev_ops->ndo_set_features)
6067                 err = dev->netdev_ops->ndo_set_features(dev, features);
6068
6069         if (unlikely(err < 0)) {
6070                 netdev_err(dev,
6071                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6072                         err, &features, &dev->features);
6073                 return -1;
6074         }
6075
6076         if (!err)
6077                 dev->features = features;
6078
6079         return 1;
6080 }
6081
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Re-evaluate the dev->features set via __netdev_update_features()
 *	and, when that reports a non-zero result, send a features-change
 *	notification. Intended to be called after driver or hardware
 *	dependent conditions that influence the features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	if (!__netdev_update_features(dev))
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
6096
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Re-evaluate the dev->features set and send a features-change
 *	notification unconditionally, i.e. even when nothing changed.
 *	Use this rather than netdev_update_features() when
 *	dev->vlan_features might have changed as well, so that the
 *	change can be propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: we always notify. */
	__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
6113
6114 /**
6115  *      netif_stacked_transfer_operstate -      transfer operstate
6116  *      @rootdev: the root or lower level device to transfer state from
6117  *      @dev: the device to transfer operstate to
6118  *
6119  *      Transfer operational state from root to device. This is normally
6120  *      called when a stacking relationship exists between the root
6121  *      device and the device(a leaf device).
6122  */
6123 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6124                                         struct net_device *dev)
6125 {
6126         if (rootdev->operstate == IF_OPER_DORMANT)
6127                 netif_dormant_on(dev);
6128         else
6129                 netif_dormant_off(dev);
6130
6131         if (netif_carrier_ok(rootdev)) {
6132                 if (!netif_carrier_ok(dev))
6133                         netif_carrier_on(dev);
6134         } else {
6135                 if (netif_carrier_ok(dev))
6136                         netif_carrier_off(dev);
6137         }
6138 }
6139 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6140
#ifdef CONFIG_SYSFS
/* Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every entry back at @dev.
 * Returns 0 on success or -ENOMEM if the array cannot be allocated.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	const unsigned int nr = dev->num_rx_queues;
	struct netdev_rx_queue *queues, *q;

	BUG_ON(nr == 0);

	queues = kcalloc(nr, sizeof(*queues), GFP_KERNEL);
	if (queues == NULL)
		return -ENOMEM;

	dev->_rx = queues;

	for (q = queues; q < queues + nr; q++)
		q->dev = dev;

	return 0;
}
#endif
6160
6161 static void netdev_init_one_queue(struct net_device *dev,
6162                                   struct netdev_queue *queue, void *_unused)
6163 {
6164         /* Initialize queue lock */
6165         spin_lock_init(&queue->_xmit_lock);
6166         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6167         queue->xmit_lock_owner = -1;
6168         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6169         queue->dev = dev;
6170 #ifdef CONFIG_BQL
6171         dql_init(&queue->dql, HZ);
6172 #endif
6173 }
6174
6175 static void netif_free_tx_queues(struct net_device *dev)
6176 {
6177         kvfree(dev->_tx);
6178 }
6179
6180 static int netif_alloc_netdev_queues(struct net_device *dev)
6181 {
6182         unsigned int count = dev->num_tx_queues;
6183         struct netdev_queue *tx;
6184         size_t sz = count * sizeof(*tx);
6185
6186         BUG_ON(count < 1 || count > 0xffff);
6187
6188         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6189         if (!tx) {
6190                 tx = vzalloc(sz);
6191                 if (!tx)
6192                         return -ENOMEM;
6193         }
6194         dev->_tx = tx;
6195
6196         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6197         spin_lock_init(&dev->tx_global_lock);
6198
6199         return 0;
6200 }
6201
6202 /**
6203  *      register_netdevice      - register a network device
6204  *      @dev: device to register
6205  *
6206  *      Take a completed network device structure and add it to the kernel
6207  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6208  *      chain. 0 is returned on success. A negative errno code is returned
6209  *      on a failure to set up the device, or if the name is a duplicate.
6210  *
6211  *      Callers must hold the rtnl semaphore. You may want
6212  *      register_netdev() instead of this.
6213  *
6214  *      BUGS:
6215  *      The locking appears insufficient to guarantee two parallel registers
6216  *      will not get the same name.
6217  */
6218
6219 int register_netdevice(struct net_device *dev)
6220 {
6221         int ret;
6222         struct net *net = dev_net(dev);
6223
6224         BUG_ON(dev_boot_phase);
6225         ASSERT_RTNL();
6226
6227         might_sleep();
6228
6229         /* When net_device's are persistent, this will be fatal. */
6230         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6231         BUG_ON(!net);
6232
6233         spin_lock_init(&dev->addr_list_lock);
6234         netdev_set_addr_lockdep_class(dev);
6235
6236         dev->iflink = -1;
6237
6238         ret = dev_get_valid_name(net, dev, dev->name);
6239         if (ret < 0)
6240                 goto out;
6241
6242         /* Init, if this function is available */
6243         if (dev->netdev_ops->ndo_init) {
6244                 ret = dev->netdev_ops->ndo_init(dev);
6245                 if (ret) {
6246                         if (ret > 0)
6247                                 ret = -EIO;
6248                         goto out;
6249                 }
6250         }
6251
6252         if (((dev->hw_features | dev->features) &
6253              NETIF_F_HW_VLAN_CTAG_FILTER) &&
6254             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6255              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6256                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6257                 ret = -EINVAL;
6258                 goto err_uninit;
6259         }
6260
6261         ret = -EBUSY;
6262         if (!dev->ifindex)
6263                 dev->ifindex = dev_new_index(net);
6264         else if (__dev_get_by_index(net, dev->ifindex))
6265                 goto err_uninit;
6266
6267         if (dev->iflink == -1)
6268                 dev->iflink = dev->ifindex;
6269
6270         /* Transfer changeable features to wanted_features and enable
6271          * software offloads (GSO and GRO).
6272          */
6273         dev->hw_features |= NETIF_F_SOFT_FEATURES;
6274         dev->features |= NETIF_F_SOFT_FEATURES;
6275         dev->wanted_features = dev->features & dev->hw_features;
6276
6277         if (!(dev->flags & IFF_LOOPBACK)) {
6278                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6279         }
6280
6281         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6282          */
6283         dev->vlan_features |= NETIF_F_HIGHDMA;
6284
6285         /* Make NETIF_F_SG inheritable to tunnel devices.
6286          */
6287         dev->hw_enc_features |= NETIF_F_SG;
6288
6289         /* Make NETIF_F_SG inheritable to MPLS.
6290          */
6291         dev->mpls_features |= NETIF_F_SG;
6292
6293         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6294         ret = notifier_to_errno(ret);
6295         if (ret)
6296                 goto err_uninit;
6297
6298         ret = netdev_register_kobject(dev);
6299         if (ret)
6300                 goto err_uninit;
6301         dev->reg_state = NETREG_REGISTERED;
6302
6303         __netdev_update_features(dev);
6304
6305         /*
6306          *      Default initial state at registry is that the
6307          *      device is present.
6308          */
6309
6310         set_bit(__LINK_STATE_PRESENT, &dev->state);
6311
6312         linkwatch_init_dev(dev);
6313
6314         dev_init_scheduler(dev);
6315         dev_hold(dev);
6316         list_netdevice(dev);
6317         add_device_randomness(dev->dev_addr, dev->addr_len);
6318
6319         /* If the device has permanent device address, driver should
6320          * set dev_addr and also addr_assign_type should be set to
6321          * NET_ADDR_PERM (default value).
6322          */
6323         if (dev->addr_assign_type == NET_ADDR_PERM)
6324                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6325
6326         /* Notify protocols, that a new device appeared. */
6327         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6328         ret = notifier_to_errno(ret);
6329         if (ret) {
6330                 rollback_registered(dev);
6331                 dev->reg_state = NETREG_UNREGISTERED;
6332         }
6333         /*
6334          *      Prevent userspace races by waiting until the network
6335          *      device is fully setup before sending notifications.
6336          */
6337         if (!dev->rtnl_link_ops ||
6338             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6339                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6340
6341 out:
6342         return ret;
6343
6344 err_uninit:
6345         if (dev->netdev_ops->ndo_uninit)
6346                 dev->netdev_ops->ndo_uninit(dev);
6347         goto out;
6348 }
6349 EXPORT_SYMBOL(register_netdevice);
6350
6351 /**
6352  *      init_dummy_netdev       - init a dummy network device for NAPI
6353  *      @dev: device to init
6354  *
6355  *      This takes a network device structure and initialize the minimum
6356  *      amount of fields so it can be used to schedule NAPI polls without
6357  *      registering a full blown interface. This is to be used by drivers
6358  *      that need to tie several hardware interfaces to a single NAPI
6359  *      poll scheduler due to HW limitations.
6360  */
6361 int init_dummy_netdev(struct net_device *dev)
6362 {
6363         /* Clear everything. Note we don't initialize spinlocks
6364          * are they aren't supposed to be taken by any of the
6365          * NAPI code and this dummy netdev is supposed to be
6366          * only ever used for NAPI polls
6367          */
6368         memset(dev, 0, sizeof(struct net_device));
6369
6370         /* make sure we BUG if trying to hit standard
6371          * register/unregister code path
6372          */
6373         dev->reg_state = NETREG_DUMMY;
6374
6375         /* NAPI wants this */
6376         INIT_LIST_HEAD(&dev->napi_list);
6377
6378         /* a dummy interface is started by default */
6379         set_bit(__LINK_STATE_PRESENT, &dev->state);
6380         set_bit(__LINK_STATE_START, &dev->state);
6381
6382         /* Note : We dont allocate pcpu_refcnt for dummy devices,
6383          * because users of this 'device' dont need to change
6384          * its refcount.
6385          */
6386
6387         return 0;
6388 }
6389 EXPORT_SYMBOL_GPL(init_dummy_netdev);
6390
6391
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice() that takes the rtnl
 *	semaphore for the duration of the call; the device name is expanded
 *	if a format string was passed to alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int rc;

	rtnl_lock();
	rc = register_netdevice(dev);
	rtnl_unlock();

	return rc;
}
EXPORT_SYMBOL(register_netdev);
6415
6416 int netdev_refcnt_read(const struct net_device *dev)
6417 {
6418         int i, refcnt = 0;
6419
6420         for_each_possible_cpu(i)
6421                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6422         return refcnt;
6423 }
6424 EXPORT_SYMBOL(netdev_refcnt_read);
6425
6426 /**
6427  * netdev_wait_allrefs - wait until all references are gone.
6428  * @dev: target net_device
6429  *
6430  * This is called when unregistering network devices.
6431  *
6432  * Any protocol or device that holds a reference should register
6433  * for netdevice notification, and cleanup and put back the
6434  * reference if they receive an UNREGISTER event.
6435  * We can get stuck here if buggy protocols don't correctly
6436  * call dev_put.
6437  */
6438 static void netdev_wait_allrefs(struct net_device *dev)
6439 {
6440         unsigned long rebroadcast_time, warning_time;
6441         int refcnt;
6442
6443         linkwatch_forget_dev(dev);
6444
6445         rebroadcast_time = warning_time = jiffies;
6446         refcnt = netdev_refcnt_read(dev);
6447
6448         while (refcnt != 0) {
6449                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6450                         rtnl_lock();
6451
6452                         /* Rebroadcast unregister notification */
6453                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6454
6455                         __rtnl_unlock();
6456                         rcu_barrier();
6457                         rtnl_lock();
6458
6459                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6460                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6461                                      &dev->state)) {
6462                                 /* We must not have linkwatch events
6463                                  * pending on unregister. If this
6464                                  * happens, we simply run the queue
6465                                  * unscheduled, resulting in a noop
6466                                  * for this device.
6467                                  */
6468                                 linkwatch_run_queue();
6469                         }
6470
6471                         __rtnl_unlock();
6472
6473                         rebroadcast_time = jiffies;
6474                 }
6475
6476                 msleep(250);
6477
6478                 refcnt = netdev_refcnt_read(dev);
6479
6480                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6481                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6482                                  dev->name, refcnt);
6483                         warning_time = jiffies;
6484                 }
6485         }
6486 }
6487
6488 /* The sequence is:
6489  *
6490  *      rtnl_lock();
6491  *      ...
6492  *      register_netdevice(x1);
6493  *      register_netdevice(x2);
6494  *      ...
6495  *      unregister_netdevice(y1);
6496  *      unregister_netdevice(y2);
6497  *      ...
6498  *      rtnl_unlock();
6499  *      free_netdev(y1);
6500  *      free_netdev(y2);
6501  *
6502  * We are invoked by rtnl_unlock().
6503  * This allows us to deal with problems:
6504  * 1) We can delete sysfs objects which invoke hotplug
6505  *    without deadlocking with linkwatch via keventd.
6506  * 2) Since we run with the RTNL semaphore not held, we can sleep
6507  *    safely in order to wait for the netdev refcnt to drop to zero.
6508  *
6509  * We must not return until all unregister events added during
6510  * the interval the lock was held have been completed.
6511  */
6512 void netdev_run_todo(void)
6513 {
6514         struct list_head list;
6515
6516         /* Snapshot list, allow later requests */
6517         list_replace_init(&net_todo_list, &list);
6518
6519         __rtnl_unlock();
6520
6521
6522         /* Wait for rcu callbacks to finish before next phase */
6523         if (!list_empty(&list))
6524                 rcu_barrier();
6525
6526         while (!list_empty(&list)) {
6527                 struct net_device *dev
6528                         = list_first_entry(&list, struct net_device, todo_list);
6529                 list_del(&dev->todo_list);
6530
6531                 rtnl_lock();
6532                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6533                 __rtnl_unlock();
6534
6535                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6536                         pr_err("network todo '%s' but state %d\n",
6537                                dev->name, dev->reg_state);
6538                         dump_stack();
6539                         continue;
6540                 }
6541
6542                 dev->reg_state = NETREG_UNREGISTERED;
6543
6544                 on_each_cpu(flush_backlog, dev, 1);
6545
6546                 netdev_wait_allrefs(dev);
6547
6548                 /* paranoia */
6549                 BUG_ON(netdev_refcnt_read(dev));
6550                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6551                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6552                 WARN_ON(dev->dn_ptr);
6553
6554                 if (dev->destructor)
6555                         dev->destructor(dev);
6556
6557                 /* Report a network device has been unregistered */
6558                 rtnl_lock();
6559                 dev_net(dev)->dev_unreg_count--;
6560                 __rtnl_unlock();
6561                 wake_up(&netdev_unregistering_wq);
6562
6563                 /* Free network device */
6564                 kobject_put(&dev->dev.kobj);
6565         }
6566 }
6567
6568 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6569  * fields in the same order, with only the type differing.
6570  */
6571 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6572                              const struct net_device_stats *netdev_stats)
6573 {
6574 #if BITS_PER_LONG == 64
6575         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6576         memcpy(stats64, netdev_stats, sizeof(*stats64));
6577 #else
6578         size_t i, n = sizeof(*stats64) / sizeof(u64);
6579         const unsigned long *src = (const unsigned long *)netdev_stats;
6580         u64 *dst = (u64 *)stats64;
6581
6582         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6583                      sizeof(*stats64) / sizeof(u64));
6584         for (i = 0; i < n; i++)
6585                 dst[i] = src[i];
6586 #endif
6587 }
6588 EXPORT_SYMBOL(netdev_stats_to_stats64);
6589
6590 /**
6591  *      dev_get_stats   - get network device statistics
6592  *      @dev: device to get statistics from
6593  *      @storage: place to store stats
6594  *
6595  *      Get network statistics from device. Return @storage.
6596  *      The device driver may provide its own method by setting
6597  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6598  *      otherwise the internal statistics structure is used.
6599  */
6600 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6601                                         struct rtnl_link_stats64 *storage)
6602 {
6603         const struct net_device_ops *ops = dev->netdev_ops;
6604
6605         if (ops->ndo_get_stats64) {
6606                 memset(storage, 0, sizeof(*storage));
6607                 ops->ndo_get_stats64(dev, storage);
6608         } else if (ops->ndo_get_stats) {
6609                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6610         } else {
6611                 netdev_stats_to_stats64(storage, &dev->stats);
6612         }
6613         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6614         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6615         return storage;
6616 }
6617 EXPORT_SYMBOL(dev_get_stats);
6618
/* Return the ingress queue of @dev.  With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, initialised with noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the current dev_ingress_queue() value is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
			ingress->qdisc_sleeping = &noop_qdisc;
			/* Publish only once fully initialised. */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
6636
6637 static const struct ethtool_ops default_ethtool_ops;
6638
6639 void netdev_set_default_ethtool_ops(struct net_device *dev,
6640                                     const struct ethtool_ops *ops)
6641 {
6642         if (dev->ethtool_ops == &default_ethtool_ops)
6643                 dev->ethtool_ops = ops;
6644 }
6645 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6646
6647 void netdev_freemem(struct net_device *dev)
6648 {
6649         char *addr = (char *)dev - dev->padded;
6650
6651         kvfree(addr);
6652 }
6653
6654 /**
6655  *      alloc_netdev_mqs - allocate network device
6656  *      @sizeof_priv:           size of private data to allocate space for
6657  *      @name:                  device name format string
6658  *      @name_assign_type:      origin of device name
6659  *      @setup:                 callback to initialize device
6660  *      @txqs:                  the number of TX subqueues to allocate
6661  *      @rxqs:                  the number of RX subqueues to allocate
6662  *
6663  *      Allocates a struct net_device with private data area for driver use
6664  *      and performs basic initialization.  Also allocates subqueue structs
6665  *      for each queue on the device.
6666  */
6667 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6668                 unsigned char name_assign_type,
6669                 void (*setup)(struct net_device *),
6670                 unsigned int txqs, unsigned int rxqs)
6671 {
6672         struct net_device *dev;
6673         size_t alloc_size;
6674         struct net_device *p;
6675
6676         BUG_ON(strlen(name) >= sizeof(dev->name));
6677
6678         if (txqs < 1) {
6679                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6680                 return NULL;
6681         }
6682
6683 #ifdef CONFIG_SYSFS
6684         if (rxqs < 1) {
6685                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6686                 return NULL;
6687         }
6688 #endif
6689
6690         alloc_size = sizeof(struct net_device);
6691         if (sizeof_priv) {
6692                 /* ensure 32-byte alignment of private area */
6693                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6694                 alloc_size += sizeof_priv;
6695         }
6696         /* ensure 32-byte alignment of whole construct */
6697         alloc_size += NETDEV_ALIGN - 1;
6698
6699         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6700         if (!p)
6701                 p = vzalloc(alloc_size);
6702         if (!p)
6703                 return NULL;
6704
6705         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6706         dev->padded = (char *)dev - (char *)p;
6707
6708         dev->pcpu_refcnt = alloc_percpu(int);
6709         if (!dev->pcpu_refcnt)
6710                 goto free_dev;
6711
6712         if (dev_addr_init(dev))
6713                 goto free_pcpu;
6714
6715         dev_mc_init(dev);
6716         dev_uc_init(dev);
6717
6718         dev_net_set(dev, &init_net);
6719
6720         dev->gso_max_size = GSO_MAX_SIZE;
6721         dev->gso_max_segs = GSO_MAX_SEGS;
6722         dev->gso_min_segs = 0;
6723
6724         INIT_LIST_HEAD(&dev->napi_list);
6725         INIT_LIST_HEAD(&dev->unreg_list);
6726         INIT_LIST_HEAD(&dev->close_list);
6727         INIT_LIST_HEAD(&dev->link_watch_list);
6728         INIT_LIST_HEAD(&dev->adj_list.upper);
6729         INIT_LIST_HEAD(&dev->adj_list.lower);
6730         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6731         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6732         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6733         setup(dev);
6734
6735         dev->num_tx_queues = txqs;
6736         dev->real_num_tx_queues = txqs;
6737         if (netif_alloc_netdev_queues(dev))
6738                 goto free_all;
6739
6740 #ifdef CONFIG_SYSFS
6741         dev->num_rx_queues = rxqs;
6742         dev->real_num_rx_queues = rxqs;
6743         if (netif_alloc_rx_queues(dev))
6744                 goto free_all;
6745 #endif
6746
6747         strcpy(dev->name, name);
6748         dev->name_assign_type = name_assign_type;
6749         dev->group = INIT_NETDEV_GROUP;
6750         if (!dev->ethtool_ops)
6751                 dev->ethtool_ops = &default_ethtool_ops;
6752         return dev;
6753
6754 free_all:
6755         free_netdev(dev);
6756         return NULL;
6757
6758 free_pcpu:
6759         free_percpu(dev->pcpu_refcnt);
6760 free_dev:
6761         netdev_freemem(dev);
6762         return NULL;
6763 }
6764 EXPORT_SYMBOL(alloc_netdev_mqs);
6765
6766 /**
6767  *      free_netdev - free network device
6768  *      @dev: device
6769  *
6770  *      This function does the last stage of destroying an allocated device
6771  *      interface. The reference to the device object is released.
6772  *      If this is the last reference then it will be freed.
6773  */
6774 void free_netdev(struct net_device *dev)
6775 {
6776         struct napi_struct *p, *n;
6777
6778         release_net(dev_net(dev));
6779
6780         netif_free_tx_queues(dev);
6781 #ifdef CONFIG_SYSFS
6782         kfree(dev->_rx);
6783 #endif
6784
6785         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6786
6787         /* Flush device addresses */
6788         dev_addr_flush(dev);
6789
6790         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6791                 netif_napi_del(p);
6792
6793         free_percpu(dev->pcpu_refcnt);
6794         dev->pcpu_refcnt = NULL;
6795
6796         /*  Compatibility with error handling in drivers */
6797         if (dev->reg_state == NETREG_UNINITIALIZED) {
6798                 netdev_freemem(dev);
6799                 return;
6800         }
6801
6802         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6803         dev->reg_state = NETREG_RELEASED;
6804
6805         /* will free via device release */
6806         put_device(&dev->dev);
6807 }
6808 EXPORT_SYMBOL(free_netdev);
6809
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	May sleep. The expedited RCU grace period is used when the
 *	rtnl lock is held, the normal one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6825
6826 /**
6827  *      unregister_netdevice_queue - remove device from the kernel
6828  *      @dev: device
6829  *      @head: list
6830  *
6831  *      This function shuts down a device interface and removes it
6832  *      from the kernel tables.
6833  *      If head not NULL, device is queued to be unregistered later.
6834  *
6835  *      Callers must hold the rtnl semaphore.  You may want
6836  *      unregister_netdev() instead of this.
6837  */
6838
6839 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6840 {
6841         ASSERT_RTNL();
6842
6843         if (head) {
6844                 list_move_tail(&dev->unreg_list, head);
6845         } else {
6846                 rollback_registered(dev);
6847                 /* Finish processing unregister after unlock */
6848                 net_set_todo(dev);
6849         }
6850 }
6851 EXPORT_SYMBOL(unregister_netdevice_queue);
6852
6853 /**
6854  *      unregister_netdevice_many - unregister many devices
6855  *      @head: list of devices
6856  *
6857  *  Note: As most callers use a stack allocated list_head,
6858  *  we force a list_del() to make sure stack wont be corrupted later.
6859  */
6860 void unregister_netdevice_many(struct list_head *head)
6861 {
6862         struct net_device *dev;
6863
6864         if (!list_empty(head)) {
6865                 rollback_registered_many(head);
6866                 list_for_each_entry(dev, head, unreg_list)
6867                         net_set_todo(dev);
6868                 list_del(head);
6869         }
6870 }
6871 EXPORT_SYMBOL(unregister_netdevice_many);
6872
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shut down a device interface and remove it from the kernel
 *	tables.
 *
 *	This simply calls unregister_netdevice() with the rtnl
 *	semaphore taken around it.  In general you want to use this
 *	and not unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6891
6892 /**
6893  *      dev_change_net_namespace - move device to different nethost namespace
6894  *      @dev: device
6895  *      @net: network namespace
6896  *      @pat: If not NULL name pattern to try if the current device name
6897  *            is already taken in the destination network namespace.
6898  *
6899  *      This function shuts down a device interface and moves it
6900  *      to a new network namespace. On success 0 is returned, on
6901  *      a failure a netagive errno code is returned.
6902  *
6903  *      Callers must hold the rtnl semaphore.
6904  */
6905
6906 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6907 {
6908         int err;
6909
6910         ASSERT_RTNL();
6911
6912         /* Don't allow namespace local devices to be moved. */
6913         err = -EINVAL;
6914         if (dev->features & NETIF_F_NETNS_LOCAL)
6915                 goto out;
6916
6917         /* Ensure the device has been registrered */
6918         if (dev->reg_state != NETREG_REGISTERED)
6919                 goto out;
6920
6921         /* Get out if there is nothing todo */
6922         err = 0;
6923         if (net_eq(dev_net(dev), net))
6924                 goto out;
6925
6926         /* Pick the destination device name, and ensure
6927          * we can use it in the destination network namespace.
6928          */
6929         err = -EEXIST;
6930         if (__dev_get_by_name(net, dev->name)) {
6931                 /* We get here if we can't use the current device name */
6932                 if (!pat)
6933                         goto out;
6934                 if (dev_get_valid_name(net, dev, pat) < 0)
6935                         goto out;
6936         }
6937
6938         /*
6939          * And now a mini version of register_netdevice unregister_netdevice.
6940          */
6941
6942         /* If device is running close it first. */
6943         dev_close(dev);
6944
6945         /* And unlink it from device chain */
6946         err = -ENODEV;
6947         unlist_netdevice(dev);
6948
6949         synchronize_net();
6950
6951         /* Shutdown queueing discipline. */
6952         dev_shutdown(dev);
6953
6954         /* Notify protocols, that we are about to destroy
6955            this device. They should clean all the things.
6956
6957            Note that dev->reg_state stays at NETREG_REGISTERED.
6958            This is wanted because this way 8021q and macvlan know
6959            the device is just moving and can keep their slaves up.
6960         */
6961         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6962         rcu_barrier();
6963         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6964         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6965
6966         /*
6967          *      Flush the unicast and multicast chains
6968          */
6969         dev_uc_flush(dev);
6970         dev_mc_flush(dev);
6971
6972         /* Send a netdev-removed uevent to the old namespace */
6973         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6974         netdev_adjacent_del_links(dev);
6975
6976         /* Actually switch the network namespace */
6977         dev_net_set(dev, net);
6978
6979         /* If there is an ifindex conflict assign a new one */
6980         if (__dev_get_by_index(net, dev->ifindex)) {
6981                 int iflink = (dev->iflink == dev->ifindex);
6982                 dev->ifindex = dev_new_index(net);
6983                 if (iflink)
6984                         dev->iflink = dev->ifindex;
6985         }
6986
6987         /* Send a netdev-add uevent to the new namespace */
6988         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6989         netdev_adjacent_add_links(dev);
6990
6991         /* Fixup kobjects */
6992         err = device_rename(&dev->dev, dev->name);
6993         WARN_ON(err);
6994
6995         /* Add the device back in the hashes */
6996         list_netdevice(dev);
6997
6998         /* Notify protocols, that a new device appeared. */
6999         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7000
7001         /*
7002          *      Prevent userspace races by waiting until the network
7003          *      device is fully setup before sending notifications.
7004          */
7005         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7006
7007         synchronize_net();
7008         err = 0;
7009 out:
7010         return err;
7011 }
7012 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7013
7014 static int dev_cpu_callback(struct notifier_block *nfb,
7015                             unsigned long action,
7016                             void *ocpu)
7017 {
7018         struct sk_buff **list_skb;
7019         struct sk_buff *skb;
7020         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7021         struct softnet_data *sd, *oldsd;
7022
7023         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7024                 return NOTIFY_OK;
7025
7026         local_irq_disable();
7027         cpu = smp_processor_id();
7028         sd = &per_cpu(softnet_data, cpu);
7029         oldsd = &per_cpu(softnet_data, oldcpu);
7030
7031         /* Find end of our completion_queue. */
7032         list_skb = &sd->completion_queue;
7033         while (*list_skb)
7034                 list_skb = &(*list_skb)->next;
7035         /* Append completion queue from offline CPU. */
7036         *list_skb = oldsd->completion_queue;
7037         oldsd->completion_queue = NULL;
7038
7039         /* Append output queue from offline CPU. */
7040         if (oldsd->output_queue) {
7041                 *sd->output_queue_tailp = oldsd->output_queue;
7042                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7043                 oldsd->output_queue = NULL;
7044                 oldsd->output_queue_tailp = &oldsd->output_queue;
7045         }
7046         /* Append NAPI poll list from offline CPU, with one exception :
7047          * process_backlog() must be called by cpu owning percpu backlog.
7048          * We properly handle process_queue & input_pkt_queue later.
7049          */
7050         while (!list_empty(&oldsd->poll_list)) {
7051                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7052                                                             struct napi_struct,
7053                                                             poll_list);
7054
7055                 list_del_init(&napi->poll_list);
7056                 if (napi->poll == process_backlog)
7057                         napi->state = 0;
7058                 else
7059                         ____napi_schedule(sd, napi);
7060         }
7061
7062         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7063         local_irq_enable();
7064
7065         /* Process offline CPU's input_pkt_queue */
7066         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7067                 netif_rx_internal(skb);
7068                 input_queue_head_incr(oldsd);
7069         }
7070         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7071                 netif_rx_internal(skb);
7072                 input_queue_head_incr(oldsd);
7073         }
7074
7075         return NOTIFY_OK;
7076 }
7077
7078
7079 /**
7080  *      netdev_increment_features - increment feature set by one
7081  *      @all: current feature set
7082  *      @one: new feature set
7083  *      @mask: mask feature set
7084  *
7085  *      Computes a new feature set after adding a device with feature set
7086  *      @one to the master device with current feature set @all.  Will not
7087  *      enable anything that is off in @mask. Returns the new feature set.
7088  */
7089 netdev_features_t netdev_increment_features(netdev_features_t all,
7090         netdev_features_t one, netdev_features_t mask)
7091 {
7092         if (mask & NETIF_F_GEN_CSUM)
7093                 mask |= NETIF_F_ALL_CSUM;
7094         mask |= NETIF_F_VLAN_CHALLENGED;
7095
7096         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7097         all &= one | ~NETIF_F_ALL_FOR_ALL;
7098
7099         /* If one device supports hw checksumming, set for all. */
7100         if (all & NETIF_F_GEN_CSUM)
7101                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7102
7103         return all;
7104 }
7105 EXPORT_SYMBOL(netdev_increment_features);
7106
7107 static struct hlist_head * __net_init netdev_create_hash(void)
7108 {
7109         int i;
7110         struct hlist_head *hash;
7111
7112         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7113         if (hash != NULL)
7114                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7115                         INIT_HLIST_HEAD(&hash[i]);
7116
7117         return hash;
7118 }
7119
7120 /* Initialize per network namespace state */
7121 static int __net_init netdev_init(struct net *net)
7122 {
7123         if (net != &init_net)
7124                 INIT_LIST_HEAD(&net->dev_base_head);
7125
7126         net->dev_name_head = netdev_create_hash();
7127         if (net->dev_name_head == NULL)
7128                 goto err_name;
7129
7130         net->dev_index_head = netdev_create_hash();
7131         if (net->dev_index_head == NULL)
7132                 goto err_idx;
7133
7134         return 0;
7135
7136 err_idx:
7137         kfree(net->dev_name_head);
7138 err_name:
7139         return -ENOMEM;
7140 }
7141
7142 /**
7143  *      netdev_drivername - network driver for the device
7144  *      @dev: network device
7145  *
7146  *      Determine network driver for device.
7147  */
7148 const char *netdev_drivername(const struct net_device *dev)
7149 {
7150         const struct device_driver *driver;
7151         const struct device *parent;
7152         const char *empty = "";
7153
7154         parent = dev->dev.parent;
7155         if (!parent)
7156                 return empty;
7157
7158         driver = parent->driver;
7159         if (driver && driver->name)
7160                 return driver->name;
7161         return empty;
7162 }
7163
7164 static void __netdev_printk(const char *level, const struct net_device *dev,
7165                             struct va_format *vaf)
7166 {
7167         if (dev && dev->dev.parent) {
7168                 dev_printk_emit(level[1] - '0',
7169                                 dev->dev.parent,
7170                                 "%s %s %s%s: %pV",
7171                                 dev_driver_string(dev->dev.parent),
7172                                 dev_name(dev->dev.parent),
7173                                 netdev_name(dev), netdev_reg_state(dev),
7174                                 vaf);
7175         } else if (dev) {
7176                 printk("%s%s%s: %pV",
7177                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7178         } else {
7179                 printk("%s(NULL net_device): %pV", level, vaf);
7180         }
7181 }
7182
7183 void netdev_printk(const char *level, const struct net_device *dev,
7184                    const char *format, ...)
7185 {
7186         struct va_format vaf;
7187         va_list args;
7188
7189         va_start(args, format);
7190
7191         vaf.fmt = format;
7192         vaf.va = &args;
7193
7194         __netdev_printk(level, dev, &vaf);
7195
7196         va_end(args);
7197 }
7198 EXPORT_SYMBOL(netdev_printk);
7199
7200 #define define_netdev_printk_level(func, level)                 \
7201 void func(const struct net_device *dev, const char *fmt, ...)   \
7202 {                                                               \
7203         struct va_format vaf;                                   \
7204         va_list args;                                           \
7205                                                                 \
7206         va_start(args, fmt);                                    \
7207                                                                 \
7208         vaf.fmt = fmt;                                          \
7209         vaf.va = &args;                                         \
7210                                                                 \
7211         __netdev_printk(level, dev, &vaf);                      \
7212                                                                 \
7213         va_end(args);                                           \
7214 }                                                               \
7215 EXPORT_SYMBOL(func);
7216
7217 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7218 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7219 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7220 define_netdev_printk_level(netdev_err, KERN_ERR);
7221 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7222 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7223 define_netdev_printk_level(netdev_info, KERN_INFO);
7224
7225 static void __net_exit netdev_exit(struct net *net)
7226 {
7227         kfree(net->dev_name_head);
7228         kfree(net->dev_index_head);
7229 }
7230
7231 static struct pernet_operations __net_initdata netdev_net_ops = {
7232         .init = netdev_init,
7233         .exit = netdev_exit,
7234 };
7235
7236 static void __net_exit default_device_exit(struct net *net)
7237 {
7238         struct net_device *dev, *aux;
7239         /*
7240          * Push all migratable network devices back to the
7241          * initial network namespace
7242          */
7243         rtnl_lock();
7244         for_each_netdev_safe(net, dev, aux) {
7245                 int err;
7246                 char fb_name[IFNAMSIZ];
7247
7248                 /* Ignore unmoveable devices (i.e. loopback) */
7249                 if (dev->features & NETIF_F_NETNS_LOCAL)
7250                         continue;
7251
7252                 /* Leave virtual devices for the generic cleanup */
7253                 if (dev->rtnl_link_ops)
7254                         continue;
7255
7256                 /* Push remaining network devices to init_net */
7257                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7258                 err = dev_change_net_namespace(dev, &init_net, fb_name);
7259                 if (err) {
7260                         pr_emerg("%s: failed to move %s to init_net: %d\n",
7261                                  __func__, dev->name, err);
7262                         BUG();
7263                 }
7264         }
7265         rtnl_unlock();
7266 }
7267
7268 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7269 {
7270         /* Return with the rtnl_lock held when there are no network
7271          * devices unregistering in any network namespace in net_list.
7272          */
7273         struct net *net;
7274         bool unregistering;
7275         DEFINE_WAIT_FUNC(wait, woken_wake_function);
7276
7277         add_wait_queue(&netdev_unregistering_wq, &wait);
7278         for (;;) {
7279                 unregistering = false;
7280                 rtnl_lock();
7281                 list_for_each_entry(net, net_list, exit_list) {
7282                         if (net->dev_unreg_count > 0) {
7283                                 unregistering = true;
7284                                 break;
7285                         }
7286                 }
7287                 if (!unregistering)
7288                         break;
7289                 __rtnl_unlock();
7290
7291                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7292         }
7293         remove_wait_queue(&netdev_unregistering_wq, &wait);
7294 }
7295
7296 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7297 {
7298         /* At exit all network devices most be removed from a network
7299          * namespace.  Do this in the reverse order of registration.
7300          * Do this across as many network namespaces as possible to
7301          * improve batching efficiency.
7302          */
7303         struct net_device *dev;
7304         struct net *net;
7305         LIST_HEAD(dev_kill_list);
7306
7307         /* To prevent network device cleanup code from dereferencing
7308          * loopback devices or network devices that have been freed
7309          * wait here for all pending unregistrations to complete,
7310          * before unregistring the loopback device and allowing the
7311          * network namespace be freed.
7312          *
7313          * The netdev todo list containing all network devices
7314          * unregistrations that happen in default_device_exit_batch
7315          * will run in the rtnl_unlock() at the end of
7316          * default_device_exit_batch.
7317          */
7318         rtnl_lock_unregistering(net_list);
7319         list_for_each_entry(net, net_list, exit_list) {
7320                 for_each_netdev_reverse(net, dev) {
7321                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7322                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7323                         else
7324                                 unregister_netdevice_queue(dev, &dev_kill_list);
7325                 }
7326         }
7327         unregister_netdevice_many(&dev_kill_list);
7328         rtnl_unlock();
7329 }
7330
7331 static struct pernet_operations __net_initdata default_device_ops = {
7332         .exit = default_device_exit,
7333         .exit_batch = default_device_exit_batch,
7334 };
7335
7336 /*
7337  *      Initialize the DEV module. At boot time this walks the device list and
7338  *      unhooks any devices that fail to initialise (normally hardware not
7339  *      present) and leaves us with a valid list of present and active devices.
7340  *
7341  */
7342
7343 /*
7344  *       This is called single threaded during boot, so no need
7345  *       to take the rtnl semaphore.
7346  */
7347 static int __init net_dev_init(void)
7348 {
7349         int i, rc = -ENOMEM;
7350
7351         BUG_ON(!dev_boot_phase);
7352
7353         if (dev_proc_init())
7354                 goto out;
7355
7356         if (netdev_kobject_init())
7357                 goto out;
7358
7359         INIT_LIST_HEAD(&ptype_all);
7360         for (i = 0; i < PTYPE_HASH_SIZE; i++)
7361                 INIT_LIST_HEAD(&ptype_base[i]);
7362
7363         INIT_LIST_HEAD(&offload_base);
7364
7365         if (register_pernet_subsys(&netdev_net_ops))
7366                 goto out;
7367
7368         /*
7369          *      Initialise the packet receive queues.
7370          */
7371
7372         for_each_possible_cpu(i) {
7373                 struct softnet_data *sd = &per_cpu(softnet_data, i);
7374
7375                 skb_queue_head_init(&sd->input_pkt_queue);
7376                 skb_queue_head_init(&sd->process_queue);
7377                 INIT_LIST_HEAD(&sd->poll_list);
7378                 sd->output_queue_tailp = &sd->output_queue;
7379 #ifdef CONFIG_RPS
7380                 sd->csd.func = rps_trigger_softirq;
7381                 sd->csd.info = sd;
7382                 sd->cpu = i;
7383 #endif
7384
7385                 sd->backlog.poll = process_backlog;
7386                 sd->backlog.weight = weight_p;
7387         }
7388
7389         dev_boot_phase = 0;
7390
7391         /* The loopback device is special if any other network devices
7392          * is present in a network namespace the loopback device must
7393          * be present. Since we now dynamically allocate and free the
7394          * loopback device ensure this invariant is maintained by
7395          * keeping the loopback device as the first device on the
7396          * list of network devices.  Ensuring the loopback devices
7397          * is the first device that appears and the last network device
7398          * that disappears.
7399          */
7400         if (register_pernet_device(&loopback_net_ops))
7401                 goto out;
7402
7403         if (register_pernet_device(&default_device_ops))
7404                 goto out;
7405
7406         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7407         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7408
7409         hotcpu_notifier(dev_cpu_callback, 0);
7410         dst_init();
7411         rc = 0;
7412 out:
7413         return rc;
7414 }
7415
7416 subsys_initcall(net_dev_init);