Merge git://oss.sgi.com:8090/xfs/xfs-2.6

[linux-drm-fsl-dcu.git] / net / sunrpc / svcsock.c
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c

index 96521f16342b63e80f18f77888fa7b5b06dbd333..2fd0ba2b20dfd5e149d680e0e0abda687fc6229e 100644 (file)
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -32,6 +32,7 @@
  #include <linux/netdevice.h>
  #include <linux/skbuff.h>
  #include <linux/file.h>
+#include <linux/freezer.h>
  #include <net/sock.h>
  #include <net/checksum.h>
  #include <net/ip.h>
@@ -57,10 +58,16 @@
   *     providing that certain rules are followed:
   *
   *     SK_CONN, SK_DATA, can be set or cleared at any time.
- *             after a set, svc_sock_enqueue must be called.   
+ *             after a set, svc_sock_enqueue must be called.
   *             after a clear, the socket must be read/accepted
   *              if this succeeds, it must be set again.
   *     SK_CLOSE can set at any time. It is never cleared.
+ *      sk_inuse contains a bias of '1' until SK_DEAD is set,
+ *             so when sk_inuse hits zero, we know the socket is dead
+ *             and no-one is using it.
+ *      SK_DEAD can only be set while SK_BUSY is held, which ensures
+ *             no other thread will be using the socket or will try to
+ *             set SK_DEAD.
   *
   */
  
@@ -69,6 +76,7 @@
  
  static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *,
                                          int *errp, int pmap_reg);
+static void            svc_delete_socket(struct svc_sock *svsk);
  static void            svc_udp_data_ready(struct sock *, int);
  static int             svc_udp_recvfrom(struct svc_rqst *);
  static int             svc_udp_sendto(struct svc_rqst *);
@@ -84,6 +92,35 @@ static struct cache_deferred_req *svc_defer(struct cache_req *req);
   */
  static int svc_conn_age_period = 6*60;
  
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key svc_key[2];        /* "sk_lock-*-NFSD" classes: [0] AF_INET, [1] AF_INET6 */
+static struct lock_class_key svc_slock_key[2];  /* "slock-*-NFSD" classes, same indexing */
+/* Re-initialise sock's lock classes to the per-address-family "-NFSD" ones. */
+static inline void svc_reclassify_socket(struct socket *sock)
+{
+       struct sock *sk = sock->sk;
+       BUG_ON(sk->sk_lock.owner != NULL);      /* socket lock must have no owner here */
+       switch (sk->sk_family) {
+       case AF_INET:
+               sock_lock_init_class_and_name(sk, "slock-AF_INET-NFSD",
+                   &svc_slock_key[0], "sk_lock-AF_INET-NFSD", &svc_key[0]);
+               break;
+
+       case AF_INET6:
+               sock_lock_init_class_and_name(sk, "slock-AF_INET6-NFSD",
+                   &svc_slock_key[1], "sk_lock-AF_INET6-NFSD", &svc_key[1]);
+               break;
+
+       default:
+               BUG();  /* any family other than AF_INET/AF_INET6 is treated as a bug */
+       }
+}
+#else  /* !CONFIG_DEBUG_LOCK_ALLOC: reclassification is a no-op */
+static inline void svc_reclassify_socket(struct socket *sock)
+{
+}
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+
  /*
   * Queue up an idle server thread.  Must have pool->sp_lock held.
   * Note: this is really a stack rather than a queue, so that we only
@@ -215,7 +252,7 @@ svc_sock_enqueue(struct svc_sock *svsk)
                         svsk->sk_sk, rqstp);
                 svc_thread_dequeue(pool, rqstp);
                 if (rqstp->rq_sock)
-                       printk(KERN_ERR 
+                       printk(KERN_ERR
                                 "svc_sock_enqueue: server %p, rq_sock=%p!\n",
                                 rqstp, rqstp->rq_sock);
                 rqstp->rq_sock = svsk;
@@ -299,9 +336,16 @@ void svc_reserve(struct svc_rqst *rqstp, int space)
  static inline void
  svc_sock_put(struct svc_sock *svsk)
  {
-       if (atomic_dec_and_test(&svsk->sk_inuse) && test_bit(SK_DEAD, &svsk->sk_flags)) {
+       if (atomic_dec_and_test(&svsk->sk_inuse)) {     /* true only when sk_inuse reaches zero */
+               BUG_ON(! test_bit(SK_DEAD, &svsk->sk_flags));
+               /* sk_inuse is zero and SK_DEAD is set: release the socket and free svsk */
                 dprintk("svc: releasing dead socket\n");
-               sock_release(svsk->sk_sock);
+               if (svsk->sk_sock->file)
+                       sockfd_put(svsk->sk_sock);
+               else
+                       sock_release(svsk->sk_sock);
+               if (svsk->sk_info_authunix != NULL)
+                       svcauth_unix_info_release(svsk->sk_info_authunix);
                 kfree(svsk);
         }
  }
@@ -440,7 +484,7 @@ svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
         if (xdr->tail[0].iov_len) {
                 result = kernel_sendpage(sock, rqstp->rq_respages[0],
                                              ((unsigned long)xdr->tail[0].iov_base)
-                                               & (PAGE_SIZE-1),
+                                               & (PAGE_SIZE-1),
                                              xdr->tail[0].iov_len, 0);
  
                 if (result > 0)
@@ -484,7 +528,7 @@ svc_sock_names(char *buf, struct svc_serv *serv, char *toclose)
  
         if (!serv)
                 return 0;
-       spin_lock(&serv->sv_lock);
+       spin_lock_bh(&serv->sv_lock);
         list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) {
                 int onelen = one_sock_name(buf+len, svsk);
                 if (toclose && strcmp(toclose, buf+len) == 0)
@@ -492,12 +536,12 @@ svc_sock_names(char *buf, struct svc_serv *serv, char *toclose)
                 else
                         len += onelen;
         }
-       spin_unlock(&serv->sv_lock);
+       spin_unlock_bh(&serv->sv_lock);
         if (closesk)
                 /* Should unregister with portmap, but you cannot
                  * unregister just one protocol...
                  */
-               svc_delete_socket(closesk);
+               svc_close_socket(closesk);
         else if (toclose)
                 return -ENOENT;
         return len;
@@ -647,6 +691,11 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
                 return svc_deferred_recv(rqstp);
         }
  
+       if (test_bit(SK_CLOSE, &svsk->sk_flags)) {
+               svc_delete_socket(svsk);
+               return 0;
+       }
+
         clear_bit(SK_DATA, &svsk->sk_flags);
         while ((skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err)) == NULL) {
                 if (err == -EAGAIN) {
@@ -662,7 +711,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
                 tv.tv_sec = xtime.tv_sec;
                 tv.tv_usec = xtime.tv_nsec / NSEC_PER_USEC;
                 skb_set_timestamp(skb, &tv);
-               /* Don't enable netstamp, sunrpc doesn't 
+               /* Don't enable netstamp, sunrpc doesn't
                    need that much accuracy */
         }
         skb_get_timestamp(skb, &svsk->sk_sk->sk_stamp);
@@ -694,7 +743,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp)
                         return 0;
                 }
                 local_bh_enable();
-               skb_free_datagram(svsk->sk_sk, skb); 
+               skb_free_datagram(svsk->sk_sk, skb);
         } else {
                 /* we can use it in-place */
                 rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr);
@@ -745,7 +794,7 @@ svc_udp_init(struct svc_sock *svsk)
         svsk->sk_sendto = svc_udp_sendto;
  
         /* initialise setting must have enough space to
-        * receive and respond to one request.  
+        * receive and respond to one request.
          * svc_udp_recvfrom will re-adjust if necessary
          */
         svc_sock_setbufsize(svsk->sk_sock,
@@ -874,7 +923,7 @@ svc_tcp_accept(struct svc_sock *svsk)
         if (ntohs(sin.sin_port) >= 1024) {
                 dprintk(KERN_WARNING
                         "%s: connect from unprivileged port: %u.%u.%u.%u:%d\n",
-                       serv->sv_name, 
+                       serv->sv_name,
                         NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
         }
  
@@ -989,7 +1038,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
                  * on the number of threads which will access the socket.
                  *
                  * rcvbuf just needs to be able to hold a few requests.
-                * Normally they will be removed from the queue 
+                * Normally they will be removed from the queue
                  * as soon a a complete request arrives.
                  */
                 svc_sock_setbufsize(svsk->sk_sock,
@@ -1014,7 +1063,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
  
                 if (len < want) {
                         dprintk("svc: short recvfrom while reading record length (%d of %lu)\n",
-                               len, want);
+                               len, want);
                         svc_sock_received(svsk);
                         return -EAGAIN; /* record header not complete */
                 }
@@ -1026,15 +1075,19 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp)
                          *  bit set in the fragment length header.
                          *  But apparently no known nfs clients send fragmented
                          *  records. */
-                       printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (non-terminal)\n",
-                              (unsigned long) svsk->sk_reclen);
+                       if (net_ratelimit())
+                               printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx"
+                                      " (non-terminal)\n",
+                                      (unsigned long) svsk->sk_reclen);
                         goto err_delete;
                 }
                 svsk->sk_reclen &= 0x7fffffff;
                 dprintk("svc: TCP record, %d bytes\n", svsk->sk_reclen);
                 if (svsk->sk_reclen > serv->sv_max_mesg) {
-                       printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (large)\n",
-                              (unsigned long) svsk->sk_reclen);
+                       if (net_ratelimit())
+                               printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx"
+                                      " (large)\n",
+                                      (unsigned long) svsk->sk_reclen);
                         goto err_delete;
                 }
         }
@@ -1136,7 +1189,8 @@ svc_tcp_sendto(struct svc_rqst *rqstp)
                        rqstp->rq_sock->sk_server->sv_name,
                        (sent<0)?"got error":"sent only",
                        sent, xbufp->len);
-               svc_delete_socket(rqstp->rq_sock);
+               set_bit(SK_CLOSE, &rqstp->rq_sock->sk_flags);
+               svc_sock_enqueue(rqstp->rq_sock);
                 sent = -EAGAIN;
         }
         return sent;
@@ -1167,7 +1221,7 @@ svc_tcp_init(struct svc_sock *svsk)
                 tp->nonagle = 1;        /* disable Nagle's algorithm */
  
                 /* initialise setting must have enough space to
-                * receive and respond to one request.  
+                * receive and respond to one request.
                  * svc_tcp_recvfrom will re-adjust if necessary
                  */
                 svc_sock_setbufsize(svsk->sk_sock,
@@ -1176,7 +1230,7 @@ svc_tcp_init(struct svc_sock *svsk)
  
                 set_bit(SK_CHNGBUF, &svsk->sk_flags);
                 set_bit(SK_DATA, &svsk->sk_flags);
-               if (sk->sk_state != TCP_ESTABLISHED) 
+               if (sk->sk_state != TCP_ESTABLISHED)
                         set_bit(SK_CLOSE, &svsk->sk_flags);
         }
  }
@@ -1192,7 +1246,7 @@ svc_sock_update_bufs(struct svc_serv *serv)
  
         spin_lock_bh(&serv->sv_lock);
         list_for_each(le, &serv->sv_permsocks) {
-               struct svc_sock *svsk = 
+               struct svc_sock *svsk =
                         list_entry(le, struct svc_sock, sk_list);
                 set_bit(SK_CHNGBUF, &svsk->sk_flags);
         }
@@ -1224,11 +1278,11 @@ svc_recv(struct svc_rqst *rqstp, long timeout)
                 rqstp, timeout);
  
         if (rqstp->rq_sock)
-               printk(KERN_ERR 
+               printk(KERN_ERR
                         "svc_recv: service %p, socket not NULL!\n",
                          rqstp);
         if (waitqueue_active(&rqstp->rq_wait))
-               printk(KERN_ERR 
+               printk(KERN_ERR
                         "svc_recv: service %p, wait queue active!\n",
                          rqstp);
  
@@ -1242,6 +1296,8 @@ svc_recv(struct svc_rqst *rqstp, long timeout)
                                 schedule_timeout_uninterruptible(msecs_to_jiffies(500));
                         rqstp->rq_pages[i] = p;
                 }
+       rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
+       BUG_ON(pages >= RPCSVC_MAXPAGES);
  
         /* Make arg->head point to first page and arg->pages point to rest */
         arg = &rqstp->rq_arg;
@@ -1315,7 +1371,7 @@ svc_recv(struct svc_rqst *rqstp, long timeout)
         return len;
  }
  
-/* 
+/*
   * Drop request
   */
  void
@@ -1453,7 +1509,7 @@ svc_setup_socket(struct svc_serv *serv, struct socket *sock,
         svsk->sk_odata = inet->sk_data_ready;
         svsk->sk_owspace = inet->sk_write_space;
         svsk->sk_server = serv;
-       atomic_set(&svsk->sk_inuse, 0);
+       atomic_set(&svsk->sk_inuse, 1);
         svsk->sk_lastrecv = get_seconds();
         spin_lock_init(&svsk->sk_defer_lock);
         INIT_LIST_HEAD(&svsk->sk_deferred);
@@ -1550,6 +1606,8 @@ svc_create_socket(struct svc_serv *serv, int protocol, struct sockaddr_in *sin)
         if ((error = sock_create_kern(PF_INET, type, protocol, &sock)) < 0)
                 return error;
  
+       svc_reclassify_socket(sock);
+
         if (type == SOCK_STREAM)
                 sock->sk->sk_reuse = 1; /* allow address reuse */
         error = kernel_bind(sock, (struct sockaddr *) sin,
@@ -1574,7 +1632,7 @@ bummer:
  /*
   * Remove a dead socket
   */
-void
+static void
  svc_delete_socket(struct svc_sock *svsk)
  {
         struct svc_serv *serv;
@@ -1593,31 +1651,34 @@ svc_delete_socket(struct svc_sock *svsk)
  
         if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags))
                 list_del_init(&svsk->sk_list);
-       /*
+       /*
          * We used to delete the svc_sock from whichever list
          * it's sk_ready node was on, but we don't actually
          * need to.  This is because the only time we're called
          * while still attached to a queue, the queue itself
          * is about to be destroyed (in svc_destroy).
          */
-       if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags))
+       if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) {
+               BUG_ON(atomic_read(&svsk->sk_inuse)<2);
+               atomic_dec(&svsk->sk_inuse);
                 if (test_bit(SK_TEMP, &svsk->sk_flags))
                         serv->sv_tmpcnt--;
-
-       if (!atomic_read(&svsk->sk_inuse)) {
-               spin_unlock_bh(&serv->sv_lock);
-               if (svsk->sk_sock->file)
-                       sockfd_put(svsk->sk_sock);
-               else
-                       sock_release(svsk->sk_sock);
-               if (svsk->sk_info_authunix != NULL)
-                       svcauth_unix_info_release(svsk->sk_info_authunix);
-               kfree(svsk);
-       } else {
-               spin_unlock_bh(&serv->sv_lock);
-               dprintk(KERN_NOTICE "svc: server socket destroy delayed\n");
-               /* svsk->sk_server = NULL; */
         }
+
+       spin_unlock_bh(&serv->sv_lock);
+}
+/* Flag svsk with SK_CLOSE and, if SK_BUSY can be taken, delete it right away. */
+void svc_close_socket(struct svc_sock *svsk)
+{
+       set_bit(SK_CLOSE, &svsk->sk_flags);
+       if (test_and_set_bit(SK_BUSY, &svsk->sk_flags))
+               /* someone else will have to effect the close */
+               return;
+       /* SK_BUSY was clear and is now set by us; take a reference before deleting */
+       atomic_inc(&svsk->sk_inuse);
+       svc_delete_socket(svsk);
+       clear_bit(SK_BUSY, &svsk->sk_flags);
+       svc_sock_put(svsk);     /* drops the reference taken above; frees svsk if it was the last */
+}
  
  /*
@@ -1636,7 +1697,7 @@ svc_makesock(struct svc_serv *serv, int protocol, unsigned short port)
  }
  
  /*
- * Handle defer and revisit of requests 
+ * Handle defer and revisit of requests
   */
  
  static void svc_revisit(struct cache_deferred_req *dreq, int too_many)
@@ -1715,7 +1776,7 @@ static int svc_deferred_recv(struct svc_rqst *rqstp)
  static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk)
  {
         struct svc_deferred_req *dr = NULL;
-       
+
         if (!test_bit(SK_DEFERRED, &svsk->sk_flags))
                 return NULL;
         spin_lock_bh(&svsk->sk_defer_lock);