f1efd17b2614b28cc0be7aff2e5f1fff8f65463f
[linux-drm-fsl-dcu.git] / fs / dlm / lowcomms-tcp.c
1 /******************************************************************************
2 *******************************************************************************
3 **
4 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
5 **  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
6 **
7 **  This copyrighted material is made available to anyone wishing to use,
8 **  modify, copy, or redistribute it subject to the terms and conditions
9 **  of the GNU General Public License v.2.
10 **
11 *******************************************************************************
12 ******************************************************************************/
13
14 /*
15  * lowcomms.c
16  *
17  * This is the "low-level" comms layer.
18  *
19  * It is responsible for sending/receiving messages
20  * from other nodes in the cluster.
21  *
 * Cluster nodes are referred to by their nodeids. nodeids are
 * simply 32 bit numbers to the locking module - if they need to
 * be expanded for the cluster infrastructure then that is its
 * responsibility. It is this layer's
 * responsibility to resolve these into IP addresses or
 * whatever it needs for inter-node communication.
28  *
29  * The comms level is two kernel threads that deal mainly with
30  * the receiving of messages from other nodes and passing them
31  * up to the mid-level comms layer (which understands the
32  * message format) for execution by the locking core, and
33  * a send thread which does all the setting up of connections
34  * to remote nodes and the sending of data. Threads are not allowed
35  * to send their own data because it may cause them to wait in times
36  * of high load. Also, this way, the sending thread can collect together
37  * messages bound for one node and send them in one block.
38  *
39  * I don't see any problem with the recv thread executing the locking
40  * code on behalf of remote processes as the locking code is
41  * short, efficient and never waits.
42  *
43  */
44
45
46 #include <asm/ioctls.h>
47 #include <net/sock.h>
48 #include <net/tcp.h>
49 #include <linux/pagemap.h>
50
51 #include "dlm_internal.h"
52 #include "lowcomms.h"
53 #include "midcomms.h"
54 #include "config.h"
55
/*
 * Bookkeeping for a circular buffer.  Only offsets are kept here; the
 * storage itself is owned by the user of the structure.  Offsets are
 * wrapped with a bit mask, so the buffer size handed to cbuf_init() has
 * to be a power of two for the wrap-around to be correct.
 */
struct cbuf {
        unsigned int base;      /* offset of the first unconsumed byte */
        unsigned int len;       /* number of unconsumed bytes */
        unsigned int mask;      /* buffer size - 1, used to wrap offsets */
};

/* Initial size of the connections array and the slack added when it grows */
#define NODE_INCREMENT 32

/* Record that n more bytes have been stored at the tail of the buffer. */
static void cbuf_add(struct cbuf *cb, int n)
{
        cb->len = cb->len + n;
}

/* Wrapped offset just past the stored data, i.e. where new data goes. */
static int cbuf_data(struct cbuf *cb)
{
        unsigned int tail = cb->base + cb->len;

        return tail & cb->mask;
}

/* Reset the buffer to empty for a backing store of 'size' bytes. */
static void cbuf_init(struct cbuf *cb, int size)
{
        cb->base = 0;
        cb->len = 0;
        cb->mask = size - 1;
}

/* Drop n consumed bytes from the head of the buffer. */
static void cbuf_eat(struct cbuf *cb, int n)
{
        cb->base = (cb->base + n) & cb->mask;
        cb->len -= n;
}

/* True when no unconsumed data is left. */
static bool cbuf_empty(struct cbuf *cb)
{
        return !cb->len;
}
90
/*
 * Maximum number of incoming messages to process before
 * doing a cond_resched().
 * NOTE(review): nothing in the code visible in this file references this
 * constant - confirm whether it is still needed.
 */
#define MAX_RX_MSG_COUNT 25
95
/*
 * State for one socket connection to a node.  The listening socket also
 * uses one of these, stored under nodeid 0 (see listen_for_all()).
 */
struct connection {
        struct socket *sock;    /* NULL if not connected */
        uint32_t nodeid;        /* So we know who we are in the list */
        struct mutex sock_mutex;        /* held while sock and rx_page are used or changed */
        unsigned long flags;    /* bit numbers below, used with the atomic bitops */
#define CF_READ_PENDING 1       /* set when receive work is requested; cleared by process_recv_sockets() */
#define CF_WRITE_PENDING 2      /* set when send work is requested; cleared by process_send_sockets() */
#define CF_CONNECT_PENDING 3    /* set by lowcomms_connect_sock() to ask for a (re)connect */
#define CF_IS_OTHERCON 4        /* set on "othercon" and on the listening connection;
                                   receive_from_sock() does not close such connections */
        struct list_head writequeue;  /* List of outgoing writequeue_entries */
        struct list_head listenlist;  /* List of allocated listening sockets
                                         (not referenced by the code visible in this file) */
        spinlock_t writequeue_lock;     /* protects writequeue and the entries on it */
        int (*rx_action) (struct connection *); /* What to do when active */
        struct page *rx_page;   /* receive buffer page; allocated on demand in receive_from_sock() */
        struct cbuf cb;         /* circular-buffer offsets into rx_page */
        int retries;            /* connect attempts so far; reset by close_connection() */
#define MAX_CONNECT_RETRIES 3
        struct connection *othercon;    /* extra inbound connection from the same node, if any */
        struct work_struct rwork; /* Receive workqueue */
        struct work_struct swork; /* Send workqueue */
};
/* Fetch the connection stored in a struct sock's sk_user_data */
#define sock2con(x) ((struct connection *)(x)->sk_user_data)
118
/*
 * An entry waiting to be sent.  Each entry owns one page; callers reserve
 * space in it with dlm_lowcomms_get_buffer() and release it for sending
 * with dlm_lowcomms_commit_buffer().
 */
struct writequeue_entry {
        struct list_head list;  /* link in connection->writequeue */
        struct page *page;      /* page holding the message data */
        int offset;             /* start of the data not yet sent */
        int len;                /* committed bytes ready to send, starting at offset */
        int end;                /* offset just past the last reserved byte */
        int users;              /* reservations handed out but not yet committed */
        struct connection *con; /* connection this entry is queued on */
};
129
/* Our own address, filled in by dlm_our_addr() in dlm_lowcomms_start() */
static struct sockaddr_storage dlm_local_addr;

/* Work queues */
static struct workqueue_struct *recv_workqueue;
static struct workqueue_struct *send_workqueue;

/* An array of pointers to connections, indexed by NODEID */
static struct connection **connections;
/* Taken by nodeid2con() around every access to the array */
static DECLARE_MUTEX(connections_lock);
/* Slab cache that struct connection objects are allocated from */
static struct kmem_cache *con_cache;
/* Number of slots currently allocated in connections[] */
static int conn_array_size;

/* Work functions; declared early because nodeid2con() refers to them */
static void process_recv_sockets(struct work_struct *work);
static void process_send_sockets(struct work_struct *work);
144
145 static struct connection *nodeid2con(int nodeid, gfp_t allocation)
146 {
147         struct connection *con = NULL;
148
149         down(&connections_lock);
150         if (nodeid >= conn_array_size) {
151                 int new_size = nodeid + NODE_INCREMENT;
152                 struct connection **new_conns;
153
154                 new_conns = kzalloc(sizeof(struct connection *) *
155                                     new_size, allocation);
156                 if (!new_conns)
157                         goto finish;
158
159                 memcpy(new_conns, connections,  sizeof(struct connection *) * conn_array_size);
160                 conn_array_size = new_size;
161                 kfree(connections);
162                 connections = new_conns;
163
164         }
165
166         con = connections[nodeid];
167         if (con == NULL && allocation) {
168                 con = kmem_cache_zalloc(con_cache, allocation);
169                 if (!con)
170                         goto finish;
171
172                 con->nodeid = nodeid;
173                 mutex_init(&con->sock_mutex);
174                 INIT_LIST_HEAD(&con->writequeue);
175                 spin_lock_init(&con->writequeue_lock);
176                 INIT_WORK(&con->swork, process_send_sockets);
177                 INIT_WORK(&con->rwork, process_recv_sockets);
178
179                 connections[nodeid] = con;
180         }
181
182 finish:
183         up(&connections_lock);
184         return con;
185 }
186
187 /* Data available on socket or listen socket received a connect */
188 static void lowcomms_data_ready(struct sock *sk, int count_unused)
189 {
190         struct connection *con = sock2con(sk);
191
192         if (!test_and_set_bit(CF_READ_PENDING, &con->flags))
193                 queue_work(recv_workqueue, &con->rwork);
194 }
195
196 static void lowcomms_write_space(struct sock *sk)
197 {
198         struct connection *con = sock2con(sk);
199
200         if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
201                 queue_work(send_workqueue, &con->swork);
202 }
203
204 static inline void lowcomms_connect_sock(struct connection *con)
205 {
206         if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
207                 queue_work(send_workqueue, &con->swork);
208 }
209
210 static void lowcomms_state_change(struct sock *sk)
211 {
212         if (sk->sk_state == TCP_ESTABLISHED)
213                 lowcomms_write_space(sk);
214 }
215
216 /* Make a socket active */
217 static int add_sock(struct socket *sock, struct connection *con)
218 {
219         con->sock = sock;
220
221         /* Install a data_ready callback */
222         con->sock->sk->sk_data_ready = lowcomms_data_ready;
223         con->sock->sk->sk_write_space = lowcomms_write_space;
224         con->sock->sk->sk_state_change = lowcomms_state_change;
225
226         return 0;
227 }
228
229 /* Add the port number to an IP6 or 4 sockaddr and return the address
230    length */
231 static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
232                           int *addr_len)
233 {
234         saddr->ss_family =  dlm_local_addr.ss_family;
235         if (saddr->ss_family == AF_INET) {
236                 struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
237                 in4_addr->sin_port = cpu_to_be16(port);
238                 *addr_len = sizeof(struct sockaddr_in);
239         } else {
240                 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
241                 in6_addr->sin6_port = cpu_to_be16(port);
242                 *addr_len = sizeof(struct sockaddr_in6);
243         }
244 }
245
246 /* Close a remote connection and tidy up */
247 static void close_connection(struct connection *con, bool and_other)
248 {
249         mutex_lock(&con->sock_mutex);
250
251         if (con->sock) {
252                 sock_release(con->sock);
253                 con->sock = NULL;
254         }
255         if (con->othercon && and_other) {
256                 /* Will only re-enter once. */
257                 close_connection(con->othercon, false);
258         }
259         if (con->rx_page) {
260                 __free_page(con->rx_page);
261                 con->rx_page = NULL;
262         }
263         con->retries = 0;
264         mutex_unlock(&con->sock_mutex);
265 }
266
267 /* Data received from remote end */
268 static int receive_from_sock(struct connection *con)
269 {
270         int ret = 0;
271         struct msghdr msg;
272         struct iovec iov[2];
273         mm_segment_t fs;
274         unsigned len;
275         int r;
276         int call_again_soon = 0;
277
278         mutex_lock(&con->sock_mutex);
279
280         if (con->sock == NULL) {
281                 ret = -EAGAIN;
282                 goto out_close;
283         }
284
285         if (con->rx_page == NULL) {
286                 /*
287                  * This doesn't need to be atomic, but I think it should
288                  * improve performance if it is.
289                  */
290                 con->rx_page = alloc_page(GFP_ATOMIC);
291                 if (con->rx_page == NULL)
292                         goto out_resched;
293                 cbuf_init(&con->cb, PAGE_CACHE_SIZE);
294         }
295
296         msg.msg_control = NULL;
297         msg.msg_controllen = 0;
298         msg.msg_iovlen = 1;
299         msg.msg_iov = iov;
300         msg.msg_name = NULL;
301         msg.msg_namelen = 0;
302         msg.msg_flags = 0;
303
304         /*
305          * iov[0] is the bit of the circular buffer between the current end
306          * point (cb.base + cb.len) and the end of the buffer.
307          */
308         iov[0].iov_len = con->cb.base - cbuf_data(&con->cb);
309         iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb);
310         iov[1].iov_len = 0;
311
312         /*
313          * iov[1] is the bit of the circular buffer between the start of the
314          * buffer and the start of the currently used section (cb.base)
315          */
316         if (cbuf_data(&con->cb) >= con->cb.base) {
317                 iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&con->cb);
318                 iov[1].iov_len = con->cb.base;
319                 iov[1].iov_base = page_address(con->rx_page);
320                 msg.msg_iovlen = 2;
321         }
322         len = iov[0].iov_len + iov[1].iov_len;
323
324         fs = get_fs();
325         set_fs(get_ds());
326         r = ret = sock_recvmsg(con->sock, &msg, len,
327                                MSG_DONTWAIT | MSG_NOSIGNAL);
328         set_fs(fs);
329
330         if (ret <= 0)
331                 goto out_close;
332         if (ret == -EAGAIN)
333                 goto out_resched;
334
335         if (ret == len)
336                 call_again_soon = 1;
337         cbuf_add(&con->cb, ret);
338         ret = dlm_process_incoming_buffer(con->nodeid,
339                                           page_address(con->rx_page),
340                                           con->cb.base, con->cb.len,
341                                           PAGE_CACHE_SIZE);
342         if (ret == -EBADMSG) {
343                 printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, "
344                        "iov_len=%u, iov_base[0]=%p, read=%d\n",
345                        page_address(con->rx_page), con->cb.base, con->cb.len,
346                        len, iov[0].iov_base, r);
347         }
348         if (ret < 0)
349                 goto out_close;
350         cbuf_eat(&con->cb, ret);
351
352         if (cbuf_empty(&con->cb) && !call_again_soon) {
353                 __free_page(con->rx_page);
354                 con->rx_page = NULL;
355         }
356
357         if (call_again_soon)
358                 goto out_resched;
359         mutex_unlock(&con->sock_mutex);
360         return 0;
361
362 out_resched:
363         if (!test_and_set_bit(CF_READ_PENDING, &con->flags))
364                 queue_work(recv_workqueue, &con->rwork);
365         mutex_unlock(&con->sock_mutex);
366         return -EAGAIN;
367
368 out_close:
369         mutex_unlock(&con->sock_mutex);
370         if (ret != -EAGAIN && !test_bit(CF_IS_OTHERCON, &con->flags)) {
371                 close_connection(con, false);
372                 /* Reconnect when there is something to send */
373         }
374         /* Don't return success if we really got EOF */
375         if (ret == 0)
376                 ret = -EAGAIN;
377
378         return ret;
379 }
380
381 /* Listening socket is busy, accept a connection */
382 static int accept_from_sock(struct connection *con)
383 {
384         int result;
385         struct sockaddr_storage peeraddr;
386         struct socket *newsock;
387         int len;
388         int nodeid;
389         struct connection *newcon;
390         struct connection *addcon;
391
392         memset(&peeraddr, 0, sizeof(peeraddr));
393         result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM,
394                                   IPPROTO_TCP, &newsock);
395         if (result < 0)
396                 return -ENOMEM;
397
398         mutex_lock_nested(&con->sock_mutex, 0);
399
400         result = -ENOTCONN;
401         if (con->sock == NULL)
402                 goto accept_err;
403
404         newsock->type = con->sock->type;
405         newsock->ops = con->sock->ops;
406
407         result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK);
408         if (result < 0)
409                 goto accept_err;
410
411         /* Get the connected socket's peer */
412         memset(&peeraddr, 0, sizeof(peeraddr));
413         if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr,
414                                   &len, 2)) {
415                 result = -ECONNABORTED;
416                 goto accept_err;
417         }
418
419         /* Get the new node's NODEID */
420         make_sockaddr(&peeraddr, 0, &len);
421         if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) {
422                 printk("dlm: connect from non cluster node\n");
423                 sock_release(newsock);
424                 mutex_unlock(&con->sock_mutex);
425                 return -1;
426         }
427
428         log_print("got connection from %d", nodeid);
429
430         /*  Check to see if we already have a connection to this node. This
431          *  could happen if the two nodes initiate a connection at roughly
432          *  the same time and the connections cross on the wire.
433          * TEMPORARY FIX:
434          *  In this case we store the incoming one in "othercon"
435          */
436         newcon = nodeid2con(nodeid, GFP_KERNEL);
437         if (!newcon) {
438                 result = -ENOMEM;
439                 goto accept_err;
440         }
441         mutex_lock_nested(&newcon->sock_mutex, 1);
442         if (newcon->sock) {
443                 struct connection *othercon = newcon->othercon;
444
445                 if (!othercon) {
446                         othercon = kmem_cache_zalloc(con_cache, GFP_KERNEL);
447                         if (!othercon) {
448                                 printk("dlm: failed to allocate incoming socket\n");
449                                 mutex_unlock(&newcon->sock_mutex);
450                                 result = -ENOMEM;
451                                 goto accept_err;
452                         }
453                         othercon->nodeid = nodeid;
454                         othercon->rx_action = receive_from_sock;
455                         mutex_init(&othercon->sock_mutex);
456                         INIT_WORK(&othercon->swork, process_send_sockets);
457                         INIT_WORK(&othercon->rwork, process_recv_sockets);
458                         set_bit(CF_IS_OTHERCON, &othercon->flags);
459                         newcon->othercon = othercon;
460                 }
461                 othercon->sock = newsock;
462                 newsock->sk->sk_user_data = othercon;
463                 add_sock(newsock, othercon);
464                 addcon = othercon;
465         }
466         else {
467                 newsock->sk->sk_user_data = newcon;
468                 newcon->rx_action = receive_from_sock;
469                 add_sock(newsock, newcon);
470                 addcon = newcon;
471         }
472
473         mutex_unlock(&newcon->sock_mutex);
474
475         /*
476          * Add it to the active queue in case we got data
477          * beween processing the accept adding the socket
478          * to the read_sockets list
479          */
480         if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags))
481                 queue_work(recv_workqueue, &addcon->rwork);
482         mutex_unlock(&con->sock_mutex);
483
484         return 0;
485
486 accept_err:
487         mutex_unlock(&con->sock_mutex);
488         sock_release(newsock);
489
490         if (result != -EAGAIN)
491                 printk("dlm: error accepting connection from node: %d\n", result);
492         return result;
493 }
494
495 /* Connect a new socket to its peer */
496 static void connect_to_sock(struct connection *con)
497 {
498         int result = -EHOSTUNREACH;
499         struct sockaddr_storage saddr;
500         int addr_len;
501         struct socket *sock;
502
503         if (con->nodeid == 0) {
504                 log_print("attempt to connect sock 0 foiled");
505                 return;
506         }
507
508         mutex_lock(&con->sock_mutex);
509         if (con->retries++ > MAX_CONNECT_RETRIES)
510                 goto out;
511
512         /* Some odd races can cause double-connects, ignore them */
513         if (con->sock) {
514                 result = 0;
515                 goto out;
516         }
517
518         /* Create a socket to communicate with */
519         result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM,
520                                   IPPROTO_TCP, &sock);
521         if (result < 0)
522                 goto out_err;
523
524         memset(&saddr, 0, sizeof(saddr));
525         if (dlm_nodeid_to_addr(con->nodeid, &saddr))
526                 goto out_err;
527
528         sock->sk->sk_user_data = con;
529         con->rx_action = receive_from_sock;
530
531         make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
532
533         add_sock(sock, con);
534
535         log_print("connecting to %d", con->nodeid);
536         result =
537                 sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
538                                    O_NONBLOCK);
539         if (result == -EINPROGRESS)
540                 result = 0;
541         if (result == 0)
542                 goto out;
543
544 out_err:
545         if (con->sock) {
546                 sock_release(con->sock);
547                 con->sock = NULL;
548         }
549         /*
550          * Some errors are fatal and this list might need adjusting. For other
551          * errors we try again until the max number of retries is reached.
552          */
553         if (result != -EHOSTUNREACH && result != -ENETUNREACH &&
554             result != -ENETDOWN && result != EINVAL
555             && result != -EPROTONOSUPPORT) {
556                 lowcomms_connect_sock(con);
557                 result = 0;
558         }
559 out:
560         mutex_unlock(&con->sock_mutex);
561         return;
562 }
563
564 static struct socket *create_listen_sock(struct connection *con,
565                                          struct sockaddr_storage *saddr)
566 {
567         struct socket *sock = NULL;
568         mm_segment_t fs;
569         int result = 0;
570         int one = 1;
571         int addr_len;
572
573         if (dlm_local_addr.ss_family == AF_INET)
574                 addr_len = sizeof(struct sockaddr_in);
575         else
576                 addr_len = sizeof(struct sockaddr_in6);
577
578         /* Create a socket to communicate with */
579         result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &sock);
580         if (result < 0) {
581                 printk("dlm: Can't create listening comms socket\n");
582                 goto create_out;
583         }
584
585         fs = get_fs();
586         set_fs(get_ds());
587         result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
588                                  (char *)&one, sizeof(one));
589         set_fs(fs);
590         if (result < 0) {
591                 printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",
592                        result);
593         }
594         sock->sk->sk_user_data = con;
595         con->rx_action = accept_from_sock;
596         con->sock = sock;
597
598         /* Bind to our port */
599         make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len);
600         result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
601         if (result < 0) {
602                 printk("dlm: Can't bind to port %d\n", dlm_config.ci_tcp_port);
603                 sock_release(sock);
604                 sock = NULL;
605                 con->sock = NULL;
606                 goto create_out;
607         }
608
609         fs = get_fs();
610         set_fs(get_ds());
611
612         result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
613                                  (char *)&one, sizeof(one));
614         set_fs(fs);
615         if (result < 0) {
616                 printk("dlm: Set keepalive failed: %d\n", result);
617         }
618
619         result = sock->ops->listen(sock, 5);
620         if (result < 0) {
621                 printk("dlm: Can't listen on port %d\n", dlm_config.ci_tcp_port);
622                 sock_release(sock);
623                 sock = NULL;
624                 goto create_out;
625         }
626
627 create_out:
628         return sock;
629 }
630
631
632 /* Listen on all interfaces */
633 static int listen_for_all(void)
634 {
635         struct socket *sock = NULL;
636         struct connection *con = nodeid2con(0, GFP_KERNEL);
637         int result = -EINVAL;
638
639         /* We don't support multi-homed hosts */
640         set_bit(CF_IS_OTHERCON, &con->flags);
641
642         sock = create_listen_sock(con, &dlm_local_addr);
643         if (sock) {
644                 add_sock(sock, con);
645                 result = 0;
646         }
647         else {
648                 result = -EADDRINUSE;
649         }
650
651         return result;
652 }
653
654
655
656 static struct writequeue_entry *new_writequeue_entry(struct connection *con,
657                                                      gfp_t allocation)
658 {
659         struct writequeue_entry *entry;
660
661         entry = kmalloc(sizeof(struct writequeue_entry), allocation);
662         if (!entry)
663                 return NULL;
664
665         entry->page = alloc_page(allocation);
666         if (!entry->page) {
667                 kfree(entry);
668                 return NULL;
669         }
670
671         entry->offset = 0;
672         entry->len = 0;
673         entry->end = 0;
674         entry->users = 0;
675         entry->con = con;
676
677         return entry;
678 }
679
680 void *dlm_lowcomms_get_buffer(int nodeid, int len,
681                               gfp_t allocation, char **ppc)
682 {
683         struct connection *con;
684         struct writequeue_entry *e;
685         int offset = 0;
686         int users = 0;
687
688         con = nodeid2con(nodeid, allocation);
689         if (!con)
690                 return NULL;
691
692         spin_lock(&con->writequeue_lock);
693         e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
694         if ((&e->list == &con->writequeue) ||
695             (PAGE_CACHE_SIZE - e->end < len)) {
696                 e = NULL;
697         } else {
698                 offset = e->end;
699                 e->end += len;
700                 users = e->users++;
701         }
702         spin_unlock(&con->writequeue_lock);
703
704         if (e) {
705         got_one:
706                 if (users == 0)
707                         kmap(e->page);
708                 *ppc = page_address(e->page) + offset;
709                 return e;
710         }
711
712         e = new_writequeue_entry(con, allocation);
713         if (e) {
714                 spin_lock(&con->writequeue_lock);
715                 offset = e->end;
716                 e->end += len;
717                 users = e->users++;
718                 list_add_tail(&e->list, &con->writequeue);
719                 spin_unlock(&con->writequeue_lock);
720                 goto got_one;
721         }
722         return NULL;
723 }
724
725 void dlm_lowcomms_commit_buffer(void *mh)
726 {
727         struct writequeue_entry *e = (struct writequeue_entry *)mh;
728         struct connection *con = e->con;
729         int users;
730
731         spin_lock(&con->writequeue_lock);
732         users = --e->users;
733         if (users)
734                 goto out;
735         e->len = e->end - e->offset;
736         kunmap(e->page);
737         spin_unlock(&con->writequeue_lock);
738
739         if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
740                 queue_work(send_workqueue, &con->swork);
741         }
742         return;
743
744 out:
745         spin_unlock(&con->writequeue_lock);
746         return;
747 }
748
749 static void free_entry(struct writequeue_entry *e)
750 {
751         __free_page(e->page);
752         kfree(e);
753 }
754
755 /* Send a message */
756 static void send_to_sock(struct connection *con)
757 {
758         int ret = 0;
759         ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
760         const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
761         struct writequeue_entry *e;
762         int len, offset;
763
764         mutex_lock(&con->sock_mutex);
765         if (con->sock == NULL)
766                 goto out_connect;
767
768         sendpage = con->sock->ops->sendpage;
769
770         spin_lock(&con->writequeue_lock);
771         for (;;) {
772                 e = list_entry(con->writequeue.next, struct writequeue_entry,
773                                list);
774                 if ((struct list_head *) e == &con->writequeue)
775                         break;
776
777                 len = e->len;
778                 offset = e->offset;
779                 BUG_ON(len == 0 && e->users == 0);
780                 spin_unlock(&con->writequeue_lock);
781                 kmap(e->page);
782
783                 ret = 0;
784                 if (len) {
785                         ret = sendpage(con->sock, e->page, offset, len,
786                                        msg_flags);
787                         if (ret == -EAGAIN || ret == 0)
788                                 goto out;
789                         if (ret <= 0)
790                                 goto send_error;
791                 }
792                 else {
793                         /* Don't starve people filling buffers */
794                         cond_resched();
795                 }
796
797                 spin_lock(&con->writequeue_lock);
798                 e->offset += ret;
799                 e->len -= ret;
800
801                 if (e->len == 0 && e->users == 0) {
802                         list_del(&e->list);
803                         kunmap(e->page);
804                         free_entry(e);
805                         continue;
806                 }
807         }
808         spin_unlock(&con->writequeue_lock);
809 out:
810         mutex_unlock(&con->sock_mutex);
811         return;
812
813 send_error:
814         mutex_unlock(&con->sock_mutex);
815         close_connection(con, false);
816         lowcomms_connect_sock(con);
817         return;
818
819 out_connect:
820         mutex_unlock(&con->sock_mutex);
821         connect_to_sock(con);
822         return;
823 }
824
825 static void clean_one_writequeue(struct connection *con)
826 {
827         struct list_head *list;
828         struct list_head *temp;
829
830         spin_lock(&con->writequeue_lock);
831         list_for_each_safe(list, temp, &con->writequeue) {
832                 struct writequeue_entry *e =
833                         list_entry(list, struct writequeue_entry, list);
834                 list_del(&e->list);
835                 free_entry(e);
836         }
837         spin_unlock(&con->writequeue_lock);
838 }
839
840 /* Called from recovery when it knows that a node has
841    left the cluster */
842 int dlm_lowcomms_close(int nodeid)
843 {
844         struct connection *con;
845
846         if (!connections)
847                 goto out;
848
849         log_print("closing connection to node %d", nodeid);
850         con = nodeid2con(nodeid, 0);
851         if (con) {
852                 clean_one_writequeue(con);
853                 close_connection(con, true);
854         }
855         return 0;
856
857 out:
858         return -1;
859 }
860
861 /* Look for activity on active sockets */
862 static void process_recv_sockets(struct work_struct *work)
863 {
864         struct connection *con = container_of(work, struct connection, rwork);
865         int err;
866
867         clear_bit(CF_READ_PENDING, &con->flags);
868         do {
869                 err = con->rx_action(con);
870         } while (!err);
871 }
872
873
874 static void process_send_sockets(struct work_struct *work)
875 {
876         struct connection *con = container_of(work, struct connection, swork);
877
878         if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
879                 connect_to_sock(con);
880         }
881
882         clear_bit(CF_WRITE_PENDING, &con->flags);
883         send_to_sock(con);
884 }
885
886
887 /* Discard all entries on the write queues */
888 static void clean_writequeues(void)
889 {
890         int nodeid;
891
892         for (nodeid = 1; nodeid < conn_array_size; nodeid++) {
893                 struct connection *con = nodeid2con(nodeid, 0);
894
895                 if (con)
896                         clean_one_writequeue(con);
897         }
898 }
899
/* Destroy the receive and send workqueues created by work_start(). */
static void work_stop(void)
{
        destroy_workqueue(recv_workqueue);
        destroy_workqueue(send_workqueue);
}
905
906 static int work_start(void)
907 {
908         int error;
909         recv_workqueue = create_workqueue("dlm_recv");
910         error = IS_ERR(recv_workqueue);
911         if (error) {
912                 log_print("can't start dlm_recv %d", error);
913                 return error;
914         }
915
916         send_workqueue = create_singlethread_workqueue("dlm_send");
917         error = IS_ERR(send_workqueue);
918         if (error) {
919                 log_print("can't start dlm_send %d", error);
920                 destroy_workqueue(recv_workqueue);
921                 return error;
922         }
923
924         return 0;
925 }
926
927 void dlm_lowcomms_stop(void)
928 {
929         int i;
930
931         /* Set all the flags to prevent any
932            socket activity.
933         */
934         for (i = 0; i < conn_array_size; i++) {
935                 if (connections[i])
936                         connections[i]->flags |= 0xFF;
937         }
938
939         work_stop();
940         clean_writequeues();
941
942         for (i = 0; i < conn_array_size; i++) {
943                 if (connections[i]) {
944                         close_connection(connections[i], true);
945                         if (connections[i]->othercon)
946                                 kmem_cache_free(con_cache, connections[i]->othercon);
947                         kmem_cache_free(con_cache, connections[i]);
948                 }
949         }
950
951         kfree(connections);
952         connections = NULL;
953
954         kmem_cache_destroy(con_cache);
955 }
956
957 /* This is quite likely to sleep... */
958 int dlm_lowcomms_start(void)
959 {
960         int error = 0;
961
962         error = -ENOMEM;
963         connections = kzalloc(sizeof(struct connection *) *
964                               NODE_INCREMENT, GFP_KERNEL);
965         if (!connections)
966                 goto out;
967
968         conn_array_size = NODE_INCREMENT;
969
970         if (dlm_our_addr(&dlm_local_addr, 0)) {
971                 log_print("no local IP address has been set");
972                 goto fail_free_conn;
973         }
974         if (!dlm_our_addr(&dlm_local_addr, 1)) {
975                 log_print("This dlm comms module does not support multi-homed clustering");
976                 goto fail_free_conn;
977         }
978
979         con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection),
980                                       __alignof__(struct connection), 0,
981                                       NULL, NULL);
982         if (!con_cache)
983                 goto fail_free_conn;
984
985
986         /* Start listening */
987         error = listen_for_all();
988         if (error)
989                 goto fail_unlisten;
990
991         error = work_start();
992         if (error)
993                 goto fail_unlisten;
994
995         return 0;
996
997 fail_unlisten:
998         close_connection(connections[0], false);
999         kmem_cache_free(con_cache, connections[0]);
1000         kmem_cache_destroy(con_cache);
1001
1002 fail_free_conn:
1003         kfree(connections);
1004
1005 out:
1006         return error;
1007 }
1008
1009 /*
1010  * Overrides for Emacs so that we follow Linus's tabbing style.
1011  * Emacs will notice this stuff at the end of the file and automatically
1012  * adjust the settings for this buffer only.  This must remain at the end
1013  * of the file.
1014  * ---------------------------------------------------------------------------
1015  * Local variables:
1016  * c-file-style: "linux"
1017  * End:
1018  */