Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jlbec...

author Linus Torvalds <torvalds@linux-foundation.org>

Thu, 1 Dec 2011 22:55:34 +0000 (14:55 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 1 Dec 2011 22:55:34 +0000 (14:55 -0800)
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 1 Dec 2011 22:55:34 +0000 (14:55 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 1 Dec 2011 22:55:34 +0000 (14:55 -0800)
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c

index ed553c60de827e0ebad24e3501e0e00d21c82cfc..3165aebb43c87934b743ecf08e5f02cef586d771 100644 (file)
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5699,7 +5699,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
                                            OCFS2_JOURNAL_ACCESS_WRITE);
         if (ret) {
                 mlog_errno(ret);
-               goto out;
+               goto out_commit;
         }
  
         dquot_free_space_nodirty(inode,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c

index c1efe939c774e2c9b909892f6c95434d6da760ee..78b68af3b0e32627b1874277d8ae58003501acb5 100644 (file)
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -290,7 +290,15 @@ static int ocfs2_readpage(struct file *file, struct page *page)
         }
  
         if (down_read_trylock(&oi->ip_alloc_sem) == 0) {
+               /*
+                * Unlock the page and cycle ip_alloc_sem so that we don't
+                * busyloop waiting for ip_alloc_sem to unlock
+                */
                 ret = AOP_TRUNCATED_PAGE;
+               unlock_page(page);
+               unlock = 0;
+               down_read(&oi->ip_alloc_sem);
+               up_read(&oi->ip_alloc_sem);
                 goto out_inode_unlock;
         }
  
@@ -563,6 +571,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
  {
         struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
         int level;
+       wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
  
         /* this io's submitter should not have unlocked this before we could */
         BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
@@ -570,6 +579,15 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
         if (ocfs2_iocb_is_sem_locked(iocb))
                 ocfs2_iocb_clear_sem_locked(iocb);
  
+       if (ocfs2_iocb_is_unaligned_aio(iocb)) {
+               ocfs2_iocb_clear_unaligned_aio(iocb);
+
+               if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) &&
+                   waitqueue_active(wq)) {
+                       wake_up_all(wq);
+               }
+       }
+
         ocfs2_iocb_clear_rw_locked(iocb);
  
         level = ocfs2_iocb_rw_locked_level(iocb);
@@ -862,6 +880,12 @@ struct ocfs2_write_ctxt {
         struct page                     *w_pages[OCFS2_MAX_CTXT_PAGES];
         struct page                     *w_target_page;
  
+       /*
+        * w_target_locked is used for page_mkwrite path indicating no unlocking
+        * against w_target_page in ocfs2_write_end_nolock.
+        */
+       unsigned int                    w_target_locked:1;
+
         /*
          * ocfs2_write_end() uses this to know what the real range to
          * write in the target should be.
@@ -895,6 +919,24 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
  
  static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
  {
+       int i;
+
+       /*
+        * w_target_locked is only set to true in the page_mkwrite() case.
+        * The intent is to allow us to lock the target page from write_begin()
+        * to write_end(). The caller must hold a ref on w_target_page.
+        */
+       if (wc->w_target_locked) {
+               BUG_ON(!wc->w_target_page);
+               for (i = 0; i < wc->w_num_pages; i++) {
+                       if (wc->w_target_page == wc->w_pages[i]) {
+                               wc->w_pages[i] = NULL;
+                               break;
+                       }
+               }
+               mark_page_accessed(wc->w_target_page);
+               page_cache_release(wc->w_target_page);
+       }
         ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
  
         brelse(wc->w_di_bh);
@@ -1132,20 +1174,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
                          */
                         lock_page(mmap_page);
  
+                       /* Exit and let the caller retry */
                         if (mmap_page->mapping != mapping) {
+                               WARN_ON(mmap_page->mapping);
                                 unlock_page(mmap_page);
-                               /*
-                                * Sanity check - the locking in
-                                * ocfs2_pagemkwrite() should ensure
-                                * that this code doesn't trigger.
-                                */
-                               ret = -EINVAL;
-                               mlog_errno(ret);
+                               ret = -EAGAIN;
                                 goto out;
                         }
  
                         page_cache_get(mmap_page);
                         wc->w_pages[i] = mmap_page;
+                       wc->w_target_locked = true;
                 } else {
                         wc->w_pages[i] = find_or_create_page(mapping, index,
                                                              GFP_NOFS);
@@ -1160,6 +1199,8 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
                         wc->w_target_page = wc->w_pages[i];
         }
  out:
+       if (ret)
+               wc->w_target_locked = false;
         return ret;
  }
  
@@ -1817,11 +1858,23 @@ try_again:
          */
         ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
                                          cluster_of_pages, mmap_page);
-       if (ret) {
+       if (ret && ret != -EAGAIN) {
                 mlog_errno(ret);
                 goto out_quota;
         }
  
+       /*
+        * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
+        * the target page. In this case, we exit with no error and no target
+        * page. This will trigger the caller, page_mkwrite(), to re-try
+        * the operation.
+        */
+       if (ret == -EAGAIN) {
+               BUG_ON(wc->w_target_page);
+               ret = 0;
+               goto out_quota;
+       }
+
         ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
                                           len);
         if (ret) {
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h

index 75cf3ad987a66d911c15234a803243185ccc5a94..ffb2da370a99d05dd4b919fc64a5483dbc2df7a3 100644 (file)
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -78,6 +78,7 @@ enum ocfs2_iocb_lock_bits {
         OCFS2_IOCB_RW_LOCK = 0,
         OCFS2_IOCB_RW_LOCK_LEVEL,
         OCFS2_IOCB_SEM,
+       OCFS2_IOCB_UNALIGNED_IO,
         OCFS2_IOCB_NUM_LOCKS
  };
  
@@ -91,4 +92,17 @@ enum ocfs2_iocb_lock_bits {
         clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
  #define ocfs2_iocb_is_sem_locked(iocb) \
         test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+
+#define ocfs2_iocb_set_unaligned_aio(iocb) \
+       set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_unaligned_aio(iocb) \
+       clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_is_unaligned_aio(iocb) \
+       test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+
+#define OCFS2_IOEND_WQ_HASH_SZ 37
+#define ocfs2_ioend_wq(v)   (&ocfs2__ioend_wq[((unsigned long)(v)) %\
+                                           OCFS2_IOEND_WQ_HASH_SZ])
+extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
+
  #endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c

index 9a3e6bbff27bd4839b487c2c14282fd9ef1a4675..a4e855e3690e6ab844d37788b71649975321cb19 100644 (file)
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -216,6 +216,7 @@ struct o2hb_region {
  
         struct list_head        hr_all_item;
         unsigned                hr_unclean_stop:1,
+                               hr_aborted_start:1,
                                 hr_item_pinned:1,
                                 hr_item_dropped:1;
  
@@ -254,6 +255,10 @@ struct o2hb_region {
          * a more complete api that doesn't lead to this sort of fragility. */
         atomic_t                hr_steady_iterations;
  
+       /* terminate o2hb thread if it does not reach steady state
+        * (hr_steady_iterations == 0) within hr_unsteady_iterations */
+       atomic_t                hr_unsteady_iterations;
+
         char                    hr_dev_name[BDEVNAME_SIZE];
  
         unsigned int            hr_timeout_ms;
@@ -324,6 +329,10 @@ static void o2hb_write_timeout(struct work_struct *work)
  
  static void o2hb_arm_write_timeout(struct o2hb_region *reg)
  {
+       /* Arm writeout only after thread reaches steady state */
+       if (atomic_read(&reg->hr_steady_iterations) != 0)
+               return;
+
         mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
              O2HB_MAX_WRITE_TIMEOUT_MS);
  
@@ -537,9 +546,14 @@ static int o2hb_verify_crc(struct o2hb_region *reg,
         return read == computed;
  }
  
-/* We want to make sure that nobody is heartbeating on top of us --
- * this will help detect an invalid configuration. */
-static void o2hb_check_last_timestamp(struct o2hb_region *reg)
+/*
+ * Compare the slot data with what we wrote in the last iteration.
+ * If the match fails, print an appropriate error message. This is to
+ * detect errors like... another node hearting on the same slot,
+ * flaky device that is losing writes, etc.
+ * Returns 1 if check succeeds, 0 otherwise.
+ */
+static int o2hb_check_own_slot(struct o2hb_region *reg)
  {
         struct o2hb_disk_slot *slot;
         struct o2hb_disk_heartbeat_block *hb_block;
@@ -548,13 +562,13 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg)
         slot = &reg->hr_slots[o2nm_this_node()];
         /* Don't check on our 1st timestamp */
         if (!slot->ds_last_time)
-               return;
+               return 0;
  
         hb_block = slot->ds_raw_block;
         if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time &&
             le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation &&
             hb_block->hb_node == slot->ds_node_num)
-               return;
+               return 1;
  
  #define ERRSTR1                "Another node is heartbeating on device"
  #define ERRSTR2                "Heartbeat generation mismatch on device"
@@ -574,6 +588,8 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg)
              (unsigned long long)slot->ds_last_time, hb_block->hb_node,
              (unsigned long long)le64_to_cpu(hb_block->hb_generation),
              (unsigned long long)le64_to_cpu(hb_block->hb_seq));
+
+       return 0;
  }
  
  static inline void o2hb_prepare_block(struct o2hb_region *reg,
@@ -719,17 +735,24 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
         o2nm_node_put(node);
  }
  
-static void o2hb_set_quorum_device(struct o2hb_region *reg,
-                                  struct o2hb_disk_slot *slot)
+static void o2hb_set_quorum_device(struct o2hb_region *reg)
  {
-       assert_spin_locked(&o2hb_live_lock);
-
         if (!o2hb_global_heartbeat_active())
                 return;
  
-       if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+       /* Prevent race with o2hb_heartbeat_group_drop_item() */
+       if (kthread_should_stop())
+               return;
+
+       /* Tag region as quorum only after thread reaches steady state */
+       if (atomic_read(&reg->hr_steady_iterations) != 0)
                 return;
  
+       spin_lock(&o2hb_live_lock);
+
+       if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+               goto unlock;
+
         /*
          * A region can be added to the quorum only when it sees all
          * live nodes heartbeat on it. In other words, the region has been
@@ -737,13 +760,10 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
          */
         if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
                    sizeof(o2hb_live_node_bitmap)))
-               return;
-
-       if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD)
-               return;
+               goto unlock;
  
-       printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n",
-              config_item_name(&reg->hr_item));
+       printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n",
+              config_item_name(&reg->hr_item), reg->hr_dev_name);
  
         set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
  
@@ -754,6 +774,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
         if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
                            O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
                 o2hb_region_unpin(NULL);
+unlock:
+       spin_unlock(&o2hb_live_lock);
  }
  
  static int o2hb_check_slot(struct o2hb_region *reg,
@@ -925,8 +947,6 @@ fire_callbacks:
                 slot->ds_equal_samples = 0;
         }
  out:
-       o2hb_set_quorum_device(reg, slot);
-
         spin_unlock(&o2hb_live_lock);
  
         o2hb_run_event_list(&event);
@@ -957,7 +977,8 @@ static int o2hb_highest_node(unsigned long *nodes,
  
  static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
  {
-       int i, ret, highest_node, change = 0;
+       int i, ret, highest_node;
+       int membership_change = 0, own_slot_ok = 0;
         unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
         unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
         struct o2hb_bio_wait_ctxt write_wc;
@@ -966,7 +987,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
                                        sizeof(configured_nodes));
         if (ret) {
                 mlog_errno(ret);
-               return ret;
+               goto bail;
         }
  
         /*
@@ -982,8 +1003,9 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
  
         highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
         if (highest_node >= O2NM_MAX_NODES) {
-               mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
-               return -EINVAL;
+               mlog(ML_NOTICE, "o2hb: No configured nodes found!\n");
+               ret = -EINVAL;
+               goto bail;
         }
  
         /* No sense in reading the slots of nodes that don't exist
@@ -993,29 +1015,27 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
         ret = o2hb_read_slots(reg, highest_node + 1);
         if (ret < 0) {
                 mlog_errno(ret);
-               return ret;
+               goto bail;
         }
  
         /* With an up to date view of the slots, we can check that no
          * other node has been improperly configured to heartbeat in
          * our slot. */
-       o2hb_check_last_timestamp(reg);
+       own_slot_ok = o2hb_check_own_slot(reg);
  
         /* fill in the proper info for our next heartbeat */
         o2hb_prepare_block(reg, reg->hr_generation);
  
-       /* And fire off the write. Note that we don't wait on this I/O
-        * until later. */
         ret = o2hb_issue_node_write(reg, &write_wc);
         if (ret < 0) {
                 mlog_errno(ret);
-               return ret;
+               goto bail;
         }
  
         i = -1;
         while((i = find_next_bit(configured_nodes,
                                  O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
-               change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
+               membership_change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
         }
  
         /*
@@ -1030,18 +1050,39 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
                  * disk */
                 mlog(ML_ERROR, "Write error %d on device \"%s\"\n",
                      write_wc.wc_error, reg->hr_dev_name);
-               return write_wc.wc_error;
+               ret = write_wc.wc_error;
+               goto bail;
         }
  
-       o2hb_arm_write_timeout(reg);
+       /* Skip disarming the timeout if own slot has stale/bad data */
+       if (own_slot_ok) {
+               o2hb_set_quorum_device(reg);
+               o2hb_arm_write_timeout(reg);
+       }
  
+bail:
         /* let the person who launched us know when things are steady */
-       if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) {
-               if (atomic_dec_and_test(&reg->hr_steady_iterations))
+       if (atomic_read(&reg->hr_steady_iterations) != 0) {
+               if (!ret && own_slot_ok && !membership_change) {
+                       if (atomic_dec_and_test(&reg->hr_steady_iterations))
+                               wake_up(&o2hb_steady_queue);
+               }
+       }
+
+       if (atomic_read(&reg->hr_steady_iterations) != 0) {
+               if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) {
+                       printk(KERN_NOTICE "o2hb: Unable to stabilize "
+                              "heartbeart on region %s (%s)\n",
+                              config_item_name(&reg->hr_item),
+                              reg->hr_dev_name);
+                       atomic_set(&reg->hr_steady_iterations, 0);
+                       reg->hr_aborted_start = 1;
                         wake_up(&o2hb_steady_queue);
+                       ret = -EIO;
+               }
         }
  
-       return 0;
+       return ret;
  }
  
  /* Subtract b from a, storing the result in a. a *must* have a larger
@@ -1095,7 +1136,8 @@ static int o2hb_thread(void *data)
         /* Pin node */
         o2nm_depend_this_node();
  
-       while (!kthread_should_stop() && !reg->hr_unclean_stop) {
+       while (!kthread_should_stop() &&
+              !reg->hr_unclean_stop && !reg->hr_aborted_start) {
                 /* We track the time spent inside
                  * o2hb_do_disk_heartbeat so that we avoid more than
                  * hr_timeout_ms between disk writes. On busy systems
@@ -1103,10 +1145,7 @@ static int o2hb_thread(void *data)
                  * likely to time itself out. */
                 do_gettimeofday(&before_hb);
  
-               i = 0;
-               do {
-                       ret = o2hb_do_disk_heartbeat(reg);
-               } while (ret && ++i < 2);
+               ret = o2hb_do_disk_heartbeat(reg);
  
                 do_gettimeofday(&after_hb);
                 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
@@ -1117,7 +1156,8 @@ static int o2hb_thread(void *data)
                      after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
                      elapsed_msec);
  
-               if (elapsed_msec < reg->hr_timeout_ms) {
+               if (!kthread_should_stop() &&
+                   elapsed_msec < reg->hr_timeout_ms) {
                         /* the kthread api has blocked signals for us so no
                          * need to record the return value. */
                         msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
@@ -1134,20 +1174,20 @@ static int o2hb_thread(void *data)
          * to timeout on this region when we could just as easily
          * write a clear generation - thus indicating to them that
          * this node has left this region.
-        *
-        * XXX: Should we skip this on unclean_stop? */
-       o2hb_prepare_block(reg, 0);
-       ret = o2hb_issue_node_write(reg, &write_wc);
-       if (ret == 0) {
-               o2hb_wait_on_io(reg, &write_wc);
-       } else {
-               mlog_errno(ret);
+        */
+       if (!reg->hr_unclean_stop && !reg->hr_aborted_start) {
+               o2hb_prepare_block(reg, 0);
+               ret = o2hb_issue_node_write(reg, &write_wc);
+               if (ret == 0)
+                       o2hb_wait_on_io(reg, &write_wc);
+               else
+                       mlog_errno(ret);
         }
  
         /* Unpin node */
         o2nm_undepend_this_node();
  
-       mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");
+       mlog(ML_HEARTBEAT|ML_KTHREAD, "o2hb thread exiting\n");
  
         return 0;
  }
@@ -1158,6 +1198,7 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
         struct o2hb_debug_buf *db = inode->i_private;
         struct o2hb_region *reg;
         unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+       unsigned long lts;
         char *buf = NULL;
         int i = -1;
         int out = 0;
@@ -1194,9 +1235,11 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
  
         case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
                 reg = (struct o2hb_region *)db->db_data;
-               out += snprintf(buf + out, PAGE_SIZE - out, "%u\n",
-                               jiffies_to_msecs(jiffies -
-                                                reg->hr_last_timeout_start));
+               lts = reg->hr_last_timeout_start;
+               /* If 0, it has never been set before */
+               if (lts)
+                       lts = jiffies_to_msecs(jiffies - lts);
+               out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts);
                 goto done;
  
         case O2HB_DB_TYPE_REGION_PINNED:
@@ -1426,6 +1469,8 @@ static void o2hb_region_release(struct config_item *item)
         struct page *page;
         struct o2hb_region *reg = to_o2hb_region(item);
  
+       mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name);
+
         if (reg->hr_tmp_block)
                 kfree(reg->hr_tmp_block);
  
@@ -1792,7 +1837,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
                         live_threshold <<= 1;
                 spin_unlock(&o2hb_live_lock);
         }
-       atomic_set(&reg->hr_steady_iterations, live_threshold + 1);
+       ++live_threshold;
+       atomic_set(&reg->hr_steady_iterations, live_threshold);
+       /* unsteady_iterations is double the steady_iterations */
+       atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1));
  
         hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
                               reg->hr_item.ci_name);
@@ -1809,14 +1857,12 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
         ret = wait_event_interruptible(o2hb_steady_queue,
                                 atomic_read(&reg->hr_steady_iterations) == 0);
         if (ret) {
-               /* We got interrupted (hello ptrace!).  Clean up */
-               spin_lock(&o2hb_live_lock);
-               hb_task = reg->hr_task;
-               reg->hr_task = NULL;
-               spin_unlock(&o2hb_live_lock);
+               atomic_set(&reg->hr_steady_iterations, 0);
+               reg->hr_aborted_start = 1;
+       }
  
-               if (hb_task)
-                       kthread_stop(hb_task);
+       if (reg->hr_aborted_start) {
+               ret = -EIO;
                 goto out;
         }
  
@@ -1833,8 +1879,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
                 ret = -EIO;
  
         if (hb_task && o2hb_global_heartbeat_active())
-               printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n",
-                      config_item_name(&reg->hr_item));
+               printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n",
+                      config_item_name(&reg->hr_item), reg->hr_dev_name);
  
  out:
         if (filp)
@@ -2092,13 +2138,6 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
  
         /* stop the thread when the user removes the region dir */
         spin_lock(&o2hb_live_lock);
-       if (o2hb_global_heartbeat_active()) {
-               clear_bit(reg->hr_region_num, o2hb_region_bitmap);
-               clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
-               if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
-                       quorum_region = 1;
-               clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
-       }
         hb_task = reg->hr_task;
         reg->hr_task = NULL;
         reg->hr_item_dropped = 1;
@@ -2107,19 +2146,30 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
         if (hb_task)
                 kthread_stop(hb_task);
  
+       if (o2hb_global_heartbeat_active()) {
+               spin_lock(&o2hb_live_lock);
+               clear_bit(reg->hr_region_num, o2hb_region_bitmap);
+               clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
+               if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
+                       quorum_region = 1;
+               clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
+               spin_unlock(&o2hb_live_lock);
+               printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n",
+                      ((atomic_read(&reg->hr_steady_iterations) == 0) ?
+                       "stopped" : "start aborted"), config_item_name(item),
+                      reg->hr_dev_name);
+       }
+
         /*
          * If we're racing a dev_write(), we need to wake them.  They will
          * check reg->hr_task
          */
         if (atomic_read(&reg->hr_steady_iterations) != 0) {
+               reg->hr_aborted_start = 1;
                 atomic_set(&reg->hr_steady_iterations, 0);
                 wake_up(&o2hb_steady_queue);
         }
  
-       if (o2hb_global_heartbeat_active())
-               printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
-                      config_item_name(&reg->hr_item));
-
         config_item_put(item);
  
         if (!o2hb_global_heartbeat_active() || !quorum_region)
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c

index 3a5835904b3db4d522c561908171f528bb74f6bd..dc45deb19e6885e56a1f5be46cbd39444c46f810 100644 (file)
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -47,6 +47,7 @@
  #define SC_DEBUG_NAME          "sock_containers"
  #define NST_DEBUG_NAME         "send_tracking"
  #define STATS_DEBUG_NAME       "stats"
+#define NODES_DEBUG_NAME       "connected_nodes"
  
  #define SHOW_SOCK_CONTAINERS   0
  #define SHOW_SOCK_STATS                1
@@ -55,6 +56,7 @@ static struct dentry *o2net_dentry;
  static struct dentry *sc_dentry;
  static struct dentry *nst_dentry;
  static struct dentry *stats_dentry;
+static struct dentry *nodes_dentry;
  
  static DEFINE_SPINLOCK(o2net_debug_lock);
  
@@ -491,53 +493,87 @@ static const struct file_operations sc_seq_fops = {
         .release = sc_fop_release,
  };
  
-int o2net_debugfs_init(void)
+static int o2net_fill_bitmap(char *buf, int len)
  {
-       o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
-       if (!o2net_dentry) {
-               mlog_errno(-ENOMEM);
-               goto bail;
-       }
+       unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
+       int i = -1, out = 0;
  
-       nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR,
-                                        o2net_dentry, NULL,
-                                        &nst_seq_fops);
-       if (!nst_dentry) {
-               mlog_errno(-ENOMEM);
-               goto bail;
-       }
+       o2net_fill_node_map(map, sizeof(map));
  
-       sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR,
-                                       o2net_dentry, NULL,
-                                       &sc_seq_fops);
-       if (!sc_dentry) {
-               mlog_errno(-ENOMEM);
-               goto bail;
-       }
+       while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
+               out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
+       out += snprintf(buf + out, PAGE_SIZE - out, "\n");
  
-       stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR,
-                                          o2net_dentry, NULL,
-                                          &stats_seq_fops);
-       if (!stats_dentry) {
-               mlog_errno(-ENOMEM);
-               goto bail;
-       }
+       return out;
+}
+
+static int nodes_fop_open(struct inode *inode, struct file *file)
+{
+       char *buf;
+
+       buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+       if (!buf)
+               return -ENOMEM;
+
+       i_size_write(inode, o2net_fill_bitmap(buf, PAGE_SIZE));
+
+       file->private_data = buf;
  
         return 0;
-bail:
-       debugfs_remove(stats_dentry);
-       debugfs_remove(sc_dentry);
-       debugfs_remove(nst_dentry);
-       debugfs_remove(o2net_dentry);
-       return -ENOMEM;
  }
  
+static int o2net_debug_release(struct inode *inode, struct file *file)
+{
+       kfree(file->private_data);
+       return 0;
+}
+
+static ssize_t o2net_debug_read(struct file *file, char __user *buf,
+                               size_t nbytes, loff_t *ppos)
+{
+       return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
+                                      i_size_read(file->f_mapping->host));
+}
+
+static const struct file_operations nodes_fops = {
+       .open           = nodes_fop_open,
+       .release        = o2net_debug_release,
+       .read           = o2net_debug_read,
+       .llseek         = generic_file_llseek,
+};
+
  void o2net_debugfs_exit(void)
  {
+       debugfs_remove(nodes_dentry);
         debugfs_remove(stats_dentry);
         debugfs_remove(sc_dentry);
         debugfs_remove(nst_dentry);
         debugfs_remove(o2net_dentry);
  }
  
+int o2net_debugfs_init(void)
+{
+       mode_t mode = S_IFREG|S_IRUSR;
+
+       o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
+       if (o2net_dentry)
+               nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode,
+                                       o2net_dentry, NULL, &nst_seq_fops);
+       if (nst_dentry)
+               sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode,
+                                       o2net_dentry, NULL, &sc_seq_fops);
+       if (sc_dentry)
+               stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode,
+                                       o2net_dentry, NULL, &stats_seq_fops);
+       if (stats_dentry)
+               nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode,
+                                       o2net_dentry, NULL, &nodes_fops);
+       if (nodes_dentry)
+               return 0;
+
+       o2net_debugfs_exit();
+       mlog_errno(-ENOMEM);
+       return -ENOMEM;
+}
+
  #endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c

index ad7d0c155de41a3912b5a6fa330b28ca48edf790..044e7b58d31c7662a75e29f636cfb86bf2846b67 100644 (file)
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -546,7 +546,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
         }
  
         if (was_valid && !valid) {
-               printk(KERN_NOTICE "o2net: no longer connected to "
+               printk(KERN_NOTICE "o2net: No longer connected to "
                        SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
                 o2net_complete_nodes_nsw(nn);
         }
@@ -556,7 +556,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
                 cancel_delayed_work(&nn->nn_connect_expired);
                 printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n",
                        o2nm_this_node() > sc->sc_node->nd_num ?
-                               "connected to" : "accepted connection from",
+                      "Connected to" : "Accepted connection from",
                        SC_NODEF_ARGS(sc));
         }
  
@@ -644,7 +644,7 @@ static void o2net_state_change(struct sock *sk)
                         o2net_sc_queue_work(sc, &sc->sc_connect_work);
                         break;
                 default:
-                       printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT
+                       printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
                               " shutdown, state %d\n",
                               SC_NODEF_ARGS(sc), sk->sk_state);
                         o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
@@ -1035,6 +1035,25 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
         return ret;
  }
  
+/* Get a map of all nodes to which this node is currently connected to */
+void o2net_fill_node_map(unsigned long *map, unsigned bytes)
+{
+       struct o2net_sock_container *sc;
+       int node, ret;
+
+       BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
+
+       memset(map, 0, bytes);
+       for (node = 0; node < O2NM_MAX_NODES; ++node) {
+               o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret);
+               if (!ret) {
+                       set_bit(node, map);
+                       sc_put(sc);
+               }
+       }
+}
+EXPORT_SYMBOL_GPL(o2net_fill_node_map);
+
  int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
                            size_t caller_veclen, u8 target_node, int *status)
  {
@@ -1285,11 +1304,11 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
         struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
  
         if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) {
-               mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol "
-                    "version %llu but %llu is required, disconnecting\n",
-                    SC_NODEF_ARGS(sc),
-                    (unsigned long long)be64_to_cpu(hand->protocol_version),
-                    O2NET_PROTOCOL_VERSION);
+               printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " Advertised net "
+                      "protocol version %llu but %llu is required. "
+                      "Disconnecting.\n", SC_NODEF_ARGS(sc),
+                      (unsigned long long)be64_to_cpu(hand->protocol_version),
+                      O2NET_PROTOCOL_VERSION);
  
                 /* don't bother reconnecting if its the wrong version. */
                 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
@@ -1303,33 +1322,33 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
          */
         if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
                                 o2net_idle_timeout()) {
-               mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of "
-                    "%u ms, but we use %u ms locally.  disconnecting\n",
-                    SC_NODEF_ARGS(sc),
-                    be32_to_cpu(hand->o2net_idle_timeout_ms),
-                    o2net_idle_timeout());
+               printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a network "
+                      "idle timeout of %u ms, but we use %u ms locally. "
+                      "Disconnecting.\n", SC_NODEF_ARGS(sc),
+                      be32_to_cpu(hand->o2net_idle_timeout_ms),
+                      o2net_idle_timeout());
                 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
                 return -1;
         }
  
         if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
                         o2net_keepalive_delay()) {
-               mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of "
-                    "%u ms, but we use %u ms locally.  disconnecting\n",
-                    SC_NODEF_ARGS(sc),
-                    be32_to_cpu(hand->o2net_keepalive_delay_ms),
-                    o2net_keepalive_delay());
+               printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a keepalive "
+                      "delay of %u ms, but we use %u ms locally. "
+                      "Disconnecting.\n", SC_NODEF_ARGS(sc),
+                      be32_to_cpu(hand->o2net_keepalive_delay_ms),
+                      o2net_keepalive_delay());
                 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
                 return -1;
         }
  
         if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) !=
                         O2HB_MAX_WRITE_TIMEOUT_MS) {
-               mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of "
-                    "%u ms, but we use %u ms locally.  disconnecting\n",
-                    SC_NODEF_ARGS(sc),
-                    be32_to_cpu(hand->o2hb_heartbeat_timeout_ms),
-                    O2HB_MAX_WRITE_TIMEOUT_MS);
+               printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a heartbeat "
+                      "timeout of %u ms, but we use %u ms locally. "
+                      "Disconnecting.\n", SC_NODEF_ARGS(sc),
+                      be32_to_cpu(hand->o2hb_heartbeat_timeout_ms),
+                      O2HB_MAX_WRITE_TIMEOUT_MS);
                 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
                 return -1;
         }
@@ -1540,28 +1559,16 @@ static void o2net_idle_timer(unsigned long data)
  {
         struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
         struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
-
  #ifdef CONFIG_DEBUG_FS
-       ktime_t now = ktime_get();
+       unsigned long msecs = ktime_to_ms(ktime_get()) -
+               ktime_to_ms(sc->sc_tv_timer);
+#else
+       unsigned long msecs = o2net_idle_timeout();
  #endif
  
-       printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
-            "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
-                    o2net_idle_timeout() / 1000,
-                    o2net_idle_timeout() % 1000);
-
-#ifdef CONFIG_DEBUG_FS
-       mlog(ML_NOTICE, "Here are some times that might help debug the "
-            "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, "
-            "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n",
-            (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now),
-            (long long)ktime_to_us(sc->sc_tv_data_ready),
-            (long long)ktime_to_us(sc->sc_tv_advance_start),
-            (long long)ktime_to_us(sc->sc_tv_advance_stop),
-            sc->sc_msg_key, sc->sc_msg_type,
-            (long long)ktime_to_us(sc->sc_tv_func_start),
-            (long long)ktime_to_us(sc->sc_tv_func_stop));
-#endif
+       printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been "
+              "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc),
+              msecs / 1000, msecs % 1000);
  
         /*
          * Initialize the nn_timeout so that the next connection attempt
@@ -1694,8 +1701,8 @@ static void o2net_start_connect(struct work_struct *work)
  
  out:
         if (ret) {
-               mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed "
-                    "with errno %d\n", SC_NODEF_ARGS(sc), ret);
+               printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT
+                      " failed with errno %d\n", SC_NODEF_ARGS(sc), ret);
                 /* 0 err so that another will be queued and attempted
                  * from set_nn_state */
                 if (sc)
@@ -1718,8 +1725,8 @@ static void o2net_connect_expired(struct work_struct *work)
  
         spin_lock(&nn->nn_lock);
         if (!nn->nn_sc_valid) {
-               mlog(ML_ERROR, "no connection established with node %u after "
-                    "%u.%u seconds, giving up and returning errors.\n",
+               printk(KERN_NOTICE "o2net: No connection established with "
+                      "node %u after %u.%u seconds, giving up.\n",
                      o2net_num_from_nn(nn),
                      o2net_idle_timeout() / 1000,
                      o2net_idle_timeout() % 1000);
@@ -1862,21 +1869,21 @@ static int o2net_accept_one(struct socket *sock)
  
         node = o2nm_get_node_by_ip(sin.sin_addr.s_addr);
         if (node == NULL) {
-               mlog(ML_NOTICE, "attempt to connect from unknown node at %pI4:%d\n",
-                    &sin.sin_addr.s_addr, ntohs(sin.sin_port));
+               printk(KERN_NOTICE "o2net: Attempt to connect from unknown "
+                      "node at %pI4:%d\n", &sin.sin_addr.s_addr,
+                      ntohs(sin.sin_port));
                 ret = -EINVAL;
                 goto out;
         }
  
         if (o2nm_this_node() >= node->nd_num) {
                 local_node = o2nm_get_node_by_num(o2nm_this_node());
-               mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' ("
-                    "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n",
-                    local_node->nd_name, local_node->nd_num,
-                    &(local_node->nd_ipv4_address),
-                    ntohs(local_node->nd_ipv4_port),
-                    node->nd_name, node->nd_num, &sin.sin_addr.s_addr,
-                    ntohs(sin.sin_port));
+               printk(KERN_NOTICE "o2net: Unexpected connect attempt seen "
+                      "at node '%s' (%u, %pI4:%d) from node '%s' (%u, "
+                      "%pI4:%d)\n", local_node->nd_name, local_node->nd_num,
+                      &(local_node->nd_ipv4_address),
+                      ntohs(local_node->nd_ipv4_port), node->nd_name,
+                      node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port));
                 ret = -EINVAL;
                 goto out;
         }
@@ -1901,10 +1908,10 @@ static int o2net_accept_one(struct socket *sock)
                 ret = 0;
         spin_unlock(&nn->nn_lock);
         if (ret) {
-               mlog(ML_NOTICE, "attempt to connect from node '%s' at "
-                    "%pI4:%d but it already has an open connection\n",
-                    node->nd_name, &sin.sin_addr.s_addr,
-                    ntohs(sin.sin_port));
+               printk(KERN_NOTICE "o2net: Attempt to connect from node '%s' "
+                      "at %pI4:%d but it already has an open connection\n",
+                      node->nd_name, &sin.sin_addr.s_addr,
+                      ntohs(sin.sin_port));
                 goto out;
         }
  
@@ -1984,7 +1991,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
  
         ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
         if (ret < 0) {
-               mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret);
+               printk(KERN_ERR "o2net: Error %d while creating socket\n", ret);
                 goto out;
         }
  
@@ -2001,16 +2008,15 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
         sock->sk->sk_reuse = 1;
         ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
         if (ret < 0) {
-               mlog(ML_ERROR, "unable to bind socket at %pI4:%u, "
-                    "ret=%d\n", &addr, ntohs(port), ret);
+               printk(KERN_ERR "o2net: Error %d while binding socket at "
+                      "%pI4:%u\n", ret, &addr, ntohs(port)); 
                 goto out;
         }
  
         ret = sock->ops->listen(sock, 64);
-       if (ret < 0) {
-               mlog(ML_ERROR, "unable to listen on %pI4:%u, ret=%d\n",
-                    &addr, ntohs(port), ret);
-       }
+       if (ret < 0)
+               printk(KERN_ERR "o2net: Error %d while listening on %pI4:%u\n",
+                      ret, &addr, ntohs(port));
  
  out:
         if (ret) {
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h

index fd6179eb26d4cd2cfb43f4ff053237837712113e..5bada2a69b503cd365d626a25e4ae6d08ad50b7a 100644 (file)
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -106,6 +106,8 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
                            struct list_head *unreg_list);
  void o2net_unregister_handler_list(struct list_head *list);
  
+void o2net_fill_node_map(unsigned long *map, unsigned bytes);
+
  struct o2nm_node;
  int o2net_register_hb_callbacks(void);
  void o2net_unregister_hb_callbacks(void);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c

index e2878b5895fb543a86c11f0128b025dbb0335c38..8fe4e2892ab9ccd983304a825c338df57a5376b8 100644 (file)
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1184,8 +1184,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
                         if (pde)
                                 le16_add_cpu(&pde->rec_len,
                                                 le16_to_cpu(de->rec_len));
-                       else
-                               de->inode = 0;
+                       de->inode = 0;
                         dir->i_version++;
                         ocfs2_journal_dirty(handle, bh);
                         goto bail;
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h

index d602abb51b610d525cc437daa05c25d2105fe0c3..a5952ceecba5a83147389ad4a1cd24972ee0bfbe 100644 (file)
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -859,8 +859,8 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
  void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
  void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
  int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
-int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
-int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
+void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
+void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
  
  void dlm_put(struct dlm_ctxt *dlm);
  struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
@@ -877,9 +877,8 @@ static inline void dlm_lockres_get(struct dlm_lock_resource *res)
         kref_get(&res->refs);
  }
  void dlm_lockres_put(struct dlm_lock_resource *res);
-void __dlm_unhash_lockres(struct dlm_lock_resource *res);
-void __dlm_insert_lockres(struct dlm_ctxt *dlm,
-                         struct dlm_lock_resource *res);
+void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
+void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
  struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
                                                      const char *name,
                                                      unsigned int len,
@@ -902,46 +901,15 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
                                           const char *name,
                                           unsigned int namelen);
  
-#define dlm_lockres_set_refmap_bit(bit,res)  \
-       __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__)
-#define dlm_lockres_clear_refmap_bit(bit,res)  \
-       __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__)
+void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
+                               struct dlm_lock_resource *res, int bit);
+void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
+                                 struct dlm_lock_resource *res, int bit);
  
-static inline void __dlm_lockres_set_refmap_bit(int bit,
-                                               struct dlm_lock_resource *res,
-                                               const char *file,
-                                               int line)
-{
-       //printk("%s:%d:%.*s: setting bit %d\n", file, line,
-       //     res->lockname.len, res->lockname.name, bit);
-       set_bit(bit, res->refmap);
-}
-
-static inline void __dlm_lockres_clear_refmap_bit(int bit,
-                                                 struct dlm_lock_resource *res,
-                                                 const char *file,
-                                                 int line)
-{
-       //printk("%s:%d:%.*s: clearing bit %d\n", file, line,
-       //     res->lockname.len, res->lockname.name, bit);
-       clear_bit(bit, res->refmap);
-}
-
-void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
-                                  struct dlm_lock_resource *res,
-                                  const char *file,
-                                  int line);
-void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
-                                  struct dlm_lock_resource *res,
-                                  int new_lockres,
-                                  const char *file,
-                                  int line);
-#define dlm_lockres_drop_inflight_ref(d,r)  \
-       __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__)
-#define dlm_lockres_grab_inflight_ref(d,r)  \
-       __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__)
-#define dlm_lockres_grab_inflight_ref_new(d,r)  \
-       __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__)
+void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
+                                  struct dlm_lock_resource *res);
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+                                  struct dlm_lock_resource *res);
  
  void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
  void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c

index 6ed6b95dcf935a6516e935b85a3ca9ffc0b8a9d8..92f2ead0fab6de22fa138cc4410dee6e1544216c 100644 (file)
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -157,16 +157,18 @@ static int dlm_protocol_compare(struct dlm_protocol_version *existing,
  
  static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
  
-void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
+void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
  {
-       if (!hlist_unhashed(&lockres->hash_node)) {
-               hlist_del_init(&lockres->hash_node);
-               dlm_lockres_put(lockres);
-       }
+       if (hlist_unhashed(&res->hash_node))
+               return;
+
+       mlog(0, "%s: Unhash res %.*s\n", dlm->name, res->lockname.len,
+            res->lockname.name);
+       hlist_del_init(&res->hash_node);
+       dlm_lockres_put(res);
  }
  
-void __dlm_insert_lockres(struct dlm_ctxt *dlm,
-                      struct dlm_lock_resource *res)
+void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
  {
         struct hlist_head *bucket;
         struct qstr *q;
@@ -180,6 +182,9 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
         dlm_lockres_get(res);
  
         hlist_add_head(&res->hash_node, bucket);
+
+       mlog(0, "%s: Hash res %.*s\n", dlm->name, res->lockname.len,
+            res->lockname.name);
  }
  
  struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
@@ -539,17 +544,17 @@ again:
  
  static void __dlm_print_nodes(struct dlm_ctxt *dlm)
  {
-       int node = -1;
+       int node = -1, num = 0;
  
         assert_spin_locked(&dlm->spinlock);
  
-       printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name);
-
+       printk("( ");
         while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
                                      node + 1)) < O2NM_MAX_NODES) {
                 printk("%d ", node);
+               ++num;
         }
-       printk("\n");
+       printk(") %u nodes\n", num);
  }
  
  static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -566,11 +571,10 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
  
         node = exit_msg->node_idx;
  
-       printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name);
-
         spin_lock(&dlm->spinlock);
         clear_bit(node, dlm->domain_map);
         clear_bit(node, dlm->exit_domain_map);
+       printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s ", node, dlm->name);
         __dlm_print_nodes(dlm);
  
         /* notify anything attached to the heartbeat events */
@@ -755,6 +759,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
  
                 dlm_mark_domain_leaving(dlm);
                 dlm_leave_domain(dlm);
+               printk(KERN_NOTICE "o2dlm: Leaving domain %s\n", dlm->name);
                 dlm_force_free_mles(dlm);
                 dlm_complete_dlm_shutdown(dlm);
         }
@@ -970,7 +975,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
                 clear_bit(assert->node_idx, dlm->exit_domain_map);
                 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
  
-               printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n",
+               printk(KERN_NOTICE "o2dlm: Node %u joins domain %s ",
                        assert->node_idx, dlm->name);
                 __dlm_print_nodes(dlm);
  
@@ -1701,8 +1706,10 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
  bail:
         spin_lock(&dlm->spinlock);
         __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
-       if (!status)
+       if (!status) {
+               printk(KERN_NOTICE "o2dlm: Joining domain %s ", dlm->name);
                 __dlm_print_nodes(dlm);
+       }
         spin_unlock(&dlm->spinlock);
  
         if (ctxt) {
@@ -2131,13 +2138,6 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
                 goto leave;
         }
  
-       if (!o2hb_check_local_node_heartbeating()) {
-               mlog(ML_ERROR, "the local node has not been configured, or is "
-                    "not heartbeating\n");
-               ret = -EPROTO;
-               goto leave;
-       }
-
         mlog(0, "register called for domain \"%s\"\n", domain);
  
  retry:
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c

index 8d39e0fd66f7379b8fb08ac40502efa0d4a51cc6..975810b98492a34f4576b9d3d3ac3e4421fb18c1 100644 (file)
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -183,10 +183,6 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
                         kick_thread = 1;
                 }
         }
-       /* reduce the inflight count, this may result in the lockres
-        * being purged below during calc_usage */
-       if (lock->ml.node == dlm->node_num)
-               dlm_lockres_drop_inflight_ref(dlm, res);
  
         spin_unlock(&res->spinlock);
         wake_up(&res->wq);
@@ -231,10 +227,16 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
              lock->ml.type, res->lockname.len,
              res->lockname.name, flags);
  
+       /*
+        * Wait if resource is getting recovered, remastered, etc.
+        * If the resource was remastered and new owner is self, then exit.
+        */
         spin_lock(&res->spinlock);
-
-       /* will exit this call with spinlock held */
         __dlm_wait_on_lockres(res);
+       if (res->owner == dlm->node_num) {
+               spin_unlock(&res->spinlock);
+               return DLM_RECOVERING;
+       }
         res->state |= DLM_LOCK_RES_IN_PROGRESS;
  
         /* add lock to local (secondary) queue */
@@ -319,27 +321,23 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
         tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
                                     sizeof(create), res->owner, &status);
         if (tmpret >= 0) {
-               // successfully sent and received
-               ret = status;  // this is already a dlm_status
+               ret = status;
                 if (ret == DLM_REJECTED) {
-                       mlog(ML_ERROR, "%s:%.*s: BUG.  this is a stale lockres "
-                            "no longer owned by %u.  that node is coming back "
-                            "up currently.\n", dlm->name, create.namelen,
+                       mlog(ML_ERROR, "%s: res %.*s, Stale lockres no longer "
+                            "owned by node %u. That node is coming back up "
+                            "currently.\n", dlm->name, create.namelen,
                              create.name, res->owner);
                         dlm_print_one_lock_resource(res);
                         BUG();
                 }
         } else {
-               mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
-                    "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key,
-                    res->owner);
-               if (dlm_is_host_down(tmpret)) {
+               mlog(ML_ERROR, "%s: res %.*s, Error %d send CREATE LOCK to "
+                    "node %u\n", dlm->name, create.namelen, create.name,
+                    tmpret, res->owner);
+               if (dlm_is_host_down(tmpret))
                         ret = DLM_RECOVERING;
-                       mlog(0, "node %u died so returning DLM_RECOVERING "
-                            "from lock message!\n", res->owner);
-               } else {
+               else
                         ret = dlm_err_to_dlm_status(tmpret);
-               }
         }
  
         return ret;
@@ -440,7 +438,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
                 /* zero memory only if kernel-allocated */
                 lksb = kzalloc(sizeof(*lksb), GFP_NOFS);
                 if (!lksb) {
-                       kfree(lock);
+                       kmem_cache_free(dlm_lock_cache, lock);
                         return NULL;
                 }
                 kernel_allocated = 1;
@@ -718,18 +716,10 @@ retry_lock:
  
                 if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
                     status == DLM_FORWARD) {
-                       mlog(0, "retrying lock with migration/"
-                            "recovery/in progress\n");
                         msleep(100);
-                       /* no waiting for dlm_reco_thread */
                         if (recovery) {
                                 if (status != DLM_RECOVERING)
                                         goto retry_lock;
-
-                               mlog(0, "%s: got RECOVERING "
-                                    "for $RECOVERY lock, master "
-                                    "was %u\n", dlm->name,
-                                    res->owner);
                                 /* wait to see the node go down, then
                                  * drop down and allow the lockres to
                                  * get cleaned up.  need to remaster. */
@@ -741,6 +731,14 @@ retry_lock:
                         }
                 }
  
+               /* Inflight taken in dlm_get_lock_resource() is dropped here */
+               spin_lock(&res->spinlock);
+               dlm_lockres_drop_inflight_ref(dlm, res);
+               spin_unlock(&res->spinlock);
+
+               dlm_lockres_calc_usage(dlm, res);
+               dlm_kick_thread(dlm, res);
+
                 if (status != DLM_NORMAL) {
                         lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
                         if (status != DLM_NOTQUEUED)
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c

index 11eefb8c12e98fb418f41c31a3b0a32201be3ca1..005261c333b090f5f53f376bd5bbed55b8e16ba7 100644 (file)
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -631,39 +631,54 @@ error:
         return NULL;
  }
  
-void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
-                                  struct dlm_lock_resource *res,
-                                  int new_lockres,
-                                  const char *file,
-                                  int line)
+void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
+                               struct dlm_lock_resource *res, int bit)
  {
-       if (!new_lockres)
-               assert_spin_locked(&res->spinlock);
+       assert_spin_locked(&res->spinlock);
+
+       mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len,
+            res->lockname.name, bit, __builtin_return_address(0));
+
+       set_bit(bit, res->refmap);
+}
+
+void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
+                                 struct dlm_lock_resource *res, int bit)
+{
+       assert_spin_locked(&res->spinlock);
+
+       mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len,
+            res->lockname.name, bit, __builtin_return_address(0));
+
+       clear_bit(bit, res->refmap);
+}
+
+
+void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
+                                  struct dlm_lock_resource *res)
+{
+       assert_spin_locked(&res->spinlock);
  
-       if (!test_bit(dlm->node_num, res->refmap)) {
-               BUG_ON(res->inflight_locks != 0);
-               dlm_lockres_set_refmap_bit(dlm->node_num, res);
-       }
         res->inflight_locks++;
-       mlog(0, "%s:%.*s: inflight++: now %u\n",
-            dlm->name, res->lockname.len, res->lockname.name,
-            res->inflight_locks);
+
+       mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
+            res->lockname.len, res->lockname.name, res->inflight_locks,
+            __builtin_return_address(0));
  }
  
-void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
-                                  struct dlm_lock_resource *res,
-                                  const char *file,
-                                  int line)
+void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
+                                  struct dlm_lock_resource *res)
  {
         assert_spin_locked(&res->spinlock);
  
         BUG_ON(res->inflight_locks == 0);
+
         res->inflight_locks--;
-       mlog(0, "%s:%.*s: inflight--: now %u\n",
-            dlm->name, res->lockname.len, res->lockname.name,
-            res->inflight_locks);
-       if (res->inflight_locks == 0)
-               dlm_lockres_clear_refmap_bit(dlm->node_num, res);
+
+       mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name,
+            res->lockname.len, res->lockname.name, res->inflight_locks,
+            __builtin_return_address(0));
+
         wake_up(&res->wq);
  }
  
@@ -697,7 +712,6 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
         unsigned int hash;
         int tries = 0;
         int bit, wait_on_recovery = 0;
-       int drop_inflight_if_nonlocal = 0;
  
         BUG_ON(!lockid);
  
@@ -709,36 +723,33 @@ lookup:
         spin_lock(&dlm->spinlock);
         tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
         if (tmpres) {
-               int dropping_ref = 0;
-
                 spin_unlock(&dlm->spinlock);
-
                 spin_lock(&tmpres->spinlock);
-               /* We wait for the other thread that is mastering the resource */
+               /* Wait on the thread that is mastering the resource */
                 if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
                         __dlm_wait_on_lockres(tmpres);
                         BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
+                       spin_unlock(&tmpres->spinlock);
+                       dlm_lockres_put(tmpres);
+                       tmpres = NULL;
+                       goto lookup;
                 }
  
-               if (tmpres->owner == dlm->node_num) {
-                       BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
-                       dlm_lockres_grab_inflight_ref(dlm, tmpres);
-               } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
-                       dropping_ref = 1;
-               spin_unlock(&tmpres->spinlock);
-
-               /* wait until done messaging the master, drop our ref to allow
-                * the lockres to be purged, start over. */
-               if (dropping_ref) {
-                       spin_lock(&tmpres->spinlock);
-                       __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF);
+               /* Wait on the resource purge to complete before continuing */
+               if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) {
+                       BUG_ON(tmpres->owner == dlm->node_num);
+                       __dlm_wait_on_lockres_flags(tmpres,
+                                                   DLM_LOCK_RES_DROPPING_REF);
                         spin_unlock(&tmpres->spinlock);
                         dlm_lockres_put(tmpres);
                         tmpres = NULL;
                         goto lookup;
                 }
  
-               mlog(0, "found in hash!\n");
+               /* Grab inflight ref to pin the resource */
+               dlm_lockres_grab_inflight_ref(dlm, tmpres);
+
+               spin_unlock(&tmpres->spinlock);
                 if (res)
                         dlm_lockres_put(res);
                 res = tmpres;
@@ -829,8 +840,8 @@ lookup:
                  * but they might own this lockres.  wait on them. */
                 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
                 if (bit < O2NM_MAX_NODES) {
-                       mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
-                            "recover before lock mastery can begin\n",
+                       mlog(0, "%s: res %.*s, At least one node (%d) "
+                            "to recover before lock mastery can begin\n",
                              dlm->name, namelen, (char *)lockid, bit);
                         wait_on_recovery = 1;
                 }
@@ -843,12 +854,11 @@ lookup:
  
         /* finally add the lockres to its hash bucket */
         __dlm_insert_lockres(dlm, res);
-       /* since this lockres is new it doesn't not require the spinlock */
-       dlm_lockres_grab_inflight_ref_new(dlm, res);
  
-       /* if this node does not become the master make sure to drop
-        * this inflight reference below */
-       drop_inflight_if_nonlocal = 1;
+       /* Grab inflight ref to pin the resource */
+       spin_lock(&res->spinlock);
+       dlm_lockres_grab_inflight_ref(dlm, res);
+       spin_unlock(&res->spinlock);
  
         /* get an extra ref on the mle in case this is a BLOCK
          * if so, the creator of the BLOCK may try to put the last
@@ -864,8 +874,8 @@ redo_request:
                  * dlm spinlock would be detectable be a change on the mle,
                  * so we only need to clear out the recovery map once. */
                 if (dlm_is_recovery_lock(lockid, namelen)) {
-                       mlog(ML_NOTICE, "%s: recovery map is not empty, but "
-                            "must master $RECOVERY lock now\n", dlm->name);
+                       mlog(0, "%s: Recovery map is not empty, but must "
+                            "master $RECOVERY lock now\n", dlm->name);
                         if (!dlm_pre_master_reco_lockres(dlm, res))
                                 wait_on_recovery = 0;
                         else {
@@ -883,8 +893,8 @@ redo_request:
                 spin_lock(&dlm->spinlock);
                 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
                 if (bit < O2NM_MAX_NODES) {
-                       mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
-                            "recover before lock mastery can begin\n",
+                       mlog(0, "%s: res %.*s, At least one node (%d) "
+                            "to recover before lock mastery can begin\n",
                              dlm->name, namelen, (char *)lockid, bit);
                         wait_on_recovery = 1;
                 } else
@@ -913,8 +923,8 @@ redo_request:
                          * yet, keep going until it does.  this is how the
                          * master will know that asserts are needed back to
                          * the lower nodes. */
-                       mlog(0, "%s:%.*s: requests only up to %u but master "
-                            "is %u, keep going\n", dlm->name, namelen,
+                       mlog(0, "%s: res %.*s, Requests only up to %u but "
+                            "master is %u, keep going\n", dlm->name, namelen,
                              lockid, nodenum, mle->master);
                 }
         }
@@ -924,13 +934,12 @@ wait:
         ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
         if (ret < 0) {
                 wait_on_recovery = 1;
-               mlog(0, "%s:%.*s: node map changed, redo the "
-                    "master request now, blocked=%d\n",
-                    dlm->name, res->lockname.len,
+               mlog(0, "%s: res %.*s, Node map changed, redo the master "
+                    "request now, blocked=%d\n", dlm->name, res->lockname.len,
                      res->lockname.name, blocked);
                 if (++tries > 20) {
-                       mlog(ML_ERROR, "%s:%.*s: spinning on "
-                            "dlm_wait_for_lock_mastery, blocked=%d\n",
+                       mlog(ML_ERROR, "%s: res %.*s, Spinning on "
+                            "dlm_wait_for_lock_mastery, blocked = %d\n",
                              dlm->name, res->lockname.len,
                              res->lockname.name, blocked);
                         dlm_print_one_lock_resource(res);
@@ -940,7 +949,8 @@ wait:
                 goto redo_request;
         }
  
-       mlog(0, "lockres mastered by %u\n", res->owner);
+       mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len,
+            res->lockname.name, res->owner);
         /* make sure we never continue without this */
         BUG_ON(res->owner == O2NM_MAX_NODES);
  
@@ -952,8 +962,6 @@ wait:
  
  wake_waiters:
         spin_lock(&res->spinlock);
-       if (res->owner != dlm->node_num && drop_inflight_if_nonlocal)
-               dlm_lockres_drop_inflight_ref(dlm, res);
         res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
         spin_unlock(&res->spinlock);
         wake_up(&res->wq);
@@ -1426,9 +1434,7 @@ way_up_top:
                 }
  
                 if (res->owner == dlm->node_num) {
-                       mlog(0, "%s:%.*s: setting bit %u in refmap\n",
-                            dlm->name, namelen, name, request->node_idx);
-                       dlm_lockres_set_refmap_bit(request->node_idx, res);
+                       dlm_lockres_set_refmap_bit(dlm, res, request->node_idx);
                         spin_unlock(&res->spinlock);
                         response = DLM_MASTER_RESP_YES;
                         if (mle)
@@ -1493,10 +1499,8 @@ way_up_top:
                                  * go back and clean the mles on any
                                  * other nodes */
                                 dispatch_assert = 1;
-                               dlm_lockres_set_refmap_bit(request->node_idx, res);
-                               mlog(0, "%s:%.*s: setting bit %u in refmap\n",
-                                    dlm->name, namelen, name,
-                                    request->node_idx);
+                               dlm_lockres_set_refmap_bit(dlm, res,
+                                                          request->node_idx);
                         } else
                                 response = DLM_MASTER_RESP_NO;
                 } else {
@@ -1702,7 +1706,7 @@ again:
                              "lockres, set the bit in the refmap\n",
                              namelen, lockname, to);
                         spin_lock(&res->spinlock);
-                       dlm_lockres_set_refmap_bit(to, res);
+                       dlm_lockres_set_refmap_bit(dlm, res, to);
                         spin_unlock(&res->spinlock);
                 }
         }
@@ -2187,8 +2191,6 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
         namelen = res->lockname.len;
         BUG_ON(namelen > O2NM_MAX_NAME_LEN);
  
-       mlog(0, "%s:%.*s: sending deref to %d\n",
-            dlm->name, namelen, lockname, res->owner);
         memset(&deref, 0, sizeof(deref));
         deref.node_idx = dlm->node_num;
         deref.namelen = namelen;
@@ -2197,14 +2199,12 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
         ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
                                  &deref, sizeof(deref), res->owner, &r);
         if (ret < 0)
-               mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to "
-                    "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key,
-                    res->owner);
+               mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n",
+                    dlm->name, namelen, lockname, ret, res->owner);
         else if (r < 0) {
                 /* BAD.  other node says I did not have a ref. */
-               mlog(ML_ERROR,"while dropping ref on %s:%.*s "
-                   "(master=%u) got %d.\n", dlm->name, namelen,
-                   lockname, res->owner, r);
+               mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
+                    dlm->name, namelen, lockname, res->owner, r);
                 dlm_print_one_lock_resource(res);
                 BUG();
         }
@@ -2260,7 +2260,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
         else {
                 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
                 if (test_bit(node, res->refmap)) {
-                       dlm_lockres_clear_refmap_bit(node, res);
+                       dlm_lockres_clear_refmap_bit(dlm, res, node);
                         cleared = 1;
                 }
         }
@@ -2320,7 +2320,7 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
         BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
         if (test_bit(node, res->refmap)) {
                 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
-               dlm_lockres_clear_refmap_bit(node, res);
+               dlm_lockres_clear_refmap_bit(dlm, res, node);
                 cleared = 1;
         }
         spin_unlock(&res->spinlock);
@@ -2802,7 +2802,8 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
                                 BUG_ON(!list_empty(&lock->bast_list));
                                 BUG_ON(lock->ast_pending);
                                 BUG_ON(lock->bast_pending);
-                               dlm_lockres_clear_refmap_bit(lock->ml.node, res);
+                               dlm_lockres_clear_refmap_bit(dlm, res,
+                                                            lock->ml.node);
                                 list_del_init(&lock->list);
                                 dlm_lock_put(lock);
                                 /* In a normal unlock, we would have added a
@@ -2823,7 +2824,7 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
                         mlog(0, "%s:%.*s: node %u had a ref to this "
                              "migrating lockres, clearing\n", dlm->name,
                              res->lockname.len, res->lockname.name, bit);
-                       dlm_lockres_clear_refmap_bit(bit, res);
+                       dlm_lockres_clear_refmap_bit(dlm, res, bit);
                 }
                 bit++;
         }
@@ -2916,9 +2917,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
                                          &migrate, sizeof(migrate), nodenum,
                                          &status);
                 if (ret < 0) {
-                       mlog(ML_ERROR, "Error %d when sending message %u (key "
-                            "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG,
-                            dlm->key, nodenum);
+                       mlog(ML_ERROR, "%s: res %.*s, Error %d send "
+                            "MIGRATE_REQUEST to node %u\n", dlm->name,
+                            migrate.namelen, migrate.name, ret, nodenum);
                         if (!dlm_is_host_down(ret)) {
                                 mlog(ML_ERROR, "unhandled error=%d!\n", ret);
                                 BUG();
@@ -2937,7 +2938,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
                              dlm->name, res->lockname.len, res->lockname.name,
                              nodenum);
                         spin_lock(&res->spinlock);
-                       dlm_lockres_set_refmap_bit(nodenum, res);
+                       dlm_lockres_set_refmap_bit(dlm, res, nodenum);
                         spin_unlock(&res->spinlock);
                 }
         }
@@ -3271,7 +3272,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
          * mastery reference here since old_master will briefly have
          * a reference after the migration completes */
         spin_lock(&res->spinlock);
-       dlm_lockres_set_refmap_bit(old_master, res);
+       dlm_lockres_set_refmap_bit(dlm, res, old_master);
         spin_unlock(&res->spinlock);
  
         mlog(0, "now time to do a migrate request to other nodes\n");
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c

index 7efab6d28a21b4ee6a8376559d70f739a4e1da90..01ebfd0bdad72264b99345378f0c6febe246503d 100644 (file)
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -362,40 +362,38 @@ static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
  }
  
  
-int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
+void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
  {
-       if (timeout) {
-               mlog(ML_NOTICE, "%s: waiting %dms for notification of "
-                    "death of node %u\n", dlm->name, timeout, node);
+       if (dlm_is_node_dead(dlm, node))
+               return;
+
+       printk(KERN_NOTICE "o2dlm: Waiting on the death of node %u in "
+              "domain %s\n", node, dlm->name);
+
+       if (timeout)
                 wait_event_timeout(dlm->dlm_reco_thread_wq,
-                          dlm_is_node_dead(dlm, node),
-                          msecs_to_jiffies(timeout));
-       } else {
-               mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
-                    "of death of node %u\n", dlm->name, node);
+                                  dlm_is_node_dead(dlm, node),
+                                  msecs_to_jiffies(timeout));
+       else
                 wait_event(dlm->dlm_reco_thread_wq,
                            dlm_is_node_dead(dlm, node));
-       }
-       /* for now, return 0 */
-       return 0;
  }
  
-int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
+void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
  {
-       if (timeout) {
-               mlog(0, "%s: waiting %dms for notification of "
-                    "recovery of node %u\n", dlm->name, timeout, node);
+       if (dlm_is_node_recovered(dlm, node))
+               return;
+
+       printk(KERN_NOTICE "o2dlm: Waiting on the recovery of node %u in "
+              "domain %s\n", node, dlm->name);
+
+       if (timeout)
                 wait_event_timeout(dlm->dlm_reco_thread_wq,
-                          dlm_is_node_recovered(dlm, node),
-                          msecs_to_jiffies(timeout));
-       } else {
-               mlog(0, "%s: waiting indefinitely for notification "
-                    "of recovery of node %u\n", dlm->name, node);
+                                  dlm_is_node_recovered(dlm, node),
+                                  msecs_to_jiffies(timeout));
+       else
                 wait_event(dlm->dlm_reco_thread_wq,
                            dlm_is_node_recovered(dlm, node));
-       }
-       /* for now, return 0 */
-       return 0;
  }
  
  /* callers of the top-level api calls (dlmlock/dlmunlock) should
@@ -430,6 +428,8 @@ static void dlm_begin_recovery(struct dlm_ctxt *dlm)
  {
         spin_lock(&dlm->spinlock);
         BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
+       printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
+              dlm->name, dlm->reco.dead_node);
         dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
         spin_unlock(&dlm->spinlock);
  }
@@ -440,9 +440,18 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm)
         BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
         dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
         spin_unlock(&dlm->spinlock);
+       printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name);
         wake_up(&dlm->reco.event);
  }
  
+static void dlm_print_recovery_master(struct dlm_ctxt *dlm)
+{
+       printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the "
+              "dead node %u in domain %s\n", dlm->reco.new_master,
+              (dlm->node_num == dlm->reco.new_master ? "me" : "he"),
+              dlm->reco.dead_node, dlm->name);
+}
+
  static int dlm_do_recovery(struct dlm_ctxt *dlm)
  {
         int status = 0;
@@ -505,9 +514,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
                 }
                 mlog(0, "another node will master this recovery session.\n");
         }
-       mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n",
-            dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master,
-            dlm->node_num, dlm->reco.dead_node);
+
+       dlm_print_recovery_master(dlm);
  
         /* it is safe to start everything back up here
          * because all of the dead node's lock resources
@@ -518,15 +526,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
         return 0;
  
  master_here:
-       mlog(ML_NOTICE, "(%d) Node %u is the Recovery Master for the Dead Node "
-            "%u for Domain %s\n", task_pid_nr(dlm->dlm_reco_thread_task),
-            dlm->node_num, dlm->reco.dead_node, dlm->name);
+       dlm_print_recovery_master(dlm);
  
         status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
         if (status < 0) {
                 /* we should never hit this anymore */
-               mlog(ML_ERROR, "error %d remastering locks for node %u, "
-                    "retrying.\n", status, dlm->reco.dead_node);
+               mlog(ML_ERROR, "%s: Error %d remastering locks for node %u, "
+                    "retrying.\n", dlm->name, status, dlm->reco.dead_node);
                 /* yield a bit to allow any final network messages
                  * to get handled on remaining nodes */
                 msleep(100);
@@ -567,7 +573,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
                 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
                 ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
  
-               mlog(0, "requesting lock info from node %u\n",
+               mlog(0, "%s: Requesting lock info from node %u\n", dlm->name,
                      ndata->node_num);
  
                 if (ndata->node_num == dlm->node_num) {
@@ -640,7 +646,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
                 spin_unlock(&dlm_reco_state_lock);
         }
  
-       mlog(0, "done requesting all lock info\n");
+       mlog(0, "%s: Done requesting all lock info\n", dlm->name);
  
         /* nodes should be sending reco data now
          * just need to wait */
@@ -802,10 +808,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
  
         /* negative status is handled by caller */
         if (ret < 0)
-               mlog(ML_ERROR, "Error %d when sending message %u (key "
-                    "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG,
-                    dlm->key, request_from);
-
+               mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u "
+                    "to recover dead node %u\n", dlm->name, ret,
+                    request_from, dead_node);
         // return from here, then
         // sleep until all received or error
         return ret;
@@ -956,9 +961,9 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
         ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
                                  sizeof(done_msg), send_to, &tmpret);
         if (ret < 0) {
-               mlog(ML_ERROR, "Error %d when sending message %u (key "
-                    "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG,
-                    dlm->key, send_to);
+               mlog(ML_ERROR, "%s: Error %d send RECO_DATA_DONE to node %u "
+                    "to recover dead node %u\n", dlm->name, ret, send_to,
+                    dead_node);
                 if (!dlm_is_host_down(ret)) {
                         BUG();
                 }
@@ -1127,9 +1132,11 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
         if (ret < 0) {
                 /* XXX: negative status is not handled.
                  * this will end up killing this node. */
-               mlog(ML_ERROR, "Error %d when sending message %u (key "
-                    "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG,
-                    dlm->key, send_to);
+               mlog(ML_ERROR, "%s: res %.*s, Error %d send MIG_LOCKRES to "
+                    "node %u (%s)\n", dlm->name, mres->lockname_len,
+                    mres->lockname, ret, send_to,
+                    (orig_flags & DLM_MRES_MIGRATION ?
+                     "migration" : "recovery"));
         } else {
                 /* might get an -ENOMEM back here */
                 ret = status;
@@ -1767,7 +1774,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
                              dlm->name, mres->lockname_len, mres->lockname,
                              from);
                         spin_lock(&res->spinlock);
-                       dlm_lockres_set_refmap_bit(from, res);
+                       dlm_lockres_set_refmap_bit(dlm, res, from);
                         spin_unlock(&res->spinlock);
                         added++;
                         break;
@@ -1965,7 +1972,7 @@ skip_lvb:
                         mlog(0, "%s:%.*s: added lock for node %u, "
                              "setting refmap bit\n", dlm->name,
                              res->lockname.len, res->lockname.name, ml->node);
-                       dlm_lockres_set_refmap_bit(ml->node, res);
+                       dlm_lockres_set_refmap_bit(dlm, res, ml->node);
                         added++;
                 }
                 spin_unlock(&res->spinlock);
@@ -2084,6 +2091,9 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
  
         list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
                 if (res->owner == dead_node) {
+                       mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
+                            dlm->name, res->lockname.len, res->lockname.name,
+                            res->owner, new_master);
                         list_del_init(&res->recovering);
                         spin_lock(&res->spinlock);
                         /* new_master has our reference from
@@ -2105,40 +2115,30 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
         for (i = 0; i < DLM_HASH_BUCKETS; i++) {
                 bucket = dlm_lockres_hash(dlm, i);
                 hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
-                       if (res->state & DLM_LOCK_RES_RECOVERING) {
-                               if (res->owner == dead_node) {
-                                       mlog(0, "(this=%u) res %.*s owner=%u "
-                                            "was not on recovering list, but "
-                                            "clearing state anyway\n",
-                                            dlm->node_num, res->lockname.len,
-                                            res->lockname.name, new_master);
-                               } else if (res->owner == dlm->node_num) {
-                                       mlog(0, "(this=%u) res %.*s owner=%u "
-                                            "was not on recovering list, "
-                                            "owner is THIS node, clearing\n",
-                                            dlm->node_num, res->lockname.len,
-                                            res->lockname.name, new_master);
-                               } else
-                                       continue;
+                       if (!(res->state & DLM_LOCK_RES_RECOVERING))
+                               continue;
  
-                               if (!list_empty(&res->recovering)) {
-                                       mlog(0, "%s:%.*s: lockres was "
-                                            "marked RECOVERING, owner=%u\n",
-                                            dlm->name, res->lockname.len,
-                                            res->lockname.name, res->owner);
-                                       list_del_init(&res->recovering);
-                                       dlm_lockres_put(res);
-                               }
-                               spin_lock(&res->spinlock);
-                               /* new_master has our reference from
-                                * the lock state sent during recovery */
-                               dlm_change_lockres_owner(dlm, res, new_master);
-                               res->state &= ~DLM_LOCK_RES_RECOVERING;
-                               if (__dlm_lockres_has_locks(res))
-                                       __dlm_dirty_lockres(dlm, res);
-                               spin_unlock(&res->spinlock);
-                               wake_up(&res->wq);
+                       if (res->owner != dead_node &&
+                           res->owner != dlm->node_num)
+                               continue;
+
+                       if (!list_empty(&res->recovering)) {
+                               list_del_init(&res->recovering);
+                               dlm_lockres_put(res);
                         }
+
+                       /* new_master has our reference from
+                        * the lock state sent during recovery */
+                       mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
+                            dlm->name, res->lockname.len, res->lockname.name,
+                            res->owner, new_master);
+                       spin_lock(&res->spinlock);
+                       dlm_change_lockres_owner(dlm, res, new_master);
+                       res->state &= ~DLM_LOCK_RES_RECOVERING;
+                       if (__dlm_lockres_has_locks(res))
+                               __dlm_dirty_lockres(dlm, res);
+                       spin_unlock(&res->spinlock);
+                       wake_up(&res->wq);
                 }
         }
  }
@@ -2252,12 +2252,12 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
                              res->lockname.len, res->lockname.name, freed, dead_node);
                         __dlm_print_one_lock_resource(res);
                 }
-               dlm_lockres_clear_refmap_bit(dead_node, res);
+               dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
         } else if (test_bit(dead_node, res->refmap)) {
                 mlog(0, "%s:%.*s: dead node %u had a ref, but had "
                      "no locks and had not purged before dying\n", dlm->name,
                      res->lockname.len, res->lockname.name, dead_node);
-               dlm_lockres_clear_refmap_bit(dead_node, res);
+               dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
         }
  
         /* do not kick thread yet */
@@ -2324,9 +2324,9 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
                         dlm_revalidate_lvb(dlm, res, dead_node);
                         if (res->owner == dead_node) {
                                 if (res->state & DLM_LOCK_RES_DROPPING_REF) {
-                                       mlog(ML_NOTICE, "Ignore %.*s for "
+                                       mlog(ML_NOTICE, "%s: res %.*s, Skip "
                                              "recovery as it is being freed\n",
-                                            res->lockname.len,
+                                            dlm->name, res->lockname.len,
                                              res->lockname.name);
                                 } else
                                         dlm_move_lockres_to_recovery_list(dlm,
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c

index 1d6d1d22c4715e3c89bef69570916cc5bf44c259..e73c833fc2a1a97cac35903f0439115cef813c69 100644 (file)
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -94,24 +94,26 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
  {
         int bit;
  
+       assert_spin_locked(&res->spinlock);
+
         if (__dlm_lockres_has_locks(res))
                 return 0;
  
+       /* Locks are in the process of being created */
+       if (res->inflight_locks)
+               return 0;
+
         if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
                 return 0;
  
         if (res->state & DLM_LOCK_RES_RECOVERING)
                 return 0;
  
+       /* Another node has this resource with this node as the master */
         bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
         if (bit < O2NM_MAX_NODES)
                 return 0;
  
-       /*
-        * since the bit for dlm->node_num is not set, inflight_locks better
-        * be zero
-        */
-       BUG_ON(res->inflight_locks != 0);
         return 1;
  }
  
@@ -185,8 +187,6 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
                 /* clear our bit from the master's refmap, ignore errors */
                 ret = dlm_drop_lockres_ref(dlm, res);
                 if (ret < 0) {
-                       mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name,
-                            res->lockname.len, res->lockname.name, ret);
                         if (!dlm_is_host_down(ret))
                                 BUG();
                 }
@@ -209,7 +209,7 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
                 BUG();
         }
  
-       __dlm_unhash_lockres(res);
+       __dlm_unhash_lockres(dlm, res);
  
         /* lockres is not in the hash now.  drop the flag and wake up
          * any processes waiting in dlm_get_lock_resource. */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c

index e1ed5e502ff25dc8afe39de464949ba13c2f9892..81a4cd22f80be84a06eac2b0fbf4348385d76262 100644 (file)
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1692,7 +1692,7 @@ int ocfs2_open_lock(struct inode *inode)
         mlog(0, "inode %llu take PRMODE open lock\n",
              (unsigned long long)OCFS2_I(inode)->ip_blkno);
  
-       if (ocfs2_mount_local(osb))
+       if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb))
                 goto out;
  
         lockres = &OCFS2_I(inode)->ip_open_lockres;
@@ -1718,6 +1718,12 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
              (unsigned long long)OCFS2_I(inode)->ip_blkno,
              write ? "EXMODE" : "PRMODE");
  
+       if (ocfs2_is_hard_readonly(osb)) {
+               if (write)
+                       status = -EROFS;
+               goto out;
+       }
+
         if (ocfs2_mount_local(osb))
                 goto out;
  
@@ -2298,7 +2304,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
         if (ocfs2_is_hard_readonly(osb)) {
                 if (ex)
                         status = -EROFS;
-               goto bail;
+               goto getbh;
         }
  
         if (ocfs2_mount_local(osb))
@@ -2356,7 +2362,7 @@ local:
                         mlog_errno(status);
                 goto bail;
         }
-
+getbh:
         if (ret_bh) {
                 status = ocfs2_assign_bh(inode, ret_bh, local_bh);
                 if (status < 0) {
@@ -2628,8 +2634,11 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex)
  
         BUG_ON(!dl);
  
-       if (ocfs2_is_hard_readonly(osb))
-               return -EROFS;
+       if (ocfs2_is_hard_readonly(osb)) {
+               if (ex)
+                       return -EROFS;
+               return 0;
+       }
  
         if (ocfs2_mount_local(osb))
                 return 0;
@@ -2647,7 +2656,7 @@ void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
         struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
         struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
  
-       if (!ocfs2_mount_local(osb))
+       if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
                 ocfs2_cluster_unlock(osb, &dl->dl_lockres, level);
  }
  
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c

index 23457b491e8ce53ac5b71d9cd5fc2a5e1400a07f..2f5b92ef0e533146007b49d21dd705a242125dc5 100644 (file)
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -832,6 +832,102 @@ out:
         return ret;
  }
  
+int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin)
+{
+       struct inode *inode = file->f_mapping->host;
+       int ret;
+       unsigned int is_last = 0, is_data = 0;
+       u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+       u32 cpos, cend, clen, hole_size;
+       u64 extoff, extlen;
+       struct buffer_head *di_bh = NULL;
+       struct ocfs2_extent_rec rec;
+
+       BUG_ON(origin != SEEK_DATA && origin != SEEK_HOLE);
+
+       ret = ocfs2_inode_lock(inode, &di_bh, 0);
+       if (ret) {
+               mlog_errno(ret);
+               goto out;
+       }
+
+       down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+       if (*offset >= inode->i_size) {
+               ret = -ENXIO;
+               goto out_unlock;
+       }
+
+       if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+               if (origin == SEEK_HOLE)
+                       *offset = inode->i_size;
+               goto out_unlock;
+       }
+
+       clen = 0;
+       cpos = *offset >> cs_bits;
+       cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size);
+
+       while (cpos < cend && !is_last) {
+               ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size,
+                                                &rec, &is_last);
+               if (ret) {
+                       mlog_errno(ret);
+                       goto out_unlock;
+               }
+
+               extoff = cpos;
+               extoff <<= cs_bits;
+
+               if (rec.e_blkno == 0ULL) {
+                       clen = hole_size;
+                       is_data = 0;
+               } else {
+                       clen = le16_to_cpu(rec.e_leaf_clusters) -
+                               (cpos - le32_to_cpu(rec.e_cpos));
+                       is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ?  0 : 1;
+               }
+
+               if ((!is_data && origin == SEEK_HOLE) ||
+                   (is_data && origin == SEEK_DATA)) {
+                       if (extoff > *offset)
+                               *offset = extoff;
+                       goto out_unlock;
+               }
+
+               if (!is_last)
+                       cpos += clen;
+       }
+
+       if (origin == SEEK_HOLE) {
+               extoff = cpos;
+               extoff <<= cs_bits;
+               extlen = clen;
+               extlen <<=  cs_bits;
+
+               if ((extoff + extlen) > inode->i_size)
+                       extlen = inode->i_size - extoff;
+               extoff += extlen;
+               if (extoff > *offset)
+                       *offset = extoff;
+               goto out_unlock;
+       }
+
+       ret = -ENXIO;
+
+out_unlock:
+
+       brelse(di_bh);
+
+       up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+       ocfs2_inode_unlock(inode, 0);
+out:
+       if (ret && ret != -ENXIO)
+               ret = -ENXIO;
+       return ret;
+}
+
  int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
                            struct buffer_head *bhs[], int flags,
                            int (*validate)(struct super_block *sb,
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h

index e79d41c2c90972fe801a99ec040b229a92debe98..67ea57d2fd594da7e456c1103bb1652fa68b5f69 100644 (file)
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -53,6 +53,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
  int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                  u64 map_start, u64 map_len);
  
+int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin);
+
  int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
                              u32 *p_cluster, u32 *num_clusters,
                              struct ocfs2_extent_list *el,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c

index de4ea1af041b654f8f1b4f1a000f6492226f701f..6e396683c3d48af7321f77b8658754982586cb35 100644 (file)
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1950,6 +1950,9 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
         if (ret < 0)
                 mlog_errno(ret);
  
+       if (file->f_flags & O_SYNC)
+               handle->h_sync = 1;
+
         ocfs2_commit_trans(osb, handle);
  
  out_inode_unlock:
@@ -2052,6 +2055,23 @@ out:
         return ret;
  }
  
+static void ocfs2_aiodio_wait(struct inode *inode)
+{
+       wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
+
+       wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
+}
+
+static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
+{
+       int blockmask = inode->i_sb->s_blocksize - 1;
+       loff_t final_size = pos + count;
+
+       if ((pos & blockmask) || (final_size & blockmask))
+               return 1;
+       return 0;
+}
+
  static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
                                             struct file *file,
                                             loff_t pos, size_t count,
@@ -2230,6 +2250,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
         int full_coherency = !(osb->s_mount_opt &
                                OCFS2_MOUNT_COHERENCY_BUFFERED);
+       int unaligned_dio = 0;
  
         trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
                 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2297,6 +2318,10 @@ relock:
                 goto out;
         }
  
+       if (direct_io && !is_sync_kiocb(iocb))
+               unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
+                                                     *ppos);
+
         /*
          * We can't complete the direct I/O as requested, fall back to
          * buffered I/O.
@@ -2311,6 +2336,18 @@ relock:
                 goto relock;
         }
  
+       if (unaligned_dio) {
+               /*
+                * Wait on previous unaligned aio to complete before
+                * proceeding.
+                */
+               ocfs2_aiodio_wait(inode);
+
+               /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
+               atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
+               ocfs2_iocb_set_unaligned_aio(iocb);
+       }
+
         /*
          * To later detect whether a journal commit for sync writes is
          * necessary, we sample i_size, and cluster count here.
@@ -2382,8 +2419,12 @@ out_dio:
         if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
                 rw_level = -1;
                 have_alloc_sem = 0;
+               unaligned_dio = 0;
         }
  
+       if (unaligned_dio)
+               atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
+
  out:
         if (rw_level != -1)
                 ocfs2_rw_unlock(inode, rw_level);
@@ -2591,6 +2632,57 @@ bail:
         return ret;
  }
  
+/* Refer generic_file_llseek_unlocked() */
+static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin)
+{
+       struct inode *inode = file->f_mapping->host;
+       int ret = 0;
+
+       mutex_lock(&inode->i_mutex);
+
+       switch (origin) {
+       case SEEK_SET:
+               break;
+       case SEEK_END:
+               offset += inode->i_size;
+               break;
+       case SEEK_CUR:
+               if (offset == 0) {
+                       offset = file->f_pos;
+                       goto out;
+               }
+               offset += file->f_pos;
+               break;
+       case SEEK_DATA:
+       case SEEK_HOLE:
+               ret = ocfs2_seek_data_hole_offset(file, &offset, origin);
+               if (ret)
+                       goto out;
+               break;
+       default:
+               ret = -EINVAL;
+               goto out;
+       }
+
+       if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
+               ret = -EINVAL;
+       if (!ret && offset > inode->i_sb->s_maxbytes)
+               ret = -EINVAL;
+       if (ret)
+               goto out;
+
+       if (offset != file->f_pos) {
+               file->f_pos = offset;
+               file->f_version = 0;
+       }
+
+out:
+       mutex_unlock(&inode->i_mutex);
+       if (ret)
+               return ret;
+       return offset;
+}
+
  const struct inode_operations ocfs2_file_iops = {
         .setattr        = ocfs2_setattr,
         .getattr        = ocfs2_getattr,
@@ -2615,7 +2707,7 @@ const struct inode_operations ocfs2_special_file_iops = {
   * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
   */
  const struct file_operations ocfs2_fops = {
-       .llseek         = generic_file_llseek,
+       .llseek         = ocfs2_file_llseek,
         .read           = do_sync_read,
         .write          = do_sync_write,
         .mmap           = ocfs2_mmap,
@@ -2663,7 +2755,7 @@ const struct file_operations ocfs2_dops = {
   * the cluster.
   */
  const struct file_operations ocfs2_fops_no_plocks = {
-       .llseek         = generic_file_llseek,
+       .llseek         = ocfs2_file_llseek,
         .read           = do_sync_read,
         .write          = do_sync_write,
         .mmap           = ocfs2_mmap,
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c

index a22d2c098890a9ca67e2056976bfc9869d74a325..17454a904d7bf488093de9f3db61dc529e0f8e3c 100644 (file)
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -951,7 +951,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
         trace_ocfs2_cleanup_delete_inode(
                 (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
         if (sync_data)
-               write_inode_now(inode, 1);
+               filemap_write_and_wait(inode->i_mapping);
         truncate_inode_pages(&inode->i_data, 0);
  }
  
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h

index 1c508b149b3ac1bd4325fd33a9aae6bdb70e024a..88924a3133fae7c15ca3f5a5259b64eecd97022e 100644 (file)
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -43,6 +43,9 @@ struct ocfs2_inode_info
         /* protects extended attribute changes on this inode */
         struct rw_semaphore             ip_xattr_sem;
  
+       /* Number of outstanding AIO's which are not page aligned */
+       atomic_t                        ip_unaligned_aio;
+
         /* These fields are protected by ip_lock */
         spinlock_t                      ip_lock;
         u32                             ip_open_count;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c

index bc91072b72196fd335c4b7cbc02ba08cb67254e6..726ff265b296bc3365cfe46e94588c1ee4f6ed6a 100644 (file)
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -122,7 +122,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
         if ((oldflags & OCFS2_IMMUTABLE_FL) || ((flags ^ oldflags) &
                 (OCFS2_APPEND_FL | OCFS2_IMMUTABLE_FL))) {
                 if (!capable(CAP_LINUX_IMMUTABLE))
-                       goto bail_unlock;
+                       goto bail_commit;
         }
  
         ocfs2_inode->ip_attr = flags;
@@ -132,6 +132,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
         if (status < 0)
                 mlog_errno(status);
  
+bail_commit:
         ocfs2_commit_trans(osb, handle);
  bail_unlock:
         ocfs2_inode_unlock(inode, 1);
@@ -381,7 +382,7 @@ int ocfs2_info_handle_freeinode(struct inode *inode,
         if (!oifi) {
                 status = -ENOMEM;
                 mlog_errno(status);
-               goto bail;
+               goto out_err;
         }
  
         if (o2info_from_user(*oifi, req))
@@ -431,7 +432,7 @@ bail:
                 o2info_set_request_error(&oifi->ifi_req, req);
  
         kfree(oifi);
-
+out_err:
         return status;
  }
  
@@ -666,7 +667,7 @@ int ocfs2_info_handle_freefrag(struct inode *inode,
         if (!oiff) {
                 status = -ENOMEM;
                 mlog_errno(status);
-               goto bail;
+               goto out_err;
         }
  
         if (o2info_from_user(*oiff, req))
@@ -716,7 +717,7 @@ bail:
                 o2info_set_request_error(&oiff->iff_req, req);
  
         kfree(oiff);
-
+out_err:
         return status;
  }
  
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c

index 295d56454e8b23b6e9d97a6bd258e6170311c8f5..0a42ae96dca7d4a0f662505e0e51206895ce4156 100644 (file)
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1544,9 +1544,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
         /* we need to run complete recovery for offline orphan slots */
         ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
  
-       mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n",
-            node_num, slot_num,
-            MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
+       printk(KERN_NOTICE "ocfs2: Begin replay journal (node %d, slot %d) on "\
+              "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
+              MINOR(osb->sb->s_dev));
  
         OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
  
@@ -1601,6 +1601,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
  
         jbd2_journal_destroy(journal);
  
+       printk(KERN_NOTICE "ocfs2: End replay journal (node %d, slot %d) on "\
+              "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
+              MINOR(osb->sb->s_dev));
  done:
         /* drop the lock on this nodes journal */
         if (got_lock)
@@ -1808,6 +1811,20 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void)
   * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This
   * is done to catch any orphans that are left over in orphan directories.
   *
+ * It scans all slots, even ones that are in use. It does so to handle the
+ * case described below:
+ *
+ *   Node 1 has an inode it was using. The dentry went away due to memory
+ *   pressure.  Node 1 closes the inode, but it's on the free list. The node
+ *   has the open lock.
+ *   Node 2 unlinks the inode. It grabs the dentry lock to notify others,
+ *   but node 1 has no dentry and doesn't get the message. It trylocks the
+ *   open lock, sees that another node has a PR, and does nothing.
+ *   Later node 2 runs its orphan dir. It igets the inode, trylocks the
+ *   open lock, sees the PR still, and does nothing.
+ *   Basically, we have to trigger an orphan iput on node 1. The only way
+ *   for this to happen is if node 1 runs node 2's orphan dir.
+ *
   * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT
   * seconds.  It gets an EX lock on os_lockres and checks sequence number
   * stored in LVB. If the sequence number has changed, it means some other
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h

index 68cf2f6d3c6a40b22dda3e2d4f7e2d44349167fa..a3385b63ff5e542bcfaabe59744fa3af38b537e4 100644 (file)
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -441,10 +441,11 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
  #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
  
  /* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
- * update on dir + index leaf + dx root update for free list */
+ * update on dir + index leaf + dx root update for free list +
+ * previous dirblock update in the free list */
  static inline int ocfs2_link_credits(struct super_block *sb)
  {
-       return 2*OCFS2_INODE_UPDATE_CREDITS + 3 +
+       return 2*OCFS2_INODE_UPDATE_CREDITS + 4 +
                ocfs2_quota_trans_credits(sb);
  }
  
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c

index 3e9393ca39ebd823772ae910cda1b4516d46fa62..9cd41083e99123eca1c48085fb39809e6b906b40 100644 (file)
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -61,7 +61,7 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
  static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
                                 struct page *page)
  {
-       int ret;
+       int ret = VM_FAULT_NOPAGE;
         struct inode *inode = file->f_path.dentry->d_inode;
         struct address_space *mapping = inode->i_mapping;
         loff_t pos = page_offset(page);
@@ -71,32 +71,25 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
         void *fsdata;
         loff_t size = i_size_read(inode);
  
-       /*
-        * Another node might have truncated while we were waiting on
-        * cluster locks.
-        * We don't check size == 0 before the shift. This is borrowed
-        * from do_generic_file_read.
-        */
         last_index = (size - 1) >> PAGE_CACHE_SHIFT;
-       if (unlikely(!size || page->index > last_index)) {
-               ret = -EINVAL;
-               goto out;
-       }
  
         /*
-        * The i_size check above doesn't catch the case where nodes
-        * truncated and then re-extended the file. We'll re-check the
-        * page mapping after taking the page lock inside of
-        * ocfs2_write_begin_nolock().
+        * There are cases that lead to the page no longer bebongs to the
+        * mapping.
+        * 1) pagecache truncates locally due to memory pressure.
+        * 2) pagecache truncates when another is taking EX lock against 
+        * inode lock. see ocfs2_data_convert_worker.
+        * 
+        * The i_size check doesn't catch the case where nodes truncated and
+        * then re-extended the file. We'll re-check the page mapping after
+        * taking the page lock inside of ocfs2_write_begin_nolock().
+        *
+        * Let VM retry with these cases.
          */
-       if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
-               /*
-                * the page has been umapped in ocfs2_data_downconvert_worker.
-                * So return 0 here and let VFS retry.
-                */
-               ret = 0;
+       if ((page->mapping != inode->i_mapping) ||
+           (!PageUptodate(page)) ||
+           (page_offset(page) >= size))
                 goto out;
-       }
  
         /*
          * Call ocfs2_write_begin() and ocfs2_write_end() to take
@@ -116,17 +109,21 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
         if (ret) {
                 if (ret != -ENOSPC)
                         mlog_errno(ret);
+               if (ret == -ENOMEM)
+                       ret = VM_FAULT_OOM;
+               else
+                       ret = VM_FAULT_SIGBUS;
                 goto out;
         }
  
-       ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
-                                    fsdata);
-       if (ret < 0) {
-               mlog_errno(ret);
+       if (!locked_page) {
+               ret = VM_FAULT_NOPAGE;
                 goto out;
         }
+       ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
+                                    fsdata);
         BUG_ON(ret != len);
-       ret = 0;
+       ret = VM_FAULT_LOCKED;
  out:
         return ret;
  }
@@ -168,8 +165,6 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
  
  out:
         ocfs2_unblock_signals(&oldset);
-       if (ret)
-               ret = VM_FAULT_SIGBUS;
         return ret;
  }
  
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c

index d53cb706f14c27dfc5ec754dc6d0f846e424421c..184c76b8c293907368f325316da04f57801d87eb 100644 (file)
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -745,7 +745,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
          */
         ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
                                 new_phys_cpos);
-       if (!new_phys_cpos) {
+       if (!*new_phys_cpos) {
                 ret = -ENOSPC;
                 goto out_commit;
         }
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h

index 409285854f647e2357223bc7a8d24c36b376a6bb..d355e6e36b366bfe7dc8cc91cbabdad05976ba2c 100644 (file)
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -836,18 +836,65 @@ static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
  
  static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
  {
-       __test_and_set_bit_le(bit, bitmap);
+       __set_bit_le(bit, bitmap);
  }
  #define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
  
  static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
  {
-       __test_and_clear_bit_le(bit, bitmap);
+       __clear_bit_le(bit, bitmap);
  }
  #define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
  
  #define ocfs2_test_bit test_bit_le
  #define ocfs2_find_next_zero_bit find_next_zero_bit_le
  #define ocfs2_find_next_bit find_next_bit_le
+
+static inline void *correct_addr_and_bit_unaligned(int *bit, void *addr)
+{
+#if BITS_PER_LONG == 64
+       *bit += ((unsigned long) addr & 7UL) << 3;
+       addr = (void *) ((unsigned long) addr & ~7UL);
+#elif BITS_PER_LONG == 32
+       *bit += ((unsigned long) addr & 3UL) << 3;
+       addr = (void *) ((unsigned long) addr & ~3UL);
+#else
+#error "how many bits you are?!"
+#endif
+       return addr;
+}
+
+static inline void ocfs2_set_bit_unaligned(int bit, void *bitmap)
+{
+       bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+       ocfs2_set_bit(bit, bitmap);
+}
+
+static inline void ocfs2_clear_bit_unaligned(int bit, void *bitmap)
+{
+       bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+       ocfs2_clear_bit(bit, bitmap);
+}
+
+static inline int ocfs2_test_bit_unaligned(int bit, void *bitmap)
+{
+       bitmap = correct_addr_and_bit_unaligned(&bit, bitmap);
+       return ocfs2_test_bit(bit, bitmap);
+}
+
+static inline int ocfs2_find_next_zero_bit_unaligned(void *bitmap, int max,
+                                                       int start)
+{
+       int fix = 0, ret, tmpmax;
+       bitmap = correct_addr_and_bit_unaligned(&fix, bitmap);
+       tmpmax = max + fix;
+       start += fix;
+
+       ret = ocfs2_find_next_zero_bit(bitmap, tmpmax, start) - fix;
+       if (ret > max)
+               return max;
+       return ret;
+}
+
  #endif  /* OCFS2_H */
  
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c

index dc8007fc924718c6d461b22cee52e4a8fbd6ae7d..f100bf70a9066ed1b917b8ec0451c27231840f92 100644 (file)
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -404,7 +404,9 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
         int status = 0;
         struct ocfs2_quota_recovery *rec;
  
-       mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num);
+       printk(KERN_NOTICE "ocfs2: Beginning quota recovery on device (%s) for "
+              "slot %u\n", osb->dev_str, slot_num);
+
         rec = ocfs2_alloc_quota_recovery();
         if (!rec)
                 return ERR_PTR(-ENOMEM);
@@ -549,8 +551,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
                                 goto out_commit;
                         }
                         lock_buffer(qbh);
-                       WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap));
-                       ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
+                       WARN_ON(!ocfs2_test_bit_unaligned(bit, dchunk->dqc_bitmap));
+                       ocfs2_clear_bit_unaligned(bit, dchunk->dqc_bitmap);
                         le32_add_cpu(&dchunk->dqc_free, 1);
                         unlock_buffer(qbh);
                         ocfs2_journal_dirty(handle, qbh);
@@ -596,7 +598,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
         struct inode *lqinode;
         unsigned int flags;
  
-       mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num);
+       printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for "
+              "slot %u\n", osb->dev_str, slot_num);
+
         mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
         for (type = 0; type < MAXQUOTAS; type++) {
                 if (list_empty(&(rec->r_list[type])))
@@ -612,8 +616,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
                 /* Someone else is holding the lock? Then he must be
                  * doing the recovery. Just skip the file... */
                 if (status == -EAGAIN) {
-                       mlog(ML_NOTICE, "skipping quota recovery for slot %d "
-                            "because quota file is locked.\n", slot_num);
+                       printk(KERN_NOTICE "ocfs2: Skipping quota recovery on "
+                              "device (%s) for slot %d because quota file is "
+                              "locked.\n", osb->dev_str, slot_num);
                         status = 0;
                         goto out_put;
                 } else if (status < 0) {
@@ -944,7 +949,7 @@ static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
                       * ol_quota_entries_per_block(sb);
         }
  
-       found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0);
+       found = ocfs2_find_next_zero_bit_unaligned(dchunk->dqc_bitmap, len, 0);
         /* We failed? */
         if (found == len) {
                 mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
@@ -1208,7 +1213,7 @@ static void olq_alloc_dquot(struct buffer_head *bh, void *private)
         struct ocfs2_local_disk_chunk *dchunk;
  
         dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
-       ocfs2_set_bit(*offset, dchunk->dqc_bitmap);
+       ocfs2_set_bit_unaligned(*offset, dchunk->dqc_bitmap);
         le32_add_cpu(&dchunk->dqc_free, -1);
  }
  
@@ -1289,7 +1294,7 @@ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
                         (od->dq_chunk->qc_headerbh->b_data);
         /* Mark structure as freed */
         lock_buffer(od->dq_chunk->qc_headerbh);
-       ocfs2_clear_bit(offset, dchunk->dqc_bitmap);
+       ocfs2_clear_bit_unaligned(offset, dchunk->dqc_bitmap);
         le32_add_cpu(&dchunk->dqc_free, 1);
         unlock_buffer(od->dq_chunk->qc_headerbh);
         ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c

index 26fc0014d50936137d0afb898b7c9541e8026087..1424c151cccce0170819ce4e0f36dad7d97461b8 100644 (file)
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -493,8 +493,8 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
                         goto bail;
                 }
         } else
-               mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
-                    slot);
+               printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already "
+                      "allocated to this node!\n", slot, osb->dev_str);
  
         ocfs2_set_slot(si, slot, osb->node_num);
         osb->slot_num = slot;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c

index 19965b00c43caee7df4e09428775a55150ba9f8c..94368017edb378ce1e3961d1976408c954677c9a 100644 (file)
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -28,6 +28,7 @@
  #include "cluster/masklog.h"
  #include "cluster/nodemanager.h"
  #include "cluster/heartbeat.h"
+#include "cluster/tcp.h"
  
  #include "stackglue.h"
  
@@ -255,6 +256,61 @@ static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
         dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
  }
  
+/*
+ * Check if this node is heartbeating and is connected to all other
+ * heartbeating nodes.
+ */
+static int o2cb_cluster_check(void)
+{
+       u8 node_num;
+       int i;
+       unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+       unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
+       node_num = o2nm_this_node();
+       if (node_num == O2NM_MAX_NODES) {
+               printk(KERN_ERR "o2cb: This node has not been configured.\n");
+               return -EINVAL;
+       }
+
+       /*
+        * o2dlm expects o2net sockets to be created. If not, then
+        * dlm_join_domain() fails with a stack of errors which are both cryptic
+        * and incomplete. The idea here is to detect upfront whether we have
+        * managed to connect to all nodes or not. If not, then list the nodes
+        * to allow the user to check the configuration (incorrect IP, firewall,
+        * etc.) Yes, this is racy. But its not the end of the world.
+        */
+#define        O2CB_MAP_STABILIZE_COUNT        60
+       for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) {
+               o2hb_fill_node_map(hbmap, sizeof(hbmap));
+               if (!test_bit(node_num, hbmap)) {
+                       printk(KERN_ERR "o2cb: %s heartbeat has not been "
+                              "started.\n", (o2hb_global_heartbeat_active() ?
+                                             "Global" : "Local"));
+                       return -EINVAL;
+               }
+               o2net_fill_node_map(netmap, sizeof(netmap));
+               /* Force set the current node to allow easy compare */
+               set_bit(node_num, netmap);
+               if (!memcmp(hbmap, netmap, sizeof(hbmap)))
+                       return 0;
+               if (i < O2CB_MAP_STABILIZE_COUNT)
+                       msleep(1000);
+       }
+
+       printk(KERN_ERR "o2cb: This node could not connect to nodes:");
+       i = -1;
+       while ((i = find_next_bit(hbmap, O2NM_MAX_NODES,
+                                 i + 1)) < O2NM_MAX_NODES) {
+               if (!test_bit(i, netmap))
+                       printk(" %u", i);
+       }
+       printk(".\n");
+
+       return -ENOTCONN;
+}
+
  /*
   * Called from the dlm when it's about to evict a node. This is how the
   * classic stack signals node death.
@@ -263,8 +319,8 @@ static void o2dlm_eviction_cb(int node_num, void *data)
  {
         struct ocfs2_cluster_connection *conn = data;
  
-       mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n",
-            node_num, conn->cc_namelen, conn->cc_name);
+       printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n",
+              node_num, conn->cc_namelen, conn->cc_name);
  
         conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
  }
@@ -280,12 +336,11 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
         BUG_ON(conn == NULL);
         BUG_ON(conn->cc_proto == NULL);
  
-       /* for now we only have one cluster/node, make sure we see it
-        * in the heartbeat universe */
-       if (!o2hb_check_local_node_heartbeating()) {
-               if (o2hb_global_heartbeat_active())
-                       mlog(ML_ERROR, "Global heartbeat not started\n");
-               rc = -EINVAL;
+       /* Ensure cluster stack is up and all nodes are connected */
+       rc = o2cb_cluster_check();
+       if (rc) {
+               printk(KERN_ERR "o2cb: Cluster check failed. Fix errors "
+                      "before retrying.\n");
                 goto out;
         }
  
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c

index 56f61027236b696fce1ccde3e1edaf86acee59a0..4994f8b0e60410ff576fa63299e29e430192080a 100644 (file)
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -54,6 +54,7 @@
  #include "ocfs1_fs_compat.h"
  
  #include "alloc.h"
+#include "aops.h"
  #include "blockcheck.h"
  #include "dlmglue.h"
  #include "export.h"
@@ -1107,9 +1108,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
  
                 ocfs2_set_ro_flag(osb, 1);
  
-               printk(KERN_NOTICE "Readonly device detected. No cluster "
-                      "services will be utilized for this mount. Recovery "
-                      "will be skipped.\n");
+               printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. "
+                      "Cluster services will not be used for this mount. "
+                      "Recovery will be skipped.\n", osb->dev_str);
         }
  
         if (!ocfs2_is_hard_readonly(osb)) {
@@ -1616,12 +1617,17 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
         return 0;
  }
  
+wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
+
  static int __init ocfs2_init(void)
  {
-       int status;
+       int status, i;
  
         ocfs2_print_version();
  
+       for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
+               init_waitqueue_head(&ocfs2__ioend_wq[i]);
+
         status = init_ocfs2_uptodate_cache();
         if (status < 0) {
                 mlog_errno(status);
@@ -1760,7 +1766,7 @@ static void ocfs2_inode_init_once(void *data)
         ocfs2_extent_map_init(&oi->vfs_inode);
         INIT_LIST_HEAD(&oi->ip_io_markers);
         oi->ip_dir_start_lookup = 0;
-
+       atomic_set(&oi->ip_unaligned_aio, 0);
         init_rwsem(&oi->ip_alloc_sem);
         init_rwsem(&oi->ip_xattr_sem);
         mutex_init(&oi->ip_io_mutex);
@@ -1974,7 +1980,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
          * If we failed before we got a uuid_str yet, we can't stop
          * heartbeat.  Otherwise, do it.
          */
-       if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str)
+       if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str &&
+           !ocfs2_is_hard_readonly(osb))
                 hangup_needed = 1;
  
         if (osb->cconn)
@@ -2353,7 +2360,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
                 mlog_errno(status);
                 goto bail;
         }
-       cleancache_init_shared_fs((char *)&uuid_net_key, sb);
+       cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb);
  
  bail:
         return status;
@@ -2462,8 +2469,8 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
                         goto finally;
                 }
         } else {
-               mlog(ML_NOTICE, "File system was not unmounted cleanly, "
-                    "recovering volume.\n");
+               printk(KERN_NOTICE "ocfs2: File system on device (%s) was not "
+                      "unmounted cleanly, recovering it.\n", osb->dev_str);
         }
  
         local = ocfs2_mount_local(osb);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c

index 194fb22ef79d590580f3245b522d0b095ef3794c..aa9e8777b09a5e345b081b0e495db378485a30e7 100644 (file)
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2376,16 +2376,18 @@ static int ocfs2_remove_value_outside(struct inode*inode,
                 }
  
                 ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
-               if (ret < 0) {
-                       mlog_errno(ret);
-                       break;
-               }
  
                 ocfs2_commit_trans(osb, ctxt.handle);
                 if (ctxt.meta_ac) {
                         ocfs2_free_alloc_context(ctxt.meta_ac);
                         ctxt.meta_ac = NULL;
                 }
+
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       break;
+               }
+
         }
  
         if (ctxt.meta_ac)
author	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 1 Dec 2011 22:55:34 +0000 (14:55 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 1 Dec 2011 22:55:34 +0000 (14:55 -0800)
fs/ocfs2/alloc.c		patch \| blob \| history
fs/ocfs2/aops.c		patch \| blob \| history
fs/ocfs2/aops.h		patch \| blob \| history
fs/ocfs2/cluster/heartbeat.c		patch \| blob \| history
fs/ocfs2/cluster/netdebug.c		patch \| blob \| history
fs/ocfs2/cluster/tcp.c		patch \| blob \| history
fs/ocfs2/cluster/tcp.h		patch \| blob \| history
fs/ocfs2/dir.c		patch \| blob \| history
fs/ocfs2/dlm/dlmcommon.h		patch \| blob \| history
fs/ocfs2/dlm/dlmdomain.c		patch \| blob \| history
fs/ocfs2/dlm/dlmlock.c		patch \| blob \| history
fs/ocfs2/dlm/dlmmaster.c		patch \| blob \| history
fs/ocfs2/dlm/dlmrecovery.c		patch \| blob \| history
fs/ocfs2/dlm/dlmthread.c		patch \| blob \| history
fs/ocfs2/dlmglue.c		patch \| blob \| history
fs/ocfs2/extent_map.c		patch \| blob \| history
fs/ocfs2/extent_map.h		patch \| blob \| history
fs/ocfs2/file.c		patch \| blob \| history
fs/ocfs2/inode.c		patch \| blob \| history
fs/ocfs2/inode.h		patch \| blob \| history
fs/ocfs2/ioctl.c		patch \| blob \| history
fs/ocfs2/journal.c		patch \| blob \| history
fs/ocfs2/journal.h		patch \| blob \| history
fs/ocfs2/mmap.c		patch \| blob \| history
fs/ocfs2/move_extents.c		patch \| blob \| history
fs/ocfs2/ocfs2.h		patch \| blob \| history
fs/ocfs2/quota_local.c		patch \| blob \| history
fs/ocfs2/slot_map.c		patch \| blob \| history
fs/ocfs2/stack_o2cb.c		patch \| blob \| history
fs/ocfs2/super.c		patch \| blob \| history
fs/ocfs2/xattr.c		patch \| blob \| history