Btrfs: eliminate races in worker stopping code
authorIlya Dryomov <idryomov@gmail.com>
Wed, 2 Oct 2013 16:39:50 +0000 (19:39 +0300)
committerJosef Bacik <jbacik@fusionio.com>
Fri, 4 Oct 2013 20:02:13 +0000 (16:02 -0400)
The current implementation of worker threads in Btrfs has races in
worker stopping code, which cause all kinds of panics and lockups when
running btrfs/011 xfstest in a loop.  The problem is that
btrfs_stop_workers is unsynchronized with respect to check_idle_worker,
check_busy_worker and __btrfs_start_workers.

E.g., check_idle_worker race flow:

       btrfs_stop_workers():            check_idle_worker(aworker):
- grabs the lock
- splices the idle list into the
  working list
- removes the first worker from the
  working list
- releases the lock to wait for
  its kthread's completion
                                  - grabs the lock
                                  - if aworker is on the working list,
                                    moves aworker from the working list
                                    to the idle list
                                  - releases the lock
- grabs the lock
- puts the worker
- removes the second worker from the
  working list
                              ......
        btrfs_stop_workers returns, aworker is on the idle list
                 FS is umounted, memory is freed
                              ......
              aworker is waken up, fireworks ensue

With this applied, I wasn't able to trigger the problem in 48 hours,
whereas previously I could reliably reproduce at least one of these
races within an hour.

Reported-by: David Sterba <dsterba@suse.cz>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
fs/btrfs/async-thread.c
fs/btrfs/async-thread.h

index 58b7d14b08ee1d47ec66a06243ee16f63854b676..08cc08f037a633199bffb3fadfa95750302eb25e 100644 (file)
@@ -107,7 +107,8 @@ static void check_idle_worker(struct btrfs_worker_thread *worker)
                worker->idle = 1;
 
                /* the list may be empty if the worker is just starting */
-               if (!list_empty(&worker->worker_list)) {
+               if (!list_empty(&worker->worker_list) &&
+                   !worker->workers->stopping) {
                        list_move(&worker->worker_list,
                                 &worker->workers->idle_list);
                }
@@ -127,7 +128,8 @@ static void check_busy_worker(struct btrfs_worker_thread *worker)
                spin_lock_irqsave(&worker->workers->lock, flags);
                worker->idle = 0;
 
-               if (!list_empty(&worker->worker_list)) {
+               if (!list_empty(&worker->worker_list) &&
+                   !worker->workers->stopping) {
                        list_move_tail(&worker->worker_list,
                                      &worker->workers->worker_list);
                }
@@ -412,6 +414,7 @@ void btrfs_stop_workers(struct btrfs_workers *workers)
        int can_stop;
 
        spin_lock_irq(&workers->lock);
+       workers->stopping = 1;
        list_splice_init(&workers->idle_list, &workers->worker_list);
        while (!list_empty(&workers->worker_list)) {
                cur = workers->worker_list.next;
@@ -455,6 +458,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
        workers->ordered = 0;
        workers->atomic_start_pending = 0;
        workers->atomic_worker_start = async_helper;
+       workers->stopping = 0;
 }
 
 /*
@@ -480,15 +484,19 @@ static int __btrfs_start_workers(struct btrfs_workers *workers)
        atomic_set(&worker->num_pending, 0);
        atomic_set(&worker->refs, 1);
        worker->workers = workers;
-       worker->task = kthread_run(worker_loop, worker,
-                                  "btrfs-%s-%d", workers->name,
-                                  workers->num_workers + 1);
+       worker->task = kthread_create(worker_loop, worker,
+                                     "btrfs-%s-%d", workers->name,
+                                     workers->num_workers + 1);
        if (IS_ERR(worker->task)) {
                ret = PTR_ERR(worker->task);
-               kfree(worker);
                goto fail;
        }
+
        spin_lock_irq(&workers->lock);
+       if (workers->stopping) {
+               spin_unlock_irq(&workers->lock);
+               goto fail_kthread;
+       }
        list_add_tail(&worker->worker_list, &workers->idle_list);
        worker->idle = 1;
        workers->num_workers++;
@@ -496,8 +504,13 @@ static int __btrfs_start_workers(struct btrfs_workers *workers)
        WARN_ON(workers->num_workers_starting < 0);
        spin_unlock_irq(&workers->lock);
 
+       wake_up_process(worker->task);
        return 0;
+
+fail_kthread:
+       kthread_stop(worker->task);
 fail:
+       kfree(worker);
        spin_lock_irq(&workers->lock);
        workers->num_workers_starting--;
        spin_unlock_irq(&workers->lock);
index 063698b90ce2dd0481663aeee46b30ef796d52e2..1f26792683edf195b48db80992c05acae6fd71f0 100644 (file)
@@ -107,6 +107,8 @@ struct btrfs_workers {
 
        /* extra name for this worker, used for current->name */
        char *name;
+
+       int stopping;
 };
 
 void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);