Merge tag 'dm-3.20-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device...
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 13 Feb 2015 00:36:31 +0000 (16:36 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 13 Feb 2015 00:36:31 +0000 (16:36 -0800)
Pull device mapper changes from Mike Snitzer:

 - The most significant change this cycle is that request-based DM now
   supports stacking on top of blk-mq devices.  This blk-mq support
   changes the model request-based DM uses to clone a request: the clone
   is now obtained by calling blk_get_request() directly on the
   underlying blk-mq device (a rough sketch of this model follows the
   list below).

   An early consumer of this code is Intel's emerging NVMe hardware;
   thanks to Keith Busch for working on, and pushing for, these changes.

 - A few small fixes and cleanups across other DM targets.
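
   As a rough illustration of the new model (this is not code from the
   series, and the use of ti->private is an assumption made for the
   example), a target's clone_and_map_rq hook would allocate the clone
   from the underlying device's queue along these lines:

        #include <linux/blkdev.h>
        #include <linux/device-mapper.h>

        /*
         * Sketch only: the target, not the DM core, allocates the clone
         * by calling blk_get_request() on the underlying blk-mq device's
         * queue.  Assumes the target's constructor stored the underlying
         * struct block_device in ti->private.
         */
        static int example_clone_and_map_rq(struct dm_target *ti,
                                            struct request *rq,
                                            union map_info *map_context,
                                            struct request **__clone)
        {
                struct block_device *bdev = ti->private;

                *__clone = blk_get_request(bdev_get_queue(bdev),
                                           rq_data_dir(rq), GFP_ATOMIC);
                if (IS_ERR(*__clone))
                        /* allocation failed: DM core requeues the original */
                        return DM_MAPIO_REQUEUE;

                /*
                 * DM core runs setup_clone() on *__clone and dispatches it;
                 * a matching release_clone_rq hook would blk_put_request()
                 * the clone once DM core is done with it.
                 */
                return DM_MAPIO_REMAPPED;
        }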

* tag 'dm-3.20-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
  dm: inherit QUEUE_FLAG_SG_GAPS flags from underlying queues
  dm snapshot: remove unnecessary NULL checks before vfree() calls
  dm mpath: simplify failure path of dm_multipath_init()
  dm thin metadata: remove unused dm_pool_get_data_block_size()
  dm ioctl: fix stale comment above dm_get_inactive_table()
  dm crypt: update url in CONFIG_DM_CRYPT help text
  dm bufio: fix time comparison to use time_after_eq()
  dm: use time_in_range() and time_after()
  dm raid: fix a couple integer overflows
  dm table: train hybrid target type detection to select blk-mq if appropriate
  dm: allocate requests in target when stacking on blk-mq devices
  dm: prepare for allocating blk-mq clone requests in target
  dm: submit stacked requests in irq enabled context
  dm: split request structure out from dm_rq_target_io structure
  dm: remove exports for request-based interfaces without external callers
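
  For the time-comparison entries above, the change (visible in the
  dm-cache and dm-thin hunks below) replaces open-coded jiffies
  comparisons, which go wrong when jiffies wraps, with the wrap-safe
  helpers from <linux/jiffies.h> such as time_in_range() and
  time_after_eq().  A minimal restatement of the need_commit_due_to_time()
  pattern (the function name here is illustrative):

        #include <linux/jiffies.h>

        /* Wrap-safe "has commit_period elapsed since last_commit_jiffies?",
         * as need_commit_due_to_time() now does in dm-cache and dm-thin. */
        static int commit_period_elapsed(unsigned long last_commit_jiffies,
                                         unsigned long commit_period)
        {
                return !time_in_range(jiffies, last_commit_jiffies,
                                      last_commit_jiffies + commit_period);
        }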

1  2 
drivers/md/Kconfig
drivers/md/dm-cache-target.c
drivers/md/dm-raid.c
drivers/md/dm-thin.c
drivers/md/dm.c

diff --combined drivers/md/Kconfig
index c355a226a0247c824770457179731bc05d3a0667,09c89a4b014d4bf1724a2fb238bedfc8d6b846fc..c39644478aa4e660f0ec2ddefedea4efbfd776b2
@@@ -5,7 -5,6 +5,7 @@@
  menuconfig MD
        bool "Multiple devices driver support (RAID and LVM)"
        depends on BLOCK
 +      select SRCU
        help
          Support multiple physical spindles through a single logical device.
          Required for RAID and logical volume management.
@@@ -231,9 -230,8 +231,8 @@@ config DM_CRYP
          transparently encrypts the data on it. You'll need to activate
          the ciphers you're going to use in the cryptoapi configuration.
  
-         Information on how to use dm-crypt can be found on
-         <http://www.saout.de/misc/dm-crypt/>
+         For further information on dm-crypt and userspace tools see:
+         <http://code.google.com/p/cryptsetup/wiki/DMCrypt>
  
          To compile this code as a module, choose M here: the module will
          be called dm-crypt.
index e1650539cc2f826d9efe7f878352570bcc31e101,2eca128a9d6ac301f591bd58d41378ed0f4bae03..7755af35186762a4319e8cff52d4e95b26524d3e
@@@ -11,6 -11,7 +11,7 @@@
  
  #include <linux/dm-io.h>
  #include <linux/dm-kcopyd.h>
+ #include <linux/jiffies.h>
  #include <linux/init.h>
  #include <linux/mempool.h>
  #include <linux/module.h>
@@@ -221,13 -222,7 +222,13 @@@ struct cache 
        struct list_head need_commit_migrations;
        sector_t migration_threshold;
        wait_queue_head_t migration_wait;
 -      atomic_t nr_migrations;
 +      atomic_t nr_allocated_migrations;
 +
 +      /*
 +       * The number of in flight migrations that are performing
 +       * background io. eg, promotion, writeback.
 +       */
 +      atomic_t nr_io_migrations;
  
        wait_queue_head_t quiescing_wait;
        atomic_t quiescing;
        struct dm_deferred_set *all_io_ds;
  
        mempool_t *migration_pool;
 -      struct dm_cache_migration *next_migration;
  
        struct dm_cache_policy *policy;
        unsigned policy_nr_args;
@@@ -355,31 -351,10 +356,31 @@@ static void free_prison_cell(struct cac
        dm_bio_prison_free_cell(cache->prison, cell);
  }
  
 +static struct dm_cache_migration *alloc_migration(struct cache *cache)
 +{
 +      struct dm_cache_migration *mg;
 +
 +      mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
 +      if (mg) {
 +              mg->cache = cache;
 +              atomic_inc(&mg->cache->nr_allocated_migrations);
 +      }
 +
 +      return mg;
 +}
 +
 +static void free_migration(struct dm_cache_migration *mg)
 +{
 +      if (atomic_dec_and_test(&mg->cache->nr_allocated_migrations))
 +              wake_up(&mg->cache->migration_wait);
 +
 +      mempool_free(mg, mg->cache->migration_pool);
 +}
 +
  static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
  {
        if (!p->mg) {
 -              p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
 +              p->mg = alloc_migration(cache);
                if (!p->mg)
                        return -ENOMEM;
        }
@@@ -408,7 -383,7 +409,7 @@@ static void prealloc_free_structs(struc
                free_prison_cell(cache, p->cell1);
  
        if (p->mg)
 -              mempool_free(p->mg, cache->migration_pool);
 +              free_migration(p->mg);
  }
  
  static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
@@@ -880,14 -855,24 +881,14 @@@ static void remap_to_origin_then_cache(
   * Migration covers moving data from the origin device to the cache, or
   * vice versa.
   *--------------------------------------------------------------*/
 -static void free_migration(struct dm_cache_migration *mg)
 -{
 -      mempool_free(mg, mg->cache->migration_pool);
 -}
 -
 -static void inc_nr_migrations(struct cache *cache)
 +static void inc_io_migrations(struct cache *cache)
  {
 -      atomic_inc(&cache->nr_migrations);
 +      atomic_inc(&cache->nr_io_migrations);
  }
  
 -static void dec_nr_migrations(struct cache *cache)
 +static void dec_io_migrations(struct cache *cache)
  {
 -      atomic_dec(&cache->nr_migrations);
 -
 -      /*
 -       * Wake the worker in case we're suspending the target.
 -       */
 -      wake_up(&cache->migration_wait);
 +      atomic_dec(&cache->nr_io_migrations);
  }
  
  static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
@@@ -910,10 -895,11 +911,10 @@@ static void cell_defer(struct cache *ca
        wake_worker(cache);
  }
  
 -static void cleanup_migration(struct dm_cache_migration *mg)
 +static void free_io_migration(struct dm_cache_migration *mg)
  {
 -      struct cache *cache = mg->cache;
 +      dec_io_migrations(mg->cache);
        free_migration(mg);
 -      dec_nr_migrations(cache);
  }
  
  static void migration_failure(struct dm_cache_migration *mg)
                cell_defer(cache, mg->new_ocell, true);
        }
  
 -      cleanup_migration(mg);
 +      free_io_migration(mg);
  }
  
  static void migration_success_pre_commit(struct dm_cache_migration *mg)
        if (mg->writeback) {
                clear_dirty(cache, mg->old_oblock, mg->cblock);
                cell_defer(cache, mg->old_ocell, false);
 -              cleanup_migration(mg);
 +              free_io_migration(mg);
                return;
  
        } else if (mg->demote) {
                                             mg->old_oblock);
                        if (mg->promote)
                                cell_defer(cache, mg->new_ocell, true);
 -                      cleanup_migration(mg);
 +                      free_io_migration(mg);
                        return;
                }
        } else {
                if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
                        DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
                        policy_remove_mapping(cache->policy, mg->new_oblock);
 -                      cleanup_migration(mg);
 +                      free_io_migration(mg);
                        return;
                }
        }
@@@ -999,7 -985,7 +1000,7 @@@ static void migration_success_post_comm
                } else {
                        if (mg->invalidate)
                                policy_remove_mapping(cache->policy, mg->old_oblock);
 -                      cleanup_migration(mg);
 +                      free_io_migration(mg);
                }
  
        } else {
                        bio_endio(mg->new_ocell->holder, 0);
                        cell_defer(cache, mg->new_ocell, false);
                }
 -              cleanup_migration(mg);
 +              free_io_migration(mg);
        }
  }
  
@@@ -1266,7 -1252,7 +1267,7 @@@ static void promote(struct cache *cache
        mg->new_ocell = cell;
        mg->start_jiffies = jiffies;
  
 -      inc_nr_migrations(cache);
 +      inc_io_migrations(cache);
        quiesce_migration(mg);
  }
  
@@@ -1290,7 -1276,7 +1291,7 @@@ static void writeback(struct cache *cac
        mg->new_ocell = NULL;
        mg->start_jiffies = jiffies;
  
 -      inc_nr_migrations(cache);
 +      inc_io_migrations(cache);
        quiesce_migration(mg);
  }
  
@@@ -1317,7 -1303,7 +1318,7 @@@ static void demote_then_promote(struct 
        mg->new_ocell = new_ocell;
        mg->start_jiffies = jiffies;
  
 -      inc_nr_migrations(cache);
 +      inc_io_migrations(cache);
        quiesce_migration(mg);
  }
  
@@@ -1345,7 -1331,7 +1346,7 @@@ static void invalidate(struct cache *ca
        mg->new_ocell = NULL;
        mg->start_jiffies = jiffies;
  
 -      inc_nr_migrations(cache);
 +      inc_io_migrations(cache);
        quiesce_migration(mg);
  }
  
@@@ -1427,7 -1413,7 +1428,7 @@@ static void process_discard_bio(struct 
  
  static bool spare_migration_bandwidth(struct cache *cache)
  {
 -      sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
 +      sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
                cache->sectors_per_block;
        return current_volume < cache->migration_threshold;
  }
@@@ -1562,8 -1548,8 +1563,8 @@@ static void process_bio(struct cache *c
  
  static int need_commit_due_to_time(struct cache *cache)
  {
-       return jiffies < cache->last_commit_jiffies ||
-              jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
+       return !time_in_range(jiffies, cache->last_commit_jiffies,
+                             cache->last_commit_jiffies + COMMIT_PERIOD);
  }
  
  static int commit_if_needed(struct cache *cache)
@@@ -1779,7 -1765,7 +1780,7 @@@ static void stop_quiescing(struct cach
  
  static void wait_for_migrations(struct cache *cache)
  {
 -      wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
 +      wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
  }
  
  static void stop_worker(struct cache *cache)
@@@ -1891,6 -1877,9 +1892,6 @@@ static void destroy(struct cache *cache
  {
        unsigned i;
  
 -      if (cache->next_migration)
 -              mempool_free(cache->next_migration, cache->migration_pool);
 -
        if (cache->migration_pool)
                mempool_destroy(cache->migration_pool);
  
@@@ -2436,8 -2425,7 +2437,8 @@@ static int cache_create(struct cache_ar
        INIT_LIST_HEAD(&cache->quiesced_migrations);
        INIT_LIST_HEAD(&cache->completed_migrations);
        INIT_LIST_HEAD(&cache->need_commit_migrations);
 -      atomic_set(&cache->nr_migrations, 0);
 +      atomic_set(&cache->nr_allocated_migrations, 0);
 +      atomic_set(&cache->nr_io_migrations, 0);
        init_waitqueue_head(&cache->migration_wait);
  
        init_waitqueue_head(&cache->quiescing_wait);
                goto bad;
        }
  
 -      cache->next_migration = NULL;
 -
        cache->need_tick_bio = true;
        cache->sized = false;
        cache->invalidate = false;
diff --combined drivers/md/dm-raid.c
index 777d9ba2acad646d7a0a20cea72a056ab1684239,41acc9dd7342f1ab3bc0a05a04fc2892ba372ba4..88e4c7f249864e6875796674d7177c8ecd6d6ad4
@@@ -746,7 -746,13 +746,7 @@@ static int raid_is_congested(struct dm_
  {
        struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
  
 -      if (rs->raid_type->level == 1)
 -              return md_raid1_congested(&rs->md, bits);
 -
 -      if (rs->raid_type->level == 10)
 -              return md_raid10_congested(&rs->md, bits);
 -
 -      return md_raid5_congested(&rs->md, bits);
 +      return mddev_congested(&rs->md, bits);
  }
  
  /*
@@@ -1237,7 -1243,7 +1237,7 @@@ static int raid_ctr(struct dm_target *t
        argv++;
  
        /* Skip over RAID params for now and find out # of devices */
-       if (num_raid_params + 1 > argc) {
+       if (num_raid_params >= argc) {
                ti->error = "Arguments do not agree with counts given";
                return -EINVAL;
        }
                return -EINVAL;
        }
  
+       argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
+       if (argc != (num_raid_devs * 2)) {
+               ti->error = "Supplied RAID devices does not match the count given";
+               return -EINVAL;
+       }
        rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
        if (IS_ERR(rs))
                return PTR_ERR(rs);
        if (ret)
                goto bad;
  
-       ret = -EINVAL;
-       argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
        argv += num_raid_params + 1;
  
-       if (argc != (num_raid_devs * 2)) {
-               ti->error = "Supplied RAID devices does not match the count given";
-               goto bad;
-       }
        ret = dev_parms(rs, argv);
        if (ret)
                goto bad;
diff --combined drivers/md/dm-thin.c
index 07705ee181e3d2837c47954626276f9dea52cac0,0f781451ea3f2b344dd830f085a22ba73643607a..654773cb1eeea23b39db0fcf0e6fb00d91d9476d
@@@ -11,6 -11,7 +11,7 @@@
  #include <linux/device-mapper.h>
  #include <linux/dm-io.h>
  #include <linux/dm-kcopyd.h>
+ #include <linux/jiffies.h>
  #include <linux/log2.h>
  #include <linux/list.h>
  #include <linux/rculist.h>
@@@ -1700,8 -1701,8 +1701,8 @@@ static void process_cell_fail(struct th
   */
  static int need_commit_due_to_time(struct pool *pool)
  {
-       return jiffies < pool->last_commit_jiffies ||
-              jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
+       return !time_in_range(jiffies, pool->last_commit_jiffies,
+                             pool->last_commit_jiffies + COMMIT_PERIOD);
  }
  
  #define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
@@@ -3385,12 -3386,6 +3386,12 @@@ static int pool_message(struct dm_targe
        struct pool_c *pt = ti->private;
        struct pool *pool = pt->pool;
  
 +      if (get_pool_mode(pool) >= PM_READ_ONLY) {
 +              DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
 +                    dm_device_name(pool->pool_md));
 +              return -EINVAL;
 +      }
 +
        if (!strcasecmp(argv[0], "create_thin"))
                r = process_create_thin_mesg(argc, argv, pool);
  
diff --combined drivers/md/dm.c
index 68c1b535c52ec5bfe53fe36d0a38b3e3695ca974,549b815999a1e082da40b2a59039c499e1766ee5..ec1444f49de14ac185ae39cfb214deee3ba66998
@@@ -20,6 -20,7 +20,7 @@@
  #include <linux/hdreg.h>
  #include <linux/delay.h>
  #include <linux/wait.h>
+ #include <linux/kthread.h>
  
  #include <trace/events/block.h>
  
@@@ -78,7 -79,8 +79,8 @@@ struct dm_io 
  struct dm_rq_target_io {
        struct mapped_device *md;
        struct dm_target *ti;
-       struct request *orig, clone;
+       struct request *orig, *clone;
+       struct kthread_work work;
        int error;
        union map_info info;
  };
@@@ -179,6 -181,7 +181,7 @@@ struct mapped_device 
         * io objects are allocated from here.
         */
        mempool_t *io_pool;
+       mempool_t *rq_pool;
  
        struct bio_set *bs;
  
        /* zero-length flush that will be cloned and submitted to targets */
        struct bio flush_bio;
  
 +      /* the number of internal suspends */
 +      unsigned internal_suspend_count;
 +
        struct dm_stats stats;
+       struct kthread_worker kworker;
+       struct task_struct *kworker_task;
  };
  
  /*
   */
  struct dm_md_mempools {
        mempool_t *io_pool;
+       mempool_t *rq_pool;
        struct bio_set *bs;
  };
  
@@@ -231,6 -235,7 +238,7 @@@ struct table_device 
  #define RESERVED_MAX_IOS              1024
  static struct kmem_cache *_io_cache;
  static struct kmem_cache *_rq_tio_cache;
+ static struct kmem_cache *_rq_cache;
  
  /*
   * Bio-based DM's mempools' reserved IOs set by the user.
@@@ -288,9 -293,14 +296,14 @@@ static int __init local_init(void
        if (!_rq_tio_cache)
                goto out_free_io_cache;
  
+       _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request),
+                                     __alignof__(struct request), 0, NULL);
+       if (!_rq_cache)
+               goto out_free_rq_tio_cache;
        r = dm_uevent_init();
        if (r)
-               goto out_free_rq_tio_cache;
+               goto out_free_rq_cache;
  
        deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
        if (!deferred_remove_workqueue) {
@@@ -312,6 -322,8 +325,8 @@@ out_free_workqueue
        destroy_workqueue(deferred_remove_workqueue);
  out_uevent_exit:
        dm_uevent_exit();
+ out_free_rq_cache:
+       kmem_cache_destroy(_rq_cache);
  out_free_rq_tio_cache:
        kmem_cache_destroy(_rq_tio_cache);
  out_free_io_cache:
@@@ -325,6 -337,7 +340,7 @@@ static void local_exit(void
        flush_scheduled_work();
        destroy_workqueue(deferred_remove_workqueue);
  
+       kmem_cache_destroy(_rq_cache);
        kmem_cache_destroy(_rq_tio_cache);
        kmem_cache_destroy(_io_cache);
        unregister_blkdev(_major, _name);
@@@ -577,6 -590,17 +593,17 @@@ static void free_rq_tio(struct dm_rq_ta
        mempool_free(tio, tio->md->io_pool);
  }
  
+ static struct request *alloc_clone_request(struct mapped_device *md,
+                                          gfp_t gfp_mask)
+ {
+       return mempool_alloc(md->rq_pool, gfp_mask);
+ }
+ static void free_clone_request(struct mapped_device *md, struct request *rq)
+ {
+       mempool_free(rq, md->rq_pool);
+ }
  static int md_in_flight(struct mapped_device *md)
  {
        return atomic_read(&md->pending[READ]) +
@@@ -992,7 -1016,7 +1019,7 @@@ static void end_clone_bio(struct bio *c
   * the md may be freed in dm_put() at the end of this function.
   * Or do dm_get() before calling this function and dm_put() later.
   */
- static void rq_completed(struct mapped_device *md, int rw, int run_queue)
+ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
  {
        atomic_dec(&md->pending[rw]);
  
@@@ -1020,12 -1044,17 +1047,17 @@@ static void free_rq_clone(struct reques
        struct dm_rq_target_io *tio = clone->end_io_data;
  
        blk_rq_unprep_clone(clone);
+       if (clone->q && clone->q->mq_ops)
+               tio->ti->type->release_clone_rq(clone);
+       else
+               free_clone_request(tio->md, clone);
        free_rq_tio(tio);
  }
  
  /*
   * Complete the clone and the original request.
-  * Must be called without queue lock.
+  * Must be called without clone's queue lock held,
+  * see end_clone_request() for more details.
   */
  static void dm_end_request(struct request *clone, int error)
  {
  
  static void dm_unprep_request(struct request *rq)
  {
-       struct request *clone = rq->special;
+       struct dm_rq_target_io *tio = rq->special;
+       struct request *clone = tio->clone;
  
        rq->special = NULL;
        rq->cmd_flags &= ~REQ_DONTPREP;
  
-       free_rq_clone(clone);
+       if (clone)
+               free_rq_clone(clone);
  }
  
  /*
   * Requeue the original request of a clone.
   */
- void dm_requeue_unmapped_request(struct request *clone)
+ static void dm_requeue_unmapped_original_request(struct mapped_device *md,
+                                                struct request *rq)
  {
-       int rw = rq_data_dir(clone);
-       struct dm_rq_target_io *tio = clone->end_io_data;
-       struct mapped_device *md = tio->md;
-       struct request *rq = tio->orig;
+       int rw = rq_data_dir(rq);
        struct request_queue *q = rq->q;
        unsigned long flags;
  
        blk_requeue_request(q, rq);
        spin_unlock_irqrestore(q->queue_lock, flags);
  
-       rq_completed(md, rw, 0);
+       rq_completed(md, rw, false);
+ }
+ static void dm_requeue_unmapped_request(struct request *clone)
+ {
+       struct dm_rq_target_io *tio = clone->end_io_data;
+       dm_requeue_unmapped_original_request(tio->md, tio->orig);
  }
- EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
  
  static void __stop_queue(struct request_queue *q)
  {
@@@ -1151,8 -1186,15 +1189,15 @@@ static void dm_done(struct request *clo
  static void dm_softirq_done(struct request *rq)
  {
        bool mapped = true;
-       struct request *clone = rq->completion_data;
-       struct dm_rq_target_io *tio = clone->end_io_data;
+       struct dm_rq_target_io *tio = rq->special;
+       struct request *clone = tio->clone;
+       if (!clone) {
+               blk_end_request_all(rq, tio->error);
+               rq_completed(tio->md, rq_data_dir(rq), false);
+               free_rq_tio(tio);
+               return;
+       }
  
        if (rq->cmd_flags & REQ_FAILED)
                mapped = false;
   * Complete the clone and the original request with the error status
   * through softirq context.
   */
- static void dm_complete_request(struct request *clone, int error)
+ static void dm_complete_request(struct request *rq, int error)
  {
-       struct dm_rq_target_io *tio = clone->end_io_data;
-       struct request *rq = tio->orig;
+       struct dm_rq_target_io *tio = rq->special;
  
        tio->error = error;
-       rq->completion_data = clone;
        blk_complete_request(rq);
  }
  
   * Complete the not-mapped clone and the original request with the error status
   * through softirq context.
   * Target's rq_end_io() function isn't called.
-  * This may be used when the target's map_rq() function fails.
+  * This may be used when the target's map_rq() or clone_and_map_rq() functions fail.
   */
- void dm_kill_unmapped_request(struct request *clone, int error)
+ static void dm_kill_unmapped_request(struct request *rq, int error)
  {
-       struct dm_rq_target_io *tio = clone->end_io_data;
-       struct request *rq = tio->orig;
        rq->cmd_flags |= REQ_FAILED;
-       dm_complete_request(clone, error);
+       dm_complete_request(rq, error);
  }
- EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
  
  /*
-  * Called with the queue lock held
+  * Called with the clone's queue lock held
   */
  static void end_clone_request(struct request *clone, int error)
  {
-       /*
-        * For just cleaning up the information of the queue in which
-        * the clone was dispatched.
-        * The clone is *NOT* freed actually here because it is alloced from
-        * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
-        */
-       __blk_put_request(clone->q, clone);
+       struct dm_rq_target_io *tio = clone->end_io_data;
+       if (!clone->q->mq_ops) {
+               /*
+                * For just cleaning up the information of the queue in which
+                * the clone was dispatched.
+                * The clone is *NOT* freed actually here because it is alloced
+                * from dm own mempool (REQ_ALLOCED isn't set).
+                */
+               __blk_put_request(clone->q, clone);
+       }
  
        /*
         * Actual request completion is done in a softirq context which doesn't
-        * hold the queue lock.  Otherwise, deadlock could occur because:
+        * hold the clone's queue lock.  Otherwise, deadlock could occur because:
         *     - another request may be submitted by the upper level driver
         *       of the stacking during the completion
         *     - the submission which requires queue lock may be done
-        *       against this queue
+        *       against this clone's queue
         */
-       dm_complete_request(clone, error);
+       dm_complete_request(tio->orig, error);
  }
  
  /*
@@@ -1689,19 -1729,19 +1732,19 @@@ static void dm_request(struct request_q
                _dm_request(q, bio);
  }
  
- void dm_dispatch_request(struct request *rq)
+ static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
  {
        int r;
  
-       if (blk_queue_io_stat(rq->q))
-               rq->cmd_flags |= REQ_IO_STAT;
+       if (blk_queue_io_stat(clone->q))
+               clone->cmd_flags |= REQ_IO_STAT;
  
-       rq->start_time = jiffies;
-       r = blk_insert_cloned_request(rq->q, rq);
+       clone->start_time = jiffies;
+       r = blk_insert_cloned_request(clone->q, clone);
        if (r)
+               /* must complete clone in terms of original request */
                dm_complete_request(rq, r);
  }
- EXPORT_SYMBOL_GPL(dm_dispatch_request);
  
  static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
                                 void *data)
  }
  
  static int setup_clone(struct request *clone, struct request *rq,
-                      struct dm_rq_target_io *tio)
+                      struct dm_rq_target_io *tio, gfp_t gfp_mask)
  {
        int r;
  
-       blk_rq_init(NULL, clone);
-       r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
+       r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
                              dm_rq_bio_constructor, tio);
        if (r)
                return r;
        clone->end_io = end_clone_request;
        clone->end_io_data = tio;
  
+       tio->clone = clone;
        return 0;
  }
  
  static struct request *clone_rq(struct request *rq, struct mapped_device *md,
-                               gfp_t gfp_mask)
+                               struct dm_rq_target_io *tio, gfp_t gfp_mask)
+ {
+       struct request *clone = alloc_clone_request(md, gfp_mask);
+       if (!clone)
+               return NULL;
+       blk_rq_init(NULL, clone);
+       if (setup_clone(clone, rq, tio, gfp_mask)) {
+               /* -ENOMEM */
+               free_clone_request(md, clone);
+               return NULL;
+       }
+       return clone;
+ }
+ static void map_tio_request(struct kthread_work *work);
+ static struct dm_rq_target_io *prep_tio(struct request *rq,
+                                       struct mapped_device *md, gfp_t gfp_mask)
  {
-       struct request *clone;
        struct dm_rq_target_io *tio;
+       int srcu_idx;
+       struct dm_table *table;
  
        tio = alloc_rq_tio(md, gfp_mask);
        if (!tio)
  
        tio->md = md;
        tio->ti = NULL;
+       tio->clone = NULL;
        tio->orig = rq;
        tio->error = 0;
        memset(&tio->info, 0, sizeof(tio->info));
-       clone = &tio->clone;
-       if (setup_clone(clone, rq, tio)) {
-               /* -ENOMEM */
-               free_rq_tio(tio);
-               return NULL;
+       init_kthread_work(&tio->work, map_tio_request);
+       table = dm_get_live_table(md, &srcu_idx);
+       if (!dm_table_mq_request_based(table)) {
+               if (!clone_rq(rq, md, tio, gfp_mask)) {
+                       dm_put_live_table(md, srcu_idx);
+                       free_rq_tio(tio);
+                       return NULL;
+               }
        }
+       dm_put_live_table(md, srcu_idx);
  
-       return clone;
+       return tio;
  }
  
  /*
  static int dm_prep_fn(struct request_queue *q, struct request *rq)
  {
        struct mapped_device *md = q->queuedata;
-       struct request *clone;
+       struct dm_rq_target_io *tio;
  
        if (unlikely(rq->special)) {
                DMWARN("Already has something in rq->special.");
                return BLKPREP_KILL;
        }
  
-       clone = clone_rq(rq, md, GFP_ATOMIC);
-       if (!clone)
+       tio = prep_tio(rq, md, GFP_ATOMIC);
+       if (!tio)
                return BLKPREP_DEFER;
  
-       rq->special = clone;
+       rq->special = tio;
        rq->cmd_flags |= REQ_DONTPREP;
  
        return BLKPREP_OK;
  
  /*
   * Returns:
-  * 0  : the request has been processed (not requeued)
-  * !0 : the request has been requeued
+  * 0                : the request has been processed
+  * DM_MAPIO_REQUEUE : the original request needs to be requeued
+  * < 0              : the request was completed due to failure
   */
- static int map_request(struct dm_target *ti, struct request *clone,
+ static int map_request(struct dm_target *ti, struct request *rq,
                       struct mapped_device *md)
  {
-       int r, requeued = 0;
-       struct dm_rq_target_io *tio = clone->end_io_data;
+       int r;
+       struct dm_rq_target_io *tio = rq->special;
+       struct request *clone = NULL;
+       if (tio->clone) {
+               clone = tio->clone;
+               r = ti->type->map_rq(ti, clone, &tio->info);
+       } else {
+               r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
+               if (r < 0) {
+                       /* The target wants to complete the I/O */
+                       dm_kill_unmapped_request(rq, r);
+                       return r;
+               }
+               if (IS_ERR(clone))
+                       return DM_MAPIO_REQUEUE;
+               if (setup_clone(clone, rq, tio, GFP_KERNEL)) {
+                       /* -ENOMEM */
+                       ti->type->release_clone_rq(clone);
+                       return DM_MAPIO_REQUEUE;
+               }
+       }
  
-       tio->ti = ti;
-       r = ti->type->map_rq(ti, clone, &tio->info);
        switch (r) {
        case DM_MAPIO_SUBMITTED:
                /* The target has taken the I/O to submit by itself later */
        case DM_MAPIO_REMAPPED:
                /* The target has remapped the I/O so dispatch it */
                trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
-                                    blk_rq_pos(tio->orig));
-               dm_dispatch_request(clone);
+                                    blk_rq_pos(rq));
+               dm_dispatch_clone_request(clone, rq);
                break;
        case DM_MAPIO_REQUEUE:
                /* The target wants to requeue the I/O */
                dm_requeue_unmapped_request(clone);
-               requeued = 1;
                break;
        default:
                if (r > 0) {
                }
  
                /* The target wants to complete the I/O */
-               dm_kill_unmapped_request(clone, r);
-               break;
+               dm_kill_unmapped_request(rq, r);
+               return r;
        }
  
-       return requeued;
+       return 0;
  }
  
- static struct request *dm_start_request(struct mapped_device *md, struct request *orig)
+ static void map_tio_request(struct kthread_work *work)
  {
-       struct request *clone;
+       struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
+       struct request *rq = tio->orig;
+       struct mapped_device *md = tio->md;
  
+       if (map_request(tio->ti, rq, md) == DM_MAPIO_REQUEUE)
+               dm_requeue_unmapped_original_request(md, rq);
+ }
+ static void dm_start_request(struct mapped_device *md, struct request *orig)
+ {
        blk_start_request(orig);
-       clone = orig->special;
-       atomic_inc(&md->pending[rq_data_dir(clone)]);
+       atomic_inc(&md->pending[rq_data_dir(orig)]);
  
        /*
         * Hold the md reference here for the in-flight I/O.
         * See the comment in rq_completed() too.
         */
        dm_get(md);
-       return clone;
  }
  
  /*
@@@ -1858,7 -1948,8 +1951,8 @@@ static void dm_request_fn(struct reques
        int srcu_idx;
        struct dm_table *map = dm_get_live_table(md, &srcu_idx);
        struct dm_target *ti;
-       struct request *rq, *clone;
+       struct request *rq;
+       struct dm_rq_target_io *tio;
        sector_t pos;
  
        /*
                ti = dm_table_find_target(map, pos);
                if (!dm_target_is_valid(ti)) {
                        /*
-                        * Must perform setup, that dm_done() requires,
+                        * Must perform setup, that rq_completed() requires,
                         * before calling dm_kill_unmapped_request
                         */
                        DMERR_LIMIT("request attempted access beyond the end of device");
-                       clone = dm_start_request(md, rq);
-                       dm_kill_unmapped_request(clone, -EIO);
+                       dm_start_request(md, rq);
+                       dm_kill_unmapped_request(rq, -EIO);
                        continue;
                }
  
                if (ti->type->busy && ti->type->busy(ti))
                        goto delay_and_out;
  
-               clone = dm_start_request(md, rq);
-               spin_unlock(q->queue_lock);
-               if (map_request(ti, clone, md))
-                       goto requeued;
+               dm_start_request(md, rq);
  
+               tio = rq->special;
+               /* Establish tio->ti before queuing work (map_tio_request) */
+               tio->ti = ti;
+               queue_kthread_work(&md->kworker, &tio->work);
                BUG_ON(!irqs_disabled());
-               spin_lock(q->queue_lock);
        }
  
        goto out;
  
- requeued:
-       BUG_ON(!irqs_disabled());
-       spin_lock(q->queue_lock);
  delay_and_out:
        blk_delay_queue(q, HZ / 10);
  out:
@@@ -2093,6 -2179,7 +2182,7 @@@ static struct mapped_device *alloc_dev(
        INIT_WORK(&md->work, dm_wq_work);
        init_waitqueue_head(&md->eventq);
        init_completion(&md->kobj_holder.completion);
+       md->kworker_task = NULL;
  
        md->disk->major = _major;
        md->disk->first_minor = minor;
@@@ -2153,8 -2240,13 +2243,13 @@@ static void free_dev(struct mapped_devi
        unlock_fs(md);
        bdput(md->bdev);
        destroy_workqueue(md->wq);
+       if (md->kworker_task)
+               kthread_stop(md->kworker_task);
        if (md->io_pool)
                mempool_destroy(md->io_pool);
+       if (md->rq_pool)
+               mempool_destroy(md->rq_pool);
        if (md->bs)
                bioset_free(md->bs);
        blk_integrity_unregister(md->disk);
@@@ -2188,23 -2280,24 +2283,24 @@@ static void __bind_mempools(struct mapp
                        bioset_free(md->bs);
                        md->bs = p->bs;
                        p->bs = NULL;
-               } else if (dm_table_get_type(t) == DM_TYPE_REQUEST_BASED) {
-                       /*
-                        * There's no need to reload with request-based dm
-                        * because the size of front_pad doesn't change.
-                        * Note for future: If you are to reload bioset,
-                        * prep-ed requests in the queue may refer
-                        * to bio from the old bioset, so you must walk
-                        * through the queue to unprep.
-                        */
                }
+               /*
+                * There's no need to reload with request-based dm
+                * because the size of front_pad doesn't change.
+                * Note for future: If you are to reload bioset,
+                * prep-ed requests in the queue may refer
+                * to bio from the old bioset, so you must walk
+                * through the queue to unprep.
+                */
                goto out;
        }
  
-       BUG_ON(!p || md->io_pool || md->bs);
+       BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
  
        md->io_pool = p->io_pool;
        p->io_pool = NULL;
+       md->rq_pool = p->rq_pool;
+       p->rq_pool = NULL;
        md->bs = p->bs;
        p->bs = NULL;
  
@@@ -2407,6 -2500,14 +2503,14 @@@ unsigned dm_get_md_type(struct mapped_d
        return md->type;
  }
  
+ static bool dm_md_type_request_based(struct mapped_device *md)
+ {
+       unsigned table_type = dm_get_md_type(md);
+       return (table_type == DM_TYPE_REQUEST_BASED ||
+               table_type == DM_TYPE_MQ_REQUEST_BASED);
+ }
  struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
  {
        return md->immutable_target_type;
@@@ -2444,6 -2545,11 +2548,11 @@@ static int dm_init_request_based_queue(
        blk_queue_prep_rq(md->queue, dm_prep_fn);
        blk_queue_lld_busy(md->queue, dm_lld_busy);
  
+       /* Also initialize the request-based DM worker thread */
+       init_kthread_worker(&md->kworker);
+       md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
+                                      "kdmwork-%s", dm_device_name(md));
        elv_register_queue(md->queue);
  
        return 1;
   */
  int dm_setup_md_queue(struct mapped_device *md)
  {
-       if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) &&
-           !dm_init_request_based_queue(md)) {
+       if (dm_md_type_request_based(md) && !dm_init_request_based_queue(md)) {
                DMWARN("Cannot initialize queue for request-based mapped device");
                return -EINVAL;
        }
@@@ -2534,6 -2639,9 +2642,9 @@@ static void __dm_destroy(struct mapped_
        set_bit(DMF_FREEING, &md->flags);
        spin_unlock(&_minor_lock);
  
+       if (dm_request_based(md))
+               flush_kthread_worker(&md->kworker);
        if (!dm_suspended_md(md)) {
                dm_table_presuspend_targets(map);
                dm_table_postsuspend_targets(map);
@@@ -2777,8 -2885,10 +2888,10 @@@ static int __dm_suspend(struct mapped_d
         * Stop md->queue before flushing md->wq in case request-based
         * dm defers requests to md->wq from md->queue.
         */
-       if (dm_request_based(md))
+       if (dm_request_based(md)) {
                stop_queue(md->queue);
+               flush_kthread_worker(&md->kworker);
+       }
  
        flush_workqueue(md->wq);
  
@@@ -2932,7 -3042,7 +3045,7 @@@ static void __dm_internal_suspend(struc
  {
        struct dm_table *map = NULL;
  
 -      if (dm_suspended_internally_md(md))
 +      if (md->internal_suspend_count++)
                return; /* nested internal suspend */
  
        if (dm_suspended_md(md)) {
  
  static void __dm_internal_resume(struct mapped_device *md)
  {
 -      if (!dm_suspended_internally_md(md))
 +      BUG_ON(!md->internal_suspend_count);
 +
 +      if (--md->internal_suspend_count)
                return; /* resume from nested internal suspend */
  
        if (dm_suspended_md(md))
@@@ -3124,24 -3232,35 +3237,35 @@@ struct dm_md_mempools *dm_alloc_md_memp
  {
        struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
        struct kmem_cache *cachep;
-       unsigned int pool_size;
+       unsigned int pool_size = 0;
        unsigned int front_pad;
  
        if (!pools)
                return NULL;
  
-       if (type == DM_TYPE_BIO_BASED) {
+       switch (type) {
+       case DM_TYPE_BIO_BASED:
                cachep = _io_cache;
                pool_size = dm_get_reserved_bio_based_ios();
                front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
-       } else if (type == DM_TYPE_REQUEST_BASED) {
-               cachep = _rq_tio_cache;
+               break;
+       case DM_TYPE_REQUEST_BASED:
                pool_size = dm_get_reserved_rq_based_ios();
+               pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
+               if (!pools->rq_pool)
+                       goto out;
+               /* fall through to setup remaining rq-based pools */
+       case DM_TYPE_MQ_REQUEST_BASED:
+               cachep = _rq_tio_cache;
+               if (!pool_size)
+                       pool_size = dm_get_reserved_rq_based_ios();
                front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
                /* per_bio_data_size is not used. See __bind_mempools(). */
                WARN_ON(per_bio_data_size != 0);
-       } else
+               break;
+       default:
                goto out;
+       }
  
        pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
        if (!pools->io_pool)
@@@ -3170,6 -3289,9 +3294,9 @@@ void dm_free_md_mempools(struct dm_md_m
        if (pools->io_pool)
                mempool_destroy(pools->io_pool);
  
+       if (pools->rq_pool)
+               mempool_destroy(pools->rq_pool);
        if (pools->bs)
                bioset_free(pools->bs);