/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/kthread.h>
#include <asm/div64.h>
#include "compat.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"

static int init_first_rw_device(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);

static DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

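/*
 * uuid_mutex protects the list above.  fs_uuids holds one btrfs_fs_devices
 * per filesystem fsid seen by the scanning code; each of those strings its
 * btrfs_device entries on ->devices, with the writeable, allocatable subset
 * mirrored on ->alloc_list.
 */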
static void lock_chunks(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->chunk_mutex);
}

static void unlock_chunks(struct btrfs_root *root)
{
	mutex_unlock(&root->fs_info->chunk_mutex);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;
	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		kfree(device->name);
		kfree(device);
	}
	kfree(fs_devices);
}

int btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, list);
		list_del(&fs_devices->list);
		free_fs_devices(fs_devices);
	}
	return 0;
}

static noinline struct btrfs_device *__find_device(struct list_head *head,
						   u64 devid, u8 *uuid)
{
	struct btrfs_device *dev;

	list_for_each_entry(dev, head, dev_list) {
		if (dev->devid == devid &&
		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
			return dev;
		}
	}
	return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;

	list_for_each_entry(fs_devices, &fs_uuids, list) {
		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
			return fs_devices;
	}
	return NULL;
}

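/*
 * Splice an already-removed batch of bios [head, tail] back onto the front
 * of a pending list: the old head is chained behind the batch's tail, and
 * the list's tail pointer only needs fixing up if the list went empty while
 * the batch was out being processed.
 */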
static void requeue_list(struct btrfs_pending_bios *pending_bios,
			struct bio *head, struct bio *tail)
{
	struct bio *old_head;

	old_head = pending_bios->head;
	pending_bios->head = head;
	if (pending_bios->tail)
		tail->bi_next = old_head;
	else
		pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline int run_scheduled_bios(struct btrfs_device *device)
{
	struct bio *pending;
	struct backing_dev_info *bdi;
	struct btrfs_fs_info *fs_info;
	struct btrfs_pending_bios *pending_bios;
	struct bio *tail;
	struct bio *cur;
	int again = 0;
	unsigned long num_run;
	unsigned long batch_run = 0;
	unsigned long limit;
	unsigned long last_waited = 0;
	int force_reg = 0;
	int sync_pending = 0;
	struct blk_plug plug;

	/*
	 * this function runs all the bios we've collected for
	 * a particular device.  We don't want to wander off to
	 * another device without first sending all of these down.
	 * So, setup a plug here and finish it off before we return
	 */
	blk_start_plug(&plug);

	bdi = blk_get_backing_dev_info(device->bdev);
	fs_info = device->dev_root->fs_info;
	limit = btrfs_async_submit_limit(fs_info);
	limit = limit * 2 / 3;
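	/*
	 * Throttle note: async submitters sleep once nr_async_bios crosses
	 * btrfs_async_submit_limit(); draining below two thirds of that
	 * limit is the point where we wake them back up (see the wake_up
	 * in the loop below).
	 */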

loop:
	spin_lock(&device->io_lock);

loop_lock:
	num_run = 0;

	/* take all the bios off the list at once and process them
	 * later on (without the lock held).  But, remember the
	 * tail and other pointers so the bios can be properly reinserted
	 * into the list if we hit congestion
	 */
	if (!force_reg && device->pending_sync_bios.head) {
		pending_bios = &device->pending_sync_bios;
		force_reg = 1;
	} else {
		pending_bios = &device->pending_bios;
		force_reg = 0;
	}

	pending = pending_bios->head;
	tail = pending_bios->tail;
	WARN_ON(pending && !tail);

	/*
	 * if pending was null this time around, no bios need processing
	 * at all and we can stop.  Otherwise it'll loop back up again
	 * and do an additional check so no bios are missed.
	 *
	 * device->running_pending is used to synchronize with the
	 * schedule_bio code.
	 */
	if (device->pending_sync_bios.head == NULL &&
	    device->pending_bios.head == NULL) {
		again = 0;
		device->running_pending = 0;
	} else {
		again = 1;
		device->running_pending = 1;
	}

	pending_bios->head = NULL;
	pending_bios->tail = NULL;

	spin_unlock(&device->io_lock);

	while (pending) {

		rmb();
		/* we want to work on both lists, but do more bios on the
		 * sync list than the regular list
		 */
		if ((num_run > 32 &&
		    pending_bios != &device->pending_sync_bios &&
		    device->pending_sync_bios.head) ||
		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
		    device->pending_bios.head)) {
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			goto loop_lock;
		}

		cur = pending;
		pending = pending->bi_next;
		cur->bi_next = NULL;
		atomic_dec(&fs_info->nr_async_bios);

		if (atomic_read(&fs_info->nr_async_bios) < limit &&
		    waitqueue_active(&fs_info->async_submit_wait))
			wake_up(&fs_info->async_submit_wait);

		BUG_ON(atomic_read(&cur->bi_cnt) == 0);

		/*
		 * if we're doing the sync list, record that our
		 * plug has some sync requests on it
		 *
		 * If we're doing the regular list and there are
		 * sync requests sitting around, unplug before
		 * we add more
		 */
		if (pending_bios == &device->pending_sync_bios) {
			sync_pending = 1;
		} else if (sync_pending) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}

		submit_bio(cur->bi_rw, cur);
		num_run++;
		batch_run++;
		if (need_resched())
			cond_resched();

		/*
		 * we made progress, there is more work to do and the bdi
		 * is now congested.  Back off and let other work structs
		 * run instead
		 */
		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
		    fs_info->fs_devices->open_devices > 1) {
			struct io_context *ioc;

			ioc = current->io_context;

			/*
			 * the main goal here is that we don't want to
			 * block if we're going to be able to submit
			 * more requests without blocking.
			 *
			 * This code does two great things, it pokes into
			 * the elevator code from a filesystem _and_
			 * it makes assumptions about how batching works.
			 */
			if (ioc && ioc->nr_batch_requests > 0 &&
			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
			    (last_waited == 0 ||
			     ioc->last_waited == last_waited)) {
				/*
				 * we want to go through our batch of
				 * requests and stop.  So, we copy out
				 * the ioc->last_waited time and test
				 * against it before looping
				 */
				last_waited = ioc->last_waited;
				if (need_resched())
					cond_resched();
				continue;
			}
			spin_lock(&device->io_lock);
			requeue_list(pending_bios, pending, tail);
			device->running_pending = 1;

			spin_unlock(&device->io_lock);
			btrfs_requeue_work(&device->work);
			goto done;
		}
		/* unplug every 64 requests just for good measure */
		if (batch_run % 64 == 0) {
			blk_finish_plug(&plug);
			blk_start_plug(&plug);
			sync_pending = 0;
		}
	}

	cond_resched();
	if (again)
		goto loop;

	spin_lock(&device->io_lock);
	if (device->pending_bios.head || device->pending_sync_bios.head)
		goto loop_lock;
	spin_unlock(&device->io_lock);

done:
	blk_finish_plug(&plug);
	return 0;
}

static void pending_bios_fn(struct btrfs_work *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, work);
	run_scheduled_bios(device);
}

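/*
 * Called for every device found during a scan.  Either creates a new
 * btrfs_fs_devices for a previously unseen fsid, adds a new btrfs_device
 * to an existing one, or just refreshes the path and transid of a device
 * we already track.  Returns -EBUSY if a new device shows up for a
 * filesystem that is currently open.
 */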
static noinline int device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices;
	u64 found_transid = btrfs_super_generation(disk_super);
	char *name;

	fs_devices = find_fsid(disk_super->fsid);
	if (!fs_devices) {
		fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
		if (!fs_devices)
			return -ENOMEM;
		INIT_LIST_HEAD(&fs_devices->devices);
		INIT_LIST_HEAD(&fs_devices->alloc_list);
		list_add(&fs_devices->list, &fs_uuids);
		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
		fs_devices->latest_devid = devid;
		fs_devices->latest_trans = found_transid;
		mutex_init(&fs_devices->device_list_mutex);
		device = NULL;
	} else {
		device = __find_device(&fs_devices->devices, devid,
				       disk_super->dev_item.uuid);
	}
	if (!device) {
		if (fs_devices->opened)
			return -EBUSY;

		device = kzalloc(sizeof(*device), GFP_NOFS);
		if (!device) {
			/* we can safely leave the fs_devices entry around */
			return -ENOMEM;
		}
		device->devid = devid;
		device->work.func = pending_bios_fn;
		memcpy(device->uuid, disk_super->dev_item.uuid,
		       BTRFS_UUID_SIZE);
		spin_lock_init(&device->io_lock);
		device->name = kstrdup(path, GFP_NOFS);
		if (!device->name) {
			kfree(device);
			return -ENOMEM;
		}
		INIT_LIST_HEAD(&device->dev_alloc_list);

		/* init readahead state */
		spin_lock_init(&device->reada_lock);
		device->reada_curr_zone = NULL;
		atomic_set(&device->reada_in_flight, 0);
		device->reada_next = 0;
		INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
		INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);

		mutex_lock(&fs_devices->device_list_mutex);
		list_add_rcu(&device->dev_list, &fs_devices->devices);
		mutex_unlock(&fs_devices->device_list_mutex);

		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	} else if (!device->name || strcmp(device->name, path)) {
		name = kstrdup(path, GFP_NOFS);
		if (!name)
			return -ENOMEM;
		kfree(device->name);
		device->name = name;
		if (device->missing) {
			fs_devices->missing_devices--;
			device->missing = 0;
		}
	}

	if (found_transid > fs_devices->latest_trans) {
		fs_devices->latest_devid = devid;
		fs_devices->latest_trans = found_transid;
	}
	*fs_devices_ret = fs_devices;
	return 0;
}

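/*
 * Make an in-memory copy of a btrfs_fs_devices and all of its devices.
 * Only the identifying fields are copied; the clones start out closed
 * (no bdev, not on any alloc list).  Used from btrfs_prepare_sprout()
 * below so the original scanned state stays visible under the old fsid.
 */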
static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;

	fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
	if (!fs_devices)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&fs_devices->devices);
	INIT_LIST_HEAD(&fs_devices->alloc_list);
	INIT_LIST_HEAD(&fs_devices->list);
	mutex_init(&fs_devices->device_list_mutex);
	fs_devices->latest_devid = orig->latest_devid;
	fs_devices->latest_trans = orig->latest_trans;
	memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));

	/* We hold the volume lock, so it is safe to get the devices. */
	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		device = kzalloc(sizeof(*device), GFP_NOFS);
		if (!device)
			goto error;

		device->name = kstrdup(orig_dev->name, GFP_NOFS);
		if (!device->name) {
			kfree(device);
			goto error;
		}

		device->devid = orig_dev->devid;
		device->work.func = pending_bios_fn;
		memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
		spin_lock_init(&device->io_lock);
		INIT_LIST_HEAD(&device->dev_list);
		INIT_LIST_HEAD(&device->dev_alloc_list);

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	return fs_devices;
error:
	free_fs_devices(fs_devices);
	return ERR_PTR(-ENOMEM);
}

int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *next;

	mutex_lock(&uuid_mutex);
again:
	/* This is the initialized path, so it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (device->in_fs_metadata)
			continue;

		if (device->bdev) {
			blkdev_put(device->bdev, device->mode);
			device->bdev = NULL;
			fs_devices->open_devices--;
		}
		if (device->writeable) {
			list_del_init(&device->dev_alloc_list);
			device->writeable = 0;
			fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		kfree(device->name);
		kfree(device);
	}

	if (fs_devices->seed) {
		fs_devices = fs_devices->seed;
		goto again;
	}

	mutex_unlock(&uuid_mutex);
	return 0;
}

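/*
 * Device teardown is split in two because blkdev_put() can sleep and so
 * must not run from RCU callback (softirq) context: free_device() is the
 * call_rcu() callback, and all it does is punt the real work to a
 * workqueue, where __free_device() drops the bdev and frees the memory.
 */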
static void __free_device(struct work_struct *work)
{
	struct btrfs_device *device;

	device = container_of(work, struct btrfs_device, rcu_work);

	if (device->bdev)
		blkdev_put(device->bdev, device->mode);

	kfree(device->name);
	kfree(device);
}

static void free_device(struct rcu_head *head)
{
	struct btrfs_device *device;

	device = container_of(head, struct btrfs_device, rcu);

	INIT_WORK(&device->rcu_work, __free_device);
	schedule_work(&device->rcu_work);
}

static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	if (--fs_devices->opened > 0)
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		struct btrfs_device *new_device;

		if (device->bdev)
			fs_devices->open_devices--;

		if (device->writeable) {
			list_del_init(&device->dev_alloc_list);
			fs_devices->rw_devices--;
		}

		if (device->can_discard)
			fs_devices->num_can_discard--;

		new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
		BUG_ON(!new_device);
		memcpy(new_device, device, sizeof(*new_device));
		new_device->name = kstrdup(device->name, GFP_NOFS);
		BUG_ON(device->name && !new_device->name);
		new_device->bdev = NULL;
		new_device->writeable = 0;
		new_device->in_fs_metadata = 0;
		new_device->can_discard = 0;
		list_replace_rcu(&device->dev_list, &new_device->dev_list);

		call_rcu(&device->rcu, free_device);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = 0;

	return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_fs_devices *seed_devices = NULL;
	int ret;

	mutex_lock(&uuid_mutex);
	ret = __btrfs_close_devices(fs_devices);
	if (!fs_devices->opened) {
		seed_devices = fs_devices->seed;
		fs_devices->seed = NULL;
	}
	mutex_unlock(&uuid_mutex);

	while (seed_devices) {
		fs_devices = seed_devices;
		seed_devices = fs_devices->seed;
		__btrfs_close_devices(fs_devices);
		free_fs_devices(fs_devices);
	}
	return ret;
}

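/*
 * Open every scanned device of an fs_devices set.  Each superblock is
 * re-read and cross-checked against the devid/uuid we recorded at scan
 * time; the device holding the highest generation becomes latest_bdev.
 * Devices that fail to open are simply skipped, and -EINVAL is returned
 * only if nothing could be opened at all.
 */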
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
				fmode_t flags, void *holder)
{
	struct request_queue *q;
	struct block_device *bdev;
	struct list_head *head = &fs_devices->devices;
	struct btrfs_device *device;
	struct block_device *latest_bdev = NULL;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	u64 latest_devid = 0;
	u64 latest_transid = 0;
	u64 devid;
	int seeding = 1;
	int ret = 0;

	flags |= FMODE_EXCL;

	list_for_each_entry(device, head, dev_list) {
		if (device->bdev)
			continue;
		if (!device->name)
			continue;

		bdev = blkdev_get_by_path(device->name, flags, holder);
		if (IS_ERR(bdev)) {
			printk(KERN_INFO "open %s failed\n", device->name);
			goto error;
		}
		set_blocksize(bdev, 4096);

		bh = btrfs_read_dev_super(bdev);
		if (!bh)
			goto error_close;

		disk_super = (struct btrfs_super_block *)bh->b_data;
		devid = btrfs_stack_device_id(&disk_super->dev_item);
		if (devid != device->devid)
			goto error_brelse;

		if (memcmp(device->uuid, disk_super->dev_item.uuid,
			   BTRFS_UUID_SIZE))
			goto error_brelse;

		device->generation = btrfs_super_generation(disk_super);
		if (!latest_transid || device->generation > latest_transid) {
			latest_devid = devid;
			latest_transid = device->generation;
			latest_bdev = bdev;
		}

		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
			device->writeable = 0;
		} else {
			device->writeable = !bdev_read_only(bdev);
			seeding = 0;
		}

		q = bdev_get_queue(bdev);
		if (blk_queue_discard(q)) {
			device->can_discard = 1;
			fs_devices->num_can_discard++;
		}

		device->bdev = bdev;
		device->in_fs_metadata = 0;
		device->mode = flags;

		if (!blk_queue_nonrot(bdev_get_queue(bdev)))
			fs_devices->rotating = 1;

		fs_devices->open_devices++;
		if (device->writeable) {
			fs_devices->rw_devices++;
			list_add(&device->dev_alloc_list,
				 &fs_devices->alloc_list);
		}
		brelse(bh);
		continue;

error_brelse:
		brelse(bh);
error_close:
		blkdev_put(bdev, flags);
error:
		continue;
	}
	if (fs_devices->open_devices == 0) {
		ret = -EINVAL;
		goto out;
	}
	fs_devices->seeding = seeding;
	fs_devices->opened = 1;
	fs_devices->latest_bdev = latest_bdev;
	fs_devices->latest_devid = latest_devid;
	fs_devices->latest_trans = latest_transid;
	fs_devices->total_rw_bytes = 0;
out:
	return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       fmode_t flags, void *holder)
{
	int ret;

	mutex_lock(&uuid_mutex);
	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		ret = __btrfs_open_devices(fs_devices, flags, holder);
	}
	mutex_unlock(&uuid_mutex);
	return ret;
}

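/*
 * Read the superblock off a single device and register it with the global
 * device lists via device_list_add().  This is what the device-scan path
 * ends up calling for each candidate device; roughly (sketch only, the
 * actual caller lives in super.c):
 *
 *	struct btrfs_fs_devices *fs_devices;
 *	err = btrfs_scan_one_device(path, FMODE_READ,
 *				    bdev_holder, &fs_devices);
 */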
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
			  struct btrfs_fs_devices **fs_devices_ret)
{
	struct btrfs_super_block *disk_super;
	struct block_device *bdev;
	struct buffer_head *bh;
	int ret;
	u64 devid;
	u64 transid;

	mutex_lock(&uuid_mutex);

	flags |= FMODE_EXCL;
	bdev = blkdev_get_by_path(path, flags, holder);

	if (IS_ERR(bdev)) {
		ret = PTR_ERR(bdev);
		goto error;
	}

	ret = set_blocksize(bdev, 4096);
	if (ret)
		goto error_close;
	bh = btrfs_read_dev_super(bdev);
	if (!bh) {
		ret = -EINVAL;
		goto error_close;
	}
	disk_super = (struct btrfs_super_block *)bh->b_data;
	devid = btrfs_stack_device_id(&disk_super->dev_item);
	transid = btrfs_super_generation(disk_super);
	if (disk_super->label[0])
		printk(KERN_INFO "device label %s ", disk_super->label);
	else
		printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
	printk(KERN_CONT "devid %llu transid %llu %s\n",
	       (unsigned long long)devid, (unsigned long long)transid, path);
	ret = device_list_add(path, disk_super, devid, fs_devices_ret);

	brelse(bh);
error_close:
	blkdev_put(bdev, flags);
error:
	mutex_unlock(&uuid_mutex);
	return ret;
}

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
				   u64 end, u64 *length)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 extent_end;
	int ret;
	int slot;
	struct extent_buffer *l;

	*length = 0;

	if (start >= device->total_bytes)
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = 2;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
			goto next;

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
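		/*
		 * Five ways the dev extent [key.offset, extent_end) can
		 * relate to the query range [start, end]: it covers the
		 * whole range, overlaps just its head, lies entirely
		 * inside, overlaps just its tail, or starts past 'end',
		 * in which case we are done.
		 */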
		if (key.offset <= start && extent_end > end) {
			*length = end - start + 1;
			break;
		} else if (key.offset <= start && extent_end > start)
			*length += extent_end - start;
		else if (key.offset > start && extent_end <= end)
			*length += extent_end - key.offset;
		else if (key.offset > start && key.offset <= end) {
			*length += end - key.offset + 1;
			break;
		} else if (key.offset > end)
			break;

next:
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * find_free_dev_extent - find free space in the specified device
 * @trans:	transaction handle
 * @device:	the device in which we search for the free space
 * @num_bytes:	the size of the free space that we need
 * @start:	store the start of the free space
 * @len:	the size of the free space that we find, or the size of the
 *		max free space if we don't find suitable free space
 *
 * This uses a pretty simple search; the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents.
 *
 * @start is used to store the start of the free space if we find any.
 * But if we don't find suitable free space, it will be used to store
 * the start position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size
 * of the max free space.
 */
int find_free_dev_extent(struct btrfs_trans_handle *trans,
			 struct btrfs_device *device, u64 num_bytes,
			 u64 *start, u64 *len)
{
	struct btrfs_key key;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size;
	u64 extent_end;
	u64 search_start;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	/* FIXME use last free of some kind */

	/* we don't want to overwrite the superblock on the drive,
	 * so we make sure to start at an offset of at least 1MB
	 */
	search_start = max(root->fs_info->alloc_start, 1024ull * 1024);

	max_hole_start = search_start;
	max_hole_size = 0;
	hole_size = 0;

	if (search_start >= search_end) {
		ret = -ENOSPC;
		goto error;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}
	path->reada = 2;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid, key.type);
		if (ret < 0)
			goto out;
	}

	while (1) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the
			 * start of this free space and the length of this
			 * free space is stored in max_hole_size.  Thus, we
			 * return max_hole_start and max_hole_size and go
			 * back to the caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start)
		hole_size = search_end - search_start;

	if (hole_size > max_hole_size) {
		max_hole_start = search_start;
		max_hole_size = hole_size;
	}

	/* See above. */
	if (hole_size < num_bytes)
		ret = -ENOSPC;
	else
		ret = 0;

out:
	btrfs_free_path(path);
error:
	*start = max_hole_start;
	if (len)
		*len = max_hole_size;
	return ret;
}

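/*
 * Remove the dev extent item that covers byte offset 'start' on 'device'
 * and hand the freed bytes back to the free_chunk accounting.  The search
 * may land past the item, in which case we step back to the previous
 * DEV_EXTENT and verify it really covers 'start'.
 */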
static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
			  struct btrfs_device *device,
			  u64 start)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	}
	BUG_ON(ret);

	if (device->bytes_used > 0) {
		u64 len = btrfs_dev_extent_length(leaf, extent);
		device->bytes_used -= len;
		spin_lock(&root->fs_info->free_chunk_lock);
		root->fs_info->free_chunk_space += len;
		spin_unlock(&root->fs_info->free_chunk_lock);
	}
	ret = btrfs_del_item(trans, root, path);

out:
	btrfs_free_path(path);
	return ret;
}

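/*
 * Insert a dev extent item mapping [start, start + num_bytes) on 'device'
 * back to the chunk identified by (chunk_tree, chunk_objectid,
 * chunk_offset).  This is the per-device half of chunk allocation; the
 * matching chunk item carries the stripe pointers into these extents.
 */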
int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
			   struct btrfs_device *device,
			   u64 chunk_tree, u64 chunk_objectid,
			   u64 chunk_offset, u64 start, u64 num_bytes)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_root *root = device->dev_root;
	struct btrfs_dev_extent *extent;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	WARN_ON(!device->in_fs_metadata);
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*extent));
	BUG_ON(ret);

	leaf = path->nodes[0];
	extent = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_dev_extent);
	btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
	btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
		    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
		    BTRFS_UUID_SIZE);

	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);
	return ret;
}

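/*
 * Compute the logical offset where the next chunk for 'objectid' can go:
 * search back from offset (u64)-1 to the last CHUNK_ITEM and return its
 * end offset, or 0 if no chunk exists yet.
 */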
static noinline int find_next_chunk(struct btrfs_root *root,
				    u64 objectid, u64 *offset)
{
	struct btrfs_path *path;
	int ret;
	struct btrfs_key key;
	struct btrfs_chunk *chunk;
	struct btrfs_key found_key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	BUG_ON(ret == 0);

	ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
	if (ret) {
		*offset = 0;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		if (found_key.objectid != objectid)
			*offset = 0;
		else {
			chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
					       struct btrfs_chunk);
			*offset = found_key.offset +
				btrfs_chunk_length(path->nodes[0], chunk);
		}
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

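/*
 * Pick the devid for a newly added device: one past the highest DEV_ITEM
 * key.offset in the chunk tree, or 1 for the first device.
 */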
static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	BUG_ON(ret == 0);

	ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*objectid = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*objectid = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * the device information is stored in the chunk root
 * the btrfs_device struct should be fully filled in
 */
int btrfs_add_device(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*dev_item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_generation(leaf, dev_item, 0);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
	btrfs_set_device_start_offset(leaf, dev_item, 0);

	ptr = (unsigned long)btrfs_device_uuid(dev_item);
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
	ptr = (unsigned long)btrfs_device_fsid(dev_item);
	write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

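/*
 * Delete the DEV_ITEM for 'device' from the chunk tree.  Runs in its own
 * short transaction, with chunk_mutex held across the search and delete.
 */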
static int btrfs_rm_dev_item(struct btrfs_root *root,
			     struct btrfs_device *device)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans;

	root = root->fs_info->chunk_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;
	lock_chunks(root);

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
	if (ret)
		goto out;
out:
	btrfs_free_path(path);
	unlock_chunks(root);
	btrfs_commit_transaction(trans, root);
	return ret;
}

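/*
 * Remove a device from a mounted filesystem.  The sequence is: check the
 * redundancy constraints still hold, relocate everything off the device
 * with btrfs_shrink_device(device, 0), delete its DEV_ITEM, unhook it
 * from the in-memory lists under RCU, and finally wipe the superblock
 * magic so the device is no longer detected as part of the fs.  The
 * special path "missing" removes the first device that has metadata but
 * no backing bdev.
 */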
int btrfs_rm_device(struct btrfs_root *root, char *device_path)
{
	struct btrfs_device *device;
	struct btrfs_device *next_device;
	struct block_device *bdev;
	struct buffer_head *bh = NULL;
	struct btrfs_super_block *disk_super;
	struct btrfs_fs_devices *cur_devices;
	u64 all_avail;
	u64 devid;
	u64 num_devices;
	u8 *dev_uuid;
	int ret = 0;
	bool clear_super = false;

	mutex_lock(&uuid_mutex);

	all_avail = root->fs_info->avail_data_alloc_bits |
		root->fs_info->avail_system_alloc_bits |
		root->fs_info->avail_metadata_alloc_bits;

	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
	    root->fs_info->fs_devices->num_devices <= 4) {
		printk(KERN_ERR "btrfs: unable to go below four devices "
		       "on raid10\n");
		ret = -EINVAL;
		goto out;
	}

	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
	    root->fs_info->fs_devices->num_devices <= 2) {
		printk(KERN_ERR "btrfs: unable to go below two "
		       "devices on raid1\n");
		ret = -EINVAL;
		goto out;
	}

	if (strcmp(device_path, "missing") == 0) {
		struct list_head *devices;
		struct btrfs_device *tmp;

		device = NULL;
		devices = &root->fs_info->fs_devices->devices;
		/*
		 * It is safe to read the devices since the volume_mutex
		 * is held.
		 */
		list_for_each_entry(tmp, devices, dev_list) {
			if (tmp->in_fs_metadata && !tmp->bdev) {
				device = tmp;
				break;
			}
		}
		bdev = NULL;
		bh = NULL;
		disk_super = NULL;
		if (!device) {
			printk(KERN_ERR "btrfs: no missing devices found to "
			       "remove\n");
			goto out;
		}
	} else {
		bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
					  root->fs_info->bdev_holder);
		if (IS_ERR(bdev)) {
			ret = PTR_ERR(bdev);
			goto out;
		}

		set_blocksize(bdev, 4096);
		bh = btrfs_read_dev_super(bdev);
		if (!bh) {
			ret = -EINVAL;
			goto error_close;
		}
		disk_super = (struct btrfs_super_block *)bh->b_data;
		devid = btrfs_stack_device_id(&disk_super->dev_item);
		dev_uuid = disk_super->dev_item.uuid;
		device = btrfs_find_device(root, devid, dev_uuid,
					   disk_super->fsid);
		if (!device) {
			ret = -ENOENT;
			goto error_brelse;
		}
	}

	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
		printk(KERN_ERR "btrfs: unable to remove the only writeable "
		       "device\n");
		ret = -EINVAL;
		goto error_brelse;
	}

	if (device->writeable) {
		lock_chunks(root);
		list_del_init(&device->dev_alloc_list);
		unlock_chunks(root);
		root->fs_info->fs_devices->rw_devices--;
		clear_super = true;
	}

	ret = btrfs_shrink_device(device, 0);
	if (ret)
		goto error_undo;

	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
	if (ret)
		goto error_undo;

	spin_lock(&root->fs_info->free_chunk_lock);
	root->fs_info->free_chunk_space = device->total_bytes -
		device->bytes_used;
	spin_unlock(&root->fs_info->free_chunk_lock);

	device->in_fs_metadata = 0;
	btrfs_scrub_cancel_dev(root, device);

	/*
	 * the device list mutex makes sure that we don't change
	 * the device list while someone else is writing out all
	 * the device supers.
	 */

	cur_devices = device->fs_devices;
	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	list_del_rcu(&device->dev_list);

	device->fs_devices->num_devices--;

	if (device->missing)
		root->fs_info->fs_devices->missing_devices--;

	next_device = list_entry(root->fs_info->fs_devices->devices.next,
				 struct btrfs_device, dev_list);
	if (device->bdev == root->fs_info->sb->s_bdev)
		root->fs_info->sb->s_bdev = next_device->bdev;
	if (device->bdev == root->fs_info->fs_devices->latest_bdev)
		root->fs_info->fs_devices->latest_bdev = next_device->bdev;

	if (device->bdev)
		device->fs_devices->open_devices--;

	call_rcu(&device->rcu, free_device);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);

	if (cur_devices->open_devices == 0) {
		struct btrfs_fs_devices *fs_devices;
		fs_devices = root->fs_info->fs_devices;
		while (fs_devices) {
			if (fs_devices->seed == cur_devices)
				break;
			fs_devices = fs_devices->seed;
		}
		fs_devices->seed = cur_devices->seed;
		cur_devices->seed = NULL;
		lock_chunks(root);
		__btrfs_close_devices(cur_devices);
		unlock_chunks(root);
		free_fs_devices(cur_devices);
	}

	/*
	 * at this point, the device is zero sized.  We want to
	 * remove it from the devices list and zero out the old super
	 */
	if (clear_super) {
		/* make sure this device isn't detected as part of
		 * the FS anymore
		 */
		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
		set_buffer_dirty(bh);
		sync_dirty_buffer(bh);
	}

	ret = 0;

error_brelse:
	brelse(bh);
error_close:
	if (bdev)
		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
out:
	mutex_unlock(&uuid_mutex);
	return ret;
error_undo:
	if (device->writeable) {
		lock_chunks(root);
		list_add(&device->dev_alloc_list,
			 &root->fs_info->fs_devices->alloc_list);
		unlock_chunks(root);
		root->fs_info->fs_devices->rw_devices++;
	}
	goto error_brelse;
}

/*
 * does all the dirty work required for changing the file system's UUID.
 */
static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
				struct btrfs_root *root)
{
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	struct btrfs_fs_devices *old_devices;
	struct btrfs_fs_devices *seed_devices;
	struct btrfs_super_block *disk_super = root->fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	BUG_ON(!mutex_is_locked(&uuid_mutex));
	if (!fs_devices->seeding)
		return -EINVAL;

	seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
	if (!seed_devices)
		return -ENOMEM;

	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
		return PTR_ERR(old_devices);
	}

	list_add(&old_devices->list, &fs_uuids);

	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
	mutex_init(&seed_devices->device_list_mutex);

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			      synchronize_rcu);
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
	list_for_each_entry(device, &seed_devices->devices, dev_list) {
		device->fs_devices = seed_devices;
	}

	fs_devices->seeding = 0;
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
	fs_devices->seed = seed_devices;

	generate_random_uuid(fs_devices->fsid);
	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);

	return 0;
}

1528 /*
1529  * store the expected generation for seed devices in device items.
1530  */
1531 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1532                                struct btrfs_root *root)
1533 {
1534         struct btrfs_path *path;
1535         struct extent_buffer *leaf;
1536         struct btrfs_dev_item *dev_item;
1537         struct btrfs_device *device;
1538         struct btrfs_key key;
1539         u8 fs_uuid[BTRFS_UUID_SIZE];
1540         u8 dev_uuid[BTRFS_UUID_SIZE];
1541         u64 devid;
1542         int ret;
1543
1544         path = btrfs_alloc_path();
1545         if (!path)
1546                 return -ENOMEM;
1547
1548         root = root->fs_info->chunk_root;
1549         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1550         key.offset = 0;
1551         key.type = BTRFS_DEV_ITEM_KEY;
1552
1553         while (1) {
1554                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1555                 if (ret < 0)
1556                         goto error;
1557
1558                 leaf = path->nodes[0];
1559 next_slot:
1560                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1561                         ret = btrfs_next_leaf(root, path);
1562                         if (ret > 0)
1563                                 break;
1564                         if (ret < 0)
1565                                 goto error;
1566                         leaf = path->nodes[0];
1567                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1568                         btrfs_release_path(path);
1569                         continue;
1570                 }
1571
1572                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1573                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1574                     key.type != BTRFS_DEV_ITEM_KEY)
1575                         break;
1576
1577                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
1578                                           struct btrfs_dev_item);
1579                 devid = btrfs_device_id(leaf, dev_item);
1580                 read_extent_buffer(leaf, dev_uuid,
1581                                    (unsigned long)btrfs_device_uuid(dev_item),
1582                                    BTRFS_UUID_SIZE);
1583                 read_extent_buffer(leaf, fs_uuid,
1584                                    (unsigned long)btrfs_device_fsid(dev_item),
1585                                    BTRFS_UUID_SIZE);
1586                 device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1587                 BUG_ON(!device);
1588
1589                 if (device->fs_devices->seeding) {
1590                         btrfs_set_device_generation(leaf, dev_item,
1591                                                     device->generation);
1592                         btrfs_mark_buffer_dirty(leaf);
1593                 }
1594
1595                 path->slots[0]++;
1596                 goto next_slot;
1597         }
1598         ret = 0;
1599 error:
1600         btrfs_free_path(path);
1601         return ret;
1602 }
1603
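/*
 * add the device at @device_path to a mounted filesystem.  if the fs
 * is a seed filesystem, this also sprouts it: the existing devices are
 * split off into a read-only seed and a fresh fsid is generated for
 * the writable filesystem (see btrfs_prepare_sprout above).
 */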
1604 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1605 {
1606         struct request_queue *q;
1607         struct btrfs_trans_handle *trans;
1608         struct btrfs_device *device;
1609         struct block_device *bdev;
1610         struct list_head *devices;
1611         struct super_block *sb = root->fs_info->sb;
1612         u64 total_bytes;
1613         int seeding_dev = 0;
1614         int ret = 0;
1615
1616         if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1617                 return -EINVAL;
1618
1619         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
1620                                   root->fs_info->bdev_holder);
1621         if (IS_ERR(bdev))
1622                 return PTR_ERR(bdev);
1623
1624         if (root->fs_info->fs_devices->seeding) {
1625                 seeding_dev = 1;
1626                 down_write(&sb->s_umount);
1627                 mutex_lock(&uuid_mutex);
1628         }
1629
1630         filemap_write_and_wait(bdev->bd_inode->i_mapping);
1631
1632         devices = &root->fs_info->fs_devices->devices;
1633         /*
1634          * we have the volume lock, so we don't need the extra
1635          * device list mutex while reading the list here.
1636          */
1637         list_for_each_entry(device, devices, dev_list) {
1638                 if (device->bdev == bdev) {
1639                         ret = -EEXIST;
1640                         goto error;
1641                 }
1642         }
1643
1644         device = kzalloc(sizeof(*device), GFP_NOFS);
1645         if (!device) {
1646                 /* we can safely leave the fs_devices entry around */
1647                 ret = -ENOMEM;
1648                 goto error;
1649         }
1650
1651         device->name = kstrdup(device_path, GFP_NOFS);
1652         if (!device->name) {
1653                 kfree(device);
1654                 ret = -ENOMEM;
1655                 goto error;
1656         }
1657
1658         ret = find_next_devid(root, &device->devid);
1659         if (ret) {
1660                 kfree(device->name);
1661                 kfree(device);
1662                 goto error;
1663         }
1664
1665         trans = btrfs_start_transaction(root, 0);
1666         if (IS_ERR(trans)) {
1667                 kfree(device->name);
1668                 kfree(device);
1669                 ret = PTR_ERR(trans);
1670                 goto error;
1671         }
1672
1673         lock_chunks(root);
1674
1675         q = bdev_get_queue(bdev);
1676         if (blk_queue_discard(q))
1677                 device->can_discard = 1;
1678         device->writeable = 1;
1679         device->work.func = pending_bios_fn;
1680         generate_random_uuid(device->uuid);
1681         spin_lock_init(&device->io_lock);
1682         device->generation = trans->transid;
1683         device->io_width = root->sectorsize;
1684         device->io_align = root->sectorsize;
1685         device->sector_size = root->sectorsize;
1686         device->total_bytes = i_size_read(bdev->bd_inode);
1687         device->disk_total_bytes = device->total_bytes;
1688         device->dev_root = root->fs_info->dev_root;
1689         device->bdev = bdev;
1690         device->in_fs_metadata = 1;
1691         device->mode = FMODE_EXCL;
1692         set_blocksize(device->bdev, 4096);
1693
1694         if (seeding_dev) {
1695                 sb->s_flags &= ~MS_RDONLY;
1696                 ret = btrfs_prepare_sprout(trans, root);
1697                 BUG_ON(ret);
1698         }
1699
1700         device->fs_devices = root->fs_info->fs_devices;
1701
1702         /*
1703          * we don't want write_supers to jump in here with our device
1704          * half set up
1705          */
1706         mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1707         list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
1708         list_add(&device->dev_alloc_list,
1709                  &root->fs_info->fs_devices->alloc_list);
1710         root->fs_info->fs_devices->num_devices++;
1711         root->fs_info->fs_devices->open_devices++;
1712         root->fs_info->fs_devices->rw_devices++;
1713         if (device->can_discard)
1714                 root->fs_info->fs_devices->num_can_discard++;
1715         root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1716
1717         spin_lock(&root->fs_info->free_chunk_lock);
1718         root->fs_info->free_chunk_space += device->total_bytes;
1719         spin_unlock(&root->fs_info->free_chunk_lock);
1720
1721         if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1722                 root->fs_info->fs_devices->rotating = 1;
1723
1724         total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
1725         btrfs_set_super_total_bytes(root->fs_info->super_copy,
1726                                     total_bytes + device->total_bytes);
1727
1728         total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
1729         btrfs_set_super_num_devices(root->fs_info->super_copy,
1730                                     total_bytes + 1);
1731         mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1732
1733         if (seeding_dev) {
1734                 ret = init_first_rw_device(trans, root, device);
1735                 BUG_ON(ret);
1736                 ret = btrfs_finish_sprout(trans, root);
1737                 BUG_ON(ret);
1738         } else {
1739                 ret = btrfs_add_device(trans, root, device);
1740         }
1741
1742         /*
1743          * we've got more storage, clear any full flags on the space
1744          * infos
1745          */
1746         btrfs_clear_space_info_full(root->fs_info);
1747
1748         unlock_chunks(root);
1749         btrfs_commit_transaction(trans, root);
1750
1751         if (seeding_dev) {
1752                 mutex_unlock(&uuid_mutex);
1753                 up_write(&sb->s_umount);
1754
1755                 ret = btrfs_relocate_sys_chunks(root);
1756                 BUG_ON(ret);
1757         }
1758
1759         return ret;
1760 error:
1761         blkdev_put(bdev, FMODE_EXCL);
1762         if (seeding_dev) {
1763                 mutex_unlock(&uuid_mutex);
1764                 up_write(&sb->s_umount);
1765         }
1766         return ret;
1767 }
1768
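/*
 * write the cached fields of @device back into its dev item in the
 * chunk tree.  note that the on-disk total_bytes is taken from
 * disk_total_bytes, so a size change only reaches disk once the
 * caller has updated that field (see btrfs_shrink_device).
 */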
1769 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1770                                         struct btrfs_device *device)
1771 {
1772         int ret;
1773         struct btrfs_path *path;
1774         struct btrfs_root *root;
1775         struct btrfs_dev_item *dev_item;
1776         struct extent_buffer *leaf;
1777         struct btrfs_key key;
1778
1779         root = device->dev_root->fs_info->chunk_root;
1780
1781         path = btrfs_alloc_path();
1782         if (!path)
1783                 return -ENOMEM;
1784
1785         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1786         key.type = BTRFS_DEV_ITEM_KEY;
1787         key.offset = device->devid;
1788
1789         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1790         if (ret < 0)
1791                 goto out;
1792
1793         if (ret > 0) {
1794                 ret = -ENOENT;
1795                 goto out;
1796         }
1797
1798         leaf = path->nodes[0];
1799         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1800
1801         btrfs_set_device_id(leaf, dev_item, device->devid);
1802         btrfs_set_device_type(leaf, dev_item, device->type);
1803         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1804         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1805         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1806         btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
1807         btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1808         btrfs_mark_buffer_dirty(leaf);
1809
1810 out:
1811         btrfs_free_path(path);
1812         return ret;
1813 }
1814
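/*
 * grow @device to @new_size and account the gained bytes in the super
 * copy and in total_rw_bytes.  only growing is handled here; shrinking
 * goes through btrfs_shrink_device.  the btrfs_grow_device wrapper
 * below takes the chunk mutex for callers that don't hold it yet.
 */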
1815 static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1816                       struct btrfs_device *device, u64 new_size)
1817 {
1818         struct btrfs_super_block *super_copy =
1819                 device->dev_root->fs_info->super_copy;
1820         u64 old_total = btrfs_super_total_bytes(super_copy);
1821         u64 diff = new_size - device->total_bytes;
1822
1823         if (!device->writeable)
1824                 return -EACCES;
1825         if (new_size <= device->total_bytes)
1826                 return -EINVAL;
1827
1828         btrfs_set_super_total_bytes(super_copy, old_total + diff);
1829         device->fs_devices->total_rw_bytes += diff;
1830
1831         device->total_bytes = new_size;
1832         device->disk_total_bytes = new_size;
1833         btrfs_clear_space_info_full(device->dev_root->fs_info);
1834
1835         return btrfs_update_device(trans, device);
1836 }
1837
1838 int btrfs_grow_device(struct btrfs_trans_handle *trans,
1839                       struct btrfs_device *device, u64 new_size)
1840 {
1841         int ret;
1842         lock_chunks(device->dev_root);
1843         ret = __btrfs_grow_device(trans, device, new_size);
1844         unlock_chunks(device->dev_root);
1845         return ret;
1846 }
1847
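/*
 * delete the chunk item at (@chunk_objectid, @chunk_offset) from the
 * chunk tree.
 */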
1848 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1849                             struct btrfs_root *root,
1850                             u64 chunk_tree, u64 chunk_objectid,
1851                             u64 chunk_offset)
1852 {
1853         int ret;
1854         struct btrfs_path *path;
1855         struct btrfs_key key;
1856
1857         root = root->fs_info->chunk_root;
1858         path = btrfs_alloc_path();
1859         if (!path)
1860                 return -ENOMEM;
1861
1862         key.objectid = chunk_objectid;
1863         key.offset = chunk_offset;
1864         key.type = BTRFS_CHUNK_ITEM_KEY;
1865
1866         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1867         BUG_ON(ret);
1868
1869         ret = btrfs_del_item(trans, root, path);
1870
1871         btrfs_free_path(path);
1872         return ret;
1873 }
1874
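/*
 * drop the copy of a system chunk from the superblock's
 * sys_chunk_array.  the array is a packed sequence of
 * (btrfs_disk_key, btrfs_chunk) pairs, so we walk it entry by entry
 * and memmove the tail down over the entry being removed.
 */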
1875 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid,
1876                                u64 chunk_offset)
1877 {
1878         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
1879         struct btrfs_disk_key *disk_key;
1880         struct btrfs_chunk *chunk;
1881         u8 *ptr;
1882         int ret = 0;
1883         u32 num_stripes;
1884         u32 array_size;
1885         u32 len = 0;
1886         u32 cur;
1887         struct btrfs_key key;
1888
1889         array_size = btrfs_super_sys_array_size(super_copy);
1890
1891         ptr = super_copy->sys_chunk_array;
1892         cur = 0;
1893
1894         while (cur < array_size) {
1895                 disk_key = (struct btrfs_disk_key *)ptr;
1896                 btrfs_disk_key_to_cpu(&key, disk_key);
1897
1898                 len = sizeof(*disk_key);
1899
1900                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1901                         chunk = (struct btrfs_chunk *)(ptr + len);
1902                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1903                         len += btrfs_chunk_item_size(num_stripes);
1904                 } else {
1905                         ret = -EIO;
1906                         break;
1907                 }
1908                 if (key.objectid == chunk_objectid &&
1909                     key.offset == chunk_offset) {
1910                         memmove(ptr, ptr + len, array_size - (cur + len));
1911                         array_size -= len;
1912                         btrfs_set_super_sys_array_size(super_copy, array_size);
1913                 } else {
1914                         ptr += len;
1915                         cur += len;
1916                 }
1917         }
1918         return ret;
1919 }
1920
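/*
 * relocate one chunk: move all extents out of it, then delete its
 * device extents, its chunk tree item, its sys_chunk_array copy (for
 * system chunks) and its block group, and finally drop the extent
 * mapping.
 */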
1921 static int btrfs_relocate_chunk(struct btrfs_root *root,
1922                          u64 chunk_tree, u64 chunk_objectid,
1923                          u64 chunk_offset)
1924 {
1925         struct extent_map_tree *em_tree;
1926         struct btrfs_root *extent_root;
1927         struct btrfs_trans_handle *trans;
1928         struct extent_map *em;
1929         struct map_lookup *map;
1930         int ret;
1931         int i;
1932
1933         root = root->fs_info->chunk_root;
1934         extent_root = root->fs_info->extent_root;
1935         em_tree = &root->fs_info->mapping_tree.map_tree;
1936
1937         ret = btrfs_can_relocate(extent_root, chunk_offset);
1938         if (ret)
1939                 return -ENOSPC;
1940
1941         /* step one, relocate all the extents inside this chunk */
1942         ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1943         if (ret)
1944                 return ret;
1945
1946         trans = btrfs_start_transaction(root, 0);
1947         BUG_ON(IS_ERR(trans));
1948
1949         lock_chunks(root);
1950
1951         /*
1952          * step two, delete the device extents and the
1953          * chunk tree entries
1954          */
1955         read_lock(&em_tree->lock);
1956         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1957         read_unlock(&em_tree->lock);
1958
1959         BUG_ON(em->start > chunk_offset ||
1960                em->start + em->len < chunk_offset);
1961         map = (struct map_lookup *)em->bdev;
1962
1963         for (i = 0; i < map->num_stripes; i++) {
1964                 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1965                                             map->stripes[i].physical);
1966                 BUG_ON(ret);
1967
1968                 if (map->stripes[i].dev) {
1969                         ret = btrfs_update_device(trans, map->stripes[i].dev);
1970                         BUG_ON(ret);
1971                 }
1972         }
1973         ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1974                                chunk_offset);
1975
1976         BUG_ON(ret);
1977
1978         trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
1979
1980         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1981                 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1982                 BUG_ON(ret);
1983         }
1984
1985         ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1986         BUG_ON(ret);
1987
1988         write_lock(&em_tree->lock);
1989         remove_extent_mapping(em_tree, em);
1990         write_unlock(&em_tree->lock);
1991
1992         kfree(map);
1993         em->bdev = NULL;
1994
1995         /* once for the tree */
1996         free_extent_map(em);
1997         /* once for us */
1998         free_extent_map(em);
1999
2000         unlock_chunks(root);
2001         btrfs_end_transaction(trans, root);
2002         return 0;
2003 }
2004
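/*
 * relocate every SYSTEM chunk, walking the chunk tree from the highest
 * offset down.  chunks that fail with -ENOSPC are retried one more
 * time before giving up.  called after sprouting off a seed device.
 */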
2005 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
2006 {
2007         struct btrfs_root *chunk_root = root->fs_info->chunk_root;
2008         struct btrfs_path *path;
2009         struct extent_buffer *leaf;
2010         struct btrfs_chunk *chunk;
2011         struct btrfs_key key;
2012         struct btrfs_key found_key;
2013         u64 chunk_tree = chunk_root->root_key.objectid;
2014         u64 chunk_type;
2015         bool retried = false;
2016         int failed = 0;
2017         int ret;
2018
2019         path = btrfs_alloc_path();
2020         if (!path)
2021                 return -ENOMEM;
2022
2023 again:
2024         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2025         key.offset = (u64)-1;
2026         key.type = BTRFS_CHUNK_ITEM_KEY;
2027
2028         while (1) {
2029                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2030                 if (ret < 0)
2031                         goto error;
2032                 BUG_ON(ret == 0);
2033
2034                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
2035                                           key.type);
2036                 if (ret < 0)
2037                         goto error;
2038                 if (ret > 0)
2039                         break;
2040
2041                 leaf = path->nodes[0];
2042                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2043
2044                 chunk = btrfs_item_ptr(leaf, path->slots[0],
2045                                        struct btrfs_chunk);
2046                 chunk_type = btrfs_chunk_type(leaf, chunk);
2047                 btrfs_release_path(path);
2048
2049                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
2050                         ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
2051                                                    found_key.objectid,
2052                                                    found_key.offset);
2053                         if (ret == -ENOSPC)
2054                                 failed++;
2055                         else if (ret)
2056                                 BUG();
2057                 }
2058
2059                 if (found_key.offset == 0)
2060                         break;
2061                 key.offset = found_key.offset - 1;
2062         }
2063         ret = 0;
2064         if (failed && !retried) {
2065                 failed = 0;
2066                 retried = true;
2067                 goto again;
2068         } else if (failed && retried) {
2069                 WARN_ON(1);
2070                 ret = -ENOSPC;
2071         }
2072 error:
2073         btrfs_free_path(path);
2074         return ret;
2075 }
2076
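/*
 * write @bctl into the single balance item in the tree root so that an
 * interrupted balance can be picked up again on the next mount (see
 * btrfs_recover_balance).
 */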
2077 static int insert_balance_item(struct btrfs_root *root,
2078                                struct btrfs_balance_control *bctl)
2079 {
2080         struct btrfs_trans_handle *trans;
2081         struct btrfs_balance_item *item;
2082         struct btrfs_disk_balance_args disk_bargs;
2083         struct btrfs_path *path;
2084         struct extent_buffer *leaf;
2085         struct btrfs_key key;
2086         int ret, err;
2087
2088         path = btrfs_alloc_path();
2089         if (!path)
2090                 return -ENOMEM;
2091
2092         trans = btrfs_start_transaction(root, 0);
2093         if (IS_ERR(trans)) {
2094                 btrfs_free_path(path);
2095                 return PTR_ERR(trans);
2096         }
2097
2098         key.objectid = BTRFS_BALANCE_OBJECTID;
2099         key.type = BTRFS_BALANCE_ITEM_KEY;
2100         key.offset = 0;
2101
2102         ret = btrfs_insert_empty_item(trans, root, path, &key,
2103                                       sizeof(*item));
2104         if (ret)
2105                 goto out;
2106
2107         leaf = path->nodes[0];
2108         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2109
2110         memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2111
2112         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2113         btrfs_set_balance_data(leaf, item, &disk_bargs);
2114         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2115         btrfs_set_balance_meta(leaf, item, &disk_bargs);
2116         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2117         btrfs_set_balance_sys(leaf, item, &disk_bargs);
2118
2119         btrfs_set_balance_flags(leaf, item, bctl->flags);
2120
2121         btrfs_mark_buffer_dirty(leaf);
2122 out:
2123         btrfs_free_path(path);
2124         err = btrfs_commit_transaction(trans, root);
2125         if (err && !ret)
2126                 ret = err;
2127         return ret;
2128 }
2129
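/*
 * remove the on-disk balance item; called via __cancel_balance once
 * balance finishes or is cancelled.
 */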
2130 static int del_balance_item(struct btrfs_root *root)
2131 {
2132         struct btrfs_trans_handle *trans;
2133         struct btrfs_path *path;
2134         struct btrfs_key key;
2135         int ret, err;
2136
2137         path = btrfs_alloc_path();
2138         if (!path)
2139                 return -ENOMEM;
2140
2141         trans = btrfs_start_transaction(root, 0);
2142         if (IS_ERR(trans)) {
2143                 btrfs_free_path(path);
2144                 return PTR_ERR(trans);
2145         }
2146
2147         key.objectid = BTRFS_BALANCE_OBJECTID;
2148         key.type = BTRFS_BALANCE_ITEM_KEY;
2149         key.offset = 0;
2150
2151         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2152         if (ret < 0)
2153                 goto out;
2154         if (ret > 0) {
2155                 ret = -ENOENT;
2156                 goto out;
2157         }
2158
2159         ret = btrfs_del_item(trans, root, path);
2160 out:
2161         btrfs_free_path(path);
2162         err = btrfs_commit_transaction(trans, root);
2163         if (err && !ret)
2164                 ret = err;
2165         return ret;
2166 }
2167
2168 /*
2169  * This is a heuristic used to reduce the number of chunks balanced on
2170  * resume after balance was interrupted.
2171  */
2172 static void update_balance_args(struct btrfs_balance_control *bctl)
2173 {
2174         /*
2175          * Turn on soft mode for chunk types that were being converted.
2176          */
2177         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2178                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2179         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2180                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2181         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2182                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2183
2184         /*
2185          * Turn on the usage filter if it is not already in use.  The idea is
2186          * that chunks that we have already balanced should be
2187          * reasonably full.  Don't do it for chunks that are being
2188          * converted - that will keep us from relocating unconverted
2189          * (albeit full) chunks.
2190          */
2191         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2192             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2193                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2194                 bctl->data.usage = 90;
2195         }
2196         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2197             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2198                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2199                 bctl->sys.usage = 90;
2200         }
2201         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2202             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2203                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2204                 bctl->meta.usage = 90;
2205         }
2206 }
2207
2208 /*
2209  * Should be called with both balance and volume mutexes held to
2210  * serialize other volume operations (add_dev/rm_dev/resize) with
2211  * the restriper.  Same goes for unset_balance_control.
2212  */
2213 static void set_balance_control(struct btrfs_balance_control *bctl)
2214 {
2215         struct btrfs_fs_info *fs_info = bctl->fs_info;
2216
2217         BUG_ON(fs_info->balance_ctl);
2218
2219         spin_lock(&fs_info->balance_lock);
2220         fs_info->balance_ctl = bctl;
2221         spin_unlock(&fs_info->balance_lock);
2222 }
2223
2224 static void unset_balance_control(struct btrfs_fs_info *fs_info)
2225 {
2226         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2227
2228         BUG_ON(!fs_info->balance_ctl);
2229
2230         spin_lock(&fs_info->balance_lock);
2231         fs_info->balance_ctl = NULL;
2232         spin_unlock(&fs_info->balance_lock);
2233
2234         kfree(bctl);
2235 }
2236
2237 /*
2238  * Balance filters.  Return 1 if chunk should be filtered out
2239  * (should not be balanced).
2240  */
2241 static int chunk_profiles_filter(u64 chunk_profile,
2242                                  struct btrfs_balance_args *bargs)
2243 {
2244         chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
2245
2246         if (chunk_profile == 0)
2247                 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2248
2249         if (bargs->profiles & chunk_profile)
2250                 return 0;
2251
2252         return 1;
2253 }
2254
2255 static u64 div_factor_fine(u64 num, int factor)
2256 {
2257         if (factor <= 0)
2258                 return 0;
2259         if (factor >= 100)
2260                 return num;
2261
2262         num *= factor;
2263         do_div(num, 100);
2264         return num;
2265 }
2266
2267 static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2268                               struct btrfs_balance_args *bargs)
2269 {
2270         struct btrfs_block_group_cache *cache;
2271         u64 chunk_used, user_thresh;
2272         int ret = 1;
2273
2274         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2275         chunk_used = btrfs_block_group_used(&cache->item);
2276
2277         user_thresh = div_factor_fine(cache->key.offset, bargs->usage);
2278         if (chunk_used < user_thresh)
2279                 ret = 0;
2280
2281         btrfs_put_block_group(cache);
2282         return ret;
2283 }
2284
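/* filter out chunks that don't have a stripe on @bargs->devid */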
2285 static int chunk_devid_filter(struct extent_buffer *leaf,
2286                               struct btrfs_chunk *chunk,
2287                               struct btrfs_balance_args *bargs)
2288 {
2289         struct btrfs_stripe *stripe;
2290         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2291         int i;
2292
2293         for (i = 0; i < num_stripes; i++) {
2294                 stripe = btrfs_stripe_nr(chunk, i);
2295                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
2296                         return 0;
2297         }
2298
2299         return 1;
2300 }
2301
2302 /* [pstart, pend) -- physical byte range on the device given by devid */
2303 static int chunk_drange_filter(struct extent_buffer *leaf,
2304                                struct btrfs_chunk *chunk,
2305                                u64 chunk_offset,
2306                                struct btrfs_balance_args *bargs)
2307 {
2308         struct btrfs_stripe *stripe;
2309         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2310         u64 stripe_offset;
2311         u64 stripe_length;
2312         int factor;
2313         int i;
2314
2315         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
2316                 return 0;
2317
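        /*
         * the length of a stripe on disk is the chunk length divided
         * by the number of data stripes; mirrored profiles (DUP,
         * RAID1, RAID10) keep two copies, so halve num_stripes for
         * them before dividing.
         */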
2318         if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
2319              BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10))
2320                 factor = 2;
2321         else
2322                 factor = 1;
2323         factor = num_stripes / factor;
2324
2325         for (i = 0; i < num_stripes; i++) {
2326                 stripe = btrfs_stripe_nr(chunk, i);
2327                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
2328                         continue;
2329
2330                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
2331                 stripe_length = btrfs_chunk_length(leaf, chunk);
2332                 do_div(stripe_length, factor);
2333
2334                 if (stripe_offset < bargs->pend &&
2335                     stripe_offset + stripe_length > bargs->pstart)
2336                         return 0;
2337         }
2338
2339         return 1;
2340 }
2341
2342 /* [vstart, vend) -- range of logical (chunk tree) addresses */
2343 static int chunk_vrange_filter(struct extent_buffer *leaf,
2344                                struct btrfs_chunk *chunk,
2345                                u64 chunk_offset,
2346                                struct btrfs_balance_args *bargs)
2347 {
2348         if (chunk_offset < bargs->vend &&
2349             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
2350                 /* at least part of the chunk is inside this vrange */
2351                 return 0;
2352
2353         return 1;
2354 }
2355
2356 static int chunk_soft_convert_filter(u64 chunk_profile,
2357                                      struct btrfs_balance_args *bargs)
2358 {
2359         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
2360                 return 0;
2361
2362         chunk_profile &= BTRFS_BLOCK_GROUP_PROFILE_MASK;
2363
2364         if (chunk_profile == 0)
2365                 chunk_profile = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2366
2367         if (bargs->target & chunk_profile)
2368                 return 1;
2369
2370         return 0;
2371 }
2372
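/*
 * run all configured balance filters against one chunk.  returns 1 if
 * the chunk should be relocated, 0 if any filter rejects it.
 */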
2373 static int should_balance_chunk(struct btrfs_root *root,
2374                                 struct extent_buffer *leaf,
2375                                 struct btrfs_chunk *chunk, u64 chunk_offset)
2376 {
2377         struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
2378         struct btrfs_balance_args *bargs = NULL;
2379         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
2380
2381         /* type filter */
2382         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
2383               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
2384                 return 0;
2385         }
2386
2387         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
2388                 bargs = &bctl->data;
2389         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
2390                 bargs = &bctl->sys;
2391         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
2392                 bargs = &bctl->meta;
2393
2394         /* profiles filter */
2395         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
2396             chunk_profiles_filter(chunk_type, bargs)) {
2397                 return 0;
2398         }
2399
2400         /* usage filter */
2401         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
2402             chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
2403                 return 0;
2404         }
2405
2406         /* devid filter */
2407         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
2408             chunk_devid_filter(leaf, chunk, bargs)) {
2409                 return 0;
2410         }
2411
2412         /* drange filter, makes sense only with devid filter */
2413         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
2414             chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
2415                 return 0;
2416         }
2417
2418         /* vrange filter */
2419         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
2420             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
2421                 return 0;
2422         }
2423
2424         /* soft profile changing mode */
2425         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
2426             chunk_soft_convert_filter(chunk_type, bargs)) {
2427                 return 0;
2428         }
2429
2430         return 1;
2431 }
2432
2433 static u64 div_factor(u64 num, int factor)
2434 {
2435         if (factor == 10)
2436                 return num;
2437         num *= factor;
2438         do_div(num, 10);
2439         return num;
2440 }
2441
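/*
 * the main balance loop.  the chunk tree is scanned twice from the
 * highest offset down: the first, counting pass only fills in
 * bctl->stat.expected; the second pass relocates every chunk that
 * passes should_balance_chunk().  between chunks we check for cancel
 * requests, and for pause requests during the relocating pass.
 */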
2442 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
2443 {
2444         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2445         struct btrfs_root *chunk_root = fs_info->chunk_root;
2446         struct btrfs_root *dev_root = fs_info->dev_root;
2447         struct list_head *devices;
2448         struct btrfs_device *device;
2449         u64 old_size;
2450         u64 size_to_free;
2451         struct btrfs_chunk *chunk;
2452         struct btrfs_path *path;
2453         struct btrfs_key key;
2454         struct btrfs_key found_key;
2455         struct btrfs_trans_handle *trans;
2456         struct extent_buffer *leaf;
2457         int slot;
2458         int ret;
2459         int enospc_errors = 0;
2460         bool counting = true;
2461
2462         /* step one, make some room on all the devices */
2463         devices = &fs_info->fs_devices->devices;
2464         list_for_each_entry(device, devices, dev_list) {
2465                 old_size = device->total_bytes;
2466                 size_to_free = div_factor(old_size, 1);
2467                 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2468                 if (!device->writeable ||
2469                     device->total_bytes - device->bytes_used > size_to_free)
2470                         continue;
2471
2472                 ret = btrfs_shrink_device(device, old_size - size_to_free);
2473                 if (ret == -ENOSPC)
2474                         break;
2475                 BUG_ON(ret);
2476
2477                 trans = btrfs_start_transaction(dev_root, 0);
2478                 BUG_ON(IS_ERR(trans));
2479
2480                 ret = btrfs_grow_device(trans, device, old_size);
2481                 BUG_ON(ret);
2482
2483                 btrfs_end_transaction(trans, dev_root);
2484         }
2485
2486         /* step two, relocate all the chunks */
2487         path = btrfs_alloc_path();
2488         if (!path) {
2489                 ret = -ENOMEM;
2490                 goto error;
2491         }
2492
2493         /* zero out stat counters */
2494         spin_lock(&fs_info->balance_lock);
2495         memset(&bctl->stat, 0, sizeof(bctl->stat));
2496         spin_unlock(&fs_info->balance_lock);
2497 again:
2498         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2499         key.offset = (u64)-1;
2500         key.type = BTRFS_CHUNK_ITEM_KEY;
2501
2502         while (1) {
2503                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
2504                     atomic_read(&fs_info->balance_cancel_req)) {
2505                         ret = -ECANCELED;
2506                         goto error;
2507                 }
2508
2509                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2510                 if (ret < 0)
2511                         goto error;
2512
2513                 /*
2514                  * this shouldn't happen; it means the last relocate
2515                  * failed
2516                  */
2517                 if (ret == 0)
2518                         BUG(); /* FIXME break ? */
2519
2520                 ret = btrfs_previous_item(chunk_root, path, 0,
2521                                           BTRFS_CHUNK_ITEM_KEY);
2522                 if (ret) {
2523                         ret = 0;
2524                         break;
2525                 }
2526
2527                 leaf = path->nodes[0];
2528                 slot = path->slots[0];
2529                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
2530
2531                 if (found_key.objectid != key.objectid)
2532                         break;
2533
2534                 /* chunk zero is special */
2535                 if (found_key.offset == 0)
2536                         break;
2537
2538                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
2539
2540                 if (!counting) {
2541                         spin_lock(&fs_info->balance_lock);
2542                         bctl->stat.considered++;
2543                         spin_unlock(&fs_info->balance_lock);
2544                 }
2545
2546                 ret = should_balance_chunk(chunk_root, leaf, chunk,
2547                                            found_key.offset);
2548                 btrfs_release_path(path);
2549                 if (!ret)
2550                         goto loop;
2551
2552                 if (counting) {
2553                         spin_lock(&fs_info->balance_lock);
2554                         bctl->stat.expected++;
2555                         spin_unlock(&fs_info->balance_lock);
2556                         goto loop;
2557                 }
2558
2559                 ret = btrfs_relocate_chunk(chunk_root,
2560                                            chunk_root->root_key.objectid,
2561                                            found_key.objectid,
2562                                            found_key.offset);
2563                 if (ret && ret != -ENOSPC)
2564                         goto error;
2565                 if (ret == -ENOSPC) {
2566                         enospc_errors++;
2567                 } else {
2568                         spin_lock(&fs_info->balance_lock);
2569                         bctl->stat.completed++;
2570                         spin_unlock(&fs_info->balance_lock);
2571                 }
2572 loop:
2573                 key.offset = found_key.offset - 1;
2574         }
2575
2576         if (counting) {
2577                 btrfs_release_path(path);
2578                 counting = false;
2579                 goto again;
2580         }
2581 error:
2582         btrfs_free_path(path);
2583         if (enospc_errors) {
2584                 printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
2585                        enospc_errors);
2586                 if (!ret)
2587                         ret = -ENOSPC;
2588         }
2589
2590         return ret;
2591 }
2592
2593 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
2594 {
2595         /* cancel requested || normal exit path */
2596         return atomic_read(&fs_info->balance_cancel_req) ||
2597                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
2598                  atomic_read(&fs_info->balance_cancel_req) == 0);
2599 }
2600
2601 static void __cancel_balance(struct btrfs_fs_info *fs_info)
2602 {
2603         int ret;
2604
2605         unset_balance_control(fs_info);
2606         ret = del_balance_item(fs_info->tree_root);
2607         BUG_ON(ret);
2608 }
2609
2610 void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
2611                                struct btrfs_ioctl_balance_args *bargs);
2612
2613 /*
2614  * Should be called with both balance and volume mutexes held
2615  */
2616 int btrfs_balance(struct btrfs_balance_control *bctl,
2617                   struct btrfs_ioctl_balance_args *bargs)
2618 {
2619         struct btrfs_fs_info *fs_info = bctl->fs_info;
2620         u64 allowed;
2621         int ret;
2622
2623         if (btrfs_fs_closing(fs_info) ||
2624             atomic_read(&fs_info->balance_pause_req) ||
2625             atomic_read(&fs_info->balance_cancel_req)) {
2626                 ret = -EINVAL;
2627                 goto out;
2628         }
2629
2630         /*
2631          * In case of mixed groups, both data and metadata must be
2632          * picked, and identical options must be given for both of them.
2633          */
2634         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
2635         if ((allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
2636             (bctl->flags & (BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA))) {
2637                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
2638                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
2639                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
2640                         printk(KERN_ERR "btrfs: with mixed groups data and "
2641                                "metadata balance options must be the same\n");
2642                         ret = -EINVAL;
2643                         goto out;
2644                 }
2645         }
2646
2647         /*
2648          * Profile changing sanity checks.  Skip them if a simple
2649          * balance is requested.
2650          */
2651         if (!((bctl->data.flags | bctl->sys.flags | bctl->meta.flags) &
2652               BTRFS_BALANCE_ARGS_CONVERT))
2653                 goto do_balance;
2654
2655         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
2656         if (fs_info->fs_devices->num_devices == 1)
2657                 allowed |= BTRFS_BLOCK_GROUP_DUP;
2658         else if (fs_info->fs_devices->num_devices < 4)
2659                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
2660         else
2661                 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
2662                                 BTRFS_BLOCK_GROUP_RAID10);
2663
2664         if (!profile_is_valid(bctl->data.target, 1) ||
2665             bctl->data.target & ~allowed) {
2666                 printk(KERN_ERR "btrfs: unable to start balance with target "
2667                        "data profile %llu\n",
2668                        (unsigned long long)bctl->data.target);
2669                 ret = -EINVAL;
2670                 goto out;
2671         }
2672         if (!profile_is_valid(bctl->meta.target, 1) ||
2673             bctl->meta.target & ~allowed) {
2674                 printk(KERN_ERR "btrfs: unable to start balance with target "
2675                        "metadata profile %llu\n",
2676                        (unsigned long long)bctl->meta.target);
2677                 ret = -EINVAL;
2678                 goto out;
2679         }
2680         if (!profile_is_valid(bctl->sys.target, 1) ||
2681             bctl->sys.target & ~allowed) {
2682                 printk(KERN_ERR "btrfs: unable to start balance with target "
2683                        "system profile %llu\n",
2684                        (unsigned long long)bctl->sys.target);
2685                 ret = -EINVAL;
2686                 goto out;
2687         }
2688
2689         if (bctl->data.target & BTRFS_BLOCK_GROUP_DUP) {
2690                 printk(KERN_ERR "btrfs: dup for data is not allowed\n");
2691                 ret = -EINVAL;
2692                 goto out;
2693         }
2694
2695         /* allow reducing meta or sys integrity only if force is set */
2696         allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2697                         BTRFS_BLOCK_GROUP_RAID10;
2698         if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2699              (fs_info->avail_system_alloc_bits & allowed) &&
2700              !(bctl->sys.target & allowed)) ||
2701             ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
2702              (fs_info->avail_metadata_alloc_bits & allowed) &&
2703              !(bctl->meta.target & allowed))) {
2704                 if (bctl->flags & BTRFS_BALANCE_FORCE) {
2705                         printk(KERN_INFO "btrfs: force reducing metadata "
2706                                "integrity\n");
2707                 } else {
2708                         printk(KERN_ERR "btrfs: balance will reduce metadata "
2709                                "integrity, use force if you want this\n");
2710                         ret = -EINVAL;
2711                         goto out;
2712                 }
2713         }
2714
2715 do_balance:
2716         ret = insert_balance_item(fs_info->tree_root, bctl);
2717         if (ret && ret != -EEXIST)
2718                 goto out;
2719
2720         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
2721                 BUG_ON(ret == -EEXIST);
2722                 set_balance_control(bctl);
2723         } else {
2724                 BUG_ON(ret != -EEXIST);
2725                 spin_lock(&fs_info->balance_lock);
2726                 update_balance_args(bctl);
2727                 spin_unlock(&fs_info->balance_lock);
2728         }
2729
2730         atomic_inc(&fs_info->balance_running);
2731         mutex_unlock(&fs_info->balance_mutex);
2732
2733         ret = __btrfs_balance(fs_info);
2734
2735         mutex_lock(&fs_info->balance_mutex);
2736         atomic_dec(&fs_info->balance_running);
2737
2738         if (bargs) {
2739                 memset(bargs, 0, sizeof(*bargs));
2740                 update_ioctl_balance_args(fs_info, 0, bargs);
2741         }
2742
2743         if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
2744             balance_need_close(fs_info)) {
2745                 __cancel_balance(fs_info);
2746         }
2747
2748         wake_up(&fs_info->balance_wait_q);
2749
2750         return ret;
2751 out:
2752         if (bctl->flags & BTRFS_BALANCE_RESUME)
2753                 __cancel_balance(fs_info);
2754         else
2755                 kfree(bctl);
2756         return ret;
2757 }
2758
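/*
 * kthread that continues a previously interrupted balance on mount,
 * unless the skip_balance mount option was given.
 */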
2759 static int balance_kthread(void *data)
2760 {
2761         struct btrfs_balance_control *bctl =
2762                         (struct btrfs_balance_control *)data;
2763         struct btrfs_fs_info *fs_info = bctl->fs_info;
2764         int ret = 0;
2765
2766         mutex_lock(&fs_info->volume_mutex);
2767         mutex_lock(&fs_info->balance_mutex);
2768
2769         set_balance_control(bctl);
2770
2771         if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
2772                 printk(KERN_INFO "btrfs: force skipping balance\n");
2773         } else {
2774                 printk(KERN_INFO "btrfs: continuing balance\n");
2775                 ret = btrfs_balance(bctl, NULL);
2776         }
2777
2778         mutex_unlock(&fs_info->balance_mutex);
2779         mutex_unlock(&fs_info->volume_mutex);
2780         return ret;
2781 }
2782
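/*
 * look for a balance item left over from a previous mount and, if one
 * is found, rebuild the balance control and resume it in a kthread.
 */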
2783 int btrfs_recover_balance(struct btrfs_root *tree_root)
2784 {
2785         struct task_struct *tsk;
2786         struct btrfs_balance_control *bctl;
2787         struct btrfs_balance_item *item;
2788         struct btrfs_disk_balance_args disk_bargs;
2789         struct btrfs_path *path;
2790         struct extent_buffer *leaf;
2791         struct btrfs_key key;
2792         int ret;
2793
2794         path = btrfs_alloc_path();
2795         if (!path)
2796                 return -ENOMEM;
2797
2798         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
2799         if (!bctl) {
2800                 ret = -ENOMEM;
2801                 goto out;
2802         }
2803
2804         key.objectid = BTRFS_BALANCE_OBJECTID;
2805         key.type = BTRFS_BALANCE_ITEM_KEY;
2806         key.offset = 0;
2807
2808         ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
2809         if (ret < 0)
2810                 goto out_bctl;
2811         if (ret > 0) { /* ret = -ENOENT; */
2812                 ret = 0;
2813                 goto out_bctl;
2814         }
2815
2816         leaf = path->nodes[0];
2817         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2818
2819         bctl->fs_info = tree_root->fs_info;
2820         bctl->flags = btrfs_balance_flags(leaf, item) | BTRFS_BALANCE_RESUME;
2821
2822         btrfs_balance_data(leaf, item, &disk_bargs);
2823         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
2824         btrfs_balance_meta(leaf, item, &disk_bargs);
2825         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
2826         btrfs_balance_sys(leaf, item, &disk_bargs);
2827         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
2828
2829         tsk = kthread_run(balance_kthread, bctl, "btrfs-balance");
2830         if (IS_ERR(tsk))
2831                 ret = PTR_ERR(tsk);
2832         else
2833                 goto out;
2834
2835 out_bctl:
2836         kfree(bctl);
2837 out:
2838         btrfs_free_path(path);
2839         return ret;
2840 }
2841
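/*
 * pause a running balance.  the balance item is left on disk, so the
 * operation can be resumed later.
 */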
2842 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
2843 {
2844         int ret = 0;
2845
2846         mutex_lock(&fs_info->balance_mutex);
2847         if (!fs_info->balance_ctl) {
2848                 mutex_unlock(&fs_info->balance_mutex);
2849                 return -ENOTCONN;
2850         }
2851
2852         if (atomic_read(&fs_info->balance_running)) {
2853                 atomic_inc(&fs_info->balance_pause_req);
2854                 mutex_unlock(&fs_info->balance_mutex);
2855
2856                 wait_event(fs_info->balance_wait_q,
2857                            atomic_read(&fs_info->balance_running) == 0);
2858
2859                 mutex_lock(&fs_info->balance_mutex);
2860                 /* we are good with balance_ctl ripped off from under us */
2861                 BUG_ON(atomic_read(&fs_info->balance_running));
2862                 atomic_dec(&fs_info->balance_pause_req);
2863         } else {
2864                 ret = -ENOTCONN;
2865         }
2866
2867         mutex_unlock(&fs_info->balance_mutex);
2868         return ret;
2869 }
2870
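/*
 * cancel a running or paused balance and delete its on-disk item.
 */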
2871 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
2872 {
2873         mutex_lock(&fs_info->balance_mutex);
2874         if (!fs_info->balance_ctl) {
2875                 mutex_unlock(&fs_info->balance_mutex);
2876                 return -ENOTCONN;
2877         }
2878
2879         atomic_inc(&fs_info->balance_cancel_req);
2880         /*
2881          * if we are running, just wait and return; the balance item
2882          * is deleted in btrfs_balance in this case
2883          */
2884         if (atomic_read(&fs_info->balance_running)) {
2885                 mutex_unlock(&fs_info->balance_mutex);
2886                 wait_event(fs_info->balance_wait_q,
2887                            atomic_read(&fs_info->balance_running) == 0);
2888                 mutex_lock(&fs_info->balance_mutex);
2889         } else {
2890                 /* __cancel_balance needs volume_mutex */
2891                 mutex_unlock(&fs_info->balance_mutex);
2892                 mutex_lock(&fs_info->volume_mutex);
2893                 mutex_lock(&fs_info->balance_mutex);
2894
2895                 if (fs_info->balance_ctl)
2896                         __cancel_balance(fs_info);
2897
2898                 mutex_unlock(&fs_info->volume_mutex);
2899         }
2900
2901         BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
2902         atomic_dec(&fs_info->balance_cancel_req);
2903         mutex_unlock(&fs_info->balance_mutex);
2904         return 0;
2905 }
2906
2907 /*
2908  * shrinking a device means finding all of the device extents past
2909  * the new size, and then following the back refs to the chunks.
2910  * The chunk relocation code actually frees the device extent.
2911  */
2912 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2913 {
2914         struct btrfs_trans_handle *trans;
2915         struct btrfs_root *root = device->dev_root;
2916         struct btrfs_dev_extent *dev_extent = NULL;
2917         struct btrfs_path *path;
2918         u64 length;
2919         u64 chunk_tree;
2920         u64 chunk_objectid;
2921         u64 chunk_offset;
2922         int ret;
2923         int slot;
2924         int failed = 0;
2925         bool retried = false;
2926         struct extent_buffer *l;
2927         struct btrfs_key key;
2928         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2929         u64 old_total = btrfs_super_total_bytes(super_copy);
2930         u64 old_size = device->total_bytes;
2931         u64 diff = device->total_bytes - new_size;
2932
2933         if (new_size >= device->total_bytes)
2934                 return -EINVAL;
2935
2936         path = btrfs_alloc_path();
2937         if (!path)
2938                 return -ENOMEM;
2939
2940         path->reada = 2;
2941
2942         lock_chunks(root);
2943
2944         device->total_bytes = new_size;
2945         if (device->writeable) {
2946                 device->fs_devices->total_rw_bytes -= diff;
2947                 spin_lock(&root->fs_info->free_chunk_lock);
2948                 root->fs_info->free_chunk_space -= diff;
2949                 spin_unlock(&root->fs_info->free_chunk_lock);
2950         }
2951         unlock_chunks(root);
2952
2953 again:
2954         key.objectid = device->devid;
2955         key.offset = (u64)-1;
2956         key.type = BTRFS_DEV_EXTENT_KEY;
2957
2958         while (1) {
2959                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2960                 if (ret < 0)
2961                         goto done;
2962
2963                 ret = btrfs_previous_item(root, path, 0, key.type);
2964                 if (ret < 0)
2965                         goto done;
2966                 if (ret) {
2967                         ret = 0;
2968                         btrfs_release_path(path);
2969                         break;
2970                 }
2971
2972                 l = path->nodes[0];
2973                 slot = path->slots[0];
2974                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
2975
2976                 if (key.objectid != device->devid) {
2977                         btrfs_release_path(path);
2978                         break;
2979                 }
2980
2981                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2982                 length = btrfs_dev_extent_length(l, dev_extent);
2983
2984                 if (key.offset + length <= new_size) {
2985                         btrfs_release_path(path);
2986                         break;
2987                 }
2988
2989                 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2990                 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2991                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2992                 btrfs_release_path(path);
2993
2994                 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
2995                                            chunk_offset);
2996                 if (ret && ret != -ENOSPC)
2997                         goto done;
2998                 if (ret == -ENOSPC)
2999                         failed++;
3000                 key.offset -= 1;
3001         }
3002
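     /*
      * Chunks that hit -ENOSPC get exactly one more pass: the
      * relocations that did succeed may have freed enough room for
      * them.  If the retry fails as well, the in-memory size and the
      * free space counters are rolled back below.
      */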
3003         if (failed && !retried) {
3004                 failed = 0;
3005                 retried = true;
3006                 goto again;
3007         } else if (failed && retried) {
3008                 ret = -ENOSPC;
3009                 lock_chunks(root);
3010
3011                 device->total_bytes = old_size;
3012                 if (device->writeable)
3013                         device->fs_devices->total_rw_bytes += diff;
3014                 spin_lock(&root->fs_info->free_chunk_lock);
3015                 root->fs_info->free_chunk_space += diff;
3016                 spin_unlock(&root->fs_info->free_chunk_lock);
3017                 unlock_chunks(root);
3018                 goto done;
3019         }
3020
3021         /* Shrinking succeeded; otherwise we would already be at "done". */
3022         trans = btrfs_start_transaction(root, 0);
3023         if (IS_ERR(trans)) {
3024                 ret = PTR_ERR(trans);
3025                 goto done;
3026         }
3027
3028         lock_chunks(root);
3029
3030         device->disk_total_bytes = new_size;
3031         /* Now btrfs_update_device() will change the on-disk size. */
3032         ret = btrfs_update_device(trans, device);
3033         if (ret) {
3034                 unlock_chunks(root);
3035                 btrfs_end_transaction(trans, root);
3036                 goto done;
3037         }
3038         WARN_ON(diff > old_total);
3039         btrfs_set_super_total_bytes(super_copy, old_total - diff);
3040         unlock_chunks(root);
3041         btrfs_end_transaction(trans, root);
3042 done:
3043         btrfs_free_path(path);
3044         return ret;
3045 }
3046
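     /*
      * Caller-side sketch for btrfs_shrink_device() above (illustrative
      * only, not the exact ioctl handler): the resize path aligns the
      * target size to the sector size before calling in, roughly
      *
      *	new_size = round_down(new_size, root->sectorsize);
      *	ret = btrfs_shrink_device(device, new_size);
      *
      * and btrfs_shrink_device() itself rejects new_size >= total_bytes.
      */
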
3047 static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
3048                            struct btrfs_root *root,
3049                            struct btrfs_key *key,
3050                            struct btrfs_chunk *chunk, int item_size)
3051 {
3052         struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3053         struct btrfs_disk_key disk_key;
3054         u32 array_size;
3055         u8 *ptr;
3056
3057         array_size = btrfs_super_sys_array_size(super_copy);
3058         if (array_size + sizeof(disk_key) + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
3059                 return -EFBIG;
3060
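     /*
      * The array is a packed sequence of (disk key, chunk item) pairs;
      * both parts are appended below, which is why the -EFBIG check
      * above must cover sizeof(disk_key) + item_size bytes:
      *
      *	[key0][chunk0][key1][chunk1] ...
      *	                             ^ ptr = sys_chunk_array + array_size
      */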
3061         ptr = super_copy->sys_chunk_array + array_size;
3062         btrfs_cpu_key_to_disk(&disk_key, key);
3063         memcpy(ptr, &disk_key, sizeof(disk_key));
3064         ptr += sizeof(disk_key);
3065         memcpy(ptr, chunk, item_size);
3066         item_size += sizeof(disk_key);
3067         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
3068         return 0;
3069 }
3070
3071 /*
3072  * sort the devices in descending order by max_avail, then by total_avail
3073  */
3074 static int btrfs_cmp_device_info(const void *a, const void *b)
3075 {
3076         const struct btrfs_device_info *di_a = a;
3077         const struct btrfs_device_info *di_b = b;
3078
3079         if (di_a->max_avail > di_b->max_avail)
3080                 return -1;
3081         if (di_a->max_avail < di_b->max_avail)
3082                 return 1;
3083         if (di_a->total_avail > di_b->total_avail)
3084                 return -1;
3085         if (di_a->total_avail < di_b->total_avail)
3086                 return 1;
3087         return 0;
3088 }
3089
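     /*
      * The comparator above follows the usual <0 / 0 / >0 contract and
      * is handed to the kernel's heapsort, sort() from lib/sort.c,
      * later in __btrfs_alloc_chunk():
      *
      *	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
      *	     btrfs_cmp_device_info, NULL);
      */
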
3090 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
3091                                struct btrfs_root *extent_root,
3092                                struct map_lookup **map_ret,
3093                                u64 *num_bytes_out, u64 *stripe_size_out,
3094                                u64 start, u64 type)
3095 {
3096         struct btrfs_fs_info *info = extent_root->fs_info;
3097         struct btrfs_fs_devices *fs_devices = info->fs_devices;
3098         struct list_head *cur;
3099         struct map_lookup *map = NULL;
3100         struct extent_map_tree *em_tree;
3101         struct extent_map *em;
3102         struct btrfs_device_info *devices_info = NULL;
3103         u64 total_avail;
3104         int num_stripes;        /* total number of stripes to allocate */
3105         int sub_stripes;        /* sub_stripes info for map */
3106         int dev_stripes;        /* stripes per dev */
3107         int devs_max;           /* max devs to use */
3108         int devs_min;           /* min devs needed */
3109         int devs_increment;     /* ndevs has to be a multiple of this */
3110         int ncopies;            /* how many copies of the data we keep */
3111         int ret;
3112         u64 max_stripe_size;
3113         u64 max_chunk_size;
3114         u64 stripe_size;
3115         u64 num_bytes;
3116         int ndevs;
3117         int i;
3118         int j;
3119
3120         if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
3121             (type & BTRFS_BLOCK_GROUP_DUP)) {
3122                 WARN_ON(1);
3123                 type &= ~BTRFS_BLOCK_GROUP_DUP;
3124         }
3125
3126         if (list_empty(&fs_devices->alloc_list))
3127                 return -ENOSPC;
3128
3129         sub_stripes = 1;
3130         dev_stripes = 1;
3131         devs_increment = 1;
3132         ncopies = 1;
3133         devs_max = 0;   /* 0 == as many as possible */
3134         devs_min = 1;
3135
3136         /*
3137          * define the properties of each RAID type.
3138          * FIXME: move this to a global table and use it in all RAID
3139          * calculation code
3140          */
3141         if (type & (BTRFS_BLOCK_GROUP_DUP)) {
3142                 dev_stripes = 2;
3143                 ncopies = 2;
3144                 devs_max = 1;
3145         } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
3146                 devs_min = 2;
3147         } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
3148                 devs_increment = 2;
3149                 ncopies = 2;
3150                 devs_max = 2;
3151                 devs_min = 2;
3152         } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
3153                 sub_stripes = 2;
3154                 devs_increment = 2;
3155                 ncopies = 2;
3156                 devs_min = 4;
3157         } else {
3158                 devs_max = 1;
3159         }
3160
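     /*
      * For reference, the branches above work out to:
      *
      *	profile  dev_stripes  sub_stripes  devs_increment  ncopies  devs_min  devs_max
      *	single        1            1             1             1        1         1
      *	DUP           2            1             1             2        1         1
      *	RAID0         1            1             1             1        2         0
      *	RAID1         1            1             2             2        2         2
      *	RAID10        1            2             2             2        4         0
      *
      * (devs_max == 0 means "use as many devices as possible")
      */
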
3161         if (type & BTRFS_BLOCK_GROUP_DATA) {
3162                 max_stripe_size = 1024 * 1024 * 1024;
3163                 max_chunk_size = 10 * max_stripe_size;
3164         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
3165                 /* for larger filesystems, use larger metadata chunks */
3166                 if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
3167                         max_stripe_size = 1024 * 1024 * 1024;
3168                 else
3169                         max_stripe_size = 256 * 1024 * 1024;
3170                 max_chunk_size = max_stripe_size;
3171         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
3172                 max_stripe_size = 8 * 1024 * 1024;
3173                 max_chunk_size = 2 * max_stripe_size;
3174         } else {
3175                 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
3176                        type);
3177                 BUG_ON(1);
3178         }
3179
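     /*
      * Summary of the limits chosen above: DATA uses 1GiB stripes and up
      * to 10GiB chunks; METADATA uses 256MiB stripes (1GiB once the
      * filesystem exceeds 50GiB of writeable space) with chunk size equal
      * to stripe size; SYSTEM uses 8MiB stripes and 16MiB chunks.
      */
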
3180         /* we don't want a chunk larger than 10% of writeable space (div_factor(x, 1) == x / 10) */
3181         max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
3182                              max_chunk_size);
3183
3184         devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
3185                                GFP_NOFS);
3186         if (!devices_info)
3187                 return -ENOMEM;
3188
3189         cur = fs_devices->alloc_list.next;
3190
3191         /*
3192          * in the first pass through the devices list, we gather information
3193          * about the available holes on each device.
3194          */
3195         ndevs = 0;
3196         while (cur != &fs_devices->alloc_list) {
3197                 struct btrfs_device *device;
3198                 u64 max_avail;
3199                 u64 dev_offset;
3200
3201                 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
3202
3203                 cur = cur->next;
3204
3205                 if (!device->writeable) {
3206                         printk(KERN_ERR
3207                                "btrfs: read-only device in alloc_list\n");
3208                         WARN_ON(1);
3209                         continue;
3210                 }
3211
3212                 if (!device->in_fs_metadata)
3213                         continue;
3214
3215                 if (device->total_bytes > device->bytes_used)
3216                         total_avail = device->total_bytes - device->bytes_used;
3217                 else
3218                         total_avail = 0;
3219
3220                 /* If there is no space on this device, skip it. */
3221                 if (total_avail == 0)
3222                         continue;
3223
3224                 ret = find_free_dev_extent(trans, device,
3225                                            max_stripe_size * dev_stripes,
3226                                            &dev_offset, &max_avail);
3227                 if (ret && ret != -ENOSPC)
3228                         goto error;
3229
3230                 if (ret == 0)
3231                         max_avail = max_stripe_size * dev_stripes;
3232
3233                 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
3234                         continue;
3235
3236                 devices_info[ndevs].dev_offset = dev_offset;
3237                 devices_info[ndevs].max_avail = max_avail;
3238                 devices_info[ndevs].total_avail = total_avail;
3239                 devices_info[ndevs].dev = device;
3240                 ++ndevs;
3241         }
3242
3243         /*
3244          * now sort the devices by hole size / available space
3245          */
3246         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
3247              btrfs_cmp_device_info, NULL);
3248
3249         /* round down to number of usable stripes */
3250         ndevs -= ndevs % devs_increment;
3251
3252         if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
3253                 ret = -ENOSPC;
3254                 goto error;
3255         }
3256
3257         if (devs_max && ndevs > devs_max)
3258                 ndevs = devs_max;
3259         /*
3260          * the primary goal is to maximize the number of stripes, so use as many
3261          * devices as possible, even if the stripes are not maximum sized.
3262          */
3263         stripe_size = devices_info[ndevs-1].max_avail;
3264         num_stripes = ndevs * dev_stripes;
3265
3266         if (stripe_size * num_stripes > max_chunk_size * ncopies) {
3267                 stripe_size = max_chunk_size * ncopies;
3268                 do_div(stripe_size, num_stripes);
3269         }
3270
3271         do_div(stripe_size, dev_stripes);
3272         do_div(stripe_size, BTRFS_STRIPE_LEN);
3273         stripe_size *= BTRFS_STRIPE_LEN;
3274
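     /*
      * Worked example with assumed numbers: DUP metadata on one device
      * with a 512MiB hole on a small filesystem gives ndevs = 1,
      * dev_stripes = 2, num_stripes = 2, ncopies = 2 and
      * max_chunk_size = 256MiB.  Since 512MiB * 2 > 256MiB * 2,
      * stripe_size becomes 512MiB / 2 = 256MiB, then 128MiB after the
      * division by dev_stripes, already a multiple of BTRFS_STRIPE_LEN.
      */
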
3275         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3276         if (!map) {
3277                 ret = -ENOMEM;
3278                 goto error;
3279         }
3280         map->num_stripes = num_stripes;
3281
3282         for (i = 0; i < ndevs; ++i) {
3283                 for (j = 0; j < dev_stripes; ++j) {
3284                         int s = i * dev_stripes + j;
3285                         map->stripes[s].dev = devices_info[i].dev;
3286                         map->stripes[s].physical = devices_info[i].dev_offset +
3287                                                    j * stripe_size;
3288                 }
3289         }
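     /*
      * Layout produced by the loop above: stripe s = i * dev_stripes + j
      * lands on device i at dev_offset + j * stripe_size.  DUP
      * (dev_stripes == 2) thus puts both copies back to back on the same
      * device, while the RAID profiles here (dev_stripes == 1) put one
      * stripe per device.
      */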
3290         map->sector_size = extent_root->sectorsize;
3291         map->stripe_len = BTRFS_STRIPE_LEN;
3292         map->io_align = BTRFS_STRIPE_LEN;