blackfin: bf537: fix typo "CONFIG_SND_SOC_ADV80X_MODULE"
[linux.git] / fs / ocfs2 / move_extents.c
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * move_extents.c
5  *
6  * Copyright (C) 2011 Oracle.  All rights reserved.
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public
10  * License version 2 as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * General Public License for more details.
16  */
17 #include <linux/fs.h>
18 #include <linux/types.h>
19 #include <linux/mount.h>
20 #include <linux/swap.h>
21
22 #include <cluster/masklog.h>
23
24 #include "ocfs2.h"
25 #include "ocfs2_ioctl.h"
26
27 #include "alloc.h"
28 #include "aops.h"
29 #include "dlmglue.h"
30 #include "extent_map.h"
31 #include "inode.h"
32 #include "journal.h"
33 #include "suballoc.h"
34 #include "uptodate.h"
35 #include "super.h"
36 #include "dir.h"
37 #include "buffer_head_io.h"
38 #include "sysfile.h"
39 #include "refcounttree.h"
40 #include "move_extents.h"
41
42 struct ocfs2_move_extents_context {
43         struct inode *inode;
44         struct file *file;
45         int auto_defrag;
46         int partial;
47         int credits;
48         u32 new_phys_cpos;
49         u32 clusters_moved;
50         u64 refcount_loc;
51         struct ocfs2_move_extents *range;
52         struct ocfs2_extent_tree et;
53         struct ocfs2_alloc_context *meta_ac;
54         struct ocfs2_alloc_context *data_ac;
55         struct ocfs2_cached_dealloc_ctxt dealloc;
56 };
57
58 static int __ocfs2_move_extent(handle_t *handle,
59                                struct ocfs2_move_extents_context *context,
60                                u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
61                                int ext_flags)
62 {
63         int ret = 0, index;
64         struct inode *inode = context->inode;
65         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
66         struct ocfs2_extent_rec *rec, replace_rec;
67         struct ocfs2_path *path = NULL;
68         struct ocfs2_extent_list *el;
69         u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
70         u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
71
72         ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos,
73                                                p_cpos, new_p_cpos, len);
74         if (ret) {
75                 mlog_errno(ret);
76                 goto out;
77         }
78
79         memset(&replace_rec, 0, sizeof(replace_rec));
80         replace_rec.e_cpos = cpu_to_le32(cpos);
81         replace_rec.e_leaf_clusters = cpu_to_le16(len);
82         replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
83                                                                    new_p_cpos));
84
85         path = ocfs2_new_path_from_et(&context->et);
86         if (!path) {
87                 ret = -ENOMEM;
88                 mlog_errno(ret);
89                 goto out;
90         }
91
92         ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
93         if (ret) {
94                 mlog_errno(ret);
95                 goto out;
96         }
97
98         el = path_leaf_el(path);
99
100         index = ocfs2_search_extent_list(el, cpos);
101         if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
102                 ocfs2_error(inode->i_sb,
103                             "Inode %llu has an extent at cpos %u which can no "
104                             "longer be found.\n",
105                             (unsigned long long)ino, cpos);
106                 ret = -EROFS;
107                 goto out;
108         }
109
110         rec = &el->l_recs[index];
111
112         BUG_ON(ext_flags != rec->e_flags);
113         /*
114          * after moving/defraging to new location, the extent is not going
115          * to be refcounted anymore.
116          */
117         replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
118
119         ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
120                                       context->et.et_root_bh,
121                                       OCFS2_JOURNAL_ACCESS_WRITE);
122         if (ret) {
123                 mlog_errno(ret);
124                 goto out;
125         }
126
127         ret = ocfs2_split_extent(handle, &context->et, path, index,
128                                  &replace_rec, context->meta_ac,
129                                  &context->dealloc);
130         if (ret) {
131                 mlog_errno(ret);
132                 goto out;
133         }
134
135         ocfs2_journal_dirty(handle, context->et.et_root_bh);
136
137         context->new_phys_cpos = new_p_cpos;
138
139         /*
140          * need I to append truncate log for old clusters?
141          */
142         if (old_blkno) {
143                 if (ext_flags & OCFS2_EXT_REFCOUNTED)
144                         ret = ocfs2_decrease_refcount(inode, handle,
145                                         ocfs2_blocks_to_clusters(osb->sb,
146                                                                  old_blkno),
147                                         len, context->meta_ac,
148                                         &context->dealloc, 1);
149                 else
150                         ret = ocfs2_truncate_log_append(osb, handle,
151                                                         old_blkno, len);
152         }
153
154 out:
155         ocfs2_free_path(path);
156         return ret;
157 }
158
159 /*
160  * lock allocators, and reserving appropriate number of bits for
161  * meta blocks and data clusters.
162  *
163  * in some cases, we don't need to reserve clusters, just let data_ac
164  * be NULL.
165  */
166 static int ocfs2_lock_allocators_move_extents(struct inode *inode,
167                                         struct ocfs2_extent_tree *et,
168                                         u32 clusters_to_move,
169                                         u32 extents_to_split,
170                                         struct ocfs2_alloc_context **meta_ac,
171                                         struct ocfs2_alloc_context **data_ac,
172                                         int extra_blocks,
173                                         int *credits)
174 {
175         int ret, num_free_extents;
176         unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
177         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
178
179         num_free_extents = ocfs2_num_free_extents(osb, et);
180         if (num_free_extents < 0) {
181                 ret = num_free_extents;
182                 mlog_errno(ret);
183                 goto out;
184         }
185
186         if (!num_free_extents ||
187             (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
188                 extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
189
190         ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
191         if (ret) {
192                 mlog_errno(ret);
193                 goto out;
194         }
195
196         if (data_ac) {
197                 ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
198                 if (ret) {
199                         mlog_errno(ret);
200                         goto out;
201                 }
202         }
203
204         *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el);
205
206         mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
207              extra_blocks, clusters_to_move, *credits);
208 out:
209         if (ret) {
210                 if (*meta_ac) {
211                         ocfs2_free_alloc_context(*meta_ac);
212                         *meta_ac = NULL;
213                 }
214         }
215
216         return ret;
217 }
218
219 /*
220  * Using one journal handle to guarantee the data consistency in case
221  * crash happens anywhere.
222  *
223  *  XXX: defrag can end up with finishing partial extent as requested,
224  * due to not enough contiguous clusters can be found in allocator.
225  */
226 static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
227                                u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
228 {
229         int ret, credits = 0, extra_blocks = 0, partial = context->partial;
230         handle_t *handle;
231         struct inode *inode = context->inode;
232         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
233         struct inode *tl_inode = osb->osb_tl_inode;
234         struct ocfs2_refcount_tree *ref_tree = NULL;
235         u32 new_phys_cpos, new_len;
236         u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
237
238         if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
239
240                 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
241                          OCFS2_HAS_REFCOUNT_FL));
242
243                 BUG_ON(!context->refcount_loc);
244
245                 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
246                                                &ref_tree, NULL);
247                 if (ret) {
248                         mlog_errno(ret);
249                         return ret;
250                 }
251
252                 ret = ocfs2_prepare_refcount_change_for_del(inode,
253                                                         context->refcount_loc,
254                                                         phys_blkno,
255                                                         *len,
256                                                         &credits,
257                                                         &extra_blocks);
258                 if (ret) {
259                         mlog_errno(ret);
260                         goto out;
261                 }
262         }
263
264         ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
265                                                  &context->meta_ac,
266                                                  &context->data_ac,
267                                                  extra_blocks, &credits);
268         if (ret) {
269                 mlog_errno(ret);
270                 goto out;
271         }
272
273         /*
274          * should be using allocation reservation strategy there?
275          *
276          * if (context->data_ac)
277          *      context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
278          */
279
280         mutex_lock(&tl_inode->i_mutex);
281
282         if (ocfs2_truncate_log_needs_flush(osb)) {
283                 ret = __ocfs2_flush_truncate_log(osb);
284                 if (ret < 0) {
285                         mlog_errno(ret);
286                         goto out_unlock_mutex;
287                 }
288         }
289
290         handle = ocfs2_start_trans(osb, credits);
291         if (IS_ERR(handle)) {
292                 ret = PTR_ERR(handle);
293                 mlog_errno(ret);
294                 goto out_unlock_mutex;
295         }
296
297         ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
298                                      &new_phys_cpos, &new_len);
299         if (ret) {
300                 mlog_errno(ret);
301                 goto out_commit;
302         }
303
304         /*
305          * allowing partial extent moving is kind of 'pros and cons', it makes
306          * whole defragmentation less likely to fail, on the contrary, the bad
307          * thing is it may make the fs even more fragmented after moving, let
308          * userspace make a good decision here.
309          */
310         if (new_len != *len) {
311                 mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
312                 if (!partial) {
313                         context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
314                         ret = -ENOSPC;
315                         goto out_commit;
316                 }
317         }
318
319         mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
320              phys_cpos, new_phys_cpos);
321
322         ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
323                                   new_phys_cpos, ext_flags);
324         if (ret)
325                 mlog_errno(ret);
326
327         if (partial && (new_len != *len))
328                 *len = new_len;
329
330         /*
331          * Here we should write the new page out first if we are
332          * in write-back mode.
333          */
334         ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
335         if (ret)
336                 mlog_errno(ret);
337
338 out_commit:
339         ocfs2_commit_trans(osb, handle);
340
341 out_unlock_mutex:
342         mutex_unlock(&tl_inode->i_mutex);
343
344         if (context->data_ac) {
345                 ocfs2_free_alloc_context(context->data_ac);
346                 context->data_ac = NULL;
347         }
348
349         if (context->meta_ac) {
350                 ocfs2_free_alloc_context(context->meta_ac);
351                 context->meta_ac = NULL;
352         }
353
354 out:
355         if (ref_tree)
356                 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
357
358         return ret;
359 }
360
361 /*
362  * find the victim alloc group, where #blkno fits.
363  */
364 static int ocfs2_find_victim_alloc_group(struct inode *inode,
365                                          u64 vict_blkno,
366                                          int type, int slot,
367                                          int *vict_bit,
368                                          struct buffer_head **ret_bh)
369 {
370         int ret, i, bits_per_unit = 0;
371         u64 blkno;
372         char namebuf[40];
373
374         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
375         struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
376         struct ocfs2_chain_list *cl;
377         struct ocfs2_chain_rec *rec;
378         struct ocfs2_dinode *ac_dinode;
379         struct ocfs2_group_desc *bg;
380
381         ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
382         ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
383                                          strlen(namebuf), &blkno);
384         if (ret) {
385                 ret = -ENOENT;
386                 goto out;
387         }
388
389         ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
390         if (ret) {
391                 mlog_errno(ret);
392                 goto out;
393         }
394
395         ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
396         cl = &(ac_dinode->id2.i_chain);
397         rec = &(cl->cl_recs[0]);
398
399         if (type == GLOBAL_BITMAP_SYSTEM_INODE)
400                 bits_per_unit = osb->s_clustersize_bits -
401                                         inode->i_sb->s_blocksize_bits;
402         /*
403          * 'vict_blkno' was out of the valid range.
404          */
405         if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
406             (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
407                                 bits_per_unit))) {
408                 ret = -EINVAL;
409                 goto out;
410         }
411
412         for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
413
414                 rec = &(cl->cl_recs[i]);
415                 if (!rec)
416                         continue;
417
418                 bg = NULL;
419
420                 do {
421                         if (!bg)
422                                 blkno = le64_to_cpu(rec->c_blkno);
423                         else
424                                 blkno = le64_to_cpu(bg->bg_next_group);
425
426                         if (gd_bh) {
427                                 brelse(gd_bh);
428                                 gd_bh = NULL;
429                         }
430
431                         ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
432                         if (ret) {
433                                 mlog_errno(ret);
434                                 goto out;
435                         }
436
437                         bg = (struct ocfs2_group_desc *)gd_bh->b_data;
438
439                         if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
440                                                 le16_to_cpu(bg->bg_bits))) {
441
442                                 *ret_bh = gd_bh;
443                                 *vict_bit = (vict_blkno - blkno) >>
444                                                         bits_per_unit;
445                                 mlog(0, "find the victim group: #%llu, "
446                                      "total_bits: %u, vict_bit: %u\n",
447                                      blkno, le16_to_cpu(bg->bg_bits),
448                                      *vict_bit);
449                                 goto out;
450                         }
451
452                 } while (le64_to_cpu(bg->bg_next_group));
453         }
454
455         ret = -EINVAL;
456 out:
457         brelse(ac_bh);
458
459         /*
460          * caller has to release the gd_bh properly.
461          */
462         return ret;
463 }
464
465 /*
466  * XXX: helper to validate and adjust moving goal.
467  */
468 static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
469                                                struct ocfs2_move_extents *range)
470 {
471         int ret, goal_bit = 0;
472
473         struct buffer_head *gd_bh = NULL;
474         struct ocfs2_group_desc *bg;
475         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
476         int c_to_b = 1 << (osb->s_clustersize_bits -
477                                         inode->i_sb->s_blocksize_bits);
478
479         /*
480          * make goal become cluster aligned.
481          */
482         range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb,
483                                                       range->me_goal);
484         /*
485          * validate goal sits within global_bitmap, and return the victim
486          * group desc
487          */
488         ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
489                                             GLOBAL_BITMAP_SYSTEM_INODE,
490                                             OCFS2_INVALID_SLOT,
491                                             &goal_bit, &gd_bh);
492         if (ret)
493                 goto out;
494
495         bg = (struct ocfs2_group_desc *)gd_bh->b_data;
496
497         /*
498          * moving goal is not allowd to start with a group desc blok(#0 blk)
499          * let's compromise to the latter cluster.
500          */
501         if (range->me_goal == le64_to_cpu(bg->bg_blkno))
502                 range->me_goal += c_to_b;
503
504         /*
505          * movement is not gonna cross two groups.
506          */
507         if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
508                                                                 range->me_len) {
509                 ret = -EINVAL;
510                 goto out;
511         }
512         /*
513          * more exact validations/adjustments will be performed later during
514          * moving operation for each extent range.
515          */
516         mlog(0, "extents get ready to be moved to #%llu block\n",
517              range->me_goal);
518
519 out:
520         brelse(gd_bh);
521
522         return ret;
523 }
524
525 static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
526                                     int *goal_bit, u32 move_len, u32 max_hop,
527                                     u32 *phys_cpos)
528 {
529         int i, used, last_free_bits = 0, base_bit = *goal_bit;
530         struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
531         u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
532                                                  le64_to_cpu(gd->bg_blkno));
533
534         for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {
535
536                 used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
537                 if (used) {
538                         /*
539                          * we even tried searching the free chunk by jumping
540                          * a 'max_hop' distance, but still failed.
541                          */
542                         if ((i - base_bit) > max_hop) {
543                                 *phys_cpos = 0;
544                                 break;
545                         }
546
547                         if (last_free_bits)
548                                 last_free_bits = 0;
549
550                         continue;
551                 } else
552                         last_free_bits++;
553
554                 if (last_free_bits == move_len) {
555                         *goal_bit = i;
556                         *phys_cpos = base_cpos + i;
557                         break;
558                 }
559         }
560
561         mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
562 }
563
564 static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
565                              u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
566                              u32 len, int ext_flags)
567 {
568         int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
569         handle_t *handle;
570         struct inode *inode = context->inode;
571         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
572         struct inode *tl_inode = osb->osb_tl_inode;
573         struct inode *gb_inode = NULL;
574         struct buffer_head *gb_bh = NULL;
575         struct buffer_head *gd_bh = NULL;
576         struct ocfs2_group_desc *gd;
577         struct ocfs2_refcount_tree *ref_tree = NULL;
578         u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
579                                                     context->range->me_threshold);
580         u64 phys_blkno, new_phys_blkno;
581
582         phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
583
584         if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
585
586                 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
587                          OCFS2_HAS_REFCOUNT_FL));
588
589                 BUG_ON(!context->refcount_loc);
590
591                 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
592                                                &ref_tree, NULL);
593                 if (ret) {
594                         mlog_errno(ret);
595                         return ret;
596                 }
597
598                 ret = ocfs2_prepare_refcount_change_for_del(inode,
599                                                         context->refcount_loc,
600                                                         phys_blkno,
601                                                         len,
602                                                         &credits,
603                                                         &extra_blocks);
604                 if (ret) {
605                         mlog_errno(ret);
606                         goto out;
607                 }
608         }
609
610         ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
611                                                  &context->meta_ac,
612                                                  NULL, extra_blocks, &credits);
613         if (ret) {
614                 mlog_errno(ret);
615                 goto out;
616         }
617
618         /*
619          * need to count 2 extra credits for global_bitmap inode and
620          * group descriptor.
621          */
622         credits += OCFS2_INODE_UPDATE_CREDITS + 1;
623
624         /*
625          * ocfs2_move_extent() didn't reserve any clusters in lock_allocators()
626          * logic, while we still need to lock the global_bitmap.
627          */
628         gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
629                                                OCFS2_INVALID_SLOT);
630         if (!gb_inode) {
631                 mlog(ML_ERROR, "unable to get global_bitmap inode\n");
632                 ret = -EIO;
633                 goto out;
634         }
635
636         mutex_lock(&gb_inode->i_mutex);
637
638         ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
639         if (ret) {
640                 mlog_errno(ret);
641                 goto out_unlock_gb_mutex;
642         }
643
644         mutex_lock(&tl_inode->i_mutex);
645
646         handle = ocfs2_start_trans(osb, credits);
647         if (IS_ERR(handle)) {
648                 ret = PTR_ERR(handle);
649                 mlog_errno(ret);
650                 goto out_unlock_tl_inode;
651         }
652
653         new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
654         ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
655                                             GLOBAL_BITMAP_SYSTEM_INODE,
656                                             OCFS2_INVALID_SLOT,
657                                             &goal_bit, &gd_bh);
658         if (ret) {
659                 mlog_errno(ret);
660                 goto out_commit;
661         }
662
663         /*
664          * probe the victim cluster group to find a proper
665          * region to fit wanted movement, it even will perfrom
666          * a best-effort attempt by compromising to a threshold
667          * around the goal.
668          */
669         ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
670                                 new_phys_cpos);
671         if (!*new_phys_cpos) {
672                 ret = -ENOSPC;
673                 goto out_commit;
674         }
675
676         ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
677                                   *new_phys_cpos, ext_flags);
678         if (ret) {
679                 mlog_errno(ret);
680                 goto out_commit;
681         }
682
683         gd = (struct ocfs2_group_desc *)gd_bh->b_data;
684         ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
685                                                le16_to_cpu(gd->bg_chain));
686         if (ret) {
687                 mlog_errno(ret);
688                 goto out_commit;
689         }
690
691         ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
692                                          goal_bit, len);
693         if (ret)
694                 mlog_errno(ret);
695
696         /*
697          * Here we should write the new page out first if we are
698          * in write-back mode.
699          */
700         ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
701         if (ret)
702                 mlog_errno(ret);
703
704 out_commit:
705         ocfs2_commit_trans(osb, handle);
706         brelse(gd_bh);
707
708 out_unlock_tl_inode:
709         mutex_unlock(&tl_inode->i_mutex);
710
711         ocfs2_inode_unlock(gb_inode, 1);
712 out_unlock_gb_mutex:
713         mutex_unlock(&gb_inode->i_mutex);
714         brelse(gb_bh);
715         iput(gb_inode);
716
717 out:
718         if (context->meta_ac) {
719                 ocfs2_free_alloc_context(context->meta_ac);
720                 context->meta_ac = NULL;
721         }
722
723         if (ref_tree)
724                 ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
725
726         return ret;
727 }
728
/*
 * Helper deciding how much of an extent takes part in the current
 * defragmentation run, according to 'threshold'.
 *
 * '*len_defraged' is the amount already gathered in this run and
 * '*alloc_size' the size of the extent at hand:
 *
 * - the extent still fits under the threshold: it is added to the run;
 * - nothing was gathered yet and the extent alone reaches the threshold:
 *   '*skip' is set to 1 and nothing else changes;
 * - otherwise '*alloc_size' is cut down to what is missing to reach the
 *   threshold and '*len_defraged' is reset so a new run starts.
 */
static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
					 u32 threshold, int *skip)
{
	u32 gathered = *len_defraged;

	/* keep gathering until we meet the threshold */
	if (*alloc_size + gathered < threshold) {
		*len_defraged = gathered + *alloc_size;
		return;
	}

	/* XXX: skip a large extent. */
	if (!gathered) {
		*skip = 1;
		return;
	}

	/*
	 * split this extent so that it coalesces with the former pieces
	 * up to the threshold; this cycle is complete and resetting
	 * 'len_defraged' forces a new one.
	 */
	*alloc_size = threshold - gathered;
	*len_defraged = 0;
}
758
759 static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
760                                 struct ocfs2_move_extents_context *context)
761 {
762         int ret = 0, flags, do_defrag, skip = 0;
763         u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
764         u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;
765
766         struct inode *inode = context->inode;
767         struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
768         struct ocfs2_move_extents *range = context->range;
769         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
770
771         if ((i_size_read(inode) == 0) || (range->me_len == 0))
772                 return 0;
773
774         if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
775                 return 0;
776
777         context->refcount_loc = le64_to_cpu(di->i_refcount_loc);
778
779         ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
780         ocfs2_init_dealloc_ctxt(&context->dealloc);
781
782         /*
783          * TO-DO XXX:
784          *
785          * - xattr extents.
786          */
787
788         do_defrag = context->auto_defrag;
789
790         /*
791          * extents moving happens in unit of clusters, for the sake
792          * of simplicity, we may ignore two clusters where 'byte_start'
793          * and 'byte_start + len' were within.
794          */
795         move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
796         len_to_move = (range->me_start + range->me_len) >>
797                                                 osb->s_clustersize_bits;
798         if (len_to_move >= move_start)
799                 len_to_move -= move_start;
800         else
801                 len_to_move = 0;
802
803         if (do_defrag) {
804                 defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
805                 if (defrag_thresh <= 1)
806                         goto done;
807         } else
808                 new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
809                                                          range->me_goal);
810
811         mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
812              "thresh: %u\n",
813              (unsigned long long)OCFS2_I(inode)->ip_blkno,
814              (unsigned long long)range->me_start,
815              (unsigned long long)range->me_len,
816              move_start, len_to_move, defrag_thresh);
817
818         cpos = move_start;
819         while (len_to_move) {
820                 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
821                                          &flags);
822                 if (ret) {
823                         mlog_errno(ret);
824                         goto out;
825                 }
826
827                 if (alloc_size > len_to_move)
828                         alloc_size = len_to_move;
829
830                 /*
831                  * XXX: how to deal with a hole:
832                  *
833                  * - skip the hole of course
834                  * - force a new defragmentation
835                  */
836                 if (!phys_cpos) {
837                         if (do_defrag)
838                                 len_defraged = 0;
839
840                         goto next;
841                 }
842
843                 if (do_defrag) {
844                         ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
845                                                      defrag_thresh, &skip);
846                         /*
847                          * skip large extents
848                          */
849                         if (skip) {
850                                 skip = 0;
851                                 goto next;
852                         }
853
854                         mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
855                              "alloc_size: %u, len_defraged: %u\n",
856                              cpos, phys_cpos, alloc_size, len_defraged);
857
858                         ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
859                                                   &alloc_size, flags);
860                 } else {
861                         ret = ocfs2_move_extent(context, cpos, phys_cpos,
862                                                 &new_phys_cpos, alloc_size,
863                                                 flags);
864
865                         new_phys_cpos += alloc_size;
866                 }
867
868                 if (ret < 0) {
869                         mlog_errno(ret);
870                         goto out;
871                 }
872
873                 context->clusters_moved += alloc_size;
874 next:
875                 cpos += alloc_size;
876                 len_to_move -= alloc_size;
877         }
878
879 done:
880         range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;
881
882 out:
883         range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
884                                                       context->clusters_moved);
885         range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
886                                                        context->new_phys_cpos);
887
888         ocfs2_schedule_truncate_log_flush(osb, 1);
889         ocfs2_run_deallocs(osb, &context->dealloc);
890
891         return ret;
892 }
893
894 static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
895 {
896         int status;
897         handle_t *handle;
898         struct inode *inode = context->inode;
899         struct ocfs2_dinode *di;
900         struct buffer_head *di_bh = NULL;
901         struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
902
903         if (!inode)
904                 return -ENOENT;
905
906         if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
907                 return -EROFS;
908
909         mutex_lock(&inode->i_mutex);
910
911         /*
912          * This prevents concurrent writes from other nodes
913          */
914         status = ocfs2_rw_lock(inode, 1);
915         if (status) {
916                 mlog_errno(status);
917                 goto out;
918         }
919
920         status = ocfs2_inode_lock(inode, &di_bh, 1);
921         if (status) {
922                 mlog_errno(status);
923                 goto out_rw_unlock;
924         }
925
926         /*
927          * rememer ip_xattr_sem also needs to be held if necessary
928          */
929         down_write(&OCFS2_I(inode)->ip_alloc_sem);
930
931         status = __ocfs2_move_extents_range(di_bh, context);
932
933         up_write(&OCFS2_I(inode)->ip_alloc_sem);
934         if (status) {
935                 mlog_errno(status);
936                 goto out_inode_unlock;
937         }
938
939         /*
940          * We update ctime for these changes
941          */
942         handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
943         if (IS_ERR(handle)) {
944                 status = PTR_ERR(handle);
945                 mlog_errno(status);
946                 goto out_inode_unlock;
947         }
948
949         status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
950                                          OCFS2_JOURNAL_ACCESS_WRITE);
951         if (status) {
952                 mlog_errno(status);
953                 goto out_commit;
954         }
955
956         di = (struct ocfs2_dinode *)di_bh->b_data;
957         inode->i_ctime = CURRENT_TIME;
958         di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
959         di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
960
961         ocfs2_journal_dirty(handle, di_bh);
962
963 out_commit:
964         ocfs2_commit_trans(osb, handle);
965
966 out_inode_unlock:
967         brelse(di_bh);
968         ocfs2_inode_unlock(inode, 1);
969 out_rw_unlock:
970         ocfs2_rw_unlock(inode, 1);
971 out:
972         mutex_unlock(&inode->i_mutex);
973
974         return status;
975 }
976
977 int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
978 {
979         int status;
980
981         struct inode *inode = file_inode(filp);
982         struct ocfs2_move_extents range;
983         struct ocfs2_move_extents_context *context;
984
985         if (!argp)
986                 return -EINVAL;
987
988         status = mnt_want_write_file(filp);
989         if (status)
990                 return status;
991
992         if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) {
993                 status = -EPERM;
994                 goto out_drop;
995         }
996
997         if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
998                 status = -EPERM;
999                 goto out_drop;
1000         }
1001
1002         context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
1003         if (!context) {
1004                 status = -ENOMEM;
1005                 mlog_errno(status);
1006                 goto out_drop;
1007         }
1008
1009         context->inode = inode;
1010         context->file = filp;
1011
1012         if (copy_from_user(&range, argp, sizeof(range))) {
1013                 status = -EFAULT;
1014                 goto out_free;
1015         }
1016
1017         if (range.me_start > i_size_read(inode)) {
1018                 status = -EINVAL;
1019                 goto out_free;
1020         }
1021
1022         if (range.me_start + range.me_len > i_size_read(inode))
1023                         range.me_len = i_size_read(inode) - range.me_start;
1024
1025         context->range = &range;
1026
1027         if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
1028                 context->auto_defrag = 1;
1029                 /*
1030                  * ok, the default theshold for the defragmentation
1031                  * is 1M, since our maximum clustersize was 1M also.
1032                  * any thought?
1033                  */
1034                 if (!range.me_threshold)
1035                         range.me_threshold = 1024 * 1024;
1036
1037                 if (range.me_threshold > i_size_read(inode))
1038                         range.me_threshold = i_size_read(inode);
1039
1040                 if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
1041                         context->partial = 1;
1042         } else {
1043                 /*
1044                  * first best-effort attempt to validate and adjust the goal
1045                  * (physical address in block), while it can't guarantee later
1046                  * operation can succeed all the time since global_bitmap may
1047                  * change a bit over time.
1048                  */
1049
1050                 status = ocfs2_validate_and_adjust_move_goal(inode, &range);
1051                 if (status)
1052                         goto out_copy;
1053         }
1054
1055         status = ocfs2_move_extents(context);
1056         if (status)
1057                 mlog_errno(status);
1058 out_copy:
1059         /*
1060          * movement/defragmentation may end up being partially completed,
1061          * that's the reason why we need to return userspace the finished
1062          * length and new_offset even if failure happens somewhere.
1063          */
1064         if (copy_to_user(argp, &range, sizeof(range)))
1065                 status = -EFAULT;
1066
1067 out_free:
1068         kfree(context);
1069 out_drop:
1070         mnt_drop_write_file(filp);
1071
1072         return status;
1073 }