Skip to content

Commit 9e15f50

Browse files
adam900710 and kdave
authored and committed
btrfs: make btrfs_repair_io_failure() handle bs > ps cases without large folios
Currently btrfs_repair_io_failure() only accepts a single @paddr parameter, and for bs > ps cases it's required that @paddr is backed by a large folio. That assumption has quite a few limitations, preventing us from utilizing true zero-copy direct-io and encoded reads/writes. To address the problem, enhance btrfs_repair_io_failure() by: - Accept an array of paddrs, up to 64K / PAGE_SIZE entries This kind of acts like a bio_vec, but with very limited entries, as the function is only utilized to repair one fs data block, or a tree block. Both have an upper size limit (BTRFS_MAX_BLOCKSIZE, i.e. 64K), so we don't need the full bio_vec thing to handle it. - Allocate a bio with multiple slots Previously even for bs > ps cases, we only passed in a contiguous physical address range, thus a single slot was enough. But not anymore, so we have to allocate a bio structure, rather than using the on-stack one. - Use on-stack memory to allocate @paddrs array It's at most 16 pages (4K page size, 64K block size), which will take up at most 128 bytes. I think the on-stack cost is still acceptable. - Add one extra check to make sure the repair bio is exactly one block - Utilize btrfs_repair_io_failure() to submit a single bio for metadata This should improve the read-repair performance for metadata, as now we submit a node sized bio then wait, rather than submitting each block of the metadata and waiting for each submitted block. - Add one extra parameter indicating the step This is due to the fact that the metadata step can be as large as nodesize, instead of sectorsize. So we need a way to distinguish metadata and data repair. - Reduce the width of @length parameter of btrfs_repair_io_failure() Since we only call btrfs_repair_io_failure() on a single data or metadata block, u64 is overkill. Use u32 instead and add one extra ASSERT() to make sure the length never exceeds BTRFS_MAX_BLOCKSIZE. Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
1 parent 42a3bc2 commit 9e15f50

File tree

3 files changed

+76
-28
lines changed

3 files changed

+76
-28
lines changed

fs/btrfs/bio.c

Lines changed: 54 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,21 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
172172
struct btrfs_inode *inode = repair_bbio->inode;
173173
struct btrfs_fs_info *fs_info = inode->root->fs_info;
174174
struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
175+
/*
176+
* We can not move forward the saved_iter, as it will be later
177+
* utilized by repair_bbio again.
178+
*/
179+
struct bvec_iter saved_iter = repair_bbio->saved_iter;
180+
const u32 step = min(fs_info->sectorsize, PAGE_SIZE);
181+
const u64 logical = repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT;
182+
const u32 nr_steps = repair_bbio->saved_iter.bi_size / step;
175183
int mirror = repair_bbio->mirror_num;
184+
phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
185+
phys_addr_t paddr;
186+
unsigned int slot = 0;
187+
188+
/* Repair bbio should be exactly one block sized. */
189+
ASSERT(repair_bbio->saved_iter.bi_size == fs_info->sectorsize);
176190

177191
if (repair_bbio->bio.bi_status ||
178192
!btrfs_data_csum_ok(repair_bbio, dev, 0, bvec_phys(bv))) {
@@ -190,12 +204,17 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
190204
return;
191205
}
192206

207+
btrfs_bio_for_each_block(paddr, &repair_bbio->bio, &saved_iter, step) {
208+
ASSERT(slot < nr_steps);
209+
paddrs[slot] = paddr;
210+
slot++;
211+
}
212+
193213
do {
194214
mirror = prev_repair_mirror(fbio, mirror);
195215
btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
196216
repair_bbio->file_offset, fs_info->sectorsize,
197-
repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
198-
bvec_phys(bv), mirror);
217+
logical, paddrs, step, mirror);
199218
} while (mirror != fbio->bbio->mirror_num);
200219

201220
done:
@@ -866,18 +885,36 @@ void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num)
866885
*
867886
* The I/O is issued synchronously to block the repair read completion from
868887
* freeing the bio.
888+
*
889+
* @ino: Offending inode number
890+
* @fileoff: File offset inside the inode
891+
* @length: Length of the repair write
892+
* @logical: Logical address of the range
893+
* @paddrs: Physical address array of the content
894+
* @step: Length of each entry in @paddrs
895+
* @mirror_num: Mirror number to write to. Must not be zero
869896
*/
870-
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
871-
u64 length, u64 logical, phys_addr_t paddr, int mirror_num)
897+
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 fileoff,
898+
u32 length, u64 logical, const phys_addr_t paddrs[],
899+
unsigned int step, int mirror_num)
872900
{
901+
const u32 nr_steps = DIV_ROUND_UP_POW2(length, step);
873902
struct btrfs_io_stripe smap = { 0 };
874-
struct bio_vec bvec;
875-
struct bio bio;
903+
struct bio *bio = NULL;
876904
int ret = 0;
877905

878906
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
879907
BUG_ON(!mirror_num);
880908

909+
/* Basic alignment checks. */
910+
ASSERT(IS_ALIGNED(logical, fs_info->sectorsize));
911+
ASSERT(IS_ALIGNED(length, fs_info->sectorsize));
912+
ASSERT(IS_ALIGNED(fileoff, fs_info->sectorsize));
913+
/* Either it's a single data or metadata block. */
914+
ASSERT(length <= BTRFS_MAX_BLOCKSIZE);
915+
ASSERT(step <= length);
916+
ASSERT(is_power_of_2(step));
917+
881918
if (btrfs_repair_one_zone(fs_info, logical))
882919
return 0;
883920

@@ -897,24 +934,27 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
897934
goto out_counter_dec;
898935
}
899936

900-
bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
901-
bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
902-
__bio_add_page(&bio, phys_to_page(paddr), length, offset_in_page(paddr));
903-
ret = submit_bio_wait(&bio);
937+
bio = bio_alloc(smap.dev->bdev, nr_steps, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS);
938+
bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
939+
for (int i = 0; i < nr_steps; i++) {
940+
ret = bio_add_page(bio, phys_to_page(paddrs[i]), step, offset_in_page(paddrs[i]));
941+
/* We should have allocated enough slots to contain all the different pages. */
942+
ASSERT(ret == step);
943+
}
944+
ret = submit_bio_wait(bio);
945+
bio_put(bio);
904946
if (ret) {
905947
/* try to remap that extent elsewhere? */
906948
btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS);
907-
goto out_bio_uninit;
949+
goto out_counter_dec;
908950
}
909951

910952
btrfs_info_rl(fs_info,
911953
"read error corrected: ino %llu off %llu (dev %s sector %llu)",
912-
ino, start, btrfs_dev_name(smap.dev),
954+
ino, fileoff, btrfs_dev_name(smap.dev),
913955
smap.physical >> SECTOR_SHIFT);
914956
ret = 0;
915957

916-
out_bio_uninit:
917-
bio_uninit(&bio);
918958
out_counter_dec:
919959
btrfs_bio_counter_dec(fs_info);
920960
return ret;

fs/btrfs/bio.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,8 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status);
117117

118118
void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num);
119119
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace);
120-
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
121-
u64 length, u64 logical, phys_addr_t paddr, int mirror_num);
120+
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 fileoff,
121+
u32 length, u64 logical, const phys_addr_t paddrs[],
122+
unsigned int step, int mirror_num);
122123

123124
#endif

fs/btrfs/disk-io.c

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -183,26 +183,33 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
183183
int mirror_num)
184184
{
185185
struct btrfs_fs_info *fs_info = eb->fs_info;
186+
const u32 step = min(fs_info->nodesize, PAGE_SIZE);
187+
const u32 nr_steps = eb->len / step;
188+
phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE];
186189
int ret = 0;
187190

188191
if (sb_rdonly(fs_info->sb))
189192
return -EROFS;
190193

191-
for (int i = 0; i < num_extent_folios(eb); i++) {
194+
for (int i = 0; i < num_extent_pages(eb); i++) {
192195
struct folio *folio = eb->folios[i];
193-
u64 start = max_t(u64, eb->start, folio_pos(folio));
194-
u64 end = min_t(u64, eb->start + eb->len,
195-
folio_pos(folio) + eb->folio_size);
196-
u32 len = end - start;
197-
phys_addr_t paddr = PFN_PHYS(folio_pfn(folio)) +
198-
offset_in_folio(folio, start);
199-
200-
ret = btrfs_repair_io_failure(fs_info, 0, start, len, start,
201-
paddr, mirror_num);
202-
if (ret)
203-
break;
196+
197+
/* No large folio support yet. */
198+
ASSERT(folio_order(folio) == 0);
199+
ASSERT(i < nr_steps);
200+
201+
/*
202+
* For nodesize < page size, there is just one paddr, with some
203+
* offset inside the page.
204+
*
205+
* For nodesize >= page size, it's one or more paddrs, and eb->start
206+
* must be aligned to page boundary.
207+
*/
208+
paddrs[i] = page_to_phys(&folio->page) + offset_in_page(eb->start);
204209
}
205210

211+
ret = btrfs_repair_io_failure(fs_info, 0, eb->start, eb->len, eb->start,
212+
paddrs, step, mirror_num);
206213
return ret;
207214
}
208215

0 commit comments

Comments
 (0)