
Commit 98dad9c

adam900710 authored and kdave committed
btrfs: reduce extent map lookup during writes
With large data folio support, even on x86_64 we can hit a folio that contains several fs blocks. In that case, we still need to call btrfs_get_extent() for each block, as our submission path still iterates each fs block and submits them one by one. This reduces the benefit of large folios.

Change the behavior to submit the whole range when possible. This is done by:

- Use for_each_set_bitrange() instead of for_each_set_bit()
  Now we can get a contiguous range to submit instead of a single fs block.

- Handle blocks beyond EOF in one go
  This is pretty much the same as the old behavior, but for a range crossing i_size, we finish the range beyond i_size first, then submit the remaining part.

- Submit the contiguous range in one go
  Although we still need to consider the extent map boundary.

- Remove submit_one_sector()
  As it is no longer utilized.

Reviewed-by: Boris Burkov <boris@bur.io>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
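A minimal userspace sketch (not the kernel code) of the bitmap-walk change the message describes: contiguous runs of set bits are grouped and handled once per run, the way for_each_set_bitrange() from <linux/find.h> hands back [start_bit, end_bit) ranges, instead of acting on every set bit individually. The block size, the bitmap value and the printf() standing in for submission are illustrative assumptions.

/*
 * Illustration only: the kernel uses for_each_set_bit()/for_each_set_bitrange();
 * here the grouping of contiguous set bits is open-coded so it builds in userspace.
 */
#include <stdio.h>

#define BLOCK_SIZE 4096u        /* assumed fs block size */
#define BLOCKS     16u          /* assumed blocks per (large) folio */

int main(void)
{
        /* Dirty-block bitmap of one folio: bits 2-5 and 9-10 need writeback. */
        unsigned long bitmap = 0x63cUL;
        unsigned int bit = 0;

        while (bit < BLOCKS) {
                unsigned int start, end;

                /* Find the next set bit (start of a dirty range). */
                while (bit < BLOCKS && !(bitmap & (1UL << bit)))
                        bit++;
                if (bit >= BLOCKS)
                        break;
                start = bit;

                /* Extend to the end of the contiguous run of set bits. */
                while (bit < BLOCKS && (bitmap & (1UL << bit)))
                        bit++;
                end = bit;

                /*
                 * One "submission" per contiguous range, instead of one per
                 * block as a plain per-bit loop would do.
                 */
                printf("submit range: offset %u, len %u (%u blocks)\n",
                       start * BLOCK_SIZE, (end - start) * BLOCK_SIZE,
                       end - start);
        }
        return 0;
}

With the bitmap above this prints two submissions (4 blocks, then 2 blocks) rather than six per-block ones, which is the lookup reduction the commit is after.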
1 parent ac35d1e · commit 98dad9c

1 file changed: +96 −84 lines changed

fs/btrfs/extent_io.c

Lines changed: 96 additions & 84 deletions
@@ -1602,91 +1602,94 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
         return 0;
 }
 
+
 /*
- * Return 0 if we have submitted or queued the sector for submission.
- * Return <0 for critical errors, and the involved sector will be cleaned up.
+ * Return 0 if we have submitted or queued the range for submission.
+ * Return <0 for critical errors, and the involved blocks will be cleaned up.
  *
- * Caller should make sure filepos < i_size and handle filepos >= i_size case.
+ * Caller should make sure the range doesn't go beyond the last block of the inode.
  */
-static int submit_one_sector(struct btrfs_inode *inode,
-                             struct folio *folio,
-                             u64 filepos, struct btrfs_bio_ctrl *bio_ctrl,
-                             loff_t i_size)
+static int submit_range(struct btrfs_inode *inode, struct folio *folio,
+                        u64 start, u32 len, struct btrfs_bio_ctrl *bio_ctrl)
 {
         struct btrfs_fs_info *fs_info = inode->root->fs_info;
-        struct extent_map *em;
-        u64 block_start;
-        u64 disk_bytenr;
-        u64 extent_offset;
-        u64 em_end;
         const u32 sectorsize = fs_info->sectorsize;
+        u64 cur = start;
 
-        ASSERT(IS_ALIGNED(filepos, sectorsize));
-
-        /* @filepos >= i_size case should be handled by the caller. */
-        ASSERT(filepos < i_size);
+        ASSERT(IS_ALIGNED(start, sectorsize));
+        ASSERT(IS_ALIGNED(len, sectorsize));
+        ASSERT(start + len <= folio_end(folio));
 
-        em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
-        if (IS_ERR(em)) {
-                /*
-                 * bio_ctrl may contain a bio crossing several folios.
-                 * Submit it immediately so that the bio has a chance
-                 * to finish normally, other than marked as error.
-                 */
-                submit_one_bio(bio_ctrl);
+        while (cur < start + len) {
+                struct extent_map *em;
+                u64 block_start;
+                u64 disk_bytenr;
+                u64 extent_offset;
+                u64 em_end;
+                u32 cur_len = start + len - cur;
 
-                /*
-                 * When submission failed, we should still clear the folio dirty.
-                 * Or the folio will be written back again but without any
-                 * ordered extent.
-                 */
-                btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
-                btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
-                btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize);
+                em = btrfs_get_extent(inode, NULL, cur, cur_len);
+                if (IS_ERR(em)) {
+                        /*
+                         * bio_ctrl may contain a bio crossing several folios.
+                         * Submit it immediately so that the bio has a chance
+                         * to finish normally, other than marked as error.
+                         */
+                        submit_one_bio(bio_ctrl);
 
-                /*
-                 * Since there is no bio submitted to finish the ordered
-                 * extent, we have to manually finish this sector.
-                 */
-                btrfs_mark_ordered_io_finished(inode, folio, filepos,
-                                               fs_info->sectorsize, false);
-                return PTR_ERR(em);
-        }
+                        /*
+                         * When submission failed, we should still clear the folio dirty.
+                         * Or the folio will be written back again but without any
+                         * ordered extent.
+                         */
+                        btrfs_folio_clear_dirty(fs_info, folio, cur, cur_len);
+                        btrfs_folio_set_writeback(fs_info, folio, cur, cur_len);
+                        btrfs_folio_clear_writeback(fs_info, folio, cur, cur_len);
 
-        extent_offset = filepos - em->start;
-        em_end = btrfs_extent_map_end(em);
-        ASSERT(filepos <= em_end);
-        ASSERT(IS_ALIGNED(em->start, sectorsize));
-        ASSERT(IS_ALIGNED(em->len, sectorsize));
+                        /*
+                         * Since there is no bio submitted to finish the ordered
+                         * extent, we have to manually finish this range.
+                         */
+                        btrfs_mark_ordered_io_finished(inode, folio, cur, cur_len, false);
+                        return PTR_ERR(em);
+                }
+                extent_offset = cur - em->start;
+                em_end = btrfs_extent_map_end(em);
+                ASSERT(cur <= em_end);
+                ASSERT(IS_ALIGNED(em->start, sectorsize));
+                ASSERT(IS_ALIGNED(em->len, sectorsize));
 
-        block_start = btrfs_extent_map_block_start(em);
-        disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset;
+                block_start = btrfs_extent_map_block_start(em);
+                disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset;
 
-        ASSERT(!btrfs_extent_map_is_compressed(em));
-        ASSERT(block_start != EXTENT_MAP_HOLE);
-        ASSERT(block_start != EXTENT_MAP_INLINE);
+                ASSERT(!btrfs_extent_map_is_compressed(em));
+                ASSERT(block_start != EXTENT_MAP_HOLE);
+                ASSERT(block_start != EXTENT_MAP_INLINE);
 
-        btrfs_free_extent_map(em);
-        em = NULL;
+                cur_len = min(cur_len, em_end - cur);
+                btrfs_free_extent_map(em);
+                em = NULL;
 
-        /*
-         * Although the PageDirty bit is cleared before entering this
-         * function, subpage dirty bit is not cleared.
-         * So clear subpage dirty bit here so next time we won't submit
-         * a folio for a range already written to disk.
-         */
-        btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
-        btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
-        /*
-         * Above call should set the whole folio with writeback flag, even
-         * just for a single subpage sector.
-         * As long as the folio is properly locked and the range is correct,
-         * we should always get the folio with writeback flag.
-         */
-        ASSERT(folio_test_writeback(folio));
+                /*
+                 * Although the PageDirty bit is cleared before entering this
+                 * function, subpage dirty bit is not cleared.
+                 * So clear subpage dirty bit here so next time we won't submit
+                 * a folio for a range already written to disk.
+                 */
+                btrfs_folio_clear_dirty(fs_info, folio, cur, cur_len);
+                btrfs_folio_set_writeback(fs_info, folio, cur, cur_len);
+                /*
+                 * Above call should set the whole folio with writeback flag, even
+                 * just for a single subpage block.
+                 * As long as the folio is properly locked and the range is correct,
+                 * we should always get the folio with writeback flag.
+                 */
+                ASSERT(folio_test_writeback(folio));
 
-        submit_extent_folio(bio_ctrl, disk_bytenr, folio,
-                            sectorsize, filepos - folio_pos(folio), 0);
+                submit_extent_folio(bio_ctrl, disk_bytenr, folio,
+                                    cur_len, cur - folio_pos(folio), 0);
+                cur += cur_len;
+        }
         return 0;
 }
 
@@ -1712,8 +1715,9 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
         const u64 folio_start = folio_pos(folio);
         const u64 folio_end = folio_start + folio_size(folio);
         const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio);
-        u64 cur;
-        int bit;
+        unsigned int start_bit;
+        unsigned int end_bit;
+        const u64 rounded_isize = round_up(i_size, fs_info->sectorsize);
         int ret = 0;
 
         ASSERT(start >= folio_start, "start=%llu folio_start=%llu", start, folio_start);
@@ -1741,23 +1745,31 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
 
         bio_ctrl->end_io_func = end_bbio_data_write;
 
-        for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) {
-                cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits);
+        for_each_set_bitrange(start_bit, end_bit, &bio_ctrl->submit_bitmap, blocks_per_folio) {
+                const u64 cur_start = folio_pos(folio) + (start_bit << fs_info->sectorsize_bits);
+                u32 cur_len = (end_bit - start_bit) << fs_info->sectorsize_bits;
 
-                if (cur >= i_size) {
-                        btrfs_mark_ordered_io_truncated(inode, folio, cur, end - cur);
+                if (cur_start > rounded_isize) {
                         /*
-                         * This range is beyond i_size, thus we don't need to
-                         * bother writing back.
-                         * But we still need to clear the dirty subpage bit, or
-                         * the next time the folio gets dirtied, we will try to
-                         * writeback the sectors with subpage dirty bits,
-                         * causing writeback without ordered extent.
+                         * The whole range is beyond EOF.
+                         *
+                         * Just finish the IO and skip to the next range.
                          */
-                        btrfs_folio_clear_dirty(fs_info, folio, cur, end - cur);
-                        break;
+                        btrfs_mark_ordered_io_truncated(inode, folio, cur_start, cur_len);
+                        btrfs_folio_clear_dirty(fs_info, folio, cur_start, cur_len);
+                        continue;
                 }
-                ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size);
+                if (cur_start + cur_len > rounded_isize) {
+                        u32 truncate_len = cur_start + cur_len - rounded_isize;
+
+                        /* The tailing part of the range is beyond EOF. */
+                        btrfs_mark_ordered_io_truncated(inode, folio, rounded_isize, truncate_len);
+                        btrfs_folio_clear_dirty(fs_info, folio, rounded_isize, truncate_len);
+                        /* Shrink the range inside the EOF. */
+                        cur_len = rounded_isize - cur_start;
+                }
+
+                ret = submit_range(inode, folio, cur_start, cur_len, bio_ctrl);
                 if (unlikely(ret < 0)) {
                         if (!found_error)
                                 found_error = ret;
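For reference, a standalone sketch of the EOF handling arithmetic in extent_writepage_io() above, under assumed numbers: a range that starts past the block-rounded i_size is only finished, while a range crossing it has its tail finished separately and the remaining length shrunk before submission. The round_up_u64() helper and all values are illustrative, not kernel code.

/* Userspace illustration of the rounded_isize clamping in the hunk above. */
#include <stdint.h>
#include <stdio.h>

#define SECTORSIZE 4096ULL      /* assumed fs block size */

/* Same rounding the kernel's round_up() performs for power-of-two alignments. */
static uint64_t round_up_u64(uint64_t val, uint64_t align)
{
        return (val + align - 1) & ~(align - 1);
}

int main(void)
{
        const uint64_t i_size = 10000;                                  /* example file size */
        const uint64_t rounded_isize = round_up_u64(i_size, SECTORSIZE); /* 12288 */
        uint64_t cur_start = 8192;                      /* range from the bitmap walk */
        uint64_t cur_len = 3 * SECTORSIZE;              /* 8192..20480 crosses EOF */

        if (cur_start > rounded_isize) {
                printf("whole range beyond EOF: finish the IO and skip it\n");
        } else if (cur_start + cur_len > rounded_isize) {
                uint64_t truncate_len = cur_start + cur_len - rounded_isize;

                /* The tail beyond EOF is finished, the rest is submitted. */
                cur_len = rounded_isize - cur_start;
                printf("truncate %llu bytes at %llu, submit [%llu, +%llu)\n",
                       (unsigned long long)truncate_len,
                       (unsigned long long)rounded_isize,
                       (unsigned long long)cur_start,
                       (unsigned long long)cur_len);
        }
        return 0;
}

With these numbers the sketch reports 8192 bytes truncated at offset 12288 and a 4096-byte submission, mirroring how the diff finishes the part beyond i_size first and then submits the remainder.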
