Skip to content

Commit ee13d1e

Browse files
committed
yugabyte#4929: Cleanup split tablets
Summary: Implemented cleanup of the tablet for which all replicas have been split: - Added DeleteTablet master functionality and RPC support. - Renamed `TEST_GetAllAppliedOpId` -> `GetAllAppliedOpId` to be used in production code. - `TSTabletManager`: added tablets cleaner background task which is responsible for deleting of split tablets that are no longer needed. - Added `TabletSplitITest.ParentTabletCleanup`. Test Plan: ``` ybd --remote --dltp --cxx-test tablet-split-itest -n 100 -- -p 1 ``` Reviewers: hector, nicolas, bogdan Reviewed By: bogdan Subscribers: zyu, ybase, bogdan Differential Revision: https://phabricator.dev.yugabyte.com/D9456
1 parent e591a96 commit ee13d1e

30 files changed

+646
-209
lines changed

ent/src/yb/master/catalog_manager-test_ent.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ void SetupClusterConfigEnt(const vector<string>& az_list,
5050
const vector<string>& affinitized_leader_list,
5151
ReplicationInfoPB* replication_info) {
5252
PlacementInfoPB* placement_info = replication_info->mutable_live_replicas();
53-
placement_info->set_num_replicas(kNumReplicas);
53+
placement_info->set_num_replicas(kDefaultNumReplicas);
5454

5555
for (const string& az : az_list) {
5656
auto pb = placement_info->add_placement_blocks();

src/yb/client/client-internal.cc

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1562,6 +1562,45 @@ void GetCDCStreamRpc::ProcessResponse(const Status& status) {
15621562
user_cb_(status);
15631563
}
15641564

1565+
class DeleteTabletRpc
1566+
: public ClientMasterRpc<master::DeleteTabletRequestPB, master::DeleteTabletResponsePB> {
1567+
public:
1568+
DeleteTabletRpc(
1569+
YBClient* client,
1570+
const TabletId& tablet_id,
1571+
StdStatusCallback user_cb,
1572+
CoarseTimePoint deadline,
1573+
rpc::Messenger* messenger,
1574+
rpc::ProxyCache* proxy_cache)
1575+
: ClientMasterRpc(client, deadline, messenger, proxy_cache),
1576+
user_cb_(std::move(user_cb)) {
1577+
req_.set_tablet_id(tablet_id);
1578+
}
1579+
1580+
std::string ToString() const override {
1581+
return Format(
1582+
"DeleteTabletRpc(tablet_id: $0, num_attempts: $1)", req_.tablet_id(), num_attempts());
1583+
}
1584+
1585+
virtual ~DeleteTabletRpc() = default;
1586+
1587+
private:
1588+
void CallRemoteMethod() override {
1589+
master_proxy()->DeleteTabletAsync(
1590+
req_, &resp_, mutable_retrier()->mutable_controller(),
1591+
std::bind(&DeleteTabletRpc::Finished, this, Status::OK()));
1592+
}
1593+
1594+
void ProcessResponse(const Status& status) override {
1595+
if (!status.ok()) {
1596+
LOG(WARNING) << ToString() << " failed: " << status.ToString();
1597+
}
1598+
user_cb_(status);
1599+
}
1600+
1601+
StdStatusCallback user_cb_;
1602+
};
1603+
15651604
} // namespace internal
15661605

15671606
Status YBClient::Data::GetTableSchema(YBClient* client,
@@ -1715,6 +1754,18 @@ void YBClient::Data::GetCDCStream(
17151754
proxy_cache_.get());
17161755
}
17171756

1757+
void YBClient::Data::DeleteTablet(
1758+
YBClient* client, const TabletId& tablet_id, CoarseTimePoint deadline,
1759+
StdStatusCallback callback) {
1760+
auto rpc = rpc::StartRpc<internal::DeleteTabletRpc>(
1761+
client,
1762+
tablet_id,
1763+
callback,
1764+
deadline,
1765+
messenger_,
1766+
proxy_cache_.get());
1767+
}
1768+
17181769
void YBClient::Data::LeaderMasterDetermined(const Status& status,
17191770
const HostPort& host_port) {
17201771
Status new_status = status;

src/yb/client/client-internal.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,10 @@ class YBClient::Data {
246246
CoarseTimePoint deadline,
247247
StdStatusCallback callback);
248248

249+
void DeleteTablet(
250+
YBClient* client, const TabletId& tablet_id, CoarseTimePoint deadline,
251+
StdStatusCallback callback);
252+
249253
CHECKED_STATUS InitLocalHostNames();
250254

251255
bool IsLocalHostPort(const HostPort& hp) const;

src/yb/client/client.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,8 @@ using yb::master::DeleteUDTypeRequestPB;
131131
using yb::master::DeleteUDTypeResponsePB;
132132
using yb::master::DeleteRoleRequestPB;
133133
using yb::master::DeleteRoleResponsePB;
134+
using yb::master::DeleteTabletRequestPB;
135+
using yb::master::DeleteTabletResponsePB;
134136
using yb::master::GetPermissionsRequestPB;
135137
using yb::master::GetPermissionsResponsePB;
136138
using yb::master::GrantRevokeRoleRequestPB;
@@ -1354,6 +1356,11 @@ void YBClient::DeleteCDCStream(const CDCStreamId& stream_id, StatusCallback call
13541356
data_->DeleteCDCStream(this, stream_id, deadline, callback);
13551357
}
13561358

1359+
void YBClient::DeleteTablet(const TabletId& tablet_id, StdStatusCallback callback) {
1360+
auto deadline = CoarseMonoClock::Now() + default_admin_operation_timeout();
1361+
data_->DeleteTablet(this, tablet_id, deadline, callback);
1362+
}
1363+
13571364
Status YBClient::TabletServerCount(int *tserver_count, bool primary_only, bool use_cache) {
13581365
int tserver_count_cached = data_->tserver_count_cached_.load(std::memory_order_acquire);
13591366
if (use_cache && tserver_count_cached > 0) {

src/yb/client/client.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -521,6 +521,8 @@ class YBClient {
521521
std::shared_ptr<std::unordered_map<std::string, std::string>> options,
522522
StdStatusCallback callback);
523523

524+
void DeleteTablet(const TabletId& tablet_id, StdStatusCallback callback);
525+
524526
// Find the number of tservers. This function should not be called frequently for reading or
525527
// writing actual data. Currently, it is called only for SQL DDL statements.
526528
// If primary_only is set to true, we expect the primary/sync cluster tserver count only.

src/yb/client/ql-tablet-test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1239,7 +1239,7 @@ std::vector<OpId> GetLastAppliedOpIds(const std::vector<tablet::TabletPeerPtr>&
12391239
Result<OpId> GetAllAppliedOpId(const std::vector<tablet::TabletPeerPtr>& peers) {
12401240
for (auto& peer : peers) {
12411241
if (peer->LeaderStatus() == consensus::LeaderStatus::LEADER_AND_READY) {
1242-
return peer->raft_consensus()->TEST_GetAllAppliedOpId();
1242+
return peer->raft_consensus()->GetAllAppliedOpId();
12431243
}
12441244
}
12451245
return STATUS(NotFound, "No leader found");

src/yb/consensus/consensus_queue-test.cc

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -363,7 +363,7 @@ TEST_F(ConsensusQueueTest, TestPeersDontAckBeyondWatermarks) {
363363
// replicated watermark to the last op appended to the local log.
364364
ASSERT_EQ(queue_->TEST_GetAllReplicatedIndex(), MakeOpIdForIndex(kNumMessages));
365365
ASSERT_EQ(queue_->TEST_GetLastAppliedOpId(), OpId::Min());
366-
ASSERT_EQ(queue_->TEST_GetAllAppliedOpId(), OpId::Min());
366+
ASSERT_EQ(queue_->GetAllAppliedOpId(), OpId::Min());
367367

368368
// Start to track the peer after the queue has some messages in it
369369
// at a point that is halfway through the current messages in the queue.
@@ -379,7 +379,7 @@ TEST_F(ConsensusQueueTest, TestPeersDontAckBeyondWatermarks) {
379379
ASSERT_EQ(queue_->TEST_GetAllReplicatedIndex(), OpId::Min());
380380
ASSERT_EQ(queue_->TEST_GetMajorityReplicatedOpId(), OpId::Min());
381381
ASSERT_EQ(queue_->TEST_GetLastAppliedOpId(), OpId::Min());
382-
ASSERT_EQ(queue_->TEST_GetAllAppliedOpId(), OpId::Min());
382+
ASSERT_EQ(queue_->GetAllAppliedOpId(), OpId::Min());
383383

384384
ReplicateMsgsHolder refs;
385385
bool needs_remote_bootstrap;
@@ -443,7 +443,7 @@ TEST_F(ConsensusQueueTest, TestQueueAdvancesCommittedIndex) {
443443
queue_->raft_pool_observers_token_->Wait();
444444
ASSERT_EQ(queue_->TEST_GetCommittedIndex(), OpId::Min());
445445
ASSERT_EQ(queue_->TEST_GetLastAppliedOpId(), OpId::Min());
446-
ASSERT_EQ(queue_->TEST_GetAllAppliedOpId(), OpId::Min());
446+
ASSERT_EQ(queue_->GetAllAppliedOpId(), OpId::Min());
447447

448448
// NOTE: We don't need to get operations from the queue. The queue
449449
// only cares about what the peer reported as received, not what was sent.
@@ -462,7 +462,7 @@ TEST_F(ConsensusQueueTest, TestQueueAdvancesCommittedIndex) {
462462
queue_->raft_pool_observers_token_->Wait();
463463
ASSERT_EQ(queue_->TEST_GetCommittedIndex(), OpId::Min());
464464
ASSERT_EQ(queue_->TEST_GetLastAppliedOpId(), OpId::Min());
465-
ASSERT_EQ(queue_->TEST_GetAllAppliedOpId(), OpId::Min());
465+
ASSERT_EQ(queue_->GetAllAppliedOpId(), OpId::Min());
466466

467467
// Ack the first five operations for peer-2
468468
response.set_responder_uuid("peer-2");
@@ -640,7 +640,7 @@ TEST_F(ConsensusQueueTest, TestQueueHandlesOperationOverwriting) {
640640
// 0.0 since we haven't had a successful exchange with the 'remote' peer.
641641
ASSERT_EQ(queue_->TEST_GetAllReplicatedIndex(), OpId::Min());
642642
ASSERT_EQ(queue_->TEST_GetLastAppliedOpId(), committed_op_id);
643-
ASSERT_EQ(queue_->TEST_GetAllAppliedOpId(), OpId::Min());
643+
ASSERT_EQ(queue_->GetAllAppliedOpId(), OpId::Min());
644644

645645
// Test even when a correct peer responds (meaning we actually get to execute
646646
// watermark advancement) we sill have the same all-replicated watermark.
@@ -650,7 +650,7 @@ TEST_F(ConsensusQueueTest, TestQueueHandlesOperationOverwriting) {
650650

651651
ASSERT_EQ(queue_->TEST_GetAllReplicatedIndex(), OpId::Min());
652652
ASSERT_EQ(queue_->TEST_GetLastAppliedOpId(), committed_op_id);
653-
ASSERT_EQ(queue_->TEST_GetAllAppliedOpId(), OpId::Min());
653+
ASSERT_EQ(queue_->GetAllAppliedOpId(), OpId::Min());
654654

655655
// Generate another request for the remote peer, which should include
656656
// all of the ops since the peer's last-known committed index.

src/yb/consensus/consensus_queue.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1228,7 +1228,7 @@ OpId PeerMessageQueue::TEST_GetAllReplicatedIndex() const {
12281228
return queue_state_.all_replicated_op_id;
12291229
}
12301230

1231-
OpId PeerMessageQueue::TEST_GetAllAppliedOpId() const {
1231+
OpId PeerMessageQueue::GetAllAppliedOpId() const {
12321232
LockGuard lock(queue_lock_);
12331233
return queue_state_.all_applied_op_id;
12341234
}

src/yb/consensus/consensus_queue.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,7 @@ class PeerMessageQueue {
303303

304304
OpId TEST_GetCommittedIndex() const;
305305

306-
OpId TEST_GetAllAppliedOpId() const;
306+
OpId GetAllAppliedOpId() const;
307307

308308
// Returns the current majority replicated OpId, for tests.
309309
OpId TEST_GetMajorityReplicatedOpId() const;
@@ -500,7 +500,7 @@ class PeerMessageQueue {
500500
// Updates op id replicated on each node.
501501
void UpdateAllReplicatedOpId(OpId* result) REQUIRES(queue_lock_);
502502

503-
// Updates op id applied on each node.
503+
// Updates op ID applied on each node.
504504
void UpdateAllAppliedOpId(OpId* result) REQUIRES(queue_lock_);
505505

506506
// Updates op id replicated on each non-lagging node.

src/yb/consensus/raft_consensus.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3020,8 +3020,8 @@ yb::OpId RaftConsensus::GetLastAppliedOpId() {
30203020
return state_->GetLastAppliedOpIdUnlocked();
30213021
}
30223022

3023-
yb::OpId RaftConsensus::TEST_GetAllAppliedOpId() {
3024-
return queue_->TEST_GetAllAppliedOpId();
3023+
yb::OpId RaftConsensus::GetAllAppliedOpId() {
3024+
return queue_->GetAllAppliedOpId();
30253025
}
30263026

30273027
yb::OpId RaftConsensus::GetSplitOpId() {

0 commit comments

Comments
 (0)