Skip to content

Commit 5fc9ce1

Browse files
committed
[yugabyte#7126] Fix YbAdminSnapshotScheduleTest.UndeleteIndex
Summary: Fixes issues uncovered by YbAdminSnapshotScheduleTest.UndeleteIndex test. 1) DeleteTableInMemory could be called multiple times in the case of the index table. There is a check that just does noop when the table was already deleted. Adjusted this check to do the same when the table is being hidden. 2) Don't remove the table from names map during delete, when it was previously hidden. Otherwise, it would crash with fatal during cleanup. 3) DeleteTabletListAndSendRequests executes delete on tablet before commiting tablet info changes. As a result tablet could be deleted before and callback called, before info changes in memory. So table would hang in delete state. Because callback would think that tablet is not being deleted. 4) Decreased log flooding when compactions are being enabled in RocksDB. When compactions are being enabled we call SetOptions twice for each RocksDB, and each of them dumps all current options values. So while we have regular and intents DB we have 4 dumps of all rocksdb options. Also added debug logging to `RWCLock::WriteLock()`, when it takes a too long time to acquire this lock, it would log the stack trace of the successful write lock. Test Plan: ybd --gtest_filter YbAdminSnapshotScheduleTest.UndeleteIndex -n 20 Reviewers: bogdan Reviewed By: bogdan Subscribers: amitanand, ybase Differential Revision: https://phabricator.dev.yugabyte.com/D11614
1 parent f6b271e commit 5fc9ce1

33 files changed

+432
-248
lines changed

ent/src/yb/master/async_snapshot_tasks.cc

Lines changed: 32 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,15 @@ using tserver::TabletServerErrorPB;
3737
// AsyncTabletSnapshotOp
3838
////////////////////////////////////////////////////////////
3939

40+
namespace {
41+
42+
std::string SnapshotIdToString(const std::string& snapshot_id) {
43+
auto uuid = TryFullyDecodeTxnSnapshotId(snapshot_id);
44+
return uuid.IsNil() ? snapshot_id : uuid.ToString();
45+
}
46+
47+
}
48+
4049
AsyncTabletSnapshotOp::AsyncTabletSnapshotOp(Master *master,
4150
ThreadPool* callback_pool,
4251
const scoped_refptr<TabletInfo>& tablet,
@@ -52,8 +61,9 @@ AsyncTabletSnapshotOp::AsyncTabletSnapshotOp(Master *master,
5261
}
5362

5463
string AsyncTabletSnapshotOp::description() const {
55-
return Format("$0 Tablet Snapshot Operation $1 RPC",
56-
*tablet_, tserver::TabletSnapshotOpRequestPB::Operation_Name(operation_));
64+
return Format("$0 Tablet Snapshot Operation $1 RPC $2",
65+
*tablet_, tserver::TabletSnapshotOpRequestPB::Operation_Name(operation_),
66+
SnapshotIdToString(snapshot_id_));
5767
}
5868

5969
TabletId AsyncTabletSnapshotOp::tablet_id() const {
@@ -64,43 +74,36 @@ TabletServerId AsyncTabletSnapshotOp::permanent_uuid() const {
6474
return target_ts_desc_ != nullptr ? target_ts_desc_->permanent_uuid() : "";
6575
}
6676

77+
bool AsyncTabletSnapshotOp::RetryAllowed(TabletServerErrorPB::Code code, const Status& status) {
78+
switch (code) {
79+
case TabletServerErrorPB::TABLET_NOT_FOUND:
80+
return false;
81+
case TabletServerErrorPB::INVALID_SNAPSHOT:
82+
return operation_ != tserver::TabletSnapshotOpRequestPB::RESTORE_ON_TABLET;
83+
default:
84+
return TransactionError(status) != TransactionErrorCode::kSnapshotTooOld;
85+
}
86+
}
87+
6788
void AsyncTabletSnapshotOp::HandleResponse(int attempt) {
6889
server::UpdateClock(resp_, master_->clock());
6990

7091
if (resp_.has_error()) {
7192
Status status = StatusFromPB(resp_.error().status());
7293

73-
// Do not retry on a fatal error.
74-
switch (resp_.error().code()) {
75-
case TabletServerErrorPB::TABLET_NOT_FOUND:
76-
LOG(WARNING) << "TS " << permanent_uuid() << ": snapshot failed for tablet "
77-
<< tablet_->ToString() << " no further retry: " << status;
78-
TransitionToCompleteState();
79-
break;
80-
case TabletServerErrorPB::INVALID_SNAPSHOT:
81-
LOG(WARNING) << "TS " << permanent_uuid() << ": snapshot failed for tablet "
82-
<< tablet_->ToString() << ": " << status;
83-
if (operation_ == tserver::TabletSnapshotOpRequestPB::RESTORE_ON_TABLET) {
84-
LOG(WARNING) << "No further retry for RESTORE snapshot operation: " << status;
85-
TransitionToCompleteState();
86-
}
87-
break;
88-
default:
89-
LOG(WARNING) << "TS " << permanent_uuid() << ": snapshot failed for tablet "
90-
<< tablet_->ToString() << ": " << status;
91-
if (TransactionError(status) == TransactionErrorCode::kSnapshotTooOld) {
92-
TransitionToCompleteState();
93-
}
94-
break;
94+
if (!RetryAllowed(resp_.error().code(), status)) {
95+
LOG_WITH_PREFIX(WARNING) << "Failed, NO retry: " << status;
96+
TransitionToCompleteState();
97+
} else {
98+
LOG_WITH_PREFIX(WARNING) << "Failed, will be retried: " << status;
9599
}
96100
} else {
97101
TransitionToCompleteState();
98-
VLOG(1) << "TS " << permanent_uuid() << ": snapshot complete on tablet "
99-
<< tablet_->ToString();
102+
VLOG_WITH_PREFIX(1) << "Complete";
100103
}
101104

102105
if (state() != MonitoredTaskState::kComplete) {
103-
VLOG(1) << "TabletSnapshotOp task is not completed";
106+
VLOG_WITH_PREFIX(1) << "TabletSnapshotOp task is not completed";
104107
return;
105108
}
106109

@@ -162,9 +165,8 @@ bool AsyncTabletSnapshotOp::SendRequest(int attempt) {
162165
req.set_propagated_hybrid_time(master_->clock()->Now().ToUint64());
163166

164167
ts_backup_proxy_->TabletSnapshotOpAsync(req, &resp_, &rpc_, BindRpcCallback());
165-
VLOG(1) << "Send tablet snapshot request " << operation_ << " to " << permanent_uuid()
166-
<< " (attempt " << attempt << "):\n"
167-
<< req.DebugString();
168+
VLOG_WITH_PREFIX(1) << "Sent to " << permanent_uuid() << " (attempt " << attempt << "): "
169+
<< (VLOG_IS_ON(4) ? req.ShortDebugString() : "");
168170
return true;
169171
}
170172

ent/src/yb/master/async_snapshot_tasks.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,9 @@ class AsyncTabletSnapshotOp : public enterprise::RetryingTSRpcTask {
6161
void HandleResponse(int attempt) override;
6262
bool SendRequest(int attempt) override;
6363
void Finished(const Status& status) override;
64+
bool RetryAllowed(tserver::TabletServerErrorPB::Code code, const Status& status);
6465

65-
scoped_refptr<TabletInfo> tablet_;
66+
TabletInfoPtr tablet_;
6667
const std::string snapshot_id_;
6768
tserver::TabletSnapshotOpRequestPB::Operation operation_;
6869
SnapshotScheduleId snapshot_schedule_id_ = SnapshotScheduleId::Nil();

ent/src/yb/master/catalog_manager_ent.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1432,6 +1432,11 @@ Status CatalogManager::RestoreSysCatalog(SnapshotScheduleRestoration* restoratio
14321432
// Load objects to restore and determine obsolete objects.
14331433
RestoreSysCatalogState state(restoration);
14341434
RETURN_NOT_OK(state.LoadObjects(schema(), doc_db));
1435+
{
1436+
SharedLock lock(mutex_);
1437+
RETURN_NOT_OK(state.PatchVersions(*table_ids_map_));
1438+
}
1439+
RETURN_NOT_OK(state.DetermineEntries());
14351440
{
14361441
auto existing = VERIFY_RESULT(CollectEntriesForSnapshot(restoration->filter.tables().tables()));
14371442
RETURN_NOT_OK(state.DetermineObsoleteObjects(existing));

ent/src/yb/master/restore_sys_catalog_state.cc

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,20 @@ Status RestoreSysCatalogState::LoadObjects(const Schema& schema, const docdb::Do
186186
RETURN_NOT_OK(IterateSysCatalog(schema, doc_db, &namespaces_));
187187
RETURN_NOT_OK(IterateSysCatalog(schema, doc_db, &tables_));
188188
RETURN_NOT_OK(IterateSysCatalog(schema, doc_db, &tablets_));
189-
return DetermineEntries();
189+
return Status::OK();
190+
}
191+
192+
Status RestoreSysCatalogState::PatchVersions(const TableInfoMap& tables) {
193+
for (auto& id_and_pb : tables_) {
194+
auto it = tables.find(id_and_pb.first);
195+
if (it == tables.end()) {
196+
return STATUS_FORMAT(NotFound, "Not found restoring table: $0", id_and_pb.first);
197+
}
198+
199+
// Force schema update after restoration.
200+
id_and_pb.second.set_version(it->second->LockForRead()->pb.version() + 1);
201+
}
202+
return Status::OK();
190203
}
191204

192205
Status RestoreSysCatalogState::DetermineObsoleteObjects(const SysRowEntries& existing) {

ent/src/yb/master/restore_sys_catalog_state.h

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,16 +35,28 @@ class RestoreSysCatalogState {
3535
public:
3636
explicit RestoreSysCatalogState(SnapshotScheduleRestoration* restoration);
3737

38+
// Load objects from DB snapshot.
3839
CHECKED_STATUS LoadObjects(const Schema& schema, const docdb::DocDB& doc_db);
3940

41+
// Patch table versions, so restored tables will have greater schema version to force schema
42+
// update.
43+
CHECKED_STATUS PatchVersions(const TableInfoMap& tables);
44+
45+
// Determine entries that should be restored. I.e. apply filter and serialize.
46+
CHECKED_STATUS DetermineEntries();
47+
48+
// Determine objects that should be removed, i.e. was created after restoration time.
4049
CHECKED_STATUS DetermineObsoleteObjects(const SysRowEntries& existing);
4150

51+
// Prepare write batch with object changes.
4252
CHECKED_STATUS PrepareWriteBatch(const Schema& schema, docdb::DocWriteBatch* write_batch);
4353

54+
// Prepare write batch to delete obsolete tablet.
4455
CHECKED_STATUS PrepareTabletCleanup(
4556
const TabletId& id, SysTabletsEntryPB pb, const Schema& schema,
4657
docdb::DocWriteBatch* write_batch);
4758

59+
// Prepare write batch to delete obsolete table.
4860
CHECKED_STATUS PrepareTableCleanup(
4961
const TableId& id, SysTablesEntryPB pb, const Schema& schema,
5062
docdb::DocWriteBatch* write_batch);
@@ -62,8 +74,6 @@ class RestoreSysCatalogState {
6274
CHECKED_STATUS IterateSysCatalog(
6375
const Schema& schema, const docdb::DocDB& doc_db, std::unordered_map<std::string, PB>* map);
6476

65-
CHECKED_STATUS DetermineEntries();
66-
6777
Result<bool> MatchTable(const TableId& id, const SysTablesEntryPB& table);
6878
Result<bool> TableMatchesIdentifier(
6979
const TableId& id, const SysTablesEntryPB& table, const TableIdentifierPB& table_identifier);

ent/src/yb/tools/yb-admin_client_ent.cc

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,8 @@ Result<TxnSnapshotId> ClusterAdminClient::SuitableSnapshotId(
419419
req.set_snapshot_schedule_id(schedule_id.data(), schedule_id.size());
420420
}
421421

422-
RETURN_NOT_OK(master_backup_proxy_->ListSnapshotSchedules(req, &resp, &rpc));
422+
RETURN_NOT_OK_PREPEND(master_backup_proxy_->ListSnapshotSchedules(req, &resp, &rpc),
423+
"Failed to list snapshot schedules");
423424

424425
if (resp.has_error()) {
425426
return StatusFromPB(resp.error().status());
@@ -446,7 +447,8 @@ Result<TxnSnapshotId> ClusterAdminClient::SuitableSnapshotId(
446447
master::CreateSnapshotRequestPB req;
447448
master::CreateSnapshotResponsePB resp;
448449
req.set_schedule_id(schedule_id.data(), schedule_id.size());
449-
RETURN_NOT_OK(master_backup_proxy_->CreateSnapshot(req, &resp, &rpc));
450+
RETURN_NOT_OK_PREPEND(master_backup_proxy_->CreateSnapshot(req, &resp, &rpc),
451+
"Failed to create snapshot");
450452
if (resp.has_error()) {
451453
auto status = StatusFromPB(resp.error().status());
452454
if (master::MasterError(status) == master::MasterErrorPB::PARALLEL_SNAPSHOT_OPERATION) {
@@ -471,7 +473,8 @@ Result<rapidjson::Document> ClusterAdminClient::RestoreSnapshotSchedule(
471473
master::ListSnapshotsRequestPB req;
472474
req.set_snapshot_id(snapshot_id.data(), snapshot_id.size());
473475
master::ListSnapshotsResponsePB resp;
474-
RETURN_NOT_OK(master_backup_proxy_->ListSnapshots(req, &resp, &rpc));
476+
RETURN_NOT_OK_PREPEND(master_backup_proxy_->ListSnapshots(req, &resp, &rpc),
477+
"Failed to list snapshots");
475478
if (resp.has_error()) {
476479
return StatusFromPB(resp.error().status());
477480
}
@@ -500,7 +503,8 @@ Result<rapidjson::Document> ClusterAdminClient::RestoreSnapshotSchedule(
500503
RestoreSnapshotResponsePB resp;
501504
req.set_snapshot_id(snapshot_id.data(), snapshot_id.size());
502505
req.set_restore_ht(restore_at.ToUint64());
503-
RETURN_NOT_OK(master_backup_proxy_->RestoreSnapshot(req, &resp, &rpc));
506+
RETURN_NOT_OK_PREPEND(master_backup_proxy_->RestoreSnapshot(req, &resp, &rpc),
507+
"Failed to restore snapshot");
504508

505509
if (resp.has_error()) {
506510
return StatusFromPB(resp.error().status());

ent/src/yb/tserver/backup_service.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,8 @@ void TabletServiceBackupImpl::TabletSnapshotOp(const TabletSnapshotOpRequestPB*
5959
TRACE_EVENT1("tserver", "TabletSnapshotOp", "tablet_id: ", tablet_id);
6060

6161
LOG(INFO) << "Processing TabletSnapshotOp for tablet " << tablet_id << " from "
62-
<< context.requestor_string() << ": " << req->operation();
62+
<< context.requestor_string() << ": "
63+
<< TabletSnapshotOpRequestPB::Operation_Name(req->operation());
6364
VLOG(1) << "Full request: " << req->DebugString();
6465

6566
auto tablet = LookupLeaderTabletOrRespond(tablet_manager_, tablet_id, resp, &context);

src/yb/common/entity_ids.h

Lines changed: 2 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -18,41 +18,12 @@
1818
#include <set>
1919
#include <utility>
2020

21+
#include "yb/common/entity_ids_types.h"
22+
2123
#include "yb/util/result.h"
22-
#include "yb/util/strongly_typed_string.h"
2324

2425
namespace yb {
2526

26-
// TODO: switch many of these to opaque types for additional type safety and efficiency.
27-
28-
using NamespaceName = std::string;
29-
using TableName = std::string;
30-
using UDTypeName = std::string;
31-
using RoleName = std::string;
32-
33-
using NamespaceId = std::string;
34-
using TableId = std::string;
35-
using UDTypeId = std::string;
36-
using CDCStreamId = std::string;
37-
38-
using PeerId = std::string;
39-
using SnapshotId = std::string;
40-
using TabletServerId = PeerId;
41-
using TabletId = std::string;
42-
using TablegroupId = std::string;
43-
using TablespaceId = std::string;
44-
45-
YB_STRONGLY_TYPED_STRING(KvStoreId);
46-
47-
// TODO(#79): switch to YB_STRONGLY_TYPED_STRING
48-
using RaftGroupId = std::string;
49-
50-
using NamespaceIdTableNamePair = std::pair<NamespaceId, TableName>;
51-
52-
using FlushRequestId = std::string;
53-
54-
using RedisConfigKey = std::string;
55-
5627
static const uint32_t kPgSequencesDataTableOid = 0xFFFF;
5728
static const uint32_t kPgSequencesDataDatabaseOid = 0xFFFF;
5829

src/yb/common/entity_ids_types.h

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
// Copyright (c) YugaByte, Inc.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
4+
// in compliance with the License. You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software distributed under the License
9+
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
10+
// or implied. See the License for the specific language governing permissions and limitations
11+
// under the License.
12+
//
13+
14+
#ifndef YB_COMMON_ENTITY_IDS_TYPES_H
15+
#define YB_COMMON_ENTITY_IDS_TYPES_H
16+
17+
#include <string>
18+
19+
#include "yb/util/strongly_typed_string.h"
20+
21+
namespace yb {
22+
23+
// TODO: switch many of these to opaque types for additional type safety and efficiency.
24+
using NamespaceName = std::string;
25+
using TableName = std::string;
26+
using UDTypeName = std::string;
27+
using RoleName = std::string;
28+
29+
using NamespaceId = std::string;
30+
using TableId = std::string;
31+
using UDTypeId = std::string;
32+
using CDCStreamId = std::string;
33+
34+
using PeerId = std::string;
35+
using SnapshotId = std::string;
36+
using TabletServerId = PeerId;
37+
using TabletId = std::string;
38+
using TablegroupId = std::string;
39+
using TablespaceId = std::string;
40+
41+
YB_STRONGLY_TYPED_STRING(KvStoreId);
42+
43+
// TODO(#79): switch to YB_STRONGLY_TYPED_STRING
44+
using RaftGroupId = std::string;
45+
46+
using NamespaceIdTableNamePair = std::pair<NamespaceId, TableName>;
47+
48+
using FlushRequestId = std::string;
49+
50+
using RedisConfigKey = std::string;
51+
52+
} // namespace yb
53+
54+
#endif // YB_COMMON_ENTITY_IDS_TYPES_H

0 commit comments

Comments
 (0)