Skip to content

Commit 5d402b5

Browse files
[yugabyte#27697] Docdb: Handle Coredump in master due to stack-overflow during index backfill.
Summary: In the last few weeks we have seen few instances of the stress test (with various nemesis) run into a master crash caused by a stack trace that looks like: ``` * thread #1, name = 'yb-master', stop reason = signal SIGSEGV: invalid address * frame #0: 0x0000aaaad52f5fc4 yb-master`yb::master::BackfillTablet::LaunchNextChunkOrDone() [inlined] std::__1::shared_ptr<yb::master::BackfillTablet>::shared_ptr[abi:ue170006]<yb::master::BackfillTablet, void>(this=<unavailable>, __r=std::__1:: weak_ptr<yb::master::BackfillTablet>::element_type @ 0x000013e4bf787778) at shared_ptr.h:701:20 frame #1: 0x0000aaaad52f5fbc yb-master`yb::master::BackfillTablet::LaunchNextChunkOrDone() [inlined] std::__1::enable_shared_from_this<yb::master::BackfillTablet>::shared_from_this[abi:ue170006](this=0x000013e4bf787778) at shared_ptr.h:1954:17 frame #2: 0x0000aaaad52f5fbc yb-master`yb::master::BackfillTablet::LaunchNextChunkOrDone(this=0x000013e4bf787778) at backfill_index.cc:1300:50 frame #3: 0x0000aaaad52fb0d4 yb-master`yb::master::BackfillTablet::Done(this=0x000013e4bf787778, status=<unavailable>, backfilled_until=<unavailable>, number_rows_processed=<unavailable>, failed_indexes=<unavailable>) at backfill_index.cc:1323: 10 frame #4: 0x0000aaaad52f9dd8 yb-master`yb::master::BackfillChunk::UnregisterAsyncTaskCallback(this=0x000013e4bbd4d458) at backfill_index.cc:1620:5 frame #5: 0x0000aaaad52be9e0 yb-master`yb::master::RetryingRpcTask::UnregisterAsyncTask(this=0x000013e4bbd4d458) at async_rpc_tasks.cc:470:3 frame #6: 0x0000aaaad52bd4d8 yb-master`yb::master::RetryingRpcTask::Run(this=0x000013e4bbd4d458) at async_rpc_tasks.cc:273:5 frame #7: 0x0000aaaad52f63f0 yb-master`yb::master::BackfillTablet::LaunchNextChunkOrDone() [inlined] yb::master::BackfillChunk::Launch(this=0x000013e4bbd4d458) at backfill_index.cc:1463:19 frame #8: 0x0000aaaad52f6324 yb-master`yb::master::BackfillTablet::LaunchNextChunkOrDone(this=<unavailable>) at backfill_index.cc:1303:19 frame #9: 0x0000aaaad52fb0d4 yb-master`yb::master::BackfillTablet::Done(this=0x000013e4bf787778, status=<unavailable>, backfilled_until=<unavailable>, number_rows_processed=<unavailable>, failed_indexes=<unavailable>) at backfill_index.cc:1323: 10 frame #10: 0x0000aaaad52f9dd8 yb-master`yb::master::BackfillChunk::UnregisterAsyncTaskCallback(this=0x000013e4bbd4cd98) at backfill_index.cc:1620:5 frame #11: 0x0000aaaad52be9e0 yb-master`yb::master::RetryingRpcTask::UnregisterAsyncTask(this=0x000013e4bbd4cd98) at async_rpc_tasks.cc:470:3 frame yugabyte#12: 0x0000aaaad52bd4d8 yb-master`yb::master::RetryingRpcTask::Run(this=0x000013e4bbd4cd98) at async_rpc_tasks.cc:273:5 frame yugabyte#13: 0x0000aaaad52f63f0 yb-master`yb::master::BackfillTablet::LaunchNextChunkOrDone() [inlined] yb::master::BackfillChunk::Launch(this=0x000013e4bbd4cd98) at backfill_index.cc:1463:19 frame yugabyte#14: 0x0000aaaad52f6324 yb-master`yb::master::BackfillTablet::LaunchNextChunkOrDone(this=<unavailable>) at backfill_index.cc:1303:19 frame yugabyte#15: 0x0000aaaad52fb0d4 yb-master`yb::master::BackfillTablet::Done(this=0x000013e4bf787778, status=<unavailable>, backfilled_until=<unavailable>, number_rows_processed=<unavailable>, failed_indexes=<unavailable>) at backfill_index.cc: 1323:10 frame yugabyte#16: 0x0000aaaad52f9dd8 yb-master`yb::master::BackfillChunk::UnregisterAsyncTaskCallback(this=0x000013e4bbd4cfd8) at backfill_index.cc:1620:5 frame yugabyte#17: 0x0000aaaad52be9e0 yb-master`yb::master::RetryingRpcTask::UnregisterAsyncTask(this=0x000013e4bbd4cfd8) at async_rpc_tasks.cc:470:3 frame yugabyte#18: 0x0000aaaad52bd4d8 yb-master`yb::master::RetryingRpcTask::Run(this=0x000013e4bbd4cfd8) at async_rpc_tasks.cc:273:5 frame yugabyte#19: 0x0000aaaad52f63f0 yb-master`yb::master::BackfillTablet::LaunchNextChunkOrDone() [inlined] yb::master::BackfillChunk::Launch(this=0x000013e4bbd4cfd8) at backfill_index.cc:1463:19 frame yugabyte#20: 0x0000aaaad52f6324 yb-master`yb::master::BackfillTablet::LaunchNextChunkOrDone(this=<unavailable>) at backfill_index.cc:1303:19 frame yugabyte#21: 0x0000aaaad52fb0d4 yb-master`yb::master::BackfillTablet::Done(this=0x000013e4bf787778, status=<unavailable>, backfilled_until=<unavailable>, number_rows_processed=<unavailable>, failed_indexes=<unavailable>) at backfill_index.cc: 1323:10 ... frame yugabyte#2452: 0x0000aaaad52f9dd8 yb-master`yb::master::BackfillChunk::UnregisterAsyncTaskCallback(this=0x000013e4bdc7ed98) at backfill_index.cc:1620:5 frame yugabyte#2453: 0x0000aaaad52be9e0 yb-master`yb::master::RetryingRpcTask::UnregisterAsyncTask(this=0x000013e4bdc7ed98) at async_rpc_tasks.cc:470:3 frame yugabyte#2454: 0x0000aaaad52bd4d8 yb-master`yb::master::RetryingRpcTask::Run(this=0x000013e4bdc7ed98) at async_rpc_tasks.cc:273:5 frame yugabyte#2455: 0x0000aaaad52f63f0 yb-master`yb::master::BackfillTablet::LaunchNextChunkOrDone() [inlined] yb::master::BackfillChunk::Launch(this=0x000013e4bdc7ed98) at backfill_index.cc:1463:19 frame yugabyte#2456: 0x0000aaaad52f6324 yb-master`yb::master::BackfillTablet::LaunchNextChunkOrDone(this=<unavailable>) at backfill_index.cc:1303:19 frame yugabyte#2457: 0x0000aaaad52fb0d4 yb-master`yb::master::BackfillTablet::Done(this=0x000013e4bf787778, status=<unavailable>, backfilled_until=<unavailable>, number_rows_processed=<unavailable>, failed_indexes=<unavailable>) at backfill_index.cc: 1323:10 frame yugabyte#2458: 0x0000aaaad52f9dd8 yb-master`yb::master::BackfillChunk::UnregisterAsyncTaskCallback(this=0x000013e4ba1ff458) at backfill_index.cc:1620:5 frame yugabyte#2459: 0x0000aaaad52be9e0 yb-master`yb::master::RetryingRpcTask::UnregisterAsyncTask(this=0x000013e4ba1ff458) at async_rpc_tasks.cc:470:3 frame yugabyte#2460: 0x0000aaaad52bd4d8 yb-master`yb::master::RetryingRpcTask::Run(this=0x000013e4ba1ff458) at async_rpc_tasks.cc:273:5 frame yugabyte#2461: 0x0000aaaad52c0260 yb-master`yb::master::RetryingRpcTask::RunDelayedTask(this=0x000013e4ba1ff458, status=0x0000ffffab2668c0) at async_rpc_tasks.cc:432:14 frame yugabyte#2462: 0x0000aaaad5c3f838 yb-master`void ev::base<ev_timer, ev::timer>::method_thunk<yb::rpc::DelayedTask, &yb::rpc::DelayedTask::TimerHandler(ev::timer&, int)>(ev_loop*, ev_timer*, int) [inlined] boost::function1<void, yb::Status const&>::operator()(this=0x000013e4bff63b18, a0=0x0000ffffab2668c0) const at function_template.hpp:763:14 frame yugabyte#2463: 0x0000aaaad5c3f81c yb-master`void ev::base<ev_timer, ev::timer>::method_thunk<yb::rpc::DelayedTask, &yb::rpc::DelayedTask::TimerHandler(ev::timer&, int)>(ev_loop*, ev_timer*, int) [inlined] yb::rpc::DelayedTask:: TimerHandler(this=0x000013e4bff63ae8, watcher=<unavailable>, revents=<unavailable>) at delayed_task.cc:155:5 frame yugabyte#2464: 0x0000aaaad5c3f284 yb-master`void ev::base<ev_timer, ev::timer>::method_thunk<yb::rpc::DelayedTask, &yb::rpc::DelayedTask::TimerHandler(ev::timer&, int)>(loop=<unavailable>, w=<unavailable>, revents=<unavailable>) at ev++.h:479:7 frame yugabyte#2465: 0x0000aaaad4cdf170 yb-master`ev_invoke_pending + 112 frame yugabyte#2466: 0x0000aaaad4ce21fc yb-master`ev_run + 2940 frame yugabyte#2467: 0x0000aaaad5c725fc yb-master`yb::rpc::Reactor::RunThread() [inlined] ev::loop_ref::run(this=0x000013e4bfcfadf8, flags=0) at ev++.h:211:7 frame yugabyte#2468: 0x0000aaaad5c725f4 yb-master`yb::rpc::Reactor::RunThread(this=0x000013e4bfcfadc0) at reactor.cc:735:9 frame yugabyte#2469: 0x0000aaaad65c61d8 yb-master`yb::Thread::SuperviseThread(void*) [inlined] std::__1::__function::__value_func<void ()>::operator()[abi:ue170006](this=0x000013e4bfeffa80) const at function.h:517:16 frame yugabyte#2470: 0x0000aaaad65c61c4 yb-master`yb::Thread::SuperviseThread(void*) [inlined] std::__1::function<void ()>::operator()(this=0x000013e4bfeffa80) const at function.h:1168:12 frame yugabyte#2471: 0x0000aaaad65c61c4 yb-master`yb::Thread::SuperviseThread(arg=0x000013e4bfeffa20) at thread.cc:895:3 ``` Essentially, a BackfillChunk is considered done (without sending out an RPC) and launches the next BackfillChunk; which does the same. This may happen if `BackfillTable::indexes_to_build()` is empty, or if the `backfill_jobs()` is empty. However, based on the code reading we should only get there, ** after ** marking `BackfillTable::done_` as `true`. If for some reason, we have `indexes_to_build()` as `empty` and `BackfillTable::done_ == false`, we could get into this infinite recursion. Since I am unable to explain and recreate how this happens, I'm adding a test flag `TEST_simulate_empty_indexes` to repro this. Fix: We update `BackfillChunk::SendRequest` to handle the empty `indexes_to_build()` as a failure rather than treating this as a success. This prevents the infinite recursion. Also, adding a few log lines that may help better understand the scenario if we run into this again. Jira: DB-17296 Test Plan: yb_build.sh fastdebug --cxx-test pg_index_backfill-test --gtest_filter *.SimulateEmptyIndexesForStackOverflow* Reviewers: zdrudi, rthallam, jason Reviewed By: zdrudi Subscribers: ybase, yql Differential Revision: https://phorge.dev.yugabyte.com/D45031
1 parent e2f360a commit 5d402b5

File tree

2 files changed

+41
-3
lines changed

2 files changed

+41
-3
lines changed

src/yb/master/backfill_index.cc

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,9 @@ DEFINE_test_flag(bool, skip_index_backfill, false,
114114
DEFINE_test_flag(bool, block_do_backfill, false,
115115
"Block DoBackfill from proceeding.");
116116

117+
DEFINE_test_flag(bool, simulate_empty_indexes_during_backfill, false,
118+
"Simulates BackfillTable::indexes_to_build() to return an empty set.");
119+
117120
DEFINE_test_flag(bool, simulate_cannot_enable_compactions, false,
118121
"Skips updating an index table to GC delete markers and sending of the corresponding RPC "
119122
"to the TServer.");
@@ -701,11 +704,17 @@ BackfillTable::BackfillTable(
701704

702705
const std::unordered_set<TableId> BackfillTable::indexes_to_build() const {
703706
std::unordered_set<TableId> indexes_to_build;
707+
if (PREDICT_FALSE(FLAGS_TEST_simulate_empty_indexes_during_backfill)) {
708+
LOG_WITH_PREFIX(WARNING) << "Simulating empty indexes.";
709+
return indexes_to_build;
710+
}
704711
{
705712
auto l = indexed_table_->LockForRead();
706713
const auto& indexed_table_pb = l.data().pb;
707714
if (indexed_table_pb.backfill_jobs_size() == 0) {
708715
// Some other task already marked the backfill job as done.
716+
LOG_WITH_PREFIX(INFO) << "No backfill jobs found for table " << indexed_table_->ToString()
717+
<< ". Cannot determine indexes to build.";
709718
return {};
710719
}
711720
DCHECK(indexed_table_pb.backfill_jobs_size() == 1) << "For now we only expect to have up to 1 "
@@ -715,6 +724,17 @@ const std::unordered_set<TableId> BackfillTable::indexes_to_build() const {
715724
indexes_to_build.insert(kv_pair.first);
716725
}
717726
}
727+
728+
if (indexes_to_build.empty()) {
729+
std::vector<std::string> details;
730+
const auto& backfill_state = indexed_table_pb.backfill_jobs(0).backfill_state();
731+
std::transform(
732+
backfill_state.begin(), backfill_state.end(), std::back_inserter(details),
733+
[](const auto& kv_pair) {
734+
return Substitute("$0: $1", kv_pair.first, BackfillJobPB::State_Name(kv_pair.second));
735+
});
736+
LOG_WITH_PREFIX(WARNING) << "No indexes to build. backfill_state: " << yb::ToString(details);
737+
}
718738
}
719739
return indexes_to_build;
720740
}
@@ -1081,8 +1101,10 @@ Status BackfillTable::UpdateIndexPermissionsForIndexes() {
10811101
for (const auto& kv_pair : indexed_table_pb.backfill_jobs(0).backfill_state()) {
10821102
VLOG(2) << "Reading backfill_state for " << kv_pair.first << " as "
10831103
<< BackfillJobPB_State_Name(kv_pair.second);
1084-
DCHECK_NE(kv_pair.second, BackfillJobPB::IN_PROGRESS)
1085-
<< __func__ << " is expected to be only called after all indexes are done.";
1104+
if (PREDICT_TRUE(!FLAGS_TEST_simulate_empty_indexes_during_backfill)) {
1105+
DCHECK_NE(kv_pair.second, BackfillJobPB::IN_PROGRESS)
1106+
<< __func__ << " is expected to be only called after all indexes are done.";
1107+
}
10861108
const bool success = (kv_pair.second == BackfillJobPB::SUCCESS);
10871109
all_success &= success;
10881110
permissions_to_set.emplace(
@@ -1499,7 +1521,8 @@ std::string BackfillChunk::description() const {
14991521
bool BackfillChunk::SendRequest(int attempt) {
15001522
VLOG(1) << __PRETTY_FUNCTION__;
15011523
if (indexes_being_backfilled_.empty()) {
1502-
TransitionToCompleteState();
1524+
TransitionToFailedState(
1525+
MonitoredTaskState::kRunning, STATUS(IllegalState, "No indexes remaining to backfill."));
15031526
return false;
15041527
}
15051528

src/yb/yql/pgwrapper/pg_index_backfill-test.cc

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1766,6 +1766,21 @@ TEST_P(PgIndexBackfillBlockDoBackfill, IndexScanVisibility) {
17661766
<< " for other sessions to notice that the index became public";
17671767
}
17681768

1769+
TEST_P(PgIndexBackfillTest, SimulateEmptyIndexesForStackOverflow) {
1770+
ASSERT_OK(conn_->ExecuteFormat("CREATE TABLE $0 (i int)", kTableName));
1771+
ASSERT_OK(cluster_->SetFlagOnMasters("TEST_simulate_empty_indexes_during_backfill", "true"));
1772+
1773+
LOG(INFO) << "Create connection to run CREATE INDEX";
1774+
PGConn create_index_conn = ASSERT_RESULT(ConnectToDB(kDatabaseName));
1775+
LOG(INFO) << "Create index...";
1776+
// The failed BackfillChunk/BackfillTable will not update the
1777+
// backfill_error_message in IndexInfoPB for kIndexName. Thus the following
1778+
// CREATE INDEX won't bubble up an error from the backfill job failing.
1779+
ASSERT_OK(create_index_conn.ExecuteFormat("CREATE INDEX $0 ON $1 (i)", kIndexName, kTableName));
1780+
1781+
cluster_->AssertNoCrashes();
1782+
}
1783+
17691784
// Override to have smaller backfill deadline.
17701785
class PgIndexBackfillClientDeadline : public PgIndexBackfillBlockDoBackfill {
17711786
public:

0 commit comments

Comments
 (0)