Commit ec79d7b
committed
[BACKPORT 2024.1][yugabyte#23747] MetaCache: Callback should not be called while holding the lock
Summary:
Original commit: c770d79 / D37706
Call callback in ScopeExit block only. Not while holding the lock.
Without this fix, it is possible that a thread can get into a deadlock, trying to request a shared_lock on a mutex, while already holding an exclusive lock on the same mutex:
This deadlock can be triggered if there are active read/write requests to a Table (from more than 1 thread) right after the table had a tablet-split.
If there is only 1 thread, it is unlikely to run into the deadlock, as the thread notices -- as part of the callback -- that the table's partition info is stale. Having a different thread refresh the partition version before the main thread checks if the table version is stale, is likely necessary to trigger the stack trace seen below.
e.g:
```
#0 pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
#1 0x00005640c3eb441b in std::__1::shared_timed_mutex::lock_shared() ()
#2 0x00005640c3ffcbff in yb::client::internal::MetaCache::LookupTabletByKey(std::__1::shared_ptr<yb::client::YBTable> const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::chrono::time_point<yb::CoarseMonoClock, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000000000l> > >, std::__1::function<void (yb::Result<scoped_refptr<yb::client::internal::RemoteTablet> > const&)>, yb::StronglyTypedBool<yb::client::internal::FailOnPartitionListRefreshed_Tag>) ()
#3 0x00005640c3f7549a in yb::client::internal::Batcher::LookupTabletFor(yb::client::internal::InFlightOp*) ()
#4 0x00005640c401855e in yb::client::(anonymous namespace)::FlushBatcherAsync(std::__1::shared_ptr<yb::client::internal::Batcher> const&, boost::function<void (yb::client::FlushStatus*)>, yb::client::YBSession::BatcherConfig, yb::StronglyTypedBool<yb::client::internal::IsWithinTransactionRetry_Tag>) ()
#5 0x00005640c401aa76 in yb::client::(anonymous namespace)::BatcherFlushDone(std::__1::shared_ptr<yb::client::internal::Batcher> const&, yb::Status const&, boost::function<void (yb::client::FlushStatus*)>, yb::client::YBSession::BatcherConfig) ()
#6 0x00005640c401b371 in boost::detail::function::void_function_obj_invoker1<std::__1::__bind<void (*)(std::__1::shared_ptr<yb::client::internal::Batcher> const&, yb::Status const&, boost::function<void (yb::client::FlushStatus*)>, yb::client::YBSession::BatcherConfig), std::__1::shared_ptr<yb::client::internal::Batcher> const&, std::__1::placeholders::__ph<1> const&, boost::function<void (yb::client::FlushStatus*)>, yb::client::YBSession::BatcherConfig&>, void, yb::Status const&>::invoke(boost::detail::function::function_buffer&, yb::Status const&) ()
#7 0x00005640c3f70398 in yb::client::internal::Batcher::Run() ()
#8 0x00005640c3f72656 in yb::client::internal::Batcher::FlushFinished() ()
#9 0x00005640c3f74a4d in yb::client::internal::Batcher::TabletLookupFinished(yb::client::internal::InFlightOp*, yb::Result<scoped_refptr<yb::client::internal::RemoteTablet> >) ()
#10 0x00005640c3f759bc in std::__1::__function::__func<yb::client::internal::Batcher::LookupTabletFor(yb::client::internal::InFlightOp*)::$_0, std::__1::allocator<yb::client::internal::Batcher::LookupTabletFor(yb::client::internal::InFlightOp*)::$_0>, void (yb::Result<scoped_refptr<yb::client::internal::RemoteTablet> > const&)>::operator()(yb::Result<scoped_refptr<yb::client::internal::RemoteTablet> > const&) ()
#11 0x00005640c3fff05d in yb::client::internal::MetaCache::LookupTabletByKey(std::__1::shared_ptr<yb::client::YBTable> const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, std::__1::chrono::time_point<yb::CoarseMonoClock, std::__1::chrono::duration<long long, std::__1::ratio<1l, 1000000000l> > >, std::__1::function<void (yb::Result<scoped_refptr<yb::client::internal::RemoteTablet> > const&)>, yb::StronglyTypedBool<yb::client::internal::FailOnPartitionListRefreshed_Tag>) ()
** Is holding an exclusive lock in MetaCache::LookupTabletByKey/DoLookupTabletByKey **
yugabyte#12 0x00005640c3f7549a in yb::client::internal::Batcher::LookupTabletFor(yb::client::internal::InFlightOp*) ()
yugabyte#13 0x00005640c401855e in yb::client::(anonymous namespace)::FlushBatcherAsync(std::__1::shared_ptr<yb::client::internal::Batcher> const&, boost::function<void (yb::client::FlushStatus*)>, yb::client::YBSession::BatcherConfig, yb::StronglyTypedBool<yb::client::internal::IsWithinTransactionRetry_Tag>) ()
yugabyte#14 0x00005640c4017130 in yb::client::YBSession::FlushAsync(boost::function<void (yb::client::FlushStatus*)>) ()
yugabyte#15 0x00005640c5225a0c in yb::tserver::PgClientServiceImpl::Perform(yb::tserver::PgPerformRequestPB const*, yb::tserver::PgPerformResponsePB*, yb::rpc::RpcContext) ()
yugabyte#16 0x00005640c51c4487 in std::__1::__function::__func<yb::tserver::PgClientServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_20, std::__1::allocator<yb::tserver::PgClientServiceIf::InitMethods(scoped_refptr<yb::MetricEntity> const&)::$_20>, void (std::__1::shared_ptr<yb::rpc::InboundCall>)>::operator()(std::__1::shared_ptr<yb::rpc::InboundCall>&&) ()
yugabyte#17 0x00005640c51d374f in yb::tserver::PgClientServiceIf::Handle(std::__1::shared_ptr<yb::rpc::InboundCall>) ()
yugabyte#18 0x00005640c4f5f420 in yb::rpc::ServicePoolImpl::Handle(std::__1::shared_ptr<yb::rpc::InboundCall>) ()
yugabyte#19 0x00005640c4e845af in yb::rpc::InboundCall::InboundCallTask::Run() ()
yugabyte#20 0x00005640c4f6e243 in yb::rpc::(anonymous namespace)::Worker::Execute() ()
yugabyte#21 0x00005640c570ecb4 in yb::Thread::SuperviseThread(void*) ()
yugabyte#22 0x00007f808b7c6694 in start_thread (arg=0x7f76d8caf700) at pthread_create.c:333
yugabyte#23 0x00007f808bac341d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109
```
Jira: DB-12651
Test Plan:
Jenkins
yb_build.sh --cxx-test ql-stress-test QLStressTest.ReproMetaCacheDeadlock
Reviewers: rthallam, hsunder, qhu, timur
Reviewed By: rthallam
Subscribers: ybase, svc_phabricator
Differential Revision: https://phorge.dev.yugabyte.com/D377881 parent 2bf6354 commit ec79d7b
2 files changed
+27
-5
lines changed| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
111 | 111 | | |
112 | 112 | | |
113 | 113 | | |
| 114 | + | |
| 115 | + | |
114 | 116 | | |
115 | 117 | | |
116 | 118 | | |
| |||
1921 | 1923 | | |
1922 | 1924 | | |
1923 | 1925 | | |
1924 | | - | |
| 1926 | + | |
| 1927 | + | |
1925 | 1928 | | |
1926 | 1929 | | |
| 1930 | + | |
| 1931 | + | |
1927 | 1932 | | |
1928 | 1933 | | |
1929 | 1934 | | |
| |||
1950 | 1955 | | |
1951 | 1956 | | |
1952 | 1957 | | |
1953 | | - | |
| 1958 | + | |
1954 | 1959 | | |
1955 | 1960 | | |
1956 | 1961 | | |
1957 | 1962 | | |
1958 | 1963 | | |
1959 | | - | |
| 1964 | + | |
1960 | 1965 | | |
1961 | 1966 | | |
1962 | 1967 | | |
| |||
2063 | 2068 | | |
2064 | 2069 | | |
2065 | 2070 | | |
| 2071 | + | |
| 2072 | + | |
| 2073 | + | |
| 2074 | + | |
| 2075 | + | |
| 2076 | + | |
2066 | 2077 | | |
2067 | 2078 | | |
2068 | 2079 | | |
| |||
2156 | 2167 | | |
2157 | 2168 | | |
2158 | 2169 | | |
2159 | | - | |
| 2170 | + | |
| 2171 | + | |
2160 | 2172 | | |
2161 | 2173 | | |
| 2174 | + | |
| 2175 | + | |
2162 | 2176 | | |
2163 | 2177 | | |
2164 | 2178 | | |
| |||
2174 | 2188 | | |
2175 | 2189 | | |
2176 | 2190 | | |
2177 | | - | |
| 2191 | + | |
2178 | 2192 | | |
2179 | 2193 | | |
2180 | 2194 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
74 | 74 | | |
75 | 75 | | |
76 | 76 | | |
| 77 | + | |
77 | 78 | | |
78 | 79 | | |
| 80 | + | |
79 | 81 | | |
80 | 82 | | |
81 | 83 | | |
| |||
437 | 439 | | |
438 | 440 | | |
439 | 441 | | |
| 442 | + | |
| 443 | + | |
| 444 | + | |
| 445 | + | |
| 446 | + | |
| 447 | + | |
440 | 448 | | |
441 | 449 | | |
442 | 450 | | |
| |||
0 commit comments