Skip to content

Commit 799932b

Browse files
authored
[bugfix] clean up shm remnants to fix "BufferOut" error in PCStore (#512)
PCStore uses shared memory to share data in DRAM. If the service terminates abnormally and residual files of the shared data are not cleaned up, it will cause newly started services to report BufferOut errors.
1 parent 45e11a5 commit 799932b

19 files changed

+67
-30
lines changed

ucm/store/pcstore/cc/api/pcstore.cc

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,15 @@ class PcStoreImpl : public PcStore {
3737
auto status = this->spaceMgr_.Setup(config.storageBackends, config.kvcacheBlockSize);
3838
if (status.Failure()) { return status.Underlying(); }
3939
if (config.transferEnable) {
40+
if (config.uniqueId.empty()) {
41+
UC_ERROR("UniqueId is required.");
42+
return Status::InvalidParam().Underlying();
43+
}
4044
status = this->transMgr_.Setup(
4145
config.transferLocalRankSize, config.transferDeviceId, config.transferStreamNumber,
4246
config.kvcacheBlockSize, config.transferIoSize, config.transferIoDirect,
4347
config.transferBufferNumber, this->spaceMgr_.GetSpaceLayout(),
44-
config.transferTimeoutMs, config.transferScatterGatherEnable);
48+
config.transferTimeoutMs, config.transferScatterGatherEnable, config.uniqueId);
4549
if (status.Failure()) { return status.Underlying(); }
4650
}
4751
this->ShowConfig(config);
@@ -86,6 +90,7 @@ class PcStoreImpl : public PcStore {
8690
UC_INFO("Set UC::BlockSize to {}.", config.kvcacheBlockSize);
8791
UC_INFO("Set UC::TransferEnable to {}.", config.transferEnable);
8892
if (!config.transferEnable) { return; }
93+
UC_INFO("Set UC::UniqueId to {}.", config.uniqueId);
8994
UC_INFO("Set UC::IoSize to {}.", config.transferIoSize);
9095
UC_INFO("Set UC::IoDirect to {}.", config.transferIoDirect);
9196
UC_INFO("Set UC::LocalRankSize to {}.", config.transferLocalRankSize);
@@ -112,4 +117,4 @@ int32_t PcStore::Setup(const Config& config)
112117
return impl->Setup(config);
113118
}
114119

115-
} // namespace UC
120+
} // namespace UC

ucm/store/pcstore/cc/api/pcstore.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ class PcStore : CCStore<TransTask> {
3535
std::vector<std::string> storageBackends;
3636
size_t kvcacheBlockSize;
3737
bool transferEnable;
38+
std::string uniqueId{};
3839
size_t transferIoSize{262144};
3940
bool transferIoDirect{false};
4041
size_t transferLocalRankSize{1};
@@ -46,7 +47,8 @@ class PcStore : CCStore<TransTask> {
4647

4748
Config(const std::vector<std::string>& storageBackends, const size_t kvcacheBlockSize,
4849
const bool transferEnable)
49-
: storageBackends{storageBackends}, kvcacheBlockSize{kvcacheBlockSize},
50+
: storageBackends{storageBackends},
51+
kvcacheBlockSize{kvcacheBlockSize},
5052
transferEnable{transferEnable}
5153
{
5254
}
@@ -87,6 +89,6 @@ class PcStore : CCStore<TransTask> {
8789
PcStore* impl_{nullptr};
8890
};
8991

90-
} // namespace UC
92+
} // namespace UC
9193

9294
#endif

ucm/store/pcstore/cc/domain/file/file.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,4 +94,4 @@ void File::ShmUnlink(const std::string& path) { FileImpl{path}.ShmUnlink(); }
9494

9595
void File::Remove(const std::string& path) { FileImpl{path}.Remove(); }
9696

97-
} // namespace UC
97+
} // namespace UC

ucm/store/pcstore/cc/domain/file/file.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,6 @@ class File {
4747
static void Remove(const std::string& path);
4848
};
4949

50-
} // namespace UC
50+
} // namespace UC
5151

5252
#endif

ucm/store/pcstore/cc/domain/file/ifile.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,6 @@ class IFile {
7676
std::string path_;
7777
};
7878

79-
} // namespace UC
79+
} // namespace UC
8080

8181
#endif

ucm/store/pcstore/cc/domain/file/posix_file.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@
2525
#include <sys/mman.h>
2626
#include <sys/stat.h>
2727
#include <sys/xattr.h>
28-
#include <utime.h>
2928
#include <unistd.h>
29+
#include <utime.h>
3030
#include "logger/logger.h"
3131

3232
namespace UC {
@@ -242,4 +242,4 @@ Status PosixFile::UpdateTime()
242242
return Status::OK();
243243
}
244244

245-
} // namespace UC
245+
} // namespace UC

ucm/store/pcstore/cc/domain/file/posix_file.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,6 @@ class PosixFile : public IFile {
5353
int32_t handle_;
5454
};
5555

56-
} // namespace UC
56+
} // namespace UC
5757

58-
#endif // UNIFIEDCACHE_POSIX_FILE_H
58+
#endif // UNIFIEDCACHE_POSIX_FILE_H

ucm/store/pcstore/cc/domain/trans/share_buffer.cc

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "share_buffer.h"
2525
#include <atomic>
2626
#include <chrono>
27+
#include <filesystem>
2728
#include <thread>
2829
#include <unistd.h>
2930
#include "file/file.h"
@@ -97,21 +98,36 @@ struct ShareBufferHeader {
9798
ShareBlockHeader headers[0];
9899
};
99100

100-
inline std::string GenShareBufferName(const size_t blockSize, const size_t blockNumber,
101-
const bool ioDirect, const size_t nSharer)
101+
const inline std::string& ShmPrefix() noexcept
102102
{
103-
return fmt::format("uc.buf-{}-{}-{}-{:04x}", blockSize, blockNumber, ioDirect, nSharer);
103+
static std::string prefix{"uc_shm_pcstore_"};
104+
return prefix;
105+
}
106+
void CleanUpShmFileExceptMe(const std::string& me)
107+
{
108+
namespace fs = std::filesystem;
109+
std::string_view prefix = ShmPrefix();
110+
fs::path shmDir = "/dev/shm";
111+
if (!fs::exists(shmDir)) { return; }
112+
for (const auto& entry : fs::directory_iterator(shmDir)) {
113+
const auto& name = entry.path().filename().string();
114+
if (entry.is_regular_file() && (name.compare(0, prefix.length(), prefix) == 0) &&
115+
name != me) {
116+
fs::remove(entry.path());
117+
}
118+
}
104119
}
105120

106121
Status ShareBuffer::Setup(const size_t blockSize, const size_t blockNumber, const bool ioDirect,
107-
const size_t nSharer)
122+
const size_t nSharer, const std::string& uniqueId)
108123
{
109124
this->blockSize_ = blockSize;
110125
this->blockNumber_ = blockNumber;
111126
this->ioDirect_ = ioDirect;
112127
this->nSharer_ = nSharer;
113128
this->addr_ = nullptr;
114-
this->shmName_ = GenShareBufferName(blockSize, blockNumber, ioDirect, nSharer);
129+
this->shmName_ = ShmPrefix() + uniqueId;
130+
CleanUpShmFileExceptMe(this->shmName_);
115131
auto file = File::Make(this->shmName_);
116132
if (!file) { return Status::OutOfMemory(); }
117133
auto flags = IFile::OpenFlag::CREATE | IFile::OpenFlag::EXCL | IFile::OpenFlag::READ_WRITE;
@@ -305,4 +321,4 @@ uintptr_t ShareBuffer::Reader::GetData()
305321
return (uintptr_t)header->Data();
306322
}
307323

308-
} // namespace UC
324+
} // namespace UC

ucm/store/pcstore/cc/domain/trans/share_buffer.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,11 @@ class ShareBuffer {
4949
private:
5050
Reader(const std::string& block, const std::string& path, const size_t length,
5151
const bool ioDirect, const size_t nSharer, void* addr)
52-
: block_{block}, path_{path}, length_{length}, ioDirect_{ioDirect}, nSharer_{nSharer},
52+
: block_{block},
53+
path_{path},
54+
length_{length},
55+
ioDirect_{ioDirect},
56+
nSharer_{nSharer},
5357
addr_{addr}
5458
{
5559
}
@@ -58,7 +62,7 @@ class ShareBuffer {
5862

5963
public:
6064
Status Setup(const size_t blockSize, const size_t blockNumber, const bool ioDirect,
61-
const size_t nSharer);
65+
const size_t nSharer, const std::string& uniqueId);
6266
~ShareBuffer();
6367
std::shared_ptr<Reader> MakeReader(const std::string& block, const std::string& path);
6468

@@ -80,6 +84,6 @@ class ShareBuffer {
8084
void* addr_;
8185
};
8286

83-
} // namespace UC
87+
} // namespace UC
8488

8589
#endif

ucm/store/pcstore/cc/domain/trans/trans_manager.cc

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,13 @@ namespace UC {
2929
Status TransManager::Setup(const size_t rankSize, const int32_t deviceId, const size_t streamNumber,
3030
const size_t blockSize, const size_t ioSize, const bool ioDirect,
3131
const size_t bufferNumber, const SpaceLayout* layout,
32-
const size_t timeoutMs, const bool scatterGatherEnable)
32+
const size_t timeoutMs, const bool scatterGatherEnable,
33+
const std::string& uniqueId)
3334
{
3435
auto s = Status::OK();
3536
if (rankSize > 1) {
3637
s = this->shareQueue_.Setup(rankSize, deviceId, streamNumber, blockSize, ioSize, ioDirect,
37-
bufferNumber, layout, &this->failureSet_);
38+
bufferNumber, layout, &this->failureSet_, uniqueId);
3839
if (s.Failure()) { return s; }
3940
}
4041
s = this->queue_.Setup(deviceId, streamNumber, blockSize, ioSize, ioDirect, bufferNumber,
@@ -115,4 +116,4 @@ Status TransManager::Check(const size_t taskId, bool& finish) noexcept
115116
return Status::OK();
116117
}
117118

118-
} // namespace UC
119+
} // namespace UC

0 commit comments

Comments
 (0)