@@ -246,7 +246,6 @@ class AsEngineImpl final {
 
 ModelControlState::ModelControlState(const std::string& name)
     : model_name(name), msg_queue(1000) {
-  cond_var = std::make_unique<std::condition_variable>();
   request_handle_map.reserve(1000);
   result_queue_map.reserve(1000);
   msg_queue_size.store(0);
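This constructor change is the first half of a broader cleanup in this commit: ModelControlState no longer owns a std::condition_variable, and the hunks below delete every notify_one()/notify_all() on it. That only works if msg_queue itself blocks on dequeue, so the consumer thread sleeps inside the queue rather than on an external condition variable. A minimal sketch of such a queue, with illustrative types only (not the project's real msg_queue API):

    #include <condition_variable>
    #include <iostream>
    #include <mutex>
    #include <queue>

    // Illustrative only: not the project's real msg_queue type.
    template <typename T>
    class BlockingQueue {
     public:
      void enqueue(T v) {
        {
          std::lock_guard<std::mutex> lk(mu_);
          q_.push(std::move(v));
        }
        cv_.notify_one();  // the queue wakes its own consumer...
      }
      T dequeue() {
        std::unique_lock<std::mutex> lk(mu_);
        cv_.wait(lk, [this] { return !q_.empty(); });  // ...and sleeps here
        T v = std::move(q_.front());
        q_.pop();
        return v;
      }

     private:
      std::mutex mu_;
      std::condition_variable cv_;
      std::queue<T> q_;
    };

    int main() {
      BlockingQueue<int> q;
      q.enqueue(42);
      std::cout << q.dequeue() << "\n";  // no external notify needed
      return 0;
    }

With this shape, a separate notify after every enqueue would be redundant, which is exactly what the deletions below reflect.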
@@ -328,14 +327,25 @@ AsEngineImpl::~AsEngineImpl() {
   // we should wait for the running thread stop otherwise it will cause
   // exception.
   LOG(INFO) << "~AsEngine called";
-  for (auto& model_state : model_state_map_) {
-    if (!model_state.second && model_state.second->model_stopped) {
-      LOG(INFO) << "Stopping model " << model_state.first;
-      StopModel(model_state.first.c_str());
-      ReleaseModel(model_state.first.c_str());
-      // model_state.second->StopLoop();
+
+  std::vector<std::string> pending_stop_model;
+
+  {
+    std::lock_guard<std::mutex> guard(engine_lock_);
+    LOG(INFO) << "model_state_map_ size:" << model_state_map_.size();
+    for (auto& model_state : model_state_map_) {
+      if (!model_state.second->model_stopped) {
+        LOG(INFO) << "Stopping model " << model_state.first;
+        pending_stop_model.push_back(model_state.first);
+      }
     }
   }
+
+  for (auto& name : pending_stop_model) {
+    StopModel(name.c_str());
+    ReleaseModel(name.c_str());
+  }
+
   // LOG(INFO) << "~Engine clear BFC Allocator ";
   // free weight manager before destroy bfc.
   bool do_destroy_bfc = weight_manager_->GetNumModels() > 0;
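The rewritten destructor also fixes a latent bug: the old condition `!model_state.second && model_state.second->model_stopped` null-checked the pointer and then dereferenced it anyway, and its polarity selected already-stopped models. Beyond that, the names are now collected under engine_lock_ and StopModel()/ReleaseModel() run only after the lock is released, presumably because those calls take the same lock and mutate model_state_map_ (an assumption; their bodies are not shown in this hunk). A minimal sketch of the snapshot-then-act pattern, with stand-in types:

    #include <mutex>
    #include <string>
    #include <unordered_map>
    #include <vector>

    // Stand-ins for the engine's members; names are illustrative.
    struct State { bool model_stopped = false; };

    std::mutex engine_lock;
    std::unordered_map<std::string, State> model_state_map;

    void StopOne(const std::string& name) {
      // Re-acquires the same (non-recursive) lock and erases map entries,
      // so it must not run while the caller already holds engine_lock.
      std::lock_guard<std::mutex> lk(engine_lock);
      model_state_map.erase(name);
    }

    void Shutdown() {
      std::vector<std::string> pending;
      {
        std::lock_guard<std::mutex> lk(engine_lock);
        for (const auto& [name, st] : model_state_map)
          if (!st.model_stopped) pending.push_back(name);
      }  // lock released before calling back into StopOne()
      for (const auto& name : pending) StopOne(name);
    }

    int main() {
      model_state_map["qwen-7b"];  // hypothetical running model
      Shutdown();
      return 0;
    }

Calling StopOne() while still iterating under engine_lock would deadlock on the non-recursive mutex and invalidate the loop's iterator when entries are erased.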
@@ -1557,9 +1567,10 @@ AsStatus AsEngineImpl::StopModel(const char* model_name) {
       LOG(ERROR) << "push message queue failed.";
     }
   }
-  model_state->cond_var->notify_all();
 
+  LOG(INFO) << "AsEngineImpl:: wait stop model return";
   auto ret = reply_promise->get_future().get();
+  LOG(INFO) << "AsEngineImpl:: stop model got return.";
 
   if (ret != AsStatus::ALLSPARK_SUCCESS) {
     LOG(ERROR) << "[" << model_name << "] "
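StopModel() now relies solely on the promise/future handshake for synchronization: the enqueued message carries a reply promise, the model's loop thread fulfills it once the stop has actually happened, and the two new LOG lines bracket the blocking get() so a hang here is visible in the logs. A sketch of the handshake, with illustrative message and status types (the real control-message layout is not shown in this hunk):

    #include <future>
    #include <iostream>
    #include <memory>
    #include <thread>

    // Illustrative status and message types, not the real ones.
    enum class AsStatus { ALLSPARK_SUCCESS, ALLSPARK_RUNTIME_ERROR };

    struct StopMsg {
      std::shared_ptr<std::promise<AsStatus>> reply;
    };

    int main() {
      auto reply = std::make_shared<std::promise<AsStatus>>();
      StopMsg msg{reply};

      // Model loop thread: fulfills the promise once the stop is done.
      std::thread loop([msg] {
        // ... drain pending requests, stop the loop ...
        msg.reply->set_value(AsStatus::ALLSPARK_SUCCESS);
      });

      // Caller: blocks here, exactly like the get() between the two logs.
      AsStatus ret = reply->get_future().get();
      std::cout << (ret == AsStatus::ALLSPARK_SUCCESS ? "stopped\n"
                                                      : "error\n");
      loop.join();
      return 0;
    }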
@@ -1781,8 +1792,7 @@ AsStatus AsEngineImpl::StartRequest(
   handle->mm_embedding_internal = extra_embedding;
 
 #ifdef ENABLE_JSON_MODE
-  if (request_info->config.response_format.find("type") !=
-          request_info->config.response_format.end() &&
+  if (request_info->config.response_format.count("type") &&
       request_info->config.response_format["type"] == "json_object") {
     if (util::FormatEnforcer::vocab_.empty() &&
         request_info->config.vocab.empty()) {
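For a map with unique keys, count("type") is exactly equivalent to find("type") != end() and collapses the two-line condition into one. Assuming response_format is a standard associative container (a std::map stand-in below):

    #include <cassert>
    #include <map>
    #include <string>

    int main() {
      std::map<std::string, std::string> response_format{
          {"type", "json_object"}};
      // With unique keys, count() can only return 0 or 1, so these agree:
      assert((response_format.find("type") != response_format.end()) ==
             (response_format.count("type") == 1));
      return 0;
    }

If the project moves to C++20, response_format.contains("type") would state the intent even more directly.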
@@ -1824,7 +1834,6 @@ AsStatus AsEngineImpl::StartRequest(
     // create result queue & handle
   }
 
-  model_state->cond_var->notify_one();
 #ifndef ENABLE_CUDA
   workers_[0]->GetDeviceContext()->SemWaitSendInterProcess();
 #endif
@@ -1869,7 +1878,6 @@ AsStatus AsEngineImpl::StopRequest(const char* model_name,
                                    reply_promise, uuid);
     model_state->msg_queue.enqueue(std::move(msg));
   }
-  model_state->cond_var->notify_one();
 #ifndef ENABLE_CUDA
   workers_[0]->GetDeviceContext()->SemWaitSendInterProcess();
 #endif
@@ -1916,7 +1924,6 @@ AsStatus AsEngineImpl::ReleaseRequest(const char* model_name,
     model_state->msg_queue.enqueue(std::move(msg));
   }
 
-  model_state->cond_var->notify_one();
   auto ret = reply_promise->get_future().get();
   if (ret == AsStatus::ALLSPARK_SUCCESS) {
     LOG(INFO) << "[" << model_name << "] "
@@ -1965,7 +1972,6 @@ AsStatus AsEngineImpl::SyncRequest(const char* model_name,
                                    reply_promise, uuid);
     model_state->msg_queue.enqueue(std::move(msg));
   }
-  model_state->cond_var->notify_one();
 #ifndef ENABLE_CUDA
   workers_[0]->GetDeviceContext()->SemWaitSendInterProcess();
 #endif
@@ -2430,6 +2436,7 @@ AsStatus AsEngineImpl::InputParamsVerify(
                << "gen_cfg.top_p must in [0,1]" << std::endl;
     return AsStatus::ALLSPARK_PARAM_ERROR;
   }
+
   if (gen_cfg.temperature < SAMPLING_EPS) {
     DLOG(INFO) << "[" << model_name << "] "
                << "gen_cfg.temperature = " << gen_cfg.temperature
@@ -2438,6 +2445,14 @@ AsStatus AsEngineImpl::InputParamsVerify(
     gen_cfg.top_p = 0;
     gen_cfg.temperature = 1.0;
   }
+
+  if (std::abs(gen_cfg.top_p - 1.0) < 1e-6) {
+    LOG(WARNING) << "[" << model_name << "] "
+                 << "gen_cfg.top_p == 1.0, this might lead to performance "
+                    "issues, so it is manually set to 0.99."
+                 << std::endl;
+    gen_cfg.top_p = 0.99;
+  }
   // user customized max batch size
   if (engine_max_batch_ != 0 && input_batch > engine_max_batch_) {
     LOG(ERROR) << "[" << model_name << "] "
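The new clamp handles a sampling edge case: presumably top_p == 1.0 disables nucleus truncation entirely, so the sampler must scan the whole sorted vocabulary distribution (hence the performance warning), and the comparison uses a tolerance because the exact value 1.0 may not survive the float conversions on the way in. A small sketch of the guard in isolation:

    #include <cmath>
    #include <cstdio>

    int main() {
      float top_p = 1.0f;  // e.g. a user passing "top_p": 1.0 in a request
      // An exact top_p == 1.0f test is brittle once the value has been
      // parsed, serialized, or narrowed; compare against a tolerance.
      if (std::abs(top_p - 1.0) < 1e-6) {
        top_p = 0.99f;  // keep a small tail cut so the scan can stop early
      }
      std::printf("effective top_p = %f\n", top_p);
      return 0;
    }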
@@ -2481,8 +2496,8 @@ AsStatus AsEngineImpl::StartRequestImpl(
              << std::endl;
 
   TensorMap out_tensors;
-  // TODO: alloc generated_ids on CPU
-  std::string out_name = "generated_ids";
+  // TODO: alloc generated_ids_global on CPU
+  std::string out_name = "generated_ids_global";
   out_tensors.insert(
       {out_name, std::make_shared<AsTensor>(out_name, DeviceType::CPU,
                                             DataType::INT64, DataMode::DENSE,
@@ -2531,11 +2546,11 @@ FetchGenerationResultAndIncreaseCounter(
   ele->prefix_len_gpu = request->prefix_len_gpu;
   ele->prefix_len_cpu = request->prefix_len - request->prefix_len_gpu;
 
-  TensorMap& tmap = request->outputs;
+  const TensorMap& tmap = request->outputs;
 
   std::vector<std::vector<std::pair<int, float>>> log_probs_list =
       request->log_probs_list;
-  auto device_tensor_ptr = tmap.at("generated_ids");
+  auto device_tensor_ptr = tmap.at("generated_ids_global");
   if (device_tensor_ptr->GetShape().Count() == 0) {
     return nullptr;
   }
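Two details in this last hunk pair up with the earlier rename. First, generated_ids_global must be spelled identically on the producer side (StartRequestImpl above) and this consumer side, because the lookup now goes through at(). Second, making tmap a const reference is what forces at(): operator[] is non-const (it inserts missing keys) and does not compile on a const map, while at() throws std::out_of_range when the key is absent. A stand-in example (plain std::map instead of the real TensorMap):

    #include <iostream>
    #include <map>
    #include <stdexcept>
    #include <string>

    int main() {
      const std::map<std::string, int> tmap{{"generated_ids_global", 42}};
      // tmap["generated_ids_global"];  // won't compile: operator[] inserts
      std::cout << tmap.at("generated_ids_global") << "\n";  // OK: 42
      try {
        tmap.at("generated_ids");  // stale key -> throws
      } catch (const std::out_of_range&) {
        std::cout << "missing key throws std::out_of_range\n";
      }
      return 0;
    }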