
Commit 012eb1b

update sampling, prefix cache, json mode impl (#55)
- engine: stop and release models when the engine is released; remove the deprecated lock
- sampling: heavily rework generate_op; remove the dependency on global tensors
- prefix cache: fix several bugs; improve eviction performance
- json mode: update the lmfe-cpp patch; add process_logits; sample with top_k / top_p
- span-attention: move span_attn decoderReshape to init
- lora: add docs; fix a typo
- ubuntu: add an Ubuntu dockerfile; fix an install-dir error
- bugfix: fix a multi-batch repetition-penalty bug
1 parent 0d65c04 commit 012eb1b

File tree

80 files changed (+1840, −2401 lines)

.gitignore

Lines changed: 0 additions & 1 deletion
@@ -25,5 +25,4 @@ third_party/from_source/openssl/*
 *.nsys-rep
 log*
 *.csv
-#*.sh
 *.as*

cmake/hie-dnn.cmake

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ message(STATUS "Build HIE-DNN with: USE_CUDA=${HIEDNN_USE_CUDA}")
 message(STATUS "Build HIE-DNN with: CUDA_DEVICE_ARCH=${HIEDNN_CUDA_DEVICE_ARCH}")

 set(HIEDNN_INSTALL ${INSTALL_LOCATION}/HIE-DNN/install)
-set(HIEDNN_LIBRARY_PATH ${HIEDNN_INSTALL}/lib64/libhiednn_static.a)
+set(HIEDNN_LIBRARY_PATH ${HIEDNN_INSTALL}/${CMAKE_INSTALL_LIBDIR}/libhiednn_static.a)
 message(STATUS "HIEDNN_INSTALL: ${HIEDNN_INSTALL}")
 message(STATUS "HIEDNN_LIBRARY_PATH: ${HIEDNN_LIBRARY_PATH}")

csrc/common/as_engine.cpp

Lines changed: 33 additions & 18 deletions
@@ -246,7 +246,6 @@ class AsEngineImpl final {

 ModelControlState::ModelControlState(const std::string& name)
     : model_name(name), msg_queue(1000) {
-  cond_var = std::make_unique<std::condition_variable>();
   request_handle_map.reserve(1000);
   result_queue_map.reserve(1000);
   msg_queue_size.store(0);
@@ -328,14 +327,25 @@ AsEngineImpl::~AsEngineImpl() {
   // we should wait for the running thread stop otherwise it will cause
   // exception.
   LOG(INFO) << "~AsEngine called";
-  for (auto& model_state : model_state_map_) {
-    if (!model_state.second && model_state.second->model_stopped) {
-      LOG(INFO) << "Stopping model " << model_state.first;
-      StopModel(model_state.first.c_str());
-      ReleaseModel(model_state.first.c_str());
-      // model_state.second->StopLoop();
+
+  std::vector<std::string> pending_stop_model;
+
+  {
+    std::lock_guard<std::mutex> guard(engine_lock_);
+    LOG(INFO) << "model_state_map_ size:" << model_state_map_.size();
+    for (auto& model_state : model_state_map_) {
+      if (!model_state.second->model_stopped) {
+        LOG(INFO) << "Stopping model " << model_state.first;
+        pending_stop_model.push_back(model_state.first);
+      }
     }
   }
+
+  for (auto& name : pending_stop_model) {
+    StopModel(name.c_str());
+    ReleaseModel(name.c_str());
+  }
+
   // LOG(INFO) << "~Engine clear BFC Allocator ";
   // free weight manager before destroy bfc.
   bool do_destroy_bfc = weight_manager_->GetNumModels() > 0;
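Note on the destructor hunk above: the old loop issued StopModel/ReleaseModel while iterating model_state_map_, and its condition (!model_state.second && model_state.second->model_stopped) dereferenced a null pointer whenever its first half was true. The new code is a two-phase shutdown: snapshot the pending model names under engine_lock_, then issue the blocking stop calls with the lock released. A minimal standalone sketch of the pattern (class and member names here are illustrative, not the engine's):

#include <map>
#include <mutex>
#include <string>
#include <vector>

class Engine {
 public:
  void Shutdown() {
    std::vector<std::string> pending;
    {
      std::lock_guard<std::mutex> guard(lock_);
      for (auto& [name, stopped] : models_) {
        if (!stopped) pending.push_back(name);
      }
    }  // lock released here, before the blocking stop calls
    for (auto& name : pending) StopOne(name);
  }

 private:
  void StopOne(const std::string& name) {
    // In the engine this waits on the model's loop thread and may itself
    // need the engine lock; calling it with the lock held would deadlock.
    std::lock_guard<std::mutex> guard(lock_);
    models_[name] = true;
  }

  std::mutex lock_;
  std::map<std::string, bool> models_{{"model_a", false}};  // name -> stopped
};

int main() {
  Engine e;
  e.Shutdown();
}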
@@ -1557,9 +1567,10 @@ AsStatus AsEngineImpl::StopModel(const char* model_name) {
       LOG(ERROR) << "push message queue failed.";
     }
   }
-  model_state->cond_var->notify_all();

+  LOG(INFO) << "AsEngineImpl:: wait stop model return";
   auto ret = reply_promise->get_future().get();
+  LOG(INFO) << "AsEngineImpl:: stop model got return.";

   if (ret != AsStatus::ALLSPARK_SUCCESS) {
     LOG(ERROR) << "[" << model_name << "] "
@@ -1781,8 +1792,7 @@ AsStatus AsEngineImpl::StartRequest(
   handle->mm_embedding_internal = extra_embedding;

 #ifdef ENABLE_JSON_MODE
-  if (request_info->config.response_format.find("type") !=
-          request_info->config.response_format.end() &&
+  if (request_info->config.response_format.count("type") &&
       request_info->config.response_format["type"] == "json_object") {
     if (util::FormatEnforcer::vocab_.empty() &&
         request_info->config.vocab.empty()) {
@@ -1824,7 +1834,6 @@ AsStatus AsEngineImpl::StartRequest(
     // create result queue & handle
   }

-  model_state->cond_var->notify_one();
 #ifndef ENABLE_CUDA
   workers_[0]->GetDeviceContext()->SemWaitSendInterProcess();
 #endif
@@ -1869,7 +1878,6 @@ AsStatus AsEngineImpl::StopRequest(const char* model_name,
                                    reply_promise, uuid);
     model_state->msg_queue.enqueue(std::move(msg));
   }
-  model_state->cond_var->notify_one();
 #ifndef ENABLE_CUDA
   workers_[0]->GetDeviceContext()->SemWaitSendInterProcess();
 #endif
@@ -1916,7 +1924,6 @@ AsStatus AsEngineImpl::ReleaseRequest(const char* model_name,
     model_state->msg_queue.enqueue(std::move(msg));
   }

-  model_state->cond_var->notify_one();
   auto ret = reply_promise->get_future().get();
   if (ret == AsStatus::ALLSPARK_SUCCESS) {
     LOG(INFO) << "[" << model_name << "] "
@@ -1965,7 +1972,6 @@ AsStatus AsEngineImpl::SyncRequest(const char* model_name,
                                    reply_promise, uuid);
     model_state->msg_queue.enqueue(std::move(msg));
   }
-  model_state->cond_var->notify_one();
 #ifndef ENABLE_CUDA
   workers_[0]->GetDeviceContext()->SemWaitSendInterProcess();
 #endif
@@ -2430,6 +2436,7 @@ AsStatus AsEngineImpl::InputParamsVerify(
                << "gen_cfg.top_p must in [0,1]" << std::endl;
     return AsStatus::ALLSPARK_PARAM_ERROR;
   }
+
   if (gen_cfg.temperature < SAMPLING_EPS) {
     DLOG(INFO) << "[" << model_name << "] "
                << "gen_cfg.temperature = " << gen_cfg.temperature
@@ -2438,6 +2445,14 @@ AsStatus AsEngineImpl::InputParamsVerify(
     gen_cfg.top_p = 0;
     gen_cfg.temperature = 1.0;
   }
+
+  if (std::abs(gen_cfg.top_p - 1.0) < 1e-6) {
+    LOG(WARNING) << "[" << model_name << "] "
+                 << "gen_cfg.top_p == 1.0, This might lead to performance "
+                    "issues, so it is manually set to 0.99. "
+                 << std::endl;
+    gen_cfg.top_p = 0.99;
+  }
   // user customized max batch size
   if (engine_max_batch_ != 0 && input_batch > engine_max_batch_) {
     LOG(ERROR) << "[" << model_name << "] "
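Why clamp top_p at 1.0: with top_p == 1.0 the nucleus never closes, so the sampler has to keep (and sort and normalize) the entire vocabulary every step; 0.99 cuts the long tail at negligible quality cost. A toy illustration of the candidate-count difference (the probabilities are made up; the real kernel works on GPU logits):

#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

// Number of candidates nucleus sampling keeps for a given top_p:
// the smallest prefix of the sorted distribution with mass >= top_p.
static size_t NucleusSize(std::vector<float> probs, float top_p) {
  std::sort(probs.begin(), probs.end(), std::greater<float>());
  float cum = 0.f;
  for (size_t i = 0; i < probs.size(); ++i) {
    cum += probs[i];
    if (cum >= top_p) return i + 1;
  }
  return probs.size();  // top_p == 1.0: the whole vocabulary survives
}

int main() {
  std::vector<float> probs = {0.9f, 0.09f, 0.009f, 0.0009f, 0.0001f};
  std::printf("top_p=0.99 keeps %zu tokens\n", NucleusSize(probs, 0.99f));  // 2
  std::printf("top_p=1.00 keeps %zu tokens\n", NucleusSize(probs, 1.00f));  // 5
}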
@@ -2481,8 +2496,8 @@ AsStatus AsEngineImpl::StartRequestImpl(
              << std::endl;

   TensorMap out_tensors;
-  // TODO: alloc generated_ids on CPU
-  std::string out_name = "generated_ids";
+  // TODO: alloc generated_ids_global on CPU
+  std::string out_name = "generated_ids_global";
   out_tensors.insert(
       {out_name, std::make_shared<AsTensor>(out_name, DeviceType::CPU,
                                             DataType::INT64, DataMode::DENSE,
@@ -2531,11 +2546,11 @@ FetchGenerationResultAndIncreaseCounter(
   ele->prefix_len_gpu = request->prefix_len_gpu;
   ele->prefix_len_cpu = request->prefix_len - request->prefix_len_gpu;

-  TensorMap& tmap = request->outputs;
+  const TensorMap& tmap = request->outputs;

   std::vector<std::vector<std::pair<int, float>>> log_probs_list =
       request->log_probs_list;
-  auto device_tensor_ptr = tmap.at("generated_ids");
+  auto device_tensor_ptr = tmap.at("generated_ids_global");
   if (device_tensor_ptr->GetShape().Count() == 0) {
     return nullptr;
   }

csrc/common/common.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,24 @@ inline std::ostream& operator<<(std::ostream& os, DeviceType device_type) {
   }
 }

+inline int get_layer_num(std::string str) {
+  std::stringstream ss(str);
+  std::string temp;
+  while (std::getline(ss, temp, '.')) {
+    bool flag = true;
+    for (char c : temp) {
+      if (!std::isdigit(c)) /* not a digit: this segment is not a number */ {
+        flag = false;
+        break;
+      }
+    }
+    if (flag) {
+      return std::stoi(temp);
+    }
+  }
+  return -1;
+}
+
 // deprecated api declear
 #if __cplusplus >= 201402L  // c++14
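The new helper walks a dot-separated tensor or weight name and returns the first all-digit segment, i.e. the layer index. A hedged usage sketch (the weight names are hypothetical; this copy also guards against empty segments, which the version above would pass straight to std::stoi):

#include <cctype>
#include <cstdio>
#include <sstream>
#include <string>

// Same logic as the helper added to csrc/common/common.h: return the
// first dot-separated segment made entirely of digits, or -1 if none.
inline int get_layer_num(std::string str) {
  std::stringstream ss(str);
  std::string temp;
  while (std::getline(ss, temp, '.')) {
    bool all_digits = !temp.empty();  // extra guard for "a..b"-style names
    for (char c : temp) {
      if (!std::isdigit(static_cast<unsigned char>(c))) {
        all_digits = false;
        break;
      }
    }
    if (all_digits) return std::stoi(temp);
  }
  return -1;
}

int main() {
  // Hypothetical names; the exact naming scheme is model-specific.
  std::printf("%d\n", get_layer_num("decoder.layer.17.attention.weight"));  // 17
  std::printf("%d\n", get_layer_num("embedding.word_embeddings"));          // -1
}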

csrc/common/engine_runtime.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,10 @@ class ModelControlState final {
   moodycamel::BlockingConcurrentQueue<EngineControlMessage> msg_queue;
   std::atomic<int> msg_queue_size;

-  std::unique_ptr<std::condition_variable> cond_var;
-
   std::unordered_map<std::string, std::shared_ptr<RequestHandle>>
       request_handle_map;
   std::unordered_map<std::string, std::shared_ptr<AsEngine::ResultQueue>>
       result_queue_map;
-  std::queue<std::shared_ptr<RequestHandle>> release_request_handle;
-  std::queue<std::shared_ptr<AsEngine::ResultQueue>> release_request_queue;
   std::atomic<bool> model_stopping =
       false;  // after GracefulStopModel called...
   std::atomic<bool> model_stopped = false;  // after GracefulStopModel is done.
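With cond_var gone, the model loop can block directly on msg_queue: moodycamel::BlockingConcurrentQueue's wait_dequeue sleeps until a message arrives, so producers such as StartRequest and StopModel no longer need a separate notify_one/notify_all step (the calls deleted in as_engine.cpp above). A minimal sketch, assuming the moodycamel header is on the include path as it already is for this repo; the Message type here is simplified:

#include <string>
#include <thread>

#include <blockingconcurrentqueue.h>  // moodycamel, an existing dependency

struct Message { std::string kind; };

int main() {
  moodycamel::BlockingConcurrentQueue<Message> queue;

  // Consumer: wait_dequeue blocks until an item is available, which is
  // what makes the explicit condition_variable redundant.
  std::thread consumer([&] {
    Message msg;
    for (;;) {
      queue.wait_dequeue(msg);        // sleeps until a message is enqueued
      if (msg.kind == "stop") break;  // shutdown message, as in StopModel
    }
  });

  queue.enqueue(Message{"start_request"});  // no notify call needed
  queue.enqueue(Message{"stop"});
  consumer.join();
}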

csrc/common/extra_embedding.hpp

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -145,16 +145,17 @@ class ExtraEmbeddingUtils {
   }

   static AsStatus CreateTensorForHash(std::shared_ptr<Request> request,
-                                      TensorMap& tensor_map,
+                                      TensorMap& dst_tensor_map,
+                                      const TensorMap& src_tensor_map,
                                       std::string src_tensor_name) {
     std::string dst_tensor_name = src_tensor_name + "_for_hash";

     if (!request->extra_embedding.empty()) {
       if (request->extra_embedding.count("hash_input") > 0) {
         // step 1: parse extra_embedding info
         int64_t* tensor_ptr =
-            (int64_t*)tensor_map[src_tensor_name]->GetDataPtr();
-        int seq_len = tensor_map[src_tensor_name]->GetShape()[1];
+            (int64_t*)src_tensor_map.at(src_tensor_name)->GetDataPtr();
+        int seq_len = src_tensor_map.at(src_tensor_name)->GetShape()[1];
         auto reinfo_vec = std::make_shared<ExtraEmbeddingUtils::REInfoList>();
         AS_CHECK_STATUS(ExtraEmbeddingUtils::ParseExtraEmbedding(
             request->extra_embedding, tensor_ptr, seq_len, reinfo_vec));
@@ -165,20 +166,22 @@ class ExtraEmbeddingUtils {

         // step 3: create a new input tensor
         auto dst_tensor = std::make_shared<AsTensor>(
-            dst_tensor_name, *tensor_map[src_tensor_name]);
+            dst_tensor_name, *src_tensor_map.at(src_tensor_name));

         // step 4: replace place holder with hashes
         ExtraEmbeddingUtils::ReplacePlaceHolder(dst_tensor, reinfo_vec);

-        tensor_map.insert({dst_tensor_name, dst_tensor});
+        dst_tensor_map.insert({dst_tensor_name, dst_tensor});
       } else {
         LOG(ERROR) << "multi-media content `hash_input` "
                    << "of request " << request->request_id << " is missing.";
         return AsStatus::ALLSPARK_PARAM_ERROR;
       }
     } else {
-      // no extra embedding, use original input_ids for hash
-      tensor_map.insert({dst_tensor_name, tensor_map[src_tensor_name]});
+      // no extra embedding, copy original tensor for hash
+      auto dst_tensor = std::make_shared<AsTensor>(
+          dst_tensor_name, *src_tensor_map.at(src_tensor_name));
+      dst_tensor_map.insert({dst_tensor_name, dst_tensor});
     }

     return AsStatus::ALLSPARK_SUCCESS;
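Besides letting the source map be taken by const reference, the switch from tensor_map[...] to src_tensor_map.at(...) removes a quiet footgun: on standard map types, operator[] default-constructs and inserts a missing key, while at() throws. A self-contained illustration with a stand-in map type (the real TensorMap maps names to AsTensor pointers):

#include <cstdio>
#include <map>
#include <memory>
#include <stdexcept>
#include <string>

// Stand-in for TensorMap, just to show the container semantics.
using FakeTensorMap = std::map<std::string, std::shared_ptr<int>>;

int main() {
  FakeTensorMap tensors;
  tensors["input_ids"] = std::make_shared<int>(42);

  // operator[] on a misspelled key quietly inserts a null entry:
  auto bad = tensors["input_idz"];  // no error; the map now has 2 keys
  std::printf("size after operator[]: %zu\n", tensors.size());  // 2

  // .at() fails fast instead, and also works on const maps -- which is
  // what lets CreateTensorForHash take src_tensor_map by const reference.
  try {
    const FakeTensorMap& src = tensors;
    auto good = src.at("input_ids");       // ok
    auto missing = src.at("nonexistent");  // throws std::out_of_range
    (void)good; (void)missing;
  } catch (const std::out_of_range&) {
    std::printf("at() threw on a missing key\n");
  }
  (void)bad;
}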

csrc/common/generate_context.h

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
 #pragma once

 #if ENABLE_SPAN_ATTENTION
+#include <cache/prefix_cache_manager.h>
 #include <cache/virtual_cache.h>
 #endif
 #include <common/common.h>
@@ -59,7 +60,7 @@ struct GenerateContext {
 #if ENABLE_SPAN_ATTENTION
   std::unique_ptr<VirtualCache> virtual_k_cache;
   std::unique_ptr<VirtualCache> virtual_v_cache;
-  std::vector<std::string> prefix_cache_hash_list;
+  std::vector<PrefixCacheManager::PrefixNodePtr> prefix_cache_node_list;
 #endif

 #ifdef ENABLE_JSON_MODE
@@ -70,7 +71,7 @@ struct GenerateContext {
   std::unique_ptr<AsTensor> sample_state = nullptr;
 };

-using GenContextList = std::vector<std::unique_ptr<GenerateContext>>;
+using GenContextList = std::vector<std::shared_ptr<GenerateContext>>;
 class LayerCacheManager {
  public:
   AsTensor* GetCache(std::string cache_name) {
@@ -102,15 +103,15 @@ class RuntimeContext {
   std::vector<float> logprobs_value_host;
   std::vector<float> token_logprobs_host;

-  GenerateContext* GetContextGenCtx() const {
-    return gen_ctx_list[current_batch].get();
+  std::shared_ptr<GenerateContext> GetContextGenCtx() const {
+    return gen_ctx_list[current_batch];
   }
-  GenerateContext* GetGenCtx(int index) const {
-    return gen_ctx_list[index].get();
+  std::shared_ptr<GenerateContext> GetGenCtx(int index) const {
+    return gen_ctx_list[index];
   }
   int GetGenCtxListSize() const { return gen_ctx_list.size(); }
-  void PushBackGenCtx(std::unique_ptr<GenerateContext> gen_ctx) {
-    gen_ctx_list.push_back(std::move(gen_ctx));
+  void PushBackGenCtx(std::shared_ptr<GenerateContext> gen_ctx) {
+    gen_ctx_list.push_back(gen_ctx);
     gen_ctx_list[gen_ctx_list.size() - 1]->current_batch =
         gen_ctx_list.size() - 1;
   }
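One plausible motivation for moving GenContextList from unique_ptr to shared_ptr, consistent with the prefix_cache_node_list change above (this reading is a guess, not stated in the commit): callers that hold a context, such as sampling ops or prefix-cache bookkeeping, stay safe even after the context leaves the list, where the old raw-pointer accessors would dangle. A minimal sketch:

#include <cstdio>
#include <memory>
#include <vector>

struct Ctx { int current_batch = 0; };

int main() {
  // The old API returned raw pointers out of a vector<unique_ptr<Ctx>>;
  // once the list shrank (finished requests are popped), those pointers
  // dangled. Returning shared_ptr, as GetGenCtx now does, keeps a context
  // alive for whoever still holds it after it is removed from the list.
  std::vector<std::shared_ptr<Ctx>> gen_ctx_list;
  gen_ctx_list.push_back(std::make_shared<Ctx>());

  std::shared_ptr<Ctx> held = gen_ctx_list[0];  // e.g. held by a sampling op
  gen_ctx_list.pop_back();                      // request finished, removed

  held->current_batch = 3;  // still valid: the refcount is nonzero
  std::printf("use_count=%ld\n", static_cast<long>(held.use_count()));  // 1
}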
@@ -124,16 +125,15 @@ class RuntimeContext {
     gen_ctx_list[index]->current_batch = index;
     gen_ctx_list.pop_back();
   }
-  std::shared_ptr<LayerCacheManager> CreateLayerCacheManager() {
+  void CreateLayerCacheManager() {
     layer_cache_manager = std::make_shared<LayerCacheManager>();
-    return layer_cache_manager;
   }
   std::shared_ptr<LayerCacheManager> GetLayerCacheManager() {
     return layer_cache_manager;
   }

  private:
-  GenContextList gen_ctx_list = std::vector<std::unique_ptr<GenerateContext>>();
+  GenContextList gen_ctx_list = std::vector<std::shared_ptr<GenerateContext>>();
   std::shared_ptr<LayerCacheManager> layer_cache_manager;
 };
csrc/common/memory_reuser.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ string noreused[] = {"cross_attention.key_value.out",
                      "batch_offset",
                      "transmask.out",
                      "max_dec_ids",
-                     "generated_ids",
+                     "generated_ids_global",
                      "dec_ids",
                      "next_beam_id",
                      "hyps_ids",

csrc/common/request.h

Lines changed: 21 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ class FormatEnforcer;
 struct Request {
   std::string request_id;
   TensorMap inputs;
-  TensorMap outputs;
+  const TensorMap outputs;  // created in AsEngineImpl, shared by all workers
+  TensorMap interim;        // intermediate tensors
   GenerateConfig gen_cfg;
   std::vector<std::vector<std::pair<int, float>>> log_probs_list;
   std::vector<float> token_logprobs_list;
@@ -42,31 +43,33 @@ struct Request {
   const std::chrono::time_point<std::chrono::steady_clock> start_ts;
   std::chrono::time_point<std::chrono::steady_clock> context_ts;
   std::chrono::time_point<std::chrono::steady_clock> generate_ts;
+
   Request(const std::string& request_id_, const TensorMap& inputs_,
-          const TensorMap& outputs_, const GenerateConfig& gen_cfg)
+          const TensorMap& outputs_, const GenerateConfig& gen_cfg,
+          const TensorMap& interim_ = {})
       : request_id(request_id_),
        inputs(inputs_),
        outputs(outputs_),
+       interim(interim_),
        gen_cfg(gen_cfg),
        finish(false),
        status(AsEngine::GenerateRequestStatus::Init),
        start_ts(std::chrono::steady_clock::now()) {}
-  Request(std::shared_ptr<Request> source_request) {
-    if (source_request) {
-      this->request_id = source_request->request_id;
-      this->inputs = source_request->inputs;
-      this->outputs = source_request->outputs;
-      this->gen_cfg = source_request->gen_cfg;
-      this->log_probs_list = source_request->log_probs_list;
-      this->token_logprobs_list = source_request->token_logprobs_list;
-      this->finish = source_request->finish;
-      this->input_len = source_request->input_len;
-      this->prefill_chunk_len = source_request->prefill_chunk_len;
-      this->prefix_len = source_request->prefix_len;
-      this->status = source_request->status;
-      this->extra_embedding = source_request->extra_embedding;
-    }
-  }
+
+  Request(std::shared_ptr<Request> source_request)
+      : request_id(source_request->request_id),
+        inputs(source_request->inputs),
+        outputs(source_request->outputs),
+        interim(source_request->interim),
+        gen_cfg(source_request->gen_cfg),
+        log_probs_list(source_request->log_probs_list),
+        token_logprobs_list(source_request->token_logprobs_list),
+        finish(source_request->finish),
+        input_len(source_request->input_len),
+        prefill_chunk_len(source_request->prefill_chunk_len),
+        prefix_len(source_request->prefix_len),
+        status(source_request->status),
+        extra_embedding(source_request->extra_embedding) {}
 };

 } // namespace allspark
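A side effect of declaring outputs as const TensorMap: the copy-from-shared_ptr constructor can no longer assign members in its body and had to become a member initializer list, as rewritten above (a const member can only be initialized, never assigned). A minimal sketch of why, with stand-in types rather than the real Request:

#include <map>
#include <string>

using TensorMapLike = std::map<std::string, int>;  // stand-in for TensorMap

struct Req {
  const TensorMapLike outputs;  // const member: assignment is ill-formed

  // `outputs` must be initialized in the member initializer list;
  // writing `outputs = src;` in the constructor body would not compile.
  explicit Req(const TensorMapLike& src) : outputs(src) {}
};

int main() {
  Req r({{"generated_ids_global", 1}});
  return r.outputs.count("generated_ids_global") ? 0 : 1;
}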
