
Commit afa12c3

Support Baichuan-7B and Baichuan2-7B & 13B (#38)
1 parent 2be2060 commit afa12c3

21 files changed: +1090 additions, -54 deletions

README.md

Lines changed: 2 additions & 0 deletions
@@ -90,6 +90,8 @@ During inference, the quantized weight is recovered as bfloat16 for matrix multi
 | ChatGLMModel | ChatGLM | ChatGLM_v4 | [THUDM/glm-4-9b-chat](https://huggingface.co/THUDM/glm-4-9b-chat) | [ZhipuAI/glm-4-9b-chat](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat/summary) | [dash-infer/glm-4-9b-chat-DI](https://modelscope.cn/models/dash-infer/glm-4-9b-chat-DI/summary) |
 | LlamaForCausalLM | LLaMA-2 | LLaMA_v2 | [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf),<br>[meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | [modelscope/Llama-2-7b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary),<br>[modelscope/Llama-2-13b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary) | / |
 | LlamaForCausalLM | LLaMA-3 | LLaMA_v3 | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | [modelscope/Meta-Llama-3-8B-Instruct](https://modelscope.cn/models/modelscope/Meta-Llama-3-8B-Instruct/summary) | / |
+| BaiChuanForCausalLM | Baichuan | Baichuan | [baichuan-inc/Baichuan-7B](https://huggingface.co/baichuan-inc/Baichuan-7B) | [baichuan-inc/baichuan-7B](https://modelscope.cn/models/baichuan-inc/baichuan-7B) | / |
+| BaichuanForCausalLM | Baichuan2 | Baichuan_v2 | [baichuan-inc/Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat),<br>[baichuan-inc/Baichuan2-13B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat) | [baichuan-inc/Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat),<br>[baichuan-inc/Baichuan2-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat) | / |
 
 # Software Architecture
 

README_CN.md

Lines changed: 2 additions & 0 deletions
@@ -91,6 +91,8 @@ $$ x_{u8} = x_{fp32} / scale + zeropoint $$
 | ChatGLMModel | ChatGLM | ChatGLM_v4 | [THUDM/glm-4-9b-chat](https://huggingface.co/THUDM/glm-4-9b-chat) | [ZhipuAI/glm-4-9b-chat](https://modelscope.cn/models/ZhipuAI/glm-4-9b-chat/summary) | [dash-infer/glm-4-9b-chat-DI](https://modelscope.cn/models/dash-infer/glm-4-9b-chat-DI/summary) |
 | LlamaForCausalLM | LLaMA-2 | LLaMA_v2 | [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf),<br>[meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | [modelscope/Llama-2-7b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-7b-chat-ms/summary),<br>[modelscope/Llama-2-13b-chat-ms](https://modelscope.cn/models/modelscope/Llama-2-13b-chat-ms/summary) | / |
 | LlamaForCausalLM | LLaMA-3 | LLaMA_v3 | [meta-llama/Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | [modelscope/Meta-Llama-3-8B-Instruct](https://modelscope.cn/models/modelscope/Meta-Llama-3-8B-Instruct/summary) | / |
+| BaiChuanForCausalLM | Baichuan | Baichuan | [baichuan-inc/Baichuan-7B](https://huggingface.co/baichuan-inc/Baichuan-7B) | [baichuan-inc/baichuan-7B](https://modelscope.cn/models/baichuan-inc/baichuan-7B) | / |
+| BaichuanForCausalLM | Baichuan2 | Baichuan_v2 | [baichuan-inc/Baichuan2-7B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat),<br>[baichuan-inc/Baichuan2-13B-Chat](https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat) | [baichuan-inc/Baichuan2-7B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-7B-Chat),<br>[baichuan-inc/Baichuan2-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat) | / |
 
 # 软件框架
 
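Note: the hunk context above quotes the README's quantization formula, x_u8 = x_fp32 / scale + zeropoint. As a minimal standalone illustration of that round trip (our own sketch with invented names, not DashInfer's actual kernel):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Quantize: x_u8 = round(x_fp32 / scale + zeropoint), clamped to [0, 255].
std::vector<std::uint8_t> quantize_u8(const std::vector<float>& x, float scale,
                                      float zeropoint) {
  std::vector<std::uint8_t> q(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    float v = std::round(x[i] / scale + zeropoint);
    q[i] = static_cast<std::uint8_t>(std::clamp(v, 0.0f, 255.0f));
  }
  return q;
}

// Dequantize: recover an approximate float for the matmul path.
float dequantize_u8(std::uint8_t q, float scale, float zeropoint) {
  return (static_cast<float>(q) - zeropoint) * scale;
}
```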

build.sh

Lines changed: 1 addition & 18 deletions
@@ -2,29 +2,12 @@ set -x
 
 clean="OFF"
 
-# capture the output of the arch command
-architecture=$(arch)
-
-# use an if-else to choose the platform
-if [ "${architecture}" == "aarch64" ]; then
-  export AS_PLATFORM=armclang
-else
-  export AS_PLATFORM=x86
-fi
-
-if [ -z "$AS_PLATFORM" ];
-then
-  echo " please set AS_PLATFORM env, AS_PLATFORM can be x86 or armclang"
-  exit 1
-fi
-
 # with_platform, to support x86/arm build
-with_platform="${AS_PLATFORM}"
+with_platform="${AS_PLATFORM:-x86}"
 build_type="${AS_BUILD_TYPE:-Release}"
 build_package="${AS_BUILD_PACKAGE:-OFF}"
 enable_glibcxx11_abi="${AS_CXX11_ABI:-ON}" # default enable cxx11 ABI
 
-
 function clone_pull {
   GIT_URL=$1
   DIRECTORY=$2

csrc/core/kernel/cpu/ALiBiPE.cpp

Lines changed: 18 additions & 11 deletions
@@ -39,7 +39,7 @@ void ALiBiPE_kernel(T* out, int* batch_offset, int batch_size, int seq_length,
   parallel_for(N, [&](int idx) {
     int batch = idx / num_heads;
     int head = idx % num_heads;
-    int offset = batch_offset[batch];
+    int offset = batch_offset ? batch_offset[batch] : 0;
     float slope = get_ALiBiPE_slope(head, num_heads, ori_num_heads, rank);
     for (int i = 0; i < seq_length; i++) {
       for (int j = 0; j < seq_length; j++) {
@@ -53,42 +53,49 @@
 template <typename T>
 void ALiBiPE_decoder_kernel(T* out, int* batch_offset, int batch_size,
                             int seq_length, int num_heads, int ori_num_heads,
-                            int rank, int N) {
+                            int rank, int N, std::vector<int>& step_list) {
   // return [batch,1,num_heads,seq_length],i=seq_length-1
   parallel_for(N, [&](int idx) {
     int batch = idx / num_heads;
     int head = idx % num_heads;
-    int offset = batch_offset[batch];
+    int step = step_list[batch];
+    int offset = batch_offset ? batch_offset[batch] : 0;
     float slope = get_ALiBiPE_slope(head, num_heads, ori_num_heads, rank);
-    for (int j = 0; j < seq_length; j++) {
-      out[batch * num_heads * 1 * seq_length + head * seq_length + j] =
+    for (int j = 0; j < step; j++) {
+      out[batch * num_heads * 1 * seq_length + head * step + j] =
          slope * (j - offset);
+      // we take this output tensor as a one-dimensional array in batch_MHA
+      // afterwards so it's 'head * step', not 'head * seq_length' otherwise the
+      // values updated will not be consecutively stored
     }
   });
 }
 template <typename T>
 void ALiBiPEKernelLauncher(T* out, int* batch_offset, int batch_size,
                            int seq_length, int num_heads, int ori_num_heads,
-                           int step, int rank) {
+                           int rank, bool is_context,
+                           std::vector<int>& step_list) {
   int N = batch_size * num_heads;
-  if (step - 1 == 0) {
+  if (is_context == true) {
     ALiBiPE_kernel(out, batch_offset, batch_size, seq_length, num_heads,
                    ori_num_heads, rank, N);
   } else {
-    ALiBiPE_decoder_kernel(out, batch_offset, batch_size, step, num_heads,
-                           ori_num_heads, rank, N);
+    ALiBiPE_decoder_kernel(out, batch_offset, batch_size, seq_length, num_heads,
+                           ori_num_heads, rank, N, step_list);
   }
 }
 
 template void ALiBiPEKernelLauncher<float>(float* out, int* batch_offset,
                                            int batch_size, int seq_length,
                                            int num_heads, int ori_num_heads,
-                                           int step, int rank);
+                                           int rank, bool is_context,
+                                           std::vector<int>& step_list);
 #ifdef ENABLE_FP16
 template void ALiBiPEKernelLauncher<half>(half* out, int* batch_offset,
                                           int batch_size, int seq_length,
                                           int num_heads, int ori_num_heads,
-                                          int step, int rank);
+                                          int rank, bool is_context,
+                                          std::vector<int>& step_list);
 #endif
 }  // namespace cpu
 }  // namespace allspark
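Note on the decoder-kernel change above: when requests in a batch sit at different generation steps, each (batch, head) lane now writes its own `step` bias values contiguously at `head * step`, so downstream batch_MHA can read the tensor back as a flat array. A standalone sketch of that layout follows; the slope formula here is the standard ALiBi 2^(-8(h+1)/n) for power-of-two head counts, whereas the repo's `get_ALiBiPE_slope` (not shown in this diff) also handles `ori_num_heads` and tensor-parallel `rank`, and `batch_offset` is taken as null, i.e. offset 0:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Standard ALiBi slope for head h of n heads (n a power of two).
float alibi_slope(int head, int num_heads) {
  return std::pow(2.0f, -8.0f * (head + 1) / num_heads);
}

int main() {
  int num_heads = 4, max_step = 8;      // tensor: [batch, 1, heads, max_step]
  std::vector<int> step_list = {8, 5};  // two requests at different steps
  int batch_size = static_cast<int>(step_list.size());
  std::vector<float> out(batch_size * num_heads * max_step, 0.0f);
  for (int batch = 0; batch < batch_size; ++batch) {
    int step = step_list[batch];
    for (int head = 0; head < num_heads; ++head) {
      float slope = alibi_slope(head, num_heads);
      for (int j = 0; j < step; ++j)
        // 'head * step', not 'head * max_step': each lane's values stay
        // consecutive, which is how batch_MHA reads the tensor back.
        out[batch * num_heads * max_step + head * step + j] = slope * j;
    }
  }
  std::printf("request 1, head 0, pos 4 -> %f\n",
              out[1 * num_heads * max_step + 0 * 5 + 4]);
  return 0;
}
```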

csrc/core/kernel/cpu/cpu_kernel.h

Lines changed: 3 additions & 1 deletion
@@ -7,6 +7,7 @@
 #include <common.h>
 #include <stdint.h>
 
+#include <map>
 #include <vector>
 namespace allspark {
 namespace cpu {
@@ -90,7 +91,8 @@ void RelativePEKernel(T* out, const T* attention_bias, int batch_size,
 template <typename T>
 void ALiBiPEKernelLauncher(T* out, int* batch_offset, int batch_size,
                            int seq_length, int num_heads, int ori_num_heads,
-                           int step, int rank);
+                           int rank, bool is_context,
+                           std::vector<int>& step_list);
 template <typename T>
 void MHAKernel(T* out, const T* q, const T* k, const T* v, const float* mask,
                T* score, int beam_size, int batch_size, int num_heads,
baichuan.cpp (new file)

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+/*!
+ * Copyright (c) Alibaba, Inc. and its affiliates.
+ * @file baichuan.cpp
+ */
+
+#include "baichuan.h"  // NOLINT
+
+namespace allspark {
+AsStatus BaichuanModel::Init(const TransformerProto& model_proto,
+                             const DeviceContext& ctx) {
+  DLOG(INFO) << "BaichuanModel::Init()" << std::endl;
+  AS_CHECK_STATUS(AsModel::Init(model_proto, ctx));
+  topo_ops_.clear();
+  // parse graph
+  for (auto& op : graph_ops_["decoder"]) {
+    topo_ops_.emplace_back(op.get());
+  }
+  if (model_proto.model_conf().is_generate())
+    for (auto& op : graph_ops_["gen_graph"]) {
+      topo_ops_.emplace_back(op.get());
+    }
+  return AsStatus::ALLSPARK_SUCCESS;
+}
+REGISTER_MODEL("Baichuan_v2", BaichuanModel_v2)
+REGISTER_MODEL("Baichuan", BaichuanModel)
+}  // namespace allspark
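Note: `REGISTER_MODEL` presumably ties the model-type strings "Baichuan" / "Baichuan_v2" from the converted graph to factories for these classes; the macro itself is defined elsewhere in the repo. A minimal sketch of a self-registering factory of that shape (hypothetical, simplified types, not AllSpark's actual macro):

```cpp
#include <functional>
#include <map>
#include <memory>
#include <string>

struct Model {  // stand-in for AsModel
  virtual ~Model() = default;
};

using ModelFactory = std::function<std::unique_ptr<Model>(const std::string&)>;

// Meyers-singleton registry keyed by model-type string.
std::map<std::string, ModelFactory>& ModelRegistry() {
  static std::map<std::string, ModelFactory> r;
  return r;
}

// Self-registration at static-init time, roughly what a macro of this
// shape expands to.
#define REGISTER_MODEL(name, cls)                          \
  static const bool cls##_registered = [] {                \
    ModelRegistry()[name] = [](const std::string& type) {  \
      return std::unique_ptr<Model>(new cls(type));        \
    };                                                     \
    return true;                                           \
  }();

struct BaichuanModel : Model {
  explicit BaichuanModel(const std::string&) {}
};
REGISTER_MODEL("Baichuan", BaichuanModel)

// At load time the engine can then do:
//   auto model = ModelRegistry().at("Baichuan")("Baichuan");
```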
baichuan.h (new file)

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+/*!
+ * Copyright (c) Alibaba, Inc. and its affiliates.
+ * @file baichuan.h
+ */
+
+#pragma once
+
+#include <core/model/model.h>
+
+#include <string>
+
+namespace allspark {
+
+class BaichuanModel : public AsModel {
+ public:
+  explicit BaichuanModel(const std::string& model_type = "")
+      : AsModel(model_type) {}
+  AsStatus Init(const TransformerProto& model_proto,
+                const DeviceContext& ctx) override;
+};
+
+class BaichuanModel_v2 : public BaichuanModel {
+ public:
+  explicit BaichuanModel_v2(const std::string& model_type = "")
+      : BaichuanModel(model_type){};
+};
+
+
+
+}  // namespace allspark

csrc/core/operator/general/ALiBiPE/ALiBiPE_op.cpp

Lines changed: 41 additions & 16 deletions
@@ -11,15 +11,16 @@
 
 namespace allspark {
 AsStatus cpu_ALiBiPE(DataType dtype, void* out, int* batch_offset, int batch,
-                     int seq_len, int num_heads, int ori_num_heads, int step,
-                     const DeviceContext* ctx) {
+                     int seq_len, int num_heads, int ori_num_heads,
+                     const DeviceContext* ctx, bool is_context,
+                     std::vector<int>& step_list) {
   DLOG(INFO) << "cpu_ALiBiPE" << std::endl;
   const CPUContext* cpu_ctx = static_cast<const CPUContext*>(ctx);
   auto functor = [&]<typename T>() {
     T* typed_out = static_cast<T*>(out);
     cpu::ALiBiPEKernelLauncher(typed_out, batch_offset, batch, seq_len,
-                               num_heads, ori_num_heads, step,
-                               cpu_ctx->GetRank());
+                               num_heads, ori_num_heads, cpu_ctx->GetRank(),
+                               is_context, step_list);
   };
   DispatchCPU(dtype, functor);
   return AsStatus::ALLSPARK_SUCCESS;
@@ -54,32 +55,56 @@ AsStatus ALiBiPEOp::Init(const OperatorProto& op_proto,
   return AsStatus::ALLSPARK_SUCCESS;
 }
 
-AsStatus ALiBiPEOp::Reshape() {
+AsStatus ALiBiPEOp::Reshape(RuntimeContext* runtime_ctx) {
   Shape in_shape = tensor_map_->at(in_names_[0])->GetShape();
-  batch_size_ = in_shape[0];
-  if (gen_ctx_->step == 0) {
+  if (runtime_ctx->is_context == true) {
+    batch_size_ = in_shape[0];
     seq_length_ = in_shape[1];
     Shape out_shape = Shape{batch_size_, seq_length_, num_heads_, seq_length_};
     AS_CHECK_STATUS(
         tensor_map_->at(out_names_[0])->SetShape(std::move(out_shape)));
-  } else {
-    seq_length_ = 1;
   }
+  return AsStatus::ALLSPARK_SUCCESS;
+}
 
+AsStatus ALiBiPEOp::runContext(RuntimeContext* runtime_ctx) {
+  int* batch_offset = nullptr;
+  AsTensor* out_tensor = tensor_map_->at(out_names_[0]).get();
+  std::vector<int> step_list;
+  kernel_launcher(out_tensor->GetDataType(), out_tensor->GetDataPtr(),
+                  batch_offset, batch_size_, seq_length_, num_heads_,
+                  ori_num_heads_, ctx_, runtime_ctx->is_context, step_list);
   return AsStatus::ALLSPARK_SUCCESS;
 }
 
-AsStatus ALiBiPEOp::Forward() {
+AsStatus ALiBiPEOp::runDecode(RuntimeContext* runtime_ctx) {
   int* batch_offset = nullptr;
   AsTensor* out_tensor = tensor_map_->at(out_names_[0]).get();
-  if (gen_ctx_->step != 0) {
-    Shape out_shape = Shape{batch_size_, 1, num_heads_, gen_ctx_->step + 1};
-    AS_CHECK_STATUS(
-        tensor_map_->at(out_names_[0])->SetShape(std::move(out_shape)));
+  int batch_size = runtime_ctx->GetGenCtxListSize();
+  std::vector<int> step_list(batch_size);
+  int max_step = 1;
+  for (int i = 0; i < batch_size; i++) {
+    GenerateContext* gen_ctx = runtime_ctx->GetGenCtx(i);
+    if (gen_ctx->step + 1 > max_step) {
+      max_step = gen_ctx->step + 1;
+    }
+    step_list[i] = gen_ctx->step + 1;
   }
+  Shape out_shape = Shape{batch_size, 1, num_heads_, max_step};
+  AS_CHECK_STATUS(
+      tensor_map_->at(out_names_[0])->SetShape(std::move(out_shape)));
   kernel_launcher(out_tensor->GetDataType(), out_tensor->GetDataPtr(),
-                  batch_offset, batch_size_, seq_length_, num_heads_,
-                  ori_num_heads_, (gen_ctx_->step + 1), ctx_);
+                  batch_offset, batch_size, max_step, num_heads_,
+                  ori_num_heads_, ctx_, runtime_ctx->is_context, step_list);
+  return AsStatus::ALLSPARK_SUCCESS;
+}
+
+AsStatus ALiBiPEOp::Forward(RuntimeContext* runtime_ctx) {
+  if (runtime_ctx->is_context == true) {
+    runContext(runtime_ctx);
+  } else {
+    runDecode(runtime_ctx);
+  }
   return AsStatus::ALLSPARK_SUCCESS;
 }
 
csrc/core/operator/general/ALiBiPE/ALiBiPE_op.h

Lines changed: 7 additions & 5 deletions
@@ -15,19 +15,21 @@ class ALiBiPEOp : public AsOperator {
       : AsOperator(op_type), batch_size_(1), seq_length_(1), num_heads_(1) {}
   AsStatus Init(const OperatorProto& op_proto, const DeviceContext& ctx,
                 const TensorMap& weights_map, TensorMap* tensor_map);
-  AsStatus Reshape() override;
-  AsStatus Forward() override;
+  AsStatus Reshape(RuntimeContext* runtime_ctx) override;
+  AsStatus Forward(RuntimeContext* runtime_ctx) override;
 
  private:
+  AsStatus runContext(RuntimeContext* runtime_ctx);
+  AsStatus runDecode(RuntimeContext* runtime_ctx);
   AsStatus (*kernel_launcher)(DataType dtype, void* out, int* batch_offset,
                               int batch, int seq_len, int num_heads,
-                              int ori_num_heads, int step,
-                              const DeviceContext* ctx) = nullptr;
+                              int ori_num_heads, const DeviceContext* ctx,
+                              bool is_context,
+                              std::vector<int>& step_list) = nullptr;
   int batch_size_;
   int seq_length_;
   int num_heads_;
   int ori_num_heads_;
-  int max_seq_;
 };
 
 }  // namespace allspark
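Note: `kernel_launcher` stays a plain function pointer so the op can bind a device-specific implementation once (the binding site, presumably in `Init`, is not part of this diff). A simplified sketch of the pattern with stand-in types:

```cpp
#include <cstdio>

// Simplified stand-in for the real launcher signature.
using KernelLauncher = void (*)(int batch, bool is_context);

void cpu_alibi_pe(int batch, bool is_context) {
  std::printf("cpu kernel: batch=%d, context=%d\n", batch, is_context);
}

struct AlibiOp {
  KernelLauncher kernel_launcher = nullptr;
  void Init() { kernel_launcher = cpu_alibi_pe; }  // bind once per device
  void Forward(bool is_context) { kernel_launcher(2, is_context); }
};

int main() {
  AlibiOp op;
  op.Init();
  op.Forward(true);   // context phase
  op.Forward(false);  // decode phase
  return 0;
}
```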

csrc/core/operator/generate_opt/batch_mha/batch_mha_op.cpp

Lines changed: 9 additions & 2 deletions
@@ -310,8 +310,15 @@ AsStatus BatchMHAOp::runOneBatch(GenerateContext* gen_ctx, int current_batch) {
   if (tensor_map_->at(in_names_[1])->GetShape().Count() == 0) {
     mask_buf = nullptr;
   }
-  void* position_embedding =
-      pos_embedding_ ? tensor_map_->at(in_names_[2])->GetDataPtr() : nullptr;
+  void* position_embedding = nullptr;
+  if (pos_embedding_ == true) {
+    const Shape& embedding_shape = tensor_map_->at(in_names_[2])->GetShape();
+    // shape: [batch_size, 1, num_heads, step + 1]
+    // in context phase, 'current_batch' passed by caller will always be 0
+    position_embedding = (char*)tensor_map_->at(in_names_[2])->GetDataPtr() +
+                         current_batch * embedding_shape[2] *
+                             embedding_shape[3] * SizeofType(dtype_);
+  }
   char* score_buf = (char*)(tensor_map_->at("workspace")->GetDataPtr());
   void** q_array = (void**)(score_buf + score_size_);
   void** k_array = q_array + round32(gemm_batch_);
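Note on the offset arithmetic above: with the decode-phase bias laid out as [batch_size, 1, num_heads, max_step] and `runOneBatch` handling one request at a time, the base pointer advances by `current_batch * num_heads * max_step` elements, converted to bytes. A standalone sketch with example sizes (fp32 assumed):

```cpp
#include <cstddef>
#include <cstdio>

int main() {
  // bias tensor: [batch_size, 1, num_heads, max_step], fp32 elements
  int num_heads = 4, max_step = 8, elem_bytes = 4;
  for (int current_batch = 0; current_batch < 3; ++current_batch) {
    std::size_t byte_offset = static_cast<std::size_t>(current_batch) *
                              num_heads * max_step * elem_bytes;
    std::printf("request %d reads its bias at byte %zu\n", current_batch,
                byte_offset);
  }
  return 0;
}
```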
