
Commit 64cc47c

Merge pull request #318 from janhq/update-dev-from-master-2025-11-09-00-38
Sync master with upstream release b6992
2 parents: 164c0c5 + aa3b7a9

37 files changed (+2250, -682 lines)

.github/workflows/build.yml

Lines changed: 10 additions & 8 deletions

@@ -161,15 +161,16 @@ jobs:
       - name: Dawn Dependency
         id: dawn-depends
         run: |
-          DAWN_VERSION="v1.0.0"
+          DAWN_VERSION="v2.0.0"
           DAWN_OWNER="reeselevine"
           DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-macos-latest-Release.tar.gz"
+          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.zip"
           echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
-          curl -L -o artifact.tar.gz \
+          curl -L -o artifact.zip \
             "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
           mkdir dawn
-          tar -xvf artifact.tar.gz -C dawn --strip-components=1
+          unzip artifact.zip
+          tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-macos-latest-Release.tar.gz -C dawn --strip-components=1
 
       - name: Build
         id: cmake_build

@@ -521,15 +522,16 @@ jobs:
         id: dawn-depends
         run: |
           sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
-          DAWN_VERSION="v1.0.0"
+          DAWN_VERSION="v2.0.0"
           DAWN_OWNER="reeselevine"
           DAWN_REPO="dawn"
-          DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-ubuntu-latest-Release.tar.gz"
+          DAWN_ASSET_NAME="Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.zip"
           echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
-          curl -L -o artifact.tar.gz \
+          curl -L -o artifact.zip \
             "https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
           mkdir dawn
-          tar -xvf artifact.tar.gz -C dawn --strip-components=1
+          unzip artifact.zip
+          tar -xvf Dawn-5e9a4865b1635796ccc77dd30057f2b4002a1355-ubuntu-latest-Release.tar.gz -C dawn --strip-components=1
 
       - name: Build
         id: cmake_build
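
Judging by the new steps, the v2.0.0 Dawn release assets are now zip archives that wrap the per-platform tar.gz, so extraction becomes a two-stage unzip-then-untar instead of a single tar invocation.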

common/arg.cpp

Lines changed: 14 additions & 0 deletions

@@ -740,6 +740,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             exit(0);
         }
     ));
+    add_opt(common_arg(
+        {"-cl", "--cache-list"},
+        "show list of models in cache",
+        [](common_params &) {
+            printf("model cache directory: %s\n", fs_get_cache_directory().c_str());
+            auto models = common_list_cached_models();
+            printf("number of models in cache: %zu\n", models.size());
+            for (size_t i = 0; i < models.size(); i++) {
+                auto & model = models[i];
+                printf("%4d. %s\n", (int) i + 1, model.to_string().c_str());
+            }
+            exit(0);
+        }
+    ));
     add_opt(common_arg(
         {"--completion-bash"},
         "print source-able bash completion script for llama.cpp",

common/common.cpp

Lines changed: 33 additions & 0 deletions

@@ -908,6 +908,39 @@ std::string fs_get_cache_file(const std::string & filename) {
     return cache_directory + filename;
 }
 
+std::vector<common_file_info> fs_list_files(const std::string & path) {
+    std::vector<common_file_info> files;
+    if (path.empty()) return files;
+
+    std::filesystem::path dir(path);
+    if (!std::filesystem::exists(dir) || !std::filesystem::is_directory(dir)) {
+        return files;
+    }
+
+    for (const auto & entry : std::filesystem::directory_iterator(dir)) {
+        try {
+            // Only include regular files (skip directories)
+            const auto & p = entry.path();
+            if (std::filesystem::is_regular_file(p)) {
+                common_file_info info;
+                info.path = p.string();
+                info.name = p.filename().string();
+                try {
+                    info.size = static_cast<size_t>(std::filesystem::file_size(p));
+                } catch (const std::filesystem::filesystem_error &) {
+                    info.size = 0;
+                }
+                files.push_back(std::move(info));
+            }
+        } catch (const std::filesystem::filesystem_error &) {
+            // skip entries we cannot inspect
+            continue;
+        }
+    }
+
+    return files;
+}
+
 //
 // Model utils
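
A minimal usage sketch for the new helper (the main wrapper is illustrative; fs_list_files, fs_get_cache_directory, and common_file_info are the declarations from common/common.h in the next hunk):

    #include "common.h"

    #include <cstdio>

    int main() {
        // enumerate regular files in the llama.cpp cache directory;
        // subdirectories and unreadable entries are skipped by fs_list_files
        for (const auto & f : fs_list_files(fs_get_cache_directory())) {
            printf("%10zu bytes  %s\n", f.size, f.name.c_str());
        }
        return 0;
    }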

common/common.h

Lines changed: 7 additions & 0 deletions

@@ -611,6 +611,13 @@ bool fs_create_directory_with_parents(const std::string & path);
 std::string fs_get_cache_directory();
 std::string fs_get_cache_file(const std::string & filename);
 
+struct common_file_info {
+    std::string path;
+    std::string name;
+    size_t size = 0; // in bytes
+};
+std::vector<common_file_info> fs_list_files(const std::string & path);
+
 //
 // Model utils
 //

common/download.cpp

Lines changed: 45 additions & 5 deletions

@@ -50,6 +50,22 @@ using json = nlohmann::ordered_json;
 // downloader
 //
 
+// validate repo name format: owner/repo
+static bool validate_repo_name(const std::string & repo) {
+    static const std::regex repo_regex(R"(^[A-Za-z0-9_.\-]+\/[A-Za-z0-9_.\-]+$)");
+    return std::regex_match(repo, repo_regex);
+}
+
+static std::string get_manifest_path(const std::string & repo, const std::string & tag) {
+    // we use "=" to avoid clashing with other components, while still being allowed on windows
+    std::string fname = "manifest=" + repo + "=" + tag + ".json";
+    if (!validate_repo_name(repo)) {
+        throw std::runtime_error("error: repo name must be in the format 'owner/repo'");
+    }
+    string_replace_all(fname, "/", "=");
+    return fs_get_cache_file(fname);
+}
+
 static std::string read_file(const std::string & fname) {
     std::ifstream file(fname);
     if (!file) {

@@ -829,17 +845,13 @@ common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, cons
     // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
     // User-Agent header is already set in common_remote_get_content, no need to set it here
 
-    // we use "=" to avoid clashing with other component, while still being allowed on windows
-    std::string cached_response_fname = "manifest=" + hf_repo + "=" + tag + ".json";
-    string_replace_all(cached_response_fname, "/", "_");
-    std::string cached_response_path = fs_get_cache_file(cached_response_fname);
-
     // make the request
     common_remote_params params;
     params.headers = headers;
     long res_code = 0;
     std::string res_str;
     bool use_cache = false;
+    std::string cached_response_path = get_manifest_path(hf_repo, tag);
     if (!offline) {
         try {
             auto res = common_remote_get_content(url, params);

@@ -895,6 +907,33 @@ common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, cons
     return { hf_repo, ggufFile, mmprojFile };
 }
 
+std::vector<common_cached_model_info> common_list_cached_models() {
+    std::vector<common_cached_model_info> models;
+    const std::string cache_dir = fs_get_cache_directory();
+    const std::vector<common_file_info> files = fs_list_files(cache_dir);
+    for (const auto & file : files) {
+        if (string_starts_with(file.name, "manifest=") && string_ends_with(file.name, ".json")) {
+            common_cached_model_info model_info;
+            model_info.manifest_path = file.path;
+            std::string fname = file.name;
+            string_replace_all(fname, ".json", ""); // remove extension
+            auto parts = string_split<std::string>(fname, '=');
+            if (parts.size() == 4) {
+                // expected format: manifest=<user>=<model>=<tag>
+                model_info.user  = parts[1];
+                model_info.model = parts[2];
+                model_info.tag   = parts[3];
+            } else {
+                // invalid format
+                continue;
+            }
+            model_info.size = 0; // TODO: get GGUF size, not manifest size
+            models.push_back(model_info);
+        }
+    }
+    return models;
+}
+
 //
 // Docker registry functions
 //

@@ -959,6 +998,7 @@ std::string common_docker_resolve_model(const std::string & docker) {
     std::string token = common_docker_get_token(repo); // Get authentication token
 
     // Get manifest
+    // TODO: cache the manifest response so that it appears in the model list
     const std::string url_prefix = "https://registry-1.docker.io/v2/" + repo;
     std::string manifest_url = url_prefix + "/manifests/" + tag;
     common_remote_params manifest_params;
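
To make the naming scheme concrete, here is the round trip for an illustrative repo and tag, traced through the helpers above:

    // encode (get_manifest_path): "manifest=" + repo + "=" + tag + ".json",
    // then '/' -> '=', giving the cache file name:
    //   manifest=bartowski=Llama-3.2-3B-Instruct-GGUF=q4.json
    //
    // decode (common_list_cached_models): strip ".json", split on '=':
    //   {"manifest", "bartowski", "Llama-3.2-3B-Instruct-GGUF", "q4"}  // 4 parts
    //   -> user = "bartowski", model = "Llama-3.2-3B-Instruct-GGUF", tag = "q4"
    //   -> to_string() == "bartowski/Llama-3.2-3B-Instruct-GGUF:q4"

Note that the replaced inline code mapped '/' to '_' instead of '=', so a manifest cached before this change splits into only three parts, fails the four-part check, and is skipped rather than listed.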

common/download.h

Lines changed: 18 additions & 4 deletions

@@ -8,16 +8,23 @@ struct common_params_model;
 // download functionalities
 //
 
+struct common_cached_model_info {
+    std::string manifest_path;
+    std::string user;
+    std::string model;
+    std::string tag;
+    size_t size = 0; // GGUF size in bytes
+    std::string to_string() const {
+        return user + "/" + model + ":" + tag;
+    }
+};
+
 struct common_hf_file_res {
     std::string repo; // repo name with ":tag" removed
     std::string ggufFile;
    std::string mmprojFile;
 };
 
-// resolve and download model from Docker registry
-// return local path to downloaded model file
-std::string common_docker_resolve_model(const std::string & docker);
-
 /**
  * Allow getting the HF file from the HF repo with tag (like ollama), for example:
  * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4

@@ -39,3 +46,10 @@ bool common_download_model(
     const common_params_model & model,
     const std::string & bearer_token,
     bool offline);
+
+// returns list of cached models
+std::vector<common_cached_model_info> common_list_cached_models();
+
+// resolve and download model from Docker registry
+// return local path to downloaded model file
+std::string common_docker_resolve_model(const std::string & docker);

ggml/CMakeLists.txt

Lines changed: 1 addition & 1 deletion

@@ -168,7 +168,7 @@ option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
 option(GGML_RV_ZVFH      "ggml: enable riscv zvfh"      ON)
 option(GGML_RV_ZICBOP    "ggml: enable riscv zicbop"    ON)
 option(GGML_XTHEADVECTOR "ggml: enable xtheadvector"    OFF)
-option(GGML_VXE          "ggml: enable vxe"             ON)
+option(GGML_VXE          "ggml: enable vxe"             ${GGML_NATIVE})
 
 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
 set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
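
GGML_VXE (s390x vector extensions) now defaults to the value of GGML_NATIVE instead of unconditionally ON, so builds configured with GGML_NATIVE=OFF (e.g. portable multi-variant builds) no longer enable VXE implicitly.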

ggml/src/ggml-cuda/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -124,6 +124,7 @@ if (CUDAToolkit_FOUND)
 
     if (GGML_CUDA_DEBUG)
         list(APPEND CUDA_FLAGS -lineinfo)
+        add_compile_definitions(GGML_CUDA_DEBUG)
     endif()
 
     if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
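
Previously GGML_CUDA_DEBUG only added -lineinfo to the CUDA flags and, judging by this hunk, never defined a matching preprocessor macro, so code guarded by #ifdef GGML_CUDA_DEBUG (see the ggml-cuda.cu hunks below) was compiled out even with the option ON. Adding the compile definition makes those debug paths take effect.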

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 11 additions & 28 deletions

@@ -27,7 +27,6 @@
 #include "ggml-cuda/mmq.cuh"
 #include "ggml-cuda/mmvf.cuh"
 #include "ggml-cuda/mmvq.cuh"
-#include "ggml-cuda/moe-expert-reduce.cuh"
 #include "ggml-cuda/norm.cuh"
 #include "ggml-cuda/opt-step-adamw.cuh"
 #include "ggml-cuda/opt-step-sgd.cuh"

@@ -3152,8 +3151,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
-
-
 #ifdef GGML_CUDA_DEBUG
         const int nodes_fused = i - prev_i - 1;
         prev_i = i;

@@ -3199,31 +3196,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             continue;
         }
 
-        if (node->op == GGML_OP_MUL) {
-            int current_node = i + 1;
-            int num_views = 0;
-            int num_adds = 0;
-            while (current_node < cgraph->n_nodes && cgraph->nodes[current_node]->op == GGML_OP_VIEW) {
-                num_views++;
-                current_node++;
-            }
-
-            while (current_node < cgraph->n_nodes && cgraph->nodes[current_node]->op == GGML_OP_ADD &&
-                   num_adds < num_views - 1) {
-                num_adds++;
-                current_node++;
-            }
-
-            if (num_adds == num_views - 1 && num_views > 0) {
-                ggml_tensor * dst_node = cgraph->nodes[current_node - 1];
-                if (ggml_cuda_should_use_moe_expert_reduce(cgraph, i, current_node)) {
-                    ggml_cuda_op_moe_expert_reduce(*cuda_ctx, node->src[0], node->src[1], dst_node);
-                    i += num_views + num_adds;
-                    continue;
-                }
-            }
-        }
-
         if (node->op == GGML_OP_ADD) {
             int n_fuse = 0;
             ggml_op ops[8];

@@ -3302,6 +3274,13 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                 continue;
             }
 
+            // we don't support repeating adds
+            if (bias_op == GGML_OP_ADD &&
+                (!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) ||
+                 !ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) {
+                continue;
+            }
+
             const ggml_tensor * src0 = up_n->src[0];
             const ggml_tensor * src1 = up_n->src[1];
             const ggml_tensor * ids  = up_n->src[2];

@@ -3411,6 +3390,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
                 continue;
             }
 
+            if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) {
+                continue;
+            }
+
             ggml_cuda_mm_fusion_args_host fusion_data{};
             fusion_data.x_bias = bias_tensor;
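
The two new ggml_are_same_shape() guards restrict bias fusion to exact elementwise adds: if the bias would have to be broadcast ("repeated") across the other operand, the fused path is skipped. A minimal sketch under assumed shapes (ctx is an initialized ggml context; the dimensions are illustrative):

    // [n_embd, n_tokens] activations plus an [n_embd, 1] bias would repeat the
    // bias across tokens, so the shapes differ and the fusion check bails out
    ggml_tensor * x    = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 32);
    ggml_tensor * bias = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4096, 1);
    // ggml_are_same_shape(x, bias) == false  ->  continue (unfused path)

This commit also removes the moe-expert-reduce fusion dispatch (and its include) from graph evaluation.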

ggml/src/ggml-cuda/mmq.cuh

Lines changed: 1 addition & 1 deletion

@@ -3494,7 +3494,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
     const int col_diff = col_high - col_low;
 
     for (int j = threadIdx.y*warp_size + threadIdx.x; j < mmq_x; j += nwarps*warp_size) {
-        ids_dst_shared[j] = ids_dst[col_low + j];
+        ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
     }
     __syncthreads();
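
This one-line change fixes the index used to stage destination ids into shared memory in the stream-k fixup kernel. Assuming jt is the tile index along the output-column dimension and mmq_x the tile width, as the surrounding kernel suggests, tile jt = 2 with mmq_x = 8 now reads ids_dst[col_low + 16 .. col_low + 23]; the old code read ids_dst[col_low + 0 .. col_low + 7], i.e. the first tile's ids, for every tile.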
