Skip to content

Commit 62d5f5e

Browse files
Merge pull request #307 from janhq/update-dev-from-master-2025-10-29-00-36
Sync master with upstream release b6869
2 parents 8d901a7 + 851553e commit 62d5f5e

28 files changed

+402
-117
lines changed

common/arg.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3248,7 +3248,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
32483248
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
32493249
add_opt(common_arg(
32503250
{"--embd-output-format"}, "FORMAT",
3251-
"empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix",
3251+
"empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
32523252
[](common_params & params, const std::string & value) {
32533253
params.embd_out = value;
32543254
}

common/json-schema-to-grammar.cpp

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -601,7 +601,10 @@ class SchemaConverter {
601601
}
602602

603603
std::string _resolve_ref(const std::string & ref) {
604-
std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
604+
auto it = ref.find('#');
605+
std::string ref_fragment = it != std::string::npos ? ref.substr(it + 1) : ref;
606+
static const std::regex nonalphanumeric_regex(R"([^a-zA-Z0-9-]+)");
607+
std::string ref_name = "ref" + std::regex_replace(ref_fragment, nonalphanumeric_regex, "-");
605608
if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
606609
_refs_being_resolved.insert(ref);
607610
json resolved = _refs[ref];
@@ -774,11 +777,24 @@ class SchemaConverter {
774777
std::vector<std::string> tokens = string_split(pointer, "/");
775778
for (size_t i = 1; i < tokens.size(); ++i) {
776779
std::string sel = tokens[i];
777-
if (target.is_null() || !target.contains(sel)) {
780+
if (target.is_object() && target.contains(sel)) {
781+
target = target[sel];
782+
} else if (target.is_array()) {
783+
size_t sel_index;
784+
try {
785+
sel_index = std::stoul(sel);
786+
} catch (const std::invalid_argument & e) {
787+
sel_index = target.size();
788+
}
789+
if (sel_index >= target.size()) {
790+
_errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
791+
return;
792+
}
793+
target = target[sel_index];
794+
} else {
778795
_errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
779796
return;
780797
}
781-
target = target[sel];
782798
}
783799
_refs[ref] = target;
784800
}

examples/embedding/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ The above command will output space-separated float values.
3838
| | multiple embeddings | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$
3939
| 'json' | openai style |
4040
| 'json+' | add cosine similarity matrix |
41+
| 'raw' | plain text output |
4142

4243
### --embd-separator $"string"$
4344
| $"string"$ | |

examples/embedding/embedding.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,29 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
7070
}
7171
}
7272

73+
// plain, pipe-friendly output: one embedding per line
74+
static void print_raw_embeddings(const float * emb,
75+
int n_embd_count,
76+
int n_embd,
77+
const llama_model * model,
78+
enum llama_pooling_type pooling_type,
79+
int embd_normalize) {
80+
const uint32_t n_cls_out = llama_model_n_cls_out(model);
81+
const bool is_rank = (pooling_type == LLAMA_POOLING_TYPE_RANK);
82+
const int cols = is_rank ? std::min<int>(n_embd, (int) n_cls_out) : n_embd;
83+
84+
for (int j = 0; j < n_embd_count; ++j) {
85+
for (int i = 0; i < cols; ++i) {
86+
if (embd_normalize == 0) {
87+
LOG("%1.0f%s", emb[j * n_embd + i], (i + 1 < cols ? " " : ""));
88+
} else {
89+
LOG("%1.7f%s", emb[j * n_embd + i], (i + 1 < cols ? " " : ""));
90+
}
91+
}
92+
LOG("\n");
93+
}
94+
}
95+
7396
int main(int argc, char ** argv) {
7497
common_params params;
7598

@@ -372,6 +395,8 @@ int main(int argc, char ** argv) {
372395
}
373396

374397
if (notArray) LOG("\n}\n");
398+
} else if (params.embd_out == "raw") {
399+
print_raw_embeddings(emb, n_embd_count, n_embd, model, pooling_type, params.embd_normalize);
375400
}
376401

377402
LOG("\n");

examples/json_schema_to_grammar.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -371,8 +371,17 @@ def visit(n: dict):
371371
raise ValueError(f'Unsupported ref {ref}')
372372

373373
for sel in ref.split('#')[-1].split('/')[1:]:
374-
assert target is not None and sel in target, f'Error resolving ref {ref}: {sel} not in {target}'
375-
target = target[sel]
374+
assert target is not None, f'Error resolving ref {ref}: {sel} not in {target}'
375+
if isinstance(target, list):
376+
try:
377+
sel_index = int(sel)
378+
except ValueError:
379+
raise ValueError(f'Error resolving ref {ref}: {sel} not in {target}')
380+
assert 0 <= sel_index < len(target), f'Error resolving ref {ref}: {sel} not in {target}'
381+
target = target[sel_index]
382+
else:
383+
assert sel in target, f'Error resolving ref {ref}: {sel} not in {target}'
384+
target = target[sel]
376385

377386
self._refs[ref] = target
378387
else:
@@ -547,7 +556,8 @@ def join_seq():
547556

548557

549558
def _resolve_ref(self, ref):
550-
ref_name = ref.split('/')[-1]
559+
ref_fragment = ref.split('#')[-1]
560+
ref_name = 'ref' + re.sub(r'[^a-zA-Z0-9-]+', '-', ref_fragment)
551561
if ref_name not in self._rules and ref not in self._refs_being_resolved:
552562
self._refs_being_resolved.add(ref)
553563
resolved = self._refs[ref]

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2234,7 +2234,7 @@ static void aclnn_cache_init(ggml_backend_cann_context & ctx,
22342234
ACL_MEM_MALLOC_HUGE_FIRST));
22352235

22362236
acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
2237-
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
2237+
theta_scale_ne, theta_scale_nb, 1);
22382238

22392239
float start = 0;
22402240
float step = 1;
@@ -2251,7 +2251,7 @@ static void aclnn_cache_init(ggml_backend_cann_context & ctx,
22512251
yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float));
22522252
void * yarn_ramp_buffer = yarn_ramp_allocator.get();
22532253
acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float), theta_scale_ne,
2254-
theta_scale_nb, GGML_MAX_DIMS);
2254+
theta_scale_nb, 1);
22552255
float zero_value = 0, one_value = 1;
22562256
float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
22572257
aclScalar * low = aclCreateScalar(&corr_dims[0], aclDataType::ACL_FLOAT);

ggml/src/ggml-cann/ggml-cann.cpp

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -67,19 +67,30 @@
6767
GGML_ABORT("CANN error");
6868
}
6969

70+
// Thread-local variable to record the current device of this thread.
71+
thread_local int g_current_cann_device = -1;
72+
7073
/**
71-
* @brief Sets the device to be used by CANN.
74+
* @brief Set the CANN device to be used.
7275
*
73-
* @param device The device ID to set.
76+
* @param device The target device ID to set.
7477
*/
7578
void ggml_cann_set_device(const int32_t device) {
76-
int current_device = -1;
77-
aclrtGetDevice(&current_device);
79+
// int current_device = -1;
80+
// Note: In some CANN versions, if no device has been set yet,
81+
// aclrtGetDevice(&current_device) may return 0 by default.
82+
// aclrtGetDevice(&current_device);
7883

79-
if (device == current_device) {
84+
// If the current device is already the target one, no need to switch.
85+
if (device == g_current_cann_device) {
8086
return;
8187
}
88+
89+
// Switch to the new device.
8290
ACL_CHECK(aclrtSetDevice(device));
91+
92+
// Update the global device record.
93+
g_current_cann_device = device;
8394
}
8495

8596
/**

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
#include "ggml-cuda/upscale.cuh"
5151
#include "ggml-cuda/wkv.cuh"
5252
#include "ggml-cuda/gla.cuh"
53+
#include "ggml-cuda/set.cuh"
5354
#include "ggml-cuda/set-rows.cuh"
5455
#include "ggml-cuda/pad_reflect_1d.cuh"
5556
#include "ggml.h"
@@ -2416,6 +2417,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
24162417
case GGML_OP_SET_ROWS:
24172418
ggml_cuda_op_set_rows(ctx, dst);
24182419
break;
2420+
case GGML_OP_SET:
2421+
ggml_cuda_op_set(ctx, dst);
2422+
break;
24192423
case GGML_OP_DUP:
24202424
ggml_cuda_dup(ctx, dst);
24212425
break;
@@ -3842,6 +3846,13 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
38423846
op->src[0]->type == GGML_TYPE_F32 &&
38433847
(op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
38443848
} break;
3849+
case GGML_OP_SET:
3850+
{
3851+
const ggml_type t = op->type;
3852+
return (t == GGML_TYPE_F32 || t == GGML_TYPE_I32) &&
3853+
t == op->src[0]->type &&
3854+
t == op->src[1]->type;
3855+
} break;
38453856
case GGML_OP_CPY:
38463857
{
38473858
ggml_type src0_type = op->src[0]->type;

ggml/src/ggml-cuda/mmvf.cu

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,10 @@ static __global__ void mul_mat_vec_f(
343343
}
344344

345345
dst[tid*stride_col_dst + row] = value;
346+
347+
if constexpr (!has_fusion) {
348+
GGML_UNUSED_VARS(use_gate, use_bias, use_gate_bias, glu_op, gate_x, x_bias, gate_bias, sumf_gate);
349+
}
346350
}
347351

348352
template<typename T, typename type_acc, int ncols_dst, int block_size>

ggml/src/ggml-cuda/mmvq.cu

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,10 @@ static __global__ void mul_mat_vec_q(
310310
dst[j*stride_col_dst + threadIdx.x] = result;
311311
}
312312
}
313+
314+
if constexpr (!has_fusion) {
315+
GGML_UNUSED_VARS(use_gate, use_bias, use_gate_bias, active_glu, gate_bias, x_bias, tmp_gate);
316+
}
313317
}
314318

315319
static std::pair<dim3, dim3> calc_launch_params(

Comments (0)