@@ -173,6 +173,7 @@ struct cmd_params {
173173 std::vector<bool > no_kv_offload;
174174 std::vector<std::vector<float >> tensor_split;
175175 std::vector<bool > use_mmap;
176+ std::vector<bool > embeddings;
176177 int reps;
177178 bool verbose;
178179 output_formats output_format;
@@ -192,6 +193,7 @@ static const cmd_params cmd_params_defaults = {
192193 /* no_kv_offload */ {false },
193194 /* tensor_split */ {std::vector<float >(llama_max_devices (), 0 .0f )},
194195 /* use_mmap */ {true },
196+ /* embeddings */ {false },
195197 /* reps */ 5 ,
196198 /* verbose */ false ,
197199 /* output_format */ MARKDOWN
@@ -214,6 +216,7 @@ static void print_usage(int /* argc */, char ** argv) {
214216 printf (" -mg, --main-gpu <i> (default: %s)\n " , join (cmd_params_defaults.main_gpu , " ," ).c_str ());
215217 printf (" -nkvo, --no-kv-offload <0|1> (default: %s)\n " , join (cmd_params_defaults.no_kv_offload , " ," ).c_str ());
216218 printf (" -mmp, --mmap <0|1> (default: %s)\n " , join (cmd_params_defaults.use_mmap , " ," ).c_str ());
219+ printf (" -embd, --embeddings <0|1> (default: %s)\n " , join (cmd_params_defaults.embeddings , " ," ).c_str ());
217220 printf (" -ts, --tensor_split <ts0/ts1/..> (default: 0)\n " );
218221 printf (" -r, --repetitions <n> (default: %d)\n " , cmd_params_defaults.reps );
219222 printf (" -o, --output <csv|json|md|sql> (default: %s)\n " , output_format_str (cmd_params_defaults.output_format ));
@@ -382,6 +385,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
382385 }
383386 auto p = split<bool >(argv[i], split_delim);
384387 params.use_mmap .insert (params.use_mmap .end (), p.begin (), p.end ());
388+ } else if (arg == " -embd" || arg == " --embeddings" ) {
389+ if (++i >= argc) {
390+ invalid_param = true ;
391+ break ;
392+ }
393+ auto p = split<bool >(argv[i], split_delim);
394+ params.embeddings .insert (params.embeddings .end (), p.begin (), p.end ());
385395 } else if (arg == " -ts" || arg == " --tensor-split" ) {
386396 if (++i >= argc) {
387397 invalid_param = true ;
@@ -453,6 +463,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
453463 if (params.no_kv_offload .empty ()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload ; }
454464 if (params.tensor_split .empty ()) { params.tensor_split = cmd_params_defaults.tensor_split ; }
455465 if (params.use_mmap .empty ()) { params.use_mmap = cmd_params_defaults.use_mmap ; }
466+ if (params.embeddings .empty ()) { params.embeddings = cmd_params_defaults.embeddings ; }
456467 if (params.n_threads .empty ()) { params.n_threads = cmd_params_defaults.n_threads ; }
457468
458469 return params;
@@ -472,6 +483,7 @@ struct cmd_params_instance {
472483 bool no_kv_offload;
473484 std::vector<float > tensor_split;
474485 bool use_mmap;
486+ bool embeddings;
475487
476488 llama_model_params to_llama_mparams () const {
477489 llama_model_params mparams = llama_model_default_params ();
@@ -502,6 +514,7 @@ struct cmd_params_instance {
502514 cparams.type_k = type_k;
503515 cparams.type_v = type_v;
504516 cparams.offload_kqv = !no_kv_offload;
517+ cparams.embeddings = embeddings;
505518
506519 return cparams;
507520 }
@@ -517,6 +530,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
517530 for (const auto & mg : params.main_gpu )
518531 for (const auto & ts : params.tensor_split )
519532 for (const auto & mmp : params.use_mmap )
533+ for (const auto & embd : params.embeddings )
520534 for (const auto & nb : params.n_batch )
521535 for (const auto & tk : params.type_k )
522536 for (const auto & tv : params.type_v )
@@ -540,6 +554,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
540554 /* .no_kv_offload= */ nkvo,
541555 /* .tensor_split = */ ts,
542556 /* .use_mmap = */ mmp,
557+ /* .embeddings = */ embd,
543558 };
544559 instances.push_back (instance);
545560 }
@@ -562,6 +577,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
562577 /* .no_kv_offload= */ nkvo,
563578 /* .tensor_split = */ ts,
564579 /* .use_mmap = */ mmp,
580+ /* .embeddings = */ embd,
565581 };
566582 instances.push_back (instance);
567583 }
@@ -597,6 +613,7 @@ struct test {
597613 bool no_kv_offload;
598614 std::vector<float > tensor_split;
599615 bool use_mmap;
616+ bool embeddings;
600617 int n_prompt;
601618 int n_gen;
602619 std::string test_time;
@@ -619,6 +636,7 @@ struct test {
619636 no_kv_offload = inst.no_kv_offload ;
620637 tensor_split = inst.tensor_split ;
621638 use_mmap = inst.use_mmap ;
639+ embeddings = inst.embeddings ;
622640 n_prompt = inst.n_prompt ;
623641 n_gen = inst.n_gen ;
624642 // RFC 3339 date-time format
@@ -690,7 +708,7 @@ struct test {
690708 " n_batch" , " n_threads" , " type_k" , " type_v" ,
691709 " n_gpu_layers" , " split_mode" ,
692710 " main_gpu" , " no_kv_offload" ,
693- " tensor_split" , " use_mmap" ,
711+ " tensor_split" , " use_mmap" , " embeddings " ,
694712 " n_prompt" , " n_gen" , " test_time" ,
695713 " avg_ns" , " stddev_ns" ,
696714 " avg_ts" , " stddev_ts"
@@ -710,7 +728,7 @@ struct test {
710728 }
711729 if (field == " cuda" || field == " opencl" || field == " vulkan" || field == " kompute" || field == " metal" ||
712730 field == " gpu_blas" || field == " blas" || field == " sycl" ||field == " f16_kv" || field == " no_kv_offload" ||
713- field == " use_mmap" ) {
731+ field == " use_mmap" || field == " embeddings " ) {
714732 return BOOL;
715733 }
716734 if (field == " avg_ts" || field == " stddev_ts" ) {
@@ -744,7 +762,7 @@ struct test {
744762 std::to_string (n_batch), std::to_string (n_threads), ggml_type_name (type_k), ggml_type_name (type_v),
745763 std::to_string (n_gpu_layers), split_mode_str (split_mode),
746764 std::to_string (main_gpu), std::to_string (no_kv_offload),
747- tensor_split_str, std::to_string (use_mmap),
765+ tensor_split_str, std::to_string (use_mmap), std::to_string (embeddings),
748766 std::to_string (n_prompt), std::to_string (n_gen), test_time,
749767 std::to_string (avg_ns ()), std::to_string (stdev_ns ()),
750768 std::to_string (avg_ts ()), std::to_string (stdev_ts ())
@@ -914,6 +932,9 @@ struct markdown_printer : public printer {
914932 if (field == " use_mmap" ) {
915933 return " mmap" ;
916934 }
935+ if (field == " embeddings" ) {
936+ return " embd" ;
937+ }
917938 if (field == " tensor_split" ) {
918939 return " ts" ;
919940 }
@@ -957,6 +978,9 @@ struct markdown_printer : public printer {
957978 if (params.use_mmap .size () > 1 || params.use_mmap != cmd_params_defaults.use_mmap ) {
958979 fields.emplace_back (" use_mmap" );
959980 }
981+ if (params.embeddings .size () > 1 || params.embeddings != cmd_params_defaults.embeddings ) {
982+ fields.emplace_back (" embeddings" );
983+ }
960984 fields.emplace_back (" test" );
961985 fields.emplace_back (" t/s" );
962986
0 commit comments