@@ -1394,6 +1394,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
13941394 params.merge_qkv = true ;
13951395 return true ;
13961396 }
1397+ if (arg == " -khad" || arg == " --k-cache-hadamard" ) {
1398+ params.k_cache_hadamard = true ;
1399+ return true ;
1400+ }
13971401 if (arg == " --numa" ) {
13981402 CHECK_ARG
13991403 std::string value (argv[i]);
@@ -2074,6 +2078,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
20742078 options.push_back ({ " *" , " -gr, --graph-reuse" , " enable graph reuse (default: %s)" , params.graph_reuse ? " enabled" : " disabled" });
20752079 options.push_back ({ " *" , " -ser, --smart-expert-reduction" , " experts reduction (default: %d,%g)" , params.min_experts , params.thresh_experts });
20762080 options.push_back ({ " *" , " -mqkv, --merge-qkv," , " merge Q,K,V (default: %d)" , params.merge_qkv });
2081+ options.push_back ({ " *" , " -khad, --k-cache-hadamard," , " Use Hadamard transform for K-cache (default: %d)" , params.k_cache_hadamard });
20772082 options.push_back ({ " *" , " -vq, --validate-quants" , " validate quantized data while loading the model (default: %d)" , params.validate_quants });
20782083 options.push_back ({ " *" , " -p, --prompt PROMPT" , " prompt to start generation with\n "
20792084 " in conversation mode, this will be used as system prompt\n "
@@ -3063,9 +3068,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
30633068 cparams.fused_mmad = params.fused_mmad ;
30643069 cparams.rope_cache = params.rope_cache ;
30653070 cparams.graph_reuse = params.graph_reuse ;
3071+ cparams.k_cache_hadamard = params.k_cache_hadamard ;
30663072 cparams.min_experts = params.min_experts ;
30673073 cparams.thresh_experts = params.thresh_experts ;
30683074 cparams.only_active_experts = params.only_active_exps ;
30693075 
30703076 cparams.type_k = kv_cache_type_from_str (params.cache_type_k );
30713077 cparams.type_v = kv_cache_type_from_str (params.cache_type_v );
@@ -4209,6 +4216,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
42094216 fprintf (stream, " fused_mmad: %s # default: true\n " , params.fused_mmad ? " true" : " false" );
42104217 fprintf (stream, " rope_cache: %s # default: false\n " , params.rope_cache ? " true" : " false" );
42114218 fprintf (stream, " graph_reuse: %s # default: false\n " , params.graph_reuse ? " true" : " false" );
4219+ fprintf (stream, " k_cache_hadamard: %s # default: false\n " , params.k_cache_hadamard ? " true" : " false" );
42124220 fprintf (stream, " ser: %d,%g # defaulr: -1,0\n " , params.min_experts , params.thresh_experts );
42134221 fprintf (stream, " temp: %f # default: 0.8\n " , sparams.temp );
42144222
0 commit comments