diff --git a/torchao/_models/llama/README.md b/torchao/_models/llama/README.md
index 99f1919fc9..ebda1abb1b 100644
--- a/torchao/_models/llama/README.md
+++ b/torchao/_models/llama/README.md
@@ -1,5 +1,7 @@
 # Llama Benchmarks
 
+> :warning: **Warning:** These benchmarks are deprecated.
+
 The llama folder contains code/scripts for stable benchmarking llama models.
 
 To get model weights, go to https://huggingface.co/meta-llama/Llama-2-7b, https://huggingface.co/meta-llama/Meta-Llama-3-8B, https://huggingface.co/meta-llama/Meta-Llama-3.1-8B
@@ -8,8 +10,8 @@ and follow the steps to gain access.
 
 Then from the torchao root directory use `huggingface-cli login` and follow the steps to login, then `sh ./scripts/prepare.sh` to download and convert the model weights
 
-once done you can execute benchmarks from the torchao/_models/llama dir with `sh benchmarks.sh`. You can perform and benchmarking or evaluation
-directly using `generate.py` or `eval.py`.
+once done you can execute benchmarks from the torchao/_models/llama dir with `sh benchmarks.sh`. You can perform benchmarking
+directly using `generate.py`.
 
 ## KV Cache Quantization - Memory Efficient Inference
 We've added some features to `model.py` compared to the original gpt-fast implementation in order to enable long context length (and necessarily memory efficient) inference. Specifically we've added kv_cache quantization and a linear_causal_mask implementation which are **able to reduce memory usage by 50-60%** at long context lengths.
diff --git a/torchao/_models/llama/eval.py b/torchao/_models/llama/eval.py
deleted file mode 100644
index 002045215c..0000000000
--- a/torchao/_models/llama/eval.py
+++ /dev/null
@@ -1,445 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
-import time -from pathlib import Path -from typing import List, Optional - -import torch -from generate import ( - _load_model, - device_sync, -) -from tokenizer import get_tokenizer - -import torchao -from torchao._models.llama.model import prepare_inputs_for_model -from torchao.prototype.mx_formats.inference_workflow import ( - MXDynamicActivationMXWeightConfig, - NVFP4DynamicActivationNVFP4WeightConfig, -) -from torchao.quantization import ( - Float8DynamicActivationFloat8WeightConfig, - Float8WeightOnlyConfig, - FPXWeightOnlyConfig, - Int4WeightOnlyConfig, - Int8DynamicActivationInt8WeightConfig, - Int8WeightOnlyConfig, - PerBlock, - PerRow, - PerTensor, - UIntXWeightOnlyConfig, - quantize_, -) - - -def run_evaluation( - checkpoint_path: Path, - tasks: List[str], - limit: Optional[int] = None, - device="cuda", - precision=torch.bfloat16, - quantization: Optional[str] = None, - sparsity: Optional[str] = None, - compile=False, - max_length=None, - calibration_tasks: Optional[List[str]] = None, - calibration_limit: Optional[int] = None, - calibration_seq_length: Optional[int] = None, - pad_calibration_inputs: bool = False, - print_model: bool = False, -): - """Runs the evaluation of a model using LM Eval.""" - print( - f"\nEvaluating model {checkpoint_path} on tasks: {tasks}, limit: {limit}, device: {device}, precision: {precision}, " - + f"quantization: {quantization}, sparsity: {sparsity}, compile: {compile}, max_length: {max_length}, calibration_tasks: {calibration_tasks}, " - + f"calibration_seq_length: {calibration_seq_length}, pad_calibration_inputs: {pad_calibration_inputs}\n" - ) - torchao.quantization.utils.recommended_inductor_config_setter() - - assert checkpoint_path.is_file(), checkpoint_path - tokenizer_path = checkpoint_path.parent / "tokenizer.model" - assert tokenizer_path.is_file(), str(tokenizer_path) - # Load Model and Tokenizer - print("Loading model ...") - t0 = time.time() - model = _load_model(checkpoint_path, "cpu", precision) - - if max_length is None: - max_length = model.config.block_size - device_sync(device=device) # MKG - print(f"Time to load model: {time.time() - t0:.02f} seconds") - tokenizer = get_tokenizer(tokenizer_path, checkpoint_path) - - if quantization: - if "spinquant" in quantization: - from torchao.prototype.spinquant import apply_spinquant - - apply_spinquant(model) - if "int8wo" in quantization: - quantize_(model, Int8WeightOnlyConfig()) - if "int8dq" in quantization: - quantize_(model, Int8DynamicActivationInt8WeightConfig()) - if "fp6" in quantization: - quantize_(model, FPXWeightOnlyConfig(3, 2)) - if "int4wo" in quantization and not "gptq" in quantization: - if "hqq" in quantization: - use_hqq = True - else: - use_hqq = False - groupsize = int(quantization.split("-")[1]) - assert groupsize in [32, 64, 128, 256], ( - f"int4wo groupsize needs to be one of [32,64,128,256] but got {groupsize}" - ) - quantize_( - model.to(device), - Int4WeightOnlyConfig(group_size=groupsize, use_hqq=use_hqq, version=1), - ) - if "uintx" in quantization: - # uintx-nbits-groupsize - # "uintx-2-64" - if "hqq" in quantization: - use_hqq = True - else: - use_hqq = False - _quant_args = quantization.split("-") - nbits = int(_quant_args[1]) - _NBITS_TO_DTYPE = { - 1: torch.uint1, - 2: torch.uint2, - 3: torch.uint3, - 4: torch.uint4, - 5: torch.uint5, - 6: torch.uint6, - 7: torch.uint7, - 8: torch.uint8, - } - dtype = _NBITS_TO_DTYPE[nbits] - group_size = int(_quant_args[2]) - quantize_(model, UIntXWeightOnlyConfig(dtype, group_size, use_hqq=use_hqq)) - if "marlin" in 
quantization: - from torchao.dtypes import MarlinSparseLayout - - quantize_( - model, Int4WeightOnlyConfig(layout=MarlinSparseLayout(), version=1) - ) - if "int4wo" in quantization and "gptq" in quantization: - # avoid circular imports - from torchao._models._eval import LMEvalInputRecorder - from torchao.quantization.GPTQ import Int4WeightOnlyGPTQQuantizer - - groupsize = int(quantization.split("-")[-2]) - assert groupsize in [32, 64, 128, 256], ( - f"int4wo groupsize needs to be one of [32,64,128,256] but got {groupsize}" - ) - assert precision == torch.bfloat16, ( - f"{quantization} requires precision or bfloat16 but got {precision}" - ) - assert "cuda" in device, "int4 gptq quantization only works on cuda" - inputs = ( - LMEvalInputRecorder( - tokenizer, - calibration_seq_length, - prepare_inputs_for_model, - model.config.vocab_size, - pad_calibration_inputs, - device="cpu", - ) - .record_inputs( - calibration_tasks, - calibration_limit, - ) - .get_recorded_inputs() - ) - print("Obtained inputs, starting calibration") - quantizer = Int4WeightOnlyGPTQQuantizer(group_size=groupsize, device=device) - model.setup_caches(max_batch_size=1, max_seq_length=calibration_seq_length) - quantizer.quantize(model, *inputs) - model = model.to(device) - if "float8wo" in quantization: - quantize_(model, Float8WeightOnlyConfig()) - if "float8dq" in quantization: - granularity = str(quantization.split("-")[-1]) - if granularity == "tensor": - granularity = PerTensor() - elif granularity == "row": - granularity = PerRow() - else: - if granularity == "float8dq": - granularity = PerTensor() - else: - raise ValueError(f"Unknown granularity {granularity}") - quantize_( - model, - Float8DynamicActivationFloat8WeightConfig(granularity=granularity), - filter_fn=lambda mod, fqn: isinstance(mod, torch.nn.Linear) - and fqn != "output", - ) - if quantization == "float8_a1x128_w128x128": - config = Float8DynamicActivationFloat8WeightConfig( - granularity=(PerBlock([1, 128]), PerBlock([128, 128])), - activation_value_lb=1e-12, - ) - # TODO(future): all workflows in this file should be skipping quantization - # of `lm_head`/`output` - quantize_(model, config) - if quantization == "mxfp8": - config = MXDynamicActivationMXWeightConfig( - activation_dtype=torch.float8_e4m3fn, - weight_dtype=torch.float8_e4m3fn, - ) - # TODO(future): all workflows in this file should be skipping quantization - # of `lm_head`/`output` - quantize_( - model, - config, - filter_fn=lambda mod, fqn: isinstance(mod, torch.nn.Linear) - and fqn != "output", - ) - if quantization == "nvfp4": - config = NVFP4DynamicActivationNVFP4WeightConfig( - use_dynamic_per_tensor_scale=True, - use_triton_kernel=True, - ) - # TODO(future): all workflows in this file should be skipping quantization - # of `lm_head`/`output` - quantize_( - model, - config, - filter_fn=lambda mod, fqn: isinstance(mod, torch.nn.Linear) - and fqn != "output", - ) - if "autoround" in quantization: - from transformers import AutoTokenizer - - from torchao._models.llama.model import TransformerBlock - from torchao.prototype.autoround.autoround_llm import ( - quantize_model_with_autoround_, - ) - - _tokenizer = AutoTokenizer.from_pretrained(checkpoint_path.parent) - # parse args from quantization string: - # autoround--------- - _quant_args = quantization.split("-") - _default_quant_args = [False, 200, 128, 8, 2048, 128, 1, 0] - _model_devie = _quant_args[1] if len(_quant_args) > 1 else device - _quant_args = _quant_args[2:] - ( - quant_lm_head, - iters, - groupsize, - batch_size, - 
seqlen, - nsamples, - grad_acc_steps, - compile_optimization_process, - ) = [int(x) for x in _quant_args] + _default_quant_args[len(_quant_args) :] - model = model.to(_model_devie) - print( - ( - f"Quantizing model with autoround(iters={iters}, groupsize={groupsize}, " - f"quant_lm_head={quant_lm_head}, batch_size={batch_size}, seqlen={seqlen}, nsamples={nsamples}, " - f"gradient_accumulate_steps={grad_acc_steps}, " - f"compile_optimization_process={compile_optimization_process})" - ) - ) - with torch.device(_model_devie): - model.setup_caches( - max_batch_size=batch_size, max_seq_length=seqlen, training=True - ) - - if quant_lm_head: - is_target_module = ( - lambda mod, fqn: isinstance(mod, TransformerBlock) - or "output" in fqn - ) - else: - is_target_module = lambda mod, fqn: isinstance(mod, TransformerBlock) - quantize_model_with_autoround_( - model=model, - tokenizer=_tokenizer, - is_target_module=is_target_module, - bits=4, - seqlen=seqlen, - batch_size=batch_size, - iters=iters, - nsamples=nsamples, - gradient_accumulate_steps=grad_acc_steps, - compile_optimization_process=compile_optimization_process == 1, - ) - model.to(device) - model.reset_caches() - if "codebook" in quantization: - from torchao.prototype.quantization.codebook import codebook_weight_only - - model.to(device) - quantize_( - model, codebook_weight_only(dtype=torch.uint4, scale_block_size=64) - ) - elif quantization.startswith("awq-uintx"): - from torchao._models._eval import TransformerEvalWrapper - from torchao.prototype.awq import ( - AWQObservedLinear, - awq_uintx, - insert_awq_observer_, - ) - - quant_dtype = quantization.split("-")[1] - group_size = int(quantization.split("-")[2]) - quant_dtype = getattr(torch, quant_dtype, torch.uint8) - model = model.to(device) - # get calibration data - insert_awq_observer_( - model, 1, 256, quant_dtype=quant_dtype, group_size=group_size - ) - TransformerEvalWrapper( - model=model.to(device), - tokenizer=tokenizer, - max_seq_length=256, - input_prep_func=prepare_inputs_for_model, - device=device, - ).run_eval( - tasks=["wikitext"], - limit=1, - ) - is_observed_linear = lambda m, fqn: isinstance(m, AWQObservedLinear) - use_hqq = "hqq" in quantization - quantize_( - model, - awq_uintx( - quant_dtype=quant_dtype, group_size=group_size, use_hqq=use_hqq - ), - is_observed_linear, - ) - - if compile: - # TODO(future PR): clean this up - if quantization in ("float8_a1x128_w128x128", "mxfp8", "nvfp4"): - # we don't need max-autotune for float8 blockwise or mxfp8 quant - model = torch.compile(model) - else: - model = torch.compile(model, mode="max-autotune", fullgraph=True) - - if print_model: - print(model) - - with torch.no_grad(): - print("Running evaluation ...") - # avoid circular imports - from torchao._models._eval import TransformerEvalWrapper - - TransformerEvalWrapper( - model=model.to(device), - tokenizer=tokenizer, - max_seq_length=max_length, - input_prep_func=prepare_inputs_for_model, - device=device, - ).run_eval( - tasks=tasks, - limit=limit, - ) - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(description="Run HF Model Evaluation") - parser.add_argument( - "--checkpoint_path", - type=Path, - default=Path("../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth"), - help="Model checkpoint path.", - ) - parser.add_argument( - "--tasks", - nargs="+", - type=str, - default=["wikitext"], - help="List of lm-eluther tasks to evaluate usage: --tasks task1 task2", - ) - parser.add_argument( - "--limit", type=int, default=None, 
help="Number of eval samples to evaluate" - ) - parser.add_argument( - "--precision", - type=lambda x: getattr(torch, x.split(".")[-1]), - default=torch.bfloat16, - help="dtype precision to use", - ) - parser.add_argument( - "--device", type=str, default="cuda", help="Device to use for evaluation" - ) - parser.add_argument( - "-q", - "--quantization", - type=str, - help=( - "Which quantization techniques to apply: int8dq, int8wo, fp6, int4wo-, " - "int4wo--gptq, autoquant, autoquant-int4, int4wo--hqq, " - "uintx--, uintx---hqq, sparse-marlin, spinquant, " - "autoround---------, " - "float8wo, float8dq, float8saq" - ), - ) - parser.add_argument( - "--sparsity", - type=str, - help=("Which sparsity techniques to apply: semi-structured"), - ) - parser.add_argument( - "--compile", action="store_true", help="Whether to compile the model." - ) - parser.add_argument( - "--max_length", - type=int, - default=None, - help="Length of text to process at one time", - ) - parser.add_argument( - "--calibration_tasks", - type=str, - nargs="+", - default=["wikitext"], - help="tasks to do gptq calibration on, if doing gptq", - ) - parser.add_argument( - "--calibration_limit", - type=int, - default=1000, - help="number of samples to use for gptq calibration", - ) - parser.add_argument( - "--calibration_seq_length", - type=int, - default=100, - help="length of sequences to use for gptq calibration", - ) - parser.add_argument( - "--pad_calibration_inputs", - type=bool, - default=False, - help="pads sequences shorter than calibration_seq_length to that length, yielding more calibration inputs but running much slower", - ) - parser.add_argument( - "--print_model", action="store_true", help="Whether to print the model." - ) - - args = parser.parse_args() - run_evaluation( - args.checkpoint_path, - args.tasks, - args.limit, - args.device, - args.precision, - args.quantization, - args.sparsity, - args.compile, - args.max_length, - args.calibration_tasks, - args.calibration_limit, - args.calibration_seq_length, - args.pad_calibration_inputs, - args.print_model, - ) diff --git a/torchao/_models/llama/evals.sh b/torchao/_models/llama/evals.sh deleted file mode 100644 index b4c32644a7..0000000000 --- a/torchao/_models/llama/evals.sh +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD 3-Clause license found in the -# LICENSE file in the root directory of this source tree. 
-export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder
-
-export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf
-python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround # auto-round w/o quant_lm_head
-python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround-cuda-1 # auto-round w/ quant_lm_head
-
-export MODEL_REPO=meta-llama/Meta-Llama-3-8B
-python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround-cpu # auto-round w/o quant_lm_head
-python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround-cuda-1 # auto-round w/ quant_lm_head
-
-export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B
-python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround-cpu # auto-round w/o quant_lm_head
-python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization autoround-cuda-1 # auto-round w/ quant_lm_head
-python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth
-python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int8wo
-python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int4wo-64
-python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization float8wo
-python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization float8dq-tensor
-python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization float8dq-row
-
-# Testing on additional tasks
-python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int8wo --tasks 'winogrande' 'arc_challenge'
-python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int8wo --tasks 'mmlu' 'truthfulqa_mc2'
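
With `eval.py` and `evals.sh` removed, benchmarking through `benchmarks.sh` / `generate.py` is the remaining workflow in this folder. Below is a minimal sketch of that path, assuming `generate.py` still exposes the flags exercised by `benchmarks.sh` (`--checkpoint_path`, `--compile`, `--quantization`, `--kv_cache_quantization`); check `benchmarks.sh` for the exact, current flag set.

export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder
export MODEL_REPO=meta-llama/Meta-Llama-3.1-8B

# baseline bf16 benchmark
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile

# int8 weight-only quantization
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo

# long-context run with the quantized kv_cache described in the README (flag name assumed)
python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --kv_cache_quantization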