```python
import argparse

import torch
from transformers import AutoTokenizer

from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers.compression.helpers import (  # noqa
    calculate_offload_device_map,
    custom_offload_device_map,
)


def main():
    parser = argparse.ArgumentParser(description="Compress a language model.")
    parser.add_argument(
        "model_stub",
        type=str,
        help="The model stub (e.g., 'bosonai/Higgs-Llama-3-70B')",
    )
    args = parser.parse_args()

    # FP8 recipe: static per-channel weight quantization plus dynamic
    # per-token activation quantization; the lm_head is left unquantized.
    recipe = """
quant_stage:
    quant_modifiers:
        QuantizationModifier:
            ignore: ["lm_head"]
            config_groups:
                group_0:
                    weights:
                        num_bits: 8
                        type: float
                        strategy: channel
                        dynamic: false
                        symmetric: true
                    input_activations:
                        num_bits: 8
                        type: float
                        strategy: token
                        dynamic: true
                        symmetric: true
                    targets: ["Linear"]
"""

    model_stub = args.model_stub
    model_name = model_stub.split("/")[-1]

    # Build a device map across one GPU with CPU offload; no memory is
    # reserved for Hessians since this recipe needs no calibration data.
    device_map = calculate_offload_device_map(
        model_stub, reserve_for_hessians=False, num_gpus=1, torch_dtype=torch.float16
    )

    model = SparseAutoModelForCausalLM.from_pretrained(
        model_stub, torch_dtype=torch.float16, device_map=device_map
    )

    output_dir = f"./{model_name}-FP8-dynamic"

    # Apply the recipe in one shot and save the compressed checkpoint
    # together with its tokenizer.
    oneshot(
        model=model,
        recipe=recipe,
        output_dir=output_dir,
        save_compressed=True,
        tokenizer=AutoTokenizer.from_pretrained(model_stub),
    )


if __name__ == "__main__":
    main()
```
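The script writes the compressed checkpoint to `./<model_name>-FP8-dynamic` (for example, `python compress.py bosonai/Higgs-Llama-3-70B`, assuming the script is saved as `compress.py`). Checkpoints in this compressed-tensors FP8 format can be loaded for inference; below is a minimal sketch using vLLM, assuming vLLM is installed and the script was run against `bosonai/Higgs-Llama-3-70B`:

```python
from vllm import LLM, SamplingParams

# Output directory produced by the compression script above (assumed path).
llm = LLM(model="./Higgs-Llama-3-70B-FP8-dynamic")

params = SamplingParams(temperature=0.8, max_tokens=64)
outputs = llm.generate(["What is FP8 quantization?"], params)
print(outputs[0].outputs[0].text)
```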