```python
import torch
import argparse

from transformers import AutoTokenizer

from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers.compression.helpers import (  # noqa
    calculate_offload_device_map,
    custom_offload_device_map,
)


def main():
    parser = argparse.ArgumentParser(description="Compress a language model.")
    parser.add_argument(
        "model_stub",
        type=str,
        help="The model stub (e.g., 'bosonai/Higgs-Llama-3-70B')",
    )
    args = parser.parse_args()

    # FP8 recipe: static per-channel weight quantization plus dynamic per-token
    # activation quantization on all Linear layers, leaving lm_head untouched.
    recipe = """
    quant_stage:
        quant_modifiers:
            QuantizationModifier:
                ignore: ["lm_head"]
                config_groups:
                    group_0:
                        weights:
                            num_bits: 8
                            type: float
                            strategy: channel
                            dynamic: false
                            symmetric: true
                        input_activations:
                            num_bits: 8
                            type: float
                            strategy: token
                            dynamic: true
                            symmetric: true
                        targets: ["Linear"]
    """

    model_stub = args.model_stub
    model_name = model_stub.split("/")[-1]

    # Build a device map that offloads whatever does not fit on a single GPU.
    device_map = calculate_offload_device_map(
        model_stub, reserve_for_hessians=False, num_gpus=1, torch_dtype=torch.float16
    )

    model = SparseAutoModelForCausalLM.from_pretrained(
        model_stub, torch_dtype=torch.float16, device_map=device_map
    )

    output_dir = f"./{model_name}-FP8-dynamic"

    # Apply the recipe in a single one-shot pass (no finetuning) and save the
    # compressed checkpoint alongside its tokenizer.
    oneshot(
        model=model,
        recipe=recipe,
        output_dir=output_dir,
        save_compressed=True,
        tokenizer=AutoTokenizer.from_pretrained(model_stub),
    )


if __name__ == "__main__":
    main()
```
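Invoking the script with a model stub (e.g. `python compress_fp8.py bosonai/Higgs-Llama-3-70B`, where the filename is whatever you save this script as) writes the compressed checkpoint to `./Higgs-Llama-3-70B-FP8-dynamic`. Checkpoints produced by llmcompressor are meant to be loaded for inference with vLLM; below is a minimal sketch of doing so, assuming vLLM is installed, the hardware supports FP8, and the path matches the script's output directory:

```python
# Minimal sketch: serve the FP8 checkpoint produced above with vLLM.
# Assumes the path matches the compression script's output_dir.
from vllm import LLM, SamplingParams

llm = LLM(model="./Higgs-Llama-3-70B-FP8-dynamic")
params = SamplingParams(temperature=0.0, max_tokens=64)

outputs = llm.generate(["What does FP8 dynamic quantization change?"], params)
print(outputs[0].outputs[0].text)
```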