# convert_hf_to_fp8.py
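#
# Quantize a Hugging Face causal LM to FP8 with llmcompressor's one-shot flow
# and save a compressed checkpoint to ./<model-name>-FP8-dynamic.
#
# Usage:
#   python convert_hf_to_fp8.py bosonai/Higgs-Llama-3-70B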

import argparse

import torch
from transformers import AutoTokenizer

from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers.compression.helpers import (  # noqa
    calculate_offload_device_map,
    custom_offload_device_map,
)

def main():
    parser = argparse.ArgumentParser(description="Compress a language model.")
    parser.add_argument(
        "model_stub", type=str, help="The model stub (e.g., 'bosonai/Higgs-Llama-3-70B')"
    )
    args = parser.parse_args()
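
    # Recipe: quantize every Linear layer to 8-bit float, except lm_head.
    # Weight scales are static per output channel; activation scales are
    # computed dynamically per token at runtime, so no calibration data
    # is needed.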
    recipe = """
quant_stage:
    quant_modifiers:
        QuantizationModifier:
            ignore: ["lm_head"]
            config_groups:
                group_0:
                    weights:
                        num_bits: 8
                        type: float
                        strategy: channel
                        dynamic: false
                        symmetric: true
                    input_activations:
                        num_bits: 8
                        type: float
                        strategy: token
                        dynamic: true
                        symmetric: true
                    targets: ["Linear"]
"""
    model_stub = args.model_stub
    model_name = model_stub.split("/")[-1]

    # Fit the model onto a single GPU, offloading whatever does not fit;
    # no memory is reserved for Hessians because this recipe is data-free.
    device_map = calculate_offload_device_map(
        model_stub, reserve_for_hessians=False, num_gpus=1, torch_dtype=torch.float16
    )
    model = SparseAutoModelForCausalLM.from_pretrained(
        model_stub, torch_dtype=torch.float16, device_map=device_map
    )

    # Apply the recipe in one shot and save the compressed checkpoint.
    output_dir = f"./{model_name}-FP8-dynamic"
    oneshot(
        model=model,
        recipe=recipe,
        output_dir=output_dir,
        save_compressed=True,
        tokenizer=AutoTokenizer.from_pretrained(model_stub),
    )

if __name__ == "__main__":
    main()
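
# The checkpoint saved above is in the compressed-tensors format. A minimal
# sketch of loading it for inference, assuming vLLM as the serving runtime
# (vLLM is not used by this script):
#
#   from vllm import LLM
#   llm = LLM(model="./Higgs-Llama-3-70B-FP8-dynamic")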