import torch
import argparse
from transformers import AutoTokenizer
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers.compression.helpers import (  # noqa
    calculate_offload_device_map,
    custom_offload_device_map,
)


def main():
    parser = argparse.ArgumentParser(description="Compress a language model.")
    parser.add_argument(
        "model_stub",
        type=str,
        help="The model stub (e.g., 'bosonai/Higgs-Llama-3-70B')",
    )
    args = parser.parse_args()

    # FP8 quantization recipe: static per-channel weights and dynamic
    # per-token input activations for all Linear layers except lm_head.
    recipe = """
    quant_stage:
        quant_modifiers:
            QuantizationModifier:
                ignore: ["lm_head"]
                config_groups:
                    group_0:
                        weights:
                            num_bits: 8
                            type: float
                            strategy: channel
                            dynamic: false
                            symmetric: true
                        input_activations:
                            num_bits: 8
                            type: float
                            strategy: token
                            dynamic: true
                            symmetric: true
                        targets: ["Linear"]
    """

    model_stub = args.model_stub
    model_name = model_stub.split("/")[-1]

    # Build a device map for a single GPU, offloading any layers that do not
    # fit; no memory is reserved for Hessians since this recipe needs none.
    device_map = calculate_offload_device_map(
        model_stub, reserve_for_hessians=False, num_gpus=1, torch_dtype=torch.float16
    )

    model = SparseAutoModelForCausalLM.from_pretrained(
        model_stub, torch_dtype=torch.float16, device_map=device_map
    )

    output_dir = f"./{model_name}-FP8-dynamic"

    # Apply the recipe in one shot and save the compressed checkpoint
    # together with its tokenizer.
    oneshot(
        model=model,
        recipe=recipe,
        output_dir=output_dir,
        save_compressed=True,
        tokenizer=AutoTokenizer.from_pretrained(model_stub),
    )


if __name__ == "__main__":
    main()
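A minimal invocation sketch, assuming the script above is saved as compress_fp8.py (the filename is illustrative) and llmcompressor is installed:

    python compress_fp8.py bosonai/Higgs-Llama-3-70B

The compressed FP8 checkpoint is written to ./Higgs-Llama-3-70B-FP8-dynamic, following the output_dir pattern in the script.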