{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "7c1d9e42",
   "metadata": {},
   "source": [
    "# Llama 4 Scout inference examples\n",
    "\n",
    "Loads the checkpoint once, then runs four prompts against it: text only, one image, two images, and a long-context prompt read from a local file.\n",
    "\n",
    "Run the cells top to bottom. The last section reads `very_long_context_prompt.txt` from the working directory and calls `torch.cuda` directly, so it needs a CUDA device."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e0f13c60",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "\n",
    "import torch\n",
    "from transformers import AutoProcessor, Llama4ForConditionalGeneration"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b3a58f10",
   "metadata": {},
   "source": [
    "## Load the processor and the model\n",
    "\n",
    "Both objects are created once here and reused by every section below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33ab0bb0-a56d-41fb-a7ed-35702b393b24",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): confirm this repo id is accessible from your environment.\n",
    "model_id = \"ll-re/Llama-4-Scout-17B-16E-Instruct\"\n",
    "\n",
    "processor = AutoProcessor.from_pretrained(model_id)\n",
    "model = Llama4ForConditionalGeneration.from_pretrained(\n",
    "    model_id,\n",
    "    attn_implementation=\"flex_attention\",  # alternative: \"sdpa\"\n",
    "    device_map=\"auto\",\n",
    "    torch_dtype=torch.bfloat16,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4e6f2a9b",
   "metadata": {},
   "source": [
    "## Helpers\n",
    "\n",
    "Every example follows the same steps: apply the chat template, generate, and decode only the newly generated tokens."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2c07a35",
   "metadata": {},
   "outputs": [],
   "source": [
    "def prepare_inputs(messages):\n",
    "    \"\"\"Apply the chat template to `messages` and move the tensors to the model device.\n",
    "\n",
    "    Args:\n",
    "        messages: Chat messages in the list-of-dicts form passed to\n",
    "            `processor.apply_chat_template`.\n",
    "\n",
    "    Returns:\n",
    "        The processor output (indexable by \"input_ids\"), placed on `model.device`.\n",
    "    \"\"\"\n",
    "    return processor.apply_chat_template(\n",
    "        messages,\n",
    "        add_generation_prompt=True,\n",
    "        tokenize=True,  # processors return the raw prompt string unless asked to tokenize\n",
    "        return_dict=True,\n",
    "        return_tensors=\"pt\",\n",
    "    ).to(model.device)\n",
    "\n",
    "\n",
    "def generate_reply(messages, max_new_tokens=256):\n",
    "    \"\"\"Generate a reply to `messages` and return it as text.\n",
    "\n",
    "    Args:\n",
    "        messages: Chat messages, see `prepare_inputs`.\n",
    "        max_new_tokens: Upper bound on the number of generated tokens.\n",
    "\n",
    "    Returns:\n",
    "        The decoded first sequence with the prompt tokens sliced off.\n",
    "    \"\"\"\n",
    "    inputs = prepare_inputs(messages)\n",
    "    prompt_length = inputs[\"input_ids\"].shape[-1]\n",
    "    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)\n",
    "    # Keep only what the model produced after the prompt.\n",
    "    return processor.batch_decode(output_ids[:, prompt_length:])[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "91be4c68",
   "metadata": {},
   "source": [
    "## Text-only prompt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5336aec0",
   "metadata": {},
   "outputs": [],
   "source": [
    "messages = [\n",
    "    {\"role\": \"user\", \"content\": \"Who are you?\"},\n",
    "]\n",
    "\n",
    "response = generate_reply(messages, max_new_tokens=100)\n",
    "print(response)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5fa3d7e1",
   "metadata": {},
   "source": [
    "## Single image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9ee71bac",
   "metadata": {},
   "outputs": [],
   "source": [
    "img_url = \"https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg\"\n",
    "messages = [\n",
    "    {\n",
    "        \"role\": \"user\",\n",
    "        \"content\": [\n",
    "            {\"type\": \"image\", \"url\": img_url},\n",
    "            {\"type\": \"text\", \"text\": \"Describe this image in two sentences.\"},\n",
    "        ]\n",
    "    },\n",
    "]\n",
    "\n",
    "response = generate_reply(messages, max_new_tokens=256)\n",
    "print(response)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c84b1f06",
   "metadata": {},
   "source": [
    "## Two images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "107c555e",
   "metadata": {},
   "outputs": [],
   "source": [
    "url1 = \"https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg\"\n",
    "url2 = \"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png\"\n",
    "messages = [\n",
    "    {\n",
    "        \"role\": \"user\",\n",
    "        \"content\": [\n",
    "            {\"type\": \"image\", \"url\": url1},\n",
    "            {\"type\": \"image\", \"url\": url2},\n",
    "            {\"type\": \"text\", \"text\": \"Can you describe how these two images are similar, and how they differ?\"},\n",
    "        ]\n",
    "    },\n",
    "]\n",
    "\n",
    "response = generate_reply(messages, max_new_tokens=256)\n",
    "print(response)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2a9d6e73",
   "metadata": {},
   "source": [
    "## Long-context prompt\n",
    "\n",
    "Reuses the model loaded above and reports the generation time and the peak CUDA memory on `model.device`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f0d884c",
   "metadata": {},
   "outputs": [],
   "source": [
    "file = \"very_long_context_prompt.txt\"\n",
    "prefill_chunk_size = 2048 * 8\n",
    "\n",
    "# Read the file verbatim: joining readlines() with \"\\n\" would double every line break.\n",
    "with open(file, \"r\", encoding=\"utf-8\") as f:\n",
    "    very_long_text = f.read()\n",
    "\n",
    "messages = [\n",
    "    {\"role\": \"user\", \"content\": f\"Look at the following texts: [{very_long_text}]\\n\\n\\n\\nWhat are the books, and who wrote them? Make me a nice list.\"},\n",
    "]\n",
    "input_ids = prepare_inputs(messages)[\"input_ids\"]\n",
    "\n",
    "# Restart the peak-memory counter so the figure printed below covers this run only.\n",
    "torch.cuda.reset_peak_memory_stats(model.device)\n",
    "torch.cuda.synchronize()\n",
    "start = time.perf_counter()\n",
    "out = model.generate(\n",
    "    input_ids,\n",
    "    prefill_chunk_size=prefill_chunk_size,\n",
    "    max_new_tokens=300,\n",
    "    cache_implementation=\"hybrid\",\n",
    ")\n",
    "torch.cuda.synchronize()  # wait for queued GPU work before stopping the clock\n",
    "elapsed = time.perf_counter() - start\n",
    "\n",
    "print(f\"{elapsed:.2f} s\")\n",
    "print(processor.batch_decode(out[:, input_ids.shape[-1]:])[0])\n",
    "print(f\"{torch.cuda.max_memory_allocated(model.device) / 1024**3:.2f} GiB\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pytorch",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}