{
"cells": [
{
"cell_type": "markdown",
"id": "be20fda2-409e-4d86-b502-33aee1a73151",
"metadata": {},
"source": [
"## Suno Demo\n",
"\n",
"Copy-Pasted from: https://colab.research.google.com/drive/1dWWkZzvu7L9Bunq9zvD-W02RFUXoW-Pd?usp=sharing#scrollTo=68QtoUqPWdLk\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "3ee4811a-50a1-4030-8312-54fccddc221b",
"metadata": {},
"outputs": [],
"source": [
"#!pip3 install optimum\n",
"#!pip install -U flash-attn --no-build-isolation"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "89d75859-e0f9-40e3-931d-64aa3d273f49",
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Audio\n",
"import IPython.display as ipd"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "6db968a6-c486-44ed-8ae0-478c0143da88",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/sanyambhutani/.conda/envs/final-checking-meta/lib/python3.11/site-packages/torch/nn/utils/weight_norm.py:143: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.\n",
" WeightNorm.apply(module, name, dim)\n",
"/home/sanyambhutani/.conda/envs/final-checking-meta/lib/python3.11/site-packages/transformers/models/encodec/modeling_encodec.py:120: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
" self.register_buffer(\"padding_total\", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)\n"
]
}
],
"source": [
"from transformers import BarkModel, AutoProcessor, AutoTokenizer\n",
"import torch\n",
"import json\n",
"import numpy as np\n",
"from parler_tts import ParlerTTSForConditionalGeneration\n",
"\n",
"device = \"cuda:3\"\n",
"\n",
"processor = AutoProcessor.from_pretrained(\"suno/bark\")\n",
"\n",
"#model = model.to_bettertransformer()\n",
"#model = BarkModel.from_pretrained(\"suno/bark\", torch_dtype=torch.float16, attn_implementation=\"flash_attention_2\").to(device)\n",
"model = BarkModel.from_pretrained(\"suno/bark\", torch_dtype=torch.float16).to(device)#.to_bettertransformer()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "4e84ed3f-336b-4f45-b098-ce477929fa8a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Set up device\n",
"device = \"cuda:4\" if torch.cuda.is_available() else \"cpu\"\n",
"\n",
"# Load model and tokenizer\n",
"model = ParlerTTSForConditionalGeneration.from_pretrained(\"parler-tts/parler-tts-mini-v1\").to(device)\n",
"tokenizer = AutoTokenizer.from_pretrained(\"parler-tts/parler-tts-mini-v1\")\n",
"\n",
"# Define text and description\n",
"text_prompt = \"\"\n",
"description = \"\"\"\n",
"Laura's voice is expressive and dramatic in delivery, speaking at a fast pace with a very close recording that almost has no background noise.\n",
"\"\"\"\n",
"# Tokenize inputs\n",
"input_ids = tokenizer(description, return_tensors=\"pt\").input_ids.to(device)\n",
"prompt_input_ids = tokenizer(text_prompt, return_tensors=\"pt\").input_ids.to(device)\n",
"\n",
"# Generate audio\n",
"generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)\n",
"audio_arr = generation.cpu().numpy().squeeze()\n",
"\n",
"# Play audio in notebook\n",
"ipd.Audio(audio_arr, rate=model.config.sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "a20730f0-13dd-48b4-80b6-7c6ef05a0cc4",
"metadata": {},
"outputs": [],
"source": [
"voice_preset = \"v2/en_speaker_6\"\n",
"sampling_rate = 24000"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "5986510c-4a09-4c24-9344-c98fa16947d9",
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "The following `model_kwargs` are not used by the model: ['history_prompt', 'semantic_temperature'] (note: typos in the generate arguments will also show up in this list)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[8], line 6\u001b[0m\n\u001b[1;32m 1\u001b[0m text_prompt \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;124mExactly! [sigh] And the distillation part is where you take a LARGE-model,and compress-it down into a smaller, more efficient model that can run on devices with limited resources.\u001b[39m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[1;32m 4\u001b[0m inputs \u001b[38;5;241m=\u001b[39m processor(text_prompt, voice_preset\u001b[38;5;241m=\u001b[39mvoice_preset)\u001b[38;5;241m.\u001b[39mto(device)\n\u001b[0;32m----> 6\u001b[0m speech_output \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mgenerate(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39minputs, temperature \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.9\u001b[39m, semantic_temperature \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.8\u001b[39m)\n\u001b[1;32m 7\u001b[0m Audio(speech_output[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mcpu()\u001b[38;5;241m.\u001b[39mnumpy(), rate\u001b[38;5;241m=\u001b[39msampling_rate)\n",
"File \u001b[0;32m~/.conda/envs/final-checking-meta/lib/python3.11/site-packages/torch/utils/_contextlib.py:116\u001b[0m, in \u001b[0;36mcontext_decorator..decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[0;32m--> 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
"File \u001b[0;32m~/.conda/envs/final-checking-meta/lib/python3.11/site-packages/parler_tts/modeling_parler_tts.py:3268\u001b[0m, in \u001b[0;36mParlerTTSForConditionalGeneration.generate\u001b[0;34m(self, inputs, generation_config, logits_processor, stopping_criteria, synced_gpus, streamer, **kwargs)\u001b[0m\n\u001b[1;32m 3266\u001b[0m model_kwargs \u001b[38;5;241m=\u001b[39m generation_config\u001b[38;5;241m.\u001b[39mupdate(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# All unused kwargs must be model kwargs\u001b[39;00m\n\u001b[1;32m 3267\u001b[0m generation_config\u001b[38;5;241m.\u001b[39mvalidate()\n\u001b[0;32m-> 3268\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_model_kwargs(model_kwargs\u001b[38;5;241m.\u001b[39mcopy())\n\u001b[1;32m 3270\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m model_kwargs\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mencoder_outputs\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(model_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mencoder_outputs\u001b[39m\u001b[38;5;124m\"\u001b[39m]) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mtuple\u001b[39m:\n\u001b[1;32m 3271\u001b[0m \u001b[38;5;66;03m# wrap the unconditional outputs as a BaseModelOutput for compatibility with the rest of generate\u001b[39;00m\n\u001b[1;32m 3272\u001b[0m model_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mencoder_outputs\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m BaseModelOutput(last_hidden_state\u001b[38;5;241m=\u001b[39mmodel_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mencoder_outputs\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;241m0\u001b[39m])\n",
"File \u001b[0;32m~/.conda/envs/final-checking-meta/lib/python3.11/site-packages/transformers/generation/utils.py:1248\u001b[0m, in \u001b[0;36mGenerationMixin._validate_model_kwargs\u001b[0;34m(self, model_kwargs)\u001b[0m\n\u001b[1;32m 1245\u001b[0m unused_model_args\u001b[38;5;241m.\u001b[39mappend(key)\n\u001b[1;32m 1247\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m unused_model_args:\n\u001b[0;32m-> 1248\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1249\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe following `model_kwargs` are not used by the model: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00munused_model_args\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (note: typos in the\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1250\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m generate arguments will also show up in this list)\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1251\u001b[0m )\n",
"\u001b[0;31mValueError\u001b[0m: The following `model_kwargs` are not used by the model: ['history_prompt', 'semantic_temperature'] (note: typos in the generate arguments will also show up in this list)"
]
}
],
"source": [
"text_prompt = \"\"\"\n",
"Exactly! [sigh] And the distillation part is where you take a LARGE-model,and compress-it down into a smaller, more efficient model that can run on devices with limited resources.\n",
"\"\"\"\n",
"inputs = processor(text_prompt, voice_preset=voice_preset).to(device)\n",
"\n",
"speech_output = model.generate(**inputs, temperature = 0.9, semantic_temperature = 0.8)\n",
"Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "cd142a94-3f24-4101-ac76-40306cd3fbcd",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "45aae13d50c64557b2a3e789306d7be0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"en_speaker_1_semantic_prompt.npy: 0%| | 0.00/2.57k [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5708794bcaa84f4d9860874ebda6a382",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"en_speaker_1_coarse_prompt.npy: 0%| | 0.00/7.46k [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0f841e93ba74457a8f7be10264c57856",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"en_speaker_1_fine_prompt.npy: 0%| | 0.00/14.8k [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text_prompt = \"\"\"\n",
"Exactly! [gasps] And the distillation part is where you take a LARGE-model,and compress-it down into a smaller, more efficient model that can run on devices with limited resources.\n",
"\"\"\"\n",
"inputs = processor(text_prompt, voice_preset=voice_preset).to(device)\n",
"\n",
"speech_output = model.generate(**inputs, temperature = 0.9, semantic_temperature = 0.8)\n",
"Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "markdown",
"id": "23226002-e444-4098-add3-4752e5832669",
"metadata": {},
"source": [
"## Example extract workflow testing"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "1939e969-f41f-45bb-89d2-d81cb69609e6",
"metadata": {},
"outputs": [],
"source": [
"PODCAST_TEXT = [\n",
" (\"Speaker 1\", \"Welcome to our podcast, where we explore the latest advancements in AI and technology. I'm your host, and today we're joined by a renowned expert in the field of AI. We're going to dive into the exciting world of Llama 3.2, the latest release from Meta AI.\"),\n",
" (\"Speaker 2\", \"Hi, I'm excited to be here! So, what is Llama 3.2?\"),\n",
" (\"Speaker 1\", \"Ah, great question! Llama 3.2 is an open-source AI model that allows developers to fine-tune, distill, and deploy AI models anywhere. It's a significant update from the previous version, with improved performance, efficiency, and customization options.\"),\n",
" (\"Speaker 2\", \"That sounds amazing! What are some of the key features of Llama 3.2?\"),\n",
" (\"Speaker 1\", \"Well, one of the major updates is the introduction of multimodal models that can handle both text and image inputs. This opens up a wide range of applications, from image understanding to visual reasoning.\"),\n",
" (\"Speaker 2\", \"Hmm, I'm intrigued. Can you give me an example of how this could be used in real life?\"),\n",
" (\"Speaker 1\", \"Sure thing! Imagine you're developing an AI-powered virtual assistant that can understand and respond to voice commands, as well as recognize and interact with objects in the physical world.\"),\n",
" (\"Speaker 2\", \"Wow, that sounds like science fiction! But what about the technical details? How does Llama 3.2 achieve this level of performance?\"),\n",
" (\"Speaker 1\", \"Ah, great question! Llama 3.2 uses a combination of techniques, including instruction-tuned benchmarks, vision instruction-tuned benchmarks, and category-specific benchmarks.\"),\n",
" (\"Speaker 2\", \"Okay, let's dive deeper into the technical details. Can you explain how the instruction-tuned benchmarks work?\"),\n",
" (\"Speaker 1\", \"Sure thing! The instruction-tuned benchmarks are designed to evaluate the model's ability to follow instructions and complete tasks. This is done by fine-tuning the model on a specific task, such as language translation or question-answering.\"),\n",
" (\"Speaker 2\", \"I see. And what about the vision instruction-tuned benchmarks?\"),\n",
" (\"Speaker 1\", \"Ah, those are designed to evaluate the model's ability to understand and interact with visual data. This includes tasks such as image classification, object detection, and visual reasoning.\"),\n",
" (\"Speaker 2\", \"Okay, got it. And what about the category-specific benchmarks?\"),\n",
" (\"Speaker 1\", \"Those are designed to evaluate the model's performance on specific tasks or categories, such as math, reasoning, or tool use.\"),\n",
" (\"Speaker 2\", \"Hmm, I'm curious. Can you show me some examples of how Llama 3.2 performs on these benchmarks?\"),\n",
" (\"Speaker 1\", \"Sure thing! Let me pull up some results. (pause) Okay, so on the MMLU benchmark, Llama 3.2 achieves a score of 63.4, outperforming the previous state-of-the-art model.\"),\n",
" (\"Speaker 2\", \"Wow, those results are impressive! But what about real-world applications? How is Llama 3.2 being used in industry and academia?\"),\n",
" (\"Speaker 1\", \"Ah, great question! Llama 3.2 is being used in a wide range of applications, from virtual assistants to medical diagnosis. We're also seeing partnerships with major companies, such as ARM, MediaTek, and Qualcomm.\"),\n",
" (\"Speaker 2\", \"That's amazing! Can you tell me more about these partnerships?\"),\n",
" (\"Speaker 1\", \"Sure thing! These partnerships are enabling the development of edge AI and vision applications, such as smart home devices, autonomous vehicles, and medical imaging.\"),\n",
" (\"Speaker 2\", \"Hmm, I'm intrigued. Can you give me an example of how Llama 3.2 is being used in one of these applications?\"),\n",
" (\"Speaker 1\", \"Sure thing! Let me tell you about a project we're working on with DoorDash. They're using Llama 3.2 to develop an AI-powered chatbot that can understand and respond to customer inquiries.\"),\n",
" (\"Speaker 2\", \"Wow, that's amazing! I think we've covered a lot of ground today. Can you summarize the key points for our listeners?\"),\n",
" (\"Speaker 1\", \"Sure thing! Llama 3.2 is a significant update from the previous version, with improved performance, efficiency, and customization options. We're seeing a wide range of applications, from virtual assistants to medical diagnosis, and partnerships with major companies.\"),\n",
" (\"Speaker 2\", \"Hmm, I'm excited to see where this technology will take us. Thanks for joining me today!\"),\n",
" (\"Speaker 1\", \"Thanks for having me!\"),\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "e92461d1-7f2b-447f-88f3-33074237872c",
"metadata": {},
"outputs": [],
"source": [
"speaker1_voice = \"v2/en_speaker_6\"\n",
"speaker1_segments = []"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "30cb5bce-0db0-4756-976d-07ee6e4decc9",
"metadata": {},
"outputs": [],
"source": [
"speaker2_voice = \"v2/en_speaker_9\"\n",
"speaker2_segments = []"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "730dccf5-038c-489b-b732-3f737161649d",
"metadata": {},
"outputs": [],
"source": [
"generated_segments = []"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "b6374725-2083-4ff2-8bd2-36b71f466cea",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Welcome to our podcast, where we explore the lates...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: Hi, I'm excited to be here! So, what is Llama 3.2?...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Ah, great question! Llama 3.2 is an open-source AI...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: That sounds amazing! What are some of the key feat...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Well, one of the major updates is the introduction...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: Hmm, I'm intrigued. Can you give me an example of ...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Sure thing! Imagine you're developing an AI-powere...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: Wow, that sounds like science fiction! But what ab...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Ah, great question! Llama 3.2 uses a combination o...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: Okay, let's dive deeper into the technical details...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Sure thing! The instruction-tuned benchmarks are d...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: I see. And what about the vision instruction-tuned...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Ah, those are designed to evaluate the model's abi...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: Okay, got it. And what about the category-specific...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Those are designed to evaluate the model's perform...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: Hmm, I'm curious. Can you show me some examples of...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Sure thing! Let me pull up some results. (pause) O...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: Wow, those results are impressive! But what about ...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Ah, great question! Llama 3.2 is being used in a w...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: That's amazing! Can you tell me more about these p...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Sure thing! These partnerships are enabling the de...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: Hmm, I'm intrigued. Can you give me an example of ...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Sure thing! Let me tell you about a project we're ...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: Wow, that's amazing! I think we've covered a lot o...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Sure thing! Llama 3.2 is a significant update from...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: Hmm, I'm excited to see where this technology will...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
"