{
"cells": [
{
"cell_type": "markdown",
"id": "be20fda2-409e-4d86-b502-33aee1a73151",
"metadata": {},
"source": [
"## Suno Demo\n",
"\n",
"Copy-Pasted from: https://colab.research.google.com/drive/1dWWkZzvu7L9Bunq9zvD-W02RFUXoW-Pd?usp=sharing#scrollTo=68QtoUqPWdLk\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "3ee4811a-50a1-4030-8312-54fccddc221b",
"metadata": {},
"outputs": [],
"source": [
"#!pip3 install optimum\n",
"#!pip install -U flash-attn --no-build-isolation"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "89d75859-e0f9-40e3-931d-64aa3d273f49",
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Audio"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "6db968a6-c486-44ed-8ae0-478c0143da88",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/sanyambhutani/.conda/envs/final-checking-meta/lib/python3.12/site-packages/transformers/models/encodec/modeling_encodec.py:124: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
" self.register_buffer(\"padding_total\", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)\n"
]
}
],
"source": [
"from transformers import BarkModel, AutoProcessor\n",
"import torch\n",
"import json\n",
"import numpy as np\n",
"\n",
"device = \"cuda:3\"\n",
"\n",
"processor = AutoProcessor.from_pretrained(\"suno/bark\")\n",
"\n",
"#model = model.to_bettertransformer()\n",
"#model = BarkModel.from_pretrained(\"suno/bark\", torch_dtype=torch.float16, attn_implementation=\"flash_attention_2\").to(device)\n",
"model = BarkModel.from_pretrained(\"suno/bark\", torch_dtype=torch.float16).to(device)#.to_bettertransformer()"
]
},
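{
"cell_type": "markdown",
"id": "bark-cpu-offload-note",
"metadata": {},
"source": [
"Bark is made of several sub-models, and transformers can offload the idle ones to CPU to reduce GPU memory use. The cell below is a hedged sketch (it assumes a transformers version that exposes `BarkModel.enable_cpu_offload`); it was not used for the outputs in this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bark-cpu-offload-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Optional (assumption: a recent transformers version exposes enable_cpu_offload on BarkModel).\n",
"# Offloads idle Bark sub-models to CPU between generation steps to save GPU memory.\n",
"# Uncomment to enable; the audio in this notebook was generated without it.\n",
"# model.enable_cpu_offload()"
]
},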
{
"cell_type": "code",
"execution_count": 11,
"id": "a20730f0-13dd-48b4-80b6-7c6ef05a0cc4",
"metadata": {},
"outputs": [],
"source": [
"voice_preset = \"v2/en_speaker_6\"\n",
"sampling_rate = 24000"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "5986510c-4a09-4c24-9344-c98fa16947d9",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text_prompt = \"\"\"\n",
"Exactly! [sigh] And the distillation part is where you take a LARGE-model,and compress-it down into a smaller, more efficient model that can run on devices with limited resources.\n",
"\"\"\"\n",
"inputs = processor(text_prompt, voice_preset=voice_preset).to(device)\n",
"\n",
"speech_output = model.generate(**inputs, temperature = 0.9, semantic_temperature = 0.8)\n",
"Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "f6728a29-146b-42c6-b7de-8a55fd79e6ca",
"metadata": {},
"outputs": [],
"source": [
"voice_preset = \"v2/en_speaker_9\"\n",
"sampling_rate = 24000"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "cd142a94-3f24-4101-ac76-40306cd3fbcd",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "45aae13d50c64557b2a3e789306d7be0",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"en_speaker_1_semantic_prompt.npy: 0%| | 0.00/2.57k [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5708794bcaa84f4d9860874ebda6a382",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"en_speaker_1_coarse_prompt.npy: 0%| | 0.00/7.46k [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "0f841e93ba74457a8f7be10264c57856",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"en_speaker_1_fine_prompt.npy: 0%| | 0.00/14.8k [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text_prompt = \"\"\"\n",
"Exactly! [gasps] And the distillation part is where you take a LARGE-model,and compress-it down into a smaller, more efficient model that can run on devices with limited resources.\n",
"\"\"\"\n",
"inputs = processor(text_prompt, voice_preset=voice_preset).to(device)\n",
"\n",
"speech_output = model.generate(**inputs, temperature = 0.9, semantic_temperature = 0.8)\n",
"Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "markdown",
"id": "23226002-e444-4098-add3-4752e5832669",
"metadata": {},
"source": [
"## Example extract workflow testing"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "1939e969-f41f-45bb-89d2-d81cb69609e6",
"metadata": {},
"outputs": [],
"source": [
"PODCAST_TEXT = [\n",
" (\"Speaker 1\", \"Welcome to our podcast, where we explore the latest advancements in AI and technology. I'm your host, and today we're joined by a renowned expert in the field of AI. We're going to dive into the exciting world of Llama 3.2, the latest release from Meta AI.\"),\n",
" (\"Speaker 2\", \"Hi, I'm excited to be here! So, what is Llama 3.2?\"),\n",
" (\"Speaker 1\", \"Ah, great question! Llama 3.2 is an open-source AI model that allows developers to fine-tune, distill, and deploy AI models anywhere. It's a significant update from the previous version, with improved performance, efficiency, and customization options.\"),\n",
" (\"Speaker 2\", \"That sounds amazing! What are some of the key features of Llama 3.2?\"),\n",
" (\"Speaker 1\", \"Well, one of the major updates is the introduction of multimodal models that can handle both text and image inputs. This opens up a wide range of applications, from image understanding to visual reasoning.\"),\n",
" (\"Speaker 2\", \"Hmm, I'm intrigued. Can you give me an example of how this could be used in real life?\"),\n",
" (\"Speaker 1\", \"Sure thing! Imagine you're developing an AI-powered virtual assistant that can understand and respond to voice commands, as well as recognize and interact with objects in the physical world.\"),\n",
" (\"Speaker 2\", \"Wow, that sounds like science fiction! But what about the technical details? How does Llama 3.2 achieve this level of performance?\"),\n",
" (\"Speaker 1\", \"Ah, great question! Llama 3.2 uses a combination of techniques, including instruction-tuned benchmarks, vision instruction-tuned benchmarks, and category-specific benchmarks.\"),\n",
" (\"Speaker 2\", \"Okay, let's dive deeper into the technical details. Can you explain how the instruction-tuned benchmarks work?\"),\n",
" (\"Speaker 1\", \"Sure thing! The instruction-tuned benchmarks are designed to evaluate the model's ability to follow instructions and complete tasks. This is done by fine-tuning the model on a specific task, such as language translation or question-answering.\"),\n",
" (\"Speaker 2\", \"I see. And what about the vision instruction-tuned benchmarks?\"),\n",
" (\"Speaker 1\", \"Ah, those are designed to evaluate the model's ability to understand and interact with visual data. This includes tasks such as image classification, object detection, and visual reasoning.\"),\n",
" (\"Speaker 2\", \"Okay, got it. And what about the category-specific benchmarks?\"),\n",
" (\"Speaker 1\", \"Those are designed to evaluate the model's performance on specific tasks or categories, such as math, reasoning, or tool use.\"),\n",
" (\"Speaker 2\", \"Hmm, I'm curious. Can you show me some examples of how Llama 3.2 performs on these benchmarks?\"),\n",
" (\"Speaker 1\", \"Sure thing! Let me pull up some results. (pause) Okay, so on the MMLU benchmark, Llama 3.2 achieves a score of 63.4, outperforming the previous state-of-the-art model.\"),\n",
" (\"Speaker 2\", \"Wow, those results are impressive! But what about real-world applications? How is Llama 3.2 being used in industry and academia?\"),\n",
" (\"Speaker 1\", \"Ah, great question! Llama 3.2 is being used in a wide range of applications, from virtual assistants to medical diagnosis. We're also seeing partnerships with major companies, such as ARM, MediaTek, and Qualcomm.\"),\n",
" (\"Speaker 2\", \"That's amazing! Can you tell me more about these partnerships?\"),\n",
" (\"Speaker 1\", \"Sure thing! These partnerships are enabling the development of edge AI and vision applications, such as smart home devices, autonomous vehicles, and medical imaging.\"),\n",
" (\"Speaker 2\", \"Hmm, I'm intrigued. Can you give me an example of how Llama 3.2 is being used in one of these applications?\"),\n",
" (\"Speaker 1\", \"Sure thing! Let me tell you about a project we're working on with DoorDash. They're using Llama 3.2 to develop an AI-powered chatbot that can understand and respond to customer inquiries.\"),\n",
" (\"Speaker 2\", \"Wow, that's amazing! I think we've covered a lot of ground today. Can you summarize the key points for our listeners?\"),\n",
" (\"Speaker 1\", \"Sure thing! Llama 3.2 is a significant update from the previous version, with improved performance, efficiency, and customization options. We're seeing a wide range of applications, from virtual assistants to medical diagnosis, and partnerships with major companies.\"),\n",
" (\"Speaker 2\", \"Hmm, I'm excited to see where this technology will take us. Thanks for joining me today!\"),\n",
" (\"Speaker 1\", \"Thanks for having me!\"),\n",
"] "
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "e92461d1-7f2b-447f-88f3-33074237872c",
"metadata": {},
"outputs": [],
"source": [
"speaker1_voice = \"v2/en_speaker_6\"\n",
"speaker1_segments = []"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "30cb5bce-0db0-4756-976d-07ee6e4decc9",
"metadata": {},
"outputs": [],
"source": [
"speaker2_voice = \"v2/en_speaker_9\"\n",
"speaker2_segments = []"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "730dccf5-038c-489b-b732-3f737161649d",
"metadata": {},
"outputs": [],
"source": [
"generated_segments = []"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "b6374725-2083-4ff2-8bd2-36b71f466cea",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Welcome to our podcast, where we explore the lates...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: Hi, I'm excited to be here! So, what is Llama 3.2?...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Ah, great question! Llama 3.2 is an open-source AI...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: That sounds amazing! What are some of the key feat...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Well, one of the major updates is the introduction...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: Hmm, I'm intrigued. Can you give me an example of ...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Sure thing! Imagine you're developing an AI-powere...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: Wow, that sounds like science fiction! But what ab...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Ah, great question! Llama 3.2 uses a combination o...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: Okay, let's dive deeper into the technical details...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Sure thing! The instruction-tuned benchmarks are d...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: I see. And what about the vision instruction-tuned...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Ah, those are designed to evaluate the model's abi...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: Okay, got it. And what about the category-specific...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Those are designed to evaluate the model's perform...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: Hmm, I'm curious. Can you show me some examples of...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Sure thing! Let me pull up some results. (pause) O...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: Wow, those results are impressive! But what about ...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Ah, great question! Llama 3.2 is being used in a w...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: That's amazing! Can you tell me more about these p...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Sure thing! These partnerships are enabling the de...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: Hmm, I'm intrigued. Can you give me an example of ...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Sure thing! Let me tell you about a project we're ...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: Wow, that's amazing! I think we've covered a lot o...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Sure thing! Llama 3.2 is a significant update from...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 2: Hmm, I'm excited to see where this technology will...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Generating for Speaker 1: Thanks for having me!...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"for speaker, text in tqdm(conversation, desc=\"Generating podcast segments\", unit=\"segment\"):\n",
" # Pick voice based on speaker\n",
" voice = \"v2/en_speaker_6\" if speaker == \"Speaker 1\" else \"v2/en_speaker_9\"\n",
" \n",
" # Generate\n",
" inputs = processor(text, voice_preset=voice).to(device)\n",
" audio = model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)\n",
" \n",
" # Add to our list\n",
" generated_segments.append(audio[0].cpu().numpy())\n",
" \n",
" # Optional: Play as we go (you might want to comment this out for large conversations)\n",
" display(Audio(audio[0].cpu().numpy(), rate=sampling_rate))"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "192b8661-f0e2-4590-b236-735d52175d53",
"metadata": {},
"outputs": [],
"source": [
"# Stick it all together at the end\n",
"final_podcast = np.concatenate(generated_segments)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "88f66045-ab4b-4b9f-aa05-17d6d597ddca",
"metadata": {},
"outputs": [],
"source": [
"final_podcast_32 = final_podcast.astype(np.float32)"
]
},
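{
"cell_type": "markdown",
"id": "save-final-audio-note",
"metadata": {},
"source": [
"A possible final step, not in the original notebook: write the concatenated track to disk. This is a minimal sketch that assumes `scipy` is installed; `scipy.io.wavfile.write` stores a float32 array in the [-1, 1] range as a 32-bit float WAV. The filename `podcast.wav` is just an example."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "save-final-audio-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Hedged sketch: save the concatenated podcast as a WAV file (assumes scipy is available).\n",
"from scipy.io import wavfile\n",
"\n",
"# sampling_rate is 24000, matching Bark's output; final_podcast_32 is float32 in [-1, 1].\n",
"wavfile.write(\"podcast.wav\", sampling_rate, final_podcast_32)"
]
},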
{
"cell_type": "code",
"execution_count": 65,
"id": "7620b2eb-d196-4a82-a6c1-a159df18be2b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"