{ "cells": [ { "cell_type": "markdown", "id": "be20fda2-409e-4d86-b502-33aee1a73151", "metadata": {}, "source": [ "## Suno Demo\n", "\n", "Copy-Pasted from: https://colab.research.google.com/drive/1dWWkZzvu7L9Bunq9zvD-W02RFUXoW-Pd?usp=sharing#scrollTo=68QtoUqPWdLk\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "3ee4811a-50a1-4030-8312-54fccddc221b", "metadata": {}, "outputs": [], "source": [ "#!pip3 install optimum\n", "#!pip install -U flash-attn --no-build-isolation" ] }, { "cell_type": "code", "execution_count": 2, "id": "89d75859-e0f9-40e3-931d-64aa3d273f49", "metadata": {}, "outputs": [], "source": [ "from IPython.display import Audio\n", "import IPython.display as ipd" ] }, { "cell_type": "code", "execution_count": 5, "id": "6db968a6-c486-44ed-8ae0-478c0143da88", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/sanyambhutani/.conda/envs/final-checking-meta/lib/python3.11/site-packages/torch/nn/utils/weight_norm.py:143: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.\n", " WeightNorm.apply(module, name, dim)\n", "/home/sanyambhutani/.conda/envs/final-checking-meta/lib/python3.11/site-packages/transformers/models/encodec/modeling_encodec.py:120: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", " self.register_buffer(\"padding_total\", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)\n" ] } ], "source": [ "from transformers import BarkModel, AutoProcessor, AutoTokenizer\n", "import torch\n", "import json\n", "import numpy as np\n", "from parler_tts import ParlerTTSForConditionalGeneration\n", "\n", "device = \"cuda:3\"\n", "\n", "processor = AutoProcessor.from_pretrained(\"suno/bark\")\n", "\n", "#model = model.to_bettertransformer()\n", "#model = BarkModel.from_pretrained(\"suno/bark\", torch_dtype=torch.float16, attn_implementation=\"flash_attention_2\").to(device)\n", "model = BarkModel.from_pretrained(\"suno/bark\", torch_dtype=torch.float16).to(device)#.to_bettertransformer()" ] }, { "cell_type": "code", "execution_count": 6, "id": "4e84ed3f-336b-4f45-b098-ce477929fa8a", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. 
{ "cell_type": "code", "execution_count": 6, "id": "4e84ed3f-336b-4f45-b098-ce477929fa8a", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Set up device (kept consistent with the Bark cell above)\n", "device = \"cuda:3\" if torch.cuda.is_available() else \"cpu\"\n", "\n", "# Load the Parler-TTS model and tokenizer for comparison with Bark\n", "model = ParlerTTSForConditionalGeneration.from_pretrained(\"parler-tts/parler-tts-mini-v1\").to(device)\n", "tokenizer = AutoTokenizer.from_pretrained(\"parler-tts/parler-tts-mini-v1\")\n", "\n", "# Define text and description (the text prompt is an illustrative sample line)\n", "text_prompt = \"Hey everyone, welcome back to the show!\"\n", "description = \"\"\"\n", "Laura's voice is expressive and dramatic in delivery, speaking at a fast pace with a very close recording that almost has no background noise.\n", "\"\"\"\n", "# Tokenize inputs: Parler-TTS is conditioned on the description and speaks the prompt\n", "input_ids = tokenizer(description, return_tensors=\"pt\").input_ids.to(device)\n", "prompt_input_ids = tokenizer(text_prompt, return_tensors=\"pt\").input_ids.to(device)\n", "\n", "# Generate audio\n", "generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)\n", "audio_arr = generation.cpu().numpy().squeeze()\n", "\n", "# Play audio in notebook\n", "ipd.Audio(audio_arr, rate=model.config.sampling_rate)" ] }, { "cell_type": "code", "execution_count": 7, "id": "a20730f0-13dd-48b4-80b6-7c6ef05a0cc4", "metadata": {}, "outputs": [], "source": [ "voice_preset = \"v2/en_speaker_6\"  # Bark speaker preset\n", "sampling_rate = 24000  # Bark's native output rate" ] },
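{ "cell_type": "markdown", "id": "b2c3d4e5-f6a7-4b8c-9d0e-1f2a3b4c5d6e", "metadata": {}, "source": [ "Heads-up: the next cell fails with a `ValueError`. `model` was re-assigned to the Parler-TTS checkpoint above, and Parler-TTS's `generate` rejects the Bark-specific `semantic_temperature` kwarg as well as the `history_prompt` that the Bark voice preset adds to `inputs`. A minimal fix is sketched below; the jump in execution counts and the voice-preset re-download further down suggest the Bark cells were simply re-run before the later, working cell.\n" ] }, { "cell_type": "code", "execution_count": null, "id": "c3d4e5f6-a7b8-4c9d-0e1f-2a3b4c5d6e7f", "metadata": {}, "outputs": [], "source": [ "# Restore `model` to Bark before running the Bark-style generate calls below\n", "model = BarkModel.from_pretrained(\"suno/bark\", torch_dtype=torch.float16).to(device)" ] }, { "cell_type": "code", "execution_count": 8, "id": "5986510c-4a09-4c24-9344-c98fa16947d9", "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "The following `model_kwargs` are not used by the model: ['history_prompt', 'semantic_temperature'] (note: typos in the generate arguments will also show up in this list)", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[8], line 6\u001b[0m\n\u001b[1;32m 1\u001b[0m text_prompt \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;124mExactly! 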
[sigh] And the distillation part is where you take a LARGE-model,and compress-it down into a smaller, more efficient model that can run on devices with limited resources.\u001b[39m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;124m\"\"\"\u001b[39m\n\u001b[1;32m 4\u001b[0m inputs \u001b[38;5;241m=\u001b[39m processor(text_prompt, voice_preset\u001b[38;5;241m=\u001b[39mvoice_preset)\u001b[38;5;241m.\u001b[39mto(device)\n\u001b[0;32m----> 6\u001b[0m speech_output \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mgenerate(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39minputs, temperature \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.9\u001b[39m, semantic_temperature \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.8\u001b[39m)\n\u001b[1;32m 7\u001b[0m Audio(speech_output[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mcpu()\u001b[38;5;241m.\u001b[39mnumpy(), rate\u001b[38;5;241m=\u001b[39msampling_rate)\n", "File \u001b[0;32m~/.conda/envs/final-checking-meta/lib/python3.11/site-packages/torch/utils/_contextlib.py:116\u001b[0m, in \u001b[0;36mcontext_decorator..decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[0;32m--> 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", "File \u001b[0;32m~/.conda/envs/final-checking-meta/lib/python3.11/site-packages/parler_tts/modeling_parler_tts.py:3268\u001b[0m, in \u001b[0;36mParlerTTSForConditionalGeneration.generate\u001b[0;34m(self, inputs, generation_config, logits_processor, stopping_criteria, synced_gpus, streamer, **kwargs)\u001b[0m\n\u001b[1;32m 3266\u001b[0m model_kwargs \u001b[38;5;241m=\u001b[39m generation_config\u001b[38;5;241m.\u001b[39mupdate(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# All unused kwargs must be model kwargs\u001b[39;00m\n\u001b[1;32m 3267\u001b[0m generation_config\u001b[38;5;241m.\u001b[39mvalidate()\n\u001b[0;32m-> 3268\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_validate_model_kwargs(model_kwargs\u001b[38;5;241m.\u001b[39mcopy())\n\u001b[1;32m 3270\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m model_kwargs\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mencoder_outputs\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mtype\u001b[39m(model_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mencoder_outputs\u001b[39m\u001b[38;5;124m\"\u001b[39m]) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mtuple\u001b[39m:\n\u001b[1;32m 3271\u001b[0m \u001b[38;5;66;03m# wrap the unconditional outputs as a BaseModelOutput for compatibility with the rest of generate\u001b[39;00m\n\u001b[1;32m 3272\u001b[0m model_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mencoder_outputs\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m 
BaseModelOutput(last_hidden_state\u001b[38;5;241m=\u001b[39mmodel_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mencoder_outputs\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;241m0\u001b[39m])\n", "File \u001b[0;32m~/.conda/envs/final-checking-meta/lib/python3.11/site-packages/transformers/generation/utils.py:1248\u001b[0m, in \u001b[0;36mGenerationMixin._validate_model_kwargs\u001b[0;34m(self, model_kwargs)\u001b[0m\n\u001b[1;32m 1245\u001b[0m unused_model_args\u001b[38;5;241m.\u001b[39mappend(key)\n\u001b[1;32m 1247\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m unused_model_args:\n\u001b[0;32m-> 1248\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1249\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe following `model_kwargs` are not used by the model: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00munused_model_args\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m (note: typos in the\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1250\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m generate arguments will also show up in this list)\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1251\u001b[0m )\n", "\u001b[0;31mValueError\u001b[0m: The following `model_kwargs` are not used by the model: ['history_prompt', 'semantic_temperature'] (note: typos in the generate arguments will also show up in this list)" ] } ], "source": [ "# NOTE: this cell was run while `model` still held the Parler-TTS checkpoint,\n", "# which is why it raises the ValueError above; see the fix sketched before this cell.\n", "text_prompt = \"\"\"\n", "Exactly! [sigh] And the distillation part is where you take a LARGE-model,and compress-it down into a smaller, more efficient model that can run on devices with limited resources.\n", "\"\"\"\n", "inputs = processor(text_prompt, voice_preset=voice_preset).to(device)\n", "\n", "speech_output = model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)\n", "Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)" ] }, { "cell_type": "code", "execution_count": 28, "id": "cd142a94-3f24-4101-ac76-40306cd3fbcd", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "45aae13d50c64557b2a3e789306d7be0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "en_speaker_1_semantic_prompt.npy: 0%| | 0.00/2.57k [00:00<?, ?B/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", " <audio controls>\n", " Your browser does not support the audio element.\n", " </audio>\n", " " ], "text/plain": [ "<IPython.lib.display.Audio object>" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Works once the Bark model has been re-loaded (note the voice-preset download above)\n", "text_prompt = \"\"\"\n", "Exactly! [gasps] And the distillation part is where you take a LARGE-model,and compress-it down into a smaller, more efficient model that can run on devices with limited resources.\n", "\"\"\"\n", "inputs = processor(text_prompt, voice_preset=voice_preset).to(device)\n", "\n", "speech_output = model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)\n", "Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)" ] }, { "cell_type": "markdown", "id": "23226002-e444-4098-add3-4752e5832669", "metadata": {}, "source": [ "## Example extract workflow: generating a full podcast from a transcript" ] }, { "cell_type": "code", "execution_count": 1, "id": "1939e969-f41f-45bb-89d2-d81cb69609e6", "metadata": {}, "outputs": [], "source": [ "PODCAST_TEXT = [\n", " (\"Speaker 1\", \"Welcome to our podcast, where we explore the latest advancements in AI and technology. I'm your host, and today we're joined by a renowned expert in the field of AI. We're going to dive into the exciting world of Llama 3.2, the latest release from Meta AI.\"),\n", " (\"Speaker 2\", \"Hi, I'm excited to be here! 
So, what is Llama 3.2?\"),\n", " (\"Speaker 1\", \"Ah, great question! Llama 3.2 is an open-source AI model that allows developers to fine-tune, distill, and deploy AI models anywhere. It's a significant update from the previous version, with improved performance, efficiency, and customization options.\"),\n", " (\"Speaker 2\", \"That sounds amazing! What are some of the key features of Llama 3.2?\"),\n", " (\"Speaker 1\", \"Well, one of the major updates is the introduction of multimodal models that can handle both text and image inputs. This opens up a wide range of applications, from image understanding to visual reasoning.\"),\n", " (\"Speaker 2\", \"Hmm, I'm intrigued. Can you give me an example of how this could be used in real life?\"),\n", " (\"Speaker 1\", \"Sure thing! Imagine you're developing an AI-powered virtual assistant that can understand and respond to voice commands, as well as recognize and interact with objects in the physical world.\"),\n", " (\"Speaker 2\", \"Wow, that sounds like science fiction! But what about the technical details? How does Llama 3.2 achieve this level of performance?\"),\n", " (\"Speaker 1\", \"Ah, great question! Llama 3.2 uses a combination of techniques, including instruction-tuned benchmarks, vision instruction-tuned benchmarks, and category-specific benchmarks.\"),\n", " (\"Speaker 2\", \"Okay, let's dive deeper into the technical details. Can you explain how the instruction-tuned benchmarks work?\"),\n", " (\"Speaker 1\", \"Sure thing! The instruction-tuned benchmarks are designed to evaluate the model's ability to follow instructions and complete tasks. This is done by fine-tuning the model on a specific task, such as language translation or question-answering.\"),\n", " (\"Speaker 2\", \"I see. And what about the vision instruction-tuned benchmarks?\"),\n", " (\"Speaker 1\", \"Ah, those are designed to evaluate the model's ability to understand and interact with visual data. This includes tasks such as image classification, object detection, and visual reasoning.\"),\n", " (\"Speaker 2\", \"Okay, got it. And what about the category-specific benchmarks?\"),\n", " (\"Speaker 1\", \"Those are designed to evaluate the model's performance on specific tasks or categories, such as math, reasoning, or tool use.\"),\n", " (\"Speaker 2\", \"Hmm, I'm curious. Can you show me some examples of how Llama 3.2 performs on these benchmarks?\"),\n", " (\"Speaker 1\", \"Sure thing! Let me pull up some results. (pause) Okay, so on the MMLU benchmark, Llama 3.2 achieves a score of 63.4, outperforming the previous state-of-the-art model.\"),\n", " (\"Speaker 2\", \"Wow, those results are impressive! But what about real-world applications? How is Llama 3.2 being used in industry and academia?\"),\n", " (\"Speaker 1\", \"Ah, great question! Llama 3.2 is being used in a wide range of applications, from virtual assistants to medical diagnosis. We're also seeing partnerships with major companies, such as ARM, MediaTek, and Qualcomm.\"),\n", " (\"Speaker 2\", \"That's amazing! Can you tell me more about these partnerships?\"),\n", " (\"Speaker 1\", \"Sure thing! These partnerships are enabling the development of edge AI and vision applications, such as smart home devices, autonomous vehicles, and medical imaging.\"),\n", " (\"Speaker 2\", \"Hmm, I'm intrigued. Can you give me an example of how Llama 3.2 is being used in one of these applications?\"),\n", " (\"Speaker 1\", \"Sure thing! Let me tell you about a project we're working on with DoorDash. 
They're using Llama 3.2 to develop an AI-powered chatbot that can understand and respond to customer inquiries.\"),\n", " (\"Speaker 2\", \"Wow, that's amazing! I think we've covered a lot of ground today. Can you summarize the key points for our listeners?\"),\n", " (\"Speaker 1\", \"Sure thing! Llama 3.2 is a significant update from the previous version, with improved performance, efficiency, and customization options. We're seeing a wide range of applications, from virtual assistants to medical diagnosis, and partnerships with major companies.\"),\n", " (\"Speaker 2\", \"Hmm, I'm excited to see where this technology will take us. Thanks for joining me today!\"),\n", " (\"Speaker 1\", \"Thanks for having me!\"),\n", "]" ] }, { "cell_type": "code", "execution_count": 2, "id": "e92461d1-7f2b-447f-88f3-33074237872c", "metadata": {}, "outputs": [], "source": [ "speaker1_voice = \"v2/en_speaker_6\"\n", "speaker1_segments = []" ] }, { "cell_type": "code", "execution_count": 51, "id": "30cb5bce-0db0-4756-976d-07ee6e4decc9", "metadata": {}, "outputs": [], "source": [ "speaker2_voice = \"v2/en_speaker_9\"\n", "speaker2_segments = []" ] }, { "cell_type": "code", "execution_count": 52, "id": "730dccf5-038c-489b-b732-3f737161649d", "metadata": {}, "outputs": [], "source": [ "generated_segments = []" ] }, { "cell_type": "code", "execution_count": 53, "id": "b6374725-2083-4ff2-8bd2-36b71f466cea", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Welcome to our podcast, where we explore the lates...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: Hi, I'm excited to be here! So, what is Llama 3.2?...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Ah, great question! Llama 3.2 is an open-source AI...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: That sounds amazing! What are some of the key feat...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. 
Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Well, one of the major updates is the introduction...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: Hmm, I'm intrigued. Can you give me an example of ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Sure thing! Imagine you're developing an AI-powere...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: Wow, that sounds like science fiction! But what ab...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Ah, great question! Llama 3.2 uses a combination o...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: Okay, let's dive deeper into the technical details...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. 
Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Sure thing! The instruction-tuned benchmarks are d...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: I see. And what about the vision instruction-tuned...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Ah, those are designed to evaluate the model's abi...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: Okay, got it. And what about the category-specific...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Those are designed to evaluate the model's perform...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: Hmm, I'm curious. Can you show me some examples of...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. 
Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Sure thing! Let me pull up some results. (pause) O...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: Wow, those results are impressive! But what about ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Ah, great question! Llama 3.2 is being used in a w...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: That's amazing! Can you tell me more about these p...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Sure thing! These partnerships are enabling the de...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: Hmm, I'm intrigued. Can you give me an example of ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. 
Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Sure thing! Let me tell you about a project we're ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: Wow, that's amazing! I think we've covered a lot o...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Sure thing! Llama 3.2 is a significant update from...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: Hmm, I'm excited to see where this technology will...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Thanks for having me!...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. 
Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from tqdm import tqdm  # progress bar over the podcast segments\n", "\n", "for speaker, text in tqdm(PODCAST_TEXT, desc=\"Generating podcast segments\", unit=\"segment\"):\n", "    print(f\"\\nGenerating for {speaker}: {text[:50]}...\")\n", "\n", "    # Pick voice based on speaker (presets defined above)\n", "    voice = speaker1_voice if speaker == \"Speaker 1\" else speaker2_voice\n", "\n", "    # Generate with Bark\n", "    inputs = processor(text, voice_preset=voice).to(device)\n", "    audio = model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)\n", "\n", "    # Add to our list\n", "    generated_segments.append(audio[0].cpu().numpy())\n", "\n", "    # Optional: Play as we go (you might want to comment this out for large conversations)\n", "    display(Audio(audio[0].cpu().numpy(), rate=sampling_rate))" ] }, { "cell_type": "code", "execution_count": 56, "id": "192b8661-f0e2-4590-b236-735d52175d53", "metadata": {}, "outputs": [], "source": [ "# Concatenate all generated segments into a single waveform\n", "final_podcast = np.concatenate(generated_segments)" ] }, { "cell_type": "code", "execution_count": 63, "id": "88f66045-ab4b-4b9f-aa05-17d6d597ddca", "metadata": {}, "outputs": [], "source": [ "# scipy's wavfile writer expects float32 (or integer) samples\n", "final_podcast_32 = final_podcast.astype(np.float32)" ] }, { "cell_type": "code", "execution_count": 65, "id": "7620b2eb-d196-4a82-a6c1-a159df18be2b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from scipy.io import wavfile\n", "wavfile.write(\"podcast.wav\", sampling_rate, final_podcast_32)\n", "\n", "# Play the whole thing\n", "Audio(final_podcast, rate=sampling_rate)" ] }, { "cell_type": "code", "execution_count": 59, "id": "ab200e44-bd68-4768-8635-87c54b341313", "metadata": {}, "outputs": [], "source": [ "from pydub import AudioSegment\n", "import io" ] }, { "cell_type": "code", "execution_count": 67, "id": "c3f8a1ae-cfd1-414d-b64c-f6e2803e6e68", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Scale float audio to 16-bit PCM, clipping to [-1, 1] to avoid int16 overflow\n", "audio_int16 = (np.clip(final_podcast, -1.0, 1.0) * 32767).astype(np.int16)\n", "\n", "# Convert to AudioSegment\n", "byte_io = io.BytesIO()\n", "wavfile.write(byte_io, sampling_rate, audio_int16)\n", "byte_io.seek(0)\n", "audio_segment = AudioSegment.from_wav(byte_io)\n", "\n", "# Export as MP3 with good quality\n", "audio_segment.export(\"podcast.mp3\",\n", "                     format=\"mp3\",\n", "                     bitrate=\"192k\",  # Adjust bitrate as needed (128k, 192k, 256k, 320k)\n", "                     parameters=[\"-q:a\", \"0\"])  # Highest quality\n", "\n", "# Play the result\n", "Audio(\"podcast.mp3\")" ] }, { "cell_type": "code", "execution_count": null, "id": "061ad7c9-3810-41d8-9029-bc2d79558c21", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.10" } }, "nbformat": 4, "nbformat_minor": 5 }