{ "cells": [ { "cell_type": "markdown", "id": "be20fda2-409e-4d86-b502-33aee1a73151", "metadata": {}, "source": [ "## Suno Demo\n", "\n", "Copy-Pasted from: https://colab.research.google.com/drive/1dWWkZzvu7L9Bunq9zvD-W02RFUXoW-Pd?usp=sharing#scrollTo=68QtoUqPWdLk\n" ] }, { "cell_type": "code", "execution_count": 1, "id": "3ee4811a-50a1-4030-8312-54fccddc221b", "metadata": {}, "outputs": [], "source": [ "#!pip3 install optimum\n", "#!pip install -U flash-attn --no-build-isolation" ] }, { "cell_type": "code", "execution_count": 5, "id": "89d75859-e0f9-40e3-931d-64aa3d273f49", "metadata": {}, "outputs": [], "source": [ "from IPython.display import Audio" ] }, { "cell_type": "code", "execution_count": 55, "id": "6db968a6-c486-44ed-8ae0-478c0143da88", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/sanyambhutani/.conda/envs/final-checking-meta/lib/python3.12/site-packages/transformers/models/encodec/modeling_encodec.py:124: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", " self.register_buffer(\"padding_total\", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)\n" ] } ], "source": [ "from transformers import BarkModel, AutoProcessor\n", "import torch\n", "import json\n", "import numpy as np\n", "\n", "# Preferred GPU for this demo. Fall back to GPU 0, or to the CPU, when that\n", "# index does not exist on the current machine.\n", "PREFERRED_GPU_INDEX = 3\n", "if torch.cuda.is_available():\n", "    gpu_index = PREFERRED_GPU_INDEX if PREFERRED_GPU_INDEX < torch.cuda.device_count() else 0\n", "    device = f\"cuda:{gpu_index}\"\n", "else:\n", "    device = \"cpu\"\n", "\n", "# Half precision saves GPU memory; use float32 on the CPU, where float16 support is limited.\n", "model_dtype = torch.float16 if device != \"cpu\" else torch.float32\n", "\n", "processor = AutoProcessor.from_pretrained(\"suno/bark\")\n", "\n", "# Optional speed-ups that need the extra packages from the install cell above:\n", "#model = BarkModel.from_pretrained(\"suno/bark\", torch_dtype=torch.float16, attn_implementation=\"flash_attention_2\").to(device)\n", "#model = model.to_bettertransformer()\n", "model = BarkModel.from_pretrained(\"suno/bark\", torch_dtype=model_dtype).to(device)" ] }, { "cell_type": "code", "execution_count": 11, "id": "a20730f0-13dd-48b4-80b6-7c6ef05a0cc4", "metadata": {}, "outputs": [], "source": [ "voice_preset = 
\"v2/en_speaker_6\"\n", "sampling_rate = 24000" ] }, { "cell_type": "code", "execution_count": 25, "id": "5986510c-4a09-4c24-9344-c98fa16947d9", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text_prompt = \"\"\"\n", "Exactly! [sigh] And the distillation part is where you take a LARGE-model,and compress-it down into a smaller, more efficient model that can run on devices with limited resources.\n", "\"\"\"\n", "inputs = processor(text_prompt, voice_preset=voice_preset).to(device)\n", "\n", "speech_output = model.generate(**inputs, temperature = 0.9, semantic_temperature = 0.8)\n", "Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)" ] }, { "cell_type": "code", "execution_count": 27, "id": "f6728a29-146b-42c6-b7de-8a55fd79e6ca", "metadata": {}, "outputs": [], "source": [ "voice_preset = \"v2/en_speaker_9\"\n", "sampling_rate = 24000" ] }, { "cell_type": "code", "execution_count": 28, "id": "cd142a94-3f24-4101-ac76-40306cd3fbcd", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "45aae13d50c64557b2a3e789306d7be0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "en_speaker_1_semantic_prompt.npy: 0%| | 0.00/2.57k [00:00\n", " \n", " Your browser does not support the audio element.\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "text_prompt = \"\"\"\n", "Exactly! 
[gasps] And the distillation part is where you take a LARGE-model,and compress-it down into a smaller, more efficient model that can run on devices with limited resources.\n", "\"\"\"\n", "inputs = processor(text_prompt, voice_preset=voice_preset).to(device)\n", "\n", "speech_output = model.generate(**inputs, temperature = 0.9, semantic_temperature = 0.8)\n", "Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)" ] }, { "cell_type": "markdown", "id": "23226002-e444-4098-add3-4752e5832669", "metadata": {}, "source": [ "## Example extract workflow testing" ] }, { "cell_type": "code", "execution_count": 48, "id": "1939e969-f41f-45bb-89d2-d81cb69609e6", "metadata": {}, "outputs": [], "source": [ "PODCAST_TEXT = [\n", " (\"Speaker 1\", \"Welcome to our podcast, where we explore the latest advancements in AI and technology. I'm your host, and today we're joined by a renowned expert in the field of AI. We're going to dive into the exciting world of Llama 3.2, the latest release from Meta AI.\"),\n", " (\"Speaker 2\", \"Hi, I'm excited to be here! So, what is Llama 3.2?\"),\n", " (\"Speaker 1\", \"Ah, great question! Llama 3.2 is an open-source AI model that allows developers to fine-tune, distill, and deploy AI models anywhere. It's a significant update from the previous version, with improved performance, efficiency, and customization options.\"),\n", " (\"Speaker 2\", \"That sounds amazing! What are some of the key features of Llama 3.2?\"),\n", " (\"Speaker 1\", \"Well, one of the major updates is the introduction of multimodal models that can handle both text and image inputs. This opens up a wide range of applications, from image understanding to visual reasoning.\"),\n", " (\"Speaker 2\", \"Hmm, I'm intrigued. Can you give me an example of how this could be used in real life?\"),\n", " (\"Speaker 1\", \"Sure thing! 
Imagine you're developing an AI-powered virtual assistant that can understand and respond to voice commands, as well as recognize and interact with objects in the physical world.\"),\n", " (\"Speaker 2\", \"Wow, that sounds like science fiction! But what about the technical details? How does Llama 3.2 achieve this level of performance?\"),\n", " (\"Speaker 1\", \"Ah, great question! Llama 3.2 uses a combination of techniques, including instruction-tuned benchmarks, vision instruction-tuned benchmarks, and category-specific benchmarks.\"),\n", " (\"Speaker 2\", \"Okay, let's dive deeper into the technical details. Can you explain how the instruction-tuned benchmarks work?\"),\n", " (\"Speaker 1\", \"Sure thing! The instruction-tuned benchmarks are designed to evaluate the model's ability to follow instructions and complete tasks. This is done by fine-tuning the model on a specific task, such as language translation or question-answering.\"),\n", " (\"Speaker 2\", \"I see. And what about the vision instruction-tuned benchmarks?\"),\n", " (\"Speaker 1\", \"Ah, those are designed to evaluate the model's ability to understand and interact with visual data. This includes tasks such as image classification, object detection, and visual reasoning.\"),\n", " (\"Speaker 2\", \"Okay, got it. And what about the category-specific benchmarks?\"),\n", " (\"Speaker 1\", \"Those are designed to evaluate the model's performance on specific tasks or categories, such as math, reasoning, or tool use.\"),\n", " (\"Speaker 2\", \"Hmm, I'm curious. Can you show me some examples of how Llama 3.2 performs on these benchmarks?\"),\n", " (\"Speaker 1\", \"Sure thing! Let me pull up some results. (pause) Okay, so on the MMLU benchmark, Llama 3.2 achieves a score of 63.4, outperforming the previous state-of-the-art model.\"),\n", " (\"Speaker 2\", \"Wow, those results are impressive! But what about real-world applications? 
How is Llama 3.2 being used in industry and academia?\"),\n", " (\"Speaker 1\", \"Ah, great question! Llama 3.2 is being used in a wide range of applications, from virtual assistants to medical diagnosis. We're also seeing partnerships with major companies, such as ARM, MediaTek, and Qualcomm.\"),\n", " (\"Speaker 2\", \"That's amazing! Can you tell me more about these partnerships?\"),\n", " (\"Speaker 1\", \"Sure thing! These partnerships are enabling the development of edge AI and vision applications, such as smart home devices, autonomous vehicles, and medical imaging.\"),\n", " (\"Speaker 2\", \"Hmm, I'm intrigued. Can you give me an example of how Llama 3.2 is being used in one of these applications?\"),\n", " (\"Speaker 1\", \"Sure thing! Let me tell you about a project we're working on with DoorDash. They're using Llama 3.2 to develop an AI-powered chatbot that can understand and respond to customer inquiries.\"),\n", " (\"Speaker 2\", \"Wow, that's amazing! I think we've covered a lot of ground today. Can you summarize the key points for our listeners?\"),\n", " (\"Speaker 1\", \"Sure thing! Llama 3.2 is a significant update from the previous version, with improved performance, efficiency, and customization options. We're seeing a wide range of applications, from virtual assistants to medical diagnosis, and partnerships with major companies.\"),\n", " (\"Speaker 2\", \"Hmm, I'm excited to see where this technology will take us. 
Thanks for joining me today!\"),\n", " (\"Speaker 1\", \"Thanks for having me!\"),\n", "] " ] }, { "cell_type": "code", "execution_count": 50, "id": "e92461d1-7f2b-447f-88f3-33074237872c", "metadata": {}, "outputs": [], "source": [ "speaker1_voice = \"v2/en_speaker_6\"\n", "speaker1_segments = []" ] }, { "cell_type": "code", "execution_count": 51, "id": "30cb5bce-0db0-4756-976d-07ee6e4decc9", "metadata": {}, "outputs": [], "source": [ "speaker2_voice = \"v2/en_speaker_9\"\n", "speaker2_segments = []" ] }, { "cell_type": "code", "execution_count": 52, "id": "730dccf5-038c-489b-b732-3f737161649d", "metadata": {}, "outputs": [], "source": [ "generated_segments = []" ] }, { "cell_type": "code", "execution_count": 53, "id": "b6374725-2083-4ff2-8bd2-36b71f466cea", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Welcome to our podcast, where we explore the lates...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: Hi, I'm excited to be here! So, what is Llama 3.2?...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. 
Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Ah, great question! Llama 3.2 is an open-source AI...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: That sounds amazing! What are some of the key feat...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Well, one of the major updates is the introduction...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. 
Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: Hmm, I'm intrigued. Can you give me an example of ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Sure thing! Imagine you're developing an AI-powere...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: Wow, that sounds like science fiction! But what ab...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. 
Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Ah, great question! Llama 3.2 uses a combination o...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: Okay, let's dive deeper into the technical details...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Sure thing! The instruction-tuned benchmarks are d...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. 
Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: I see. And what about the vision instruction-tuned...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Ah, those are designed to evaluate the model's abi...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: Okay, got it. And what about the category-specific...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. 
Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Those are designed to evaluate the model's perform...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: Hmm, I'm curious. Can you show me some examples of...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Sure thing! Let me pull up some results. (pause) O...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. 
Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: Wow, those results are impressive! But what about ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Ah, great question! Llama 3.2 is being used in a w...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: That's amazing! Can you tell me more about these p...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. 
Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Sure thing! These partnerships are enabling the de...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: Hmm, I'm intrigued. Can you give me an example of ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Sure thing! Let me tell you about a project we're ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. 
Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: Wow, that's amazing! I think we've covered a lot o...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Sure thing! Llama 3.2 is a significant update from...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 2: Hmm, I'm excited to see where this technology will...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. 
Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Generating for Speaker 1: Thanks for having me!...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] }, { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from tqdm.auto import tqdm\n", "\n", "# Start from an empty list so that re-running this cell does not duplicate segments.\n", "generated_segments = []\n", "\n", "# PODCAST_TEXT is the list of (speaker, text) pairs defined earlier in this notebook.\n", "for speaker, text in tqdm(PODCAST_TEXT, desc=\"Generating podcast segments\", unit=\"segment\"):\n", "    # Pick voice based on speaker\n", "    voice = \"v2/en_speaker_6\" if speaker == \"Speaker 1\" else \"v2/en_speaker_9\"\n", "\n", "    # Generate\n", "    inputs = processor(text, voice_preset=voice).to(device)\n", "    audio = model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)\n", "\n", "    # Copy the waveform to host memory once and reuse it for storage and playback\n", "    segment = audio[0].cpu().numpy()\n", "    generated_segments.append(segment)\n", "\n", "    # Optional: Play as we go (you might want to comment this out for large conversations)\n", "    display(Audio(segment, rate=sampling_rate))" ] }, { "cell_type": "code", "execution_count": 56, "id": "192b8661-f0e2-4590-b236-735d52175d53", "metadata": {}, "outputs": [], "source": [ "# Stick it all together at the end\n", "final_podcast = np.concatenate(generated_segments)" ] }, { "cell_type": "code", "execution_count": 63, "id": "88f66045-ab4b-4b9f-aa05-17d6d597ddca", "metadata": {}, "outputs": [], "source": [ "final_podcast_32 = final_podcast.astype(np.float32)" ] }, { "cell_type": "code", "execution_count": 65, "id": 
"7620b2eb-d196-4a82-a6c1-a159df18be2b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from scipy.io import wavfile\n", "wavfile.write(\"podcast.wav\", sampling_rate, final_podcast_32)\n", "\n", "# Play the whole thing\n", "Audio(final_podcast, rate=sampling_rate)" ] }, { "cell_type": "code", "execution_count": 59, "id": "ab200e44-bd68-4768-8635-87c54b341313", "metadata": {}, "outputs": [], "source": [ "from pydub import AudioSegment\n", "import io" ] }, { "cell_type": "code", "execution_count": 67, "id": "c3f8a1ae-cfd1-414d-b64c-f6e2803e6e68", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " " ], "text/plain": [ "" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Scale to 16-bit PCM in float32. If the waveform is float16 (the dtype the model is\n", "# loaded with on GPU), 32767 is not representable there (it rounds to 32768), so a\n", "# full-scale sample would overflow int16 and loud samples would be coarsely quantized.\n", "# Clipping guards against samples slightly outside [-1, 1].\n", "pcm_float = np.clip(final_podcast.astype(np.float32), -1.0, 1.0)\n", "audio_int16 = (pcm_float * 32767).astype(np.int16)\n", "\n", "# Convert to AudioSegment via an in-memory WAV file\n", "byte_io = io.BytesIO()\n", "wavfile.write(byte_io, sampling_rate, audio_int16)\n", "byte_io.seek(0)\n", "audio_segment = AudioSegment.from_wav(byte_io)\n", "\n", "# Export as MP3 with good quality\n", "audio_segment.export(\"podcast.mp3\", \n", "                     format=\"mp3\", \n", "                     bitrate=\"192k\",  # Adjust bitrate as needed (128k, 192k, 256k, 320k)\n", "                     parameters=[\"-q:a\", \"0\"])  # Highest quality\n", "\n", "# Play the result\n", "Audio(\"podcast.mp3\")" ] }, { "cell_type": "code", "execution_count": null, "id": "061ad7c9-3810-41d8-9029-bc2d79558c21", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.5" } }, "nbformat": 4, "nbformat_minor": 5 }