{
"cells": [
{
"cell_type": "markdown",
"id": "c31c0e37",
"metadata": {},
"source": [
"## Notebook 4: TTS Workflow"
]
},
{
"cell_type": "markdown",
"id": "be20fda2-409e-4d86-b502-33aee1a73151",
"metadata": {},
"source": [
"\n",
"\n",
"Copy-Pasted from: https://colab.research.google.com/drive/1dWWkZzvu7L9Bunq9zvD-W02RFUXoW-Pd?usp=sharing#scrollTo=68QtoUqPWdLk\n"
]
},
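  {
   "cell_type": "markdown",
   "id": "overview-note-md",
   "metadata": {},
   "source": [
    "This notebook first sanity-checks two TTS models (Suno's Bark and Parler-TTS Mini) on individual lines, then renders the two-speaker transcript produced by the previous notebook into a single audio file and exports it as an MP3."
   ]
  },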
{
"cell_type": "code",
"execution_count": 4,
"id": "3ee4811a-50a1-4030-8312-54fccddc221b",
"metadata": {},
"outputs": [],
"source": [
"#!pip3 install optimum\n",
"#!pip install -U flash-attn --no-build-isolation"
]
},
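  {
   "cell_type": "markdown",
   "id": "extra-deps-note-md",
   "metadata": {},
   "source": [
    "The cells below also rely on `parler_tts`, `scipy`, and `pydub` (plus a local `ffmpeg` for the MP3 export at the end). A typical install, kept commented out like the cell above, might look like the following; adjust packages and versions to your environment."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "extra-deps-install",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hypothetical install commands; uncomment and adapt to your setup\n",
    "#!pip install transformers accelerate scipy pydub\n",
    "#!pip install git+https://github.com/huggingface/parler-tts.git"
   ]
  },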
{
"cell_type": "code",
"execution_count": 1,
"id": "89d75859-e0f9-40e3-931d-64aa3d273f49",
"metadata": {},
"outputs": [],
"source": [
"from IPython.display import Audio\n",
"import IPython.display as ipd\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f442758d-c48f-48ac-a4b0-558695290aa9",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Flash attention 2 is not installed\n"
]
}
],
"source": [
"from transformers import BarkModel, AutoProcessor, AutoTokenizer\n",
"import torch\n",
"import json\n",
"import numpy as np\n",
"from parler_tts import ParlerTTSForConditionalGeneration"
]
},
{
"cell_type": "markdown",
"id": "31ba1903-59c8-4004-bb39-1761cd3d140e",
"metadata": {},
"source": [
"### Testing the workflow"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "6db968a6-c486-44ed-8ae0-478c0143da88",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Flash attention 2 is not installed\n",
"/home/sanyambhutani/.conda/envs/final-checking-meta/lib/python3.11/site-packages/torch/nn/utils/weight_norm.py:143: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.\n",
" WeightNorm.apply(module, name, dim)\n",
"/home/sanyambhutani/.conda/envs/final-checking-meta/lib/python3.11/site-packages/transformers/models/encodec/modeling_encodec.py:120: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
" self.register_buffer(\"padding_total\", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)\n"
]
}
],
"source": [
"device = \"cuda:3\"\n",
"\n",
"processor = AutoProcessor.from_pretrained(\"suno/bark\")\n",
"\n",
"#model = model.to_bettertransformer()\n",
"#model = BarkModel.from_pretrained(\"suno/bark\", torch_dtype=torch.float16, attn_implementation=\"flash_attention_2\").to(device)\n",
"model = BarkModel.from_pretrained(\"suno/bark\", torch_dtype=torch.float16).to(device)#.to_bettertransformer()"
]
},
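  {
   "cell_type": "markdown",
   "id": "bark-offload-note-md",
   "metadata": {},
   "source": [
    "Bark is loaded in float16 on a single GPU here. If VRAM is tight, recent `transformers` releases also expose `enable_cpu_offload()` on `BarkModel` (it requires `accelerate`); the optional call is sketched below and is not needed for the rest of the notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bark-offload-optional",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: offload idle Bark sub-models to CPU to save GPU memory\n",
    "# (assumes a recent transformers version with accelerate installed)\n",
    "# model.enable_cpu_offload()"
   ]
  },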
{
"cell_type": "code",
"execution_count": 3,
"id": "4e84ed3f-336b-4f45-b098-ce477929fa8a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Set up device\n",
"device = \"cuda:4\" if torch.cuda.is_available() else \"cpu\"\n",
"\n",
"# Load model and tokenizer\n",
"model = ParlerTTSForConditionalGeneration.from_pretrained(\"parler-tts/parler-tts-mini-v1\").to(device)\n",
"tokenizer = AutoTokenizer.from_pretrained(\"parler-tts/parler-tts-mini-v1\")\n",
"\n",
"# Define text and description\n",
"text_prompt = \"\"\n",
"description = \"\"\"\n",
"Laura's voice is expressive and dramatic in delivery, speaking at a fast pace with a very close recording that almost has no background noise.\n",
"\"\"\"\n",
"# Tokenize inputs\n",
"input_ids = tokenizer(description, return_tensors=\"pt\").input_ids.to(device)\n",
"prompt_input_ids = tokenizer(text_prompt, return_tensors=\"pt\").input_ids.to(device)\n",
"\n",
"# Generate audio\n",
"generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)\n",
"audio_arr = generation.cpu().numpy().squeeze()\n",
"\n",
"# Play audio in notebook\n",
"ipd.Audio(audio_arr, rate=model.config.sampling_rate)"
]
},
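  {
   "cell_type": "markdown",
   "id": "parler-inputs-note-md",
   "metadata": {},
   "source": [
    "In the Parler-TTS cell above, the `description` string controls the voice characteristics while `text_prompt` is the text that is actually spoken; the prompt is left blank here, so fill it in with the line you want synthesized."
   ]
  },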
{
"cell_type": "code",
"execution_count": null,
"id": "a20730f0-13dd-48b4-80b6-7c6ef05a0cc4",
"metadata": {},
"outputs": [],
"source": [
"voice_preset = \"v2/en_speaker_6\"\n",
"sampling_rate = 24000"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5986510c-4a09-4c24-9344-c98fa16947d9",
"metadata": {},
"outputs": [],
"source": [
"text_prompt = \"\"\"\n",
"Exactly! [sigh] And the distillation part is where you take a LARGE-model,and compress-it down into a smaller, more efficient model that can run on devices with limited resources.\n",
"\"\"\"\n",
"inputs = processor(text_prompt, voice_preset=voice_preset).to(device)\n",
"\n",
"speech_output = model.generate(**inputs, temperature = 0.9, semantic_temperature = 0.8)\n",
"Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cd142a94-3f24-4101-ac76-40306cd3fbcd",
"metadata": {},
"outputs": [],
"source": [
"text_prompt = \"\"\"\n",
"Exactly! [gasps] And the distillation part is where you take a LARGE-model,and compress-it down into a smaller, more efficient model that can run on devices with limited resources.\n",
"\"\"\"\n",
"inputs = processor(text_prompt, voice_preset=voice_preset).to(device)\n",
"\n",
"speech_output = model.generate(**inputs, temperature = 0.9, semantic_temperature = 0.8)\n",
"Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "markdown",
"id": "23226002-e444-4098-add3-4752e5832669",
"metadata": {},
"source": [
"###Example extract workflow testing"
]
},
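  {
   "cell_type": "markdown",
   "id": "example-transcript-note-md",
   "metadata": {},
   "source": [
    "The hard-coded transcript below mirrors the structure produced by the previous notebook: a list of (speaker, line) tuples that we can iterate over to synthesize each turn."
   ]
  },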
{
"cell_type": "code",
"execution_count": 4,
"id": "1939e969-f41f-45bb-89d2-d81cb69609e6",
"metadata": {},
"outputs": [],
"source": [
"PODCAST_TEXT = [\n",
" (\"Speaker 1\", \"Welcome to our podcast, where we explore the latest advancements in AI and technology. I'm your host, and today we're joined by a renowned expert in the field of AI. We're going to dive into the exciting world of Llama 3.2, the latest release from Meta AI.\"),\n",
" (\"Speaker 2\", \"Hi, I'm excited to be here! So, what is Llama 3.2?\"),\n",
" (\"Speaker 1\", \"Ah, great question! Llama 3.2 is an open-source AI model that allows developers to fine-tune, distill, and deploy AI models anywhere. It's a significant update from the previous version, with improved performance, efficiency, and customization options.\"),\n",
" (\"Speaker 2\", \"That sounds amazing! What are some of the key features of Llama 3.2?\"),\n",
" (\"Speaker 1\", \"Well, one of the major updates is the introduction of multimodal models that can handle both text and image inputs. This opens up a wide range of applications, from image understanding to visual reasoning.\"),\n",
" (\"Speaker 2\", \"Hmm, I'm intrigued. Can you give me an example of how this could be used in real life?\"),\n",
" (\"Speaker 1\", \"Sure thing! Imagine you're developing an AI-powered virtual assistant that can understand and respond to voice commands, as well as recognize and interact with objects in the physical world.\"),\n",
" (\"Speaker 2\", \"Wow, that sounds like science fiction! But what about the technical details? How does Llama 3.2 achieve this level of performance?\"),\n",
" (\"Speaker 1\", \"Ah, great question! Llama 3.2 uses a combination of techniques, including instruction-tuned benchmarks, vision instruction-tuned benchmarks, and category-specific benchmarks.\"),\n",
" (\"Speaker 2\", \"Okay, let's dive deeper into the technical details. Can you explain how the instruction-tuned benchmarks work?\"),\n",
" (\"Speaker 1\", \"Sure thing! The instruction-tuned benchmarks are designed to evaluate the model's ability to follow instructions and complete tasks. This is done by fine-tuning the model on a specific task, such as language translation or question-answering.\"),\n",
" (\"Speaker 2\", \"I see. And what about the vision instruction-tuned benchmarks?\"),\n",
" (\"Speaker 1\", \"Ah, those are designed to evaluate the model's ability to understand and interact with visual data. This includes tasks such as image classification, object detection, and visual reasoning.\"),\n",
" (\"Speaker 2\", \"Okay, got it. And what about the category-specific benchmarks?\"),\n",
" (\"Speaker 1\", \"Those are designed to evaluate the model's performance on specific tasks or categories, such as math, reasoning, or tool use.\"),\n",
" (\"Speaker 2\", \"Hmm, I'm curious. Can you show me some examples of how Llama 3.2 performs on these benchmarks?\"),\n",
" (\"Speaker 1\", \"Sure thing! Let me pull up some results. (pause) Okay, so on the MMLU benchmark, Llama 3.2 achieves a score of 63.4, outperforming the previous state-of-the-art model.\"),\n",
" (\"Speaker 2\", \"Wow, those results are impressive! But what about real-world applications? How is Llama 3.2 being used in industry and academia?\"),\n",
" (\"Speaker 1\", \"Ah, great question! Llama 3.2 is being used in a wide range of applications, from virtual assistants to medical diagnosis. We're also seeing partnerships with major companies, such as ARM, MediaTek, and Qualcomm.\"),\n",
" (\"Speaker 2\", \"That's amazing! Can you tell me more about these partnerships?\"),\n",
" (\"Speaker 1\", \"Sure thing! These partnerships are enabling the development of edge AI and vision applications, such as smart home devices, autonomous vehicles, and medical imaging.\"),\n",
" (\"Speaker 2\", \"Hmm, I'm intrigued. Can you give me an example of how Llama 3.2 is being used in one of these applications?\"),\n",
" (\"Speaker 1\", \"Sure thing! Let me tell you about a project we're working on with DoorDash. They're using Llama 3.2 to develop an AI-powered chatbot that can understand and respond to customer inquiries.\"),\n",
" (\"Speaker 2\", \"Wow, that's amazing! I think we've covered a lot of ground today. Can you summarize the key points for our listeners?\"),\n",
" (\"Speaker 1\", \"Sure thing! Llama 3.2 is a significant update from the previous version, with improved performance, efficiency, and customization options. We're seeing a wide range of applications, from virtual assistants to medical diagnosis, and partnerships with major companies.\"),\n",
" (\"Speaker 2\", \"Hmm, I'm excited to see where this technology will take us. Thanks for joining me today!\"),\n",
" (\"Speaker 1\", \"Thanks for having me!\"),\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e92461d1-7f2b-447f-88f3-33074237872c",
"metadata": {},
"outputs": [],
"source": [
"speaker1_voice = \"v2/en_speaker_6\"\n",
"speaker1_segments = []"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "30cb5bce-0db0-4756-976d-07ee6e4decc9",
"metadata": {},
"outputs": [],
"source": [
"speaker2_voice = \"v2/en_speaker_9\"\n",
"speaker2_segments = []"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "730dccf5-038c-489b-b732-3f737161649d",
"metadata": {},
"outputs": [],
"source": [
"generated_segments = []"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b6374725-2083-4ff2-8bd2-36b71f466cea",
"metadata": {},
"outputs": [],
"source": [
"for speaker, text in tqdm(conversation, desc=\"Generating podcast segments\", unit=\"segment\"):\n",
" # Pick voice based on speaker\n",
" voice = \"v2/en_speaker_6\" if speaker == \"Speaker 1\" else \"v2/en_speaker_9\"\n",
" \n",
" # Generate\n",
" inputs = processor(text, voice_preset=voice).to(device)\n",
" audio = model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)\n",
" \n",
" # Add to our list\n",
" generated_segments.append(audio[0].cpu().numpy())\n",
" \n",
" # Optional: Play as we go (you might want to comment this out for large conversations)\n",
" display(Audio(audio[0].cpu().numpy(), rate=sampling_rate))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "192b8661-f0e2-4590-b236-735d52175d53",
"metadata": {},
"outputs": [],
"source": [
"# Stick it all together at the end\n",
"final_podcast = np.concatenate(generated_segments)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "88f66045-ab4b-4b9f-aa05-17d6d597ddca",
"metadata": {},
"outputs": [],
"source": [
"final_podcast_32 = final_podcast.astype(np.float32)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7620b2eb-d196-4a82-a6c1-a159df18be2b",
"metadata": {},
"outputs": [],
"source": [
"from scipy.io import wavfile\n",
"wavfile.write(\"podcast.wav\", sampling_rate, final_podcast_32)\n",
"\n",
"# Play the whole thing\n",
"Audio(final_podcast, rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ab200e44-bd68-4768-8635-87c54b341313",
"metadata": {},
"outputs": [],
"source": [
"from pydub import AudioSegment\n",
"import io"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c3f8a1ae-cfd1-414d-b64c-f6e2803e6e68",
"metadata": {},
"outputs": [],
"source": [
"audio_int16 = (final_podcast * 32767).astype(np.int16)\n",
"\n",
"# Convert to AudioSegment\n",
"byte_io = io.BytesIO()\n",
"wavfile.write(byte_io, sampling_rate, audio_int16)\n",
"byte_io.seek(0)\n",
"audio_segment = AudioSegment.from_wav(byte_io)\n",
"\n",
"# Export as MP3 with good quality\n",
"audio_segment.export(\"podcast.mp3\", \n",
" format=\"mp3\", \n",
" bitrate=\"192k\", # Adjust bitrate as needed (128k, 192k, 256k, 320k)\n",
" parameters=[\"-q:a\", \"0\"]) # Highest quality\n",
"\n",
"# Play the result\n",
"Audio(\"podcast.mp3\")"
]
},
{
"cell_type": "markdown",
"id": "dd650176-ab17-47a7-8e02-10dc9ca9e852",
"metadata": {},
"source": [
"## Bringing it together"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b1dca30f-1226-4002-8e02-fd97e78ecc83",
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"\n",
"with open('./podcast_ready_data.pkl', 'rb') as file:\n",
" PODCAST_TEXT = pickle.load(file)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "8db78921-36c7-4388-b1d9-78dff4f972c2",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/sanyambhutani/.conda/envs/final-checking-meta/lib/python3.11/site-packages/torch/nn/utils/weight_norm.py:143: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.\n",
" WeightNorm.apply(module, name, dim)\n",
"/home/sanyambhutani/.conda/envs/final-checking-meta/lib/python3.11/site-packages/transformers/models/encodec/modeling_encodec.py:120: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
" self.register_buffer(\"padding_total\", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)\n"
]
}
],
"source": [
"bark_processor = AutoProcessor.from_pretrained(\"suno/bark\")\n",
"bark_model = BarkModel.from_pretrained(\"suno/bark\", torch_dtype=torch.float16).to(\"cuda:3\")\n",
"bark_sampling_rate = 24000"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "6c04a04d-3686-4932-bd45-72d7f518c602",
"metadata": {},
"outputs": [],
"source": [
"parler_model = ParlerTTSForConditionalGeneration.from_pretrained(\"parler-tts/parler-tts-mini-v1\").to(\"cuda:3\")\n",
"parler_tokenizer = AutoTokenizer.from_pretrained(\"parler-tts/parler-tts-mini-v1\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "efbe1434-37f3-4f77-a5fb-b39625f5e676",
"metadata": {},
"outputs": [],
"source": [
"speaker1_description = \"\"\"\n",
"Laura's voice is expressive and dramatic in delivery, speaking at a moderately fast pace with a very close recording that almost has no background noise.\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "cebfd0f9-8703-4fce-b207-014c6e16cc8a",
"metadata": {},
"outputs": [],
"source": [
"generated_segments = []\n",
"sampling_rates = [] # We'll need to keep track of sampling rates for each segment"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "9b333e36-9579-4237-b329-e2911229be42",
"metadata": {},
"outputs": [],
"source": [
"device=\"cuda:3\""
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "50323f9e-09ed-4c8c-9020-1511ab775969",
"metadata": {},
"outputs": [],
"source": [
"def generate_speaker1_audio(text):\n",
" \"\"\"Generate audio using ParlerTTS for Speaker 1\"\"\"\n",
" input_ids = parler_tokenizer(speaker1_description, return_tensors=\"pt\").input_ids.to(device)\n",
" prompt_input_ids = parler_tokenizer(text, return_tensors=\"pt\").input_ids.to(device)\n",
" generation = parler_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)\n",
" audio_arr = generation.cpu().numpy().squeeze()\n",
" return audio_arr, parler_model.config.sampling_rate"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "0e6120ba-5190-4739-97ca-4e8b44dddc5e",
"metadata": {},
"outputs": [],
"source": [
"def generate_speaker2_audio(text):\n",
" \"\"\"Generate audio using Bark for Speaker 2\"\"\"\n",
" inputs = bark_processor(text, voice_preset=\"v2/en_speaker_6\").to(device)\n",
" speech_output = bark_model.generate(**inputs, temperature=0.9, semantic_temperature=0.8)\n",
" audio_arr = speech_output[0].cpu().numpy()\n",
" return audio_arr, bark_sampling_rate\n"
]
},
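  {
   "cell_type": "markdown",
   "id": "sampling-rates-note-md",
   "metadata": {},
   "source": [
    "The two generators return audio at different native sampling rates: Parler-TTS reports its rate via `parler_model.config.sampling_rate`, while Bark is decoded at 24 kHz (`bark_sampling_rate`). The helper below turns each segment into a pydub `AudioSegment`, which lets us concatenate the segments without manual resampling."
   ]
  },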
{
"cell_type": "code",
"execution_count": 38,
"id": "4482d864-2806-4410-b239-da4b2d0d1340",
"metadata": {},
"outputs": [],
"source": [
"def numpy_to_audio_segment(audio_arr, sampling_rate):\n",
" \"\"\"Convert numpy array to AudioSegment\"\"\"\n",
" # Convert to 16-bit PCM\n",
" audio_int16 = (audio_arr * 32767).astype(np.int16)\n",
" \n",
" # Create WAV file in memory\n",
" byte_io = io.BytesIO()\n",
" wavfile.write(byte_io, sampling_rate, audio_int16)\n",
" byte_io.seek(0)\n",
" \n",
" # Convert to AudioSegment\n",
" return AudioSegment.from_wav(byte_io)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "c4dbb3b3-cdd3-4a1f-a60a-661e64a67f53",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'[\\n (\"Speaker 1\", \"Welcome to this week\\'s episode of AI Insights, where we explore the latest developments in the field of artificial intelligence. Today, we\\'re going to dive into the fascinating world of knowledge distillation, a methodology that transfers advanced capabilities from leading proprietary Large Language Models, or LLMs, to their open-source counterparts. Joining me on this journey is my co-host, who\\'s new to the topic, and I\\'ll be guiding them through the ins and outs of knowledge distillation. So, let\\'s get started!\"),\\n (\"Speaker 2\", \"Sounds exciting! I\\'ve heard of knowledge distillation, but I\\'m not entirely sure what it\\'s all about. Can you give me a brief overview?\"),\\n (\"Speaker 1\", \"Of course! Knowledge distillation is a technique that enables the transfer of knowledge from a large, complex model, like GPT-4 or Gemini, to a smaller, more efficient model, like LLaMA or Mistral. This process allows the smaller model to learn from the teacher model\\'s output, enabling it to acquire similar capabilities. Think of it like a master chef teaching their apprentice the art of cooking – the apprentice doesn\\'t need to start from scratch.\"),\\n (\"Speaker 2\", \"Hmm, that sounds interesting. So, it\\'s like a teacher-student relationship, where the teacher model guides the student model to learn from its output... Umm, can you explain this process in more detail?\"),\\n (\"Speaker 1\", \"The distillation process involves several stages, including knowledge elicitation, knowledge storage, knowledge inference, and knowledge application. The teacher model shares its knowledge with the student model, which then learns to emulate the teacher\\'s output behavior.\"),\\n (\"Speaker 2\", \"That makes sense, I think. So, it\\'s like the teacher model is saying, \\'Hey, student model, learn from my output, and try to produce similar results.\\' But what about the different approaches to knowledge distillation? I\\'ve heard of supervised fine-tuning, divergence and similarity, reinforcement learning, and rank optimization.\"),\\n (\"Speaker 1\", \"Ah, yes! Those are all valid approaches to knowledge distillation. Supervised fine-tuning involves training the student model on a smaller dataset, while divergence and similarity focus on aligning the hidden states or features of the student model with those of the teacher model. Reinforcement learning and rank optimization are more advanced methods that involve feedback from the teacher model to train the student model. Imagine you\\'re trying to tune a piano – you need to adjust the keys to produce the perfect sound.\"),\\n (\"Speaker 2\", \"[laughs] Okay, I think I\\'m starting to get it. But can you give me some examples of how these approaches are used in real-world applications? I\\'m thinking of something like a language model that can generate human-like text...\"),\\n (\"Speaker 1\", \"Of course! For instance, the Vicuna model uses supervised fine-tuning to distill knowledge from the teacher model, while the UltraChat model employs a combination of knowledge distillation and reinforcement learning to create a powerful chat model.\"),\\n (\"Speaker 2\", \"Wow, that\\'s fascinating! I\\'m starting to see how knowledge distillation can be applied to various domains, like natural language processing, computer vision, and even multimodal tasks... Umm, can we talk more about multimodal tasks? That sounds really interesting.\"),\\n (\"Speaker 1\", \"Exactly! 
Knowledge distillation has far-reaching implications for AI research and applications. It enables the transfer of knowledge across different models, architectures, and domains, making it a powerful tool for building more efficient and effective AI systems.\"),\\n (\"Speaker 2\", \"[sigh] I\\'m starting to see the bigger picture now. Knowledge distillation is not just a technique; it\\'s a way to democratize access to advanced AI capabilities and foster innovation across a broader spectrum of applications and users... Hmm, that\\'s a pretty big deal.\"),\\n (\"Speaker 1\", \"That\\'s right! And as we continue to explore the frontiers of AI, knowledge distillation will play an increasingly important role in shaping the future of artificial intelligence.\"),\\n (\"Speaker 2\", \"Well, I\\'m excited to learn more about knowledge distillation and its applications. Thanks for guiding me through this journey, and I\\'m looking forward to our next episode!\"),\\n (\"Speaker 1\", \"Thank you for joining me on this episode of AI Insights! If you want to learn more about knowledge distillation and its applications, be sure to check out our resources section, where we\\'ve curated a list of papers, articles, and tutorials to help you get started.\"),\\n (\"Speaker 2\", \"And if you\\'re interested in building your own AI model using knowledge distillation, maybe we can even do a follow-up episode on how to get started... Umm, let\\'s discuss that further next time.\"),\\n]'"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"PODCAST_TEXT"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "9946e46c-3457-4bf9-9042-b89fa8f5b47a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('Speaker 1',\n",
" \"Welcome to this week's episode of AI Insights, where we explore the latest developments in the field of artificial intelligence. Today, we're going to dive into the fascinating world of knowledge distillation, a methodology that transfers advanced capabilities from leading proprietary Large Language Models, or LLMs, to their open-source counterparts. Joining me on this journey is my co-host, who's new to the topic, and I'll be guiding them through the ins and outs of knowledge distillation. So, let's get started!\"),\n",
" ('Speaker 2',\n",
" \"Sounds exciting! I've heard of knowledge distillation, but I'm not entirely sure what it's all about. Can you give me a brief overview?\"),\n",
" ('Speaker 1',\n",
" \"Of course! Knowledge distillation is a technique that enables the transfer of knowledge from a large, complex model, like GPT-4 or Gemini, to a smaller, more efficient model, like LLaMA or Mistral. This process allows the smaller model to learn from the teacher model's output, enabling it to acquire similar capabilities. Think of it like a master chef teaching their apprentice the art of cooking – the apprentice doesn't need to start from scratch.\"),\n",
" ('Speaker 2',\n",
" \"Hmm, that sounds interesting. So, it's like a teacher-student relationship, where the teacher model guides the student model to learn from its output... Umm, can you explain this process in more detail?\"),\n",
" ('Speaker 1',\n",
" \"The distillation process involves several stages, including knowledge elicitation, knowledge storage, knowledge inference, and knowledge application. The teacher model shares its knowledge with the student model, which then learns to emulate the teacher's output behavior.\"),\n",
" ('Speaker 2',\n",
" \"That makes sense, I think. So, it's like the teacher model is saying, 'Hey, student model, learn from my output, and try to produce similar results.' But what about the different approaches to knowledge distillation? I've heard of supervised fine-tuning, divergence and similarity, reinforcement learning, and rank optimization.\"),\n",
" ('Speaker 1',\n",
" \"Ah, yes! Those are all valid approaches to knowledge distillation. Supervised fine-tuning involves training the student model on a smaller dataset, while divergence and similarity focus on aligning the hidden states or features of the student model with those of the teacher model. Reinforcement learning and rank optimization are more advanced methods that involve feedback from the teacher model to train the student model. Imagine you're trying to tune a piano – you need to adjust the keys to produce the perfect sound.\"),\n",
" ('Speaker 2',\n",
" \"[laughs] Okay, I think I'm starting to get it. But can you give me some examples of how these approaches are used in real-world applications? I'm thinking of something like a language model that can generate human-like text...\"),\n",
" ('Speaker 1',\n",
" 'Of course! For instance, the Vicuna model uses supervised fine-tuning to distill knowledge from the teacher model, while the UltraChat model employs a combination of knowledge distillation and reinforcement learning to create a powerful chat model.'),\n",
" ('Speaker 2',\n",
" \"Wow, that's fascinating! I'm starting to see how knowledge distillation can be applied to various domains, like natural language processing, computer vision, and even multimodal tasks... Umm, can we talk more about multimodal tasks? That sounds really interesting.\"),\n",
" ('Speaker 1',\n",
" 'Exactly! Knowledge distillation has far-reaching implications for AI research and applications. It enables the transfer of knowledge across different models, architectures, and domains, making it a powerful tool for building more efficient and effective AI systems.'),\n",
" ('Speaker 2',\n",
" \"[sigh] I'm starting to see the bigger picture now. Knowledge distillation is not just a technique; it's a way to democratize access to advanced AI capabilities and foster innovation across a broader spectrum of applications and users... Hmm, that's a pretty big deal.\"),\n",
" ('Speaker 1',\n",
" \"That's right! And as we continue to explore the frontiers of AI, knowledge distillation will play an increasingly important role in shaping the future of artificial intelligence.\"),\n",
" ('Speaker 2',\n",
" \"Well, I'm excited to learn more about knowledge distillation and its applications. Thanks for guiding me through this journey, and I'm looking forward to our next episode!\"),\n",
" ('Speaker 1',\n",
" \"Thank you for joining me on this episode of AI Insights! If you want to learn more about knowledge distillation and its applications, be sure to check out our resources section, where we've curated a list of papers, articles, and tutorials to help you get started.\"),\n",
" ('Speaker 2',\n",
" \"And if you're interested in building your own AI model using knowledge distillation, maybe we can even do a follow-up episode on how to get started... Umm, let's discuss that further next time.\")]"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import ast\n",
"ast.literal_eval(PODCAST_TEXT)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "c640fead-2017-478f-a7b6-1b96105d45d6",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Generating podcast segments: 6%|███▉ | 1/16 [00:20<05:02, 20.16s/segment]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.\n",
"Generating podcast segments: 19%|███████████▋ | 3/16 [01:02<04:33, 21.06s/segment]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.\n",
"Generating podcast segments: 31%|███████████████████▍ | 5/16 [01:41<03:30, 19.18s/segment]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.\n",
"Generating podcast segments: 44%|███████████████████████████▏ | 7/16 [02:26<03:05, 20.57s/segment]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.\n",
"Generating podcast segments: 56%|██████████████████████████████████▉ | 9/16 [03:04<02:13, 19.10s/segment]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.\n",
"Generating podcast segments: 69%|█████████████████████████████████████████▉ | 11/16 [03:42<01:31, 18.27s/segment]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.\n",
"Generating podcast segments: 81%|█████████████████████████████████████████████████▌ | 13/16 [04:17<00:50, 16.99s/segment]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.\n",
"Generating podcast segments: 94%|█████████████████████████████████████████████████████████▏ | 15/16 [04:49<00:15, 15.83s/segment]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.\n",
"Generating podcast segments: 100%|█████████████████████████████████████████████████████████████| 16/16 [05:13<00:00, 19.57s/segment]\n"
]
}
],
"source": [
"final_audio = None\n",
"\n",
"for speaker, text in tqdm(ast.literal_eval(PODCAST_TEXT), desc=\"Generating podcast segments\", unit=\"segment\"):\n",
" if speaker == \"Speaker 1\":\n",
" audio_arr, rate = generate_speaker1_audio(text)\n",
" else: # Speaker 2\n",
" audio_arr, rate = generate_speaker2_audio(text)\n",
" \n",
" # Convert to AudioSegment (pydub will handle sample rate conversion automatically)\n",
" audio_segment = numpy_to_audio_segment(audio_arr, rate)\n",
" \n",
" # Add to final audio\n",
" if final_audio is None:\n",
" final_audio = audio_segment\n",
" else:\n",
" final_audio += audio_segment"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "2eeffdb7-875a-45ec-bdd8-c8c5b34f5a7b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<_io.BufferedRandom name='_podcast.mp3'>"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"final_audio.export(\"_podcast.mp3\", \n",
" format=\"mp3\", \n",
" bitrate=\"192k\",\n",
" parameters=[\"-q:a\", \"0\"])"
]
},
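  {
   "cell_type": "markdown",
   "id": "final-playback-note-md",
   "metadata": {},
   "source": [
    "As with the test run earlier, the exported file can be played back directly in the notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "final-playback",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Listen to the final exported podcast\n",
    "Audio(\"_podcast.mp3\")"
   ]
  },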
{
"cell_type": "code",
"execution_count": null,
"id": "26cc56c5-b9c9-47c2-b860-0ea9f05c79af",
"metadata": {},
"outputs": [],
"source": [
"#fin"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}