{ "cells": [ { "cell_type": "markdown", "id": "18662496-bb36-45a6-99ab-b2f0b91eb534", "metadata": {}, "source": [ "## Suno Demo\n", "\n", "Copy-Pasted from: https://colab.research.google.com/drive/1dWWkZzvu7L9Bunq9zvD-W02RFUXoW-Pd?usp=sharing#scrollTo=68QtoUqPWdLk\n" ] }, { "cell_type": "code", "execution_count": null, "id": "73cf9a1c-c6d4-492f-9d3d-8b19466e6014", "metadata": {}, "outputs": [], "source": [ "#!pip3 install optimum\n", "#!pip install -U flash-attn --no-build-isolation" ] }, { "cell_type": "code", "execution_count": 1, "id": "f6c0c08d-b1b7-479c-ae10-bd126d925bcd", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/sanyambhutani/.conda/envs/final-checking-meta/lib/python3.12/site-packages/transformers/models/encodec/modeling_encodec.py:124: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n", " self.register_buffer(\"padding_total\", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)\n" ] } ], "source": [ "from transformers import BarkModel, AutoProcessor\n", "import torch\n", "\n", "device = \"cuda:3\"\n", "\n", "processor = AutoProcessor.from_pretrained(\"suno/bark\")\n", "\n", "#model = model.to_bettertransformer()\n", "#model = BarkModel.from_pretrained(\"suno/bark\", torch_dtype=torch.float16, attn_implementation=\"flash_attention_2\").to(device)\n", "model = BarkModel.from_pretrained(\"suno/bark\", torch_dtype=torch.float16).to(device)#.to_bettertransformer()" ] }, { "cell_type": "code", "execution_count": 2, "id": "1f92997d-17d8-41c5-867d-fd5f7d94853f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n", "The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n" ] } ], "source": [ "# prepare the inputs\n", "text_prompt = \"Let's try generating speech, with Bark, a text-to-speech model\"\n", "inputs = processor(text_prompt)\n", "\n", "# generate speech\n", "speech_output = model.generate(**inputs.to(device))" ] }, { "cell_type": "code", "execution_count": null, "id": "94c95582-49cf-419d-89bf-51e2e4e04379", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. 
Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] } ], "source": [ "from IPython.display import Audio\n", "\n", "# Bark reports its output sample rate (24 kHz) in the generation config\n", "sampling_rate = model.generation_config.sample_rate\n", "\n", "voice_preset = \"v2/en_speaker_6\"\n", "\n", "# prepare the inputs\n", "text_prompt = \"Let's try [laughs] generating speech, with Bark, a text-to-speech model\"\n", "inputs = processor(text_prompt, voice_preset=voice_preset)\n", "\n", "# generate speech\n", "speech_output = model.generate(**inputs.to(device))\n", "\n", "# let's hear it\n", "Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)" ] },
{ "cell_type": "code", "execution_count": 5, "id": "330d02ed-abfe-4fcc-9858-c12f9add3ce5", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" ] } ], "source": [ "# beam search (num_beams=4) tends to make Bark get stuck in a repetition loop, so it is disabled here\n", "#speech_output = model.generate(**inputs, num_beams = 4, temperature = 0.5, semantic_temperature = 0.5)\n", "\n", "speech_output = model.generate(**inputs, temperature = 0.5, semantic_temperature = 0.5)\n", "\n", "Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)" ] },
{ "cell_type": "code", "execution_count": null, "id": "a258e898-b007-4697-af9f-a9e7dbbb7fe4", "metadata": {}, "outputs": [], "source": [ "speech_output = model.generate(**inputs, temperature = 0.7, semantic_temperature = 0.1)\n", "Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)" ] },
{ "cell_type": "code", "execution_count": null, "id": "11561846-d029-4f7e-be4a-4f7a09e84bf4", "metadata": {}, "outputs": [], "source": [ "speech_output = model.generate(**inputs, temperature = 0.7, semantic_temperature = 0.2)\n", "Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)" ] },
{ "cell_type": "code", "execution_count": null, "id": "e80a13a5-6c5c-4850-bdc6-bf7d18484162", "metadata": {}, "outputs": [], "source": [ "speech_output = model.generate(**inputs, temperature = 0.7, semantic_temperature = 0.3)\n", "Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)" ] },
{ "cell_type": "code", "execution_count": null, "id": "a8f3285e-d658-488f-a6e0-37f80c59cd04", "metadata": {}, "outputs": [], "source": [ "speech_output = model.generate(**inputs, temperature = 0.7, semantic_temperature = 0.4)\n", "Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)" ] },
{ "cell_type": "code", "execution_count": null, "id": "b11d23af-62e3-432d-8ff5-0452c91cc42d", "metadata": {}, "outputs": [], "source": [ "speech_output = model.generate(**inputs, temperature = 0.7, semantic_temperature = 0.5)\n", "Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)" ] },
{ "cell_type": "code", "execution_count": null, "id": "045b9608-7ec9-4950-b3da-62d51ca3d792", "metadata": {}, "outputs": [], "source": [ "speech_output = model.generate(**inputs, temperature = 0.7, semantic_temperature = 0.6)\n", "Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)" ] },
{ "cell_type": "code", "execution_count": null, "id": "f4270d19-8752-48e0-86a0-05ef576e51ec", "metadata": {}, "outputs": [], "source": [ "speech_output = model.generate(**inputs, temperature = 0.7, semantic_temperature = 0.7)\n", "Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)" ] },
{ "cell_type": "code", "execution_count": null, "id": "0d887a7e-9680-4cb1-9158-1a612c499412", "metadata": {}, "outputs": [], "source": [ "speech_output = model.generate(**inputs, temperature = 0.7, semantic_temperature = 0.8)\n", "Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)" ] },
{ "cell_type": "code", "execution_count": null, "id": "354ff9ac-955b-4bc5-bcf6-7444e9512eef", "metadata": {}, "outputs": [], "source": [ "speech_output = model.generate(**inputs, temperature = 0.7, semantic_temperature = 0.9)\n", "Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)" ] },
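{ "cell_type": "markdown", "id": "sweep-loop-note", "metadata": {}, "source": [ "The per-value cells above can also be written as a single loop. The sketch below is a minimal example that reuses the model, inputs, and sampling_rate defined earlier; display() is used so every clip gets its own audio player." ] },
{ "cell_type": "code", "execution_count": null, "id": "sweep-loop-code", "metadata": {}, "outputs": [], "source": [ "from IPython.display import Audio, display\n", "\n", "# sketch: sweep semantic_temperature in one cell instead of one cell per value\n", "for semantic_temp in [0.1, 0.3, 0.5, 0.7, 0.9]:\n", "    speech_output = model.generate(**inputs, temperature = 0.7, semantic_temperature = semantic_temp)\n", "    print(f\"semantic_temperature = {semantic_temp}\")\n", "    display(Audio(speech_output[0].cpu().numpy(), rate=sampling_rate))" ] },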
"code", "execution_count": null, "id": "f4270d19-8752-48e0-86a0-05ef576e51ec", "metadata": {}, "outputs": [], "source": [ "speech_output = model.generate(**inputs, temperature = 0.7, semantic_temperature = 0.7)\n", "Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)" ] }, { "cell_type": "code", "execution_count": null, "id": "0d887a7e-9680-4cb1-9158-1a612c499412", "metadata": {}, "outputs": [], "source": [ "speech_output = model.generate(**inputs, temperature = 0.7, semantic_temperature = 0.8)\n", "Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)" ] }, { "cell_type": "code", "execution_count": null, "id": "354ff9ac-955b-4bc5-bcf6-7444e9512eef", "metadata": {}, "outputs": [], "source": [ "speech_output = model.generate(**inputs, temperature = 0.7, semantic_temperature = 0.9)\n", "Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)" ] }, { "cell_type": "markdown", "id": "e2cd65ba-69ec-4ee3-be6e-c88b9f7c3f9e", "metadata": {}, "source": [ "## To save" ] }, { "cell_type": "code", "execution_count": null, "id": "a8839e82-79eb-4f83-8524-44ad279b593f", "metadata": {}, "outputs": [], "source": [ "import scipy\n", "\n", "scipy.io.wavfile.write(\"bark_out.wav\", rate=sampling_rate, data=speech_output[0].cpu().numpy())" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.5" } }, "nbformat": 4, "nbformat_minor": 5 }