{
"cells": [
{
"cell_type": "markdown",
"id": "18662496-bb36-45a6-99ab-b2f0b91eb534",
"metadata": {},
"source": [
"## Suno Demo\n",
"\n",
"Copy-Pasted from: https://colab.research.google.com/drive/1dWWkZzvu7L9Bunq9zvD-W02RFUXoW-Pd?usp=sharing#scrollTo=68QtoUqPWdLk\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "73cf9a1c-c6d4-492f-9d3d-8b19466e6014",
"metadata": {},
"outputs": [],
"source": [
"# Optional installs, intentionally left commented out.\n",
"# NOTE(review): these look like the extras for the commented-out loading paths in the\n",
"# next cell (to_bettertransformer / flash_attention_2) -- TODO confirm before enabling.\n",
"#!pip3 install optimum\n",
"#!pip install -U flash-attn --no-build-isolation"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "f6c0c08d-b1b7-479c-ae10-bd126d925bcd",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/sanyambhutani/.conda/envs/final-checking-meta/lib/python3.12/site-packages/transformers/models/encodec/modeling_encodec.py:124: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
" self.register_buffer(\"padding_total\", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)\n"
]
}
],
"source": [
"import torch\n",
"from IPython.display import Audio  # playback widget used by every listening cell below\n",
"from transformers import AutoProcessor, BarkModel\n",
"\n",
"# Originally hard-coded to \"cuda:3\". Keep that GPU when it exists; otherwise fall back\n",
"# to the default GPU or the CPU so the notebook still runs on other machines.\n",
"if torch.cuda.device_count() > 3:\n",
"    device = \"cuda:3\"\n",
"elif torch.cuda.is_available():\n",
"    device = \"cuda\"\n",
"else:\n",
"    device = \"cpu\"\n",
"\n",
"# Half precision on GPU (unchanged from the original); full precision on CPU.\n",
"torch_dtype = torch.float16 if device.startswith(\"cuda\") else torch.float32\n",
"\n",
"processor = AutoProcessor.from_pretrained(\"suno/bark\")\n",
"\n",
"# Alternative loading paths that were tried and left disabled:\n",
"#model = BarkModel.from_pretrained(\"suno/bark\", torch_dtype=torch.float16, attn_implementation=\"flash_attention_2\").to(device)\n",
"#model = model.to_bettertransformer()\n",
"model = BarkModel.from_pretrained(\"suno/bark\", torch_dtype=torch_dtype).to(device)\n",
"\n",
"# Sample rate of the generated waveform; the Audio(...) cells and the WAV export read this name.\n",
"sampling_rate = model.generation_config.sample_rate"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "94c95582-49cf-419d-89bf-51e2e4e04379",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"voice_preset = \"v2/en_speaker_6\"\n",
"\n",
"# Build the model inputs from the prompt and the speaker preset, then move them to the\n",
"# model's device. The moved batch is bound to `inputs` because every later cell calls\n",
"# model.generate(**inputs, ...) directly, without another .to(device).\n",
"text_prompt = \"\"\"\n",
"[Laughs] Exactly! And the distillation part is where you take a large model and compress it down into a smaller, more efficient model that can run on devices with limited resources.\n",
"\"\"\"\n",
"inputs = processor(text_prompt, voice_preset=voice_preset).to(device)\n",
"\n",
"# Generate speech with the default sampling settings (baseline for the sweeps below).\n",
"speech_output = model.generate(**inputs)\n",
"\n",
"# Listen to the first item of the generated batch.\n",
"Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "a258e898-b007-4697-af9f-a9e7dbbb7fe4",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Semantic-temperature sweep: semantic_temperature = 0.1, temperature fixed at 0.7.\n",
"speech_output = model.generate(\n",
"    **inputs,\n",
"    temperature=0.7,\n",
"    semantic_temperature=0.1,\n",
")\n",
"Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "11561846-d029-4f7e-be4a-4f7a09e84bf4",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Semantic-temperature sweep: semantic_temperature = 0.2, temperature fixed at 0.7.\n",
"speech_output = model.generate(\n",
"    **inputs,\n",
"    temperature=0.7,\n",
"    semantic_temperature=0.2,\n",
")\n",
"Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "e80a13a5-6c5c-4850-bdc6-bf7d18484162",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Semantic-temperature sweep: semantic_temperature = 0.3, temperature fixed at 0.7.\n",
"# Listening note: my favourite of the sweep up to this point.\n",
"speech_output = model.generate(\n",
"    **inputs,\n",
"    temperature=0.7,\n",
"    semantic_temperature=0.3,\n",
")\n",
"Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "a8f3285e-d658-488f-a6e0-37f80c59cd04",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Semantic-temperature sweep: semantic_temperature = 0.4, temperature fixed at 0.7.\n",
"# Listening note: sounds better than semantic_temperature = 0.3.\n",
"speech_output = model.generate(\n",
"    **inputs,\n",
"    temperature=0.7,\n",
"    semantic_temperature=0.4,\n",
")\n",
"Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "b11d23af-62e3-432d-8ff5-0452c91cc42d",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Semantic-temperature sweep: semantic_temperature = 0.5, temperature fixed at 0.7.\n",
"# Listening note: the output falls apart at this setting.\n",
"speech_output = model.generate(\n",
"    **inputs,\n",
"    temperature=0.7,\n",
"    semantic_temperature=0.5,\n",
")\n",
"Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "045b9608-7ec9-4950-b3da-62d51ca3d792",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Semantic-temperature sweep: semantic_temperature = 0.6, temperature fixed at 0.7.\n",
"# Listening note: so-so.\n",
"speech_output = model.generate(\n",
"    **inputs,\n",
"    temperature=0.7,\n",
"    semantic_temperature=0.6,\n",
")\n",
"Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "f4270d19-8752-48e0-86a0-05ef576e51ec",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Semantic-temperature sweep: semantic_temperature = 0.7, temperature fixed at 0.7.\n",
"# Listening note: probably the best result so far.\n",
"speech_output = model.generate(\n",
"    **inputs,\n",
"    temperature=0.7,\n",
"    semantic_temperature=0.7,\n",
")\n",
"Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "0d887a7e-9680-4cb1-9158-1a612c499412",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Semantic-temperature sweep: semantic_temperature = 0.8, temperature fixed at 0.7.\n",
"# Listening note: probably better.\n",
"speech_output = model.generate(\n",
"    **inputs,\n",
"    temperature=0.7,\n",
"    semantic_temperature=0.8,\n",
")\n",
"Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "354ff9ac-955b-4bc5-bcf6-7444e9512eef",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Semantic-temperature sweep: semantic_temperature = 0.9, temperature fixed at 0.7.\n",
"# Listening note: mixed feelings about this one.\n",
"speech_output = model.generate(\n",
"    **inputs,\n",
"    temperature=0.7,\n",
"    semantic_temperature=0.9,\n",
")\n",
"Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "markdown",
"id": "b0fb79c0-72a8-43cd-a5ef-0d3df6ce81f6",
"metadata": {},
"source": [
"### Now varying `temperature` (with `semantic_temperature` held at 0.9)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "e585aa5d-67a0-4a6c-a8d6-aec038f9f2c2",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Temperature sweep: temperature = 0.1, semantic_temperature fixed at 0.9.\n",
"# Listening note: very robotic.\n",
"speech_output = model.generate(\n",
"    **inputs,\n",
"    temperature=0.1,\n",
"    semantic_temperature=0.9,\n",
")\n",
"Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "b7d627f4-173e-408f-a20a-49f7a6106966",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Temperature sweep: temperature = 0.2, semantic_temperature fixed at 0.9.\n",
"# Listening note: less robotic, but still not very convincing; it feels robotic later on.\n",
"speech_output = model.generate(\n",
"    **inputs,\n",
"    temperature=0.2,\n",
"    semantic_temperature=0.9,\n",
")\n",
"Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "be0cde0a-f459-4937-a746-d766f27cb443",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Temperature sweep: temperature = 0.3, semantic_temperature fixed at 0.9.\n",
"# Listening note: feels a little less robotic.\n",
"speech_output = model.generate(\n",
"    **inputs,\n",
"    temperature=0.3,\n",
"    semantic_temperature=0.9,\n",
")\n",
"Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "4bd995a6-9622-4241-8a12-24b8393e5bec",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Temperature sweep: temperature = 0.4, semantic_temperature fixed at 0.9.\n",
"# Listening note: much better than the previous run, but a robotic tinge remains.\n",
"speech_output = model.generate(\n",
"    **inputs,\n",
"    temperature=0.4,\n",
"    semantic_temperature=0.9,\n",
")\n",
"Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "b0d622d5-da03-45d7-8a0e-32862c45936f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Temperature sweep: temperature = 0.5, semantic_temperature fixed at 0.9.\n",
"# Listening note: the laugh was weird, but the robotic feel goes away and the tone varies throughout.\n",
"speech_output = model.generate(\n",
"    **inputs,\n",
"    temperature=0.5,\n",
"    semantic_temperature=0.9,\n",
")\n",
"Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "431918c1-bcb4-44c5-a604-01b5d5b95f57",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Temperature sweep: temperature = 0.6, semantic_temperature fixed at 0.9.\n",
"# Listening note: probably the most consistent, though it still has a robotic tinge.\n",
"speech_output = model.generate(\n",
"    **inputs,\n",
"    temperature=0.6,\n",
"    semantic_temperature=0.9,\n",
")\n",
"Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "147a43c0-7e85-478e-bc55-09fead6468dd",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
}
],
"source": [
"# Temperature sweep: temperature = 0.7, semantic_temperature fixed at 0.9.\n",
"# Listening note: it hallucinated on the first run of this cell.\n",
"speech_output = model.generate(\n",
"    **inputs,\n",
"    temperature=0.7,\n",
"    semantic_temperature=0.9,\n",
")\n",
"Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "37c35555-7fba-4c52-8a1a-ce2f48947fd7",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Temperature sweep: temperature = 0.8, semantic_temperature fixed at 0.9.\n",
"# Listening note: doesn't laugh, but feels a bit more natural.\n",
"speech_output = model.generate(\n",
"    **inputs,\n",
"    temperature=0.8,\n",
"    semantic_temperature=0.9,\n",
")\n",
"Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e9234e00-473f-4736-acd1-cb9d4d1a9589",
"metadata": {},
"outputs": [],
"source": [
"# Temperature sweep: temperature = 0.9, semantic_temperature fixed at 0.9.\n",
"speech_output = model.generate(\n",
"    **inputs,\n",
"    temperature=0.9,\n",
"    semantic_temperature=0.9,\n",
")\n",
"Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "markdown",
"id": "e2cd65ba-69ec-4ee3-be6e-c88b9f7c3f9e",
"metadata": {},
"source": [
"## To save"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a8839e82-79eb-4f83-8524-44ad279b593f",
"metadata": {},
"outputs": [],
"source": [
"# Import the submodule explicitly rather than relying on `import scipy` to expose scipy.io.wavfile.\n",
"import scipy.io.wavfile\n",
"\n",
"# Save the most recently generated clip (whatever `speech_output` currently holds).\n",
"# The model is loaded in float16, and float16 is not one of the sample formats documented\n",
"# for scipy.io.wavfile.write, so cast to float32 before writing.\n",
"waveform = speech_output[0].cpu().numpy().astype(\"float32\")\n",
"scipy.io.wavfile.write(\"bark_out.wav\", rate=sampling_rate, data=waveform)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}