{
"cells": [
{
"cell_type": "markdown",
"id": "18662496-bb36-45a6-99ab-b2f0b91eb534",
"metadata": {},
"source": [
"## Suno Demo\n",
"\n",
"Copy-Pasted from: https://colab.research.google.com/drive/1dWWkZzvu7L9Bunq9zvD-W02RFUXoW-Pd?usp=sharing#scrollTo=68QtoUqPWdLk\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f6c0c08d-b1b7-479c-ae10-bd126d925bcd",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6bcc5df3f0fc46e69824f2f194c3b805",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"config.json: 0%| | 0.00/8.81k [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "1175ac2d1a224e089b72e73e2330f522",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"pytorch_model.bin: 0%| | 0.00/4.49G [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/sanyambhutani/.conda/envs/final-checking-meta/lib/python3.12/site-packages/transformers/models/encodec/modeling_encodec.py:124: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).\n",
" self.register_buffer(\"padding_total\", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "59d7ab6348494a4389a57a6fe04bc14b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"generation_config.json: 0%| | 0.00/4.91k [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from transformers import BarkModel, AutoProcessor\n",
"\n",
"processor = AutoProcessor.from_pretrained(\"suno/bark\")\n",
"\n",
"model = BarkModel.from_pretrained(\"suno/bark\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "4c3883af-849a-45b5-bc09-8ee54e0d804b",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"\n",
"device = \"cuda:3\"\n",
"model = model.to(device)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "1f92997d-17d8-41c5-867d-fd5f7d94853f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n",
"The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n"
]
}
],
"source": [
"# prepare the inputs\n",
"text_prompt = \"Let's try generating speech, with Bark, a text-to-speech model\"\n",
"inputs = processor(text_prompt)\n",
"\n",
"# generate speech\n",
"speech_output = model.generate(**inputs.to(device))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "76834cbb-66fb-443e-ad7b-54fd4fb87007",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from IPython.display import Audio\n",
"\n",
"sampling_rate = model.generation_config.sample_rate\n",
"Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "74af0e3c-a34c-4477-b917-5c62c0ddaa94",
"metadata": {},
"outputs": [],
"source": [
"import scipy\n",
"\n",
"scipy.io.wavfile.write(\"bark_out.wav\", rate=sampling_rate, data=speech_output[0].cpu().numpy())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "94c95582-49cf-419d-89bf-51e2e4e04379",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"voice_preset = \"v2/en_speaker_6\"\n",
"\n",
"# prepare the inputs\n",
"text_prompt = \"Let's try generating speech, with Bark, a text-to-speech model\"\n",
"inputs = processor(text_prompt, voice_preset=voice_preset)\n",
"\n",
"# generate speech\n",
"speech_output = model.generate(**inputs.to(device))\n",
"\n",
"# let's hear it\n",
"Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "330d02ed-abfe-4fcc-9858-c12f9add3ce5",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"speech_output = model.generate(**inputs, num_beams = 4, temperature = 0.5, semantic_temperature = 0.8)\n",
"\n",
"Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a8839e82-79eb-4f83-8524-44ad279b593f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}