From 99690e02d09c455d04a17b98a2f993f28045574f Mon Sep 17 00:00:00 2001 From: BrasD99 Date: Sat, 22 Jul 2023 20:15:14 +0300 Subject: [PATCH 1/8] Modified clone-voice.ipynb --- clone_voice.ipynb | 143 +++++++++++++++++++++++++++++++--------------- 1 file changed, 96 insertions(+), 47 deletions(-) diff --git a/clone_voice.ipynb b/clone_voice.ipynb index eb2495ca..be6dc4cf 100644 --- a/clone_voice.ipynb +++ b/clone_voice.ipynb @@ -3,7 +3,26 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "!pip install git+https://github.com/suno-ai/bark.git\n", + "!git clone https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer\n", + "!pip install -r ./bark-voice-cloning-HuBERT-quantizer/requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, "outputs": [], "source": [ "from bark.generation import load_codec_model, generate_text_semantic\n", @@ -12,18 +31,23 @@ "import torchaudio\n", "import torch\n", "\n", - "device = 'cuda' # or 'cpu'\n", + "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n", "model = load_codec_model(use_gpu=True if device == 'cuda' else False)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, "outputs": [], "source": [ - "# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer\n", - "from hubert.hubert_manager import HuBERTManager\n", + "import sys\n", + "sys.path.append('./bark-voice-cloning-HuBERT-quantizer')\n", + "from bark_hubert_quantizer.hubert_manager import HuBERTManager\n", "hubert_manager = HuBERTManager()\n", "hubert_manager.make_sure_hubert_installed()\n", "hubert_manager.make_sure_tokenizer_installed()" @@ -32,13 +56,16 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, "outputs": [], "source": [ - "# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer \n", "# Load HuBERT for semantic tokens\n", - "from hubert.pre_kmeans_hubert import CustomHubert\n", - "from hubert.customtokenizer import CustomTokenizer\n", + "from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert\n", + "from bark_hubert_quantizer.customtokenizer import CustomTokenizer\n", "\n", "# Load the HuBERT model\n", "hubert_model = CustomHubert(checkpoint_path='data/models/hubert/hubert.pt').to(device)\n", @@ -50,7 +77,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, "outputs": [], "source": [ "# Load and pre-process the audio waveform\n", @@ -63,7 +94,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, "outputs": [], "source": [ "semantic_vectors = hubert_model.forward(wav, input_sample_hz=model.sample_rate)\n", @@ -73,7 +108,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, "outputs": [], "source": [ "# Extract discrete codes from EnCodec\n", @@ -85,7 +124,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, "outputs": [], "source": [ "# move codes to cpu\n", @@ -97,44 +140,30 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, "outputs": [], "source": [ "import numpy as np\n", - "voice_name = 'output' # whatever you want the name of the voice to be\n", - "output_path = 'bark/assets/prompts/' + voice_name + '.npz'\n", + "\n", + "voice_filename = 'output'\n", + "current_path = os.getcwd()\n", + "voice_name = os.path.join(current_path, f'{voice_filename}.npz')\n", + "\n", "np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# That's it! Now you can head over to the generate.ipynb and use your voice_name for the 'history_prompt'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Heres the generation stuff copy-pasted for convenience" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, "outputs": [], "source": [ "from bark.api import generate_audio\n", @@ -149,7 +178,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, "outputs": [], "source": [ "# download and load all models\n", @@ -169,7 +202,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, "outputs": [], "source": [ "# simple generation\n", @@ -179,7 +216,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, "outputs": [], "source": [ "# generation with more control\n", @@ -209,7 +250,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, "outputs": [], "source": [ "from IPython.display import Audio\n", @@ -220,7 +265,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, "outputs": [], "source": [ "from scipy.io.wavfile import write as write_wav\n", From 293f3d414e64afb43f34320df97aed4a814987e7 Mon Sep 17 00:00:00 2001 From: BrasD99 Date: Sat, 22 Jul 2023 20:25:14 +0300 Subject: [PATCH 2/8] Deleted param --- clone_voice.ipynb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clone_voice.ipynb b/clone_voice.ipynb index be6dc4cf..50bcb29e 100644 --- a/clone_voice.ipynb +++ b/clone_voice.ipynb @@ -194,8 +194,7 @@ " fine_use_gpu=True,\n", " fine_use_small=False,\n", " codec_use_gpu=True,\n", - " force_reload=False,\n", - " path=\"models\"\n", + " force_reload=False\n", ")" ] }, From 5d582ffae6fc1dcb339ae006d1c361c3ec1d4546 Mon Sep 17 00:00:00 2001 From: BrasD99 Date: Sat, 22 Jul 2023 20:35:58 +0300 Subject: [PATCH 3/8] Added os import --- clone_voice.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/clone_voice.ipynb b/clone_voice.ipynb index 50bcb29e..2a1ab963 100644 --- a/clone_voice.ipynb +++ b/clone_voice.ipynb @@ -148,6 +148,7 @@ "outputs": [], "source": [ "import numpy as np\n", + "import os\n", "\n", "voice_filename = 'output'\n", "current_path = os.getcwd()\n", From 75c77cc82ba57946c4eb95015243f940b8689bbc Mon Sep 17 00:00:00 2001 From: BrasD99 Date: Sat, 22 Jul 2023 20:37:12 +0300 Subject: [PATCH 4/8] Code refactoring --- clone_voice.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clone_voice.ipynb b/clone_voice.ipynb index 2a1ab963..19931baf 100644 --- a/clone_voice.ipynb +++ b/clone_voice.ipynb @@ -150,9 +150,9 @@ "import numpy as np\n", "import os\n", "\n", - "voice_filename = 'output'\n", + "voice_filename = 'output.npz'\n", "current_path = os.getcwd()\n", - "voice_name = os.path.join(current_path, f'{voice_filename}.npz')\n", + "voice_name = os.path.join(current_path, voice_filename)\n", "\n", "np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)" ] From 7a48c955cd1619178529110e6fe3f3098368c535 Mon Sep 17 00:00:00 2001 From: BrasD99 Date: Sat, 22 Jul 2023 20:39:50 +0300 Subject: [PATCH 5/8] Added installation of bark --- generate.ipynb | 9 +++++++++ generate_chunked.ipynb | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/generate.ipynb b/generate.ipynb index a7aae13e..7e10d4aa 100644 --- a/generate.ipynb +++ b/generate.ipynb @@ -1,5 +1,14 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install git+https://github.com/suno-ai/bark.git" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/generate_chunked.ipynb b/generate_chunked.ipynb index 1cf3d56d..8dd370cc 100644 --- a/generate_chunked.ipynb +++ b/generate_chunked.ipynb @@ -1,5 +1,14 @@ { "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install git+https://github.com/suno-ai/bark.git" + ] + }, { "cell_type": "code", "execution_count": null, From 8273cdec8db672ef82a39e9ebbbb631fef7429ca Mon Sep 17 00:00:00 2001 From: BrasD99 Date: Sat, 22 Jul 2023 20:43:38 +0300 Subject: [PATCH 6/8] Cleaned metadata --- clone_voice.ipynb | 96 ++++++++---------------------------------- generate.ipynb | 6 +-- generate_chunked.ipynb | 6 +-- 3 files changed, 21 insertions(+), 87 deletions(-) diff --git a/clone_voice.ipynb b/clone_voice.ipynb index 19931baf..87398905 100644 --- a/clone_voice.ipynb +++ b/clone_voice.ipynb @@ -3,11 +3,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "metadata": {}, "outputs": [], "source": [ "!pip install git+https://github.com/suno-ai/bark.git\n", @@ -18,11 +14,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "metadata": {}, "outputs": [], "source": [ "from bark.generation import load_codec_model, generate_text_semantic\n", @@ -38,11 +30,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "metadata": {}, "outputs": [], "source": [ "import sys\n", @@ -56,11 +44,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "metadata": {}, "outputs": [], "source": [ "# Load HuBERT for semantic tokens\n", @@ -77,11 +61,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "metadata": {}, "outputs": [], "source": [ "# Load and pre-process the audio waveform\n", @@ -94,11 +74,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "metadata": {}, "outputs": [], "source": [ "semantic_vectors = hubert_model.forward(wav, input_sample_hz=model.sample_rate)\n", @@ -108,11 +84,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "metadata": {}, "outputs": [], "source": [ "# Extract discrete codes from EnCodec\n", @@ -124,11 +96,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "metadata": {}, "outputs": [], "source": [ "# move codes to cpu\n", @@ -140,11 +108,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -160,11 +124,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "metadata": {}, "outputs": [], "source": [ "from bark.api import generate_audio\n", @@ -179,11 +139,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "metadata": {}, "outputs": [], "source": [ "# download and load all models\n", @@ -202,11 +158,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "metadata": {}, "outputs": [], "source": [ "# simple generation\n", @@ -216,11 +168,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "metadata": {}, "outputs": [], "source": [ "# generation with more control\n", @@ -250,11 +198,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "metadata": {}, "outputs": [], "source": [ "from IPython.display import Audio\n", @@ -265,11 +209,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, + "metadata": {}, "outputs": [], "source": [ "from scipy.io.wavfile import write as write_wav\n", @@ -294,10 +234,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - }, - "orig_nbformat": 4 + "pygments_lexer": "ipython3" + } }, "nbformat": 4, "nbformat_minor": 2 diff --git a/generate.ipynb b/generate.ipynb index 7e10d4aa..724377b3 100644 --- a/generate.ipynb +++ b/generate.ipynb @@ -172,10 +172,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - }, - "orig_nbformat": 4 + "pygments_lexer": "ipython3" + } }, "nbformat": 4, "nbformat_minor": 2 diff --git a/generate_chunked.ipynb b/generate_chunked.ipynb index 8dd370cc..9662b0bf 100644 --- a/generate_chunked.ipynb +++ b/generate_chunked.ipynb @@ -343,10 +343,8 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.8" - }, - "orig_nbformat": 4 + "pygments_lexer": "ipython3" + } }, "nbformat": 4, "nbformat_minor": 2 From 99e4bca41eed81db69f30265bdb9178441bdf20d Mon Sep 17 00:00:00 2001 From: BrasD99 Date: Sat, 22 Jul 2023 21:11:41 +0300 Subject: [PATCH 7/8] Added map_location --- clone_voice.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clone_voice.ipynb b/clone_voice.ipynb index 87398905..a1d5b29a 100644 --- a/clone_voice.ipynb +++ b/clone_voice.ipynb @@ -55,7 +55,7 @@ "hubert_model = CustomHubert(checkpoint_path='data/models/hubert/hubert.pt').to(device)\n", "\n", "# Load the CustomTokenizer model\n", - "tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth').to(device) # Automatically uses the right layers" + "tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth', map_location=device).to(device) # Automatically uses the right layers" ] }, { From 0fba9367b89ac0815de4e05a2ce5735735b8e25a Mon Sep 17 00:00:00 2001 From: BrasD99 Date: Mon, 31 Jul 2023 15:06:41 +0300 Subject: [PATCH 8/8] Bug fix --- clone_voice.ipynb | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/clone_voice.ipynb b/clone_voice.ipynb index a1d5b29a..85df680d 100644 --- a/clone_voice.ipynb +++ b/clone_voice.ipynb @@ -17,7 +17,7 @@ "metadata": {}, "outputs": [], "source": [ - "from bark.generation import load_codec_model, generate_text_semantic\n", + "from bark.generation import load_codec_model\n", "from encodec.utils import convert_audio\n", "\n", "import torchaudio\n", @@ -118,7 +118,7 @@ "current_path = os.getcwd()\n", "voice_name = os.path.join(current_path, voice_filename)\n", "\n", - "np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)" + "np.savez(voice_name, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)" ] }, { @@ -128,12 +128,10 @@ "outputs": [], "source": [ "from bark.api import generate_audio\n", - "from transformers import BertTokenizer\n", "from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic\n", "\n", "# Enter your prompt and speaker here\n", - "text_prompt = \"Hello, my name is Serpy. And, uh — and I like pizza. [laughs]\"\n", - "voice_name = \"output\" # use your custom voice name here if you have one" + "text_prompt = \"Hello, my name is Serpy. And, uh — and I like pizza. [laughs]\"" ] }, {