diff --git a/clone_voice.ipynb b/clone_voice.ipynb
index eb2495ca..85df680d 100644
--- a/clone_voice.ipynb
+++ b/clone_voice.ipynb
@@ -6,13 +6,24 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from bark.generation import load_codec_model, generate_text_semantic\n",
+    "!pip install git+https://github.com/suno-ai/bark.git\n",
+    "!git clone https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer\n",
+    "!pip install -r ./bark-voice-cloning-HuBERT-quantizer/requirements.txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from bark.generation import load_codec_model\n",
     "from encodec.utils import convert_audio\n",
     "\n",
     "import torchaudio\n",
     "import torch\n",
     "\n",
-    "device = 'cuda' # or 'cpu'\n",
+    "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
     "model = load_codec_model(use_gpu=True if device == 'cuda' else False)"
    ]
   },
@@ -22,8 +33,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer\n",
-    "from hubert.hubert_manager import HuBERTManager\n",
+    "import sys\n",
+    "sys.path.append('./bark-voice-cloning-HuBERT-quantizer')\n",
+    "from bark_hubert_quantizer.hubert_manager import HuBERTManager\n",
     "hubert_manager = HuBERTManager()\n",
     "hubert_manager.make_sure_hubert_installed()\n",
     "hubert_manager.make_sure_tokenizer_installed()"
@@ -35,16 +47,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer \n",
     "# Load HuBERT for semantic tokens\n",
-    "from hubert.pre_kmeans_hubert import CustomHubert\n",
-    "from hubert.customtokenizer import CustomTokenizer\n",
+    "from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert\n",
+    "from bark_hubert_quantizer.customtokenizer import CustomTokenizer\n",
     "\n",
     "# Load the HuBERT model\n",
     "hubert_model = CustomHubert(checkpoint_path='data/models/hubert/hubert.pt').to(device)\n",
     "\n",
     "# Load the CustomTokenizer model\n",
-    "tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth').to(device) # Automatically uses the right layers"
+    "tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth', map_location=device).to(device) # Automatically uses the right layers"
    ]
   },
@@ -101,34 +112,13 @@
    "outputs": [],
    "source": [
     "import numpy as np\n",
-    "voice_name = 'output' # whatever you want the name of the voice to be\n",
-    "output_path = 'bark/assets/prompts/' + voice_name + '.npz'\n",
-    "np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# That's it! Now you can head over to the generate.ipynb and use your voice_name for the 'history_prompt'"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Heres the generation stuff copy-pasted for convenience"
+    "import os\n",
+    "\n",
+    "voice_filename = 'output.npz'\n",
+    "current_path = os.getcwd()\n",
+    "voice_name = os.path.join(current_path, voice_filename)\n",
+    "\n",
+    "np.savez(voice_name, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)"
    ]
   },
@@ -138,12 +128,10 @@
    "outputs": [],
    "source": [
     "from bark.api import generate_audio\n",
-    "from transformers import BertTokenizer\n",
     "from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic\n",
     "\n",
     "# Enter your prompt and speaker here\n",
-    "text_prompt = \"Hello, my name is Serpy. And, uh — and I like pizza. [laughs]\"\n",
-    "voice_name = \"output\" # use your custom voice name here if you have one"
+    "text_prompt = \"Hello, my name is Serpy. And, uh — and I like pizza. [laughs]\""
    ]
   },
@@ -161,8 +149,7 @@
     "    fine_use_gpu=True,\n",
     "    fine_use_small=False,\n",
     "    codec_use_gpu=True,\n",
-    "    force_reload=False,\n",
-    "    path=\"models\"\n",
+    "    force_reload=False\n",
     ")"
    ]
   },
@@ -245,10 +232,8 @@
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.8"
-  },
-  "orig_nbformat": 4
+   "pygments_lexer": "ipython3"
+  }
  },
  "nbformat": 4,
  "nbformat_minor": 2
diff --git a/generate.ipynb b/generate.ipynb
index a7aae13e..724377b3 100644
--- a/generate.ipynb
+++ b/generate.ipynb
@@ -1,5 +1,14 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install git+https://github.com/suno-ai/bark.git"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -163,10 +172,8 @@
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.8"
-  },
-  "orig_nbformat": 4
+   "pygments_lexer": "ipython3"
+  }
  },
  "nbformat": 4,
  "nbformat_minor": 2
diff --git a/generate_chunked.ipynb b/generate_chunked.ipynb
index 1cf3d56d..9662b0bf 100644
--- a/generate_chunked.ipynb
+++ b/generate_chunked.ipynb
@@ -1,5 +1,14 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install git+https://github.com/suno-ai/bark.git"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -334,10 +343,8 @@
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.8"
-  },
-  "orig_nbformat": 4
+   "pygments_lexer": "ipython3"
+  }
  },
  "nbformat": 4,
  "nbformat_minor": 2
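Note for reviewers: the patch jumps from the tokenizer hunk (old lines 35-50 of clone_voice.ipynb) straight to the np.savez hunk (old line 101), so the unchanged cells that actually produce the `codes` and `semantic_tokens` arrays saved there never appear above. For context, here is a minimal sketch of that unchanged flow, assuming the notebook's existing names (`model`, `hubert_model`, `tokenizer`, `device`) and a placeholder reference clip 'speaker.wav':

import torch
import torchaudio
from encodec.utils import convert_audio

# Load the reference clip and resample it to EnCodec's expected rate/channels.
wav, sr = torchaudio.load('speaker.wav')  # placeholder path, not part of the patch
wav = convert_audio(wav, sr, model.sample_rate, model.channels).to(device)

# Semantic tokens: HuBERT features quantized by the custom tokenizer.
semantic_vectors = hubert_model.forward(wav, input_sample_hz=model.sample_rate)
semantic_tokens = tokenizer.get_token(semantic_vectors)

# Acoustic codes: EnCodec codebook indices for the same clip.
with torch.no_grad():
    encoded_frames = model.encode(wav.unsqueeze(0))
codes = torch.cat([frame[0] for frame in encoded_frames], dim=-1).squeeze()

# np.savez in the patched cell expects CPU numpy arrays.
codes = codes.cpu().numpy()
semantic_tokens = semantic_tokens.cpu().numpy()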
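The np.savez change, saving the prompt to an absolute path in the working directory rather than into `bark/assets/prompts/`, follows from switching to a pip-installed Bark: writing into the installed package directory is no longer sensible, and recent upstream Bark resolves a `history_prompt` string ending in `.npz` as a filesystem path. A sketch of the generation step under that assumption; the output filename and temperatures are illustrative, not from the patch:

from bark.api import generate_audio
from bark.generation import SAMPLE_RATE, preload_models
from scipy.io.wavfile import write as write_wav

preload_models()  # defaults shown; the notebook's cell passes explicit GPU flags

# voice_name is the absolute .npz path saved by the patched cell above.
audio_array = generate_audio(text_prompt, history_prompt=voice_name,
                             text_temp=0.7, waveform_temp=0.7)
write_wav('cloned_output.wav', SAMPLE_RATE, audio_array)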