From 99690e02d09c455d04a17b98a2f993f28045574f Mon Sep 17 00:00:00 2001
From: BrasD99 <brasd99@gmail.com>
Date: Sat, 22 Jul 2023 20:15:14 +0300
Subject: [PATCH 1/8] Modified clone_voice.ipynb

---
 clone_voice.ipynb | 143 +++++++++++++++++++++++++++++++---------------
 1 file changed, 96 insertions(+), 47 deletions(-)

diff --git a/clone_voice.ipynb b/clone_voice.ipynb
index eb2495ca..be6dc4cf 100644
--- a/clone_voice.ipynb
+++ b/clone_voice.ipynb
@@ -3,7 +3,26 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "!pip install git+https://github.com/suno-ai/bark.git\n",
+    "!git clone https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer\n",
+    "!pip install -r ./bark-voice-cloning-HuBERT-quantizer/requirements.txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
    "outputs": [],
    "source": [
     "from bark.generation import load_codec_model, generate_text_semantic\n",
@@ -12,18 +31,23 @@
     "import torchaudio\n",
     "import torch\n",
     "\n",
-    "device = 'cuda' # or 'cpu'\n",
+    "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
     "model = load_codec_model(use_gpu=True if device == 'cuda' else False)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
    "outputs": [],
    "source": [
-    "# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer\n",
-    "from hubert.hubert_manager import HuBERTManager\n",
+    "import sys\n",
+    "sys.path.append('./bark-voice-cloning-HuBERT-quantizer')\n",
+    "from bark_hubert_quantizer.hubert_manager import HuBERTManager\n",
     "hubert_manager = HuBERTManager()\n",
     "hubert_manager.make_sure_hubert_installed()\n",
     "hubert_manager.make_sure_tokenizer_installed()"
@@ -32,13 +56,16 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
    "outputs": [],
    "source": [
-    "# From https://github.com/gitmylo/bark-voice-cloning-HuBERT-quantizer \n",
     "# Load HuBERT for semantic tokens\n",
-    "from hubert.pre_kmeans_hubert import CustomHubert\n",
-    "from hubert.customtokenizer import CustomTokenizer\n",
+    "from bark_hubert_quantizer.pre_kmeans_hubert import CustomHubert\n",
+    "from bark_hubert_quantizer.customtokenizer import CustomTokenizer\n",
     "\n",
     "# Load the HuBERT model\n",
     "hubert_model = CustomHubert(checkpoint_path='data/models/hubert/hubert.pt').to(device)\n",
@@ -50,7 +77,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
    "outputs": [],
    "source": [
     "# Load and pre-process the audio waveform\n",
@@ -63,7 +94,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
    "outputs": [],
    "source": [
     "semantic_vectors = hubert_model.forward(wav, input_sample_hz=model.sample_rate)\n",
@@ -73,7 +108,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
    "outputs": [],
    "source": [
     "# Extract discrete codes from EnCodec\n",
@@ -85,7 +124,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
    "outputs": [],
    "source": [
     "# move codes to cpu\n",
@@ -97,44 +140,30 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
    "outputs": [],
    "source": [
     "import numpy as np\n",
-    "voice_name = 'output' # whatever you want the name of the voice to be\n",
-    "output_path = 'bark/assets/prompts/' + voice_name + '.npz'\n",
+    "\n",
+    "voice_filename = 'output'\n",
+    "current_path = os.getcwd()\n",
+    "voice_name = os.path.join(current_path, f'{voice_filename}.npz')\n",
+    "\n",
     "np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# That's it! Now you can head over to the generate.ipynb and use your voice_name for the 'history_prompt'"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Heres the generation stuff copy-pasted for convenience"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
    "outputs": [],
    "source": [
     "from bark.api import generate_audio\n",
@@ -149,7 +178,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
    "outputs": [],
    "source": [
     "# download and load all models\n",
@@ -169,7 +202,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
    "outputs": [],
    "source": [
     "# simple generation\n",
@@ -179,7 +216,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
    "outputs": [],
    "source": [
     "# generation with more control\n",
@@ -209,7 +250,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
    "outputs": [],
    "source": [
     "from IPython.display import Audio\n",
@@ -220,7 +265,11 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
    "outputs": [],
    "source": [
     "from scipy.io.wavfile import write as write_wav\n",

From 293f3d414e64afb43f34320df97aed4a814987e7 Mon Sep 17 00:00:00 2001
From: BrasD99 <brasd99@gmail.com>
Date: Sat, 22 Jul 2023 20:25:14 +0300
Subject: [PATCH 2/8] Deleted path param

---
 clone_voice.ipynb | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/clone_voice.ipynb b/clone_voice.ipynb
index be6dc4cf..50bcb29e 100644
--- a/clone_voice.ipynb
+++ b/clone_voice.ipynb
@@ -194,8 +194,7 @@
     "    fine_use_gpu=True,\n",
     "    fine_use_small=False,\n",
     "    codec_use_gpu=True,\n",
-    "    force_reload=False,\n",
-    "    path=\"models\"\n",
+    "    force_reload=False\n",
     ")"
    ]
   },

From 5d582ffae6fc1dcb339ae006d1c361c3ec1d4546 Mon Sep 17 00:00:00 2001
From: BrasD99 <brasd99@gmail.com>
Date: Sat, 22 Jul 2023 20:35:58 +0300
Subject: [PATCH 3/8] Added os import

---
 clone_voice.ipynb | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clone_voice.ipynb b/clone_voice.ipynb
index 50bcb29e..2a1ab963 100644
--- a/clone_voice.ipynb
+++ b/clone_voice.ipynb
@@ -148,6 +148,7 @@
    "outputs": [],
    "source": [
     "import numpy as np\n",
+    "import os\n",
     "\n",
     "voice_filename = 'output'\n",
     "current_path = os.getcwd()\n",

From 75c77cc82ba57946c4eb95015243f940b8689bbc Mon Sep 17 00:00:00 2001
From: BrasD99 <brasd99@gmail.com>
Date: Sat, 22 Jul 2023 20:37:12 +0300
Subject: [PATCH 4/8] Code refactoring

---
 clone_voice.ipynb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clone_voice.ipynb b/clone_voice.ipynb
index 2a1ab963..19931baf 100644
--- a/clone_voice.ipynb
+++ b/clone_voice.ipynb
@@ -150,9 +150,9 @@
     "import numpy as np\n",
     "import os\n",
     "\n",
-    "voice_filename = 'output'\n",
+    "voice_filename = 'output.npz'\n",
     "current_path = os.getcwd()\n",
-    "voice_name = os.path.join(current_path, f'{voice_filename}.npz')\n",
+    "voice_name = os.path.join(current_path, voice_filename)\n",
     "\n",
     "np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)"
    ]

From 7a48c955cd1619178529110e6fe3f3098368c535 Mon Sep 17 00:00:00 2001
From: BrasD99 <brasd99@gmail.com>
Date: Sat, 22 Jul 2023 20:39:50 +0300
Subject: [PATCH 5/8] Added installation of bark

---
 generate.ipynb         | 9 +++++++++
 generate_chunked.ipynb | 9 +++++++++
 2 files changed, 18 insertions(+)

diff --git a/generate.ipynb b/generate.ipynb
index a7aae13e..7e10d4aa 100644
--- a/generate.ipynb
+++ b/generate.ipynb
@@ -1,5 +1,14 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install git+https://github.com/suno-ai/bark.git"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/generate_chunked.ipynb b/generate_chunked.ipynb
index 1cf3d56d..8dd370cc 100644
--- a/generate_chunked.ipynb
+++ b/generate_chunked.ipynb
@@ -1,5 +1,14 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install git+https://github.com/suno-ai/bark.git"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,

From 8273cdec8db672ef82a39e9ebbbb631fef7429ca Mon Sep 17 00:00:00 2001
From: BrasD99 <brasd99@gmail.com>
Date: Sat, 22 Jul 2023 20:43:38 +0300
Subject: [PATCH 6/8] Cleaned metadata

---
 clone_voice.ipynb      | 96 ++++++++----------------------------------
 generate.ipynb         |  6 +--
 generate_chunked.ipynb |  6 +--
 3 files changed, 21 insertions(+), 87 deletions(-)

diff --git a/clone_voice.ipynb b/clone_voice.ipynb
index 19931baf..87398905 100644
--- a/clone_voice.ipynb
+++ b/clone_voice.ipynb
@@ -3,11 +3,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "!pip install git+https://github.com/suno-ai/bark.git\n",
@@ -18,11 +14,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "from bark.generation import load_codec_model, generate_text_semantic\n",
@@ -38,11 +30,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "import sys\n",
@@ -56,11 +44,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Load HuBERT for semantic tokens\n",
@@ -77,11 +61,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Load and pre-process the audio waveform\n",
@@ -94,11 +74,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "semantic_vectors = hubert_model.forward(wav, input_sample_hz=model.sample_rate)\n",
@@ -108,11 +84,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Extract discrete codes from EnCodec\n",
@@ -124,11 +96,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# move codes to cpu\n",
@@ -140,11 +108,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "import numpy as np\n",
@@ -160,11 +124,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "from bark.api import generate_audio\n",
@@ -179,11 +139,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# download and load all models\n",
@@ -202,11 +158,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# simple generation\n",
@@ -216,11 +168,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# generation with more control\n",
@@ -250,11 +198,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "from IPython.display import Audio\n",
@@ -265,11 +209,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "vscode": {
-     "languageId": "plaintext"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "from scipy.io.wavfile import write as write_wav\n",
@@ -294,10 +234,8 @@
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.8"
-  },
-  "orig_nbformat": 4
+   "pygments_lexer": "ipython3"
+  }
  },
  "nbformat": 4,
  "nbformat_minor": 2
diff --git a/generate.ipynb b/generate.ipynb
index 7e10d4aa..724377b3 100644
--- a/generate.ipynb
+++ b/generate.ipynb
@@ -172,10 +172,8 @@
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.8"
-  },
-  "orig_nbformat": 4
+   "pygments_lexer": "ipython3"
+  }
  },
  "nbformat": 4,
  "nbformat_minor": 2
diff --git a/generate_chunked.ipynb b/generate_chunked.ipynb
index 8dd370cc..9662b0bf 100644
--- a/generate_chunked.ipynb
+++ b/generate_chunked.ipynb
@@ -343,10 +343,8 @@
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.8"
-  },
-  "orig_nbformat": 4
+   "pygments_lexer": "ipython3"
+  }
  },
  "nbformat": 4,
  "nbformat_minor": 2

From 99e4bca41eed81db69f30265bdb9178441bdf20d Mon Sep 17 00:00:00 2001
From: BrasD99 <brasd99@gmail.com>
Date: Sat, 22 Jul 2023 21:11:41 +0300
Subject: [PATCH 7/8] Added map_location

---
 clone_voice.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clone_voice.ipynb b/clone_voice.ipynb
index 87398905..a1d5b29a 100644
--- a/clone_voice.ipynb
+++ b/clone_voice.ipynb
@@ -55,7 +55,7 @@
     "hubert_model = CustomHubert(checkpoint_path='data/models/hubert/hubert.pt').to(device)\n",
     "\n",
     "# Load the CustomTokenizer model\n",
-    "tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth').to(device)  # Automatically uses the right layers"
+    "tokenizer = CustomTokenizer.load_from_checkpoint('data/models/hubert/tokenizer.pth', map_location=device).to(device)  # Automatically uses the right layers"
    ]
   },
   {

From 0fba9367b89ac0815de4e05a2ce5735735b8e25a Mon Sep 17 00:00:00 2001
From: BrasD99 <brasd99@gmail.com>
Date: Mon, 31 Jul 2023 15:06:41 +0300
Subject: [PATCH 8/8] Bug fix: save voice to voice_name instead of undefined output_path

---
 clone_voice.ipynb | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/clone_voice.ipynb b/clone_voice.ipynb
index a1d5b29a..85df680d 100644
--- a/clone_voice.ipynb
+++ b/clone_voice.ipynb
@@ -17,7 +17,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from bark.generation import load_codec_model, generate_text_semantic\n",
+    "from bark.generation import load_codec_model\n",
     "from encodec.utils import convert_audio\n",
     "\n",
     "import torchaudio\n",
@@ -118,7 +118,7 @@
     "current_path = os.getcwd()\n",
     "voice_name = os.path.join(current_path, voice_filename)\n",
     "\n",
-    "np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)"
+    "np.savez(voice_name, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)"
    ]
   },
   {
@@ -128,12 +128,10 @@
    "outputs": [],
    "source": [
     "from bark.api import generate_audio\n",
-    "from transformers import BertTokenizer\n",
     "from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic\n",
     "\n",
     "# Enter your prompt and speaker here\n",
-    "text_prompt = \"Hello, my name is Serpy. And, uh — and I like pizza. [laughs]\"\n",
-    "voice_name = \"output\" # use your custom voice name here if you have one"
+    "text_prompt = \"Hello, my name is Serpy. And, uh — and I like pizza. [laughs]\""
    ]
   },
   {