Attempt to split segment correction processing into batches of 4

nomadkaraoke · Nov 17, 2023 · 34b7bb1 · 34b7bb1
1 parent c1dea51
commit 34b7bb1
Show file tree

Hide file tree

Showing 4 changed files with 151 additions and 90 deletions.
diff --git a/llm_br_parsing_instructions.txt b/llm_br_parsing_instructions.txt
diff --git a/lyrics_transcriber/example-llm-response.json b/lyrics_transcriber/example-llm-response.json
@@ -0,0 +1,104 @@
+{
+    "CorrectedLyricsRawResponse": {
+        "segments": [
+            {
+            "id": 0,
+            "text": "Don't know how to take it, don't know where to go",
+            "words": [
+                {"text": "Don't", "start": 17.46, "end": 18.2, "confidence": 0.278},
+                {"text": "know", "start": 18.2, "end": 18.42, "confidence": 0.965},
+                {"text": "how", "start": 18.42, "end": 18.66, "confidence": 0.865},
+                {"text": "to", "start": 18.66, "end": 18.88, "confidence": 0.994},
+                {"text": "take", "start": 18.88, "end": 19.2, "confidence": 0.992},
+                {"text": "it,", "start": 19.2, "end": 19.44, "confidence": 0.974},
+                {"text": "don't", "start": 19.56, "end": 19.8, "confidence": 0.917},
+                {"text": "know", "start": 19.8, "end": 20.02, "confidence": 0.989},
+                {"text": "where", "start": 20.02, "end": 20.46, "confidence": 0.963},
+                {"text": "to", "start": 20.46, "end": 20.76, "confidence": 0.983},
+                {"text": "go", "start": 20.76, "end": 21.3, "confidence": 0.982}
+            ]
+            },
+            {
+            "id": 1,
+            "text": "My resistance running low",
+            "words": [
+                {"text": "My", "start": 22.04, "end": 22.32, "confidence": 0.535},
+                {"text": "resistance", "start": 22.32, "end": 22.94, "confidence": 0.936},
+                {"text": "running", "start": 22.94, "end": 23.66, "confidence": 0.89},
+                {"text": "low", "start": 23.66, "end": 24.36, "confidence": 0.999}
+            ]
+            },
+            {
+            "id": 2,
+            "text": "And every day the hold is getting tighter, and it troubles me so",
+            "words": [
+                {"text": "And", "start": 24.36, "end": 25.14, "confidence": 0.485},
+                {"text": "every", "start": 25.14, "end": 25.56, "confidence": 0.568},
+                {"text": "day", "start": 25.56, "end": 25.88, "confidence": 0.997},
+                {"text": "the", "start": 25.88, "end": 26.1, "confidence": 0.959},
+                {"text": "hold", "start": 26.1, "end": 26.48, "confidence": 0.361},
+                {"text": "is", "start": 26.48, "end": 26.68, "confidence": 0.947},
+                {"text": "getting", "start": 26.68, "end": 27.08, "confidence": 0.996},
+                {"text": "tighter,", "start": 27.08, "end": 27.84, "confidence": 0.975},
+                {"text": "and", "start": 28.42, "end": 28.8, "confidence": 0.347},
+                {"text": "it", "start": 28.8, "end": 28.98, "confidence": 0.821},
+                {"text": "troubles", "start": 28.98, "end": 29.72, "confidence": 0.519},
+                {"text": "me", "start": 29.72, "end": 30.02, "confidence": 0.987},
+                {"text": "so", "start": 30.02, "end": 30.48, "confidence": 0.843}
+            ]
+            },
+            {
+            "id": 3,
+            "text": "You know that I'm nobody's fool",
+            "words": [
+                {"text": "You", "start": 30.56, "end": 30.8, "confidence": 0.676},
+                {"text": "know", "start": 30.8, "end": 31.1, "confidence": 0.987},
+                {"text": "that", "start": 31.1, "end": 31.46, "confidence": 0.984},
+                {"text": "I'm", "start": 31.46, "end": 32.4, "confidence": 0.954},
+                {"text": "nobody's", "start": 32.4, "end": 32.58, "confidence": 0.569},
+                {"text": "fool", "start": 32.58, "end": 33.66, "confidence": 0.854}
+            ]
+            },
+            {
+            "id": 4,
+            "text": "I'm nobody's fool and yet it's clear to me",
+            "words": [
+                {"text": "I'm", "start": 32.58, "end": 32.7, "confidence": 0.854},
+                {"text": "nobody's", "start": 32.7, "end": 33.4, "confidence": 0.992},
+                {"text": "fool", "start": 33.4, "end": 33.66, "confidence": 0.997},
+                {"text": "and", "start": 33.66, "end": 33.88, "confidence": 0.445},
+                {"text": "yet", "start": 33.88, "end": 34.18, "confidence": 0.952},
+                {"text": "it's", "start": 34.18, "end": 34.48, "confidence": 0.956},
+                {"text": "clear", "start": 34.48, "end": 34.86, "confidence": 0.298},
+                {"text": "to", "start": 34.86, "end": 35.16, "confidence": 0.843},
+                {"text": "me", "start": 35.16, "end": 35.9, "confidence": 0.992}
+            ]
+            },
+            {
+            "id": 5,
+            "text": "I don't have a strategy",
+            "words": [
+                {"text": "I", "start": 36.46, "end": 36.7, "confidence": 0.994},
+                {"text": "don't", "start": 36.7, "end": 37.06, "confidence": 0.999},
+                {"text": "have", "start": 37.06, "end": 37.3, "confidence": 0.999},
+                {"text": "a", "start": 37.3, "end": 37.48, "confidence": 0.986},
+                {"text": "strategy", "start": 37.48, "end": 38.52, "confidence": 0.999}
+            ]
+            },
+            {
+            "id": 6,
+            "text": "It's just like taking candy from a baby",
+            "words": [
+                {"text": "It's", "start": 39.3, "end": 39.58, "confidence": 0.986},
+                {"text": "just", "start": 39.58, "end": 39.8, "confidence": 0.992},
+                {"text": "like", "start": 39.8, "end": 40.06, "confidence": 0.999},
+                {"text": "taking", "start": 40.06, "end": 40.46, "confidence": 0.986},
+                {"text": "candy", "start": 40.46, "end": 41.0, "confidence": 0.997},
+                {"text": "from", "start": 41.0, "end": 41.38, "confidence": 0.996},
+                {"text": "a", "start": 41.38, "end": 41.6, "confidence": 0.839},
+                {"text": "baby", "start": 41.6, "end": 42.2, "confidence": 0.998}
+            ]
+            }
+        ]
+    }
+}
diff --git a/lyrics_transcriber/llm_correction_instructions.txt b/lyrics_transcriber/llm_correction_instructions.txt
@@ -12,14 +12,13 @@ These should be reasonably accurate, with generally correct words and phrases.
 However, they may not be perfect, and sometimes whole sections (such as a chorus or outro) may be missing or assumed to be repeated.
 
 Carefully analyse each segment in data input 1, and compare with the lyrics in data input 2.
-If all of the words match up correctly with the published lyrics, great! You can add that whole segment to your response.
+If all of the words match up correctly with part of the published lyrics, great! You can add that whole segment to your response.
 If some of the words match up but there are a couple of differences, correct those differences.
 If you need to delete a word or two in order to correct the lyrics, that's acceptable.
 If you need to add a word or two which were missing from the transcription, that's acceptable - you'll need to estimate the start and end timestamps based on the timestamps of the surrounding words.
 
-The response needs to contain all of the following fields of CorrectedLyricsRawResponse:
+The response JSON object needs to contain all of the following fields:
 
-CorrectedLyricsRawResponse:
 - segments: this is a list
   - id: The id of the segment, from data input 1
   - text: The full text of the corrected lyrics for this segment
@@ -28,3 +27,4 @@ CorrectedLyricsRawResponse:
     - start: The start timestamp for this word, estimated if not known for sure.
     - end: The end timestamp for this word, estimated if not known for sure.
     - confidence: Your self-assessed confidence score (from 0 to 1) of how likely it is that this word is accurate. If the word has not changed from data input 1, keep the existing confidence value.
+
diff --git a/lyrics_transcriber/transcriber.py b/lyrics_transcriber/transcriber.py
@@ -155,7 +155,7 @@ def copy_files_to_output_dir(self):
             self.result_metadata["karaoke_video_filepath"] = shutil.copy(self.result_metadata["karaoke_video_filepath"], self.output_dir)
 
     def correct_whisper_lyrics(self):
-        self.logger.debug(f"correct_whisper_lyrics initiating OpenAI client")
+        self.logger.debug("correct_whisper_lyrics initiating OpenAI client")
         client = OpenAI()
         llm_instructions = ""
 
@@ -164,46 +164,54 @@ def correct_whisper_lyrics(self):
 
         whisper_result_simplified = {"segments": []}
 
-        for segment in self.whisper_result_dict["segments"]:
-            whisper_result_simplified["segments"].append(
-                {
-                    "id": segment["id"],
-                    "start": segment["start"],
-                    "end": segment["end"],
-                    "text": segment["text"],
-                    "confidence": segment["confidence"],
-                    "words": segment["words"],
-                }
+        # Splitting the segments into batches of 4
+        segment_batches = [self.whisper_result_dict["segments"][i : i + 4] for i in range(0, len(self.whisper_result_dict["segments"]), 4)]
+        all_corrected_segments = []
+
+        for batch in segment_batches:
+            whisper_result_simplified["segments"].clear()
+
+            for segment in batch:
+                whisper_result_simplified["segments"].append(
+                    {
+                        "id": segment["id"],
+                        "start": segment["start"],
+                        "end": segment["end"],
+                        "text": segment["text"],
+                        "confidence": segment["confidence"],
+                        "words": segment["words"],
+                    }
+                )
+            whisper_result_simplified_str = json.dumps(whisper_result_simplified, indent=4)
+            data_string = f"Data input 1:\n{whisper_result_simplified_str} \n\nData input 2:\n{self.spotify_lyrics_text}\n"
+
+            self.logger.debug("About to call chat.completions API with system instructions and data inputs")
+            print(llm_instructions)
+            print(data_string)
+
+            # API call for each batch
+            response = client.chat.completions.create(
+                model="gpt-4-1106-preview",
+                response_format={"type": "json_object"},
+                messages=[{"role": "system", "content": llm_instructions}, {"role": "user", "content": data_string}],
             )
-        whisper_result_simplified_str = json.dumps(whisper_result_simplified, indent=4)
-
-        data_string = f"Data input 1:\n{whisper_result_simplified_str} \n\nData input 2:\n{self.spotify_lyrics_text}\n"
-
-        self.logger.debug("about to call chat.completions API with system instructions and data inputs")
-        print(llm_instructions)
-        print(data_string)
 
-        response = client.chat.completions.create(
-            model="gpt-4-1106-preview",
-            response_format={"type": "json_object"},
-            messages=[{"role": "system", "content": llm_instructions}, {"role": "user", "content": data_string}],
-        )
+            print(response)
 
-        print(response)
+            message = response.choices[0].message.content
+            finish_reason = response.choices[0].finish_reason
 
-        message = response["choices"][0]["message"]["content"]
-        finish_reason = response["choices"][0]["finish_reason"]
+            if finish_reason == "stop":
+                try:
+                    corrected_whisper_lyrics_dict = json.loads(message)
+                    print(corrected_whisper_lyrics_dict)
+                    all_corrected_segments.extend(corrected_whisper_lyrics_dict["segments"])
+                except json.JSONDecodeError as e:
+                    raise Exception("Failed to parse response from GPT as JSON") from e
+            else:
+                self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")
 
-        if finish_reason == "stop":
-            try:
-                corrected_whisper_lyrics_dict = json.loads(message)
-                print(corrected_whisper_lyrics_dict)
-                return corrected_whisper_lyrics_dict
-            except json.JSONDecodeError as e:
-                raise Exception("Failed to parse response from GPT as JSON") from e
-        else:
-            self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")
-            return None
+        return {"segments": all_corrected_segments}
 
     def write_spotify_lyrics_data_file(self):
         if self.spotify_cookie and self.song_known: