Skip to content

Commit

Permalink
Attempt to split segment correction processing into batches of 4
Browse files Browse the repository at this point in the history
  • Loading branch information
beveradb committed Nov 17, 2023
1 parent c1dea51 commit 34b7bb1
Show file tree
Hide file tree
Showing 4 changed files with 151 additions and 90 deletions.
51 changes: 0 additions & 51 deletions llm_br_parsing_instructions.txt

This file was deleted.

104 changes: 104 additions & 0 deletions lyrics_transcriber/example-llm-response.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
{
"CorrectedLyricsRawResponse": {
"segments": [
{
"id": 0,
"text": "Don't know how to take it, don't know where to go",
"words": [
{"text": "Don't", "start": 17.46, "end": 18.2, "confidence": 0.278},
{"text": "know", "start": 18.2, "end": 18.42, "confidence": 0.965},
{"text": "how", "start": 18.42, "end": 18.66, "confidence": 0.865},
{"text": "to", "start": 18.66, "end": 18.88, "confidence": 0.994},
{"text": "take", "start": 18.88, "end": 19.2, "confidence": 0.992},
{"text": "it,", "start": 19.2, "end": 19.44, "confidence": 0.974},
{"text": "don't", "start": 19.56, "end": 19.8, "confidence": 0.917},
{"text": "know", "start": 19.8, "end": 20.02, "confidence": 0.989},
{"text": "where", "start": 20.02, "end": 20.46, "confidence": 0.963},
{"text": "to", "start": 20.46, "end": 20.76, "confidence": 0.983},
{"text": "go", "start": 20.76, "end": 21.3, "confidence": 0.982}
]
},
{
"id": 1,
"text": "My resistance running low",
"words": [
{"text": "My", "start": 22.04, "end": 22.32, "confidence": 0.535},
{"text": "resistance", "start": 22.32, "end": 22.94, "confidence": 0.936},
{"text": "running", "start": 22.94, "end": 23.66, "confidence": 0.89},
{"text": "low", "start": 23.66, "end": 24.36, "confidence": 0.999}
]
},
{
"id": 2,
"text": "And every day the hold is getting tighter, and it troubles me so",
"words": [
{"text": "And", "start": 24.36, "end": 25.14, "confidence": 0.485},
{"text": "every", "start": 25.14, "end": 25.56, "confidence": 0.568},
{"text": "day", "start": 25.56, "end": 25.88, "confidence": 0.997},
{"text": "the", "start": 25.88, "end": 26.1, "confidence": 0.959},
{"text": "hold", "start": 26.1, "end": 26.48, "confidence": 0.361},
{"text": "is", "start": 26.48, "end": 26.68, "confidence": 0.947},
{"text": "getting", "start": 26.68, "end": 27.08, "confidence": 0.996},
{"text": "tighter,", "start": 27.08, "end": 27.84, "confidence": 0.975},
{"text": "and", "start": 28.42, "end": 28.8, "confidence": 0.347},
{"text": "it", "start": 28.8, "end": 28.98, "confidence": 0.821},
{"text": "troubles", "start": 28.98, "end": 29.72, "confidence": 0.519},
{"text": "me", "start": 29.72, "end": 30.02, "confidence": 0.987},
{"text": "so", "start": 30.02, "end": 30.48, "confidence": 0.843}
]
},
{
"id": 3,
"text": "You know that I'm nobody's fool",
"words": [
{"text": "You", "start": 30.56, "end": 30.8, "confidence": 0.676},
{"text": "know", "start": 30.8, "end": 31.1, "confidence": 0.987},
{"text": "that", "start": 31.1, "end": 31.46, "confidence": 0.984},
{"text": "I'm", "start": 31.46, "end": 32.4, "confidence": 0.954},
{"text": "nobody's", "start": 32.4, "end": 32.58, "confidence": 0.569},
{"text": "fool", "start": 32.58, "end": 33.66, "confidence": 0.854}
]
},
{
"id": 4,
"text": "I'm nobody's fool and yet it's clear to me",
"words": [
{"text": "I'm", "start": 32.58, "end": 32.7, "confidence": 0.854},
{"text": "nobody's", "start": 32.7, "end": 33.4, "confidence": 0.992},
{"text": "fool", "start": 33.4, "end": 33.66, "confidence": 0.997},
{"text": "and", "start": 33.66, "end": 33.88, "confidence": 0.445},
{"text": "yet", "start": 33.88, "end": 34.18, "confidence": 0.952},
{"text": "it's", "start": 34.18, "end": 34.48, "confidence": 0.956},
{"text": "clear", "start": 34.48, "end": 34.86, "confidence": 0.298},
{"text": "to", "start": 34.86, "end": 35.16, "confidence": 0.843},
{"text": "me", "start": 35.16, "end": 35.9, "confidence": 0.992}
]
},
{
"id": 5,
"text": "I don't have a strategy",
"words": [
{"text": "I", "start": 36.46, "end": 36.7, "confidence": 0.994},
{"text": "don't", "start": 36.7, "end": 37.06, "confidence": 0.999},
{"text": "have", "start": 37.06, "end": 37.3, "confidence": 0.999},
{"text": "a", "start": 37.3, "end": 37.48, "confidence": 0.986},
{"text": "strategy", "start": 37.48, "end": 38.52, "confidence": 0.999}
]
},
{
"id": 6,
"text": "It's just like taking candy from a baby",
"words": [
{"text": "It's", "start": 39.3, "end": 39.58, "confidence": 0.986},
{"text": "just", "start": 39.58, "end": 39.8, "confidence": 0.992},
{"text": "like", "start": 39.8, "end": 40.06, "confidence": 0.999},
{"text": "taking", "start": 40.06, "end": 40.46, "confidence": 0.986},
{"text": "candy", "start": 40.46, "end": 41.0, "confidence": 0.997},
{"text": "from", "start": 41.0, "end": 41.38, "confidence": 0.996},
{"text": "a", "start": 41.38, "end": 41.6, "confidence": 0.839},
{"text": "baby", "start": 41.6, "end": 42.2, "confidence": 0.998}
]
}
]
}
}
6 changes: 3 additions & 3 deletions lyrics_transcriber/llm_correction_instructions.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,13 @@ These should be reasonably accurate, with generally correct words and phrases.
However, they may not be perfect, and sometimes whole sections (such as a chorus or outro) may be missing or assumed to be repeated.

Carefully analyse each segment in data input 1, and compare with the lyrics in data input 2.
If all of the words match up correctly with the published lyrics, great! You can add that whole segment to your response.
If all of the words match up correctly with part of the published lyrics, great! You can add that whole segment to your response.
If some of the words match up but there are a couple of differences, correct those differences.
If you need to delete a word or two in order to correct the lyrics, that's acceptable.
If you need to add a word or two which were missing from the transcription, that's acceptable - you'll need to estimate the start and end timestamps based on the timestamps of the surrounding words.

The response needs to contain all of the following fields of CorrectedLyricsRawResponse:
The response JSON object needs to contain all of the following fields:

CorrectedLyricsRawResponse:
- segments: this is a list
- id: The id of the segment, from data input 1
- text: The full text of the corrected lyrics for this segment
Expand All @@ -28,3 +27,4 @@ CorrectedLyricsRawResponse:
- start: The start timestamp for this word, estimated if not known for sure.
- end: The end timestamp for this word, estimated if not known for sure.
- confidence: Your self-assessed confidence score (from 0 to 1) of how likely it is that this word is accurate. If the word has not changed from data input 1, keep the existing confidence value.

80 changes: 44 additions & 36 deletions lyrics_transcriber/transcriber.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def copy_files_to_output_dir(self):
self.result_metadata["karaoke_video_filepath"] = shutil.copy(self.result_metadata["karaoke_video_filepath"], self.output_dir)

def correct_whisper_lyrics(self):
self.logger.debug(f"correct_whisper_lyrics initiating OpenAI client")
self.logger.debug("correct_whisper_lyrics initiating OpenAI client")
client = OpenAI()
llm_instructions = ""

Expand All @@ -164,46 +164,54 @@ def correct_whisper_lyrics(self):

whisper_result_simplified = {"segments": []}

for segment in self.whisper_result_dict["segments"]:
whisper_result_simplified["segments"].append(
{
"id": segment["id"],
"start": segment["start"],
"end": segment["end"],
"text": segment["text"],
"confidence": segment["confidence"],
"words": segment["words"],
}
# Splitting the segments into batches of 4
segment_batches = [self.whisper_result_dict["segments"][i : i + 4] for i in range(0, len(self.whisper_result_dict["segments"]), 4)]
all_corrected_segments = []

for batch in segment_batches:
whisper_result_simplified["segments"].clear()

for segment in batch:
whisper_result_simplified["segments"].append(
{
"id": segment["id"],
"start": segment["start"],
"end": segment["end"],
"text": segment["text"],
"confidence": segment["confidence"],
"words": segment["words"],
}
)
whisper_result_simplified_str = json.dumps(whisper_result_simplified, indent=4)
data_string = f"Data input 1:\n{whisper_result_simplified_str} \n\nData input 2:\n{self.spotify_lyrics_text}\n"

self.logger.debug("About to call chat.completions API with system instructions and data inputs")
print(llm_instructions)
print(data_string)

# API call for each batch
response = client.chat.completions.create(
model="gpt-4-1106-preview",
response_format={"type": "json_object"},
messages=[{"role": "system", "content": llm_instructions}, {"role": "user", "content": data_string}],
)
whisper_result_simplified_str = json.dumps(whisper_result_simplified, indent=4)

data_string = f"Data input 1:\n{whisper_result_simplified_str} \n\nData input 2:\n{self.spotify_lyrics_text}\n"

self.logger.debug("about to call chat.completions API with system instructions and data inputs")
print(llm_instructions)
print(data_string)

response = client.chat.completions.create(
model="gpt-4-1106-preview",
response_format={"type": "json_object"},
messages=[{"role": "system", "content": llm_instructions}, {"role": "user", "content": data_string}],
)
print(response)

print(response)
message = response.choices[0].message.content
finish_reason = response.choices[0].finish_reason

message = response["choices"][0]["message"]["content"]
finish_reason = response["choices"][0]["finish_reason"]
if finish_reason == "stop":
try:
corrected_whisper_lyrics_dict = json.loads(message)
print(corrected_whisper_lyrics_dict)
all_corrected_segments.extend(corrected_whisper_lyrics_dict["segments"])
except json.JSONDecodeError as e:
raise Exception("Failed to parse response from GPT as JSON") from e
else:
self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")

if finish_reason == "stop":
try:
corrected_whisper_lyrics_dict = json.loads(message)
print(corrected_whisper_lyrics_dict)
return corrected_whisper_lyrics_dict
except json.JSONDecodeError as e:
raise Exception("Failed to parse response from GPT as JSON") from e
else:
self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")
return None
return {"segments": all_corrected_segments}

def write_spotify_lyrics_data_file(self):
if self.spotify_cookie and self.song_known:
Expand Down

0 comments on commit 34b7bb1

Please sign in to comment.