From ff121dd3dd14dd1baee8027ed723786d20c93ba8 Mon Sep 17 00:00:00 2001
From: Andrew Beveridge
Date: Sun, 19 Nov 2023 02:14:26 -0600
Subject: [PATCH] Improved prompt to pass in the previous 2 lines of corrected
 lyrics, which helps to prevent duplications and mistakes. Also logged the LLM
 transcript for easier debugging and understanding of what's going on

---
 .../llm_lyrics_correction_prompt.txt |   2 +
 lyrics_transcriber/transcriber.py    | 128 ++++++++++++------
 pyproject.toml                       |   2 +-
 3 files changed, 87 insertions(+), 45 deletions(-)

diff --git a/lyrics_transcriber/llm_prompts/llm_lyrics_correction_prompt.txt b/lyrics_transcriber/llm_prompts/llm_lyrics_correction_prompt.txt
index afb1839..716b45c 100644
--- a/lyrics_transcriber/llm_prompts/llm_lyrics_correction_prompt.txt
+++ b/lyrics_transcriber/llm_prompts/llm_lyrics_correction_prompt.txt
@@ -7,6 +7,8 @@ You will be provided with one or more reference data, containing published lyric
 These should be reasonably accurate, with generally correct words and phrases.
 However, they may not be perfect, and sometimes whole sections (such as a chorus or outro) may be missing or assumed to be repeated.
 
+When possible, you will also be provided with the previous 2 (corrected) lines of text, for additional context.
+
 Data input will contain one segment of an automated machine transcription of lyrics from a song, with start/end timestamps and confidence scores for every word in that segment.
 The timestamps for words are usually quite accurate, but the actual words which were heard by the transcription are typically only around 70% to 90% accurate.
 As such, it is common for there to be segments where most of the words are correct but one or two are wrong, or a single word may have been mistaken as two different words.
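For context on the prompt change above: the "Data input" it describes is one transcription segment serialized as JSON, now optionally preceded by the previous two corrected lines. A minimal sketch of how such a user message is assembled (not code from this repo; segment fields follow the Whisper-style shape used in transcriber.py below, and all values are made-up placeholders):

    import json

    # Illustrative Whisper-style segment; values are placeholders.
    segment = {
        "id": 12,
        "start": 45.2,
        "end": 48.9,
        "confidence": 0.82,
        "text": "shadows falling on the water",
        "words": [
            {"text": "shadows", "start": 45.2, "end": 45.7, "confidence": 0.91},
            {"text": "falling", "start": 45.8, "end": 46.3, "confidence": 0.74},
        ],
    }

    # Previous two corrected lines are prepended when available,
    # matching the data_input_str format in the diff below.
    previous_two_lines = "Previous two lines:\n\nfirst corrected line\nsecond corrected line\n"
    data_input_str = f"{previous_two_lines}\nData input:\n\n{json.dumps(segment)}\n"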
diff --git a/lyrics_transcriber/transcriber.py b/lyrics_transcriber/transcriber.py
index 32c4cde..7f6b447 100644
--- a/lyrics_transcriber/transcriber.py
+++ b/lyrics_transcriber/transcriber.py
@@ -68,7 +68,7 @@ def __init__(
         self.video_resolution = video_resolution
         self.video_background_image = video_background_image
         self.video_background_color = video_background_color
-        self.font_size = 120
+        self.font_size = 100
 
         match video_resolution:
             case "4k":
@@ -101,6 +101,7 @@ def __init__(
             "spotify_lyrics_text_filepath": None,
             "llm_token_usage": {"input": 0, "output": 0},
             "llm_costs_usd": {"input": 0.0, "output": 0.0, "total": 0.0},
+            "llm_transcript": None,
             "corrected_lyrics_text": None,
             "corrected_lyrics_text_filepath": None,
             "midico_lrc_filepath": None,
@@ -258,54 +259,93 @@ def write_corrected_lyrics_data_file(self):
 
             # TODO: Test if results are cleaner when using the vocal file from a background vocal audio separation model
 
-            for segment in self.outputs["transcription_data_dict"]["segments"]:
-                simplified_segment = {
-                    "id": segment["id"],
-                    "start": segment["start"],
-                    "end": segment["end"],
-                    "confidence": segment["confidence"],
-                    "text": segment["text"],
-                    "words": segment["words"],
-                }
-
-                simplified_segment_str = json.dumps(simplified_segment)
-                data_input_str = f"Data input:\n{simplified_segment_str}\n"
-
-                self.logger.info(f'Calling completion model {self.llm_model} with instructions and data input for segment {segment["id"]}:')
-                # self.logger.debug(data_input_str)
-
-                # API call for each segment
-                response = self.openai_client.chat.completions.create(
-                    model=self.llm_model,
-                    response_format={"type": "json_object"},
-                    messages=[{"role": "system", "content": llm_instructions}, {"role": "user", "content": data_input_str}],
-                )
+            # TODO: Record more info about the correction process (e.g before/after diffs for each segment) to a file for debugging
+            # TODO: Possibly add a step after segment-based correct to get the LLM to self-analyse the diff
 
-                message = response.choices[0].message.content
-                finish_reason = response.choices[0].finish_reason
+            llm_transcript_filepath = os.path.join(self.cache_dir, "llm-transcript-" + self.get_song_slug() + ".txt")
+            self.outputs["llm_transcript"] = ""
 
-                self.outputs["llm_token_usage"]["input"] += response.usage.prompt_tokens
-                self.outputs["llm_token_usage"]["output"] += response.usage.completion_tokens
+            total_segments = len(self.outputs["transcription_data_dict"]["segments"])
+            self.logger.info(f"Beginning correction using LLM, total segments: {total_segments}")
 
-                # self.logger.debug(f"response finish_reason: {finish_reason} message: \n{message}")
+            with open(llm_transcript_filepath, "a") as llm_transcript_file:
+                self.logger.debug(f"writing LLM chat instructions: {llm_transcript_filepath}")
+                llm_instructions_header = f"--- SYSTEM instructions passed in for all segments ---:\n\n"
+                self.outputs["llm_transcript"] += llm_instructions_header + llm_instructions + "\n"
+                llm_transcript_file.write(llm_instructions_header + llm_instructions + "\n")
 
-                if finish_reason == "stop":
-                    try:
-                        corrected_segment_dict = json.loads(message)
-                        corrected_lyrics_dict["segments"].append(corrected_segment_dict)
-                        self.logger.info("Successfully parsed response from GPT as JSON and appended to corrected_lyrics_dict.segments")
-                    except json.JSONDecodeError as e:
-                        raise Exception("Failed to parse response from GPT as JSON") from e
-                else:
-                    self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")
+                for segment in self.outputs["transcription_data_dict"]["segments"]:
+                    simplified_segment = {
+                        "id": segment["id"],
+                        "start": segment["start"],
+                        "end": segment["end"],
+                        "confidence": segment["confidence"],
+                        "text": segment["text"],
+                        "words": segment["words"],
+                    }
 
-            self.logger.info(f'Successfully processed correction for all {len(corrected_lyrics_dict["segments"])} lyrics segments')
+                    simplified_segment_str = json.dumps(simplified_segment)
 
-            self.logger.debug(
-                f"writing lyrics data JSON to corrected_lyrics_data_json_cache_filepath: {corrected_lyrics_data_json_cache_filepath}"
-            )
-            with open(corrected_lyrics_data_json_cache_filepath, "w") as f:
-                f.write(json.dumps(corrected_lyrics_dict, indent=4))
+                    previous_two_lines = ""
+
+                    if segment["id"] > 2:
+                        previous_two_lines = "Previous two lines:\n\n"
+
+                        for previous_segment in corrected_lyrics_dict["segments"]:
+                            if previous_segment["id"] == (segment["id"] - 2):
+                                previous_two_lines += previous_segment["text"].strip() + "\n"
+
+                        for previous_segment in corrected_lyrics_dict["segments"]:
+                            if previous_segment["id"] == (segment["id"] - 1):
+                                previous_two_lines += previous_segment["text"].strip() + "\n"
+
+                    data_input_str = f"{previous_two_lines}\nData input:\n\n{simplified_segment_str}\n"
+
+                    self.logger.info(
+                        f'Calling completion model {self.llm_model} with instructions and data input for segment {segment["id"]}:'
+                    )
+                    # self.logger.debug(data_input_str)
+
+                    llm_transcript_segment = f"--- INPUT for segment {segment['id']} / {total_segments} ---:\n\n"
+                    llm_transcript_segment += data_input_str
+
+                    response = self.openai_client.chat.completions.create(
+                        model=self.llm_model,
+                        response_format={"type": "json_object"},
+                        messages=[{"role": "system", "content": llm_instructions}, {"role": "user", "content": data_input_str}],
+                    )
+
+                    message = response.choices[0].message.content
+                    finish_reason = response.choices[0].finish_reason
+
+                    llm_transcript_segment += f"\n--- RESPONSE for segment {segment['id']} ---:\n\n"
+                    llm_transcript_segment += message
+                    llm_transcript_segment += f"\n--- END segment {segment['id']} / {total_segments} ---:\n\n"
+
+                    self.logger.debug(f"writing LLM chat transcript for segment to: {llm_transcript_filepath}")
+                    llm_transcript_file.write(llm_transcript_segment)
+                    self.outputs["llm_transcript"] += llm_transcript_segment
+
+                    self.outputs["llm_token_usage"]["input"] += response.usage.prompt_tokens
+                    self.outputs["llm_token_usage"]["output"] += response.usage.completion_tokens
+
+                    # self.logger.debug(f"response finish_reason: {finish_reason} message: \n{message}")
+
+                    if finish_reason == "stop":
+                        try:
+                            corrected_segment_dict = json.loads(message)
+                            corrected_lyrics_dict["segments"].append(corrected_segment_dict)
+                            self.logger.info("Successfully parsed response from GPT as JSON and appended to corrected_lyrics_dict.segments")
+                        except json.JSONDecodeError as e:
+                            raise Exception("Failed to parse response from GPT as JSON") from e
+                    else:
+                        self.logger.warning(f"OpenAI API call did not finish successfully, finish_reason: {finish_reason}")
+
+            self.logger.info(f'Successfully processed correction for all {len(corrected_lyrics_dict["segments"])} lyrics segments')
+
+            self.logger.debug(f"writing corrected lyrics data JSON filepath: {corrected_lyrics_data_json_cache_filepath}")
+            with open(corrected_lyrics_data_json_cache_filepath, "w") as corrected_lyrics_data_json_cache_file:
+                corrected_lyrics_data_json_cache_file.write(json.dumps(corrected_lyrics_dict, indent=4))
 
         except Exception as e:
             self.logger.warn(f"caught exception while attempting to correct lyrics: ", e)
@@ -535,7 +575,7 @@ def create_screens(self):
                 lines_in_current_screen += 1
                 line = None
 
-            # If current screen has 3 lines already, add screen to list and start new screen
+            # If current screen has 2 lines already, add screen to list and start new screen
             # Before looping to the next line
             if lines_in_current_screen == 2:
                 self.logger.debug(f"lines_in_current_screen is 2, adding screen to list and starting new screen")
diff --git a/pyproject.toml b/pyproject.toml
index 854697a..73b4c88 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "lyrics-transcriber"
-version = "0.12.2"
+version = "0.12.3"
 description = "Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify"
 authors = ["Andrew Beveridge "]
 license = "MIT"
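The core of the change is the previous-two-lines context gathering inside write_corrected_lyrics_data_file. Pulled out of the diff into a standalone sketch (a hypothetical helper, not in the repo; it assumes segment ids are 1-based and appended in order, which is what the `> 2` guard in the patch implies):

    def get_previous_two_lines(corrected_segments, current_id):
        # Skip context for the first two segments; assumes 1-based ids.
        # (With 0-based ids, segment 2 would miss its one available line of context.)
        if current_id <= 2:
            return ""
        lines = [
            s["text"].strip()
            for s in corrected_segments
            if s["id"] in (current_id - 2, current_id - 1)
        ]
        return "Previous two lines:\n\n" + "\n".join(lines) + "\n"

Feeding the already-corrected neighbouring lines back into each request gives the model local context it otherwise lacks, which is what the commit message credits for reducing duplicated and mistaken lines.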