Added genius lyrics cleaning and lyrics processing

nomadkaraoke · Jan 28, 2025 · b63a62c · b63a62c
1 parent b72928a
commit b63a62c
Show file tree

Hide file tree

Showing 6 changed files with 113 additions and 51 deletions.
diff --git a/lyrics_transcriber/core/controller.py b/lyrics_transcriber/core/controller.py
@@ -1,10 +1,9 @@
 import difflib
-import json
 import os
 import logging
 from dataclasses import dataclass, field
 from typing import Dict, Optional, List
-from lyrics_transcriber.types import LyricsData, PhraseType, TranscriptionResult, CorrectionResult, AnchorSequence, GapSequence, PhraseScore
+from lyrics_transcriber.types import LyricsData, TranscriptionResult, CorrectionResult
 from lyrics_transcriber.transcribers.base_transcriber import BaseTranscriber
 from lyrics_transcriber.transcribers.audioshake import AudioShakeTranscriber, AudioShakeConfig
 from lyrics_transcriber.transcribers.whisper import WhisperTranscriber, WhisperConfig
@@ -205,12 +204,17 @@ def process(self) -> LyricsControllerResult:
         if self.output_config.run_transcription:
             self.transcribe()
 
-        # Step 3: Process and correct lyrics if enabled
-        if self.output_config.run_correction:
+        # Step 3: Process and correct lyrics if enabled AND we have transcription results
+        if self.output_config.run_correction and self.results.transcription_results:
             self.correct_lyrics()
+        elif self.output_config.run_correction:
+            self.logger.info("Skipping lyrics correction - no transcription results available")
 
-        # Step 4: Generate outputs based on what's enabled and available
-        self.generate_outputs()
+        # Step 4: Generate outputs based on what we have
+        if self.results.transcription_corrected or self.results.lyrics_results:
+            self.generate_outputs()
+        else:
+            self.logger.warning("No corrected transcription or lyrics available. Skipping output generation.")
 
         self.logger.info("Processing completed successfully")
         return self.results

diff --git a/lyrics_transcriber/correction/corrector.py b/lyrics_transcriber/correction/corrector.py
@@ -28,7 +28,8 @@ def __init__(
         logger: Optional[logging.Logger] = None,
     ):
         self.logger = logger or logging.getLogger(__name__)
-        self.anchor_finder = anchor_finder or AnchorSequenceFinder(cache_dir=cache_dir, logger=self.logger)
+        self._anchor_finder = anchor_finder
+        self._cache_dir = cache_dir
 
         # Default handlers in order of preference
         self.handlers = handlers or [
@@ -42,6 +43,13 @@ def __init__(
             LevenshteinHandler(),
         ]
 
+    @property
+    def anchor_finder(self) -> AnchorSequenceFinder:
+        """Return the anchor finder, creating an AnchorSequenceFinder from the stored cache_dir and logger on first access if none was set."""
+        if self._anchor_finder is None:
+            self._anchor_finder = AnchorSequenceFinder(cache_dir=self._cache_dir, logger=self.logger)
+        return self._anchor_finder
+
     def run(self, transcription_results: List[TranscriptionResult], lyrics_results: List[LyricsData]) -> CorrectionResult:
         """Execute the correction process."""
         if not transcription_results:

diff --git a/lyrics_transcriber/lyrics/base_lyrics_provider.py b/lyrics_transcriber/lyrics/base_lyrics_provider.py
@@ -7,6 +7,7 @@
 import os
 from abc import ABC, abstractmethod
 from lyrics_transcriber.types import LyricsData
+from karaoke_lyrics_processor import KaraokeLyricsProcessor
 
 
 @dataclass
@@ -17,6 +18,7 @@ class LyricsProviderConfig:
     spotify_cookie: Optional[str] = None
     cache_dir: Optional[str] = None
     audio_filepath: Optional[str] = None
+    max_line_length: int = 36  # New config parameter for KaraokeLyricsProcessor
 
 
 class BaseLyricsProvider(ABC):
@@ -26,6 +28,7 @@ def __init__(self, config: LyricsProviderConfig, logger: Optional[logging.Logger
         self.logger = logger or logging.getLogger(__name__)
         self.cache_dir = Path(config.cache_dir) if config.cache_dir else None
         self.audio_filepath = config.audio_filepath
+        self.max_line_length = config.max_line_length
         if self.cache_dir:
             self.cache_dir.mkdir(parents=True, exist_ok=True)
             self.logger.debug(f"Initialized {self.__class__.__name__} with cache dir: {self.cache_dir}")
@@ -35,21 +38,22 @@ def fetch_lyrics(self, artist: str, title: str) -> Optional[LyricsData]:
         if not self.cache_dir:
             return self._fetch_and_convert_result(artist, title)
 
-        file_hash = self._get_file_hash(self.audio_filepath)
-        raw_cache_path = self._get_cache_path(file_hash, "raw")
+        # Use artist and title for cache key instead of audio file hash
+        cache_key = self._get_artist_title_hash(artist, title)
+        raw_cache_path = self._get_cache_path(cache_key, "raw")
 
         # Try to load from cache first
         raw_data = self._load_from_cache(raw_cache_path)
         if raw_data is not None:
             self.logger.info(f"Using cached lyrics for {artist} - {title}")
-            return self._save_and_convert_result(file_hash, raw_data)
+            return self._save_and_convert_result(cache_key, raw_data)
 
         # If not in cache, fetch from source
         raw_result = self._fetch_data_from_source(artist, title)
         if raw_result:
             # Save raw API response
             self._save_to_cache(raw_cache_path, raw_result)
-            return self._save_and_convert_result(file_hash, raw_result)
+            return self._save_and_convert_result(cache_key, raw_result)
 
         return None
 
@@ -95,13 +99,30 @@ def _load_from_cache(self, cache_path: str) -> Optional[Dict[str, Any]]:
             self.logger.warning(f"Cache file {cache_path} is corrupted")
             return None
 
+    def _process_lyrics(self, lyrics_data: LyricsData) -> LyricsData:
+        """Return a new LyricsData whose text has been run through KaraokeLyricsProcessor."""
+        handlers = self.logger.handlers
+        processed_text = KaraokeLyricsProcessor(
+            log_level=self.logger.getEffectiveLevel(),
+            log_formatter=handlers[0].formatter if handlers else None,
+            input_lyrics_text=lyrics_data.lyrics,
+            max_line_length=self.max_line_length,
+        ).process()
+
+        # Source, segments and metadata carry over unchanged; only the lyrics text is replaced
+        return LyricsData(source=lyrics_data.source, lyrics=processed_text, segments=lyrics_data.segments, metadata=lyrics_data.metadata)
+
     def _save_and_convert_result(self, cache_key: str, raw_data: Dict[str, Any]) -> LyricsData:
-        """Convert raw result to standardized format, save to cache, and return."""
+        """Convert raw result to standardized format, process the lyrics, cache the processed result, and return it."""
         converted_cache_path = self._get_cache_path(cache_key, "converted")
         converted_result = self._convert_result_format(raw_data)
+
+        # Run the converted lyrics through _process_lyrics so the cached and returned copy is the processed one
+        processed_result = self._process_lyrics(converted_result)
+
         # Convert to dictionary before saving to cache
-        self._save_to_cache(converted_cache_path, converted_result.to_dict())
-        return converted_result
+        self._save_to_cache(converted_cache_path, processed_result.to_dict())
+        return processed_result
 
     def _fetch_and_convert_result(self, artist: str, title: str) -> Optional[LyricsData]:
         """Fetch and convert result when caching is disabled."""

diff --git a/lyrics_transcriber/lyrics/genius.py b/lyrics_transcriber/lyrics/genius.py
@@ -1,4 +1,5 @@
 import logging
+import re
 from typing import Optional, Dict, Any
 import lyricsgenius
 from lyrics_transcriber.types import LyricsData, LyricsMetadata
@@ -13,9 +14,15 @@ def __init__(self, config: LyricsProviderConfig, logger: Optional[logging.Logger
         self.api_token = config.genius_api_token
         self.client = None
         if self.api_token:
-            self.client = lyricsgenius.Genius(self.api_token)
-            self.client.verbose = False
-            self.client.remove_section_headers = True
+            self.client = lyricsgenius.Genius(
+                self.api_token,
+                verbose=(logger.getEffectiveLevel() == logging.DEBUG if logger else False),
+                remove_section_headers=True,  # Remove [Chorus], [Verse], etc.
+                skip_non_songs=True,  # Skip track listings and other non-song results
+                timeout=10,  # Reasonable timeout for requests
+                retries=3,  # Number of retries for failed requests
+                sleep_time=1,  # Small delay between requests to be nice to the API
+            )
 
     def _fetch_data_from_source(self, artist: str, title: str) -> Optional[Dict[str, Any]]:
         """Fetch raw song data from Genius API."""
@@ -35,6 +42,9 @@ def _fetch_data_from_source(self, artist: str, title: str) -> Optional[Dict[str,
 
     def _convert_result_format(self, raw_data: Dict[str, Any]) -> LyricsData:
         """Convert Genius's raw API response to standardized format."""
+        # Clean the lyrics before processing
+        lyrics = self._clean_lyrics(raw_data.get("lyrics", ""))
+
         # Extract release date components if available
         release_date = None
         if release_components := raw_data.get("release_date_components"):
@@ -68,6 +78,23 @@ def _convert_result_format(self, raw_data: Dict[str, Any]) -> LyricsData:
         )
 
         # Create result object
-        return LyricsData(
-            source="genius", lyrics=raw_data.get("lyrics", ""), segments=[], metadata=metadata
-        )  # Genius doesn't provide timestamp data
+        return LyricsData(source="genius", lyrics=lyrics, segments=[], metadata=metadata)
+
+    def _clean_lyrics(self, lyrics: str) -> str:
+        """Strip Genius page artefacts (title header, promo text, trailing "Embed") from raw lyrics and return the result."""
+        if not lyrics:
+            return ""  # Nothing to clean (also guards against a missing/None lyrics value)
+        lyrics = lyrics.replace("\\n", "\n")  # Turn literal backslash-n sequences into real newlines
+        lyrics = lyrics.replace("You might also like", "")
+        # Drop everything on a line up to and including "Lyrics" when it is glued to a capitalised word
+        lyrics = re.sub(r".*?Lyrics([A-Z])", r"\1", lyrics)
+        lyrics = re.sub(r"^[0-9]* Contributors.*Lyrics", "", lyrics)  # Leading header, e.g. "27 ContributorsSex Bomb Lyrics"
+        # Ticket promo, e.g. "See Tom Jones LiveGet tickets as low as $71"
+        lyrics = re.sub(r"See.*Live.*Get tickets as low as \$[0-9]+", "", lyrics)
+        lyrics = re.sub(r"[0-9]+Embed$", "", lyrics)  # Trailing "<digits>Embed" at the very end of the lyrics
+        lyrics = re.sub(r"(\S)Embed$", r"\1", lyrics)  # "Embed" tacked onto the final word of the lyrics
+        # "Embed" alone on a line: needs MULTILINE, otherwise ^...$ can only match when the whole text is "Embed"
+        lyrics = re.sub(r"^Embed$", "", lyrics, flags=re.MULTILINE)
+        # Remove bracketed text (e.g. "[Chorus]") plus anything before it on the same line; text after "]" is kept
+        lyrics = re.sub(r".*?\[.*?\].*?", "", lyrics)  # add any additional cleaning rules after this one
+        return lyrics
diff --git a/lyrics_transcriber/output/generator.py b/lyrics_transcriber/output/generator.py
@@ -95,7 +95,7 @@ def __init__(
 
     def generate_outputs(
         self,
-        transcription_corrected: CorrectionResult,
+        transcription_corrected: Optional[CorrectionResult],
         lyrics_results: List[LyricsData],
         output_prefix: str,
         audio_filepath: str,
@@ -110,35 +110,37 @@ def generate_outputs(
             for lyrics_data in lyrics_results:
                 self.plain_text.write_lyrics(lyrics_data, output_prefix)
 
-            # Write original (uncorrected) transcription
-            outputs.original_txt = self.plain_text.write_original_transcription(transcription_corrected, output_prefix)
-
-            # Resize corrected segments to ensure none are longer than max_line_length
-            resized_segments = self.segment_resizer.resize_segments(transcription_corrected.corrected_segments)
-            transcription_corrected.resized_segments = resized_segments
-            outputs.corrections_json = self.write_corrections_data(transcription_corrected, output_prefix)
-
-            # Write corrected lyrics as plain text
-            outputs.corrected_txt = self.plain_text.write_corrected_lyrics(resized_segments, output_prefix)
-
-            # Generate LRC using LyricsFileGenerator
-            outputs.lrc = self.lyrics_file.generate_lrc(resized_segments, output_prefix)
-
-            # Generate CDG file if requested
-            if self.config.generate_cdg:
-                outputs.cdg, outputs.mp3, outputs.cdg_zip = self.cdg.generate_cdg(
-                    segments=resized_segments,
-                    audio_file=audio_filepath,
-                    title=title or output_prefix,
-                    artist=artist or "",
-                    cdg_styles=self.config.styles["cdg"],
-                )
-
-            # Generate video if requested
-            if self.config.render_video:
-                # Generate ASS subtitles
-                outputs.ass = self.subtitle.generate_ass(resized_segments, output_prefix, audio_filepath)
-                outputs.video = self.video.generate_video(outputs.ass, audio_filepath, output_prefix)
+            # Only process transcription-related outputs if we have transcription data
+            if transcription_corrected:
+                # Write original (uncorrected) transcription
+                outputs.original_txt = self.plain_text.write_original_transcription(transcription_corrected, output_prefix)
+
+                # Resize corrected segments to ensure none are longer than max_line_length
+                resized_segments = self.segment_resizer.resize_segments(transcription_corrected.corrected_segments)
+                transcription_corrected.resized_segments = resized_segments
+                outputs.corrections_json = self.write_corrections_data(transcription_corrected, output_prefix)
+
+                # Write corrected lyrics as plain text
+                outputs.corrected_txt = self.plain_text.write_corrected_lyrics(resized_segments, output_prefix)
+
+                # Generate LRC using LyricsFileGenerator
+                outputs.lrc = self.lyrics_file.generate_lrc(resized_segments, output_prefix)
+
+                # Generate CDG file if requested
+                if self.config.generate_cdg:
+                    outputs.cdg, outputs.mp3, outputs.cdg_zip = self.cdg.generate_cdg(
+                        segments=resized_segments,
+                        audio_file=audio_filepath,
+                        title=title or output_prefix,
+                        artist=artist or "",
+                        cdg_styles=self.config.styles["cdg"],
+                    )
+
+                # Generate video if requested
+                if self.config.render_video:
+                    # Generate ASS subtitles
+                    outputs.ass = self.subtitle.generate_ass(resized_segments, output_prefix, audio_filepath)
+                    outputs.video = self.video.generate_video(outputs.ass, audio_filepath, output_prefix)
 
             return outputs
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "lyrics-transcriber"
-version = "0.33.0"
+version = "0.34.0"
 description = "Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify"
 authors = ["Andrew Beveridge <[email protected]>"]
 license = "MIT"