Skip to content

Commit

Permalink
Added genius lyrics cleaning and lyrics processing
Browse files Browse the repository at this point in the history
  • Loading branch information
beveradb committed Jan 28, 2025
1 parent b72928a commit b63a62c
Show file tree
Hide file tree
Showing 6 changed files with 113 additions and 51 deletions.
16 changes: 10 additions & 6 deletions lyrics_transcriber/core/controller.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import difflib
import json
import os
import logging
from dataclasses import dataclass, field
from typing import Dict, Optional, List
from lyrics_transcriber.types import LyricsData, PhraseType, TranscriptionResult, CorrectionResult, AnchorSequence, GapSequence, PhraseScore
from lyrics_transcriber.types import LyricsData, TranscriptionResult, CorrectionResult
from lyrics_transcriber.transcribers.base_transcriber import BaseTranscriber
from lyrics_transcriber.transcribers.audioshake import AudioShakeTranscriber, AudioShakeConfig
from lyrics_transcriber.transcribers.whisper import WhisperTranscriber, WhisperConfig
Expand Down Expand Up @@ -205,12 +204,17 @@ def process(self) -> LyricsControllerResult:
if self.output_config.run_transcription:
self.transcribe()

# Step 3: Process and correct lyrics if enabled
if self.output_config.run_correction:
# Step 3: Process and correct lyrics if enabled AND we have transcription results
if self.output_config.run_correction and self.results.transcription_results:
self.correct_lyrics()
elif self.output_config.run_correction:
self.logger.info("Skipping lyrics correction - no transcription results available")

# Step 4: Generate outputs based on what's enabled and available
self.generate_outputs()
# Step 4: Generate outputs based on what we have
if self.results.transcription_corrected or self.results.lyrics_results:
self.generate_outputs()
else:
self.logger.warning("No corrected transcription or lyrics available. Skipping output generation.")

self.logger.info("Processing completed successfully")
return self.results
Expand Down
10 changes: 9 additions & 1 deletion lyrics_transcriber/correction/corrector.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ def __init__(
logger: Optional[logging.Logger] = None,
):
self.logger = logger or logging.getLogger(__name__)
self.anchor_finder = anchor_finder or AnchorSequenceFinder(cache_dir=cache_dir, logger=self.logger)
self._anchor_finder = anchor_finder
self._cache_dir = cache_dir

# Default handlers in order of preference
self.handlers = handlers or [
Expand All @@ -42,6 +43,13 @@ def __init__(
LevenshteinHandler(),
]

@property
def anchor_finder(self) -> AnchorSequenceFinder:
    """Return the anchor finder, building a default one on first access.

    If a finder is already stored on the instance it is returned as-is.
    Otherwise an AnchorSequenceFinder is constructed from the stored cache
    directory and this object's logger, remembered, and returned, so the
    construction happens at most once per instance.
    """
    finder = self._anchor_finder
    if finder is not None:
        return finder

    # Nothing stored yet: create the default finder now and keep it for reuse.
    finder = AnchorSequenceFinder(cache_dir=self._cache_dir, logger=self.logger)
    self._anchor_finder = finder
    return finder

def run(self, transcription_results: List[TranscriptionResult], lyrics_results: List[LyricsData]) -> CorrectionResult:
"""Execute the correction process."""
if not transcription_results:
Expand Down
35 changes: 28 additions & 7 deletions lyrics_transcriber/lyrics/base_lyrics_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import os
from abc import ABC, abstractmethod
from lyrics_transcriber.types import LyricsData
from karaoke_lyrics_processor import KaraokeLyricsProcessor


@dataclass
Expand All @@ -17,6 +18,7 @@ class LyricsProviderConfig:
spotify_cookie: Optional[str] = None
cache_dir: Optional[str] = None
audio_filepath: Optional[str] = None
max_line_length: int = 36 # New config parameter for KaraokeLyricsProcessor


class BaseLyricsProvider(ABC):
Expand All @@ -26,6 +28,7 @@ def __init__(self, config: LyricsProviderConfig, logger: Optional[logging.Logger
self.logger = logger or logging.getLogger(__name__)
self.cache_dir = Path(config.cache_dir) if config.cache_dir else None
self.audio_filepath = config.audio_filepath
self.max_line_length = config.max_line_length
if self.cache_dir:
self.cache_dir.mkdir(parents=True, exist_ok=True)
self.logger.debug(f"Initialized {self.__class__.__name__} with cache dir: {self.cache_dir}")
Expand All @@ -35,21 +38,22 @@ def fetch_lyrics(self, artist: str, title: str) -> Optional[LyricsData]:
if not self.cache_dir:
return self._fetch_and_convert_result(artist, title)

file_hash = self._get_file_hash(self.audio_filepath)
raw_cache_path = self._get_cache_path(file_hash, "raw")
# Use artist and title for cache key instead of audio file hash
cache_key = self._get_artist_title_hash(artist, title)
raw_cache_path = self._get_cache_path(cache_key, "raw")

# Try to load from cache first
raw_data = self._load_from_cache(raw_cache_path)
if raw_data is not None:
self.logger.info(f"Using cached lyrics for {artist} - {title}")
return self._save_and_convert_result(file_hash, raw_data)
return self._save_and_convert_result(cache_key, raw_data)

# If not in cache, fetch from source
raw_result = self._fetch_data_from_source(artist, title)
if raw_result:
# Save raw API response
self._save_to_cache(raw_cache_path, raw_result)
return self._save_and_convert_result(file_hash, raw_result)
return self._save_and_convert_result(cache_key, raw_result)

return None

Expand Down Expand Up @@ -95,13 +99,30 @@ def _load_from_cache(self, cache_path: str) -> Optional[Dict[str, Any]]:
self.logger.warning(f"Cache file {cache_path} is corrupted")
return None

def _process_lyrics(self, lyrics_data: LyricsData) -> LyricsData:
    """Run the lyrics text through KaraokeLyricsProcessor.

    The processor receives this provider's effective log level, the formatter
    of the logger's first handler (or None when the logger has no handlers),
    the raw lyrics text, and the configured max_line_length.

    Returns a new LyricsData whose ``lyrics`` is the processed text; source,
    segments and metadata are carried over from the input unchanged.
    """
    log = self.logger
    level = log.getEffectiveLevel()
    # Reuse the formatter of the first attached handler, if there is one.
    formatter = log.handlers[0].formatter if log.handlers else None

    processed_text = KaraokeLyricsProcessor(
        log_level=level,
        log_formatter=formatter,
        input_lyrics_text=lyrics_data.lyrics,
        max_line_length=self.max_line_length,
    ).process()

    # Build a fresh LyricsData instead of mutating the one passed in.
    return LyricsData(
        source=lyrics_data.source,
        lyrics=processed_text,
        segments=lyrics_data.segments,
        metadata=lyrics_data.metadata,
    )

def _save_and_convert_result(self, cache_key: str, raw_data: Dict[str, Any]) -> LyricsData:
"""Convert raw result to standardized format, save to cache, and return."""
"""Convert raw result to standardized format, process lyrics, save to cache, and return."""
converted_cache_path = self._get_cache_path(cache_key, "converted")
converted_result = self._convert_result_format(raw_data)

# Process the lyrics
processed_result = self._process_lyrics(converted_result)

# Convert to dictionary before saving to cache
self._save_to_cache(converted_cache_path, converted_result.to_dict())
return converted_result
self._save_to_cache(converted_cache_path, processed_result.to_dict())
return processed_result

def _fetch_and_convert_result(self, artist: str, title: str) -> Optional[LyricsData]:
"""Fetch and convert result when caching is disabled."""
Expand Down
39 changes: 33 additions & 6 deletions lyrics_transcriber/lyrics/genius.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import re
from typing import Optional, Dict, Any
import lyricsgenius
from lyrics_transcriber.types import LyricsData, LyricsMetadata
Expand All @@ -13,9 +14,15 @@ def __init__(self, config: LyricsProviderConfig, logger: Optional[logging.Logger
self.api_token = config.genius_api_token
self.client = None
if self.api_token:
self.client = lyricsgenius.Genius(self.api_token)
self.client.verbose = False
self.client.remove_section_headers = True
self.client = lyricsgenius.Genius(
self.api_token,
verbose=(logger.getEffectiveLevel() == logging.DEBUG if logger else False),
remove_section_headers=True, # Remove [Chorus], [Verse], etc.
skip_non_songs=True, # Skip track listings and other non-song results
timeout=10, # Reasonable timeout for requests
retries=3, # Number of retries for failed requests
sleep_time=1, # Small delay between requests to be nice to the API
)

def _fetch_data_from_source(self, artist: str, title: str) -> Optional[Dict[str, Any]]:
"""Fetch raw song data from Genius API."""
Expand All @@ -35,6 +42,9 @@ def _fetch_data_from_source(self, artist: str, title: str) -> Optional[Dict[str,

def _convert_result_format(self, raw_data: Dict[str, Any]) -> LyricsData:
"""Convert Genius's raw API response to standardized format."""
# Clean the lyrics before processing
lyrics = self._clean_lyrics(raw_data.get("lyrics", ""))

# Extract release date components if available
release_date = None
if release_components := raw_data.get("release_date_components"):
Expand Down Expand Up @@ -68,6 +78,23 @@ def _convert_result_format(self, raw_data: Dict[str, Any]) -> LyricsData:
)

# Create result object
return LyricsData(
source="genius", lyrics=raw_data.get("lyrics", ""), segments=[], metadata=metadata
) # Genius doesn't provide timestamp data
return LyricsData(source="genius", lyrics=lyrics, segments=[], metadata=metadata)

def _clean_lyrics(self, lyrics: str) -> str:
    """Strip Genius page boilerplate from scraped lyrics text.

    The rules are applied in order:

    1. Literal backslash-n sequences become real newlines.
    2. The "You might also like" promo text is dropped.
    3. A "<song name> Lyrics" header fused onto the first lyric word is cut.
    4. A leading "<N> Contributors...Lyrics" header is cut.
    5. The "See <artist> LiveGet tickets as low as $<N>" advert is cut.
    6. The trailing "Embed" marker is removed, whether it is preceded by
       digits, glued onto the last word, or sitting on a line of its own.
    7. Bracketed annotations are removed together with any text that
       precedes them on the same line.

    Args:
        lyrics: Raw lyrics text; ``None`` or an empty string is tolerated.

    Returns:
        The cleaned lyrics, or an empty string when there was nothing to clean.
    """
    # A missing or empty lyrics field has nothing to clean; avoid calling
    # string methods on None.
    if not lyrics:
        return ""

    # Turn escaped newlines (backslash + "n") into real line breaks.
    lyrics = lyrics.replace("\\n", "\n")

    # Plain substring, so no regex is needed.
    lyrics = lyrics.replace("You might also like", "")

    # Remove the song name and the word "Lyrics" when it is fused directly
    # onto a following capitalised word, keeping that capital letter.
    lyrics = re.sub(r".*?Lyrics([A-Z])", r"\1", lyrics)

    # Remove a header at the very start of the text, for example:
    # "27 ContributorsSex Bomb Lyrics"
    lyrics = re.sub(r"^[0-9]* Contributors.*Lyrics", "", lyrics)

    # Remove the ticket advert, for example:
    # "See Tom Jones LiveGet tickets as low as $71"
    lyrics = re.sub(r"See.*Live.*Get tickets as low as \$[0-9]+", "", lyrics)

    # Remove "Embed" at the end of the text when preceded by digits.
    lyrics = re.sub(r"[0-9]+Embed$", "", lyrics)

    # Remove "Embed" when it has been tacked onto the final word.
    lyrics = re.sub(r"(\S)Embed$", r"\1", lyrics)

    # Remove "Embed" when it stands alone on a line. MULTILINE is required:
    # without it ^...$ only matches when the entire text is "Embed", so a
    # trailing "Embed" line was never removed.
    lyrics = re.sub(r"^Embed$", "", lyrics, flags=re.MULTILINE)

    # Remove bracketed annotations such as "[Chorus]" along with whatever
    # precedes them on the same line; text after the closing bracket stays.
    lyrics = re.sub(r".*?\[.*?\].*?", "", lyrics)

    # add any additional cleaning rules here
    return lyrics
62 changes: 32 additions & 30 deletions lyrics_transcriber/output/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def __init__(

def generate_outputs(
self,
transcription_corrected: CorrectionResult,
transcription_corrected: Optional[CorrectionResult],
lyrics_results: List[LyricsData],
output_prefix: str,
audio_filepath: str,
Expand All @@ -110,35 +110,37 @@ def generate_outputs(
for lyrics_data in lyrics_results:
self.plain_text.write_lyrics(lyrics_data, output_prefix)

# Write original (uncorrected) transcription
outputs.original_txt = self.plain_text.write_original_transcription(transcription_corrected, output_prefix)

# Resize corrected segments to ensure none are longer than max_line_length
resized_segments = self.segment_resizer.resize_segments(transcription_corrected.corrected_segments)
transcription_corrected.resized_segments = resized_segments
outputs.corrections_json = self.write_corrections_data(transcription_corrected, output_prefix)

# Write corrected lyrics as plain text
outputs.corrected_txt = self.plain_text.write_corrected_lyrics(resized_segments, output_prefix)

# Generate LRC using LyricsFileGenerator
outputs.lrc = self.lyrics_file.generate_lrc(resized_segments, output_prefix)

# Generate CDG file if requested
if self.config.generate_cdg:
outputs.cdg, outputs.mp3, outputs.cdg_zip = self.cdg.generate_cdg(
segments=resized_segments,
audio_file=audio_filepath,
title=title or output_prefix,
artist=artist or "",
cdg_styles=self.config.styles["cdg"],
)

# Generate video if requested
if self.config.render_video:
# Generate ASS subtitles
outputs.ass = self.subtitle.generate_ass(resized_segments, output_prefix, audio_filepath)
outputs.video = self.video.generate_video(outputs.ass, audio_filepath, output_prefix)
# Only process transcription-related outputs if we have transcription data
if transcription_corrected:
# Write original (uncorrected) transcription
outputs.original_txt = self.plain_text.write_original_transcription(transcription_corrected, output_prefix)

# Resize corrected segments to ensure none are longer than max_line_length
resized_segments = self.segment_resizer.resize_segments(transcription_corrected.corrected_segments)
transcription_corrected.resized_segments = resized_segments
outputs.corrections_json = self.write_corrections_data(transcription_corrected, output_prefix)

# Write corrected lyrics as plain text
outputs.corrected_txt = self.plain_text.write_corrected_lyrics(resized_segments, output_prefix)

# Generate LRC using LyricsFileGenerator
outputs.lrc = self.lyrics_file.generate_lrc(resized_segments, output_prefix)

# Generate CDG file if requested
if self.config.generate_cdg:
outputs.cdg, outputs.mp3, outputs.cdg_zip = self.cdg.generate_cdg(
segments=resized_segments,
audio_file=audio_filepath,
title=title or output_prefix,
artist=artist or "",
cdg_styles=self.config.styles["cdg"],
)

# Generate video if requested
if self.config.render_video:
# Generate ASS subtitles
outputs.ass = self.subtitle.generate_ass(resized_segments, output_prefix, audio_filepath)
outputs.video = self.video.generate_video(outputs.ass, audio_filepath, output_prefix)

return outputs

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "lyrics-transcriber"
version = "0.33.0"
version = "0.34.0"
description = "Automatically create synchronised lyrics files in ASS and MidiCo LRC formats with word-level timestamps, using Whisper and lyrics from Genius and Spotify"
authors = ["Andrew Beveridge <[email protected]>"]
license = "MIT"
Expand Down

0 comments on commit b63a62c

Please sign in to comment.