diff --git a/lyrics_transcriber/correction/corrector.py b/lyrics_transcriber/correction/corrector.py index 38cbb2d..9df196b 100644 --- a/lyrics_transcriber/correction/corrector.py +++ b/lyrics_transcriber/correction/corrector.py @@ -6,6 +6,9 @@ from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler from lyrics_transcriber.correction.handlers.word_count_match import WordCountMatchHandler from lyrics_transcriber.correction.handlers.extra_words import ExtraWordsHandler +from lyrics_transcriber.correction.handlers.sound_alike import SoundAlikeHandler +from lyrics_transcriber.correction.handlers.levenshtein import LevenshteinHandler +from lyrics_transcriber.correction.handlers.repeat import RepeatCorrectionHandler class LyricsCorrector: @@ -26,12 +29,9 @@ def __init__( self.handlers = handlers or [ WordCountMatchHandler(), ExtraWordsHandler(), - # AnchorWordsInGapHandler(), # "Correct" words which are in the gap but are identical in the reference - # CombinedHandler(), # Try combined matching first - # MetaphoneHandler(), # Fall back to individual matchers - # SemanticHandler(), - # MultiWordLevenshteinHandler(), - # LevenshteinSimilarityHandler(), # Last resort + RepeatCorrectionHandler(), + SoundAlikeHandler(), + LevenshteinHandler(), # Last resort # HumanHandler(), # Open web UI for human to review and correct ] @@ -143,6 +143,11 @@ def _process_gaps(self, gap_sequences: List[GapSequence]) -> List[WordCorrection break self.logger.debug(f"Trying handler {handler.__class__.__name__}") + + # Pass previous corrections to RepeatCorrectionHandler + if isinstance(handler, RepeatCorrectionHandler): + handler.set_previous_corrections(all_corrections) + if handler.can_handle(gap): self.logger.debug(f"{handler.__class__.__name__} can handle gap") corrections = handler.handle(gap) diff --git a/lyrics_transcriber/correction/handlers/__init__.py b/lyrics_transcriber/correction/handlers/__init__.py index 6dba361..13a20aa 100644 --- 
a/lyrics_transcriber/correction/handlers/__init__.py +++ b/lyrics_transcriber/correction/handlers/__init__.py @@ -1,17 +1,15 @@ from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler from lyrics_transcriber.correction.handlers.word_count_match import WordCountMatchHandler -from lyrics_transcriber.correction.handlers.levenshtein import LevenshteinSimilarityHandler -from lyrics_transcriber.correction.handlers.multi_levenshtein import MultiWordLevenshteinHandler -from lyrics_transcriber.correction.handlers.metaphone import MetaphoneHandler -from lyrics_transcriber.correction.handlers.semantic import SemanticHandler -from lyrics_transcriber.correction.handlers.combined import CombinedHandler +from lyrics_transcriber.correction.handlers.levenshtein import LevenshteinHandler +from lyrics_transcriber.correction.handlers.sound_alike import SoundAlikeHandler +from lyrics_transcriber.correction.handlers.extra_words import ExtraWordsHandler +from lyrics_transcriber.correction.handlers.human import HumanHandler __all__ = [ "GapCorrectionHandler", "WordCountMatchHandler", - "LevenshteinSimilarityHandler", - "MultiWordLevenshteinHandler", - "MetaphoneHandler", - "SemanticHandler", - "CombinedHandler", + "LevenshteinHandler", + "SoundAlikeHandler", + "ExtraWordsHandler", + "HumanHandler", ] diff --git a/lyrics_transcriber/correction/handlers/combined.py b/lyrics_transcriber/correction/handlers/combined.py deleted file mode 100644 index 9c357a6..0000000 --- a/lyrics_transcriber/correction/handlers/combined.py +++ /dev/null @@ -1,86 +0,0 @@ -from typing import Dict, List, Optional, Set, Tuple - -from lyrics_transcriber.types import GapSequence, Word, WordCorrection -from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler -from lyrics_transcriber.correction.handlers.metaphone import MetaphoneHandler -from lyrics_transcriber.correction.handlers.semantic import SemanticHandler - - -class CombinedHandler(GapCorrectionHandler): - """Combines 
phonetic and semantic matching with weighted scoring.""" - - def __init__( - self, - phonetic_weight: float = 0.6, - semantic_weight: float = 0.4, - combined_threshold: float = 0.5, - phonetic_threshold: float = 0.4, - semantic_threshold: float = 0.3, - ): - self.phonetic_matcher = MetaphoneHandler() - self.semantic_matcher = SemanticHandler() - self.phonetic_weight = phonetic_weight - self.semantic_weight = semantic_weight - self.combined_threshold = combined_threshold - self.phonetic_threshold = phonetic_threshold - self.semantic_threshold = semantic_threshold - - def _find_best_match(self, word: str, reference_words: Dict[str, List[str]]) -> Tuple[Optional[str], float, float, float, Set[str]]: - """Find the best matching reference word using combined scoring.""" - best_match = None - best_combined_score = 0.0 - best_phonetic_score = 0.0 - best_semantic_score = 0.0 - matching_sources = set() - - # Get unique reference words - all_ref_words = {w for words in reference_words.values() for w in words} - - for ref_word in all_ref_words: - # Get phonetic similarity - phonetic_score = self.phonetic_matcher._get_phonetic_similarity(word, ref_word) - - # Get semantic similarity - semantic_score = self.semantic_matcher._get_semantic_similarity(word, ref_word) - - # Calculate combined score - combined_score = (phonetic_score * self.phonetic_weight) + (semantic_score * self.semantic_weight) - - # Check if this is a better match - if ( - combined_score > best_combined_score - and phonetic_score >= self.phonetic_threshold - and semantic_score >= self.semantic_threshold - ): - best_combined_score = combined_score - best_phonetic_score = phonetic_score - best_semantic_score = semantic_score - best_match = ref_word - matching_sources = {source for source, words in reference_words.items() if ref_word in words} - - return best_match, best_phonetic_score, best_semantic_score, best_combined_score, matching_sources - - def can_handle(self, gap: GapSequence, current_word_idx: int) -> 
bool: - """Check if we can handle this gap.""" - return bool(gap.reference_words) - - def handle(self, gap: GapSequence, word: Word, current_word_idx: int, segment_idx: int) -> Optional[WordCorrection]: - """Try to correct word using combined matching.""" - if not word.text.strip(): - return None - - best_match, phonetic_score, semantic_score, combined_score, matching_sources = self._find_best_match(word.text, gap.reference_words) - - if best_match and combined_score >= self.combined_threshold and best_match.lower() != word.text.lower(): - return WordCorrection( - original_word=word.text, - corrected_word=best_match, - segment_index=segment_idx, - word_index=current_word_idx, - confidence=combined_score, - source=", ".join(matching_sources), - reason=f"Combined matching (phonetic: {phonetic_score:.2f}, semantic: {semantic_score:.2f})", - alternatives={}, - ) - - return None diff --git a/lyrics_transcriber/correction/handlers/human.py b/lyrics_transcriber/correction/handlers/human.py index ca654dd..11d929b 100644 --- a/lyrics_transcriber/correction/handlers/human.py +++ b/lyrics_transcriber/correction/handlers/human.py @@ -7,9 +7,9 @@ class HumanHandler(GapCorrectionHandler): """Handles gaps by opening a web UI for human to review the corrections made and manually fix any last gaps.""" - def can_handle(self, gap: GapSequence, current_word_idx: int) -> bool: + def can_handle(self, gap: GapSequence) -> bool: return True - def handle(self, gap: GapSequence, word: Word, current_word_idx: int, segment_idx: int) -> Optional[WordCorrection]: + def handle(self, gap: GapSequence) -> Optional[WordCorrection]: # TODO: Open web UI for human to review the corrections made and manually fix any last gaps return None diff --git a/lyrics_transcriber/correction/handlers/levenshtein.py b/lyrics_transcriber/correction/handlers/levenshtein.py index 925e460..1209ab4 100644 --- a/lyrics_transcriber/correction/handlers/levenshtein.py +++ 
b/lyrics_transcriber/correction/handlers/levenshtein.py @@ -1,23 +1,126 @@ +from typing import List, Optional import string -from typing import Dict, List, Optional, Set, Tuple import Levenshtein +import logging -from lyrics_transcriber.types import GapSequence, Word, WordCorrection +from lyrics_transcriber.types import GapSequence, WordCorrection from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler -class LevenshteinSimilarityHandler(GapCorrectionHandler): - """Handles corrections based on Levenshtein (edit distance) similarity between words.""" - - def __init__(self, similarity_threshold: float = 0.65): +class LevenshteinHandler(GapCorrectionHandler): + """Handles corrections based on Levenshtein (edit distance) similarity between words. + + This handler looks for words that are similar in spelling to reference words in the same position. + The similarity calculation includes: + 1. Basic Levenshtein ratio + 2. Bonus for words starting with the same letter + 3. Penalty for words starting with different letters + 4. 
Bonus for similar length words + + Examples: + Gap: "wold" (misspelling) + References: + genius: ["world"] + spotify: ["world"] + Result: + - Correct "wold" to "world" (high confidence due to small edit distance) + + Gap: "worde" (misspelling) + References: + genius: ["world"] + spotify: ["words"] + Result: + - Correct "worde" to "world" (lower confidence due to disagreeing sources) + """ + + def __init__(self, similarity_threshold: float = 0.65, logger: Optional[logging.Logger] = None): self.similarity_threshold = similarity_threshold + self.logger = logger or logging.getLogger(__name__) + + def can_handle(self, gap: GapSequence) -> bool: + """Check if we can handle this gap - we'll try if there are reference words.""" + if not gap.reference_words: + self.logger.debug("No reference words available") + return False + + if not gap.words: + self.logger.debug("No gap words available") + return False + + # Check if any word has sufficient similarity to reference + for i, word in enumerate(gap.words): + for ref_words in gap.reference_words.values(): + if i < len(ref_words): + similarity = self._get_string_similarity(word, ref_words[i]) + if similarity >= self.similarity_threshold: + self.logger.debug(f"Found similar word: '{word}' -> '{ref_words[i]}' ({similarity:.2f})") + return True + + self.logger.debug("No words meet similarity threshold") + return False + + def handle(self, gap: GapSequence) -> List[WordCorrection]: + """Try to correct words based on string similarity.""" + corrections = [] + + # Process each word in the gap + for i, word in enumerate(gap.words): + # Skip if word is empty or just punctuation + if not word.strip(): + continue + + # Skip exact matches + if any(i < len(ref_words) and word.lower() == ref_words[i].lower() for ref_words in gap.reference_words.values()): + self.logger.debug(f"Skipping exact match: '{word}'") + continue + + # Find matching reference words at this position + matches = {} # word -> (sources, similarity) + for source, 
ref_words in gap.reference_words.items(): + if i >= len(ref_words): + continue + + ref_word = ref_words[i] + similarity = self._get_string_similarity(word, ref_word) + + if similarity >= self.similarity_threshold: + self.logger.debug(f"Found match: '{word}' -> '{ref_word}' ({similarity:.2f})") + if ref_word not in matches: + matches[ref_word] = ([], similarity) + matches[ref_word][0].append(source) + + # Create correction for best match if any found + if matches: + best_match, (sources, similarity) = max( + matches.items(), key=lambda x: (len(x[1][0]), x[1][1]) # Sort by number of sources, then similarity + ) + + source_confidence = len(sources) / len(gap.reference_words) + final_confidence = similarity * source_confidence + + self.logger.debug(f"Creating correction: {word} -> {best_match} (confidence: {final_confidence})") + corrections.append( + WordCorrection( + original_word=word, + corrected_word=best_match, + segment_index=0, + word_index=gap.transcription_position + i, + confidence=final_confidence, + source=", ".join(sources), + reason=f"LevenshteinHandler: String similarity ({final_confidence:.2f})", + alternatives={k: len(v[0]) for k, v in matches.items()}, + is_deletion=False, + ) + ) + + return corrections def _clean_word(self, word: str) -> str: """Remove punctuation and standardize for comparison.""" return word.strip().lower().strip(string.punctuation) def _get_string_similarity(self, word1: str, word2: str) -> float: - """Calculate string similarity using Levenshtein ratio.""" + """Calculate string similarity using Levenshtein ratio with adjustments.""" # Clean words w1, w2 = self._clean_word(word1), self._clean_word(word2) if not w1 or not w2: @@ -38,50 +141,3 @@ def _get_string_similarity(self, word1: str, word2: str) -> float: similarity = (similarity + length_ratio) / 2 return similarity - - def _find_best_match(self, word: str, reference_words: Dict[str, List[str]]) -> Tuple[Optional[str], float, Set[str]]: - """Find the best matching reference 
word across all sources.""" - best_match = None - best_similarity = 0.0 - matching_sources = set() - - # Get unique reference words - all_ref_words = {w for words in reference_words.values() for w in words} - - for ref_word in all_ref_words: - similarity = self._get_string_similarity(word, ref_word) - - if similarity > best_similarity: - best_similarity = similarity - best_match = ref_word - matching_sources = {source for source, words in reference_words.items() if ref_word in words} - - return best_match, best_similarity, matching_sources - - def can_handle(self, gap: GapSequence, current_word_idx: int) -> bool: - """Check if we can handle this gap - we'll try if there are reference words.""" - return bool(gap.reference_words) - - def handle(self, gap: GapSequence, word: Word, current_word_idx: int, segment_idx: int) -> Optional[WordCorrection]: - """Try to correct word based on string similarity.""" - # Skip if word is empty or just punctuation - if not word.text.strip(): - return None - - # Find best matching reference word - best_match, similarity, matching_sources = self._find_best_match(word.text, gap.reference_words) - - # Return correction if we found a good match - if best_match and similarity >= self.similarity_threshold and best_match.lower() != word.text.lower(): - return WordCorrection( - original_word=word.text, - corrected_word=best_match, - segment_index=segment_idx, - word_index=current_word_idx, - confidence=similarity, - source=", ".join(matching_sources), - reason=f"String similarity ({similarity:.2f})", - alternatives={}, - ) - - return None diff --git a/lyrics_transcriber/correction/handlers/metaphone.py b/lyrics_transcriber/correction/handlers/metaphone.py deleted file mode 100644 index adfe4a2..0000000 --- a/lyrics_transcriber/correction/handlers/metaphone.py +++ /dev/null @@ -1,77 +0,0 @@ -from typing import Dict, List, Optional, Set, Tuple -from metaphone import doublemetaphone -from nltk.metrics import edit_distance - -from 
lyrics_transcriber.types import GapSequence, Word, WordCorrection -from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler - - -class MetaphoneHandler(GapCorrectionHandler): - """Handles corrections using Double Metaphone phonetic algorithm.""" - - def __init__(self, similarity_threshold: float = 0.7): - self.similarity_threshold = similarity_threshold - - def _get_phonetic_similarity(self, word1: str, word2: str) -> float: - """Calculate phonetic similarity between two words using Double Metaphone.""" - # Get phonetic codes - code1_primary, code1_secondary = doublemetaphone(word1) - code2_primary, code2_secondary = doublemetaphone(word2) - - # Handle empty codes - if not code1_primary or not code2_primary: - return 0.0 - - # Compare primary codes - primary_similarity = 1 - (edit_distance(code1_primary, code2_primary) / max(len(code1_primary), len(code2_primary))) - - # Compare secondary codes if available - if code1_secondary and code2_secondary: - secondary_similarity = 1 - (edit_distance(code1_secondary, code2_secondary) / max(len(code1_secondary), len(code2_secondary))) - return max(primary_similarity, secondary_similarity) - - return primary_similarity - - def _find_best_match(self, word: str, reference_words: Dict[str, List[str]]) -> Tuple[Optional[str], float, Set[str]]: - """Find the best matching reference word across all sources.""" - best_match = None - best_similarity = 0.0 - matching_sources = set() - - # Get unique reference words - all_ref_words = {w for words in reference_words.values() for w in words} - - for ref_word in all_ref_words: - similarity = self._get_phonetic_similarity(word, ref_word) - - if similarity > best_similarity: - best_similarity = similarity - best_match = ref_word - matching_sources = {source for source, words in reference_words.items() if ref_word in words} - - return best_match, best_similarity, matching_sources - - def can_handle(self, gap: GapSequence, current_word_idx: int) -> bool: - """Check if we 
can handle this gap.""" - return bool(gap.reference_words) - - def handle(self, gap: GapSequence, word: Word, current_word_idx: int, segment_idx: int) -> Optional[WordCorrection]: - """Try to correct word based on phonetic similarity.""" - if not word.text.strip(): - return None - - best_match, similarity, matching_sources = self._find_best_match(word.text, gap.reference_words) - - if best_match and similarity >= self.similarity_threshold and best_match.lower() != word.text.lower(): - return WordCorrection( - original_word=word.text, - corrected_word=best_match, - segment_index=segment_idx, - word_index=current_word_idx, - confidence=similarity, - source=", ".join(matching_sources), - reason=f"Metaphone phonetic similarity ({similarity:.2f})", - alternatives={}, - ) - - return None diff --git a/lyrics_transcriber/correction/handlers/multi_levenshtein.py b/lyrics_transcriber/correction/handlers/multi_levenshtein.py deleted file mode 100644 index 7f2d20d..0000000 --- a/lyrics_transcriber/correction/handlers/multi_levenshtein.py +++ /dev/null @@ -1,97 +0,0 @@ -from typing import List, Optional, Tuple - -from lyrics_transcriber.types import GapSequence, Word, WordCorrection -from lyrics_transcriber.correction.handlers.base import GapCorrectionHandler -from lyrics_transcriber.correction.handlers.levenshtein import LevenshteinSimilarityHandler - - -class MultiWordLevenshteinHandler(GapCorrectionHandler): - """Handles corrections by matching sequences of words.""" - - def __init__(self, similarity_threshold: float = 0.65): - self.similarity_threshold = similarity_threshold - self.levenshtein_matcher = LevenshteinSimilarityHandler(similarity_threshold) - - def can_handle(self, gap: GapSequence, current_word_idx: int) -> bool: - """Check if we can handle this gap.""" - if not gap.reference_words: - return False - - # Don't handle cases where sources disagree - ref_words_lists = list(gap.reference_words.values()) - if not all(words == ref_words_lists[0] for words in 
ref_words_lists[1:]): - return False - - # Don't handle cases where reference has different length than gap - if any(len(words) != len(gap.words) for words in gap.reference_words.values()): - return False - - return True - - def _align_sequences(self, gap_words: List[str], ref_words: List[str]) -> List[Tuple[Optional[str], Optional[str], float]]: - """Align two sequences of words and return matches with confidence scores.""" - alignments = [] - - # For each gap word, try to find the best match in the reference words - for i, gap_word in enumerate(gap_words): - best_match = None - best_score = 0.0 - - # First, try exact position match if available - if i < len(ref_words): - ref_word = ref_words[i] - # Use a base position confidence even if words aren't similar - position_score = 0.7 # Base confidence for position match - - # If words are similar, boost the confidence - similarity = self.levenshtein_matcher._get_string_similarity(gap_word, ref_word) - score = max(position_score, similarity) - - if score >= self.similarity_threshold: - best_match = ref_word - best_score = score - - alignments.append((gap_word, best_match, best_score)) - - return alignments - - def handle(self, gap: GapSequence, word: Word, current_word_idx: int, segment_idx: int) -> Optional[WordCorrection]: - """Try to correct word based on sequence alignment.""" - if not word.text.strip(): - return None - - gap_pos = current_word_idx - gap.transcription_position - - best_alignment = None - best_confidence = 0.0 - best_sources = set() - - for source, ref_words in gap.reference_words.items(): - alignments = self._align_sequences(gap.words, ref_words) - - if gap_pos < len(alignments): - gap_word, correction, confidence = alignments[gap_pos] - - if correction and correction.lower() == word.text.lower(): - return None - - if correction and confidence > best_confidence: - best_alignment = correction - best_confidence = confidence - best_sources = {source} - elif correction and confidence == 
class RepeatCorrectionHandler:
    """Re-apply corrections that earlier handlers already made to the same word.

    The caller supplies the corrections accumulated so far through
    ``set_previous_corrections``. Every gap word whose lowercase form equals the
    lowercase ``original_word`` of a sufficiently confident earlier correction is
    corrected the same way, at 90% of the earlier correction's confidence.

    NOTE(review): unlike the sibling handlers, this class does not subclass
    ``GapCorrectionHandler``; it only provides the same ``can_handle`` /
    ``handle`` methods.
    """

    def __init__(self, logger: Optional[logging.Logger] = None, confidence_threshold: float = 0.7):
        """Initialize the handler.

        Args:
            logger: Optional logger; defaults to this module's logger.
            confidence_threshold: Minimum confidence an earlier correction needs
                before it may be repeated.
        """
        self.logger = logger or logging.getLogger(__name__)
        self.confidence_threshold = confidence_threshold
        self.previous_corrections: List[WordCorrection] = []

    def can_handle(self, gap: GapSequence) -> bool:
        """Return True only if at least one gap word has a repeatable correction.

        Mirrors ``handle`` exactly, so a True result guarantees that ``handle``
        yields at least one correction for the same gap.
        """
        if not self.previous_corrections:
            return False
        best_by_word = self._best_previous_corrections()
        return any(word.lower() in best_by_word for word in gap.words)

    def set_previous_corrections(self, corrections: List[WordCorrection]) -> None:
        """Store corrections from previous handlers to use as reference.

        The list is stored by reference (not copied), so later additions made by
        the caller are visible to this handler.
        """
        self.previous_corrections = corrections

    def handle(self, gap: GapSequence) -> List[WordCorrection]:
        """Apply previous corrections to matching words in the current gap.

        Returns:
            One correction per gap word that has a repeatable earlier
            correction; an empty list when nothing matches.
        """
        best_by_word = self._best_previous_corrections()
        corrections = []

        for i, word in enumerate(gap.words):
            best_correction = best_by_word.get(word.lower())
            if best_correction is None:
                continue

            self.logger.debug(
                f"Applying previous correction: {word} -> {best_correction.corrected_word} "
                f"(confidence: {best_correction.confidence:.2f})"
            )

            corrections.append(
                WordCorrection(
                    original_word=word,
                    corrected_word=best_correction.corrected_word,
                    segment_index=0,
                    word_index=gap.transcription_position + i,
                    confidence=best_correction.confidence * 0.9,  # Slightly lower confidence for repeats
                    source=best_correction.source,
                    reason="RepeatCorrectionHandler: Matches previous correction",
                    alternatives={best_correction.corrected_word: 1},
                    is_deletion=False,
                )
            )

        return corrections

    def _best_previous_corrections(self) -> Dict[str, WordCorrection]:
        """Map each lowercase original word to the earlier correction to repeat.

        Only corrections at or above ``confidence_threshold`` are considered.
        Deletions are ignored: a word that was surplus in one gap is not
        necessarily surplus elsewhere, and repeating one would emit a
        non-deletion correction carrying the deletion's replacement text.
        Among the candidates for a word, the most frequent ``corrected_word``
        wins, with confidence as tie-breaker (first seen wins remaining ties).
        """
        candidates_by_word: Dict[str, List[WordCorrection]] = {}
        for corr in self.previous_corrections:
            if getattr(corr, "is_deletion", False):
                continue
            if corr.confidence >= self.confidence_threshold:
                candidates_by_word.setdefault(corr.original_word.lower(), []).append(corr)

        best_by_word: Dict[str, WordCorrection] = {}
        for word_lower, candidates in candidates_by_word.items():
            # Count each replacement once up front instead of rescanning the
            # candidate list inside the max() key for every element.
            counts: Dict[str, int] = {}
            for corr in candidates:
                counts[corr.corrected_word] = counts.get(corr.corrected_word, 0) + 1
            best_by_word[word_lower] = max(candidates, key=lambda c: (counts[c.corrected_word], c.confidence))
        return best_by_word
self.model.to(self.device) - - def _get_embedding(self, text: str) -> torch.Tensor: - """Get embedding for a piece of text.""" - inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True) - inputs = {k: v.to(self.device) for k, v in inputs.items()} - - with torch.no_grad(): - outputs = self.model(**inputs) - embedding = outputs.last_hidden_state.mean(dim=1) - - return embedding - - def _get_semantic_similarity(self, text1: str, text2: str) -> float: - """Calculate semantic similarity between two pieces of text.""" - emb1 = self._get_embedding(text1) - emb2 = self._get_embedding(text2) - - similarity = torch.nn.functional.cosine_similarity(emb1, emb2, dim=1) - return similarity.item() - - def _find_best_match(self, word: str, reference_words: Dict[str, List[str]]) -> Tuple[Optional[str], float, Set[str]]: - """Find the best matching reference word across all sources.""" - best_match = None - best_similarity = 0.0 - matching_sources = set() - - # Get unique reference words - all_ref_words = {w for words in reference_words.values() for w in words} - - for ref_word in all_ref_words: - similarity = self._get_semantic_similarity(word, ref_word) - - if similarity > best_similarity: - best_similarity = similarity - best_match = ref_word - matching_sources = {source for source, words in reference_words.items() if ref_word in words} - - return best_match, best_similarity, matching_sources - - def can_handle(self, gap: GapSequence, current_word_idx: int) -> bool: - """Check if we can handle this gap.""" - return bool(gap.reference_words) - - def handle(self, gap: GapSequence, word: Word, current_word_idx: int, segment_idx: int) -> Optional[WordCorrection]: - """Try to correct word based on semantic similarity.""" - if not word.text.strip(): - return None - - best_match, similarity, matching_sources = self._find_best_match(word.text, gap.reference_words) - - if best_match and similarity >= self.similarity_threshold and best_match.lower() != 
class SoundAlikeHandler(GapCorrectionHandler):
    """Handles gaps where words sound similar to reference words but are spelled differently.

    Uses the Double Metaphone algorithm to detect sound-alike words. Each gap word's
    phonetic codes are compared against the codes of every reference word.

    The confidence of a correction is the product of:
    1. The phonetic match quality (reduced by 20% when the reference word sits at a
       different position in the gap, unless one of the codes is very short)
    2. The ratio of reference sources agreeing on the correction

    Examples:
        Gap: "shush look deep"
        References:
            genius: ["search", "look", "deep"]
            spotify: ["search", "look", "deep"]
        Result:
            - Correct "shush" to "search" (confidence based on metaphone match quality)
            - Leave "look" and "deep" untouched (exact matches)
    """

    def __init__(self, logger: Optional[logging.Logger] = None, similarity_threshold: float = 0.6):
        """Initialize the handler.

        Args:
            logger: Optional logger instance
            similarity_threshold: Minimum confidence threshold for matches (default: 0.6)
        """
        self.logger = logger or logging.getLogger(__name__)
        self.similarity_threshold = similarity_threshold

    def can_handle(self, gap: GapSequence) -> bool:
        """Return True if ``handle`` would produce at least one correction for this gap.

        Applies the same exact-match skip, threshold and position penalty as
        ``handle`` so the two methods cannot disagree.
        """
        # Must have reference words
        if not gap.reference_words:
            self.logger.debug("No reference words available")
            return False

        # Gap must have words
        if not gap.words:
            self.logger.debug("No gap words available")
            return False

        reference_codes = self._encode_references(gap)
        for i, word in enumerate(gap.words):
            if self._is_exact_match(gap, i, word):
                continue
            word_codes = doublemetaphone(word)
            self.logger.debug(f"Gap word '{word}' has metaphone codes: {word_codes}")
            matches = self._find_matches(word_codes, i, reference_codes)
            if matches:
                self.logger.debug(f"Found metaphone match between '{word}' and {list(matches)}")
                return True

        self.logger.debug("No metaphone matches found")
        return False

    def handle(self, gap: GapSequence) -> List[WordCorrection]:
        """Create a correction for every gap word that has a sound-alike reference word.

        Returns:
            One ``WordCorrection`` per corrected word; words that exactly match a
            reference at the same position, or have no sufficiently close
            sound-alike, are skipped.
        """
        corrections = []
        reference_codes = self._encode_references(gap)

        for i, word in enumerate(gap.words):
            word_codes = doublemetaphone(word)
            self.logger.debug(f"Processing '{word}' (codes: {word_codes})")

            # Skip if word exactly matches any reference at this position
            if self._is_exact_match(gap, i, word):
                continue

            matches = self._find_matches(word_codes, i, reference_codes)
            if not matches:
                continue

            # Prefer the candidate backed by the most sources, then the best confidence
            best_match, (sources, base_confidence) = max(matches.items(), key=lambda item: (len(item[1][0]), item[1][1]))

            # Sources are de-duplicated in _find_matches, so this ratio never exceeds 1.0
            source_confidence = len(sources) / len(gap.reference_words)
            final_confidence = base_confidence * source_confidence

            self.logger.debug(f"Found match: {word} -> {best_match} (confidence: {final_confidence:.2f}, sources: {sources})")
            corrections.append(
                WordCorrection(
                    original_word=word,
                    corrected_word=best_match,
                    segment_index=0,
                    word_index=gap.transcription_position + i,
                    confidence=final_confidence,
                    source=", ".join(sources),
                    reason=f"SoundAlikeHandler: Phonetic match ({final_confidence:.2f} confidence)",
                    alternatives={k: len(v[0]) for k, v in matches.items()},
                    is_deletion=False,
                )
            )

        return corrections

    def _encode_references(self, gap: GapSequence) -> Dict[str, List[Tuple[str, Tuple[str, str]]]]:
        """Compute the metaphone codes of every reference word once per gap."""
        return {
            source: [(ref_word, doublemetaphone(ref_word)) for ref_word in ref_words]
            for source, ref_words in (gap.reference_words or {}).items()
        }

    def _is_exact_match(self, gap: GapSequence, position: int, word: str) -> bool:
        """Return True if any source has this exact word (case-insensitive) at this position."""
        lowered = word.lower()
        return any(
            position < len(ref_words) and lowered == ref_words[position].lower()
            for ref_words in gap.reference_words.values()
        )

    def _find_matches(
        self,
        word_codes: Tuple[str, str],
        position: int,
        reference_codes: Dict[str, List[Tuple[str, Tuple[str, str]]]],
    ) -> Dict[str, Tuple[List[str], float]]:
        """Collect sound-alike reference words for one gap word.

        Returns:
            Mapping of reference word -> (sources containing it, best adjusted
            confidence). Each source is listed at most once per reference word,
            even if the word occurs several times in that source.
        """
        word_has_short_code = any(len(c) <= 2 for c in word_codes if c)
        matches: Dict[str, Tuple[List[str], float]] = {}

        for source, coded_refs in reference_codes.items():
            for j, (ref_word, ref_codes) in enumerate(coded_refs):
                confidence = self._get_match_confidence(word_codes, ref_codes)
                if confidence < self.similarity_threshold:
                    continue

                # Special handling for short codes - don't apply position penalty
                is_short_code = word_has_short_code or any(len(c) <= 2 for c in ref_codes if c)
                if not is_short_code and j != position:
                    confidence *= 0.8
                if confidence < self.similarity_threshold:
                    continue

                sources, best_confidence = matches.get(ref_word, ([], 0.0))
                if source not in sources:
                    sources.append(source)
                matches[ref_word] = (sources, max(best_confidence, confidence))

        return matches

    def _codes_match(self, codes1: Tuple[str, str], codes2: Tuple[str, str]) -> float:
        """Return the best match quality (0.0 - 1.0) between two sets of metaphone codes.

        Every non-empty code of the first word is compared with every non-empty
        code of the second; 0.0 means no pair matched.
        """
        codes1_set = {c for c in codes1 if c}
        codes2_set = {c for c in codes2 if c}

        if not codes1_set or not codes2_set:
            return 0.0

        return max(self._score_code_pair(code1, code2) for code1 in codes1_set for code2 in codes2_set)

    @staticmethod
    def _score_code_pair(code1: str, code2: str) -> float:
        """Score one pair of non-empty metaphone codes; 0.0 means no match."""
        # Special case for very short codes (like 'A' for 'you')
        if len(code1) <= 2 or len(code2) <= 2:
            if code1 == code2:
                return 1.0
            if code1 in code2 or code2 in code1:
                return 0.8
            if code1[0] == code2[0]:  # Match first character
                return 0.7
            return 0.0

        # Skip if codes are too different in length
        if abs(len(code1) - len(code2)) > 3:
            return 0.0

        # Exact match
        if code1 == code2:
            return 1.0

        # Both codes have at least 3 characters from here on.
        # Shared characters in any position
        shared_chars = sum(1 for c in code1 if c in code2)
        if shared_chars >= 2:
            return 0.7 + (0.1 * shared_chars / max(len(code1), len(code2)))

        # Aligned comparison over the shorter code's length (zip truncates)
        differences = sum(1 for a, b in zip(code1, code2) if a != b)
        if differences <= 2:
            return 0.85 - (differences * 0.1)

        # Common prefix/suffix
        common_prefix_len = 0
        for a, b in zip(code1, code2):
            if a != b:
                break
            common_prefix_len += 1

        common_suffix_len = 0
        for a, b in zip(code1[::-1], code2[::-1]):
            if a != b:
                break
            common_suffix_len += 1

        if common_prefix_len >= 1 or common_suffix_len >= 1:
            return 0.7 + (0.1 * max(common_prefix_len, common_suffix_len))

        # No further substring check: a shared substring of length >= 2 implies
        # shared_chars >= 2, which has already returned above.
        return 0.0

    def _get_match_confidence(self, codes1: Tuple[str, str], codes2: Tuple[str, str]) -> float:
        """Calculate confidence score for a metaphone code match."""
        match_quality = self._codes_match(codes1, codes2)
        if match_quality == 0:
            return 0.0

        # Get primary codes (first code of each tuple)
        code1, code2 = codes1[0], codes2[0]

        # Boost confidence for codes that share their first two characters
        if code1 and code2 and len(code1) >= 2 and len(code2) >= 2:
            if code1[:2] == code2[:2]:
                match_quality = min(1.0, match_quality + 0.1)

        return match_quality
reference_words={"genius": ["search", "look", "deep"], "spotify": ["search", "look", "deep"]}, + ) + + # First check if handler thinks it can handle this + can_handle = handler.can_handle(gap) + logger.debug(f"Can handle 'shush' -> 'search': {can_handle}") + + corrections = handler.handle(gap) + logger.debug(f"Corrections for sound-alike example: {corrections}") + + # We expect this to fail or have very low confidence + # as Levenshtein distance between "shush" and "search" is quite large + assert len(corrections) <= 1 # Might not find any matches + + +def test_handle_disagreeing_references(logger): + handler = LevenshteinHandler(logger=logger) + gap = GapSequence( + words=("worde",), + transcription_position=0, + preceding_anchor=None, + following_anchor=None, + reference_words={"genius": ["world"], "spotify": ["words"]}, + ) + + corrections = handler.handle(gap) + + assert len(corrections) == 1 + assert corrections[0].confidence < 0.8 # Lower confidence due to disagreeing sources + + +def test_preserves_exact_matches(logger): + handler = LevenshteinHandler(logger=logger) + gap = GapSequence( + words=("wold", "words", "test"), + transcription_position=0, + preceding_anchor=None, + following_anchor=None, + reference_words={"genius": ["world", "words", "test"], "spotify": ["world", "words", "test"]}, + ) + + corrections = handler.handle(gap) + + # Should only correct "wold", leaving exact matches alone + assert len(corrections) == 1 + assert corrections[0].original_word == "wold" + assert corrections[0].corrected_word == "world" + + +def test_similarity_thresholds(logger): + handler = LevenshteinHandler(similarity_threshold=0.8, logger=logger) + gap = GapSequence( + words=("completely",), # More different from reference + transcription_position=0, + preceding_anchor=None, + following_anchor=None, + reference_words={"genius": ["different"], "spotify": ["different"]}, + ) + + # With high threshold, should not find matches + assert handler.can_handle(gap) is False + + 
# Lower threshold should still not match these very different words + handler.similarity_threshold = 0.6 + assert handler.can_handle(gap) is False + + # But should match similar words + gap = GapSequence( + words=("worde",), + transcription_position=0, + preceding_anchor=None, + following_anchor=None, + reference_words={"genius": ["words"], "spotify": ["words"]}, + ) + assert handler.can_handle(gap) is True diff --git a/tests/correction/handlers/test_repeat.py b/tests/correction/handlers/test_repeat.py new file mode 100644 index 0000000..099bde9 --- /dev/null +++ b/tests/correction/handlers/test_repeat.py @@ -0,0 +1,181 @@ +import pytest +import logging +from lyrics_transcriber.correction.handlers.repeat import RepeatCorrectionHandler +from lyrics_transcriber.types import GapSequence, WordCorrection + + +@pytest.fixture +def logger(): + logger = logging.getLogger("test_repeat") + logger.setLevel(logging.DEBUG) + return logger + + +def test_cannot_handle_without_previous_corrections(logger): + handler = RepeatCorrectionHandler(logger) + gap = GapSequence( + words=("test", "words"), + transcription_position=0, + preceding_anchor=None, + following_anchor=None, + reference_words={"genius": ["some", "words"], "spotify": ["some", "words"]}, + ) + + assert not handler.can_handle(gap) + + +def test_handle_repeat_correction(logger): + handler = RepeatCorrectionHandler(logger) + + # Set up previous corrections + previous_corrections = [ + WordCorrection( + original_word="war", + corrected_word="waterloo", + segment_index=0, + word_index=0, + confidence=0.9, + source="genius", + reason="Previous handler correction", + alternatives={}, + is_deletion=False, + ) + ] + handler.set_previous_corrections(previous_corrections) + + # Create gap with same word + gap = GapSequence( + words=("war", "again"), + transcription_position=5, + preceding_anchor=None, + following_anchor=None, + reference_words={"genius": ["some", "words"], "spotify": ["some", "words"]}, + ) + + corrections = 
handler.handle(gap) + + assert len(corrections) == 1 + assert corrections[0].original_word == "war" + assert corrections[0].corrected_word == "waterloo" + assert corrections[0].word_index == 5 # Should use gap's transcription_position + assert corrections[0].confidence == 0.81 # 0.9 * 0.9 + assert "previous correction" in corrections[0].reason.lower() + + +def test_handle_multiple_previous_corrections(logger): + handler = RepeatCorrectionHandler(logger) + + # Set up previous corrections with same word corrected differently + previous_corrections = [ + WordCorrection( + original_word="word", + corrected_word="correction1", + segment_index=0, + word_index=0, + confidence=0.8, + source="genius", + reason="First correction", + alternatives={}, + is_deletion=False, + ), + WordCorrection( + original_word="word", + corrected_word="correction2", + segment_index=0, + word_index=1, + confidence=0.9, + source="spotify", + reason="Second correction", + alternatives={}, + is_deletion=False, + ), + WordCorrection( + original_word="word", + corrected_word="correction2", + segment_index=0, + word_index=2, + confidence=0.85, + source="genius", + reason="Third correction", + alternatives={}, + is_deletion=False, + ), + ] + handler.set_previous_corrections(previous_corrections) + + gap = GapSequence( + words=("word",), + transcription_position=10, + preceding_anchor=None, + following_anchor=None, + reference_words={"genius": ["some"], "spotify": ["some"]}, + ) + + corrections = handler.handle(gap) + + assert len(corrections) == 1 + assert corrections[0].original_word == "word" + assert corrections[0].corrected_word == "correction2" # Should pick most common correction + assert corrections[0].word_index == 10 + + +def test_ignore_low_confidence_corrections(logger): + handler = RepeatCorrectionHandler(logger, confidence_threshold=0.8) + + # Set up previous corrections with low confidence + previous_corrections = [ + WordCorrection( + original_word="test", + 
corrected_word="low_confidence", + segment_index=0, + word_index=0, + confidence=0.6, # Below threshold + source="genius", + reason="Low confidence correction", + alternatives={}, + is_deletion=False, + ) + ] + handler.set_previous_corrections(previous_corrections) + + gap = GapSequence( + words=("test",), + transcription_position=0, + preceding_anchor=None, + following_anchor=None, + reference_words={"genius": ["some"], "spotify": ["some"]}, + ) + + corrections = handler.handle(gap) + assert len(corrections) == 0 # Should not apply low confidence corrections + + +def test_case_insensitive_matching(logger): + handler = RepeatCorrectionHandler(logger) + + previous_corrections = [ + WordCorrection( + original_word="Word", + corrected_word="Correction", + segment_index=0, + word_index=0, + confidence=0.9, + source="genius", + reason="Previous correction", + alternatives={}, + is_deletion=False, + ) + ] + handler.set_previous_corrections(previous_corrections) + + gap = GapSequence( + words=("word", "WORD", "Word"), # Different cases + transcription_position=0, + preceding_anchor=None, + following_anchor=None, + reference_words={"genius": ["some", "words"], "spotify": ["some", "words"]}, + ) + + corrections = handler.handle(gap) + assert len(corrections) == 3 # Should correct all variations + assert all(c.corrected_word == "Correction" for c in corrections) diff --git a/tests/correction/handlers/test_sound_alike.py b/tests/correction/handlers/test_sound_alike.py new file mode 100644 index 0000000..0616003 --- /dev/null +++ b/tests/correction/handlers/test_sound_alike.py @@ -0,0 +1,137 @@ +import pytest +import logging +from lyrics_transcriber.correction.handlers.sound_alike import SoundAlikeHandler +from lyrics_transcriber.types import GapSequence + + +@pytest.fixture +def logger(): + logger = logging.getLogger("test_sound_alike") + logger.setLevel(logging.DEBUG) + return logger + + +def test_handle_phonetic_example(logger): + handler = SoundAlikeHandler(logger, 
similarity_threshold=0.7) + gap = GapSequence( + words=("fone", "lite", "nite"), # Common phonetic misspellings + transcription_position=0, + preceding_anchor=None, + following_anchor=None, + reference_words={"genius": ["phone", "light", "night"], "spotify": ["phone", "light", "night"]}, + ) + + corrections = handler.handle(gap) + + assert len(corrections) == 3 # All words need correction + + assert corrections[0].original_word == "fone" + assert corrections[0].corrected_word == "phone" + assert corrections[0].word_index == 0 + assert corrections[0].confidence >= 0.7 + + assert corrections[1].original_word == "lite" + assert corrections[1].corrected_word == "light" + assert corrections[1].word_index == 1 + + assert corrections[2].original_word == "nite" + assert corrections[2].corrected_word == "night" + assert corrections[2].word_index == 2 + + +def test_handle_disagreeing_references(logger): + handler = SoundAlikeHandler(logger, similarity_threshold=0.7) + gap = GapSequence( + words=("fone",), + transcription_position=0, + preceding_anchor=None, + following_anchor=None, + reference_words={"genius": ["phone"], "spotify": ["foam"]}, + ) + + corrections = handler.handle(gap) + + assert len(corrections) == 1 + assert corrections[0].confidence < 0.7 # Lower confidence due to disagreeing sources + + +def test_cannot_handle_no_sound_alike_matches(logger): + handler = SoundAlikeHandler(logger, similarity_threshold=0.8) + gap = GapSequence( + words=("xyz", "abc", "def"), # Use words with completely different phonetic codes + transcription_position=0, + preceding_anchor=None, + following_anchor=None, + reference_words={"genius": ["one", "two", "three"], "spotify": ["one", "two", "three"]}, + ) + + corrections = handler.handle(gap) + assert len(corrections) == 0 # Should find no matches above threshold + + +def test_handle_preserves_exact_matches(logger): + handler = SoundAlikeHandler(logger, similarity_threshold=0.7) + gap = GapSequence( + words=("fone", "light", "night"), 
# middle and last words exact matches + transcription_position=0, + preceding_anchor=None, + following_anchor=None, + reference_words={"genius": ["phone", "light", "night"], "spotify": ["phone", "light", "night"]}, + ) + + corrections = handler.handle(gap) + + # Should only correct "fone", leaving exact matches alone + assert len(corrections) == 1 + assert corrections[0].original_word == "fone" + assert corrections[0].corrected_word == "phone" + + +def test_handle_complex_sound_alike_example(logger): + handler = SoundAlikeHandler(logger, similarity_threshold=0.65) + gap = GapSequence( + words=("relax", "your", "conscience"), + transcription_position=0, + preceding_anchor=None, + following_anchor=None, + reference_words={"genius": ["you", "relapse", "unconscious"], "spotify": ["you", "relapse", "unconscious"]}, + ) + + corrections = handler.handle(gap) + + # We expect corrections two words + assert len(corrections) == 2 + + # Sort corrections by word_index for easier testing + corrections.sort(key=lambda x: x.word_index) + + # Check first word: "relax" -> "relapse" + assert corrections[0].original_word == "relax" + assert corrections[0].corrected_word == "relapse" + assert corrections[0].word_index == 0 + assert corrections[0].confidence >= 0.65 + + # Check third word: "conscience" -> "unconscious" + assert corrections[1].original_word == "conscience" + assert corrections[1].corrected_word == "unconscious" + assert corrections[1].word_index == 2 + assert corrections[1].confidence >= 0.65 + + +def test_handle_substring_code_match(logger): + """Test the substring code matching.""" + handler = SoundAlikeHandler(logger, similarity_threshold=0.65) + gap = GapSequence( + words=("conscience",), + transcription_position=0, + preceding_anchor=None, + following_anchor=None, + reference_words={"genius": ["unconscious"], "spotify": ["unconscious"]}, + ) + + corrections = handler.handle(gap) + + assert len(corrections) == 1 + assert corrections[0].original_word == "conscience" + 
assert corrections[0].corrected_word == "unconscious" + assert corrections[0].confidence >= 0.65