diff --git a/lyrics-analyzer/src/components/LyricsAnalyzer.tsx b/lyrics-analyzer/src/components/LyricsAnalyzer.tsx index 3362138..b7c805c 100644 --- a/lyrics-analyzer/src/components/LyricsAnalyzer.tsx +++ b/lyrics-analyzer/src/components/LyricsAnalyzer.tsx @@ -106,7 +106,7 @@ export default function LyricsAnalyzer({ data: initialData, onFileLoad, apiClien type: 'gap' as const, data: { ...info.gap, - position: info.gap.transcription_position, + position: info.gap.transcription_position + (info.wordIndex - info.gap.transcription_position), word: info.gap.words[info.wordIndex - info.gap.transcription_position] } } @@ -122,33 +122,31 @@ export default function LyricsAnalyzer({ data: initialData, onFileLoad, apiClien console.log('Position:', position) console.log('Updated words:', updatedWords) - // Update manual corrections - setManualCorrections(prev => { - const newCorrections = new Map(prev) - newCorrections.set(position, updatedWords) - return newCorrections - }) - // Create a deep clone of the data const newData = JSON.parse(JSON.stringify(data)) - // Find and update the gap sequence + // Find the gap that contains this position const gapIndex = newData.gap_sequences.findIndex( - (gap: GapSequence) => gap.transcription_position === position + (gap: GapSequence) => + position >= gap.transcription_position && + position < gap.transcription_position + gap.words.length ) if (gapIndex !== -1) { const originalGap = newData.gap_sequences[gapIndex] - console.log('Found gap at index:', gapIndex) - console.log('Original gap:', { - text: originalGap.text, - words: originalGap.words, - transcription_position: originalGap.transcription_position + const wordIndexInGap = position - originalGap.transcription_position + console.log('Found gap at index:', gapIndex, 'word index in gap:', wordIndexInGap) + + // Update manual corrections + setManualCorrections(prev => { + const newCorrections = new Map(prev) + newCorrections.set(position, updatedWords) + return newCorrections }) // Create a new correction const newCorrection: WordCorrection = { - original_word: originalGap.text, + original_word: originalGap.words[wordIndexInGap], corrected_word: updatedWords.join(' '), segment_index: 0, original_position: position, @@ -161,63 +159,60 @@ export default function LyricsAnalyzer({ data: initialData, onFileLoad, apiClien reference_positions: {} } - // Find the corresponding segment first to get timing information - const segmentIndex = newData.corrected_segments.findIndex((segment: LyricsSegment) => { - // Calculate total words before this segment - let totalWords = 0 - for (let i = 0; i < newData.corrected_segments.indexOf(segment); i++) { - totalWords += newData.corrected_segments[i].words.length + // Find the corresponding segment by counting words + let currentPosition = 0 + let segmentIndex = -1 + let wordIndex = -1 + + for (let i = 0; i < newData.corrected_segments.length; i++) { + const segment = newData.corrected_segments[i] + if (position >= currentPosition && position < currentPosition + segment.words.length) { + segmentIndex = i + wordIndex = position - currentPosition + break } + currentPosition += segment.words.length + } - // Check if this segment contains our target position - const segmentLength = segment.words.length - return totalWords <= originalGap.transcription_position && - (totalWords + segmentLength) > originalGap.transcription_position + console.log('Segment search:', { + position, + segmentIndex, + wordIndex, + totalSegments: newData.corrected_segments.length }) - if (segmentIndex !== -1) { + if (segmentIndex !== -1 && wordIndex !== -1) { const segment = newData.corrected_segments[segmentIndex] + const timingWord = segment.words[wordIndex] + console.log('Found matching segment:', { text: segment.text, - totalWords: newData.corrected_segments - .slice(0, segmentIndex) - .reduce((sum: number, seg: LyricsSegment) => sum + seg.words.length, 0) + wordCount: segment.words.length, + wordIndex, + word: timingWord?.text }) - // Calculate the word index within the segment - let wordsBefore = 0 - for (let i = 0; i < segmentIndex; i++) { - wordsBefore += newData.corrected_segments[i].words.length + if (!timingWord) { + console.error('Could not find timing word in segment') + console.groupEnd() + return } - const wordIndex = originalGap.transcription_position - wordsBefore - const timingWord = segment.words[wordIndex] - console.log('Found word in segment:', { - index: wordIndex, - word: timingWord.text, - timing: { - start: timingWord.start_time, - end: timingWord.end_time - } - }) - - // Update gap sequence with timing from segment + // Update gap sequence + const newWords = [...originalGap.words] + newWords[wordIndexInGap] = updatedWords[0] newData.gap_sequences[gapIndex] = { ...originalGap, - words: updatedWords.map(word => ({ - text: word, - start_time: timingWord.start_time, - end_time: timingWord.end_time - })), - text: updatedWords.join(' '), + words: newWords, + text: newWords.join(' '), corrections: originalGap.corrections .filter((c: WordCorrection) => c.source !== 'manual') .concat([newCorrection]) } - // Now update the segment - const newWords = [...segment.words] - newWords[wordIndex] = { + // Update segment + const newSegmentWords = [...segment.words] + newSegmentWords[wordIndex] = { ...timingWord, text: updatedWords[0], confidence: 1.0 @@ -225,13 +220,13 @@ export default function LyricsAnalyzer({ data: initialData, onFileLoad, apiClien newData.corrected_segments[segmentIndex] = { ...segment, - words: newWords, - text: newWords.map(word => word.text).join(' ') + words: newSegmentWords, + text: newSegmentWords.map(word => word.text).join(' ') } console.log('Updated both gap and segment') } else { - console.log('No matching segment found') + console.error('Could not find matching segment for position:', position) } } diff --git a/lyrics_transcriber/core/controller.py b/lyrics_transcriber/core/controller.py index 2f21ed1..9e5239d 100644 --- a/lyrics_transcriber/core/controller.py +++ b/lyrics_transcriber/core/controller.py @@ -291,33 +291,40 @@ def correct_lyrics(self) -> None: # Add human review step if self.output_config.enable_review: from ..review import start_review_server - import re import json + from copy import deepcopy self.logger.info("Starting human review process") - # Get auto-corrected data as JSON string - auto_corrected_json = json.dumps(self.results.transcription_corrected.to_dict(), indent=4).splitlines() + def normalize_data(data_dict): + """Normalize numeric values in the data structure before JSON conversion.""" + if isinstance(data_dict, dict): + return {k: normalize_data(v) for k, v in data_dict.items()} + elif isinstance(data_dict, list): + return [normalize_data(item) for item in data_dict] + elif isinstance(data_dict, float): + # Convert whole number floats to integers + if data_dict.is_integer(): + return int(data_dict) + return data_dict + return data_dict + + # Normalize and convert auto-corrected data + auto_data = normalize_data(deepcopy(self.results.transcription_corrected.to_dict())) + auto_corrected_json = json.dumps(auto_data, indent=4).splitlines() # Pass through review server reviewed_data = start_review_server(self.results.transcription_corrected) - # Get reviewed data as JSON string - human_corrected_json = json.dumps(reviewed_data.to_dict(), indent=4).splitlines() + # Normalize and convert reviewed data + human_data = normalize_data(deepcopy(reviewed_data.to_dict())) + human_corrected_json = json.dumps(human_data, indent=4).splitlines() self.logger.info("Human review completed") - # Compare the strings, normalizing numbers first - def normalize_numbers(line: str) -> str: - # Convert "x.0" to "x" in the line - return re.sub(r"(\d+)\.0([,\s}])", r"\1\2", line) - - # Normalize numbers in both strings before comparing - auto_corrected_lines = [normalize_numbers(line) for line in auto_corrected_json] - human_corrected_lines = [normalize_numbers(line) for line in human_corrected_json] - + # Compare the normalized JSON strings diff = list( - difflib.unified_diff(auto_corrected_lines, human_corrected_lines, fromfile="auto-corrected", tofile="human-corrected") + difflib.unified_diff(auto_corrected_json, human_corrected_json, fromfile="auto-corrected", tofile="human-corrected") ) if diff: @@ -325,6 +332,8 @@ def normalize_numbers(line: str) -> str: for line in diff: self.logger.warning(line.rstrip()) + # exit(1) + def generate_outputs(self) -> None: """Generate output files based on enabled features and available data.""" self.logger.info("Generating output files")