From 4c7d1dc52dab6ed216615170656b990a96d96671 Mon Sep 17 00:00:00 2001 From: agbeckner <62198072+agbeckner@users.noreply.github.com> Date: Thu, 27 Feb 2025 16:52:56 -0500 Subject: [PATCH 1/3] Add files via upload --- Insert/insert_whisper_transcript.rb | 304 ++++++++++++++++++++++++++++ 1 file changed, 304 insertions(+) create mode 100644 Insert/insert_whisper_transcript.rb diff --git a/Insert/insert_whisper_transcript.rb b/Insert/insert_whisper_transcript.rb new file mode 100644 index 0000000..03be496 --- /dev/null +++ b/Insert/insert_whisper_transcript.rb @@ -0,0 +1,304 @@ +# Purpose: This script processes and imports transcript files into Datavyu. +# +# Supported Formats: +# - WebVTT (.vtt) +# - SubRip (.srt) [planned] +# - Plain Text (.txt) [planned] +# +# Features: +# - File format validation +# - Timestamp conversion +# - Quality assurance workflow +# - Support for multiple subtitle formats [planned] +# - Optimized performance for large files +# - Dynamic chunk sizing based on file size +# +# Usage: +# 1. Run script +# 2. Select subtitle/transcript file when prompted +# 3. Script will create necessary columns in Datavyu: +# - transcript_original: Contains original transcription content +# - transcript_QA: For marking various types of errors +# - transcript_clean: For adding speaker labels to transcription +# - transcript_initials: For coder identification +# - transcript_notes: For additional observations +# Authors: Aaron G. Beckner & Trinity Wang +# 02-27-2025: Added flexible timestamp detection +# 01-14-2025: Added comments and made more generic + +# Revised by Van T. Pham for SPACE 2024 play coding +# Last edited: 01-21-2025 +# Optimized version: 02-27-2025 + +require 'Datavyu_API.rb' + +# Configuration constants +SUPPORTED_FORMATS = { + 'vtt' => 'WebVTT Subtitles' + # Add additional formats here as needed: + # 'srt' => 'SubRip Subtitles', + # 'txt' => 'Plain Text Transcripts' +} + +# Column configurations +COLUMN_CONFIGS = { + transcript_original: { + name: 'transcript_original', + codes: ['content'], + required: true + }, + qa: { + name: 'transcript_QA', + codes: ['OnsetError', 'ContentError', 'OmittedUtterance', 'HallucinatedUtterance', 'SpeakerChange'], # quality assurance error codes + required: true + }, + transcript_clean: { + name: 'transcript_clean', + codes: ['speaker', 'content'], # Codes for speaker labeling and transcription content + required: true + }, + initials: { + name: 'transcript_initials', + codes: ['coder_initials'], # Optional Coder initials column + required: false + }, + notes: { + name: 'transcript_notes', + codes: ['notes'], + required: false + } +} + +# Base ratio for determining chunk size (adjust as needed) +# For every 10 cells, use chunk size of 1 +BASE_RATIO = 10.0 +# Minimum and maximum chunk sizes to ensure reasonable processing +MIN_CHUNK_SIZE = 10 +MAX_CHUNK_SIZE = 100 + +begin + # Import Java classes for GUI file selection + java_import javax.swing.JFileChooser + java_import javax.swing.filechooser.FileNameExtensionFilter + java_import javax.swing.JFrame + java_import javax.swing.JOptionPane + + # Sets up the file chooser dialog + def setup_file_chooser + frame = JFrame.new("Import Transcript") + frame.setDefaultCloseOperation(JFrame::DISPOSE_ON_CLOSE) + frame.setSize(200, 200) + frame.setLocationRelativeTo(nil) + + jfc = JFileChooser.new + jfc.setAcceptAllFileFilterUsed(false) + jfc.setMultiSelectionEnabled(false) + jfc.setDialogTitle('Select transcript file to import') + + SUPPORTED_FORMATS.each do |format, description| + extensions = [format].to_java(:String) + filter = FileNameExtensionFilter.new(description, extensions) + jfc.addChoosableFileFilter(filter) + end + + [frame, jfc] + end + + # Validates that the selected file has a supported format + def validate_file_format(file_path) + extension = File.extname(file_path)[1..-1] + raise "Unsupported file format: .#{extension}" unless SUPPORTED_FORMATS.key?(extension) + true + end + + # Optimize timestamp parsing for better performance + def parse_timestamp(time_str) + # Handle HH:MM:SS.mmm format + if time_str.match(/^(\d{2}):(\d{2}):(\d{2})\.(\d{3})$/) + hours = $1.to_i + minutes = $2.to_i + seconds = $3.to_i + milliseconds = $4.to_i + + return (hours * 3600000) + (minutes * 60000) + (seconds * 1000) + milliseconds + # Handle MM:SS.mmm format (no hours) + elsif time_str.match(/^(\d{2}):(\d{2})\.(\d{3})$/) + minutes = $1.to_i + seconds = $2.to_i + milliseconds = $3.to_i + + return (minutes * 60000) + (seconds * 1000) + milliseconds + else + raise "Invalid timestamp format: #{time_str}. Expected format: HH:MM:SS.mmm or MM:SS.mmm" + end + end + + # Pre-process the content to extract words and timestamps more efficiently + def process_content(content) + # Remove WEBVTT header if present + content.shift if content.first && content.first.strip == 'WEBVTT' + + timestamps = [] + words = [] + current_timestamp = nil + + content.each do |line| + line = line.strip + next if line.empty? + + if line.include?("-->") + onset_str, offset_str = line.split('-->').map(&:strip) + current_timestamp = { + onset: parse_timestamp(onset_str), + offset: parse_timestamp(offset_str) + } + elsif line.match(/^\d+$/) + # Skip cue numbers (commonly found in VTT files) + next + elsif current_timestamp && line.match(/^[a-zA-Z]/) + # Only add lines that start with letters and have a valid timestamp + timestamps << current_timestamp + words << line + end + end + + [words, timestamps] + end + + # Calculate optimal chunk size based on number of cells + def calculate_chunk_size(total_cells) + # Use the ratio of 1 chunk per 10 cells as basis + chunk_size = (total_cells / BASE_RATIO).ceil + + # Enforce minimum and maximum chunk sizes + chunk_size = [chunk_size, MIN_CHUNK_SIZE].max + chunk_size = [chunk_size, MAX_CHUNK_SIZE].min + + puts "Calculated optimal chunk size: #{chunk_size} for #{total_cells} cells" + chunk_size + end + + # Create columns in batches for better performance + def create_columns(words, timestamps) + columns = {} + + # First create all columns + COLUMN_CONFIGS.each do |type, config| + if config[:required] + columns[type] = new_column(config[:name], *config[:codes]) + end + end + + # Calculate total number of entries and dynamic chunk size + total_entries = words.size + chunk_size = calculate_chunk_size(total_entries) + total_chunks = (total_entries.to_f / chunk_size).ceil + + puts "Processing #{total_entries} entries in #{total_chunks} chunks with chunk size of #{chunk_size}..." + + # Process in chunks + (0...total_chunks).each do |chunk_idx| + start_idx = chunk_idx * chunk_size + end_idx = [start_idx + chunk_size, total_entries].min + + chunk_range = (start_idx...end_idx) + chunk_words = words[chunk_range] + chunk_timestamps = timestamps[chunk_range] + + puts "Processing chunk #{chunk_idx + 1}/#{total_chunks} (entries #{start_idx + 1}-#{end_idx})..." + + # Process each column type for this chunk + COLUMN_CONFIGS.each do |type, config| + next unless config[:required] + column = columns[type] + + chunk_words.each_with_index do |word, i| + cell = column.make_new_cell + timestamp = chunk_timestamps[i] + + # Set onset and offset + cell.change_code('onset', timestamp[:onset]) + cell.change_code('offset', timestamp[:offset]) + + # Set content based on column type + case type + when :transcript_original + cell.change_code('content', word) + when :transcript_clean + cell.change_code('content', word) + cell.change_code('speaker', '') # Initialize speaker field empty + when :qa + # Initialize QA codes as empty + config[:codes].each do |code| + cell.change_code(code, '') + end + when :initials + cell.change_code('coder_initials', '') + when :notes + cell.change_code('notes', '') + end + end + end + end + + puts "Setting columns in Datavyu..." + columns.each do |_, column| + set_column(column) + end + end + + # Show progress dialog + def show_progress_dialog(message) + JOptionPane.showMessageDialog(nil, message, "Progress", JOptionPane::INFORMATION_MESSAGE) + end + + # Main execution flow + puts "Starting transcript import..." + + # Setup and show file chooser + frame, jfc = setup_file_chooser + frame.setVisible(true) + + result = jfc.showOpenDialog(frame) + frame.dispose + + if result != JFileChooser::APPROVE_OPTION + puts "No file selected. Aborting." + return + end + + file_path = jfc.getSelectedFile.getPath + validate_file_format(file_path) + + puts "Reading file: #{file_path}" + show_progress_dialog("Reading file. Please wait...") + + # Read file content + content = File.readlines(file_path) + + puts "Processing content..." + show_progress_dialog("Processing transcript. This may take a moment for large files...") + + # Process file content + words, timestamps = process_content(content) + + if words.empty? || timestamps.empty? + puts "No valid transcript entries found. Check file format." + show_progress_dialog("No valid transcript entries found. Check file format.") + return + end + + puts "Creating Datavyu columns with #{words.size} entries..." + show_progress_dialog("Creating Datavyu columns with #{words.size} entries. Please wait...") + + # Create columns with dynamically sized batched processing + create_columns(words, timestamps) + + puts "Import completed successfully!" + show_progress_dialog("Import completed successfully!") + +rescue => e + puts "Error: #{e.message}" + puts e.backtrace if ENV['DEBUG'] + JOptionPane.showMessageDialog(nil, "Error: #{e.message}", "Import Error", JOptionPane::ERROR_MESSAGE) +end From 0f9a5672b9e75d19e874ee4c619e7ef2b0da57a3 Mon Sep 17 00:00:00 2001 From: agbeckner <62198072+agbeckner@users.noreply.github.com> Date: Thu, 27 Feb 2025 17:00:38 -0500 Subject: [PATCH 2/3] Delete Insert/insert_whisper_transcript.rb --- Insert/insert_whisper_transcript.rb | 304 ---------------------------- 1 file changed, 304 deletions(-) delete mode 100644 Insert/insert_whisper_transcript.rb diff --git a/Insert/insert_whisper_transcript.rb b/Insert/insert_whisper_transcript.rb deleted file mode 100644 index 03be496..0000000 --- a/Insert/insert_whisper_transcript.rb +++ /dev/null @@ -1,304 +0,0 @@ -# Purpose: This script processes and imports transcript files into Datavyu. -# -# Supported Formats: -# - WebVTT (.vtt) -# - SubRip (.srt) [planned] -# - Plain Text (.txt) [planned] -# -# Features: -# - File format validation -# - Timestamp conversion -# - Quality assurance workflow -# - Support for multiple subtitle formats [planned] -# - Optimized performance for large files -# - Dynamic chunk sizing based on file size -# -# Usage: -# 1. Run script -# 2. Select subtitle/transcript file when prompted -# 3. Script will create necessary columns in Datavyu: -# - transcript_original: Contains original transcription content -# - transcript_QA: For marking various types of errors -# - transcript_clean: For adding speaker labels to transcription -# - transcript_initials: For coder identification -# - transcript_notes: For additional observations -# Authors: Aaron G. Beckner & Trinity Wang -# 02-27-2025: Added flexible timestamp detection -# 01-14-2025: Added comments and made more generic - -# Revised by Van T. Pham for SPACE 2024 play coding -# Last edited: 01-21-2025 -# Optimized version: 02-27-2025 - -require 'Datavyu_API.rb' - -# Configuration constants -SUPPORTED_FORMATS = { - 'vtt' => 'WebVTT Subtitles' - # Add additional formats here as needed: - # 'srt' => 'SubRip Subtitles', - # 'txt' => 'Plain Text Transcripts' -} - -# Column configurations -COLUMN_CONFIGS = { - transcript_original: { - name: 'transcript_original', - codes: ['content'], - required: true - }, - qa: { - name: 'transcript_QA', - codes: ['OnsetError', 'ContentError', 'OmittedUtterance', 'HallucinatedUtterance', 'SpeakerChange'], # quality assurance error codes - required: true - }, - transcript_clean: { - name: 'transcript_clean', - codes: ['speaker', 'content'], # Codes for speaker labeling and transcription content - required: true - }, - initials: { - name: 'transcript_initials', - codes: ['coder_initials'], # Optional Coder initials column - required: false - }, - notes: { - name: 'transcript_notes', - codes: ['notes'], - required: false - } -} - -# Base ratio for determining chunk size (adjust as needed) -# For every 10 cells, use chunk size of 1 -BASE_RATIO = 10.0 -# Minimum and maximum chunk sizes to ensure reasonable processing -MIN_CHUNK_SIZE = 10 -MAX_CHUNK_SIZE = 100 - -begin - # Import Java classes for GUI file selection - java_import javax.swing.JFileChooser - java_import javax.swing.filechooser.FileNameExtensionFilter - java_import javax.swing.JFrame - java_import javax.swing.JOptionPane - - # Sets up the file chooser dialog - def setup_file_chooser - frame = JFrame.new("Import Transcript") - frame.setDefaultCloseOperation(JFrame::DISPOSE_ON_CLOSE) - frame.setSize(200, 200) - frame.setLocationRelativeTo(nil) - - jfc = JFileChooser.new - jfc.setAcceptAllFileFilterUsed(false) - jfc.setMultiSelectionEnabled(false) - jfc.setDialogTitle('Select transcript file to import') - - SUPPORTED_FORMATS.each do |format, description| - extensions = [format].to_java(:String) - filter = FileNameExtensionFilter.new(description, extensions) - jfc.addChoosableFileFilter(filter) - end - - [frame, jfc] - end - - # Validates that the selected file has a supported format - def validate_file_format(file_path) - extension = File.extname(file_path)[1..-1] - raise "Unsupported file format: .#{extension}" unless SUPPORTED_FORMATS.key?(extension) - true - end - - # Optimize timestamp parsing for better performance - def parse_timestamp(time_str) - # Handle HH:MM:SS.mmm format - if time_str.match(/^(\d{2}):(\d{2}):(\d{2})\.(\d{3})$/) - hours = $1.to_i - minutes = $2.to_i - seconds = $3.to_i - milliseconds = $4.to_i - - return (hours * 3600000) + (minutes * 60000) + (seconds * 1000) + milliseconds - # Handle MM:SS.mmm format (no hours) - elsif time_str.match(/^(\d{2}):(\d{2})\.(\d{3})$/) - minutes = $1.to_i - seconds = $2.to_i - milliseconds = $3.to_i - - return (minutes * 60000) + (seconds * 1000) + milliseconds - else - raise "Invalid timestamp format: #{time_str}. Expected format: HH:MM:SS.mmm or MM:SS.mmm" - end - end - - # Pre-process the content to extract words and timestamps more efficiently - def process_content(content) - # Remove WEBVTT header if present - content.shift if content.first && content.first.strip == 'WEBVTT' - - timestamps = [] - words = [] - current_timestamp = nil - - content.each do |line| - line = line.strip - next if line.empty? - - if line.include?("-->") - onset_str, offset_str = line.split('-->').map(&:strip) - current_timestamp = { - onset: parse_timestamp(onset_str), - offset: parse_timestamp(offset_str) - } - elsif line.match(/^\d+$/) - # Skip cue numbers (commonly found in VTT files) - next - elsif current_timestamp && line.match(/^[a-zA-Z]/) - # Only add lines that start with letters and have a valid timestamp - timestamps << current_timestamp - words << line - end - end - - [words, timestamps] - end - - # Calculate optimal chunk size based on number of cells - def calculate_chunk_size(total_cells) - # Use the ratio of 1 chunk per 10 cells as basis - chunk_size = (total_cells / BASE_RATIO).ceil - - # Enforce minimum and maximum chunk sizes - chunk_size = [chunk_size, MIN_CHUNK_SIZE].max - chunk_size = [chunk_size, MAX_CHUNK_SIZE].min - - puts "Calculated optimal chunk size: #{chunk_size} for #{total_cells} cells" - chunk_size - end - - # Create columns in batches for better performance - def create_columns(words, timestamps) - columns = {} - - # First create all columns - COLUMN_CONFIGS.each do |type, config| - if config[:required] - columns[type] = new_column(config[:name], *config[:codes]) - end - end - - # Calculate total number of entries and dynamic chunk size - total_entries = words.size - chunk_size = calculate_chunk_size(total_entries) - total_chunks = (total_entries.to_f / chunk_size).ceil - - puts "Processing #{total_entries} entries in #{total_chunks} chunks with chunk size of #{chunk_size}..." - - # Process in chunks - (0...total_chunks).each do |chunk_idx| - start_idx = chunk_idx * chunk_size - end_idx = [start_idx + chunk_size, total_entries].min - - chunk_range = (start_idx...end_idx) - chunk_words = words[chunk_range] - chunk_timestamps = timestamps[chunk_range] - - puts "Processing chunk #{chunk_idx + 1}/#{total_chunks} (entries #{start_idx + 1}-#{end_idx})..." - - # Process each column type for this chunk - COLUMN_CONFIGS.each do |type, config| - next unless config[:required] - column = columns[type] - - chunk_words.each_with_index do |word, i| - cell = column.make_new_cell - timestamp = chunk_timestamps[i] - - # Set onset and offset - cell.change_code('onset', timestamp[:onset]) - cell.change_code('offset', timestamp[:offset]) - - # Set content based on column type - case type - when :transcript_original - cell.change_code('content', word) - when :transcript_clean - cell.change_code('content', word) - cell.change_code('speaker', '') # Initialize speaker field empty - when :qa - # Initialize QA codes as empty - config[:codes].each do |code| - cell.change_code(code, '') - end - when :initials - cell.change_code('coder_initials', '') - when :notes - cell.change_code('notes', '') - end - end - end - end - - puts "Setting columns in Datavyu..." - columns.each do |_, column| - set_column(column) - end - end - - # Show progress dialog - def show_progress_dialog(message) - JOptionPane.showMessageDialog(nil, message, "Progress", JOptionPane::INFORMATION_MESSAGE) - end - - # Main execution flow - puts "Starting transcript import..." - - # Setup and show file chooser - frame, jfc = setup_file_chooser - frame.setVisible(true) - - result = jfc.showOpenDialog(frame) - frame.dispose - - if result != JFileChooser::APPROVE_OPTION - puts "No file selected. Aborting." - return - end - - file_path = jfc.getSelectedFile.getPath - validate_file_format(file_path) - - puts "Reading file: #{file_path}" - show_progress_dialog("Reading file. Please wait...") - - # Read file content - content = File.readlines(file_path) - - puts "Processing content..." - show_progress_dialog("Processing transcript. This may take a moment for large files...") - - # Process file content - words, timestamps = process_content(content) - - if words.empty? || timestamps.empty? - puts "No valid transcript entries found. Check file format." - show_progress_dialog("No valid transcript entries found. Check file format.") - return - end - - puts "Creating Datavyu columns with #{words.size} entries..." - show_progress_dialog("Creating Datavyu columns with #{words.size} entries. Please wait...") - - # Create columns with dynamically sized batched processing - create_columns(words, timestamps) - - puts "Import completed successfully!" - show_progress_dialog("Import completed successfully!") - -rescue => e - puts "Error: #{e.message}" - puts e.backtrace if ENV['DEBUG'] - JOptionPane.showMessageDialog(nil, "Error: #{e.message}", "Import Error", JOptionPane::ERROR_MESSAGE) -end From 54ad9a65ef9034ff4052c15d9b40595a3192596f Mon Sep 17 00:00:00 2001 From: agbeckner <62198072+agbeckner@users.noreply.github.com> Date: Thu, 27 Feb 2025 17:02:01 -0500 Subject: [PATCH 3/3] Checks format and dynamically chunk speech cells --- Insert/insert_whisper_transcript.rb | 304 ++++++++++++++++++++++++++++ 1 file changed, 304 insertions(+) create mode 100644 Insert/insert_whisper_transcript.rb diff --git a/Insert/insert_whisper_transcript.rb b/Insert/insert_whisper_transcript.rb new file mode 100644 index 0000000..03be496 --- /dev/null +++ b/Insert/insert_whisper_transcript.rb @@ -0,0 +1,304 @@ +# Purpose: This script processes and imports transcript files into Datavyu. +# +# Supported Formats: +# - WebVTT (.vtt) +# - SubRip (.srt) [planned] +# - Plain Text (.txt) [planned] +# +# Features: +# - File format validation +# - Timestamp conversion +# - Quality assurance workflow +# - Support for multiple subtitle formats [planned] +# - Optimized performance for large files +# - Dynamic chunk sizing based on file size +# +# Usage: +# 1. Run script +# 2. Select subtitle/transcript file when prompted +# 3. Script will create necessary columns in Datavyu: +# - transcript_original: Contains original transcription content +# - transcript_QA: For marking various types of errors +# - transcript_clean: For adding speaker labels to transcription +# - transcript_initials: For coder identification +# - transcript_notes: For additional observations +# Authors: Aaron G. Beckner & Trinity Wang +# 02-27-2025: Added flexible timestamp detection +# 01-14-2025: Added comments and made more generic + +# Revised by Van T. Pham for SPACE 2024 play coding +# Last edited: 01-21-2025 +# Optimized version: 02-27-2025 + +require 'Datavyu_API.rb' + +# Configuration constants +SUPPORTED_FORMATS = { + 'vtt' => 'WebVTT Subtitles' + # Add additional formats here as needed: + # 'srt' => 'SubRip Subtitles', + # 'txt' => 'Plain Text Transcripts' +} + +# Column configurations +COLUMN_CONFIGS = { + transcript_original: { + name: 'transcript_original', + codes: ['content'], + required: true + }, + qa: { + name: 'transcript_QA', + codes: ['OnsetError', 'ContentError', 'OmittedUtterance', 'HallucinatedUtterance', 'SpeakerChange'], # quality assurance error codes + required: true + }, + transcript_clean: { + name: 'transcript_clean', + codes: ['speaker', 'content'], # Codes for speaker labeling and transcription content + required: true + }, + initials: { + name: 'transcript_initials', + codes: ['coder_initials'], # Optional Coder initials column + required: false + }, + notes: { + name: 'transcript_notes', + codes: ['notes'], + required: false + } +} + +# Base ratio for determining chunk size (adjust as needed) +# For every 10 cells, use chunk size of 1 +BASE_RATIO = 10.0 +# Minimum and maximum chunk sizes to ensure reasonable processing +MIN_CHUNK_SIZE = 10 +MAX_CHUNK_SIZE = 100 + +begin + # Import Java classes for GUI file selection + java_import javax.swing.JFileChooser + java_import javax.swing.filechooser.FileNameExtensionFilter + java_import javax.swing.JFrame + java_import javax.swing.JOptionPane + + # Sets up the file chooser dialog + def setup_file_chooser + frame = JFrame.new("Import Transcript") + frame.setDefaultCloseOperation(JFrame::DISPOSE_ON_CLOSE) + frame.setSize(200, 200) + frame.setLocationRelativeTo(nil) + + jfc = JFileChooser.new + jfc.setAcceptAllFileFilterUsed(false) + jfc.setMultiSelectionEnabled(false) + jfc.setDialogTitle('Select transcript file to import') + + SUPPORTED_FORMATS.each do |format, description| + extensions = [format].to_java(:String) + filter = FileNameExtensionFilter.new(description, extensions) + jfc.addChoosableFileFilter(filter) + end + + [frame, jfc] + end + + # Validates that the selected file has a supported format + def validate_file_format(file_path) + extension = File.extname(file_path)[1..-1] + raise "Unsupported file format: .#{extension}" unless SUPPORTED_FORMATS.key?(extension) + true + end + + # Optimize timestamp parsing for better performance + def parse_timestamp(time_str) + # Handle HH:MM:SS.mmm format + if time_str.match(/^(\d{2}):(\d{2}):(\d{2})\.(\d{3})$/) + hours = $1.to_i + minutes = $2.to_i + seconds = $3.to_i + milliseconds = $4.to_i + + return (hours * 3600000) + (minutes * 60000) + (seconds * 1000) + milliseconds + # Handle MM:SS.mmm format (no hours) + elsif time_str.match(/^(\d{2}):(\d{2})\.(\d{3})$/) + minutes = $1.to_i + seconds = $2.to_i + milliseconds = $3.to_i + + return (minutes * 60000) + (seconds * 1000) + milliseconds + else + raise "Invalid timestamp format: #{time_str}. Expected format: HH:MM:SS.mmm or MM:SS.mmm" + end + end + + # Pre-process the content to extract words and timestamps more efficiently + def process_content(content) + # Remove WEBVTT header if present + content.shift if content.first && content.first.strip == 'WEBVTT' + + timestamps = [] + words = [] + current_timestamp = nil + + content.each do |line| + line = line.strip + next if line.empty? + + if line.include?("-->") + onset_str, offset_str = line.split('-->').map(&:strip) + current_timestamp = { + onset: parse_timestamp(onset_str), + offset: parse_timestamp(offset_str) + } + elsif line.match(/^\d+$/) + # Skip cue numbers (commonly found in VTT files) + next + elsif current_timestamp && line.match(/^[a-zA-Z]/) + # Only add lines that start with letters and have a valid timestamp + timestamps << current_timestamp + words << line + end + end + + [words, timestamps] + end + + # Calculate optimal chunk size based on number of cells + def calculate_chunk_size(total_cells) + # Use the ratio of 1 chunk per 10 cells as basis + chunk_size = (total_cells / BASE_RATIO).ceil + + # Enforce minimum and maximum chunk sizes + chunk_size = [chunk_size, MIN_CHUNK_SIZE].max + chunk_size = [chunk_size, MAX_CHUNK_SIZE].min + + puts "Calculated optimal chunk size: #{chunk_size} for #{total_cells} cells" + chunk_size + end + + # Create columns in batches for better performance + def create_columns(words, timestamps) + columns = {} + + # First create all columns + COLUMN_CONFIGS.each do |type, config| + if config[:required] + columns[type] = new_column(config[:name], *config[:codes]) + end + end + + # Calculate total number of entries and dynamic chunk size + total_entries = words.size + chunk_size = calculate_chunk_size(total_entries) + total_chunks = (total_entries.to_f / chunk_size).ceil + + puts "Processing #{total_entries} entries in #{total_chunks} chunks with chunk size of #{chunk_size}..." + + # Process in chunks + (0...total_chunks).each do |chunk_idx| + start_idx = chunk_idx * chunk_size + end_idx = [start_idx + chunk_size, total_entries].min + + chunk_range = (start_idx...end_idx) + chunk_words = words[chunk_range] + chunk_timestamps = timestamps[chunk_range] + + puts "Processing chunk #{chunk_idx + 1}/#{total_chunks} (entries #{start_idx + 1}-#{end_idx})..." + + # Process each column type for this chunk + COLUMN_CONFIGS.each do |type, config| + next unless config[:required] + column = columns[type] + + chunk_words.each_with_index do |word, i| + cell = column.make_new_cell + timestamp = chunk_timestamps[i] + + # Set onset and offset + cell.change_code('onset', timestamp[:onset]) + cell.change_code('offset', timestamp[:offset]) + + # Set content based on column type + case type + when :transcript_original + cell.change_code('content', word) + when :transcript_clean + cell.change_code('content', word) + cell.change_code('speaker', '') # Initialize speaker field empty + when :qa + # Initialize QA codes as empty + config[:codes].each do |code| + cell.change_code(code, '') + end + when :initials + cell.change_code('coder_initials', '') + when :notes + cell.change_code('notes', '') + end + end + end + end + + puts "Setting columns in Datavyu..." + columns.each do |_, column| + set_column(column) + end + end + + # Show progress dialog + def show_progress_dialog(message) + JOptionPane.showMessageDialog(nil, message, "Progress", JOptionPane::INFORMATION_MESSAGE) + end + + # Main execution flow + puts "Starting transcript import..." + + # Setup and show file chooser + frame, jfc = setup_file_chooser + frame.setVisible(true) + + result = jfc.showOpenDialog(frame) + frame.dispose + + if result != JFileChooser::APPROVE_OPTION + puts "No file selected. Aborting." + return + end + + file_path = jfc.getSelectedFile.getPath + validate_file_format(file_path) + + puts "Reading file: #{file_path}" + show_progress_dialog("Reading file. Please wait...") + + # Read file content + content = File.readlines(file_path) + + puts "Processing content..." + show_progress_dialog("Processing transcript. This may take a moment for large files...") + + # Process file content + words, timestamps = process_content(content) + + if words.empty? || timestamps.empty? + puts "No valid transcript entries found. Check file format." + show_progress_dialog("No valid transcript entries found. Check file format.") + return + end + + puts "Creating Datavyu columns with #{words.size} entries..." + show_progress_dialog("Creating Datavyu columns with #{words.size} entries. Please wait...") + + # Create columns with dynamically sized batched processing + create_columns(words, timestamps) + + puts "Import completed successfully!" + show_progress_dialog("Import completed successfully!") + +rescue => e + puts "Error: #{e.message}" + puts e.backtrace if ENV['DEBUG'] + JOptionPane.showMessageDialog(nil, "Error: #{e.message}", "Import Error", JOptionPane::ERROR_MESSAGE) +end