From 4c7d1dc52dab6ed216615170656b990a96d96671 Mon Sep 17 00:00:00 2001
From: agbeckner <62198072+agbeckner@users.noreply.github.com>
Date: Thu, 27 Feb 2025 16:52:56 -0500
Subject: [PATCH 1/3] Add files via upload

---
 Insert/insert_whisper_transcript.rb | 304 ++++++++++++++++++++++++++++
 1 file changed, 304 insertions(+)
 create mode 100644 Insert/insert_whisper_transcript.rb

diff --git a/Insert/insert_whisper_transcript.rb b/Insert/insert_whisper_transcript.rb
new file mode 100644
index 0000000..03be496
--- /dev/null
+++ b/Insert/insert_whisper_transcript.rb
@@ -0,0 +1,304 @@
+# Purpose: This script processes and imports transcript files into Datavyu. 
+# 
+# Supported Formats:
+#   - WebVTT (.vtt)
+#   - SubRip (.srt) [planned]
+#   - Plain Text (.txt) [planned]
+#
+# Features:
+#   - File format validation
+#   - Timestamp conversion
+#   - Quality assurance workflow
+#   - Support for multiple subtitle formats [planned]
+#   - Optimized performance for large files
+#   - Dynamic chunk sizing based on file size
+# 
+# Usage:
+#   1. Run script
+#   2. Select subtitle/transcript file when prompted
+#   3. Script will create necessary columns in Datavyu:
+#      - transcript_original: Contains original transcription content
+#      - transcript_QA: For marking various types of errors
+#      - transcript_clean: For adding speaker labels to transcription
+#      - transcript_initials: For coder identification
+#      - transcript_notes: For additional observations
+# Authors: Aaron G. Beckner & Trinity Wang
+# 02-27-2025: Added flexible timestamp detection
+# 01-14-2025: Added comments and made more generic
+
+# Revised by Van T. Pham for SPACE 2024 play coding
+# Last edited: 01-21-2025
+# Optimized version: 02-27-2025
+
+require 'Datavyu_API.rb'
+
+# Configuration constants
+SUPPORTED_FORMATS = {
+  'vtt' => 'WebVTT Subtitles'
+  # Add additional formats here as needed:
+  # 'srt' => 'SubRip Subtitles',
+  # 'txt' => 'Plain Text Transcripts'
+}
+
+# Column configurations
+COLUMN_CONFIGS = {
+  transcript_original: {
+    name: 'transcript_original',
+    codes: ['content'],
+    required: true
+  },
+  qa: {
+    name: 'transcript_QA',
+    codes: ['OnsetError', 'ContentError', 'OmittedUtterance', 'HallucinatedUtterance', 'SpeakerChange'], # quality assurance error codes
+    required: true
+  },
+  transcript_clean: {
+    name: 'transcript_clean',
+    codes: ['speaker', 'content'],  # Codes for speaker labeling and transcription content
+    required: true
+  },
+  initials: {
+    name: 'transcript_initials',
+    codes: ['coder_initials'], # Optional Coder initials column
+    required: false
+  },
+  notes: {
+    name: 'transcript_notes',
+    codes: ['notes'],
+    required: false
+  }
+}
+
+# Base ratio for determining chunk size (adjust as needed)
+# For every 10 cells, use chunk size of 1
+BASE_RATIO = 10.0
+# Minimum and maximum chunk sizes to ensure reasonable processing
+MIN_CHUNK_SIZE = 10
+MAX_CHUNK_SIZE = 100
+
+begin
+  # Import Java classes for GUI file selection
+  java_import javax.swing.JFileChooser
+  java_import javax.swing.filechooser.FileNameExtensionFilter
+  java_import javax.swing.JFrame
+  java_import javax.swing.JOptionPane
+
+  # Sets up the file chooser dialog
+  def setup_file_chooser
+    frame = JFrame.new("Import Transcript")
+    frame.setDefaultCloseOperation(JFrame::DISPOSE_ON_CLOSE)
+    frame.setSize(200, 200)
+    frame.setLocationRelativeTo(nil)
+    
+    jfc = JFileChooser.new
+    jfc.setAcceptAllFileFilterUsed(false)
+    jfc.setMultiSelectionEnabled(false)
+    jfc.setDialogTitle('Select transcript file to import')
+    
+    SUPPORTED_FORMATS.each do |format, description|
+      extensions = [format].to_java(:String)
+      filter = FileNameExtensionFilter.new(description, extensions)
+      jfc.addChoosableFileFilter(filter)
+    end
+    
+    [frame, jfc]
+  end
+
+  # Validates that the selected file has a supported format
+  def validate_file_format(file_path)
+    extension = File.extname(file_path)[1..-1]
+    raise "Unsupported file format: .#{extension}" unless SUPPORTED_FORMATS.key?(extension)
+    true
+  end
+
+  # Optimize timestamp parsing for better performance
+  def parse_timestamp(time_str)
+    # Handle HH:MM:SS.mmm format
+    if time_str.match(/^(\d{2}):(\d{2}):(\d{2})\.(\d{3})$/)
+      hours = $1.to_i
+      minutes = $2.to_i
+      seconds = $3.to_i
+      milliseconds = $4.to_i
+      
+      return (hours * 3600000) + (minutes * 60000) + (seconds * 1000) + milliseconds
+    # Handle MM:SS.mmm format (no hours)
+    elsif time_str.match(/^(\d{2}):(\d{2})\.(\d{3})$/)
+      minutes = $1.to_i
+      seconds = $2.to_i
+      milliseconds = $3.to_i
+      
+      return (minutes * 60000) + (seconds * 1000) + milliseconds
+    else
+      raise "Invalid timestamp format: #{time_str}. Expected format: HH:MM:SS.mmm or MM:SS.mmm"
+    end
+  end
+
+  # Pre-process the content to extract words and timestamps more efficiently
+  def process_content(content)
+    # Remove WEBVTT header if present
+    content.shift if content.first && content.first.strip == 'WEBVTT'
+    
+    timestamps = []
+    words = []
+    current_timestamp = nil
+    
+    content.each do |line|
+      line = line.strip
+      next if line.empty?
+      
+      if line.include?("-->")
+        onset_str, offset_str = line.split('-->').map(&:strip)
+        current_timestamp = {
+          onset: parse_timestamp(onset_str),
+          offset: parse_timestamp(offset_str)
+        }
+      elsif line.match(/^\d+$/)
+        # Skip cue numbers (commonly found in VTT files)
+        next
+      elsif current_timestamp && line.match(/^[a-zA-Z]/)
+        # Only add lines that start with letters and have a valid timestamp
+        timestamps << current_timestamp
+        words << line
+      end
+    end
+    
+    [words, timestamps]
+  end
+
+  # Calculate optimal chunk size based on number of cells
+  def calculate_chunk_size(total_cells)
+    # Use the ratio of 1 chunk per 10 cells as basis
+    chunk_size = (total_cells / BASE_RATIO).ceil
+    
+    # Enforce minimum and maximum chunk sizes
+    chunk_size = [chunk_size, MIN_CHUNK_SIZE].max
+    chunk_size = [chunk_size, MAX_CHUNK_SIZE].min
+    
+    puts "Calculated optimal chunk size: #{chunk_size} for #{total_cells} cells"
+    chunk_size
+  end
+
+  # Create columns in batches for better performance
+  def create_columns(words, timestamps)
+    columns = {}
+    
+    # First create all columns
+    COLUMN_CONFIGS.each do |type, config|
+      if config[:required]
+        columns[type] = new_column(config[:name], *config[:codes])
+      end
+    end
+    
+    # Calculate total number of entries and dynamic chunk size
+    total_entries = words.size
+    chunk_size = calculate_chunk_size(total_entries)
+    total_chunks = (total_entries.to_f / chunk_size).ceil
+    
+    puts "Processing #{total_entries} entries in #{total_chunks} chunks with chunk size of #{chunk_size}..."
+    
+    # Process in chunks
+    (0...total_chunks).each do |chunk_idx|
+      start_idx = chunk_idx * chunk_size
+      end_idx = [start_idx + chunk_size, total_entries].min
+      
+      chunk_range = (start_idx...end_idx)
+      chunk_words = words[chunk_range]
+      chunk_timestamps = timestamps[chunk_range]
+      
+      puts "Processing chunk #{chunk_idx + 1}/#{total_chunks} (entries #{start_idx + 1}-#{end_idx})..."
+      
+      # Process each column type for this chunk
+      COLUMN_CONFIGS.each do |type, config|
+        next unless config[:required]
+        column = columns[type]
+        
+        chunk_words.each_with_index do |word, i|
+          cell = column.make_new_cell
+          timestamp = chunk_timestamps[i]
+          
+          # Set onset and offset
+          cell.change_code('onset', timestamp[:onset])
+          cell.change_code('offset', timestamp[:offset])
+          
+          # Set content based on column type
+          case type
+          when :transcript_original
+            cell.change_code('content', word)
+          when :transcript_clean
+            cell.change_code('content', word)
+            cell.change_code('speaker', '')  # Initialize speaker field empty
+          when :qa
+            # Initialize QA codes as empty
+            config[:codes].each do |code|
+              cell.change_code(code, '')
+            end
+          when :initials
+            cell.change_code('coder_initials', '')
+          when :notes
+            cell.change_code('notes', '')
+          end
+        end
+      end
+    end
+    
+    puts "Setting columns in Datavyu..."
+    columns.each do |_, column|
+      set_column(column)
+    end
+  end
+
+  # Show progress dialog
+  def show_progress_dialog(message)
+    JOptionPane.showMessageDialog(nil, message, "Progress", JOptionPane::INFORMATION_MESSAGE)
+  end
+
+  # Main execution flow
+  puts "Starting transcript import..."
+
+  # Setup and show file chooser
+  frame, jfc = setup_file_chooser
+  frame.setVisible(true)
+  
+  result = jfc.showOpenDialog(frame)
+  frame.dispose
+
+  if result != JFileChooser::APPROVE_OPTION
+    puts "No file selected. Aborting."
+    return
+  end
+
+  file_path = jfc.getSelectedFile.getPath
+  validate_file_format(file_path)
+
+  puts "Reading file: #{file_path}"
+  show_progress_dialog("Reading file. Please wait...")
+  
+  # Read file content
+  content = File.readlines(file_path)
+  
+  puts "Processing content..."
+  show_progress_dialog("Processing transcript. This may take a moment for large files...")
+  
+  # Process file content
+  words, timestamps = process_content(content)
+  
+  if words.empty? || timestamps.empty?
+    puts "No valid transcript entries found. Check file format."
+    show_progress_dialog("No valid transcript entries found. Check file format.")
+    return
+  end
+  
+  puts "Creating Datavyu columns with #{words.size} entries..."
+  show_progress_dialog("Creating Datavyu columns with #{words.size} entries. Please wait...")
+  
+  # Create columns with dynamically sized batched processing
+  create_columns(words, timestamps)
+
+  puts "Import completed successfully!"
+  show_progress_dialog("Import completed successfully!")
+
+rescue => e
+  puts "Error: #{e.message}"
+  puts e.backtrace if ENV['DEBUG']
+  JOptionPane.showMessageDialog(nil, "Error: #{e.message}", "Import Error", JOptionPane::ERROR_MESSAGE)
+end

From 0f9a5672b9e75d19e874ee4c619e7ef2b0da57a3 Mon Sep 17 00:00:00 2001
From: agbeckner <62198072+agbeckner@users.noreply.github.com>
Date: Thu, 27 Feb 2025 17:00:38 -0500
Subject: [PATCH 2/3] Delete Insert/insert_whisper_transcript.rb

---
 Insert/insert_whisper_transcript.rb | 304 ----------------------------
 1 file changed, 304 deletions(-)
 delete mode 100644 Insert/insert_whisper_transcript.rb

diff --git a/Insert/insert_whisper_transcript.rb b/Insert/insert_whisper_transcript.rb
deleted file mode 100644
index 03be496..0000000
--- a/Insert/insert_whisper_transcript.rb
+++ /dev/null
@@ -1,304 +0,0 @@
-# Purpose: This script processes and imports transcript files into Datavyu. 
-# 
-# Supported Formats:
-#   - WebVTT (.vtt)
-#   - SubRip (.srt) [planned]
-#   - Plain Text (.txt) [planned]
-#
-# Features:
-#   - File format validation
-#   - Timestamp conversion
-#   - Quality assurance workflow
-#   - Support for multiple subtitle formats [planned]
-#   - Optimized performance for large files
-#   - Dynamic chunk sizing based on file size
-# 
-# Usage:
-#   1. Run script
-#   2. Select subtitle/transcript file when prompted
-#   3. Script will create necessary columns in Datavyu:
-#      - transcript_original: Contains original transcription content
-#      - transcript_QA: For marking various types of errors
-#      - transcript_clean: For adding speaker labels to transcription
-#      - transcript_initials: For coder identification
-#      - transcript_notes: For additional observations
-# Authors: Aaron G. Beckner & Trinity Wang
-# 02-27-2025: Added flexible timestamp detection
-# 01-14-2025: Added comments and made more generic
-
-# Revised by Van T. Pham for SPACE 2024 play coding
-# Last edited: 01-21-2025
-# Optimized version: 02-27-2025
-
-require 'Datavyu_API.rb'
-
-# Configuration constants
-SUPPORTED_FORMATS = {
-  'vtt' => 'WebVTT Subtitles'
-  # Add additional formats here as needed:
-  # 'srt' => 'SubRip Subtitles',
-  # 'txt' => 'Plain Text Transcripts'
-}
-
-# Column configurations
-COLUMN_CONFIGS = {
-  transcript_original: {
-    name: 'transcript_original',
-    codes: ['content'],
-    required: true
-  },
-  qa: {
-    name: 'transcript_QA',
-    codes: ['OnsetError', 'ContentError', 'OmittedUtterance', 'HallucinatedUtterance', 'SpeakerChange'], # quality assurance error codes
-    required: true
-  },
-  transcript_clean: {
-    name: 'transcript_clean',
-    codes: ['speaker', 'content'],  # Codes for speaker labeling and transcription content
-    required: true
-  },
-  initials: {
-    name: 'transcript_initials',
-    codes: ['coder_initials'], # Optional Coder initials column
-    required: false
-  },
-  notes: {
-    name: 'transcript_notes',
-    codes: ['notes'],
-    required: false
-  }
-}
-
-# Base ratio for determining chunk size (adjust as needed)
-# For every 10 cells, use chunk size of 1
-BASE_RATIO = 10.0
-# Minimum and maximum chunk sizes to ensure reasonable processing
-MIN_CHUNK_SIZE = 10
-MAX_CHUNK_SIZE = 100
-
-begin
-  # Import Java classes for GUI file selection
-  java_import javax.swing.JFileChooser
-  java_import javax.swing.filechooser.FileNameExtensionFilter
-  java_import javax.swing.JFrame
-  java_import javax.swing.JOptionPane
-
-  # Sets up the file chooser dialog
-  def setup_file_chooser
-    frame = JFrame.new("Import Transcript")
-    frame.setDefaultCloseOperation(JFrame::DISPOSE_ON_CLOSE)
-    frame.setSize(200, 200)
-    frame.setLocationRelativeTo(nil)
-    
-    jfc = JFileChooser.new
-    jfc.setAcceptAllFileFilterUsed(false)
-    jfc.setMultiSelectionEnabled(false)
-    jfc.setDialogTitle('Select transcript file to import')
-    
-    SUPPORTED_FORMATS.each do |format, description|
-      extensions = [format].to_java(:String)
-      filter = FileNameExtensionFilter.new(description, extensions)
-      jfc.addChoosableFileFilter(filter)
-    end
-    
-    [frame, jfc]
-  end
-
-  # Validates that the selected file has a supported format
-  def validate_file_format(file_path)
-    extension = File.extname(file_path)[1..-1]
-    raise "Unsupported file format: .#{extension}" unless SUPPORTED_FORMATS.key?(extension)
-    true
-  end
-
-  # Optimize timestamp parsing for better performance
-  def parse_timestamp(time_str)
-    # Handle HH:MM:SS.mmm format
-    if time_str.match(/^(\d{2}):(\d{2}):(\d{2})\.(\d{3})$/)
-      hours = $1.to_i
-      minutes = $2.to_i
-      seconds = $3.to_i
-      milliseconds = $4.to_i
-      
-      return (hours * 3600000) + (minutes * 60000) + (seconds * 1000) + milliseconds
-    # Handle MM:SS.mmm format (no hours)
-    elsif time_str.match(/^(\d{2}):(\d{2})\.(\d{3})$/)
-      minutes = $1.to_i
-      seconds = $2.to_i
-      milliseconds = $3.to_i
-      
-      return (minutes * 60000) + (seconds * 1000) + milliseconds
-    else
-      raise "Invalid timestamp format: #{time_str}. Expected format: HH:MM:SS.mmm or MM:SS.mmm"
-    end
-  end
-
-  # Pre-process the content to extract words and timestamps more efficiently
-  def process_content(content)
-    # Remove WEBVTT header if present
-    content.shift if content.first && content.first.strip == 'WEBVTT'
-    
-    timestamps = []
-    words = []
-    current_timestamp = nil
-    
-    content.each do |line|
-      line = line.strip
-      next if line.empty?
-      
-      if line.include?("-->")
-        onset_str, offset_str = line.split('-->').map(&:strip)
-        current_timestamp = {
-          onset: parse_timestamp(onset_str),
-          offset: parse_timestamp(offset_str)
-        }
-      elsif line.match(/^\d+$/)
-        # Skip cue numbers (commonly found in VTT files)
-        next
-      elsif current_timestamp && line.match(/^[a-zA-Z]/)
-        # Only add lines that start with letters and have a valid timestamp
-        timestamps << current_timestamp
-        words << line
-      end
-    end
-    
-    [words, timestamps]
-  end
-
-  # Calculate optimal chunk size based on number of cells
-  def calculate_chunk_size(total_cells)
-    # Use the ratio of 1 chunk per 10 cells as basis
-    chunk_size = (total_cells / BASE_RATIO).ceil
-    
-    # Enforce minimum and maximum chunk sizes
-    chunk_size = [chunk_size, MIN_CHUNK_SIZE].max
-    chunk_size = [chunk_size, MAX_CHUNK_SIZE].min
-    
-    puts "Calculated optimal chunk size: #{chunk_size} for #{total_cells} cells"
-    chunk_size
-  end
-
-  # Create columns in batches for better performance
-  def create_columns(words, timestamps)
-    columns = {}
-    
-    # First create all columns
-    COLUMN_CONFIGS.each do |type, config|
-      if config[:required]
-        columns[type] = new_column(config[:name], *config[:codes])
-      end
-    end
-    
-    # Calculate total number of entries and dynamic chunk size
-    total_entries = words.size
-    chunk_size = calculate_chunk_size(total_entries)
-    total_chunks = (total_entries.to_f / chunk_size).ceil
-    
-    puts "Processing #{total_entries} entries in #{total_chunks} chunks with chunk size of #{chunk_size}..."
-    
-    # Process in chunks
-    (0...total_chunks).each do |chunk_idx|
-      start_idx = chunk_idx * chunk_size
-      end_idx = [start_idx + chunk_size, total_entries].min
-      
-      chunk_range = (start_idx...end_idx)
-      chunk_words = words[chunk_range]
-      chunk_timestamps = timestamps[chunk_range]
-      
-      puts "Processing chunk #{chunk_idx + 1}/#{total_chunks} (entries #{start_idx + 1}-#{end_idx})..."
-      
-      # Process each column type for this chunk
-      COLUMN_CONFIGS.each do |type, config|
-        next unless config[:required]
-        column = columns[type]
-        
-        chunk_words.each_with_index do |word, i|
-          cell = column.make_new_cell
-          timestamp = chunk_timestamps[i]
-          
-          # Set onset and offset
-          cell.change_code('onset', timestamp[:onset])
-          cell.change_code('offset', timestamp[:offset])
-          
-          # Set content based on column type
-          case type
-          when :transcript_original
-            cell.change_code('content', word)
-          when :transcript_clean
-            cell.change_code('content', word)
-            cell.change_code('speaker', '')  # Initialize speaker field empty
-          when :qa
-            # Initialize QA codes as empty
-            config[:codes].each do |code|
-              cell.change_code(code, '')
-            end
-          when :initials
-            cell.change_code('coder_initials', '')
-          when :notes
-            cell.change_code('notes', '')
-          end
-        end
-      end
-    end
-    
-    puts "Setting columns in Datavyu..."
-    columns.each do |_, column|
-      set_column(column)
-    end
-  end
-
-  # Show progress dialog
-  def show_progress_dialog(message)
-    JOptionPane.showMessageDialog(nil, message, "Progress", JOptionPane::INFORMATION_MESSAGE)
-  end
-
-  # Main execution flow
-  puts "Starting transcript import..."
-
-  # Setup and show file chooser
-  frame, jfc = setup_file_chooser
-  frame.setVisible(true)
-  
-  result = jfc.showOpenDialog(frame)
-  frame.dispose
-
-  if result != JFileChooser::APPROVE_OPTION
-    puts "No file selected. Aborting."
-    return
-  end
-
-  file_path = jfc.getSelectedFile.getPath
-  validate_file_format(file_path)
-
-  puts "Reading file: #{file_path}"
-  show_progress_dialog("Reading file. Please wait...")
-  
-  # Read file content
-  content = File.readlines(file_path)
-  
-  puts "Processing content..."
-  show_progress_dialog("Processing transcript. This may take a moment for large files...")
-  
-  # Process file content
-  words, timestamps = process_content(content)
-  
-  if words.empty? || timestamps.empty?
-    puts "No valid transcript entries found. Check file format."
-    show_progress_dialog("No valid transcript entries found. Check file format.")
-    return
-  end
-  
-  puts "Creating Datavyu columns with #{words.size} entries..."
-  show_progress_dialog("Creating Datavyu columns with #{words.size} entries. Please wait...")
-  
-  # Create columns with dynamically sized batched processing
-  create_columns(words, timestamps)
-
-  puts "Import completed successfully!"
-  show_progress_dialog("Import completed successfully!")
-
-rescue => e
-  puts "Error: #{e.message}"
-  puts e.backtrace if ENV['DEBUG']
-  JOptionPane.showMessageDialog(nil, "Error: #{e.message}", "Import Error", JOptionPane::ERROR_MESSAGE)
-end

From 54ad9a65ef9034ff4052c15d9b40595a3192596f Mon Sep 17 00:00:00 2001
From: agbeckner <62198072+agbeckner@users.noreply.github.com>
Date: Thu, 27 Feb 2025 17:02:01 -0500
Subject: [PATCH 3/3] Checks format and dynamically chunk speech cells

---
 Insert/insert_whisper_transcript.rb | 304 ++++++++++++++++++++++++++++
 1 file changed, 304 insertions(+)
 create mode 100644 Insert/insert_whisper_transcript.rb

diff --git a/Insert/insert_whisper_transcript.rb b/Insert/insert_whisper_transcript.rb
new file mode 100644
index 0000000..03be496
--- /dev/null
+++ b/Insert/insert_whisper_transcript.rb
@@ -0,0 +1,304 @@
+# Purpose: This script processes and imports transcript files into Datavyu. 
+# 
+# Supported Formats:
+#   - WebVTT (.vtt)
+#   - SubRip (.srt) [planned]
+#   - Plain Text (.txt) [planned]
+#
+# Features:
+#   - File format validation
+#   - Timestamp conversion
+#   - Quality assurance workflow
+#   - Support for multiple subtitle formats [planned]
+#   - Optimized performance for large files
+#   - Dynamic chunk sizing based on the number of transcript entries
+# 
+# Usage:
+#   1. Run script
+#   2. Select subtitle/transcript file when prompted
+#   3. Script will create necessary columns in Datavyu:
+#      - transcript_original: Contains original transcription content
+#      - transcript_QA: For marking various types of errors
+#      - transcript_clean: For adding speaker labels to transcription
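+#      - transcript_initials: For coder identification (optional; only created when marked required in COLUMN_CONFIGS)
+#      - transcript_notes: For additional observations (optional; only created when marked required in COLUMN_CONFIGS)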
+# Authors: Aaron G. Beckner & Trinity Wang
+# 02-27-2025: Added flexible timestamp detection
+# 01-14-2025: Added comments and made more generic
+
+# Revised by Van T. Pham for SPACE 2024 play coding
+# Last edited: 01-21-2025
+# Optimized version: 02-27-2025
+
+require 'Datavyu_API.rb'
+
+# Supported file extensions (without the leading dot) => label shown in the file chooser.
+SUPPORTED_FORMATS = {
+  'vtt' => 'WebVTT Subtitles'
+  # Add additional formats here as needed (parsing currently assumes WebVTT cue syntax):
+  # 'srt' => 'SubRip Subtitles',
+  # 'txt' => 'Plain Text Transcripts'
+}.freeze # frozen so the lookup table cannot be altered at runtime
+
+# Column layout: Datavyu column name, its codes, and whether create_columns builds it (:required)
+COLUMN_CONFIGS = {
+  transcript_original: {
+    name: 'transcript_original',
+    codes: ['content'],
+    required: true
+  },
+  qa: {
+    name: 'transcript_QA',
+    codes: ['OnsetError', 'ContentError', 'OmittedUtterance', 'HallucinatedUtterance', 'SpeakerChange'], # quality assurance error codes
+    required: true
+  },
+  transcript_clean: {
+    name: 'transcript_clean',
+    codes: ['speaker', 'content'],  # Codes for speaker labeling and transcription content
+    required: true
+  },
+  initials: {
+    name: 'transcript_initials',
+    codes: ['coder_initials'], # Optional Coder initials column
+    required: false
+  },
+  notes: {
+    name: 'transcript_notes',
+    codes: ['notes'],
+    required: false
+  }
+}.freeze # top-level freeze only; the nested hashes are left as they were
+
+# Base ratio for determining chunk size (adjust as needed):
+# chunk size = ceil(cells / BASE_RATIO), i.e. it grows by 1 for every 10 cells
+BASE_RATIO = 10.0
+# Lower and upper bounds applied to the computed chunk size
+MIN_CHUNK_SIZE = 10
+MAX_CHUNK_SIZE = 100
+
+begin
+  # Import Java classes for GUI file selection
+  java_import javax.swing.JFileChooser
+  java_import javax.swing.filechooser.FileNameExtensionFilter
+  java_import javax.swing.JFrame
+  java_import javax.swing.JOptionPane
+
+  # Builds the Swing widgets used to pick a transcript file.
+  # Returns [parent JFrame, JFileChooser]; the chooser lists one filter per
+  # SUPPORTED_FORMATS entry and the "accept all files" filter is switched off.
+  def setup_file_chooser
+    chooser = JFileChooser.new
+    chooser.setAcceptAllFileFilterUsed(false)
+    chooser.setMultiSelectionEnabled(false)
+    chooser.setDialogTitle('Select transcript file to import')
+    SUPPORTED_FORMATS.each_pair do |ext, label|
+      # The extension list is handed over as a Java String[]
+      chooser.addChoosableFileFilter(FileNameExtensionFilter.new(label, [ext].to_java(:String)))
+    end
+
+    # Small 200x200 window that the caller uses as the dialog's parent
+    window = JFrame.new("Import Transcript")
+    window.setDefaultCloseOperation(JFrame::DISPOSE_ON_CLOSE)
+    window.setSize(200, 200)
+    window.setLocationRelativeTo(nil)
+    [window, chooser]
+  end
+
+  # Returns true when file_path has an extension listed in SUPPORTED_FORMATS; raises RuntimeError otherwise.
+  def validate_file_format(file_path)
+    extension = File.extname(file_path).sub(/\A\./, '') # "" when the name has no extension
+    raise "Unsupported file format: .#{extension}" unless SUPPORTED_FORMATS.key?(extension.downcase) # ".VTT" counts as ".vtt"
+    true
+  end
+
+  # Converts a WebVTT timestamp string to integer milliseconds.
+  #
+  # time_str - "HH:MM:SS.mmm" or "MM:SS.mmm"; the hours field may have more than
+  #            two digits, which WebVTT permits for very long recordings.
+  #
+  # Examples:
+  #   parse_timestamp("00:01:02.345") # => 62345
+  #   parse_timestamp("01:02.345")    # => 62345
+  #
+  # Returns the Integer number of milliseconds.
+  # Raises RuntimeError when time_str matches neither layout.
+  def parse_timestamp(time_str)
+    # \A and \z anchor the whole string; the hours group is optional
+    match = /\A(?:(\d{2,}):)?(\d{2}):(\d{2})\.(\d{3})\z/.match(time_str.to_s.strip)
+    unless match
+      raise "Invalid timestamp format: #{time_str}. Expected format: HH:MM:SS.mmm or MM:SS.mmm"
+    end
+
+    hours, minutes, seconds, milliseconds = match.captures.map(&:to_i) # missing hours -> nil.to_i == 0
+    (hours * 3_600_000) + (minutes * 60_000) + (seconds * 1000) + milliseconds
+  end
+
+  # Extracts cue text and timing from the lines of a WebVTT file.
+  # content - Array of raw lines (e.g. from File.readlines); it is not modified.
+  #
+  # A text line is kept only if it starts with an ASCII letter and follows a
+  # "start --> end" timing line; each kept line is one entry, so a multi-line cue
+  # yields several entries sharing one timestamp. Blank lines, numeric cue
+  # identifiers and lines starting with any other character are skipped.
+  # NOTE(review): that filter also drops speech such as "3 apples" or "[laughs]".
+  #
+  # Returns [words, timestamps]: parallel arrays of Strings and of
+  # { onset:, offset: } hashes (milliseconds). Malformed timing lines raise.
+  def process_content(content)
+    lines = content.map(&:strip)
+    lines = lines.drop(1) if lines.first == 'WEBVTT' # skip the header without shifting the caller's array
+    words = []
+    timestamps = []
+    current_timestamp = nil
+
+    lines.each do |line|
+      if line.include?('-->')
+        onset_str, offset_str = line.split('-->', 2).map(&:strip)
+        # Cue settings ("align:start position:0%") may follow the end time; keep only the time
+        offset_str = offset_str.to_s.split(/\s+/).first
+        current_timestamp = { onset: parse_timestamp(onset_str), offset: parse_timestamp(offset_str) }
+      elsif current_timestamp && line.match(/^[a-zA-Z]/)
+        words << line
+        timestamps << current_timestamp
+      end
+    end
+    [words, timestamps]
+  end
+
+  # Picks how many entries are handled per batch.
+  # The size is ceil(total_cells / BASE_RATIO), raised to at least MIN_CHUNK_SIZE
+  # and then capped at MAX_CHUNK_SIZE. Logs the result and returns it.
+  def calculate_chunk_size(total_cells)
+    scaled = (total_cells / BASE_RATIO).ceil
+    floored = scaled < MIN_CHUNK_SIZE ? MIN_CHUNK_SIZE : scaled
+    size = floored > MAX_CHUNK_SIZE ? MAX_CHUNK_SIZE : floored
+
+    message = "Calculated optimal chunk size: #{size} for #{total_cells} cells"
+    puts message
+    size
+  end
+
+  # Builds the required Datavyu columns and fills them with one cell per entry.
+  #
+  # words      - Array of transcript lines.
+  # timestamps - Array of { onset:, offset: } hashes, parallel to words.
+  #
+  # Only COLUMN_CONFIGS entries flagged required: true are created. Every cell
+  # receives the entry's onset and offset; transcript_original and
+  # transcript_clean cells also receive the text in 'content', and every other
+  # code is set to ''. Entries are handled in batches sized by
+  # calculate_chunk_size, with one progress line printed per batch, and the
+  # finished columns are finally handed to set_column.
+  def create_columns(words, timestamps)
+    # Optional (required: false) columns are skipped throughout
+    required = COLUMN_CONFIGS.select { |_, cfg| cfg[:required] }
+    built = {}
+    required.each do |kind, cfg|
+      built[kind] = new_column(cfg[:name], *cfg[:codes])
+    end
+
+    entry_count = words.size
+    batch_size = calculate_chunk_size(entry_count)
+    batch_count = (entry_count.to_f / batch_size).ceil
+
+    puts "Processing #{entry_count} entries in #{batch_count} chunks with chunk size of #{batch_size}..."
+
+    # Walk the entries batch by batch
+    batch_count.times do |batch_no|
+      first = batch_no * batch_size
+      stop = [first + batch_size, entry_count].min # exclusive upper bound
+
+      puts "Processing chunk #{batch_no + 1}/#{batch_count} (entries #{first + 1}-#{stop})..."
+
+      required.each do |kind, cfg|
+        column = built[kind]
+
+        (first...stop).each do |idx|
+          text = words[idx]
+          timing = timestamps[idx]
+          cell = column.make_new_cell
+
+          # Timing is identical across all columns for a given entry
+          cell.change_code('onset', timing[:onset])
+          cell.change_code('offset', timing[:offset])
+
+          # Text goes into the two transcript columns; all other codes start blank
+          case kind
+          when :transcript_original, :transcript_clean
+            cell.change_code('content', text)
+            cell.change_code('speaker', '') if kind == :transcript_clean
+          when :qa
+            cfg[:codes].each do |code|
+              cell.change_code(code, '')
+            end
+          when :initials
+            cell.change_code('coder_initials', '')
+          when :notes
+            cell.change_code('notes', '')
+          end
+        end
+      end
+    end
+
+    # Pass each populated column to set_column
+    puts "Setting columns in Datavyu..."
+    built.each_value do |column|
+      set_column(column)
+    end
+  end
+
+  # Shows message in an information message box titled "Progress".
+  def show_progress_dialog(message)
+    title, kind = "Progress", JOptionPane::INFORMATION_MESSAGE
+    JOptionPane.showMessageDialog(nil, message, title, kind)
+  end
+  # Main execution flow: choose a file, parse it, build the columns.
+  # NOTE(review): every show_progress_dialog call below is a blocking message box,
+  # so each "Please wait" step only starts after the user clicks OK - confirm intended.
+  puts "Starting transcript import..."
+
+  # Ask the user for a transcript; the helper frame is only the dialog's parent
+  frame, jfc = setup_file_chooser
+  frame.setVisible(true)
+  result = jfc.showOpenDialog(frame)
+  frame.dispose
+
+  # Anything other than an approved selection: nothing to import
+  if result != JFileChooser::APPROVE_OPTION
+    puts "No file selected. Aborting."
+    return
+  end
+
+  file_path = jfc.getSelectedFile.getPath
+  validate_file_format(file_path)
+
+  puts "Reading file: #{file_path}"
+  show_progress_dialog("Reading file. Please wait...")
+
+  # WebVTT is defined as UTF-8; decode it explicitly instead of relying on the
+  # platform default encoding, which would garble non-ASCII transcript text.
+  content = File.readlines(file_path, encoding: 'UTF-8')
+
+  puts "Processing content..."
+  show_progress_dialog("Processing transcript. This may take a moment for large files...")
+  words, timestamps = process_content(content)
+
+  # Stop early when the file yielded no usable cues
+  if words.empty? || timestamps.empty?
+    puts "No valid transcript entries found. Check file format."
+    show_progress_dialog("No valid transcript entries found. Check file format.")
+    return
+  end
+
+  puts "Creating Datavyu columns with #{words.size} entries..."
+  show_progress_dialog("Creating Datavyu columns with #{words.size} entries. Please wait...")
+  create_columns(words, timestamps)
+
+  puts "Import completed successfully!"
+  show_progress_dialog("Import completed successfully!")
+
+rescue => e
+  puts "Error: #{e.message}"
+  puts e.backtrace if ENV['DEBUG']
+  JOptionPane.showMessageDialog(nil, "Error: #{e.message}", "Import Error", JOptionPane::ERROR_MESSAGE)
+end