#!/usr/bin/env python3
"""
End-to-end VTT to Markdown workflow.
Converts VTT transcript files to filtered, chronologically-ordered Markdown.

Usage:
    python3 vtt_to_markdown.py input.vtt output.md
    python3 vtt_to_markdown.py file1.vtt file2.vtt output.md
    python3 vtt_to_markdown.py /path/to/directory/ output.md
"""

import argparse
import json
import os
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Tuple


# ============================================================================
# STEP 1: VTT PARSING
# ============================================================================

def parse_timestamp(timestamp_line: str) -> Tuple[str, str]:
    """
    Parse a VTT cue-timing line into its start and end timestamps.

    Accepts both the full HH:MM:SS.mmm form and the WebVTT short form
    without an hours component (MM:SS.mmm); hours may exceed two digits
    per the WebVTT spec.

    Args:
        timestamp_line: Line like "00:00:01.430 --> 00:00:03.659"

    Returns:
        Tuple of (start_time, end_time), or ("", "") if the line does
        not look like a cue timing.
    """
    match = re.match(
        r'((?:\d{2,}:)?\d{2}:\d{2}\.\d{3})\s*-->\s*((?:\d{2,}:)?\d{2}:\d{2}\.\d{3})',
        timestamp_line,
    )
    if match:
        return match.group(1), match.group(2)
    return "", ""


def parse_speaker_content(text: str) -> Tuple[str, str]:
    """
    Split a caption line into its speaker label and spoken content.

    Args:
        text: Line like "Seth Chandler: The content here"

    Returns:
        Tuple of (speaker, content); speaker is "" when no
        "Name:" prefix is present.
    """
    m = re.match(r'^([^:]+):\s*(.*)$', text)
    if m is None:
        # No "Speaker:" prefix — return the whole line as content.
        return "", text.strip()
    speaker, content = m.groups()
    return speaker.strip(), content.strip()


def parse_vtt_file(file_path: str, source_filename: str) -> List[Dict[str, str]]:
    """
    Parse a VTT file and extract cue entries.

    Handles both numbered cues (Zoom-style, where each cue is preceded by
    a purely numeric identifier line) and bare cues where the timestamp
    line appears with no identifier — WebVTT cue identifiers are optional,
    and the original implementation silently dropped id-less cues.

    Args:
        file_path: Path to the VTT file.
        source_filename: Filename to store in each entry.

    Returns:
        List of dicts with entry_number (may be ""), start_time, end_time,
        speaker, content, and source_file.
    """
    entries: List[Dict[str, str]] = []

    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    i = 0
    n = len(lines)
    while i < n:
        line = lines[i].strip()

        # Skip the WEBVTT header and blank separator lines.
        if line == 'WEBVTT' or line == '':
            i += 1
            continue

        # Optional numeric cue identifier preceding the timing line.
        entry_num = ""
        if line.isdigit():
            entry_num = line
            i += 1
            if i >= n:
                break
            line = lines[i].strip()

        if '-->' in line:
            start_time, end_time = parse_timestamp(line)
            i += 1

            # The cue payload is every line up to the next blank line.
            content_lines = []
            while i < n and lines[i].strip() != '':
                content_lines.append(lines[i].strip())
                i += 1

            speaker, content = parse_speaker_content(' '.join(content_lines))
            entries.append({
                "entry_number": entry_num,
                "start_time": start_time,
                "end_time": end_time,
                "speaker": speaker,
                "content": content,
                "source_file": source_filename,
            })
        else:
            # Not a cue start (stray note/metadata line) — skip it.
            i += 1

    return entries


# ============================================================================
# STEP 2: FILTERING
# ============================================================================

def is_extraneous(content: str, speaker: str) -> bool:
    """
    Decide whether a transcript entry is extraneous (not substantive
    constitutional law content) and should be filtered out.

    All heuristics are independent: an entry is extraneous if ANY of
    them fires.

    Args:
        content: The content text.
        speaker: The speaker name (not used by the current heuristics).

    Returns:
        True if the entry should be filtered out.
    """
    lowered = content.lower().strip()
    length = len(content)

    # Very short utterances are almost always filler.
    if length < 3:
        return True

    # One-word acknowledgements and verbal fillers, with or without
    # trailing punctuation.
    fillers = {
        'yes', 'no', 'okay', 'ok', 'yeah', 'yep', 'nope', 'right', 'alright',
        'all right', 'um', 'uh', 'hmm', 'huh', 'well', 'so', 'now', 'hold on',
        'oops', 'beautiful', 'great', 'good', 'cool', 'excellent', 'perfect',
        'mmm', 'mhm', 'uh-huh', 'wait',
    }
    if lowered in fillers or lowered.rstrip('.!?') in fillers:
        return True

    checks = (
        # Attendance codes and administrative chatter.
        'attendance code' in lowered or ('farming' in lowered and length < 30),
        bool(re.match(r'^(farming|obamacare)[.!?]?$', lowered)),
        # Questions about attendance/location.
        bool(re.search(r'\b(where are you|where is|are you here|absent|present)\b', lowered)) and length < 50,
        # Office hours and scheduling.
        bool(re.search(r'\b(office hours|schedule|tomorrow|yesterday|today is)\b', lowered)) and length < 80,
        # Technical/Zoom issues.
        bool(re.search(r'\b(zoom|recording|audio|video|can you hear|technical)\b', lowered)),
        # Greetings without substance.
        bool(re.match(r'^(good morning|hello|hi|hey)[.!?,]?$', lowered)),
        # Password announcements without context.
        'password is' in lowered and length < 50,
        # Navigation/computer interaction.
        bool(re.search(r'\b(scroll down|click|modules|canvas|let me see|where is it)\b', lowered)) and length < 60,
        # Incomplete fragments trailing off in an ellipsis.
        content.strip().endswith('…') and length < 30,
        bool(re.match(r'^(and|but|or|so|well|like)\W*$', lowered)),
        # Cold-calling logistics ("let's take...", "let me call on...").
        bool(re.search(r"(let's|let me) (take|go to|call on)", lowered)) and length < 40,
    )
    return any(checks)


def should_keep_entry(entry: Dict) -> bool:
    """
    Determine if an entry should be kept based on content.

    An entry survives when it is not extraneous AND it either mentions a
    substantive legal topic, is long enough to be explanatory teaching
    content, or is a substantive (non-administrative) question.

    Args:
        entry: Transcript entry dict with 'content' and 'speaker' keys.

    Returns:
        True if entry should be kept.
    """
    content = entry.get('content', '').strip()
    speaker = entry.get('speaker', '').strip()

    # Drop filler/administrative chatter outright.
    if is_extraneous(content, speaker):
        return False

    # Keyword indicators of substantive constitutional law discussion.
    # NOTE(review): 'reich' looks like it targets Zoom's transcription of
    # Gonzales v. Raich — confirm against the source transcripts.
    substantive_indicators = [
        'commerce clause', 'tenth amendment', 'constitution', 'supreme court',
        'congress', 'federal', 'state', 'statute', 'case', 'holding',
        'wickard', 'filburn', 'darby', 'lopez', 'morrison', 'gonzalez',
        'reich', 'perez', 'heart of atlanta', 'mcculloch', 'gibbons',
        'necessary and proper', 'enumerated powers', 'interstate commerce',
        'aggregation', 'substantial effect', 'economic activity',
        'regulatory', 'obamacare', 'affordable care act', 'medicaid',
        'individual mandate', 'taxing power', 'spending clause',
        'dormant commerce', 'anti-discrimination', 'jurisdictional',
        'constitutional', 'unconstitutional', 'justice', 'opinion',
        'dissent', 'majority', 'hobbs act', 'controlled substance'
    ]

    content_lower = content.lower()
    if any(indicator in content_lower for indicator in substantive_indicators):
        return True

    # Keep longer explanatory content (likely teaching). A redundant
    # second is_extraneous() call was removed here: it already returned
    # False above for the same arguments, so the check was always True.
    if len(content) > 100:
        return True

    # Keep questions that seem substantive...
    if content.endswith('?') and len(content) > 40:
        # ...but not location/administrative questions.
        if not re.search(r'\b(where are you|where is|who is|anyone here)\b', content_lower):
            return True

    return False


def filter_entries(entries: List[Dict]) -> List[Dict]:
    """
    Filter entries to keep only constitutional law content.

    Args:
        entries: List of all entries.

    Returns:
        New list containing only the entries accepted by should_keep_entry.
    """
    return list(filter(should_keep_entry, entries))


# ============================================================================
# STEP 3: MARKDOWN GENERATION
# ============================================================================

def extract_date_from_filename(filename: str) -> str:
    """
    Extract a human-readable date from a Zoom GMT-prefixed filename.

    Example: GMT20260128-162725_Recording.transcript.vtt -> January 28, 2026

    Args:
        filename: The source filename.

    Returns:
        The formatted date, or the filename unchanged when no valid
        GMT-date prefix is present.
    """
    match = re.match(r'GMT(\d{4})(\d{2})(\d{2})-', filename)
    if match:
        year, month, day = match.groups()
        try:
            date_obj = datetime(int(year), int(month), int(day))
            return date_obj.strftime("%B %d, %Y")
        except ValueError:
            # Digits matched but don't form a real date (e.g. month 13);
            # bare `except:` would also have swallowed KeyboardInterrupt.
            return filename
    return filename


def entries_to_markdown(entries: List[Dict], title: str = "Constitutional Law Transcript") -> str:
    """
    Render filtered entries as Markdown, grouped into sessions by source
    file and emitted in chronological order (GMT-prefixed filenames sort
    lexicographically by date).

    Args:
        entries: List of filtered entries.
        title: Document title.

    Returns:
        Markdown formatted string.
    """
    # Bucket entries by the file they came from, preserving entry order.
    sessions: Dict[str, List[Dict]] = {}
    for entry in entries:
        sessions.setdefault(entry.get('source_file', 'Unknown'), []).append(entry)

    # Document header.
    parts = [
        f"# {title}\n",
        "*Extracted from Zoom class recordings*\n",
        "---\n",
    ]

    # One section per session, filenames sorted chronologically.
    for idx, source_file in enumerate(sorted(sessions)):
        if idx:
            # Horizontal rule between consecutive sessions.
            parts.append("\n---\n")

        date_str = extract_date_from_filename(source_file)
        parts.append(f"\n## Session: {date_str}\n")
        parts.append(f"*Source: {source_file}*\n\n")

        # Speaker + content only; timestamps are intentionally omitted.
        for entry in sessions[source_file]:
            text = entry.get('content', '').strip()
            if text:
                parts.append(f"**{entry.get('speaker', 'Unknown')}**: {text}\n\n")

    return ''.join(parts)


# ============================================================================
# MAIN WORKFLOW
# ============================================================================

def process_vtt_to_markdown(input_paths: List[str], output_file: str,
                           title: str = "Constitutional Law Transcript",
                           verbose: bool = True):
    """
    Complete workflow: VTT files -> filtered Markdown.

    Collects .vtt files from the given paths, parses them, filters out
    extraneous chatter, and writes a chronologically ordered Markdown
    transcript to `output_file`.

    Args:
        input_paths: List of VTT files or directories.
        output_file: Output Markdown file path.
        title: Document title.
        verbose: Print progress messages.

    Exits the process with status 1 when an input path does not exist or
    no VTT files are found.
    """
    # Step 1: Collect all VTT files
    vtt_files = []
    for input_path in input_paths:
        path = Path(input_path)

        if path.is_file():
            if path.suffix.lower() == '.vtt':
                vtt_files.append(path)
            else:
                print(f"Warning: {input_path} is not a .vtt file, skipping.", file=sys.stderr)
        elif path.is_dir():
            # Find all .vtt files in directory (both extension casings).
            dir_vtt_files = list(path.glob('*.vtt')) + list(path.glob('*.VTT'))
            if dir_vtt_files:
                vtt_files.extend(dir_vtt_files)
            else:
                print(f"Warning: No .vtt files found in {input_path}", file=sys.stderr)
        else:
            print(f"Error: {input_path} does not exist.", file=sys.stderr)
            sys.exit(1)

    if not vtt_files:
        print("Error: No VTT files to process.", file=sys.stderr)
        sys.exit(1)

    if verbose:
        print(f"Found {len(vtt_files)} VTT file(s) to process")

    # Step 2: Parse all VTT files
    all_entries = []
    for vtt_file in vtt_files:
        if verbose:
            print(f"Parsing: {vtt_file.name}")
        entries = parse_vtt_file(str(vtt_file), vtt_file.name)
        all_entries.extend(entries)
        if verbose:
            print(f"  → {len(entries)} entries")

    if verbose:
        print(f"\nTotal entries parsed: {len(all_entries)}")

    # Step 3: Filter entries
    if verbose:
        print("Filtering extraneous content...")
    filtered_entries = filter_entries(all_entries)

    if verbose:
        # Guard the percentage: files can parse to zero entries, and the
        # original division raised ZeroDivisionError in that case.
        if all_entries:
            kept_pct = len(filtered_entries) / len(all_entries) * 100
            print(f"Kept {len(filtered_entries)} entries ({kept_pct:.1f}%)")
        else:
            print("Kept 0 entries (no entries parsed)")
        print(f"Removed {len(all_entries) - len(filtered_entries)} entries")

    # Step 4: Generate Markdown
    if verbose:
        print("\nGenerating Markdown...")
    markdown_content = entries_to_markdown(filtered_entries, title)

    # Step 5: Write output
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(markdown_content)

    if verbose:
        print(f"\n✓ Markdown transcript written to: {output_file}")
        file_size = os.path.getsize(output_file) / 1024
        print(f"  File size: {file_size:.1f} KB")


def main():
    """Parse command-line arguments and run the VTT → Markdown workflow."""
    parser = argparse.ArgumentParser(
        description='End-to-end VTT to Markdown conversion with filtering.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Process a single file
  %(prog)s transcript.vtt output.md

  # Process multiple files
  %(prog)s file1.vtt file2.vtt file3.vtt output.md

  # Process all VTT files in a directory
  %(prog)s /path/to/directory/ output.md

  # With custom title
  %(prog)s --title "Module 1" transcripts/ Module1.md
        """
    )
    parser.add_argument(
        'inputs',
        nargs='+',
        help='VTT file(s) or directory containing VTT files, followed by output filename'
    )
    parser.add_argument(
        '--title',
        default='Constitutional Law Transcript',
        help='Document title (default: Constitutional Law Transcript)'
    )
    parser.add_argument(
        '-q', '--quiet',
        action='store_true',
        help='Suppress progress messages'
    )
    args = parser.parse_args()

    # The final positional argument names the output file; everything
    # before it is an input path, so at least two positionals are required.
    if len(args.inputs) < 2:
        print("Error: Must provide at least one input and one output file.", file=sys.stderr)
        sys.exit(1)

    *input_paths, output_file = args.inputs

    # Warn (but proceed) when the output lacks the conventional extension.
    if not output_file.endswith('.md'):
        print("Warning: Output file doesn't have .md extension", file=sys.stderr)

    process_vtt_to_markdown(
        input_paths=input_paths,
        output_file=output_file,
        title=args.title,
        verbose=not args.quiet
    )


# Script entry point: delegate to main() so the module can also be imported.
if __name__ == '__main__':
    main()
