#!/usr/bin/env python3
"""
5QLN Training Data Converter

Converts existing decode files (1-10, 11-20, 21-30, 31-38) to JSONL format
for fine-tuning language models on 5QLN.

Usage:
    python convert_decodes_to_jsonl.py

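Output:
    5QLN_ENCODE_ALL.jsonl - All 38 encode examples
    5QLN_TRAINING_COMPLETE.jsonl - The encode examples combined with the
        other 5QLN datasets present in the working directory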
"""

import json
import re
from pathlib import Path

def parse_decode_file(filepath):
    """Read one decode file and return its INPUT/OUTPUT training pairs.

    The file text is split on ``## DECODE <n>`` headings; whatever precedes
    the first heading is discarded. A section contributes a pair only when
    it contains both a double-quoted ``**INPUT**:`` value and a
    triple-backtick fenced block; other sections are silently skipped.

    Returns a list of dicts with ``instruction``, ``input`` and ``output``
    keys, in the order the sections appear in the file.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        text = handle.read()

    def build_pair(section):
        # A section without a quoted INPUT value yields nothing.
        quoted = re.search(r'\*\*INPUT\*\*:\s*"([^"]+)"', section)
        if quoted is None:
            return None

        # Prefer a fence that opens with a DECODE: label (the label itself is
        # left out of the capture); otherwise take the first fenced block
        # as-is. Match objects are always truthy, so `or` only falls through
        # when the first search found nothing.
        fenced = (
            re.search(r'```\s*\nDECODE:(.*?)```', section, re.DOTALL)
            or re.search(r'```(.*?)```', section, re.DOTALL)
        )
        if fenced is None:
            return None

        return {
            "instruction": "Decode this FCF talk into 5QLN structure. Extract the phases (S, G, Q, P, V), their equations, and the blooms (X, Y, Z, A, B).",
            "input": quoted.group(1),
            "output": fenced.group(1).strip(),
        }

    # Element 0 of the split is the preamble before the first heading.
    sections = re.split(r'## DECODE \d+', text)[1:]
    candidates = (build_pair(section) for section in sections)
    return [pair for pair in candidates if pair is not None]

def convert_all_files(files=None, output_file="5QLN_ENCODE_ALL.jsonl"):
    """Convert decode files into a single JSONL file of training pairs.

    Args:
        files: Iterable of decode-file paths to read, in order. When None,
            the four default decode files are looked up relative to the
            current working directory.
        output_file: Path of the JSONL file to write. It is always
            (re)written, even when no pairs were extracted.

    Each existing file is parsed with ``parse_decode_file``; files that do
    not exist are reported on stdout and skipped. One JSON object is written
    per line, UTF-8 encoded with non-ASCII characters left unescaped.
    """
    if files is None:
        # Default inputs; pass `files` explicitly to use other paths.
        files = [
            "1-10-select_first_10_pieces-demonstrate_different_aspects.txt",
            "11-20-Processing_next_10_pieces_2.txt",
            "21-30-Processing_next_10_pieces_3.txt",
            "31-38-Processing_final_8_pieces.txt",
        ]

    all_pairs = []

    for filename in files:
        filepath = Path(filename)
        if not filepath.exists():
            print(f"File not found: {filename}")
            continue
        pairs = parse_decode_file(filepath)
        all_pairs.extend(pairs)
        print(f"Extracted {len(pairs)} pairs from {filename}")

    # Write JSONL: one pair per line.
    with open(output_file, 'w', encoding='utf-8') as f:
        for pair in all_pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + '\n')

    print(f"\nTotal: {len(all_pairs)} training pairs")
    print(f"Output: {output_file}")

def create_combined_training_file():
    """Combine all training datasets into a single JSONL file.

    Looks for each dataset listed below in the current working directory,
    loads one JSON object per non-blank line, and writes every record, in
    dataset order, to ``5QLN_TRAINING_COMPLETE.jsonl``. Datasets that do not
    exist are reported on stdout and skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    datasets = [
        "5QLN_GLOSSARY.jsonl",
        "5QLN_EXPANSION.jsonl",
        "5QLN_BEHAVIOR.jsonl",
        "5QLN_CORRUPTION.jsonl",
        "5QLN_ENCODE_ALL.jsonl",
    ]

    all_data = []

    for dataset in datasets:
        filepath = Path(dataset)
        if not filepath.exists():
            print(f"Not found: {dataset}")
            continue

        # Read as UTF-8 explicitly: the files this script produces are
        # written as UTF-8 with ensure_ascii=False, so relying on the
        # platform default encoding could misread them.
        with open(filepath, 'r', encoding='utf-8') as f:
            # Blank lines (e.g. a trailing empty line) carry no record and
            # would otherwise make json.loads raise.
            stripped_lines = (line.strip() for line in f)
            records = [json.loads(text) for text in stripped_lines if text]

        all_data.extend(records)
        print(f"Loaded {len(records)} examples from {dataset}")

    # Write combined
    output_file = "5QLN_TRAINING_COMPLETE.jsonl"
    with open(output_file, 'w', encoding='utf-8') as f:
        for item in all_data:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')

    print(f"\nTotal training examples: {len(all_data)}")
    print(f"Output: {output_file}")

if __name__ == "__main__":
    # Script entry point: run the two pipeline stages in order, announcing
    # each one, then print the follow-up instructions.
    print("=== 5QLN Training Data Converter ===\n")

    pipeline = (
        ("Step 1: Converting decode files...", convert_all_files),
        ("\nStep 2: Combining all datasets...", create_combined_training_file),
    )
    for banner, run_step in pipeline:
        print(banner)
        run_step()

    print("\n=== Complete ===")
    print("""
Next steps:
1. Copy all .jsonl files to training environment
2. Use Unsloth or Axolotl to fine-tune Gemma 2 4B
3. Train for 8-10 epochs
4. Evaluate on held-out examples
5. Quantize for deployment

See 5QLN_TRAINING_GUIDE.md for full instructions.
""")
