import datetime
import hashlib
import html
import random
import re
from collections import Counter
from pathlib import Path

# ===== Settings =====
OUT_DIR = Path("./custom_data/html")  # directory that receives the generated doc*.html files
TOTAL = 100  # target number of documents (base documents + exact duplicates)
SEED = 42  # seed for the module-level random generator
SHORT_RATE = 0.08  # probability that a base document is a one-line "short" note
EXACT_DUP_RATE = 0.05     # fraction of TOTAL produced as exact duplicates of "normal" documents
CODE_RATE = 0.15          # probability that a base document is a fenced code sample
EN_RATE = 0.35            # probability that a "normal" document uses an English template
URL_RATE = 0.40           # probability that a "normal" document gets a reference URL appended

# Seed before anything below draws random numbers, so the sequence of draws
# is repeatable for a given SEED.
random.seed(SEED)
# Create the output directory (and missing parents); an existing one is fine.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Prose templates for "normal" documents.  Each entry has a "title" and a
# "body"; both may contain str.format() placeholders such as {date} or
# {person}, which are filled from the dict returned by make_context().
# Line breaks inside a body are real newline characters ("\n"): the HTML
# writer turns newlines into <br>, whereas an escaped "\\n" would show up in
# the generated text as a literal backslash followed by "n".

# Korean templates.
ko_templates = [
    {
        "title": "서울 날씨 - {region}",
        "body": "오늘({date}) {region}은 {sky} {event}입니다. 기온은 {temp}도이며, {detail} 예상됩니다."
    },
    {
        "title": "팀 공지사항 - {dept}",
        "body": "안녕하세요 {dept} 팀원 여러분.\n\n{date}부터 {event}가 시행됩니다. 자세한 사항은 {place}에서 확인 부탁드립니다.\n담당자: {person}\n\n감사합니다."
    },
    {
        "title": "오늘의 레시피 - {dish}",
        "body": "{dish} 만들기 ({difficulty} 난이도)\n\n재료: {ingredient}, {ingredient2}\n조리법: {heat}에서 {mins}분간 조리하세요.\n팁: {tip}\n\n맛있게 드세요!"
    },
    {
        "title": "회의록 - {dept} {agenda}",
        "body": "일시: {date} {time}\n참석자: {person}, {person2}\n안건: {agenda}\n\n결정사항:\n- 다음 회의: {next_date}\n- 액션 아이템: {action}\n- 검토 필요: {review_item}"
    },
    {
        "title": "업무 메모 - {priority} 우선순위",
        "body": "{event} 관련하여 {deadline}까지 처리 필요.\n담당자: {person}\n우선순위: {priority}\n예상 소요시간: {duration}\n비고: {note}"
    },
    {
        "title": "프로젝트 현황 - {project}",
        "body": "프로젝트명: {project}\n진행률: {progress}%\n담당팀: {dept}\n마일스톤: {milestone}\n이슈사항: {issue}\n다음 단계: {next_step}"
    },
    {
        "title": "교육 안내 - {course}",
        "body": "{course} 교육 안내\n\n일정: {date} {time}\n장소: {place}\n대상: {target}\n신청방법: {method}\n문의: {person}"
    }
]

# English templates.
en_templates = [
    {
        "title": "Weather Update - {region}",
        "body": "Today ({date}) {region} weather: {sky} with {event}. Temperature: {temp}°C. {detail} expected."
    },
    {
        "title": "Weekly Report - {dept}",
        "body": "Department: {dept}\nWeek of {date}\n\n{metric} increased by {percent}%.\nTeam: {person}\nNext review: {next_date}\nAction items: {action}"
    },
    {
        "title": "System Maintenance - {service}",
        "body": "Scheduled maintenance on {date} at {time}.\nService: {service}\nExpected downtime: {mins} minutes.\nReason: {reason}\nContact: {person}\n\nThank you for your patience."
    },
    {
        "title": "Tutorial Guide - {topic}",
        "body": "{topic} Tutorial ({difficulty} level)\n\nStep 1: {step1}\nStep 2: {step2}\nStep 3: {step3}\n\nTips: {tip}\nFor more information, contact {person}."
    },
    {
        "title": "Project Update - {project}",
        "body": "Project: {project}\nStatus: {status} ({progress}%)\nDeadline: {deadline}\nTeam: {person}, {person2}\nMilestone: {milestone}\nNext steps: {next_step}"
    }
]

# Code-sample documents.  Each entry has a fixed "title" and a "body" that
# holds source text verbatim.  make_document() wraps the body in a Markdown
# code fence labelled with the lower-cased first word of the title.  Unlike
# the prose templates, these bodies are not passed through str.format(), so
# the braces inside them are ordinary characters.
code_templates = [
    {
        "title": "Python Data Processing",
        "body": """# Data processing utilities
import pandas as pd
import numpy as np

def process_data(filename):
    # Load CSV file into DataFrame
    df = pd.read_csv(filename)
    # Clean text columns
    df['text'] = df['text'].str.strip()
    # Remove empty rows
    df = df.dropna()
    return df

def clean_data(df):
    # Convert to lowercase for consistency
    df['processed'] = df['text'].str.lower()
    return df

# Usage example
data = process_data('input.csv')
cleaned = clean_data(data)
print(f"Loaded {len(data)} records")"""
    },
    {
        "title": "SQL Query Example", 
        "body": """-- User activity analysis query
-- This query finds high-value customers
SELECT 
    user_id,
    COUNT(*) as total_orders,
    SUM(amount) as total_spent,
    AVG(amount) as avg_order
FROM orders 
WHERE created_at >= '2024-01-01'
  AND status = 'completed'  -- Only completed orders
GROUP BY user_id
HAVING total_orders > 5     -- Filter active users
ORDER BY total_spent DESC
LIMIT 100;                  -- Top 100 customers"""
    },
    {
        "title": "JavaScript Data Handler",
        "body": """// Utility functions for data analysis
function calculateMetrics(data) {
    // Basic statistical calculations
    const total = data.length;
    const sum = data.reduce((a, b) => a + b, 0);
    const average = sum / total;
    
    return {
        count: total,
        sum: sum,
        average: Math.round(average * 100) / 100,  // Round to 2 decimals
        max: Math.max(...data),
        min: Math.min(...data)
    };
}

// Example usage
const results = calculateMetrics([1, 2, 3, 4, 5]);
console.log("Results:", results);"""
    },
    {
        "title": "Python ML Pipeline",
        "body": """# Machine learning pipeline example
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def train_model(X, y):
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    # Initialize and train model
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X_train, y_train)
    
    # Make predictions and calculate accuracy
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    
    return model, accuracy

# Usage example
model, acc = train_model(features, labels)
print(f"Model accuracy: {acc:.3f}")"""
    },
    {
        "title": "Python Web Scraping",
        "body": """# Web scraping utilities
import requests
from bs4 import BeautifulSoup
import time

def scrape_data(url):
    # Set headers to avoid blocking
    headers = {'User-Agent': 'Mozilla/5.0 (compatible)'}
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        # Parse HTML content
        soup = BeautifulSoup(response.content, 'html.parser')
        titles = soup.find_all('h1')
        return [title.text.strip() for title in titles]
    
    return []

def batch_scrape(urls):
    # Scrape multiple URLs with delay
    results = []
    for url in urls:
        data = scrape_data(url)
        results.extend(data)
        time.sleep(1)  # Rate limiting
    return results"""
    },
    {
        "title": "Database Connection Helper",
        "body": """# Database management utilities
import sqlite3
import pandas as pd

class DatabaseManager:
    def __init__(self, db_path):
        # Initialize database connection
        self.db_path = db_path
        self.connection = None
    
    def connect(self):
        # Establish database connection
        self.connection = sqlite3.connect(self.db_path)
        return self.connection
    
    def execute_query(self, query, params=None):
        # Execute SQL query with optional parameters
        cursor = self.connection.cursor()
        if params:
            cursor.execute(query, params)
        else:
            cursor.execute(query)
        return cursor.fetchall()
    
    def close(self):
        # Close database connection
        if self.connection:
            self.connection.close()

# Usage example
db = DatabaseManager('data.db')
db.connect()"""
    },
    {
        "title": "React Component Example",
        "body": """// React data table component
import React, { useState, useEffect } from 'react';

const DataTable = ({ apiUrl }) => {
    // State management for component
    const [data, setData] = useState([]);
    const [loading, setLoading] = useState(true);
    const [error, setError] = useState(null);

    useEffect(() => {
        // Async function to fetch data from API
        const fetchData = async () => {
            try {
                const response = await fetch(apiUrl);
                const result = await response.json();
                setData(result);
                setLoading(false);
            } catch (err) {
                // Handle fetch errors
                setError(err.message);
                setLoading(false);
            }
        };

        fetchData();
    }, [apiUrl]);

    // Render loading state
    if (loading) return <div>Loading...</div>;
    // Render error state  
    if (error) return <div>Error: {error}</div>;

    // Render data table
    return (
        <table>
            <tbody>
                {data.map((item, index) => (
                    <tr key={index}>
                        <td>{item.name}</td>
                        <td>{item.value}</td>
                    </tr>
                ))}
            </tbody>
        </table>
    );
};

export default DataTable;"""
    },
    {
        "title": "Python API Client",
        "body": """# HTTP API client implementation  
import requests
import json
from typing import Dict, List, Optional

class APIClient:
    def __init__(self, base_url: str, api_key: str):
        # Initialize API client with credentials
        self.base_url = base_url.rstrip('/')
        self.api_key = api_key
        self.session = requests.Session()
        # Set default headers
        self.session.headers.update({
            'Authorization': f'Bearer {api_key}',
            'Content-Type': 'application/json'
        })
    
    def get(self, endpoint: str, params: Optional[Dict] = None) -> Dict:
        # Make GET request to API endpoint
        url = f"{self.base_url}/{endpoint}"
        response = self.session.get(url, params=params)
        response.raise_for_status()
        return response.json()
    
    def post(self, endpoint: str, data: Dict) -> Dict:
        # Make POST request with JSON data
        url = f"{self.base_url}/{endpoint}"
        response = self.session.post(url, json=data)
        response.raise_for_status()
        return response.json()

# Usage example
client = APIClient('https://api.example.com', 'your-api-key')
result = client.get('users', {'page': 1, 'limit': 10})"""
    }
]

# Value pools for template placeholders.  make_context() draws one entry from
# each list with random.choice() every time it builds a context.  Korean and
# English entries are mixed inside some pools (people, difficulties).
skies = ["맑고", "흐리고", "비가 내리고", "눈이 오고"]
events = ["강한 바람", "가끔 비", "약한 비", "짙은 안개", "뇌우 가능성"]
dishes = ["김치볶음밥", "된장찌개", "불고기", "비빔냉면", "떡볶이"]
agendas = ["분기 예산 검토", "신규 프로젝트 론칭", "팀 구조 개편", "시스템 업그레이드"]
metrics = ["user engagement", "conversion rate", "page views", "revenue", "sign-ups"]
topics = ["Machine Learning", "Data Pipeline", "API Development", "Database Design"]
services = ["web server", "database", "API gateway", "file storage"]
statuses = ["In Progress", "Under Review", "Completed", "Delayed"]
people = ["김철수", "이영희", "박민수", "John Smith", "Sarah Johnson", "최지영", "정민호", "Anderson"]
priorities = ["높음", "보통", "낮음"]
regions = ["서울", "부산", "대구", "인천", "광주", "대전", "울산", "세종", "경기", "강원"]
depts = ["개발팀", "마케팅팀", "영업팀", "인사팀", "재무팀", "기획팀", "운영팀", "QA팀"]
# Numeric-looking pools (temps, progresses) are kept as strings.
temps = ["-5", "0", "5", "10", "15", "20", "25", "30", "35"]
difficulties = ["초급", "중급", "고급", "전문가", "beginner", "intermediate", "advanced", "expert"]
ingredients = ["김치", "두부", "소고기", "돼지고기", "닭고기", "생선", "야채", "버섯"]
ingredients2 = ["양파", "마늘", "생강", "고추", "당근", "감자", "배추", "무"]
tips = ["센 불에서 빠르게", "약한 불에서 천천히", "뚜껑을 덮고", "자주 저어주세요", "소금간을 맞춰"]
actions = ["문서 검토", "테스트 수행", "배포 준비", "고객 미팅", "코드 리뷰", "설계 검토"]
review_items = ["요구사항 문서", "기술 스펙", "테스트 계획", "배포 가이드", "사용자 매뉴얼"]
durations = ["1시간", "반나절", "1일", "2-3일", "1주일"]
notes = ["긴급", "중요", "검토 필요", "승인 대기", "진행 중"]
projects = ["웹사이트 리뉴얼", "모바일 앱 개발", "데이터 마이그레이션", "시스템 업그레이드", "AI 도입"]
progresses = ["10", "25", "50", "75", "90"]
milestones = ["기획 완료", "개발 완료", "테스트 완료", "배포 완료", "운영 안정화"]
issues = ["일정 지연", "리소스 부족", "기술적 이슈", "요구사항 변경", "없음"]
next_steps = ["상세 설계", "개발 시작", "테스트 수행", "사용자 교육", "성과 측정"]
courses = ["파이썬 기초", "데이터 분석", "머신러닝", "웹 개발", "프로젝트 관리"]
targets = ["신입사원", "경력직", "팀장급", "전 직원", "개발자"]
methods = ["온라인 신청", "이메일 접수", "전화 접수", "방문 접수"]
reasons = ["보안 패치", "성능 향상", "기능 추가", "버그 수정", "하드웨어 교체"]
step3s = ["Configure settings", "Test connection", "Deploy application", "Monitor performance", "Create backup"]

# Reference links.  make_document() appends one randomly chosen entry to a
# "normal" document with probability URL_RATE.
url_pool = [
    "https://docs.python.org/3/tutorial/",
    "https://pandas.pydata.org/docs/user_guide/",
    "https://github.com/nvidia/NeMo-Curator",
    "https://huggingface.co/docs",
    "https://arxiv.org/abs/2024.12345",
    "https://medium.com/@author/data-processing",
]

def rand_date():
    """Return a random ISO-8601 date within 30 days of 2025-09-02.

    Draws a single integer offset in [-30, 30] from the module-level
    ``random`` generator and applies it to the fixed anchor date.

    Returns:
        str: The shifted date formatted as ``YYYY-MM-DD``.
    """
    anchor = datetime.date(2025, 9, 2)
    offset_days = random.randint(-30, 30)
    shifted = datetime.date.fromordinal(anchor.toordinal() + offset_days)
    return shifted.isoformat()

def rand_deadline(base=None):
    """Return an ISO-8601 deadline 1 to 10 days after *base*.

    Args:
        base: ``datetime.date`` the random offset is added to.  Defaults to
            the fixed reference date 2025-09-02, the same anchor
            ``rand_date`` uses.  Anchoring on the wall-clock date instead
            would make a seeded run produce different text (and therefore
            different hash-based IDs) depending on the day it is executed;
            pass ``datetime.date.today()`` explicitly if that is wanted.

    Returns:
        str: The deadline formatted as ``YYYY-MM-DD``.
    """
    if base is None:
        base = datetime.date(2025, 9, 2)
    delta = datetime.timedelta(days=random.randint(1, 10))
    return (base + delta).isoformat()

def make_id(text, doc_type):
    """Derive a stable document ID from the document text.

    The ID is ``auto_<first 16 hex digits of the MD5 of the UTF-8 text>_<doc_type>``,
    so identical text and type always map to the same ID.

    Args:
        text: Full document text to hash.
        doc_type: Type label appended as the ID suffix.

    Returns:
        str: The assembled ID.
    """
    digest = hashlib.md5(text.encode("utf-8")).hexdigest()
    return "_".join(("auto", digest[:16], doc_type))

def make_context():
    """Draw one random value for every placeholder used by the text templates.

    Each call consumes numbers from the module-level ``random`` generator, so
    the result is repeatable only for a fixed seed and call order.

    Returns:
        dict: Placeholder name -> chosen value.  ``"mins"`` and ``"percent"``
        are ints; every other value is a string.  ``"person2"`` differs from
        ``"person"`` whenever the ``people`` pool offers another name.
    """
    # Fix the first person before choosing the second.  Filtering against a
    # fresh random.choice() per element does not exclude the first person, so
    # the same name could appear twice in "참석자"/"Team" lines.
    person = random.choice(people)
    other_people = [p for p in people if p != person]
    person2 = random.choice(other_people) if other_people else person

    return {
        "date": rand_date(),
        "next_date": rand_date(),
        "deadline": rand_deadline(),
        "time": random.choice(["09:00", "10:30", "14:00", "15:30", "16:00", "19:00"]),
        "place": random.choice(["회의실 A", "회의실 B", "온라인 회의", "본관 2층", "카페테리아", "교육장"]),
        "dish": random.choice(dishes),
        "ingredient": random.choice(ingredients),
        "ingredient2": random.choice(ingredients2),
        "heat": random.choice(["약불", "중불", "강불"]),
        "mins": random.choice([3, 5, 10, 15, 20, 25, 30]),
        "sky": random.choice(skies),
        "event": random.choice(events),
        "agenda": random.choice(agendas),
        "metric": random.choice(metrics),
        "percent": random.choice([5, 8, 12, 15, 18, 22, 25, 31, 45]),
        "topic": random.choice(topics),
        # The first two tutorial steps are fixed; only the third varies.
        "step1": "Install required packages",
        "step2": "Configure environment variables",
        "step3": random.choice(step3s),
        "service": random.choice(services),
        "status": random.choice(statuses),
        "person": person,
        "person2": person2,
        "priority": random.choice(priorities),
        "region": random.choice(regions),
        "dept": random.choice(depts),
        "temp": random.choice(temps),
        "difficulty": random.choice(difficulties),
        "tip": random.choice(tips),
        "action": random.choice(actions),
        "review_item": random.choice(review_items),
        "duration": random.choice(durations),
        "note": random.choice(notes),
        "project": random.choice(projects),
        "progress": random.choice(progresses),
        "milestone": random.choice(milestones),
        "issue": random.choice(issues),
        "next_step": random.choice(next_steps),
        "course": random.choice(courses),
        "target": random.choice(targets),
        "method": random.choice(methods),
        "reason": random.choice(reasons),
        "detail": random.choice(["바람이 강하게", "습도가 높게", "맑은 하늘이", "구름이 많이"]),
    }

def make_document(doc_type="normal", base_content=None):
    """Build one synthetic document of the requested kind.

    Args:
        doc_type: ``"code"`` (a fenced code sample), ``"short"`` (a one-line
            status note), ``"exact_dup"`` (see ``base_content``) or
            ``"normal"`` (templated Korean/English prose).  Any other value,
            and ``"exact_dup"`` without ``base_content``, is generated like
            ``"normal"`` but keeps the given label in the result.
        base_content: Existing document handed back when ``doc_type`` is
            ``"exact_dup"``.  Ignored for every other type.

    Returns:
        dict: Keys ``"text"``, ``"is_english"``, ``"doc_type"`` and ``"id"``.
        For ``"exact_dup"`` with a truthy ``base_content`` that very object
        is returned unchanged (no copy is made).
    """
    if doc_type == "exact_dup" and base_content:
        return base_content

    if doc_type == "code":
        template = random.choice(code_templates)
        title = template["title"]
        # Wrap the sample in a code fence labelled with the first word of the
        # title; the fence lines also guarantee at least three lines of text.
        fence_label = title.lower().split()[0]
        body = f"```{fence_label}\n{template['body']}\n```"
        is_english = True

    elif doc_type == "short":
        title = random.choice(["알림", "공지", "메모", "Notice", "Update"])
        body = random.choice([
            "확인 완료", "처리됨", "승인", "완료",
            "Confirmed", "Done", "Approved", "Complete",
            "OK", "적용됨", "대기중", "처리중"
        ])
        body += f" #{random.randint(1, 1000)}"  # tag that makes short notes less likely to collide
        # Language is judged from the body only: pure ASCII counts as English.
        is_english = not any(ord(c) > 127 for c in body)

    else:  # normal
        # Only prose templates have placeholders, so the context is built here
        # rather than for every document type.
        ctx = make_context()
        is_english = random.random() < EN_RATE
        template = random.choice(en_templates if is_english else ko_templates)
        # Titles carry placeholders ({region}, {dept}, ...) just like bodies;
        # leaving them unformatted would leak the raw braces into the output.
        title = template["title"].format(**ctx)
        body = template["body"].format(**ctx)

        # Strip tokens and symbols that would make prose look like code.
        body = re.sub(r"\b(def|class|SELECT|function)\b", "", body)
        body = body.replace("{", "(").replace("}", ")").replace(";", ".")

        # Optionally append a reference link.
        if random.random() < URL_RATE:
            url = random.choice(url_pool)
            body += f"\n\nReference: {url}" if is_english else f"\n\n참고링크: {url}"

    full_text = f"{title}\n\n{body}" if title != body else body
    return {
        "text": full_text,
        "is_english": is_english,
        "doc_type": doc_type,
        # Hash-based ID: identical text and type always yield the same ID.
        "id": make_id(full_text, doc_type),
    }

    
# ---- Document generation ----
all_documents = []
base_documents = []  # "normal" documents; the only ones eligible as duplicate sources

print("🏗️  Creating document...")

# 1. Base documents (normal + code + short).  The base count is derived from
#    the duplicate count so the two always add up to TOTAL; truncating both
#    products independently can leave the sum short of it.
exact_dup_count = int(TOTAL * EXACT_DUP_RATE)
base_count = TOTAL - exact_dup_count
for _ in range(base_count):
    roll = random.random()
    if roll < CODE_RATE:
        doc_type = "code"
    elif roll < CODE_RATE + SHORT_RATE:
        doc_type = "short"
    else:
        doc_type = "normal"

    doc = make_document(doc_type)
    all_documents.append(doc)
    if doc_type == "normal":
        base_documents.append(doc)

# 2. Exact duplicates: shallow copies of random "normal" documents with the
#    type relabelled.  Skipped entirely when no "normal" document exists.
if base_documents:
    for _ in range(exact_dup_count):
        dup_doc = random.choice(base_documents).copy()
        dup_doc["doc_type"] = "exact_dup"
        all_documents.append(dup_doc)

random.shuffle(all_documents)
# Number of documents actually produced; can be below TOTAL when duplicates
# had to be skipped, so every report below uses this instead of TOTAL.
doc_count = len(all_documents)

print("💾 Saving HTML file...")

# Write one HTML file per document.
for i, doc in enumerate(all_documents, 1):
    source_id = f"doc{i:04d}"
    # Escape &, < and > first so markup-like text (e.g. the JSX tags in the
    # React sample) stays visible content instead of being parsed as HTML,
    # then turn line breaks into <br>.
    text_html = (
        html.escape(doc["text"], quote=False)
        .replace("\r", "")
        .replace("\n", "<br>")
    )
    html_content = f"""<!DOCTYPE html>
<html><head>
<meta charset="UTF-8"><meta name="source_id" content="{source_id}">
</head><body><div class="content">
{text_html}
</div></body></html>"""
    (OUT_DIR / f"{source_id}.html").write_text(html_content, encoding="utf-8")

print(f"✅ {doc_count} HTML files created!")
print(f"📁 Files saved to: {OUT_DIR}")

# ---- Statistics ----
doc_types = [doc["doc_type"] for doc in all_documents]
languages = ["EN" if doc["is_english"] else "KO" for doc in all_documents]
type_counts = Counter(doc_types)
language_counts = Counter(languages)
# Counts any document whose text contains "http", including code samples.
has_urls = sum(1 for doc in all_documents if "http" in doc["text"])
# Guard the ratios so an empty run reports zeros instead of raising.
url_percent = has_urls / doc_count * 100 if doc_count else 0.0
avg_length = sum(len(doc["text"]) for doc in all_documents) / doc_count if doc_count else 0.0

print("\n📊 Dataset Statistics:")
print(f"- document type: {dict(type_counts)}")
print(f"- language: {dict(language_counts)}")
print(f"- Include URL: {has_urls}개 ({url_percent:.1f}%)")
print(f"- Average length: {avg_length:.0f}자")

if doc_count:
    print(f"   HTML files: doc0001.html ~ doc{doc_count:04d}.html")