#!/usr/bin/env python3
"""
CLI tool to load CSV data into ClickHouse from dataset configurations.

Automatically discovers all config/dataset_*.yaml files and loads the corresponding CSV data.

Usage:
    python load_csv_to_clickhouse.py                    # Load all datasets
    python load_csv_to_clickhouse.py --dataset timeline # Load specific dataset
    python load_csv_to_clickhouse.py --dry-run          # Show what would be done
    python load_csv_to_clickhouse.py --drop-existing    # Drop and recreate tables
"""

import argparse
import sys
import os
from pathlib import Path
from typing import Dict, Any, List, Optional, Tuple
import logging

import yaml
import pandas as pd
import clickhouse_connect
from dotenv import load_dotenv
from rich.console import Console
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, TextColumn

# Setup paths
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

console = Console()


def load_env() -> Dict[str, Any]:
    """Load ClickHouse settings from .env file."""
    env_path = PROJECT_ROOT / '.env'
    load_dotenv(env_path)

    # Determine the protocol: use CLICKHOUSE_INTERFACE when it is set,
    # otherwise infer it from the port: 8123 = http, 9000/9440 = native
    port = int(os.getenv('CLICKHOUSE_PORT', 8123))
    interface = os.getenv('CLICKHOUSE_INTERFACE')

    if not interface:
        # Auto-detect the protocol from the port
        if port == 8123:
            interface = 'http'
        elif port in (9000, 9440):
            interface = 'native'
        else:
            interface = 'http'  # Default to HTTP

    return {
        'host': os.getenv('CLICKHOUSE_HOST', 'localhost'),
        'port': port,
        'user': os.getenv('CLICKHOUSE_USER', 'default'),
        'password': os.getenv('CLICKHOUSE_PASSWORD', ''),
        'database': os.getenv('CLICKHOUSE_DATABASE', 'default'),
        'interface': interface
    }
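
# A minimal .env sketch for local development (placeholder values, using the
# variable names read above; CLICKHOUSE_INTERFACE is optional):
#
#   CLICKHOUSE_HOST=localhost
#   CLICKHOUSE_PORT=8123        # 8123 -> http, 9000/9440 -> native
#   CLICKHOUSE_USER=default
#   CLICKHOUSE_PASSWORD=
#   CLICKHOUSE_DATABASE=default
#   # CLICKHOUSE_INTERFACE=http  # uncomment to override port-based detection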


def discover_datasets(config_dir: Path) -> List[Path]:
    """Find all dataset_*.yaml files in config directory."""
    pattern = 'dataset_*.yaml'
    datasets = list(config_dir.glob(pattern))
    logger.info(f"Found {len(datasets)} dataset configuration files")
    return sorted(datasets)


def load_dataset_config(yaml_path: Path) -> Dict[str, Any]:
    """Load dataset configuration from YAML file."""
    with open(yaml_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    return config.get('dataset', {})


def get_clickhouse_client(config: Dict[str, Any]):
    """Create ClickHouse client connection."""
    try:
        interface = config['interface']

        client = clickhouse_connect.get_client(
            host=config['host'],
            port=config['port'],
            username=config['user'],
            password=config['password'],
            database=config['database'],
            interface=interface
        )

        logger.info(f"✅ Connected to ClickHouse ({interface}): {config['host']}:{config['port']}/{config['database']}")
        return client
    except Exception as e:
        logger.error(f"❌ Failed to connect to ClickHouse: {e}")
        raise


def clean_numeric_column(series: pd.Series) -> pd.Series:
    """
    Convert European number format (1.234,56) to standard format (1234.56).
    Handles both dot and comma as decimal separators.
    """
    def clean_value(val):
        if pd.isna(val):
            return None
        val_str = str(val).strip()

        # Check if it's already a number
        try:
            return float(val_str)
        except ValueError:
            pass

        # European format: 1.234,56 -> 1234.56
        if ',' in val_str and '.' in val_str:
            # Remove thousands separator (.), replace comma with dot
            val_str = val_str.replace('.', '').replace(',', '.')
        elif ',' in val_str:
            # Only comma, assume it's decimal separator
            val_str = val_str.replace(',', '.')

        try:
            return float(val_str)
        except ValueError:
            return None

    return series.apply(clean_value)
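
# Examples of what clean_numeric_column yields under the rules above:
#
#   "1.234,56" -> 1234.56   (European format: "." as thousands, "," as decimal)
#   "12,5"     -> 12.5      (lone comma treated as the decimal separator)
#   "1234.56"  -> 1234.56   (already parseable, returned unchanged)
#   "n/a"      -> None      (unparseable values become NULL)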


def infer_clickhouse_type(dtype, col_name: str, sample_values: pd.Series) -> str:
    """Map pandas dtype to ClickHouse type with better inference."""
    # Check for numeric patterns in column name
    is_numeric_name = any(keyword in col_name.lower() for keyword in [
        'sum', 'fsum', 'count', 'total', 'volume', 'amount', 'qty', 'quantity',
        'usd', 'price', 'revenue', 'cost', '9l', 'cs'
    ])

    # Integer types
    if 'int' in str(dtype):
        if len(sample_values) > 0:
            min_val, max_val = int(sample_values.min()), int(sample_values.max())
        else:
            min_val, max_val = 0, 0
        # Pick the narrowest signed type that fits both ends of the observed range
        if min_val >= -2**31 and max_val < 2**31:
            return 'Int32'
        elif max_val < 2**63:
            return 'Int64'
        else:
            return 'UInt64'

    # Float types or numeric column names with string data
    elif 'float' in str(dtype) or (is_numeric_name and str(dtype) == 'object'):
        # Use Decimal for financial data
        if any(keyword in col_name.lower() for keyword in ['usd', 'price', 'revenue', 'cost']):
            return 'Decimal(18, 3)'
        return 'Float64'

    # Boolean
    elif 'bool' in str(dtype):
        return 'Bool'

    # Date/DateTime
    elif 'datetime' in str(dtype):
        return 'DateTime'

    # String (default)
    else:
        # Check if it's a dimension/key (low cardinality)
        if any(keyword in col_name.lower() for keyword in ['key_', 'name', 'status', 'type', 'brand', 'category']):
            return 'LowCardinality(String)'
        return 'String'
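
# Rough examples of the mapping above (not an exhaustive type matrix):
#
#   int64 column with values inside +/-2**31   -> Int32
#   float64 column named "net_revenue_usd"     -> Decimal(18, 3)   (financial keyword)
#   object column named "brand"                -> LowCardinality(String)
#   datetime64[ns] column                      -> DateTime
#   everything else                            -> String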


def create_table_from_schema_file(
    client,
    table_name: str,
    schema_path: Path,
    drop_existing: bool = False
) -> Tuple[List[str], bool]:
    """
    Create table(s) from SQL schema file.

    Schema file may contain:
    - CREATE TABLE statements (for staging and/or final tables)
    - CREATE MATERIALIZED VIEW (for automatic migration via ClickHouse)

    Returns: (List of created table names, has_materialized_view)
    """
    try:
        logger.info(f"Reading SQL schema from {schema_path}")
        with open(schema_path, 'r', encoding='utf-8') as f:
            sql_content = f.read()

        # Split by semicolons to get individual statements
        statements = [s.strip() for s in sql_content.split(';') if s.strip()]

        if drop_existing:
            # Drop tables and views if requested
            drop_statements = [
                f"DROP VIEW IF EXISTS {table_name}_mv",  # Drop materialized view first
                f"DROP TABLE IF EXISTS {table_name}",
                f"DROP TABLE IF EXISTS {table_name}_raw"
            ]
            for drop_sql in drop_statements:
                try:
                    client.command(drop_sql)
                    logger.info(f"✅ Dropped: {drop_sql}")
                except Exception as e:
                    logger.debug(f"Drop error (may not exist): {e}")

        # Execute CREATE statements
        created_tables = []
        has_materialized_view = False

        for i, sql in enumerate(statements):
            # Remove leading/trailing whitespace but keep the statement intact
            sql = sql.strip()
            if not sql:
                continue

            # Normalize case for the keyword checks below (comments and fragments fall through)
            sql_upper = sql.upper()

            # Check if this is a CREATE MATERIALIZED VIEW statement
            if 'CREATE MATERIALIZED VIEW' in sql_upper:
                mv_idx = sql_upper.find('CREATE MATERIALIZED VIEW')
                mv_statement = sql[mv_idx:]
                first_line = mv_statement.split('\n')[0].strip()

                try:
                    logger.info(f"🔄 Creating Materialized View for auto-migration")
                    logger.debug(f"SQL: {first_line[:150]}...")

                    client.command(mv_statement)
                    has_materialized_view = True
                    logger.info(f"✅ Created Materialized View: ClickHouse will auto-migrate data on INSERT")
                except Exception as e:
                    logger.error(f"❌ Failed to create Materialized View: {e}")
                    logger.debug(f"Failed SQL: {sql}")
                    logger.warning(f"⚠️ Migration will not work without Materialized View")

            # Check if this statement contains CREATE TABLE
            elif 'CREATE TABLE' in sql_upper:
                # Extract the part starting from CREATE TABLE
                create_idx = sql_upper.find('CREATE TABLE')
                create_statement = sql[create_idx:]

                # Get the first line of the CREATE statement for table name
                first_code_line = create_statement.split('\n')[0].strip()

                try:
                    logger.info(f"Executing CREATE TABLE statement {i+1}/{len(statements)}")
                    logger.debug(f"SQL: {first_code_line[:150]}...")

                    # Use only the part starting from CREATE TABLE (remove any garbage before it)
                    client.command(create_statement)

                    # Extract table name from CREATE TABLE statement
                    if 'IF NOT EXISTS' in first_code_line.upper():
                        created_table = first_code_line.split('IF NOT EXISTS')[1].split('(')[0].strip()
                    else:
                        created_table = first_code_line.split('CREATE TABLE', 1)[1].split('(')[0].strip()

                    created_tables.append(created_table)
                    logger.info(f"✅ Created table: {created_table}")
                except Exception as e:
                    logger.error(f"❌ Failed to execute statement: {e}")
                    logger.debug(f"Failed SQL: {sql}")
                    return [], False

        if len(created_tables) == 0:
            logger.warning(f"⚠️ No CREATE TABLE statements found in {schema_path}")
            return [], False

        logger.info(f"✅ Created {len(created_tables)} table(s) from schema file: {schema_path}")
        return created_tables, has_materialized_view

    except Exception as e:
        logger.error(f"❌ Failed to create tables from schema: {e}", exc_info=True)
        return [], False
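
# Sketch of a schema file this function can execute. Table and column names are
# illustrative; only the ..._raw staging table and ..._mv view naming convention
# used by this script is assumed:
#
#   CREATE TABLE IF NOT EXISTS sales_timeline_raw
#   (
#       `date`   String,
#       `amount` String
#   )
#   ENGINE = MergeTree() ORDER BY tuple();
#
#   CREATE TABLE IF NOT EXISTS sales_timeline
#   (
#       `date`   Date,
#       `amount` Decimal(18, 3)
#   )
#   ENGINE = MergeTree() ORDER BY `date`;
#
#   CREATE MATERIALIZED VIEW IF NOT EXISTS sales_timeline_mv TO sales_timeline AS
#   SELECT
#       toDate(`date`)                                  AS `date`,
#       toDecimal64(replaceAll(`amount`, ',', '.'), 3)  AS `amount`
#   FROM sales_timeline_raw;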


def create_table_from_csv(
    client,
    table_name: str,
    df: pd.DataFrame,
    drop_existing: bool = False
) -> bool:
    """
    Auto-generate table schema from CSV DataFrame and create table.
    """
    try:
        if drop_existing:
            drop_sql = f"DROP TABLE IF EXISTS {table_name}"
            client.command(drop_sql)
            logger.info(f"Dropped existing table: {table_name}")

        # Build column definitions
        columns = []
        for col in df.columns:
            ch_type = infer_clickhouse_type(df[col].dtype, col, df[col])

            # Add Nullable if column has nulls
            if df[col].isna().any():
                if not ch_type.startswith('Nullable'):
                    ch_type = f'Nullable({ch_type})'

            columns.append(f"    `{col}` {ch_type}")

        columns_sql = ",\n".join(columns)

        create_sql = f"""
CREATE TABLE IF NOT EXISTS {table_name}
(
{columns_sql}
)
ENGINE = MergeTree()
ORDER BY tuple()
"""

        logger.info(f"Creating table with auto-generated schema:\n{create_sql}")
        client.command(create_sql)

        logger.info(f"✅ Table created with auto-generated schema: {table_name}")
        return True

    except Exception as e:
        logger.error(f"❌ Failed to create table: {e}")
        return False
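
# For a hypothetical DataFrame with an object column "brand" and a float column
# "volume_9l" containing NaNs, the generated DDL would look roughly like:
#
#   CREATE TABLE IF NOT EXISTS my_table
#   (
#       `brand` LowCardinality(String),
#       `volume_9l` Nullable(Float64)
#   )
#   ENGINE = MergeTree()
#   ORDER BY tuple()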


def clean_dataframe_for_clickhouse(df: pd.DataFrame) -> pd.DataFrame:
    """
    Prepare DataFrame for ClickHouse insertion.
    Convert European number formats and handle data types.
    """
    df_clean = df.copy()

    for col in df_clean.columns:
        # Try to detect and clean numeric columns
        if df_clean[col].dtype == 'object':
            # Sample a few values to check if numeric
            sample = df_clean[col].dropna().head(10)

            if len(sample) > 0:
                # Check if looks like numbers (contains digits and ,./)
                sample_str = str(sample.iloc[0])
                if any(c.isdigit() for c in sample_str):
                    # Try cleaning as numeric
                    try:
                        cleaned = clean_numeric_column(df_clean[col])
                        # Check if conversion was successful
                        if cleaned.notna().sum() > len(df_clean) * 0.5:  # At least 50% success
                            df_clean[col] = cleaned
                            logger.debug(f"Cleaned numeric column: {col}")
                    except Exception as e:
                        logger.debug(f"Column {col} not numeric: {e}")

    return df_clean


def load_csv_to_table(
    client,
    table_name: str,
    csv_path: Path,
    chunk_size: int = 10000
) -> bool:
    """
    Load CSV data into ClickHouse table in chunks.
    """
    try:
        logger.info(f"Reading CSV file: {csv_path}")

        # Detect the CSV delimiter from the header line (utf-8-sig strips a BOM if present)
        with open(csv_path, 'r', encoding='utf-8-sig') as f:
            first_line = f.readline()
            delimiter = ';' if ';' in first_line else ','

        # With dtype=str there is nothing to infer: all columns land as strings in
        # the staging table; low_memory=False avoids chunked parsing and
        # DtypeWarning on large files.
        df = pd.read_csv(
            csv_path,
            delimiter=delimiter,
            encoding='utf-8-sig',  # Handle BOM (Byte Order Mark) in UTF-8 files
            low_memory=False,  # Prevent DtypeWarning for mixed types
            dtype=str  # Read all columns as strings (we're loading to staging table with String columns)
        )
        total_rows = len(df)
        logger.info(f"Loaded {total_rows} rows from CSV")

        # Numeric cleaning is skipped: every column is read as a string for the staging table
        # df = clean_dataframe_for_clickhouse(df)

        # Insert in chunks
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console
        ) as progress:
            task = progress.add_task(f"Inserting into {table_name}", total=total_rows)

            # Use insert_df for better compatibility
            df_clean = df.fillna('')
            client.insert_df(table_name, df_clean)
            progress.update(task, advance=total_rows)

        logger.info(f"✅ Loaded {total_rows} rows into {table_name}")
        return True

    except Exception as e:
        logger.error(f"❌ Failed to load CSV data: {e}", exc_info=True)
        return False


def process_dataset(
    client,
    dataset_config: Dict[str, Any],
    dataset_id: str,
    config_dir: Path,
    dry_run: bool = False,
    drop_existing: bool = False
) -> bool:
    """
    Process a single dataset: create table (if needed) and load CSV data.

    If SQL schema contains staging table (table_name_raw) and Materialized View,
    data is automatically migrated via ClickHouse on INSERT.
    Otherwise, CSV is loaded directly to table_name.
    """
    console.rule(f"[bold blue]Dataset: {dataset_id}")

    # Extract configuration
    table_name = dataset_config.get('clickhouse_table')
    csv_path_str = dataset_config.get('rawdata')
    schema_path_str = dataset_config.get('sql_schema')

    if not table_name:
        logger.warning(f"⚠️ No clickhouse_table specified for {dataset_id}, skipping")
        return False

    if not csv_path_str:
        logger.warning(f"⚠️ No rawdata (CSV path) specified for {dataset_id}, skipping")
        return False

    # Resolve paths
    csv_path = PROJECT_ROOT / csv_path_str
    if not csv_path.exists():
        logger.warning(f"⚠️ CSV file not found: {csv_path}")
        return False

    logger.info(f"📊 Table: {table_name}")
    logger.info(f"📄 CSV: {csv_path}")

    if dry_run:
        logger.info("🔍 DRY RUN: Would process this dataset")
        if schema_path_str:
            logger.info(f"   - Would use SQL schema: {schema_path_str}")
        else:
            logger.info(f"   - Would auto-generate schema from CSV")
        return True

    # Determine target table for CSV loading
    # If schema defines staging table (table_name_raw), use it
    staging_table = f"{table_name}_raw"
    staging_exists_query = f"EXISTS TABLE {staging_table}"
    has_materialized_view = False

    # Check if table exists
    table_exists_query = f"EXISTS TABLE {table_name}"
    table_exists = client.command(table_exists_query) == 1

    if table_exists and not drop_existing:
        logger.info(f"✅ Table {table_name} already exists")
        # Schema creation is skipped, so check for a previously created
        # Materialized View (schema files use the {table_name}_mv naming convention)
        has_materialized_view = client.command(f"EXISTS TABLE {table_name}_mv") == 1
    else:
        # Create table from schema or auto-generate
        if schema_path_str:
            schema_path = PROJECT_ROOT / schema_path_str
            if schema_path.exists():
                logger.info(f"📋 Using SQL schema: {schema_path}")
                created_tables, has_materialized_view = create_table_from_schema_file(
                    client, table_name, schema_path, drop_existing
                )
                if not created_tables:
                    return False
                logger.info(f"📋 Created tables: {', '.join(created_tables)}")
            else:
                logger.warning(f"⚠️ SQL schema file not found: {schema_path}, auto-generating...")
                with open(csv_path, 'r', encoding='utf-8-sig') as f:
                    delimiter = ';' if ';' in f.readline() else ','
                df = pd.read_csv(csv_path, delimiter=delimiter, encoding='utf-8-sig', nrows=1000)
                if not create_table_from_csv(client, table_name, df, drop_existing):
                    return False
        else:
            logger.info("📋 Auto-generating schema from CSV...")
            with open(csv_path, 'r', encoding='utf-8-sig') as f:
                delimiter = ';' if ';' in f.readline() else ','
            df = pd.read_csv(csv_path, delimiter=delimiter, encoding='utf-8-sig', nrows=1000)
            if not create_table_from_csv(client, table_name, df, drop_existing):
                return False

    # Determine which table to load CSV into
    staging_exists = client.command(staging_exists_query) == 1

    if staging_exists:
        target_table = staging_table
        logger.info(f"📥 Loading CSV into staging table: {target_table}")

        if has_materialized_view:
            logger.info(f"🔄 Materialized View detected: data will auto-migrate to {table_name}")
    else:
        target_table = table_name
        logger.info(f"📥 Loading CSV directly into final table: {target_table}")

    # Load CSV data
    success = load_csv_to_table(client, target_table, csv_path)

    if success:
        # Get row count in staging table
        count_query = f"SELECT count() FROM {target_table}"
        row_count = client.command(count_query)
        logger.info(f"✅ Total rows in {target_table}: {row_count:,}")

        # If loaded into staging, check migration status
        if staging_exists and target_table == staging_table:
            if has_materialized_view:
                # Materialized View handles migration automatically via ClickHouse
                logger.info(f"⚡ Auto-migration via Materialized View (ClickHouse-native)")

                # Give ClickHouse a moment to process the materialized view
                import time
                time.sleep(1)

                # Get final row count
                final_count_query = f"SELECT count() FROM {table_name}"
                final_row_count = client.command(final_count_query)
                logger.info(f"✅ Migration complete: {final_row_count:,} rows in {table_name}")

            else:
                logger.warning(f"⚠️ No Materialized View found")
                logger.warning(f"   Data loaded into staging table {staging_table}")
                logger.warning(f"   Add CREATE MATERIALIZED VIEW to {schema_path_str} for auto-migration")
                logger.warning(f"   Or manually migrate data using SQL: INSERT INTO {table_name} SELECT ... FROM {staging_table}")

    return success


def main():
    parser = argparse.ArgumentParser(
        description='Load CSV data into ClickHouse from dataset configurations',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python load_csv_to_clickhouse.py                         # Load all datasets
  python load_csv_to_clickhouse.py --dataset timeline      # Load specific dataset
  python load_csv_to_clickhouse.py --dry-run               # Show what would be done
  python load_csv_to_clickhouse.py --drop-existing         # Drop and recreate tables
  python load_csv_to_clickhouse.py -v                      # Verbose logging

Note:
  If SQL schema contains Materialized View, data is automatically migrated from
  staging (_raw) to final table via ClickHouse on INSERT (native performance).
        """
    )

    parser.add_argument(
        '--dataset',
        type=str,
        help='Process specific dataset only (e.g., "timeline")'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be done without executing'
    )
    parser.add_argument(
        '--drop-existing',
        action='store_true',
        help='Drop existing tables before loading'
    )
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='Enable verbose logging'
    )

    args = parser.parse_args()

    # Setup logging level
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Display header
    console.print("\n[bold cyan]CSV to ClickHouse Loader[/bold cyan]")
    console.print("[dim]Automated data loading from dataset configurations[/dim]\n")

    try:
        # Load environment
        ch_config = load_env()

        # Discover datasets
        config_dir = PROJECT_ROOT / 'config'
        dataset_files = discover_datasets(config_dir)

        if not dataset_files:
            console.print("[red]No dataset configuration files found![/red]")
            return 1

        # Filter by specific dataset if requested
        if args.dataset:
            dataset_files = [f for f in dataset_files if args.dataset in f.stem]
            if not dataset_files:
                console.print(f"[red]Dataset '{args.dataset}' not found![/red]")
                return 1

        # Show summary table
        table = Table(title="Datasets to Process")
        table.add_column("ID", style="cyan")
        table.add_column("Name", style="green")
        table.add_column("Table", style="yellow")
        table.add_column("CSV", style="blue")

        datasets_to_process = []
        for yaml_path in dataset_files:
            config = load_dataset_config(yaml_path)
            dataset_id = config.get('id', yaml_path.stem.replace('dataset_', ''))
            datasets_to_process.append((dataset_id, config))

            table.add_row(
                dataset_id,
                config.get('name', '-'),
                config.get('clickhouse_table', '-'),
                config.get('rawdata', '-')
            )

        console.print(table)
        console.print()

        if args.dry_run:
            console.print("[yellow]DRY RUN MODE - No changes will be made[/yellow]\n")

        # Connect to ClickHouse
        if not args.dry_run:
            client = get_clickhouse_client(ch_config)
        else:
            client = None

        # Process each dataset
        success_count = 0
        for dataset_id, config in datasets_to_process:
            try:
                success = process_dataset(
                    client,
                    config,
                    dataset_id,
                    config_dir,
                    dry_run=args.dry_run,
                    drop_existing=args.drop_existing
                )
                if success:
                    success_count += 1
            except Exception as e:
                logger.error(f"Failed to process dataset {dataset_id}: {e}", exc_info=True)

        # Summary
        console.print()
        console.rule("[bold green]Summary")
        console.print(f"✅ Successfully processed: {success_count}/{len(datasets_to_process)} datasets")

        return 0 if success_count == len(datasets_to_process) else 1

    except Exception as e:
        console.print(f"[red]Fatal error: {e}[/red]")
        logger.error("Fatal error", exc_info=True)
        return 1


if __name__ == '__main__':
    sys.exit(main())
