#!/usr/bin/env python3
"""
Reload timeline data from CSV with correct parsing.
"""

import sys
from pathlib import Path
import pandas as pd
from clickhouse_connect import get_client
from dotenv import load_dotenv
import os

# Load environment
PROJECT_ROOT = Path(__file__).parent.parent
load_dotenv(PROJECT_ROOT / '.env')

def main():
    # Get ClickHouse connection settings
    host = os.getenv('CLICKHOUSE_HOST')
    port = int(os.getenv('CLICKHOUSE_PORT', 8123))
    user = os.getenv('CLICKHOUSE_USER')
    password = os.getenv('CLICKHOUSE_PASSWORD')
    database = os.getenv('CLICKHOUSE_DATABASE')

    print(f'🔌 Connecting to {host}:{port}/{database}...')

    client = get_client(
        host=host,
        port=port,
        username=user,
        password=password,
        database=database,
        interface='http'
    )

    # Clear existing data
    print('🗑️  Clearing existing data...')
    client.command('TRUNCATE TABLE tl_timeline_raw')
    client.command('TRUNCATE TABLE tl_timeline')

    # Load CSV with correct delimiter and encoding
    csv_path = PROJECT_ROOT / 'data' / 'TL_data.csv'
    print(f'📂 Loading CSV: {csv_path}')

    df = pd.read_csv(
        csv_path,
        delimiter=';',
        encoding='utf-8-sig',  # Handle BOM
        na_values=['', ' ', '#'],
        keep_default_na=True
    )

    # Clean numeric columns (remove spaces)
    print('🧹 Cleaning numeric columns...')
    numeric_cols = [
        'fsum_tot_9L', 'fsum_tot_usd', 'fsum_open_9L', 'fsum_open_usd',
        'fsum_billed_9L', 'fsum_billed_usd', 'fsum_tot_cs', 'fsum_open_cs', 'fsum_billed_cs'
    ]

    for col in numeric_cols:
        if col in df.columns:
            df[col] = df[col].astype(str).str.replace(' ', '').str.replace(',', '.')

    print(f'📊 Loaded {len(df)} rows from CSV')
    print(f'   Sample key_market: {df["key_market"].head(3).tolist()}')
    print(f'   Sample key_date: {df["key_date"].head(3).tolist()}')

    # Insert into ClickHouse
    print('⬆️  Inserting into ClickHouse...')
    client.insert_df('tl_timeline_raw', df)

    # Check results
    result = client.query('SELECT COUNT(*) FROM tl_timeline_raw WHERE key_market != ""')
    rows_raw = result.result_rows[0][0]
    print(f'✅ Inserted {rows_raw:,} non-empty rows into tl_timeline_raw')

    result = client.query('SELECT COUNT(*) FROM tl_timeline')
    rows_final = result.result_rows[0][0]
    print(f'✅ Auto-migrated {rows_final:,} rows into tl_timeline')

    # Check date range
    result = client.query("SELECT MIN(period), MAX(period), COUNT(*) FROM tl_timeline WHERE period > '2020-01-01'")
    if result.result_rows:
        min_date, max_date, valid_count = result.result_rows[0]
        print(f'📅 Date range (valid): {min_date} to {max_date} ({valid_count:,} rows)')

    # Check revenue
    result = client.query("SELECT SUM(fsum_billed_usd), COUNT(DISTINCT key_market) FROM tl_timeline WHERE period > '2020-01-01'")
    if result.result_rows:
        total_revenue, market_count = result.result_rows[0]
        print(f'💰 Total revenue: {total_revenue:,.2f} USD')
        print(f'🌍 Markets: {market_count}')

    print('\n✅ Data reload complete!')

if __name__ == '__main__':
    main()
