import numpy as np
import time
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error

# Try to import Bayesian optimization libraries
try:
    from skopt import BayesSearchCV
    from skopt.space import Real, Integer, Categorical
    BAYESIAN_AVAILABLE = True
except ImportError:
    BAYESIAN_AVAILABLE = False
    print("Bayesian optimization libraries not available. Will use RandomizedSearchCV instead.")


def ensure_finite(X, default_value=0.0):
    """
    Replace any NaN, inf, or extremely large values with a default value.

    Args:
        X: Input array or matrix
        default_value: Value to use for replacement

    Returns:
        X_clean: Cleaned array with finite values
    """
    # Make a copy to avoid modifying the original
    X_clean = np.array(X, copy=True)

    # Replace inf values
    mask_inf = np.isinf(X_clean)
    if np.any(mask_inf):
        print(f"Warning: Found {np.sum(mask_inf)} infinite values. Replacing with {default_value}.")
        X_clean[mask_inf] = default_value

    # Replace NaN values
    mask_nan = np.isnan(X_clean)
    if np.any(mask_nan):
        print(f"Warning: Found {np.sum(mask_nan)} NaN values. Replacing with {default_value}.")
        X_clean[mask_nan] = default_value

    # Check for extremely large values
    large_threshold = 1e6  # Adjust as needed
    mask_large = np.abs(X_clean) > large_threshold
    if np.any(mask_large):
        print(f"Warning: Found {np.sum(mask_large)} extremely large values. Replacing with {default_value}.")
        X_clean[mask_large] = default_value

    return X_clean


def tune_hyperparameters(model_class, param_grid, X_train, y_train, method='grid',
                         cv=5, n_iter=20, model_name=None):
    """
    Tune hyperparameters for a model.

    Args:
        model_class: Scikit-learn model class
        param_grid: Dictionary of hyperparameters
        X_train: Training feature matrix
        y_train: Training target vector
        method: Tuning method ('grid', 'random', or 'bayesian')
        cv: Number of cross-validation folds
        n_iter: Number of iterations for random/bayesian search
        model_name: Name of the model for special handling

    Returns:
        best_model: Tuned model
        best_params: Best hyperparameter values
    """
    start_time = time.time()
    print(f" Tuning hyperparameters for {model_name} using {method} search...")
    print(f" X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

    if method == 'grid':
        search = GridSearchCV(
            model_class(), param_grid, cv=cv,
            scoring='neg_mean_squared_error', verbose=1, n_jobs=-1
        )
        search.fit(X_train, y_train)
        best_model = search.best_estimator_
        best_params = search.best_params_

    elif method == 'random':
        search = RandomizedSearchCV(
            model_class(), param_grid, n_iter=n_iter, cv=cv,
            scoring='neg_mean_squared_error', verbose=1, random_state=42, n_jobs=-1
        )
        search.fit(X_train, y_train)
        best_model = search.best_estimator_
        best_params = search.best_params_

    elif method == 'bayesian':
        if BAYESIAN_AVAILABLE:
            # Convert param_grid to skopt space format
            search_space = {}
            for param, values in param_grid.items():
                # If parameter values are a list
                if isinstance(values, list):
                    # Check types of values to determine space type
                    if all(isinstance(v, bool) for v in values) or all(isinstance(v, str) for v in values):
                        search_space[param] = Categorical(values)
                    elif all(isinstance(v, int) for v in values):
                        search_space[param] = Integer(min(values), max(values))
                    elif all(isinstance(v, float) for v in values):
                        search_space[param] = Real(min(values), max(values), prior='log-uniform')
                    else:
                        # Mixed types or other - use categorical
                        search_space[param] = Categorical(values)
                # If parameter values are a (low, high[, prior]) tuple, build the
                # matching skopt dimension so the ranges are not silently skipped
                elif isinstance(values, tuple):
                    low, high = values[0], values[1]
                    prior = values[2] if len(values) > 2 else 'uniform'
                    if isinstance(low, int) and isinstance(high, int):
                        search_space[param] = Integer(low, high, prior=prior)
                    else:
                        search_space[param] = Real(low, high, prior=prior)
                # If parameter values are already a dictionary or distribution
                else:
                    search_space[param] = values
            print(f" Created Bayesian search space: {search_space}")

            # Special handling for models that need parameter mapping
            model_instance = model_class()

            if model_name == 'GPR' and 'kernel' in search_space:
                # Create a modified search with a custom kernel mapping
                def map_kernel(params):
                    # Map numeric values (plain or numpy ints) to actual kernels
                    if 'kernel' in params and isinstance(params['kernel'], (int, np.integer)):
                        from sklearn.gaussian_process.kernels import RBF, Matern, WhiteKernel, ConstantKernel as C
                        kernel_map = {
                            1: C(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2)),
                            2: C(1.0, (1e-3, 1e3)) * Matern(1.0, (1e-2, 1e2), nu=1.5),
                            3: C(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2)) + WhiteKernel(0.1)
                        }
                        params['kernel'] = kernel_map.get(int(params['kernel']), kernel_map[1])
                    return params

                # Use a subset of data for GPR to speed up training
                subset_size = min(1000, len(X_train))
                idx = np.random.choice(len(X_train), subset_size, replace=False)
                X_subset = X_train[idx]
                y_subset = y_train[idx]

                # Manual Bayesian optimization for GPR
                best_score = float('-inf')
                best_params = {}
                best_model = None  # Initialize best_model

                for _ in range(n_iter):
                    # Sample parameters randomly from the space
                    params = {}
                    for param, space in search_space.items():
                        if hasattr(space, 'rvs'):
                            # It's a distribution
                            params[param] = space.rvs(1)[0]
                        elif isinstance(space, list):
                            # It's a list of values
                            params[param] = np.random.choice(space)

                    # Map parameters for kernels
                    params = map_kernel(params)

                    # Create and fit model with these parameters
                    try:
                        model = model_class(**params)
                        model.fit(X_subset, y_subset)

                        # Score model on the training subset
                        score = -mean_squared_error(y_subset, model.predict(X_subset))  # Neg MSE

                        if score > best_score:
                            best_score = score
                            best_params = params
                            best_model = model
                    except Exception as e:
                        print(f" Skipping parameters due to error: {e}")
                        continue

                print(f" Best params: {best_params}")

                if best_model is None:
                    # Fallback if no model was successfully trained
                    print(" No successful model training, using default parameters")
                    best_model = model_class()
                    from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
                    kernel_rbf = C(1.0) * RBF(1.0)
                    best_model.set_params(kernel=kernel_rbf, alpha=1e-6)
                    best_model.fit(X_subset, y_subset)

                return best_model, best_params

            elif model_name == 'MLP' and 'hidden_layer_sizes' in search_space:
                # Create a modified search with a custom hidden_layer_sizes mapping
                def map_hidden_layers(params):
                    # Map numeric values to actual tuples for hidden_layer_sizes
                    if 'hidden_layer_sizes' in params and isinstance(
                            params['hidden_layer_sizes'], (int, float, np.integer, np.floating)):
                        # Map integers to hidden layer configurations
                        layer_map = {
                            1: (50,),
                            2: (100,),
                            3: (50, 50),
                            4: (100, 50)
                        }
                        params['hidden_layer_sizes'] = layer_map.get(int(params['hidden_layer_sizes']), (50,))
                    return params

                # Manual optimization for MLP
                best_score = float('-inf')
                best_params = {}
                best_model = None  # Initialize best_model

                for _ in range(n_iter):
                    # Sample parameters randomly from the space
                    params = {}
                    for param, space in search_space.items():
                        if hasattr(space, 'rvs'):
                            # It's a distribution
                            params[param] = space.rvs(1)[0]
                        elif isinstance(space, list):
                            # It's a list of values
                            params[param] = np.random.choice(space)

                    # Map parameters for hidden layer sizes
                    params = map_hidden_layers(params)

                    # Create and fit model with these parameters
                    try:
                        model = model_class(**params)
                        model.fit(X_train, y_train)

                        # Score model on the training data
                        score = -mean_squared_error(y_train, model.predict(X_train))  # Neg MSE

                        if score > best_score:
                            best_score = score
                            best_params = params
                            best_model = model
                    except Exception as e:
                        print(f" Skipping parameters due to error: {e}")
                        continue
                print(f" Best params: {best_params}")

                if best_model is None:
                    # Fallback if no model was successfully trained
                    print(" No successful model training, using default parameters")
                    best_model = model_class(random_state=42, max_iter=1000)
                    best_model.fit(X_train, y_train)

                return best_model, best_params

            else:
                # For other models, use standard BayesSearchCV
                search = BayesSearchCV(
                    model_instance, search_space, n_iter=n_iter, cv=cv,
                    scoring='neg_mean_squared_error', verbose=1, random_state=42, n_jobs=-1
                )
                search.fit(X_train, y_train)
                best_model = search.best_estimator_
                best_params = search.best_params_
        else:
            print(" Bayesian optimization not available, falling back to RandomizedSearchCV")
            search = RandomizedSearchCV(
                model_class(), param_grid, n_iter=n_iter, cv=cv,
                scoring='neg_mean_squared_error', verbose=1, random_state=42, n_jobs=-1
            )
            search.fit(X_train, y_train)
            best_model = search.best_estimator_
            best_params = search.best_params_

    else:
        raise ValueError(f"Unknown tuning method: {method}")

    elapsed_time = time.time() - start_time
    print(f" Tuning completed in {elapsed_time:.2f} seconds")
    print(f" Best params: {best_params}")

    return best_model, best_params


def get_param_grids():
    """
    Get parameter grids for different models.

    Returns:
        param_grids: Dictionary of parameter grids for grid/random search
        param_ranges: Dictionary of parameter ranges for Bayesian optimization
    """
    # Parameter grids for grid/random search
    param_grids = {}

    # Linear models
    param_grids['Linear Regression'] = {'fit_intercept': [True, False]}
    param_grids['Ridge'] = {
        'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
        'fit_intercept': [True, False],
        'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    }
    param_grids['Lasso'] = {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0],
        'fit_intercept': [True, False],
        'max_iter': [1000, 3000, 5000]
    }
    param_grids['ElasticNet'] = {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0],
        'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
        'fit_intercept': [True, False],
        'max_iter': [1000, 3000, 5000]
    }

    # Tree-based models
    param_grids['Decision Tree'] = {
        'max_depth': [None, 5, 10, 15, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    param_grids['Random Forest'] = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    param_grids['Gradient Boosting'] = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7, 9],
        'min_samples_split': [2, 5, 10]
    }
    param_grids['XGBoost'] = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7, 9],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0]
    }
    param_grids['LightGBM'] = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7, 9],
        'num_leaves': [31, 63, 127],
        'subsample': [0.8, 0.9, 1.0]
    }

    # Other models
    param_grids['SVR'] = {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.1, 0.01, 0.001]
    }
    param_grids['KNN'] = {
        'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }

    # Neural Network model
    param_grids['MLP'] = {
        'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    }

    # Gaussian Process Regression
    from sklearn.gaussian_process.kernels import RBF, Matern, WhiteKernel, ConstantKernel as C
    kernel_rbf = C(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2))
    kernel_matern = C(1.0, (1e-3, 1e3)) * Matern(1.0, (1e-2, 1e2), nu=1.5)
    kernel_rbf_white = C(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2)) + WhiteKernel(0.1)
    param_grids['GPR'] = {
        'kernel': [kernel_rbf, kernel_matern, kernel_rbf_white],
        'alpha': [1e-10, 1e-8, 1e-6],
        'normalize_y': [True, False],
        'n_restarts_optimizer': [0, 1, 3]
    }

    # Parameter ranges for Bayesian optimization
    param_ranges = {}

    # Linear models
    param_ranges['Linear Regression'] = {'fit_intercept': [True, False]}
    param_ranges['Ridge'] = {
        'alpha': (0.001, 100.0, 'log-uniform'),
        'fit_intercept': [True, False],
        'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    }
    param_ranges['Lasso'] = {
        'alpha': (0.0001, 10.0, 'log-uniform'),
        'fit_intercept': [True, False],
        'max_iter': (1000, 10000)
    }
    param_ranges['ElasticNet'] = {
        'alpha': (0.0001, 1.0, 'log-uniform'),
        'l1_ratio': (0.1, 0.9),
        'fit_intercept': [True, False],
        'max_iter': (1000, 10000)
    }

    # Tree-based models
    param_ranges['Decision Tree'] = {
        'max_depth': (3, 30),  # None will be handled specially
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 10)
    }
    param_ranges['Random Forest'] = {
        'n_estimators': (10, 300),
        'max_depth': (3, 50),  # None will be handled specially
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 10)
    }
    param_ranges['Gradient Boosting'] = {
        'n_estimators': (10, 300),
        'learning_rate': (0.001, 0.3, 'log-uniform'),
        'max_depth': (2, 15),
        'min_samples_split': (2, 20)
    }
    param_ranges['XGBoost'] = {
        'n_estimators': (10, 300),
        'learning_rate': (0.001, 0.3, 'log-uniform'),
        'max_depth': (2, 15),
        'subsample': (0.5, 1.0),
        'colsample_bytree': (0.5, 1.0)
    }

    # Other models
    param_ranges['SVR'] = {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': (0.01, 1000.0, 'log-uniform'),
        'gamma': ['scale', 'auto'] + [(0.0001, 1.0, 'log-uniform')]
    }
    param_ranges['KNN'] = {
        'n_neighbors': (1, 30),
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }

    # Neural Network model
    param_ranges['MLP'] = {
        'hidden_layer_sizes': [1, 2, 3, 4],  # Will map to actual tuples later
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd'],
        'alpha': (0.00001, 0.1, 'log-uniform'),
        'learning_rate': ['constant', 'adaptive']
    }

    # Gaussian Process Regression
    param_ranges['GPR'] = {
        'kernel': [1, 2, 3],  # Will map to actual kernels later
        'alpha': (1e-12, 1e-4, 'log-uniform'),
        'normalize_y': [True, False],
        'n_restarts_optimizer': (0, 5)
    }

    return param_grids, param_ranges
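

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of the original pipeline):
# it assumes a synthetic dataset from sklearn.datasets.make_regression and a
# Ridge model, purely to show how ensure_finite(), get_param_grids(), and
# tune_hyperparameters() fit together. A real caller would supply its own
# feature matrix, target vector, and model classes.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from sklearn.datasets import make_regression
    from sklearn.linear_model import Ridge

    # Build a small synthetic regression problem and inject a NaN to show
    # how ensure_finite() cleans the feature matrix before tuning.
    X, y = make_regression(n_samples=200, n_features=10, noise=0.1, random_state=42)
    X[0, 0] = np.nan
    X = ensure_finite(X)

    # Grid search over the Ridge parameter grid; 'random' or 'bayesian'
    # (when scikit-optimize is installed) could be used the same way.
    param_grids, param_ranges = get_param_grids()
    best_model, best_params = tune_hyperparameters(
        Ridge, param_grids['Ridge'], X, y,
        method='grid', cv=5, model_name='Ridge'
    )
    print(f"Tuned Ridge params: {best_params}")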