import numpy as np
import time
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error

# Try to import Bayesian optimization libraries
try:
    from skopt import BayesSearchCV
    from skopt.space import Real, Integer, Categorical
    BAYESIAN_AVAILABLE = True
except ImportError:
    BAYESIAN_AVAILABLE = False
    print("Bayesian optimization libraries not available. Will use RandomizedSearchCV instead.")


def ensure_finite(X, default_value=0.0):
    """
    Replace any NaN, inf, or extremely large values with a default value.

    Args:
        X: Input array or matrix
        default_value: Value to use for replacement

    Returns:
        X_clean: Cleaned array with finite values
    """
    # Make a copy to avoid modifying the original
    X_clean = np.array(X, copy=True)

    # Replace inf values
    mask_inf = np.isinf(X_clean)
    if np.any(mask_inf):
        print(f"Warning: Found {np.sum(mask_inf)} infinite values. Replacing with {default_value}.")
        X_clean[mask_inf] = default_value

    # Replace NaN values
    mask_nan = np.isnan(X_clean)
    if np.any(mask_nan):
        print(f"Warning: Found {np.sum(mask_nan)} NaN values. Replacing with {default_value}.")
        X_clean[mask_nan] = default_value

    # Check for extremely large values
    large_threshold = 1e6  # Adjust as needed
    mask_large = np.abs(X_clean) > large_threshold
    if np.any(mask_large):
        print(f"Warning: Found {np.sum(mask_large)} extremely large values. Replacing with {default_value}.")
        X_clean[mask_large] = default_value

    return X_clean


def tune_hyperparameters(model_class, param_grid, X_train, y_train, method='grid',
                         cv=5, n_iter=20, model_name=None):
    """
    Tune hyperparameters for a model.

    Args:
        model_class: Scikit-learn model class
        param_grid: Dictionary of hyperparameters
        X_train: Training feature matrix
        y_train: Training target vector
        method: Tuning method ('grid', 'random', or 'bayesian')
        cv: Number of cross-validation folds
        n_iter: Number of iterations for random/bayesian search
        model_name: Name of the model for special handling

    Returns:
        best_model: Tuned model
        best_params: Best hyperparameter values
    """
    start_time = time.time()
    print(f" Tuning hyperparameters for {model_name} using {method} search...")
    print(f" X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

    if method == 'grid':
        search = GridSearchCV(
            model_class(), param_grid, cv=cv,
            scoring='neg_mean_squared_error', verbose=1, n_jobs=-1
        )
        search.fit(X_train, y_train)
        best_model = search.best_estimator_
        best_params = search.best_params_

    elif method == 'random':
        search = RandomizedSearchCV(
            model_class(), param_grid, n_iter=n_iter, cv=cv,
            scoring='neg_mean_squared_error', verbose=1, random_state=42, n_jobs=-1
        )
        search.fit(X_train, y_train)
        best_model = search.best_estimator_
        best_params = search.best_params_

    elif method == 'bayesian':
        if BAYESIAN_AVAILABLE:
            # Convert param_grid to skopt space format
            search_space = {}
            for param, values in param_grid.items():
                # If parameter values are a list
                if isinstance(values, list):
                    # Check types of values to determine space type
                    if all(isinstance(v, bool) for v in values) or all(isinstance(v, str) for v in values):
                        search_space[param] = Categorical(values)
                    elif all(isinstance(v, int) for v in values):
                        search_space[param] = Integer(min(values), max(values))
                    elif all(isinstance(v, float) for v in values):
                        search_space[param] = Real(min(values), max(values), prior='log-uniform')
                    else:
                        # Mixed types or other - use categorical
                        search_space[param] = Categorical(values)
                # If parameter values are a (low, high[, prior]) tuple, build the
                # matching skopt dimension so the ranges are not silently skipped
                elif isinstance(values, tuple):
                    low, high = values[0], values[1]
                    prior = values[2] if len(values) > 2 else 'uniform'
                    if isinstance(low, int) and isinstance(high, int):
                        search_space[param] = Integer(low, high, prior=prior)
                    else:
                        search_space[param] = Real(low, high, prior=prior)
                # If parameter values are already a dictionary or distribution
                else:
                    search_space[param] = values
            print(f" Created Bayesian search space: {search_space}")

            # Special handling for models that need parameter mapping
            model_instance = model_class()

            if model_name == 'GPR' and 'kernel' in search_space:
                # Create a modified search with a custom kernel mapping
                def map_kernel(params):
                    # Map numeric values (plain or numpy ints) to actual kernels
                    if 'kernel' in params and isinstance(params['kernel'], (int, np.integer)):
                        from sklearn.gaussian_process.kernels import RBF, Matern, WhiteKernel, ConstantKernel as C
                        kernel_map = {
                            1: C(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2)),
                            2: C(1.0, (1e-3, 1e3)) * Matern(1.0, (1e-2, 1e2), nu=1.5),
                            3: C(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2)) + WhiteKernel(0.1)
                        }
                        params['kernel'] = kernel_map.get(int(params['kernel']), kernel_map[1])
                    return params

                # Use a subset of data for GPR to speed up training
                subset_size = min(1000, len(X_train))
                idx = np.random.choice(len(X_train), subset_size, replace=False)
                X_subset = X_train[idx]
                y_subset = y_train[idx]

                # Manual Bayesian optimization for GPR
                best_score = float('-inf')
                best_params = {}
                best_model = None  # Initialize best_model

                for _ in range(n_iter):
                    # Sample parameters randomly from the space
                    params = {}
                    for param, space in search_space.items():
                        if hasattr(space, 'rvs'):
                            # It's a distribution
                            params[param] = space.rvs(1)[0]
                        elif isinstance(space, list):
                            # It's a list of values
                            params[param] = np.random.choice(space)

                    # Map parameters for kernels
                    params = map_kernel(params)

                    # Create and fit model with these parameters
                    try:
                        model = model_class(**params)
                        model.fit(X_subset, y_subset)

                        # Score model on the training subset
                        score = -mean_squared_error(y_subset, model.predict(X_subset))  # Neg MSE

                        if score > best_score:
                            best_score = score
                            best_params = params
                            best_model = model
                    except Exception as e:
                        print(f" Skipping parameters due to error: {e}")
                        continue

                print(f" Best params: {best_params}")

                if best_model is None:
                    # Fallback if no model was successfully trained
                    print(" No successful model training, using default parameters")
                    best_model = model_class()
                    from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
                    kernel_rbf = C(1.0) * RBF(1.0)
                    best_model.set_params(kernel=kernel_rbf, alpha=1e-6)
                    best_model.fit(X_subset, y_subset)

                return best_model, best_params

            elif model_name == 'MLP' and 'hidden_layer_sizes' in search_space:
                # Create a modified search with a custom hidden_layer_sizes mapping
                def map_hidden_layers(params):
                    # Map numeric values to actual tuples for hidden_layer_sizes
                    if 'hidden_layer_sizes' in params and isinstance(
                            params['hidden_layer_sizes'], (int, float, np.integer, np.floating)):
                        # Map integers to hidden layer configurations
                        layer_map = {
                            1: (50,),
                            2: (100,),
                            3: (50, 50),
                            4: (100, 50)
                        }
                        params['hidden_layer_sizes'] = layer_map.get(int(params['hidden_layer_sizes']), (50,))
                    return params

                # Manual optimization for MLP
                best_score = float('-inf')
                best_params = {}
                best_model = None  # Initialize best_model

                for _ in range(n_iter):
                    # Sample parameters randomly from the space
                    params = {}
                    for param, space in search_space.items():
                        if hasattr(space, 'rvs'):
                            # It's a distribution
                            params[param] = space.rvs(1)[0]
                        elif isinstance(space, list):
                            # It's a list of values
                            params[param] = np.random.choice(space)

                    # Map parameters for hidden layer sizes
                    params = map_hidden_layers(params)

                    # Create and fit model with these parameters
                    try:
                        model = model_class(**params)
                        model.fit(X_train, y_train)

                        # Score model on the training data
                        score = -mean_squared_error(y_train, model.predict(X_train))  # Neg MSE

                        if score > best_score:
                            best_score = score
                            best_params = params
                            best_model = model
                    except Exception as e:
                        print(f" Skipping parameters due to error: {e}")
                        continue
                print(f" Best params: {best_params}")

                if best_model is None:
                    # Fallback if no model was successfully trained
                    print(" No successful model training, using default parameters")
                    best_model = model_class(random_state=42, max_iter=1000)
                    best_model.fit(X_train, y_train)

                return best_model, best_params

            else:
                # For other models, use standard BayesSearchCV
                search = BayesSearchCV(
                    model_instance, search_space, n_iter=n_iter, cv=cv,
                    scoring='neg_mean_squared_error', verbose=1, random_state=42, n_jobs=-1
                )
                search.fit(X_train, y_train)
                best_model = search.best_estimator_
                best_params = search.best_params_
        else:
            print(" Bayesian optimization not available, falling back to RandomizedSearchCV")
            search = RandomizedSearchCV(
                model_class(), param_grid, n_iter=n_iter, cv=cv,
                scoring='neg_mean_squared_error', verbose=1, random_state=42, n_jobs=-1
            )
            search.fit(X_train, y_train)
            best_model = search.best_estimator_
            best_params = search.best_params_

    else:
        raise ValueError(f"Unknown tuning method: {method}")

    elapsed_time = time.time() - start_time
    print(f" Tuning completed in {elapsed_time:.2f} seconds")
    print(f" Best params: {best_params}")

    return best_model, best_params


def get_param_grids():
    """
    Get parameter grids for different models.

    Returns:
        param_grids: Dictionary of parameter grids for grid/random search
        param_ranges: Dictionary of parameter ranges for Bayesian optimization
    """
    # Parameter grids for grid/random search
    param_grids = {}

    # Linear models
    param_grids['Linear Regression'] = {'fit_intercept': [True, False]}
    param_grids['Ridge'] = {
        'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
        'fit_intercept': [True, False],
        'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    }
    param_grids['Lasso'] = {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0],
        'fit_intercept': [True, False],
        'max_iter': [1000, 3000, 5000]
    }
    param_grids['ElasticNet'] = {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0],
        'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
        'fit_intercept': [True, False],
        'max_iter': [1000, 3000, 5000]
    }

    # Tree-based models
    param_grids['Decision Tree'] = {
        'max_depth': [None, 5, 10, 15, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    param_grids['Random Forest'] = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    param_grids['Gradient Boosting'] = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7, 9],
        'min_samples_split': [2, 5, 10]
    }
    param_grids['XGBoost'] = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7, 9],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0]
    }
    param_grids['LightGBM'] = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7, 9],
        'num_leaves': [31, 63, 127],
        'subsample': [0.8, 0.9, 1.0]
    }

    # Other models
    param_grids['SVR'] = {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.1, 0.01, 0.001]
    }
    param_grids['KNN'] = {
        'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }

    # Neural Network model
    param_grids['MLP'] = {
        'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    }

    # Gaussian Process Regression
    from sklearn.gaussian_process.kernels import RBF, Matern, WhiteKernel, ConstantKernel as C
    kernel_rbf = C(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2))
    kernel_matern = C(1.0, (1e-3, 1e3)) * Matern(1.0, (1e-2, 1e2), nu=1.5)
    kernel_rbf_white = C(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2)) + WhiteKernel(0.1)
    param_grids['GPR'] = {
        'kernel': [kernel_rbf, kernel_matern, kernel_rbf_white],
        'alpha': [1e-10, 1e-8, 1e-6],
        'normalize_y': [True, False],
        'n_restarts_optimizer': [0, 1, 3]
    }

    # Parameter ranges for Bayesian optimization
    param_ranges = {}

    # Linear models
    param_ranges['Linear Regression'] = {'fit_intercept': [True, False]}
    param_ranges['Ridge'] = {
        'alpha': (0.001, 100.0, 'log-uniform'),
        'fit_intercept': [True, False],
        'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    }
    param_ranges['Lasso'] = {
        'alpha': (0.0001, 10.0, 'log-uniform'),
        'fit_intercept': [True, False],
        'max_iter': (1000, 10000)
    }
    param_ranges['ElasticNet'] = {
        'alpha': (0.0001, 1.0, 'log-uniform'),
        'l1_ratio': (0.1, 0.9),
        'fit_intercept': [True, False],
        'max_iter': (1000, 10000)
    }

    # Tree-based models
    param_ranges['Decision Tree'] = {
        'max_depth': (3, 30),  # None will be handled specially
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 10)
    }
    param_ranges['Random Forest'] = {
        'n_estimators': (10, 300),
        'max_depth': (3, 50),  # None will be handled specially
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 10)
    }
    param_ranges['Gradient Boosting'] = {
        'n_estimators': (10, 300),
        'learning_rate': (0.001, 0.3, 'log-uniform'),
        'max_depth': (2, 15),
        'min_samples_split': (2, 20)
    }
    param_ranges['XGBoost'] = {
        'n_estimators': (10, 300),
        'learning_rate': (0.001, 0.3, 'log-uniform'),
        'max_depth': (2, 15),
        'subsample': (0.5, 1.0),
        'colsample_bytree': (0.5, 1.0)
    }

    # Other models
    param_ranges['SVR'] = {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': (0.01, 1000.0, 'log-uniform'),
        'gamma': ['scale', 'auto'] + [(0.0001, 1.0, 'log-uniform')]
    }
    param_ranges['KNN'] = {
        'n_neighbors': (1, 30),
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }

    # Neural Network model
    param_ranges['MLP'] = {
        'hidden_layer_sizes': [1, 2, 3, 4],  # Will map to actual tuples later
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd'],
        'alpha': (0.00001, 0.1, 'log-uniform'),
        'learning_rate': ['constant', 'adaptive']
    }

    # Gaussian Process Regression
    param_ranges['GPR'] = {
        'kernel': [1, 2, 3],  # Will map to actual kernels later
        'alpha': (1e-12, 1e-4, 'log-uniform'),
        'normalize_y': [True, False],
        'n_restarts_optimizer': (0, 5)
    }

    return param_grids, param_ranges
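

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of the original pipeline):
# it assumes a synthetic dataset from sklearn.datasets.make_regression and a
# Ridge model, purely to show how ensure_finite(), get_param_grids(), and
# tune_hyperparameters() fit together. A real caller would supply its own
# feature matrix, target vector, and model classes.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from sklearn.datasets import make_regression
    from sklearn.linear_model import Ridge

    # Build a small synthetic regression problem and inject a NaN to show
    # how ensure_finite() cleans the feature matrix before tuning.
    X, y = make_regression(n_samples=200, n_features=10, noise=0.1, random_state=42)
    X[0, 0] = np.nan
    X = ensure_finite(X)

    # Grid search over the Ridge parameter grid; 'random' or 'bayesian'
    # (when scikit-optimize is installed) could be used the same way.
    param_grids, param_ranges = get_param_grids()
    best_model, best_params = tune_hyperparameters(
        Ridge, param_grids['Ridge'], X, y,
        method='grid', cv=5, model_name='Ridge'
    )
    print(f"Tuned Ridge params: {best_params}")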