import numpy as np
import time
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error

# Try to import Bayesian optimization libraries
try:
    from skopt import BayesSearchCV
    from skopt.space import Real, Integer, Categorical
    BAYESIAN_AVAILABLE = True
except ImportError:
    BAYESIAN_AVAILABLE = False
    print("Bayesian optimization libraries not available. Will use RandomizedSearchCV instead.")

def ensure_finite(X, default_value=0.0):
    """
    Replace any NaN, inf, or extremely large values with a default value.

    Args:
        X: Input array or matrix
        default_value: Value to use for replacement

    Returns:
        X_clean: Cleaned array with finite values
    """
    # Make a copy to avoid modifying the original
    X_clean = np.array(X, copy=True)

    # Replace inf values
    mask_inf = np.isinf(X_clean)
    if np.any(mask_inf):
        print(f"Warning: Found {np.sum(mask_inf)} infinite values. Replacing with {default_value}.")
        X_clean[mask_inf] = default_value

    # Replace NaN values
    mask_nan = np.isnan(X_clean)
    if np.any(mask_nan):
        print(f"Warning: Found {np.sum(mask_nan)} NaN values. Replacing with {default_value}.")
        X_clean[mask_nan] = default_value

    # Check for extremely large values
    large_threshold = 1e6  # Adjust as needed
    mask_large = np.abs(X_clean) > large_threshold
    if np.any(mask_large):
        print(f"Warning: Found {np.sum(mask_large)} extremely large values. Replacing with {default_value}.")
        X_clean[mask_large] = default_value

    return X_clean

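# Example (illustrative only; the array below is made up): ensure_finite() is
# intended to be called on a raw feature matrix before model fitting, e.g.
#
#     X_raw = np.array([[1.0, np.nan], [np.inf, 2.0], [1e9, 3.0]])
#     X = ensure_finite(X_raw)
#     # -> [[1., 0.], [0., 2.], [0., 3.]]  (NaN, inf, and |v| > 1e6 all replaced)
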
def tune_hyperparameters(model_class, param_grid, X_train, y_train, method='grid', cv=5, n_iter=20, model_name=None):
    """
    Tune hyperparameters for a model.

    Args:
        model_class: Scikit-learn model class
        param_grid: Dictionary of hyperparameters
        X_train: Training feature matrix
        y_train: Training target vector
        method: Tuning method ('grid', 'random', or 'bayesian')
        cv: Number of cross-validation folds
        n_iter: Number of iterations for random/bayesian search
        model_name: Name of the model for special handling

    Returns:
        best_model: Tuned model
        best_params: Best hyperparameter values
    """
    start_time = time.time()
    print(f" Tuning hyperparameters for {model_name} using {method} search...")
    print(f" X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")

    if method == 'grid':
        search = GridSearchCV(
            model_class(), param_grid, cv=cv, scoring='neg_mean_squared_error',
            verbose=1, n_jobs=-1
        )
        search.fit(X_train, y_train)
        best_model = search.best_estimator_
        best_params = search.best_params_

    elif method == 'random':
        search = RandomizedSearchCV(
            model_class(), param_grid, n_iter=n_iter, cv=cv,
            scoring='neg_mean_squared_error', verbose=1, random_state=42, n_jobs=-1
        )
        search.fit(X_train, y_train)
        best_model = search.best_estimator_
        best_params = search.best_params_
    elif method == 'bayesian':
        if BAYESIAN_AVAILABLE:
            # Convert param_grid to skopt space format
            search_space = {}
            for param, values in param_grid.items():
                # If parameter values are a list
                if isinstance(values, list):
                    # Check types of values to determine space type
                    # (bool is checked before int because bool subclasses int)
                    if all(isinstance(v, bool) for v in values) or all(isinstance(v, str) for v in values):
                        search_space[param] = Categorical(values)
                    elif all(isinstance(v, int) for v in values):
                        search_space[param] = Integer(min(values), max(values))
                    elif all(isinstance(v, float) for v in values):
                        search_space[param] = Real(min(values), max(values), prior='log-uniform')
                    else:
                        # Mixed types or other - use categorical
                        search_space[param] = Categorical(values)
                # (low, high) or (low, high, prior) tuples from get_param_grids():
                # convert them to skopt dimensions so the manual samplers below can
                # draw from them (bare tuples have no .rvs and would be skipped)
                elif isinstance(values, tuple):
                    prior = values[2] if len(values) > 2 else 'uniform'
                    if isinstance(values[0], float) or isinstance(values[1], float):
                        search_space[param] = Real(values[0], values[1], prior=prior)
                    else:
                        search_space[param] = Integer(values[0], values[1])
                # If parameter values are already a dictionary or distribution
                else:
                    search_space[param] = values

            print(f" Created Bayesian search space: {search_space}")
            # Special handling for models that need parameter mapping
            model_instance = model_class()
            if model_name == 'GPR' and 'kernel' in search_space:
                # Create a modified search with a custom kernel mapping
                def map_kernel(params):
                    # Map numeric codes to actual kernels
                    # (skopt samples numpy integers, hence the np.integer check)
                    if 'kernel' in params and isinstance(params['kernel'], (int, np.integer)):
                        from sklearn.gaussian_process.kernels import RBF, Matern, WhiteKernel, ConstantKernel as C
                        kernel_map = {
                            1: C(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2)),
                            2: C(1.0, (1e-3, 1e3)) * Matern(1.0, (1e-2, 1e2), nu=1.5),
                            3: C(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2)) + WhiteKernel(0.1)
                        }
                        params['kernel'] = kernel_map.get(int(params['kernel']), kernel_map[1])
                    return params

                # Use a subset of data for GPR to speed up training
                subset_size = min(1000, len(X_train))
                idx = np.random.choice(len(X_train), subset_size, replace=False)
                X_subset = X_train[idx]
                y_subset = y_train[idx]

                # Manual random sampling of the search space for GPR (note: not
                # true Bayesian optimization; the integer codes are mapped to
                # kernel objects before each fit)
                best_score = float('-inf')
                best_params = {}
                best_model = None  # Initialize best_model

                for _ in range(n_iter):
                    # Sample parameters randomly from the space
                    params = {}
                    for param, space in search_space.items():
                        if hasattr(space, 'rvs'):  # It's a skopt dimension / distribution
                            params[param] = space.rvs(1)[0]
                        elif isinstance(space, list):  # It's a plain list of values
                            params[param] = np.random.choice(space)

                    # Map parameters for kernels
                    params = map_kernel(params)

                    # Create and fit model with these parameters
                    try:
                        model = model_class(**params)
                        model.fit(X_subset, y_subset)
                        # Score model (in-sample negative MSE on the training subset)
                        score = -mean_squared_error(y_subset, model.predict(X_subset))
                        if score > best_score:
                            best_score = score
                            best_params = params
                            best_model = model
                    except Exception as e:
                        print(f" Skipping parameters due to error: {e}")
                        continue

                print(f" Best params: {best_params}")
                if best_model is None:
                    # Fallback if no model was successfully trained
                    print(" No successful model training, using default parameters")
                    best_model = model_class()
                    from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
                    kernel_rbf = C(1.0) * RBF(1.0)
                    best_model.set_params(kernel=kernel_rbf, alpha=1e-6)
                    best_model.fit(X_subset, y_subset)
                return best_model, best_params
            elif model_name == 'MLP' and 'hidden_layer_sizes' in search_space:
                # Create a modified search with a custom hidden_layer_sizes mapping
                def map_hidden_layers(params):
                    # Map numeric codes to actual tuples for hidden_layer_sizes
                    # (skopt samples numpy numbers, hence the np.integer/np.floating check)
                    if 'hidden_layer_sizes' in params and isinstance(
                            params['hidden_layer_sizes'], (int, float, np.integer, np.floating)):
                        # Map integers to hidden layer configurations
                        layer_map = {
                            1: (50,),
                            2: (100,),
                            3: (50, 50),
                            4: (100, 50)
                        }
                        params['hidden_layer_sizes'] = layer_map.get(int(params['hidden_layer_sizes']), (50,))
                    return params

                # Manual random sampling of the search space for MLP (note: not
                # true Bayesian optimization)
                best_score = float('-inf')
                best_params = {}
                best_model = None  # Initialize best_model

                for _ in range(n_iter):
                    # Sample parameters randomly from the space
                    params = {}
                    for param, space in search_space.items():
                        if hasattr(space, 'rvs'):  # It's a skopt dimension / distribution
                            params[param] = space.rvs(1)[0]
                        elif isinstance(space, list):  # It's a plain list of values
                            params[param] = np.random.choice(space)

                    # Map parameters for hidden layer sizes
                    params = map_hidden_layers(params)

                    # Create and fit model with these parameters
                    try:
                        model = model_class(**params)
                        model.fit(X_train, y_train)
                        # Score model (in-sample negative MSE on the training data)
                        score = -mean_squared_error(y_train, model.predict(X_train))
                        if score > best_score:
                            best_score = score
                            best_params = params
                            best_model = model
                    except Exception as e:
                        print(f" Skipping parameters due to error: {e}")
                        continue

                print(f" Best params: {best_params}")
                if best_model is None:
                    # Fallback if no model was successfully trained
                    print(" No successful model training, using default parameters")
                    best_model = model_class(random_state=42, max_iter=1000)
                    best_model.fit(X_train, y_train)
                return best_model, best_params
            else:
                # For other models, use standard BayesSearchCV
                search = BayesSearchCV(
                    model_instance, search_space, n_iter=n_iter, cv=cv,
                    scoring='neg_mean_squared_error', verbose=1, random_state=42, n_jobs=-1
                )
                search.fit(X_train, y_train)
                best_model = search.best_estimator_
                best_params = search.best_params_
        else:
            print(" Bayesian optimization not available, falling back to RandomizedSearchCV")
            search = RandomizedSearchCV(
                model_class(), param_grid, n_iter=n_iter, cv=cv,
                scoring='neg_mean_squared_error', verbose=1, random_state=42, n_jobs=-1
            )
            search.fit(X_train, y_train)
            best_model = search.best_estimator_
            best_params = search.best_params_
    else:
        raise ValueError(f"Unknown tuning method: {method}")

    elapsed_time = time.time() - start_time
    print(f" Tuning completed in {elapsed_time:.2f} seconds")
    print(f" Best params: {best_params}")

    return best_model, best_params

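# Example usage (a minimal sketch; RandomForestRegressor and the grid below are
# illustrative choices, not part of this module):
#
#     from sklearn.ensemble import RandomForestRegressor
#     best_model, best_params = tune_hyperparameters(
#         RandomForestRegressor, {'n_estimators': [50, 100, 200]},
#         X_train, y_train, method='random', n_iter=5, model_name='Random Forest')
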
def get_param_grids():
    """
    Get parameter grids for different models.

    Returns:
        param_grids: Dictionary of parameter grids for grid/random search
        param_ranges: Dictionary of parameter ranges for Bayesian optimization
    """
    # Parameter grids for grid/random search
    param_grids = {}

    # Linear models
    param_grids['Linear Regression'] = {'fit_intercept': [True, False]}

    param_grids['Ridge'] = {
        'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
        'fit_intercept': [True, False],
        'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    }

    param_grids['Lasso'] = {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0],
        'fit_intercept': [True, False],
        'max_iter': [1000, 3000, 5000]
    }

    param_grids['ElasticNet'] = {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0],
        'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
        'fit_intercept': [True, False],
        'max_iter': [1000, 3000, 5000]
    }

    # Tree-based models
    param_grids['Decision Tree'] = {
        'max_depth': [None, 5, 10, 15, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    param_grids['Random Forest'] = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    param_grids['Gradient Boosting'] = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7, 9],
        'min_samples_split': [2, 5, 10]
    }

    param_grids['XGBoost'] = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7, 9],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0]
    }

    param_grids['LightGBM'] = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7, 9],
        'num_leaves': [31, 63, 127],
        'subsample': [0.8, 0.9, 1.0]
    }

    # Other models
    param_grids['SVR'] = {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.1, 0.01, 0.001]
    }

    param_grids['KNN'] = {
        'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }

    # Neural Network model
    param_grids['MLP'] = {
        'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'adaptive']
    }

    # Gaussian Process Regression
    from sklearn.gaussian_process.kernels import RBF, Matern, WhiteKernel, ConstantKernel as C
    kernel_rbf = C(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2))
    kernel_matern = C(1.0, (1e-3, 1e3)) * Matern(1.0, (1e-2, 1e2), nu=1.5)
    kernel_rbf_white = C(1.0, (1e-3, 1e3)) * RBF(1.0, (1e-2, 1e2)) + WhiteKernel(0.1)

    param_grids['GPR'] = {
        'kernel': [kernel_rbf, kernel_matern, kernel_rbf_white],
        'alpha': [1e-10, 1e-8, 1e-6],
        'normalize_y': [True, False],
        'n_restarts_optimizer': [0, 1, 3]
    }

    # Parameter ranges for Bayesian optimization
    param_ranges = {}

    # Linear models
    param_ranges['Linear Regression'] = {'fit_intercept': [True, False]}

    param_ranges['Ridge'] = {
        'alpha': (0.001, 100.0, 'log-uniform'),
        'fit_intercept': [True, False],
        'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
    }

    param_ranges['Lasso'] = {
        'alpha': (0.0001, 10.0, 'log-uniform'),
        'fit_intercept': [True, False],
        'max_iter': (1000, 10000)
    }

    param_ranges['ElasticNet'] = {
        'alpha': (0.0001, 1.0, 'log-uniform'),
        'l1_ratio': (0.1, 0.9),
        'fit_intercept': [True, False],
        'max_iter': (1000, 10000)
    }

    # Tree-based models
    param_ranges['Decision Tree'] = {
        'max_depth': (3, 30),  # None will be handled specially
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 10)
    }

    param_ranges['Random Forest'] = {
        'n_estimators': (10, 300),
        'max_depth': (3, 50),  # None will be handled specially
        'min_samples_split': (2, 20),
        'min_samples_leaf': (1, 10)
    }

    param_ranges['Gradient Boosting'] = {
        'n_estimators': (10, 300),
        'learning_rate': (0.001, 0.3, 'log-uniform'),
        'max_depth': (2, 15),
        'min_samples_split': (2, 20)
    }

    param_ranges['XGBoost'] = {
        'n_estimators': (10, 300),
        'learning_rate': (0.001, 0.3, 'log-uniform'),
        'max_depth': (2, 15),
        'subsample': (0.5, 1.0),
        'colsample_bytree': (0.5, 1.0)
    }

    # Other models
    param_ranges['SVR'] = {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'C': (0.01, 1000.0, 'log-uniform'),
        # Mixing the string options with a continuous range in one list is not a
        # valid skopt dimension, so gamma is expressed as a single categorical list
        'gamma': ['scale', 'auto', 0.0001, 0.001, 0.01, 0.1, 1.0]
    }

    param_ranges['KNN'] = {
        'n_neighbors': (1, 30),
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }

    # Neural Network model
    param_ranges['MLP'] = {
        'hidden_layer_sizes': [1, 2, 3, 4],  # Will map to actual tuples later
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd'],
        'alpha': (0.00001, 0.1, 'log-uniform'),
        'learning_rate': ['constant', 'adaptive']
    }

    # Gaussian Process Regression
    param_ranges['GPR'] = {
        'kernel': [1, 2, 3],  # Will map to actual kernels later
        'alpha': (1e-12, 1e-4, 'log-uniform'),
        'normalize_y': [True, False],
        'n_restarts_optimizer': (0, 5)
    }

    return param_grids, param_ranges
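

# Minimal smoke test (an illustrative sketch, not part of the original API:
# assumes scikit-learn is installed and uses synthetic data, so the printed
# scores are meaningless):
if __name__ == "__main__":
    from sklearn.linear_model import Ridge

    rng = np.random.default_rng(42)
    X = rng.normal(size=(200, 5))
    X[0, 0] = np.nan  # Inject a bad value to exercise ensure_finite()
    y = rng.normal(size=200)

    X = ensure_finite(X)
    param_grids, param_ranges = get_param_grids()
    model, params = tune_hyperparameters(
        Ridge, param_grids['Ridge'], X, y,
        method='grid', cv=3, model_name='Ridge'
    )
    print(f"Tuned Ridge params: {params}")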