import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, WhiteKernel, ConstantKernel as C

# Import tuning functionality from the shared module
from sintering_tuning import tune_hyperparameters, get_param_grids, ensure_finite

import xgboost as xgb
import lightgbm as lgb
import warnings

warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

# Define file paths
file_paths = [
    '160508-1021-1000,0min,56kN.csv',
    '160508-1022-900,0min,56kN.csv',
    '200508-1023-1350,0min,56kN.csv',
    '200508-1024-1200,0min,56kN.csv'
]

# Configuration for regression approaches
APPROACH = 2  # 1: Standard approach, 2: Window approach, 3: Virtual experiment
VALIDATION_FILE_INDEX = 3  # Use the 4th file for validation (0-indexed)
TARGET_COLUMN = 'Rel. Piston Trav'
EXCLUDED_COLUMNS = ['Abs. Piston Trav', 'Nr.', 'Datum', 'Zeit']  # Columns to exclude

# Feature selection (manual control)
# Set to None to use all available features
SELECTED_FEATURES = [
    'MTC1', 'MTC2', 'MTC3', 'Pyrometer', 'SV Temperature',
    'SV Power', 'SV Force', 'AV Force', 'AV Rel. Pressure',
    'I RMS', 'U RMS', 'Heating power'
]

# Model selection (set to True to include in the evaluation)
MODELS_TO_EVALUATE = {
    'Linear Regression': True,
    'Ridge': True,
    'Lasso': True,
    'ElasticNet': True,
    'Decision Tree': True,
    'Random Forest': True,
    'Gradient Boosting': True,
    'XGBoost': True,
    'LightGBM': False,  # imported and supported below, disabled by default
    'SVR': True,
    'KNN': True,
    'MLP': True,
    'GPR': True
}

# Hyperparameter tuning settings
TUNING_METHOD = 'bayesian'  # 'grid', 'random', 'bayesian'
CV_FOLDS = 5
N_ITER = 20  # Number of iterations for random/bayesian search


def load_data(file_paths, validation_index):
    """
    Load and preprocess the CSV files.

    Args:
        file_paths: List of CSV file paths
        validation_index: Index of the file to use for validation

    Returns:
        train_data: Combined DataFrame of training data
        validation_data: DataFrame for validation
    """
    all_data = []

    for i, file_path in enumerate(file_paths):
        # Read the CSV file with proper settings for the European number format
        print(f"Loading file: {file_path}")
        try:
            df = pd.read_csv(file_path, sep=';', decimal=',', header=0)
            print(f"  File shape: {df.shape}")

            # Add a file identifier column
            df['file_id'] = i

            all_data.append(df)
        except Exception as e:
            print(f"  Error loading {file_path}: {e}")

    if not all_data:
        raise ValueError("No data files could be loaded!")

    # Split into training and validation
    validation_data = all_data.pop(validation_index)
    print(f"Validation data shape: {validation_data.shape}")

    train_data = pd.concat(all_data, ignore_index=True)
    print(f"Training data shape: {train_data.shape}")

    return train_data, validation_data


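# Hedged usage sketch: loading the raw runs on their own, outside the full
# pipeline in main(). It only uses names defined above and runs only if called
# explicitly.
def _example_load_data():
    """Load the four runs and report how the validation run was held out."""
    train_df, val_df = load_data(file_paths, VALIDATION_FILE_INDEX)
    print("training rows:", len(train_df), "validation rows:", len(val_df))

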
def preprocess_data(df, target_col, excluded_cols, selected_features=None):
    """
    Preprocess the data for regression.

    Args:
        df: Input DataFrame
        target_col: Target column name
        excluded_cols: List of columns to exclude
        selected_features: List of features to include (None = use all)

    Returns:
        X: Feature matrix
        y: Target vector
        feature_names: List of feature names used
    """
    # Make a copy to avoid modifying the original
    data = df.copy()

    # Check if the target column exists
    if target_col not in data.columns:
        raise ValueError(f"Target column '{target_col}' not found in data. "
                         f"Available columns: {data.columns.tolist()}")

    print(f"Preprocessing data with shape: {data.shape}")
    print(f"Target column: {target_col}")

    # Drop rows with NaN in the target column
    original_count = len(data)
    data = data.dropna(subset=[target_col])
    dropped_count = original_count - len(data)
    print(f"Dropped {dropped_count} rows with missing target values")

    # Extract target
    y = data[target_col].values

    # Convert -999 values to NaN (likely error codes in the dataset)
    data = data.replace(-999, np.nan)

    # Drop specified columns and the target
    columns_to_drop = excluded_cols + [target_col, 'file_id']
    X_data = data.drop(columns=columns_to_drop, errors='ignore')

    # Select only specified features if provided
    if selected_features is not None:
        available_features = [col for col in selected_features if col in X_data.columns]
        missing_features = [col for col in selected_features if col not in X_data.columns]
        if missing_features:
            print(f"Warning: Some selected features are not in the data: {missing_features}")
        X_data = X_data[available_features]

    print(f"Selected features: {X_data.columns.tolist()}")

    # Check for non-numeric columns
    non_numeric = X_data.select_dtypes(exclude=[np.number]).columns.tolist()
    if non_numeric:
        print(f"Warning: Non-numeric columns found: {non_numeric}")
        print("Converting to numeric or dropping...")

        for col in non_numeric:
            try:
                # Try to convert to numeric
                X_data[col] = pd.to_numeric(X_data[col], errors='coerce')
            except Exception:
                # If conversion fails, drop the column
                print(f"  Dropping column: {col}")
                X_data = X_data.drop(columns=[col])

    # Check for NaN values
    nan_count = X_data.isna().sum().sum()
    if nan_count > 0:
        print(f"Found {nan_count} NaN values in features. Filling with column means...")

        # Fill remaining NaNs with column means
        X_data = X_data.fillna(X_data.mean())

    # Get feature names for later use
    feature_names = X_data.columns.tolist()

    # Convert to a numpy array for modeling
    X = X_data.values

    # Ensure the target is float64; this does not add precision,
    # it just guarantees a consistent dtype for the models
    y = y.astype(np.float64)

    print(f"Preprocessed data: X shape: {X.shape}, y shape: {y.shape}")

    return X, y, feature_names


def prepare_window_data(X, y, window_size=1):
    """
    Prepare data for the window-based approach (Approaches 2 & 3).

    Args:
        X: Feature matrix
        y: Target vector
        window_size: Number of previous steps to include

    Returns:
        X_window: Feature matrix with window features
        y_window: Target vector aligned with the window features
    """
    n_samples, n_features = X.shape

    # We need at least window_size+1 samples to create a valid window
    if n_samples <= window_size:
        raise ValueError(f"Not enough samples ({n_samples}) for window size {window_size}")

    # Initialize arrays for the windowed data
    X_window = np.zeros((n_samples - window_size, n_features * (window_size + 1) + window_size))
    y_window = np.zeros(n_samples - window_size)

    # Fill in the arrays
    for i in range(window_size, n_samples):
        # Current features
        X_window[i - window_size, :n_features] = X[i]

        # Add previous features and targets
        for w in range(1, window_size + 1):
            # Previous features
            start_idx = n_features + (w - 1) * n_features
            end_idx = start_idx + n_features
            X_window[i - window_size, start_idx:end_idx] = X[i - w]

            # Previous target
            X_window[i - window_size, n_features * (window_size + 1) + (w - 1)] = y[i - w]

        # Current target
        y_window[i - window_size] = y[i]

    return X_window, y_window


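# Hedged illustration of the layout produced by prepare_window_data(). The
# helper below is a sketch with synthetic toy numbers (not from the sintering
# data) and only runs if called explicitly.
def _example_window_layout():
    """Show the windowed row layout for window_size=1 and two features."""
    X_toy = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
    y_toy = np.array([0.1, 0.2, 0.3])
    X_w, y_w = prepare_window_data(X_toy, y_toy, window_size=1)
    # X_w[0] == [2.0, 20.0, 1.0, 10.0, 0.1]
    #            current feats | previous feats | previous target
    # y_w    == [0.2, 0.3]
    print(X_w)
    print(y_w)

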
def virtual_experiment(model, X_val, y_val, feature_names, window_size=1):
    """
    Run a virtual experiment with the trained model (Approach 3).

    Args:
        model: Trained regression model
        X_val: Validation feature matrix (windowed)
        y_val: Validation target vector
        feature_names: List of feature names
        window_size: Window size used in training

    Returns:
        y_pred: Predicted target values
        y_true: Actual target values
    """
    n_samples, n_features = X_val.shape

    # Number of original features per window block: the windowed matrix has
    # n_orig * (window_size + 1) + window_size columns
    n_features_per_window = (X_val.shape[1] - window_size) // (window_size + 1)

    # We'll store actual and predicted values
    y_true = []
    y_pred = []

    # Initialize with the first few actual values
    # (in a real scenario, we might have initial measurements)
    prev_y_values = y_val[:window_size].tolist()

    # Process each time step
    for i in range(window_size, n_samples):
        # Extract current features (the machine settings for this step)
        current_features = X_val[i, :n_features_per_window]

        # Construct input for the model using current features and previous info
        model_input = np.zeros(n_features)

        # Add current features
        model_input[:n_features_per_window] = current_features

        # Add previous features and predicted targets
        for w in range(1, window_size + 1):
            # Previous features
            prev_features_idx = i - w
            start_idx = n_features_per_window + (w - 1) * n_features_per_window
            end_idx = start_idx + n_features_per_window

            # Use actual previous features from the validation set
            model_input[start_idx:end_idx] = X_val[prev_features_idx, :n_features_per_window]

            # Use the previously predicted target instead of the actual value
            prev_y_idx = n_features_per_window * (window_size + 1) + (w - 1)
            model_input[prev_y_idx] = prev_y_values[-w]

        # Make prediction
        prediction = model.predict(model_input.reshape(1, -1))[0]

        # Store actual and predicted values
        y_true.append(y_val[i])
        y_pred.append(prediction)

        # Update previous y values for the next iteration
        prev_y_values.append(prediction)

    return np.array(y_pred), np.array(y_true)


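# Usage note (sketch): virtual_experiment() expects the same windowed (and
# scaled) representation the model was trained on, plus the raw target series
# used to seed the first `window_size` feedback values, e.g.
#
#     y_pred, y_true = virtual_experiment(best_model, X_val_window_scaled,
#                                         y_val, window_feature_names, window_size=1)
#
# Unlike the plain window approach, each prediction is fed back in as the
# "previous target" for the next step, so errors can accumulate over the run.

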
def analyze_target_precision(df, target_col, plot=True):
    """
    Analyze the precision issues in the target column.

    Args:
        df: DataFrame containing the target column
        target_col: Name of the target column
        plot: Whether to generate plots

    Returns:
        dict: Dictionary with precision analysis results
    """
    print(f"\nAnalyzing precision issues in '{target_col}'...")

    # Extract target column
    target_values = df[target_col].values

    # Calculate differences between consecutive values
    differences = np.diff(target_values)
    non_zero_diffs = differences[differences != 0]

    # Absolute differences for calculations that need positive values
    abs_non_zero_diffs = np.abs(non_zero_diffs)

    # Count occurrences of repeated values
    consecutive_repeats = []
    current_count = 1

    for i in range(1, len(target_values)):
        if abs(target_values[i] - target_values[i - 1]) < 1e-10:
            current_count += 1
        else:
            if current_count > 1:
                consecutive_repeats.append(current_count)
            current_count = 1

    # Add the last group if it's a repeat
    if current_count > 1:
        consecutive_repeats.append(current_count)

    # Calculate statistics
    results = {
        'unique_values': df[target_col].nunique(),
        'total_values': len(target_values),
        'min_nonzero_diff': np.min(non_zero_diffs) if len(non_zero_diffs) > 0 else 0,
        'min_abs_nonzero_diff': np.min(abs_non_zero_diffs) if len(abs_non_zero_diffs) > 0 else 0.0001,
        'avg_nonzero_diff': np.mean(non_zero_diffs) if len(non_zero_diffs) > 0 else 0,
        'avg_abs_nonzero_diff': np.mean(abs_non_zero_diffs) if len(abs_non_zero_diffs) > 0 else 0.0001,
        'median_nonzero_diff': np.median(non_zero_diffs) if len(non_zero_diffs) > 0 else 0,
        'zero_diff_count': len(differences) - len(non_zero_diffs),
        'zero_diff_percentage': 100 * (len(differences) - len(non_zero_diffs)) / len(differences),
        'max_consecutive_repeats': max(consecutive_repeats) if consecutive_repeats else 0,
        'avg_consecutive_repeats': np.mean(consecutive_repeats) if consecutive_repeats else 0
    }

    # Print results
    print(f"  Unique values: {results['unique_values']} out of {results['total_values']} total values")
    print(f"  Minimum non-zero difference: {results['min_nonzero_diff']:.8f}")
    print(f"  Minimum absolute non-zero difference: {results['min_abs_nonzero_diff']:.8f}")
    print(f"  Average non-zero difference: {results['avg_nonzero_diff']:.8f}")
    print(f"  Average absolute non-zero difference: {results['avg_abs_nonzero_diff']:.8f}")
    print(f"  Zero differences: {results['zero_diff_count']} "
          f"({results['zero_diff_percentage']:.2f}% of all consecutive pairs)")
    print(f"  Maximum consecutive repeated values: {results['max_consecutive_repeats']}")
    print(f"  Average run length of repeated values: {results['avg_consecutive_repeats']:.2f}")

    # Generate plots if requested
    if plot:
        # Plot 1: Histogram of non-zero differences
        plt.figure(figsize=(12, 8))
        plt.subplot(2, 2, 1)
        plt.hist(non_zero_diffs, bins=50, alpha=0.7)
        plt.xlabel('Non-zero differences between consecutive values')
        plt.ylabel('Frequency')
        plt.title('Distribution of non-zero differences')

        # Plot 2: Time series of values
        plt.subplot(2, 2, 2)
        plt.plot(target_values[:1000])  # Just plot the first 1000 points for clarity
        plt.xlabel('Index')
        plt.ylabel(target_col)
        plt.title(f'{target_col} values (first 1000 points)')

        # Plot 3: Histogram of consecutive repeats
        plt.subplot(2, 2, 3)
        plt.hist(consecutive_repeats, bins=30, alpha=0.7)
        plt.xlabel('Number of consecutive repeats')
        plt.ylabel('Frequency')
        plt.title('Distribution of consecutive repeated values')

        # Plot 4: Original vs. smoothed data
        plt.subplot(2, 2, 4)

        # Sample points for demonstration
        sample = target_values[:1000]

        # Create a smoothed version by adding tiny noise
        noise_scale = results['min_abs_nonzero_diff'] / 10
        smoothed = sample + np.random.normal(0, noise_scale, len(sample))

        plt.plot(sample, label='Original', alpha=0.7)
        plt.plot(smoothed, label='With tiny noise', alpha=0.7)
        plt.xlabel('Index')
        plt.ylabel(target_col)
        plt.title('Original vs. noise-added values')
        plt.legend()

        plt.tight_layout()
        plt.show()
        plt.close()

    return results


def evaluate_model(model, X_test, y_test, model_name):
    """
    Evaluate model performance on test data.

    Args:
        model: Trained model
        X_test: Test feature matrix
        y_test: Test target vector
        model_name: Name of the model for display

    Returns:
        metrics: Dictionary with evaluation metrics
        y_pred: Predictions on X_test
    """
    print(f"  In evaluate_model - X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Return as a dictionary
    metrics = {
        'model': model_name,
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'r2': r2
    }

    return metrics, y_pred


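# Hedged sketch: evaluate_model() returns both the metrics dict and the raw
# predictions, so one call can feed the plotting helpers defined further below:
#
#     metrics, y_hat = evaluate_model(model, X_test_scaled, y_test, "Ridge")
#     plot_actual_vs_predicted(y_test, y_hat, "Ridge", APPROACH)

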
def scale_data(X_train, X_test):
    """
    Standardize features by removing the mean and scaling to unit variance.

    Args:
        X_train: Training feature matrix
        X_test: Test feature matrix

    Returns:
        X_train_scaled: Scaled training features
        X_test_scaled: Scaled test features
        scaler: The fitted scaler for future use
    """
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, scaler


# The tune_hyperparameters function is imported from sintering_tuning.py


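# Hedged sketch: the scaler returned above is fitted on the training split only;
# later data (e.g. the held-out validation run) must go through the *same*
# fitted scaler rather than being re-fitted, as done in main():
#
#     X_train_scaled, X_test_scaled, scaler = scale_data(X_train, X_test)
#     X_val_scaled = scaler.transform(X_val)

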
def build_and_evaluate_models(X_train, y_train, X_test, y_test):
    """
    Build and evaluate multiple regression models.

    Args:
        X_train: Training feature matrix
        y_train: Training target vector
        X_test: Test feature matrix
        y_test: Test target vector

    Returns:
        results: List of evaluation metrics for each model
        best_model: The best performing model
        all_models: Dictionary of all trained models
        scaler: The fitted feature scaler
    """
    # Print data dimensions for debugging
    print("Data dimensions before scaling:")
    print(f"  X_train: {X_train.shape}")
    print(f"  y_train: {y_train.shape}")
    print(f"  X_test: {X_test.shape}")
    print(f"  y_test: {y_test.shape}")

    # Scale the data
    X_train_scaled, X_test_scaled, scaler = scale_data(X_train, X_test)

    # Print data dimensions after scaling
    print("Data dimensions after scaling:")
    print(f"  X_train_scaled: {X_train_scaled.shape}")
    print(f"  X_test_scaled: {X_test_scaled.shape}")

    # Define models to evaluate
    models = {}

    # Get parameter grids from the shared tuning module
    param_grids, param_ranges = get_param_grids()

    # Linear models
    if MODELS_TO_EVALUATE.get('Linear Regression', False):
        models['Linear Regression'] = LinearRegression()

    if MODELS_TO_EVALUATE.get('Ridge', False):
        models['Ridge'] = Ridge()

    if MODELS_TO_EVALUATE.get('Lasso', False):
        models['Lasso'] = Lasso()

    if MODELS_TO_EVALUATE.get('ElasticNet', False):
        models['ElasticNet'] = ElasticNet()

    # Tree-based models
    if MODELS_TO_EVALUATE.get('Decision Tree', False):
        models['Decision Tree'] = DecisionTreeRegressor()

    if MODELS_TO_EVALUATE.get('Random Forest', False):
        models['Random Forest'] = RandomForestRegressor()

    if MODELS_TO_EVALUATE.get('Gradient Boosting', False):
        models['Gradient Boosting'] = GradientBoostingRegressor()

    if MODELS_TO_EVALUATE.get('XGBoost', False):
        models['XGBoost'] = xgb.XGBRegressor()

    if MODELS_TO_EVALUATE.get('LightGBM', False):
        models['LightGBM'] = lgb.LGBMRegressor()

    # Other models
    if MODELS_TO_EVALUATE.get('SVR', False):
        models['SVR'] = SVR()

    if MODELS_TO_EVALUATE.get('KNN', False):
        models['KNN'] = KNeighborsRegressor()

    # Neural network model
    if MODELS_TO_EVALUATE.get('MLP', False):
        models['MLP'] = MLPRegressor(random_state=42, max_iter=1000)

    # Gaussian Process Regression
    if MODELS_TO_EVALUATE.get('GPR', False):
        models['GPR'] = GaussianProcessRegressor(random_state=42)

    # Train and evaluate models
    results = []
    all_models = {}
    best_r2 = -float('inf')
    best_model = None
    best_model_name = None

    for name, model in models.items():
        print(f"Training {name}...")

        try:
            # Tune hyperparameters if a grid is provided
            if name in param_grids and len(param_grids[name]) > 0:
                model_class = model.__class__

                # Select the parameter grid based on the tuning method
                if TUNING_METHOD == 'bayesian' and name in param_ranges:
                    param_config = param_ranges[name]
                else:
                    param_config = param_grids[name]

                tuned_model, best_params = tune_hyperparameters(
                    model_class, param_config,
                    X_train_scaled, y_train,
                    method=TUNING_METHOD, cv=CV_FOLDS, n_iter=N_ITER,
                    model_name=name
                )
                print(f"  Best params: {best_params}")
                all_models[name] = tuned_model
            else:
                # Just fit the model with default parameters
                model.fit(X_train_scaled, y_train)
                all_models[name] = model

            # Evaluate the model
            metrics, y_pred = evaluate_model(all_models[name], X_test_scaled, y_test, name)
            results.append(metrics)

            print(f"  {name} - RMSE: {metrics['rmse']:.4f}, R²: {metrics['r2']:.4f}")

            # Track the best model
            if metrics['r2'] > best_r2:
                best_r2 = metrics['r2']
                best_model = all_models[name]
                best_model_name = name

        except Exception as e:
            print(f"Error training {name}: {e}")

    print(f"\nBest model: {best_model_name} (R² = {best_r2:.4f})")

    # Sort results by R² (higher is better)
    results.sort(key=lambda x: x['r2'], reverse=True)

    return results, best_model, all_models, scaler


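# Hedged end-to-end sketch (Approach 1 style), mirroring what main() does:
# split the pooled training runs, fit/tune every enabled model, then score the
# best one on the held-out run. Defined as a helper so it only runs if called.
def _example_standard_pipeline():
    train_df, val_df = load_data(file_paths, VALIDATION_FILE_INDEX)
    X, y, _ = preprocess_data(train_df, TARGET_COLUMN, EXCLUDED_COLUMNS, SELECTED_FEATURES)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)
    results, best_model, _, scaler = build_and_evaluate_models(X_tr, y_tr, X_te, y_te)
    X_val, y_val, _ = preprocess_data(val_df, TARGET_COLUMN, EXCLUDED_COLUMNS, SELECTED_FEATURES)
    metrics, _ = evaluate_model(best_model, scaler.transform(X_val), y_val,
                                results[0]['model'] + " (Validation)")
    print(metrics)

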
def plot_feature_importance(model, feature_names, model_name, top_n=15):
    """
    Plot feature importance for tree-based or linear models.

    Args:
        model: Trained model
        feature_names: List of feature names
        model_name: Name of the model
        top_n: Number of top features to plot
    """
    plt.figure(figsize=(12, 8))

    # Get feature importance
    if hasattr(model, 'feature_importances_'):
        # For tree-based models
        importances = model.feature_importances_
    elif hasattr(model, 'coef_'):
        # For linear models
        importances = np.abs(model.coef_)
    else:
        print(f"Model {model_name} does not have built-in feature importance.")
        return

    # Sort features by importance
    indices = np.argsort(importances)[::-1]

    # Plot the top N features
    top_indices = indices[:top_n]
    top_importances = importances[top_indices]
    top_features = [feature_names[i] for i in top_indices]

    plt.barh(range(len(top_indices)), top_importances, align='center')
    plt.yticks(range(len(top_indices)), top_features)
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature')
    plt.title(f'Top {top_n} Feature Importance - {model_name}')
    plt.tight_layout()
    plt.show()
    plt.close()


def plot_actual_vs_predicted(y_true, y_pred, model_name, approach):
    """
    Plot actual vs. predicted values.

    Args:
        y_true: Actual target values
        y_pred: Predicted target values
        model_name: Name of the model
        approach: Regression approach used
    """
    plt.figure(figsize=(12, 8))

    # Create scatter plot
    plt.scatter(y_true, y_pred, alpha=0.5)

    # Add perfect prediction line
    max_val = max(np.max(y_true), np.max(y_pred))
    min_val = min(np.min(y_true), np.min(y_pred))
    plt.plot([min_val, max_val], [min_val, max_val], 'r--', lw=2)

    # Add labels and title
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title(f'Actual vs Predicted - {model_name} (Approach {approach})')

    # Calculate metrics for the plot
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    # Add text with metrics
    plt.annotate(f'RMSE: {rmse:.4f}\nR²: {r2:.4f}',
                 xy=(0.05, 0.9), xycoords='axes fraction',
                 bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))

    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    plt.close()


def plot_time_series_prediction(y_true, y_pred, model_name, approach):
    """
    Plot the time series of actual and predicted values.

    Args:
        y_true: Actual target values
        y_pred: Predicted target values
        model_name: Name of the model
        approach: Regression approach used
    """
    plt.figure(figsize=(15, 8))

    # Create time series plot
    plt.plot(range(len(y_true)), y_true, label='Actual', color='blue', alpha=0.7)
    plt.plot(range(len(y_pred)), y_pred, label='Predicted', color='red', alpha=0.7)

    # Add labels and title
    plt.xlabel('Time Step')
    plt.ylabel('Rel. Piston Trav')
    plt.title(f'Time Series Prediction - {model_name} (Approach {approach})')
    plt.legend()

    # Calculate metrics for the plot
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    # Add text with metrics
    plt.annotate(f'RMSE: {rmse:.4f}\nR²: {r2:.4f}',
                 xy=(0.05, 0.9), xycoords='axes fraction',
                 bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))

    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    plt.close()


def plot_residuals(y_true, y_pred, model_name, approach):
    """
    Plot residuals.

    Args:
        y_true: Actual target values
        y_pred: Predicted target values
        model_name: Name of the model
        approach: Regression approach used
    """
    residuals = y_true - y_pred

    # Create figure with 2 subplots
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # Plot 1: Residuals vs. predicted
    axes[0].scatter(y_pred, residuals, alpha=0.5)
    axes[0].axhline(y=0, color='r', linestyle='--')
    axes[0].set_xlabel('Predicted Values')
    axes[0].set_ylabel('Residuals')
    axes[0].set_title('Residuals vs Predicted Values')
    axes[0].grid(True, alpha=0.3)

    # Plot 2: Residual distribution
    sns.histplot(residuals, kde=True, ax=axes[1])
    axes[1].axvline(x=0, color='r', linestyle='--')
    axes[1].set_xlabel('Residual Value')
    axes[1].set_ylabel('Frequency')
    axes[1].set_title('Residual Distribution')

    # Add overall title
    plt.suptitle(f'Residual Analysis - {model_name} (Approach {approach})', y=1.05)

    plt.tight_layout()
    plt.show()
    plt.close()


def plot_model_comparison(y_true, all_predictions, approach, metric='rmse'):
    """
    Create a comparison plot for all models' predictions.

    Args:
        y_true: The actual target values
        all_predictions: Dictionary with model names as keys and predictions as values
        approach: The regression approach used
        metric: Metric to sort by ('rmse' or 'r2')
    """
    # Calculate metrics for all models
    metrics = {}
    for model_name, y_pred in all_predictions.items():
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        r2 = r2_score(y_true, y_pred)
        metrics[model_name] = {'rmse': rmse, 'r2': r2}

    # Sort models by the specified metric
    if metric.lower() == 'rmse':
        sorted_models = sorted(metrics.keys(), key=lambda m: metrics[m]['rmse'])
    else:  # r2: higher is better
        sorted_models = sorted(metrics.keys(), key=lambda m: -metrics[m]['r2'])

    # Create figure with 2 subplots: time series and metrics comparison
    fig, axes = plt.subplots(2, 1, figsize=(15, 12))

    # Plot 1: Time series of all model predictions (just a section for clarity)
    max_display_points = 500  # Avoid overcrowding the plot
    display_length = min(len(y_true), max_display_points)
    start_idx = 0
    if len(y_true) > max_display_points:
        # Choose a representative middle section
        start_idx = (len(y_true) - max_display_points) // 2
    end_idx = start_idx + display_length

    # Plot actual values
    axes[0].plot(range(display_length), y_true[start_idx:end_idx],
                 label='Actual', linewidth=2, color='black')

    # Plot predictions for each model
    colors = plt.cm.tab10.colors  # Use a colormap for different models
    for i, model_name in enumerate(sorted_models):
        y_pred = all_predictions[model_name][start_idx:end_idx]
        axes[0].plot(range(display_length), y_pred,
                     label=f'{model_name}', linewidth=1.5, alpha=0.7,
                     color=colors[i % len(colors)])

    axes[0].set_xlabel('Time Step')
    axes[0].set_ylabel('Target Value')
    axes[0].set_title(f'Model Predictions Comparison (Section {start_idx}-{end_idx})')
    axes[0].legend(loc='best')
    axes[0].grid(True, alpha=0.3)

    # Plot 2: Bar chart of model metrics
    x = np.arange(len(sorted_models))
    width = 0.35

    # RMSE bars (lower is better)
    rmse_values = [metrics[model]['rmse'] for model in sorted_models]
    axes[1].bar(x - width / 2, rmse_values, width, label='RMSE', color='red', alpha=0.7)

    # R² bars (higher is better)
    r2_values = [metrics[model]['r2'] for model in sorted_models]
    axes[1].bar(x + width / 2, r2_values, width, label='R²', color='blue', alpha=0.7)

    # Add labels and annotations
    axes[1].set_xlabel('Model')
    axes[1].set_ylabel('Metric Value')
    axes[1].set_title('Performance Metrics Comparison')
    axes[1].set_xticks(x)
    axes[1].set_xticklabels(sorted_models, rotation=45, ha='right')
    axes[1].legend()

    # Add value labels on the bars
    for i, v in enumerate(rmse_values):
        axes[1].text(i - width / 2, v + 0.02, f'{v:.3f}', ha='center', va='bottom', fontsize=8)

    for i, v in enumerate(r2_values):
        axes[1].text(i + width / 2, v + 0.02, f'{v:.3f}', ha='center', va='bottom', fontsize=8)

    # Add horizontal line at 0 for R² reference
    axes[1].axhline(y=0, color='gray', linestyle='--', alpha=0.7)

    # Add a secondary y-axis to better read the R² values
    ax2 = axes[1].twinx()
    ax2.set_ylabel('R² Score', color='blue')
    ax2.set_ylim(min(r2_values) - 0.1, max(r2_values) + 0.1)
    ax2.tick_params(axis='y', labelcolor='blue')

    plt.suptitle(f'Model Comparison Summary - Approach {approach}', fontsize=16, y=0.98)
    plt.tight_layout()
    plt.show()
    plt.close()


def plot_smoothing_comparison(y_true, y_pred_orig, y_pred_medium, y_pred_high, model_name):
    """
    Compare different smoothing approaches for the virtual experiment.

    Args:
        y_true: Actual target values
        y_pred_orig: Predicted values without smoothing
        y_pred_medium: Predicted values with medium smoothing
        y_pred_high: Predicted values with high smoothing
        model_name: Name of the model
    """
    # Create figure with 2 subplots
    fig, axes = plt.subplots(2, 1, figsize=(15, 12))

    # Calculate error metrics
    rmse_orig = np.sqrt(mean_squared_error(y_true, y_pred_orig))
    rmse_medium = np.sqrt(mean_squared_error(y_true, y_pred_medium))
    rmse_high = np.sqrt(mean_squared_error(y_true, y_pred_high))

    r2_orig = r2_score(y_true, y_pred_orig)
    r2_medium = r2_score(y_true, y_pred_medium)
    r2_high = r2_score(y_true, y_pred_high)

    # Plot 1: Full time series
    axes[0].plot(y_true, label='Actual', color='blue', alpha=0.7)
    axes[0].plot(y_pred_orig, label=f'No Smoothing (RMSE={rmse_orig:.4f}, R²={r2_orig:.4f})',
                 color='red', alpha=0.7)
    axes[0].plot(y_pred_medium, label=f'Medium Smoothing (RMSE={rmse_medium:.4f}, R²={r2_medium:.4f})',
                 color='green', alpha=0.7)
    axes[0].plot(y_pred_high, label=f'High Smoothing (RMSE={rmse_high:.4f}, R²={r2_high:.4f})',
                 color='purple', alpha=0.7)

    axes[0].set_xlabel('Time Step')
    axes[0].set_ylabel('Rel. Piston Trav')
    axes[0].set_title('Full Time Series Comparison')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    # Identify plateau regions in the original prediction
    plateaus = []
    consecutive_same = 0
    last_val = None

    for i, val in enumerate(y_pred_orig):
        if last_val is not None and abs(val - last_val) < 1e-6:
            consecutive_same += 1
            if consecutive_same >= 5:  # Consider it a plateau after 5 equal values
                if len(plateaus) == 0 or plateaus[-1][1] < i - consecutive_same:
                    plateaus.append((i - consecutive_same, i))  # (start, end)
                else:
                    plateaus[-1] = (plateaus[-1][0], i)  # Extend the last plateau
        else:
            consecutive_same = 0
        last_val = val

    # Plot 2: Zoom in on a problematic region with plateaus
    if plateaus:
        # Find the longest plateau
        longest_plateau = max(plateaus, key=lambda x: x[1] - x[0])
        start_idx = max(0, longest_plateau[0] - 10)
        end_idx = min(len(y_true), longest_plateau[1] + 10)

        # Zoomed in on the plateau region
        axes[1].plot(range(start_idx, end_idx), y_true[start_idx:end_idx],
                     label='Actual', color='blue', alpha=0.7)
        axes[1].plot(range(start_idx, end_idx), y_pred_orig[start_idx:end_idx],
                     label='No Smoothing', color='red', alpha=0.7)
        axes[1].plot(range(start_idx, end_idx), y_pred_medium[start_idx:end_idx],
                     label='Medium Smoothing', color='green', alpha=0.7)
        axes[1].plot(range(start_idx, end_idx), y_pred_high[start_idx:end_idx],
                     label='High Smoothing', color='purple', alpha=0.7)

        # Highlight the plateau region
        plt_start, plt_end = longest_plateau
        axes[1].axvspan(plt_start, plt_end, alpha=0.2, color='yellow', label='Plateau')

        axes[1].set_xlabel('Time Step')
        axes[1].set_ylabel('Rel. Piston Trav')
        axes[1].set_title(f'Zoomed View of Plateau Region (Steps {start_idx}-{end_idx})')
        axes[1].legend()
        axes[1].grid(True, alpha=0.3)
    else:
        # If no plateaus were found, just show a zoomed portion of the middle
        mid = len(y_true) // 2
        start_idx = max(0, mid - 50)
        end_idx = min(len(y_true), mid + 50)

        axes[1].plot(range(start_idx, end_idx), y_true[start_idx:end_idx],
                     label='Actual', color='blue', alpha=0.7)
        axes[1].plot(range(start_idx, end_idx), y_pred_orig[start_idx:end_idx],
                     label='No Smoothing', color='red', alpha=0.7)
        axes[1].plot(range(start_idx, end_idx), y_pred_medium[start_idx:end_idx],
                     label='Medium Smoothing', color='green', alpha=0.7)
        axes[1].plot(range(start_idx, end_idx), y_pred_high[start_idx:end_idx],
                     label='High Smoothing', color='purple', alpha=0.7)

        axes[1].set_xlabel('Time Step')
        axes[1].set_ylabel('Rel. Piston Trav')
        axes[1].set_title(f'Zoomed View (Steps {start_idx}-{end_idx})')
        axes[1].legend()
        axes[1].grid(True, alpha=0.3)

    # Create a third plot showing the differences between smoothed and unsmoothed
    fig2, axes2 = plt.subplots(1, 2, figsize=(15, 6))

    # Plot differences between predictions
    diff_medium = y_pred_medium - y_pred_orig
    diff_high = y_pred_high - y_pred_orig

    axes2[0].plot(diff_medium, label='Medium Smoothing Diff', color='green', alpha=0.7)
    axes2[0].plot(diff_high, label='High Smoothing Diff', color='purple', alpha=0.7)
    axes2[0].axhline(y=0, color='r', linestyle='--')
    axes2[0].set_xlabel('Time Step')
    axes2[0].set_ylabel('Difference from Original')
    axes2[0].set_title('Difference Between Smoothed and Original Predictions')
    axes2[0].legend()
    axes2[0].grid(True, alpha=0.3)

    # Histogram of consecutive equal values in the original prediction
    consecutive_counts = []
    count = 1
    for i in range(1, len(y_pred_orig)):
        if abs(y_pred_orig[i] - y_pred_orig[i - 1]) < 1e-6:
            count += 1
        else:
            if count > 1:
                consecutive_counts.append(count)
            count = 1
    if count > 1:
        consecutive_counts.append(count)

    if consecutive_counts:
        sns.histplot(consecutive_counts, bins=20, kde=True, ax=axes2[1])
        axes2[1].set_xlabel('Consecutive Same Value Count')
        axes2[1].set_ylabel('Frequency')
        axes2[1].set_title('Distribution of Plateau Lengths')
    else:
        axes2[1].text(0.5, 0.5, 'No plateaus detected', ha='center', va='center', fontsize=14)
        axes2[1].set_xlabel('Consecutive Same Value Count')
        axes2[1].set_ylabel('Frequency')
        axes2[1].set_title('Distribution of Plateau Lengths')

    # Add titles to the figures
    fig.suptitle(f'Smoothing Comparison - {model_name}', fontsize=16, y=0.98)
    fig2.suptitle(f'Smoothing Analysis - {model_name}', fontsize=16, y=0.98)

    plt.tight_layout()
    plt.show()
    plt.close('all')

    # Print summary comparison
    print("\nSmoothing Approaches Comparison:")
    print(f"  No Smoothing:     RMSE = {rmse_orig:.6f}, R² = {r2_orig:.6f}")
    print(f"  Medium Smoothing: RMSE = {rmse_medium:.6f}, R² = {r2_medium:.6f} "
          f"(ΔRMSE = {rmse_medium - rmse_orig:.6f})")
    print(f"  High Smoothing:   RMSE = {rmse_high:.6f}, R² = {r2_high:.6f} "
          f"(ΔRMSE = {rmse_high - rmse_orig:.6f})")

    # Calculate plateau statistics
    if plateaus:
        plateau_lengths = [end - start + 1 for start, end in plateaus]
        print("\nPlateau Statistics:")
        print(f"  Number of plateaus detected: {len(plateaus)}")
        print(f"  Average plateau length: {np.mean(plateau_lengths):.2f} steps")
        print(f"  Longest plateau: {max(plateau_lengths)} steps")
        print(f"  Total steps in plateaus: {sum(plateau_lengths)} "
              f"({100 * sum(plateau_lengths) / len(y_pred_orig):.2f}% of series)")
    else:
        print("\nNo significant plateaus detected in the prediction.")

    return rmse_orig, rmse_medium, rmse_high, r2_orig, r2_medium, r2_high


def main():
    """Main execution function."""
    print("SPS Sintering Regression Analysis")
    print(f"Running approach {APPROACH}")

    # Load data
    print("\nLoading data...")
    train_data, validation_data = load_data(file_paths, VALIDATION_FILE_INDEX)
    print(f"Training data shape: {train_data.shape}")
    print(f"Validation data shape: {validation_data.shape}")

    # Analyze precision issues in the target column
    all_data = pd.concat([train_data, validation_data])
    precision_results = analyze_target_precision(all_data, TARGET_COLUMN)

    # Preprocess data based on the selected approach
    print("\nPreprocessing data...")
    if APPROACH == 1:
        # Standard approach: predict the target from the current features only
        X_train, y_train, feature_names = preprocess_data(
            train_data, TARGET_COLUMN, EXCLUDED_COLUMNS, SELECTED_FEATURES)
        X_val, y_val, _ = preprocess_data(
            validation_data, TARGET_COLUMN, EXCLUDED_COLUMNS, SELECTED_FEATURES)

        # Split training data into train and test sets
        X_train_split, X_test, y_train_split, y_test = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42)

        # Build and evaluate models
        print("\nTraining and evaluating models...")
        results, best_model, all_models, scaler = build_and_evaluate_models(
            X_train_split, y_train_split, X_test, y_test)

        # Evaluate the best model on the validation data
        X_val_scaled = scaler.transform(X_val)
        val_metrics, y_val_pred = evaluate_model(best_model, X_val_scaled, y_val,
                                                 results[0]['model'] + " (Validation)")

        print("\nBest model performance on validation data:")
        print(f"  RMSE: {val_metrics['rmse']:.4f}")
        print(f"  R²: {val_metrics['r2']:.4f}")
        print(f"  MAE: {val_metrics['mae']:.4f}")

        # Plot results, starting with the best model
        best_model_name = results[0]['model']

        # Plot feature importance if available for the best model
        if hasattr(best_model, 'feature_importances_') or hasattr(best_model, 'coef_'):
            plot_feature_importance(best_model, feature_names, best_model_name)

        # Evaluate and plot for all models
        print("\nEvaluating all models on validation data:")
        all_predictions = {}
        for model_name, model in all_models.items():
            # Get predictions for this model (validation data already scaled above)
            model_metrics, model_y_val_pred = evaluate_model(
                model, X_val_scaled, y_val, f"{model_name} (Validation)")

            # Store predictions for the comparison plot
            all_predictions[model_name] = model_y_val_pred

            print(f"  {model_name} - RMSE: {model_metrics['rmse']:.4f}, "
                  f"R²: {model_metrics['r2']:.4f}, MAE: {model_metrics['mae']:.4f}")

            # Plot actual vs predicted
            plot_actual_vs_predicted(y_val, model_y_val_pred, model_name, APPROACH)

            # Plot time series prediction
            plot_time_series_prediction(y_val, model_y_val_pred, model_name, APPROACH)

            # Plot residuals
            plot_residuals(y_val, model_y_val_pred, model_name, APPROACH)

        # Plot comparison of all models
        plot_model_comparison(y_val, all_predictions, APPROACH)

    elif APPROACH == 2:
        # Window approach: use previous step data and target to predict the next step
        # First preprocess the data without windowing
        X_train, y_train, feature_names = preprocess_data(
            train_data, TARGET_COLUMN, EXCLUDED_COLUMNS, SELECTED_FEATURES)
        X_val, y_val, _ = preprocess_data(
            validation_data, TARGET_COLUMN, EXCLUDED_COLUMNS, SELECTED_FEATURES)

        # Create windowed data (include one previous step)
        window_size = 1
        X_train_window, y_train_window = prepare_window_data(X_train, y_train, window_size)
        X_val_window, y_val_window = prepare_window_data(X_val, y_val, window_size)

        # Create feature names for the windowed data
        window_feature_names = []
        for w in range(window_size + 1):
            prefix = "" if w == 0 else f"prev{w}_"
            window_feature_names.extend([f"{prefix}{name}" for name in feature_names])
        for w in range(1, window_size + 1):
            window_feature_names.append(f"prev{w}_{TARGET_COLUMN}")

        # Split training data into train and test sets
        X_train_split, X_test, y_train_split, y_test = train_test_split(
            X_train_window, y_train_window, test_size=0.2, random_state=42)

        # Build and evaluate models
        print("\nTraining and evaluating models (window approach)...")
        results, best_model, all_models, scaler = build_and_evaluate_models(
            X_train_split, y_train_split, X_test, y_test)

        # Evaluate the best model on the validation data
        X_val_window_scaled = scaler.transform(X_val_window)
        val_metrics, y_val_pred = evaluate_model(best_model, X_val_window_scaled, y_val_window,
                                                 results[0]['model'] + " (Validation)")

        print("\nBest model performance on validation data (window approach):")
        print(f"  RMSE: {val_metrics['rmse']:.4f}")
        print(f"  R²: {val_metrics['r2']:.4f}")
        print(f"  MAE: {val_metrics['mae']:.4f}")

        # Plot results, starting with the best model
        best_model_name = results[0]['model']

        # Plot feature importance if available for the best model
        if hasattr(best_model, 'feature_importances_') or hasattr(best_model, 'coef_'):
            plot_feature_importance(best_model, window_feature_names, best_model_name)

        # Evaluate and plot for all models
        print("\nEvaluating all models on validation data:")
        all_predictions = {}
        for model_name, model in all_models.items():
            # Get predictions for this model (validation data already scaled above)
            model_metrics, model_y_val_pred = evaluate_model(
                model, X_val_window_scaled, y_val_window, f"{model_name} (Validation)")

            # Store predictions for the comparison plot
            all_predictions[model_name] = model_y_val_pred

            print(f"  {model_name} - RMSE: {model_metrics['rmse']:.4f}, "
                  f"R²: {model_metrics['r2']:.4f}, MAE: {model_metrics['mae']:.4f}")

            # Plot actual vs predicted
            plot_actual_vs_predicted(y_val_window, model_y_val_pred, model_name, APPROACH)

            # Plot time series prediction
            plot_time_series_prediction(y_val_window, model_y_val_pred, model_name, APPROACH)

            # Plot residuals
            plot_residuals(y_val_window, model_y_val_pred, model_name, APPROACH)

        # Plot comparison of all models
        plot_model_comparison(y_val_window, all_predictions, APPROACH)

    elif APPROACH == 3:
        # Virtual experiment: like Approach 2, but subsequent predictions use the
        # previously *predicted* target instead of the actual one

        # First preprocess the data without windowing
        X_train, y_train, feature_names = preprocess_data(
            train_data, TARGET_COLUMN, EXCLUDED_COLUMNS, SELECTED_FEATURES)
        X_val, y_val, _ = preprocess_data(
            validation_data, TARGET_COLUMN, EXCLUDED_COLUMNS, SELECTED_FEATURES)

        # Create windowed data (include one previous step)
        window_size = 1
        X_train_window, y_train_window = prepare_window_data(X_train, y_train, window_size)
        X_val_window, y_val_window = prepare_window_data(X_val, y_val, window_size)

        # Create feature names for the windowed data
        window_feature_names = []
        for w in range(window_size + 1):
            prefix = "" if w == 0 else f"prev{w}_"
            window_feature_names.extend([f"{prefix}{name}" for name in feature_names])
        for w in range(1, window_size + 1):
            window_feature_names.append(f"prev{w}_{TARGET_COLUMN}")

        # Split training data into train and test sets
        X_train_split, X_test, y_train_split, y_test = train_test_split(
            X_train_window, y_train_window, test_size=0.2, random_state=42)

        # Build and evaluate models
        print("\nTraining and evaluating models (window approach)...")
        results, best_model, all_models, scaler = build_and_evaluate_models(
            X_train_split, y_train_split, X_test, y_test)

        # Run the virtual experiment with the best model
        print("\nRunning virtual experiment...")
        # Scale the validation data
        X_val_window_scaled = scaler.transform(X_val_window)

        # Regular window approach prediction for comparison
        val_metrics, y_val_window_pred = evaluate_model(
            best_model, X_val_window_scaled, y_val_window,
            results[0]['model'] + " (Window Approach)")

        # Virtual experiment approach
        y_virtual_pred, y_virtual_true = virtual_experiment(
            best_model, X_val_window_scaled, y_val, window_feature_names, window_size)

        # Calculate metrics for the virtual experiment
        virtual_mse = mean_squared_error(y_virtual_true, y_virtual_pred)
        virtual_rmse = np.sqrt(virtual_mse)
        virtual_mae = mean_absolute_error(y_virtual_true, y_virtual_pred)
        virtual_r2 = r2_score(y_virtual_true, y_virtual_pred)

        print("\nVirtual experiment results:")
        print(f"  RMSE: {virtual_rmse:.4f}")
        print(f"  R²: {virtual_r2:.4f}")
        print(f"  MAE: {virtual_mae:.4f}")

        # Compare with the window approach
        print("\nComparison with window approach:")
        print(f"  Window approach RMSE: {val_metrics['rmse']:.4f}, "
              f"Virtual experiment RMSE: {virtual_rmse:.4f}")
        print(f"  Window approach R²: {val_metrics['r2']:.4f}, "
              f"Virtual experiment R²: {virtual_r2:.4f}")

        # Run the virtual experiment for all models and plot the results
        print("\nRunning virtual experiment for all models:")
        all_predictions = {}
        all_true_values = None
        for model_name, model in all_models.items():
            # Run the virtual experiment for this model
            model_y_virtual_pred, model_y_virtual_true = virtual_experiment(
                model, X_val_window_scaled, y_val, window_feature_names, window_size)

            # Store for the comparison plot
            all_predictions[model_name] = model_y_virtual_pred
            if all_true_values is None:
                all_true_values = model_y_virtual_true  # Same for all models

            # Calculate metrics for this model's virtual experiment
            model_virtual_mse = mean_squared_error(model_y_virtual_true, model_y_virtual_pred)
            model_virtual_rmse = np.sqrt(model_virtual_mse)
            model_virtual_mae = mean_absolute_error(model_y_virtual_true, model_y_virtual_pred)
            model_virtual_r2 = r2_score(model_y_virtual_true, model_y_virtual_pred)

            print(f"  {model_name} - RMSE: {model_virtual_rmse:.4f}, "
                  f"R²: {model_virtual_r2:.4f}, MAE: {model_virtual_mae:.4f}")

            # Plot actual vs predicted
            plot_actual_vs_predicted(model_y_virtual_true, model_y_virtual_pred, model_name, APPROACH)

            # Plot time series prediction
            plot_time_series_prediction(model_y_virtual_true, model_y_virtual_pred, model_name, APPROACH)

            # Plot residuals
            plot_residuals(model_y_virtual_true, model_y_virtual_pred, model_name, APPROACH)

        # Plot comparison of all models
        plot_model_comparison(all_true_values, all_predictions, APPROACH)

    else:
        raise ValueError(f"Unknown approach: {APPROACH}")

    print("\nAnalysis complete!")


if __name__ == "__main__":
    main()