# SPS/data-exploration.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

# Input CSV files; a file's position in this list becomes its file_id
file_paths = [
    "160508-1021-1000,0min,56kN.csv",
    "160508-1022-900,0min,56kN.csv",
    "200508-1023-1350,0min,56kN.csv",
    "200508-1024-1200,0min,56kN.csv",
]


def _summarize_file(df, target_col):
    """
    Print a short exploratory report for one loaded file.

    Args:
        df: DataFrame holding the contents of a single CSV file
        target_col: Name of the target column to describe, if present
    """
    # Display basic information
    print(f"  Rows: {df.shape[0]}, Columns: {df.shape[1]}")
    print("  First few rows:")
    print(df.head(3).to_string())

    # Check for missing values
    missing_values = df.isnull().sum()
    if missing_values.sum() > 0:
        print("\n  Missing values:")
        print(missing_values[missing_values > 0])

    if target_col not in df.columns:
        return

    # Analyze target variable
    target = df[target_col]
    print(f"\n  {target_col} statistics:")
    print(f"    Min: {target.min()}")
    print(f"    Max: {target.max()}")
    print(f"    Mean: {target.mean():.4f}")
    print(f"    Std Dev: {target.std():.4f}")
    print(f"    Unique values: {target.nunique()}")

    # Check for precision issues: longest fractional part in the string form
    decimal_places = target.astype(str).str.split('.').str[1].str.len().max()
    print(f"    Decimal places: {decimal_places}")

    # Quick correlation analysis. Restrict to numeric columns first:
    # DataFrame.corr() raises on text columns in recent pandas versions.
    numeric_df = df.select_dtypes(include=[np.number])
    if target_col not in numeric_df.columns:
        return

    corr = numeric_df.corr()[target_col].sort_values(ascending=False)
    print("\n  Top 5 correlations with target:")
    print(corr.head(6).to_string())  # +1 to include the target itself
    print("\n  Bottom 5 correlations with target:")
    print(corr.tail(5).to_string())


def load_and_explore_data(file_paths):
    """
    Load all CSV files and perform exploratory data analysis.

    Each file is read with ';' as separator and ',' as decimal mark, tagged
    with a 'file_id' column holding its position in file_paths, and
    summarized on stdout. A file that cannot be read is reported and skipped;
    a file that loads but whose summary fails is still kept in the result.

    Args:
        file_paths: List of CSV file paths

    Returns:
        The concatenation of all successfully loaded files (with a fresh
        index), or None if no file could be loaded.
    """
    target_col = 'Rel. Piston Trav'
    all_data = []

    print("Loading and exploring data files...")

    for i, file_path in enumerate(file_paths):
        print(f"\nFile {i + 1}: {file_path}")

        # Read the CSV file with proper settings for European number format.
        # Best effort: one unreadable file must not abort the whole run.
        try:
            df = pd.read_csv(file_path, sep=';', decimal=',', header=0)
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            continue

        # Add a file identifier column
        df['file_id'] = i
        all_data.append(df)

        # Reporting problems are kept apart from loading problems so the
        # message does not claim a load failure for data that was loaded.
        try:
            _summarize_file(df, target_col)
        except Exception as e:
            print(f"Error analyzing {file_path}: {e}")

    if not all_data:
        return None

    # Combine all data for overall analysis
    combined_df = pd.concat(all_data, ignore_index=True)
    print("\nCombined dataset:")
    print(f"  Total rows: {combined_df.shape[0]}, Columns: {combined_df.shape[1]}")

    return combined_df


def plot_target_variable(df, target_col='Rel. Piston Trav'):
    """
    Create visualizations for the target variable.

    Shows a 2x2 overview figure (distribution, box plot per file, the first
    points as a line plot, correlation heatmap) and, when correlations could
    be computed, a second figure with scatter plots of the most correlated
    features against the target.

    Args:
        df: DataFrame with all data
        target_col: Name of target column

    Returns:
        None. Prints a message and returns early if target_col is missing.
    """
    if target_col not in df.columns:
        print(f"Target column '{target_col}' not found in data")
        return

    print(f"\nGenerating plots for {target_col}...")

    # Only numeric columns take part in the correlation analysis
    numeric_df = df.select_dtypes(include=[np.number])

    # Set up figure with multiple subplots
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Plot 1: Distribution of target variable
    sns.histplot(df[target_col], kde=True, ax=axes[0, 0])
    axes[0, 0].set_title(f'Distribution of {target_col}')
    axes[0, 0].set_xlabel(target_col)
    axes[0, 0].set_ylabel('Frequency')

    # Plot 2: Target variable by file (single box if there is no file_id)
    if 'file_id' in df.columns:
        sns.boxplot(x='file_id', y=target_col, data=df, ax=axes[0, 1])
    else:
        sns.boxplot(y=df[target_col], ax=axes[0, 1])
    axes[0, 1].set_title(f'{target_col} by File')
    axes[0, 1].set_xlabel('File ID')
    axes[0, 1].set_ylabel(target_col)

    # Plot 3: Target variable over time (for first 1000 points)
    sample_size = min(1000, df.shape[0])
    first_rows = df.head(sample_size)
    # Fall back to the row position when the record-number column is absent
    if 'Nr.' in df.columns:
        x_values = first_rows['Nr.']
    else:
        x_values = np.arange(len(first_rows))
    axes[1, 0].plot(x_values, first_rows[target_col])
    axes[1, 0].set_title(f'{target_col} Over Time (First {sample_size} Points)')
    axes[1, 0].set_xlabel('Record Number')
    axes[1, 0].set_ylabel(target_col)

    # Plot 4: Correlation heatmap (top correlated features)
    try:
        # Get absolute correlations with target from numeric columns only
        corr = numeric_df.corr()[target_col].abs().sort_values(ascending=False)
        top_features = corr.head(10).index  # Top 10 features

        # Create correlation matrix for selected features
        corr_matrix = numeric_df[top_features].corr()

        # Plot heatmap
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=axes[1, 1])
        axes[1, 1].set_title('Correlation Heatmap (Top Features)')

        # Take one extra candidate so four remain after dropping the target
        top_correlated = corr.head(6).index.tolist()
        if target_col in top_correlated:
            top_correlated.remove(target_col)  # Exclude target itself
        top_correlated = top_correlated[:4]  # Get top 4

    except Exception as e:
        # Keep the other three panels even if the correlation step fails
        print(f"Error calculating correlations: {e}")
        axes[1, 1].set_title('Correlation Heatmap (Error occurred)')
        top_correlated = []

    plt.tight_layout()
    plt.show()
    plt.close(fig)

    if not top_correlated:
        return

    # Additional plot: Scatter plots of top correlated features vs target
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    axes = axes.flatten()
    n_points = min(1000, df.shape[0])

    # top_correlated holds at most four names, one per subplot
    for ax, feature in zip(axes, top_correlated):
        sns.scatterplot(x=feature, y=target_col, data=df.sample(n_points),
                        alpha=0.5, ax=ax)
        ax.set_title(f'{feature} vs {target_col}')

    # Hide unused subplots
    for ax in axes[len(top_correlated):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()
    plt.close(fig)


def analyze_feature_distributions(df):
    """
    Analyze the distributions of key features.

    Draws one histogram (with KDE) per feature from a fixed list of column
    names, restricted to those present in df, on a grid three plots wide.

    Args:
        df: DataFrame with all data

    Returns:
        None. If none of the expected columns is present, a message is
        printed and no figure is created.
    """
    print("\nAnalyzing feature distributions...")

    # Select top features based on data exploration
    candidate_features = [
        'MTC1', 'MTC2', 'MTC3', 'Pyrometer', 'SV Temperature',
        'SV Power', 'SV Force', 'AV Force', 'AV Speed',
        'I RMS', 'U RMS', 'Heating power'
    ]

    # Ensure all selected features exist in the dataframe
    selected_features = [f for f in candidate_features if f in df.columns]

    # A grid with zero rows cannot be created, so stop before plotting
    if not selected_features:
        print("  No known feature columns found in data; skipping distribution plots")
        return

    # Create distribution plots for selected features
    n_cols = 3
    n_rows = (len(selected_features) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False keeps axes two-dimensional for any grid shape
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4), squeeze=False)
    axes = axes.flatten()

    for ax, feature in zip(axes, selected_features):
        sns.histplot(df[feature].dropna(), kde=True, ax=ax)
        ax.set_title(f'Distribution of {feature}')
        ax.set_xlabel(feature)

    # Hide unused subplots
    for ax in axes[len(selected_features):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()
    plt.close(fig)


def plot_time_series_by_file(df, target_col='Rel. Piston Trav'):
    """
    Plot time series of target variable for each file.

    Every distinct value of the 'file_id' column gets its own line, drawn
    against the row position within that file.

    Args:
        df: DataFrame with all data
        target_col: Name of target column

    Returns:
        None. Prints a message and returns without plotting if target_col
        or 'file_id' is missing from df.
    """
    # Validate before opening a figure so no empty window is left behind
    if target_col not in df.columns:
        print(f"Target column '{target_col}' not found in data")
        return
    if 'file_id' not in df.columns:
        print("Column 'file_id' not found in data")
        return

    print("\nPlotting time series by file...")

    # Create a figure
    fig = plt.figure(figsize=(15, 8))

    # Plot for each file ID
    for file_id in df['file_id'].unique():
        file_data = df[df['file_id'] == file_id]
        # x is the position within the file, so all files start at 0
        plt.plot(range(len(file_data)), file_data[target_col],
                 label=f'File {file_id}', alpha=0.7)

    plt.title(f'{target_col} Time Series by File')
    plt.xlabel('Time Step')
    plt.ylabel(target_col)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
    plt.close(fig)


def main():
    """Load the configured CSV files and run each exploration step on them."""
    print("SPS Sintering Data Exploration")

    combined_df = load_and_explore_data(file_paths)
    if combined_df is None:
        # No file could be loaded, so there is nothing to plot
        return

    # Run the plotting steps in their fixed order on the combined data
    plotting_steps = (
        plot_target_variable,
        analyze_feature_distributions,
        plot_time_series_by_file,
    )
    for step in plotting_steps:
        step(combined_df)

    print("\nData exploration complete.")


if __name__ == "__main__":
    main()