SPS/data-exploration.py

265 рядки
8.7 KiB
Python

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Define file paths
file_paths = [
'160508-1021-1000,0min,56kN.csv',
'160508-1022-900,0min,56kN.csv',
'200508-1023-1350,0min,56kN.csv',
'200508-1024-1200,0min,56kN.csv'
]
def load_and_explore_data(file_paths):
"""
Load all CSV files and perform exploratory data analysis.
Args:
file_paths: List of CSV file paths
"""
all_data = []
print("Loading and exploring data files...")
for i, file_path in enumerate(file_paths):
print(f"\nFile {i + 1}: {file_path}")
# Read the CSV file with proper settings for European number format
try:
df = pd.read_csv(file_path, sep=';', decimal=',', header=0)
# Add a file identifier column
df['file_id'] = i
all_data.append(df)
# Display basic information
print(f" Rows: {df.shape[0]}, Columns: {df.shape[1]}")
print(" First few rows:")
print(df.head(3).to_string())
# Check for missing values
missing_values = df.isnull().sum()
if missing_values.sum() > 0:
print("\n Missing values:")
print(missing_values[missing_values > 0])
# Analyze target variable
target_col = 'Rel. Piston Trav'
if target_col in df.columns:
print(f"\n {target_col} statistics:")
print(f" Min: {df[target_col].min()}")
print(f" Max: {df[target_col].max()}")
print(f" Mean: {df[target_col].mean():.4f}")
print(f" Std Dev: {df[target_col].std():.4f}")
print(f" Unique values: {df[target_col].nunique()}")
# Check for precision issues
decimal_places = df[target_col].astype(str).str.split('.').str[1].str.len().max()
print(f" Decimal places: {decimal_places}")
# Quick correlation analysis
if target_col in df.columns:
# Get correlations with target
corr = df.corr()[target_col].sort_values(ascending=False)
print("\n Top 5 correlations with target:")
print(corr.head(6).to_string()) # +1 to include the target itself
print("\n Bottom 5 correlations with target:")
print(corr.tail(5).to_string())
except Exception as e:
print(f"Error loading {file_path}: {e}")
# Combine all data for overall analysis
if all_data:
combined_df = pd.concat(all_data, ignore_index=True)
print("\nCombined dataset:")
print(f" Total rows: {combined_df.shape[0]}, Columns: {combined_df.shape[1]}")
return combined_df
return None
def plot_target_variable(df, target_col='Rel. Piston Trav'):
"""
Create visualizations for the target variable.
Args:
df: DataFrame with all data
target_col: Name of target column
"""
if target_col not in df.columns:
print(f"Target column '{target_col}' not found in data")
return
print(f"\nGenerating plots for {target_col}...")
# Create a copy of the dataframe with only numeric columns for correlation analysis
numeric_df = df.select_dtypes(include=[np.number])
# Set up figure with multiple subplots
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
# Plot 1: Distribution of target variable
sns.histplot(df[target_col], kde=True, ax=axes[0, 0])
axes[0, 0].set_title(f'Distribution of {target_col}')
axes[0, 0].set_xlabel(target_col)
axes[0, 0].set_ylabel('Frequency')
# Plot 2: Target variable by file
sns.boxplot(x='file_id', y=target_col, data=df, ax=axes[0, 1])
axes[0, 1].set_title(f'{target_col} by File')
axes[0, 1].set_xlabel('File ID')
axes[0, 1].set_ylabel(target_col)
# Plot 3: Target variable over time (for first 1000 points)
sample_size = min(1000, df.shape[0])
axes[1, 0].plot(df['Nr.'].head(sample_size), df[target_col].head(sample_size))
axes[1, 0].set_title(f'{target_col} Over Time (First {sample_size} Points)')
axes[1, 0].set_xlabel('Record Number')
axes[1, 0].set_ylabel(target_col)
# Plot 4: Correlation heatmap (top correlated features)
try:
# Get absolute correlations with target from numeric columns only
corr = numeric_df.corr()[target_col].abs().sort_values(ascending=False)
top_features = corr.head(10).index # Top 10 features
# Create correlation matrix for selected features
corr_matrix = numeric_df[top_features].corr()
# Plot heatmap
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=axes[1, 1])
axes[1, 1].set_title('Correlation Heatmap (Top Features)')
# Prepare for additional plot with top features
top_correlated = corr.head(6).index.tolist()
if target_col in top_correlated:
top_correlated.remove(target_col) # Exclude target itself
top_correlated = top_correlated[:4] # Get top 4
except Exception as e:
print(f"Error calculating correlations: {e}")
axes[1, 1].set_title('Correlation Heatmap (Error occurred)')
top_correlated = []
plt.tight_layout()
plt.show() # Add this line to display the plot
plt.close()
# Additional plot: Scatter plots of top correlated features vs target
if top_correlated:
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.flatten()
for i, feature in enumerate(top_correlated):
if i < 4: # Plot top 4 correlated features
sns.scatterplot(x=feature, y=target_col, data=df.sample(min(1000, df.shape[0])),
alpha=0.5, ax=axes[i])
axes[i].set_title(f'{feature} vs {target_col}')
# Hide unused subplots
for j in range(len(top_correlated), len(axes)):
axes[j].set_visible(False)
plt.tight_layout()
plt.show()
plt.close()
def analyze_feature_distributions(df):
"""
Analyze the distributions of key features.
Args:
df: DataFrame with all data
"""
print("\nAnalyzing feature distributions...")
# Identify numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# Remove certain columns we don't need to visualize
cols_to_exclude = ['Nr.', 'file_id', 'Abs. Piston Trav']
feature_cols = [col for col in numeric_cols if col not in cols_to_exclude]
# Select top features based on data exploration
selected_features = [
'MTC1', 'MTC2', 'MTC3', 'Pyrometer', 'SV Temperature',
'SV Power', 'SV Force', 'AV Force', 'AV Speed',
'I RMS', 'U RMS', 'Heating power'
]
# Ensure all selected features exist in the dataframe
selected_features = [f for f in selected_features if f in df.columns]
# Create distribution plots for selected features
n_cols = 3
n_rows = (len(selected_features) + n_cols - 1) // n_cols
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4))
axes = axes.flatten()
for i, feature in enumerate(selected_features):
if i < len(axes):
sns.histplot(df[feature].dropna(), kde=True, ax=axes[i])
axes[i].set_title(f'Distribution of {feature}')
axes[i].set_xlabel(feature)
# Hide unused subplots
for j in range(len(selected_features), len(axes)):
axes[j].set_visible(False)
plt.tight_layout()
plt.show()
plt.close()
def plot_time_series_by_file(df, target_col='Rel. Piston Trav'):
"""
Plot time series of target variable for each file.
Args:
df: DataFrame with all data
target_col: Name of target column
"""
print("\nPlotting time series by file...")
# Create a figure
plt.figure(figsize=(15, 8))
# Plot for each file ID
for file_id in df['file_id'].unique():
file_data = df[df['file_id'] == file_id]
plt.plot(range(len(file_data)), file_data[target_col],
label=f'File {file_id}', alpha=0.7)
plt.title(f'{target_col} Time Series by File')
plt.xlabel('Time Step')
plt.ylabel(target_col)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()
plt.close()
def main():
"""Main execution function"""
print("SPS Sintering Data Exploration")
# Load and explore data
combined_df = load_and_explore_data(file_paths)
if combined_df is not None:
# Generate plots
plot_target_variable(combined_df)
analyze_feature_distributions(combined_df)
plot_time_series_by_file(combined_df)
print("\nData exploration complete.")
if __name__ == "__main__":
main()