# Listing metadata from the code viewer: 265 lines, 8.7 KiB, Python.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# CSV files to explore; the bare names are resolved against the current
# working directory when they are handed to pandas.
file_paths = [
    '160508-1021-1000,0min,56kN.csv',
    '160508-1022-900,0min,56kN.csv',
    '200508-1023-1350,0min,56kN.csv',
    '200508-1024-1200,0min,56kN.csv',
]


def _describe_target(df, target_col):
    """Print summary statistics and correlations of target_col for one file."""
    if target_col not in df.columns:
        return

    # Moments and correlations are only defined for numeric data. Restricting
    # to numeric columns also keeps DataFrame.corr() from raising on text
    # columns (pandas >= 2.0 no longer drops them silently).
    numeric_df = df.select_dtypes(include=[np.number])
    if target_col not in numeric_df.columns:
        print(f"\n {target_col} is not numeric; skipping statistics")
        return

    target = df[target_col]
    print(f"\n {target_col} statistics:")
    print(f" Min: {target.min()}")
    print(f" Max: {target.max()}")
    print(f" Mean: {target.mean():.4f}")
    print(f" Std Dev: {target.std():.4f}")
    print(f" Unique values: {target.nunique()}")

    # Check for precision issues: length of the longest fractional part in
    # the values' string form. Values without a '.' contribute nothing, so
    # fall back to 0 instead of printing NaN when no value has a fraction.
    fraction_lengths = target.astype(str).str.split('.').str[1].str.len()
    if fraction_lengths.notna().any():
        decimal_places = int(fraction_lengths.max())
    else:
        decimal_places = 0
    print(f" Decimal places: {decimal_places}")

    # Quick correlation analysis against every numeric column
    corr = numeric_df.corr()[target_col].sort_values(ascending=False)
    print("\n Top 5 correlations with target:")
    print(corr.head(6).to_string())  # +1 to include the target itself
    print("\n Bottom 5 correlations with target:")
    print(corr.tail(5).to_string())


def load_and_explore_data(file_paths, target_col='Rel. Piston Trav'):
    """
    Load all CSV files and perform exploratory data analysis.

    Each file is read as a semicolon-separated CSV with ',' as the decimal
    mark, tagged with a 'file_id' column holding its position in file_paths,
    and summarised on stdout. Files that cannot be read are reported and
    skipped; the remaining files are still processed.

    Args:
        file_paths: List of CSV file paths
        target_col: Name of the column to report statistics and correlations
            for. Files without this column are loaded but not analysed.

    Returns:
        A single DataFrame with the rows of every successfully loaded file
        (fresh integer index), or None when no file could be loaded.
    """
    all_data = []

    print("Loading and exploring data files...")

    for i, file_path in enumerate(file_paths):
        print(f"\nFile {i + 1}: {file_path}")

        # Read the CSV file with proper settings for European number format.
        # Only the read itself is guarded, so a problem in the analysis below
        # is never misreported as a loading error. OSError covers missing or
        # unreadable files; pandas' parser and empty-data errors as well as
        # decoding errors are ValueError subclasses.
        try:
            df = pd.read_csv(file_path, sep=';', decimal=',', header=0)
        except (OSError, ValueError) as e:
            print(f"Error loading {file_path}: {e}")
            continue

        # Add a file identifier column
        df['file_id'] = i
        all_data.append(df)

        # Display basic information
        print(f" Rows: {df.shape[0]}, Columns: {df.shape[1]}")
        print(" First few rows:")
        print(df.head(3).to_string())

        # Check for missing values
        missing_values = df.isnull().sum()
        if missing_values.sum() > 0:
            print("\n Missing values:")
            print(missing_values[missing_values > 0])

        # Analyze target variable
        _describe_target(df, target_col)

    if not all_data:
        return None

    # Combine all data for overall analysis
    combined_df = pd.concat(all_data, ignore_index=True)
    print("\nCombined dataset:")
    print(f" Total rows: {combined_df.shape[0]}, Columns: {combined_df.shape[1]}")

    return combined_df


def plot_target_variable(df, target_col='Rel. Piston Trav'):
    """
    Create visualizations for the target variable.

    Shows a 2x2 overview figure (distribution, box plot per file, line plot
    of the first records, correlation heatmap) and then, when correlations
    could be computed, scatter plots of the target against up to four of its
    most strongly correlated numeric columns.

    Args:
        df: DataFrame with all data
        target_col: Name of target column

    Returns:
        None. When the target column is missing or not numeric, a message is
        printed and nothing is plotted.
    """
    if target_col not in df.columns:
        print(f"Target column '{target_col}' not found in data")
        return

    # Only numeric columns can take part in the correlation analysis; a
    # non-numeric target would fail later with a half-drawn figure left open.
    numeric_df = df.select_dtypes(include=[np.number])
    if target_col not in numeric_df.columns:
        print(f"Target column '{target_col}' is not numeric; skipping plots")
        return

    print(f"\nGenerating plots for {target_col}...")

    # Set up figure with multiple subplots
    _, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Plot 1: Distribution of target variable
    sns.histplot(df[target_col], kde=True, ax=axes[0, 0])
    axes[0, 0].set_title(f'Distribution of {target_col}')
    axes[0, 0].set_xlabel(target_col)
    axes[0, 0].set_ylabel('Frequency')

    # Plot 2: Target variable by file (one overall box if there is no
    # 'file_id' column to group by)
    if 'file_id' in df.columns:
        sns.boxplot(x='file_id', y=target_col, data=df, ax=axes[0, 1])
    else:
        sns.boxplot(y=df[target_col], ax=axes[0, 1])
    axes[0, 1].set_title(f'{target_col} by File')
    axes[0, 1].set_xlabel('File ID')
    axes[0, 1].set_ylabel(target_col)

    # Plot 3: Target variable over time (for first 1000 points). Use the
    # 'Nr.' column as x axis when present, otherwise the row position.
    sample_size = min(1000, df.shape[0])
    first_rows = df.head(sample_size)
    if 'Nr.' in first_rows.columns:
        x_values = first_rows['Nr.']
    else:
        x_values = np.arange(len(first_rows))
    axes[1, 0].plot(x_values, first_rows[target_col])
    axes[1, 0].set_title(f'{target_col} Over Time (First {sample_size} Points)')
    axes[1, 0].set_xlabel('Record Number')
    axes[1, 0].set_ylabel(target_col)

    # Plot 4: Correlation heatmap (top correlated features)
    try:
        # Get absolute correlations with target from numeric columns only
        corr = numeric_df.corr()[target_col].abs().sort_values(ascending=False)
        top_features = corr.head(10).index  # Top 10 features

        # Create correlation matrix for selected features
        corr_matrix = numeric_df[top_features].corr()

        # Plot heatmap
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=axes[1, 1])
        axes[1, 1].set_title('Correlation Heatmap (Top Features)')

        # Top 4 features for the scatter plots: exclude the target itself
        # and columns whose correlation is undefined (NaN), e.g. constants.
        top_correlated = corr.drop(labels=target_col).dropna().head(4).index.tolist()
    except Exception as e:
        # Deliberately broad: the heatmap is best-effort and must not stop
        # the remaining panels from being shown.
        print(f"Error calculating correlations: {e}")
        axes[1, 1].set_title('Correlation Heatmap (Error occurred)')
        top_correlated = []

    plt.tight_layout()
    plt.show()
    plt.close()

    # Additional plot: Scatter plots of top correlated features vs target
    if top_correlated:
        # Draw the random sample once so every panel shows the same records
        sample = df.sample(min(1000, df.shape[0]))

        _, axes = plt.subplots(2, 2, figsize=(15, 12))
        axes = axes.flatten()

        for ax, feature in zip(axes, top_correlated):
            sns.scatterplot(x=feature, y=target_col, data=sample, alpha=0.5, ax=ax)
            ax.set_title(f'{feature} vs {target_col}')

        # Hide unused subplots
        for ax in axes[len(top_correlated):]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.show()
        plt.close()


def analyze_feature_distributions(df, selected_features=None):
    """
    Analyze the distributions of key features.

    Draws one histogram with a KDE overlay per requested feature that exists
    in df as a numeric column, arranged three per row in a single figure.

    Args:
        df: DataFrame with all data
        selected_features: Optional iterable of column names to plot. When
            omitted, the built-in feature list below is used.

    Returns:
        None. When none of the requested features is available, a message is
        printed and no figure is created.
    """
    print("\nAnalyzing feature distributions...")

    # Select top features based on data exploration
    if selected_features is None:
        selected_features = [
            'MTC1', 'MTC2', 'MTC3', 'Pyrometer', 'SV Temperature',
            'SV Power', 'SV Force', 'AV Force', 'AV Speed',
            'I RMS', 'U RMS', 'Heating power',
        ]

    # Keep only features that exist in the dataframe as numeric columns;
    # a density estimate needs numeric data.
    numeric_cols = set(df.select_dtypes(include=[np.number]).columns)
    selected_features = [f for f in selected_features if f in numeric_cols]

    # A grid with zero rows cannot be created, so stop early.
    if not selected_features:
        print("No selected numeric features found in data; nothing to plot")
        return

    # Create distribution plots for selected features
    n_cols = 3
    n_rows = (len(selected_features) + n_cols - 1) // n_cols  # ceiling division

    _, axes = plt.subplots(n_rows, n_cols, figsize=(15, n_rows * 4))
    axes = axes.flatten()

    for ax, feature in zip(axes, selected_features):
        sns.histplot(df[feature].dropna(), kde=True, ax=ax)
        ax.set_title(f'Distribution of {feature}')
        ax.set_xlabel(feature)

    # Hide unused subplots
    for ax in axes[len(selected_features):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()
    plt.close()


def plot_time_series_by_file(df, target_col='Rel. Piston Trav'):
    """
    Plot time series of target variable for each file.

    Rows are grouped by the 'file_id' column and each group is drawn as one
    line against its row position within the group (0, 1, 2, ...).

    Args:
        df: DataFrame with all data
        target_col: Name of target column

    Returns:
        None. When target_col or 'file_id' is missing, a message is printed
        and no figure is created.
    """
    # Validate before creating the figure so a missing column is reported
    # instead of raising a KeyError with an empty figure left open.
    if target_col not in df.columns:
        print(f"Target column '{target_col}' not found in data")
        return
    if 'file_id' not in df.columns:
        print("Column 'file_id' not found in data; cannot split series by file")
        return

    print("\nPlotting time series by file...")

    # Create a figure
    plt.figure(figsize=(15, 8))

    # Plot for each file ID; sort=False keeps the files in order of first
    # appearance. Rows whose file_id is missing are not plotted.
    for file_id, file_data in df.groupby('file_id', sort=False):
        plt.plot(range(len(file_data)), file_data[target_col],
                 label=f'File {file_id}', alpha=0.7)

    plt.title(f'{target_col} Time Series by File')
    plt.xlabel('Time Step')
    plt.ylabel(target_col)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()
    plt.close()


def main():
    """Run the exploration: load the configured files, then show every plot."""
    print("SPS Sintering Data Exploration")

    # Load and explore data; None means that no file could be loaded
    combined_df = load_and_explore_data(file_paths)

    if combined_df is not None:
        # Generate plots, one step after the other on the combined data
        plotting_steps = (
            plot_target_variable,
            analyze_feature_distributions,
            plot_time_series_by_file,
        )
        for step in plotting_steps:
            step(combined_df)

    print("\nData exploration complete.")


# Run the exploration only when executed as a script, not when imported.
if __name__ == "__main__":
    main()