#!/usr/bin/env python3
import os
import re
from math import sqrt
from pathlib import Path
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
# Show every column when DataFrames are printed for inspection.
pd.set_option('display.max_columns', None)
# Project sub-folders (under BASE_DIR) to process.
PROJECTS = [
'MVEC',
]
# Root folder holding one folder of CSV exports per project.
BASE_DIR = Path(os.getcwd()) / 'PeakAcc'
# Rows skipped at the top / bottom of each CSV — presumably a report-style
# header and footer around the data; confirm against the export format.
SKIP_ROWS = 10
SKIP_FOOTER = 2
# Half-day search windows as (start hour inclusive, end hour exclusive).
PEAK_WINDOWS = {
'PeakHour1': (0, 12),
'PeakHour2': (12, 24),
}
# Keep only on-the-hour samples (minute == 0 and second == 0).
KEEP_ONLY_HOURLY = True
# Drop the first and last calendar day, which are typically partial.
DROP_FIRST_AND_LAST_DAY = True
def load_and_prepare_project(project: str) -> pd.DataFrame:
    """Load every CSV for *project* and return one cleaned, hourly DataFrame.

    Each file under ``BASE_DIR/<project>`` is read (skipping SKIP_ROWS header
    and SKIP_FOOTER footer lines), its columns are normalised, and all frames
    are concatenated into a single numeric DataFrame indexed by timestamp.
    Duplicate timestamps across files are summed. Optionally keeps only
    on-the-hour samples and drops the first/last calendar days.

    Raises:
        FileNotFoundError: if the project folder or its CSVs are missing.
        KeyError: if a file still has no 'Time' column after renaming.
    """
    project_dir = BASE_DIR / project
    if not project_dir.exists():
        raise FileNotFoundError(f'Project folder not found: {project_dir}')
    csv_files = sorted(project_dir.glob('*.csv'))
    if not csv_files:
        raise FileNotFoundError(f'No CSV files found in: {project_dir}')
    combined = []
    for file_path in csv_files:
        print(f'Loading {file_path.name}')
        # skipfooter requires the python engine; the C parser does not support it.
        df = pd.read_csv(
            file_path,
            skiprows=SKIP_ROWS,
            skipfooter=SKIP_FOOTER,
            engine='python',
        )
        # Force the first two columns to canonical 'Time'/'Actual' names —
        # exports sometimes leave them blank ('Unnamed: N') or mislabeled.
        rename_map = {}
        cols = list(df.columns)
        if len(cols) >= 2:
            # NOTE(review): the startswith('Unnamed') test is redundant — such a
            # name already fails the != comparison — but harmless.
            if str(cols[0]).startswith('Unnamed') or cols[0] != 'Time':
                rename_map[cols[0]] = 'Time'
            if str(cols[1]).startswith('Unnamed') or cols[1] != 'Actual':
                rename_map[cols[1]] = 'Actual'
        df = df.rename(columns=rename_map)
        # Normalise headers: '_WA' becomes '_WT', and the '.1' suffix pandas
        # appends to duplicate column names marks the APE companion column.
        # NOTE(review): .replace('.1', '_APE') matches ANY '.1' substring, not
        # only a trailing duplicate marker — confirm no model name contains '.1'.
        df.columns = [
            str(col).strip().replace('_WA', '_WT').replace('.1', '_APE')
            for col in df.columns
        ]
        # Tag the actual-load column with the project name and drop rows where
        # it is empty — only for files that actually carry project columns.
        project_specific_cols = [c for c in df.columns if project in c]
        if project_specific_cols and 'Actual' in df.columns:
            df = df.rename(columns={'Actual': f'Actual{project}'})
            df = df.dropna(subset=[f'Actual{project}'], how='all')
        if 'Time' not in df.columns:
            raise KeyError(f"Time column missing in {file_path.name}")
        # format='mixed' parses each row's format independently; unparseable
        # timestamps coerce to NaT and are dropped on the next line.
        df['Time'] = pd.to_datetime(df['Time'], errors='coerce', format='mixed')
        df = df.dropna(subset=['Time'])
        if KEEP_ONLY_HOURLY:
            hourly_mask = (df['Time'].dt.minute == 0) & (df['Time'].dt.second == 0)
            df = df.loc[hourly_mask].copy()
        combined.append(df)
    result_df = pd.concat(combined, ignore_index=True)
    result_df = result_df.set_index('Time')
    result_df = result_df.apply(pd.to_numeric, errors='coerce')
    # Collapse duplicate timestamps across files; min_count=1 keeps all-NaN
    # groups as NaN instead of turning them into 0.
    result_df = result_df.groupby(result_df.index).sum(min_count=1)
    result_df = result_df.sort_index()
    if DROP_FIRST_AND_LAST_DAY and len(result_df) > 0:
        # Only trim the boundary days when interior days remain.
        unique_days = pd.Index(result_df.index.normalize().unique())
        if len(unique_days) > 2:
            result_df = result_df[
                (result_df.index.normalize() != unique_days[0]) &
                (result_df.index.normalize() != unique_days[-1])
            ]
    return result_df
def get_project_models(df: pd.DataFrame, project: str) -> list:
    """Return the sorted forecast-model column names for *project*.

    A model column mentions the project name but is neither the actual-load
    column nor an APE companion column.
    """
    model_cols = []
    for column in df.columns:
        if project not in column:
            continue
        if 'Actual' in column or column.endswith('_APE'):
            continue
        model_cols.append(column)
    model_cols.sort()
    return model_cols
def _find_peak_index_with_min_logic(group: pd.Series, start_hour: int, end_hour: int) -> pd.Timestamp:
# NEW LOGIC INTEGRATED: this follows the revised email requirement for weird days.
# It first finds the peak and minimum inside the 12-hour window, then if the
# minimum occurs after the first peak candidate, it reruns the peak search
# starting from that minimum so the selected peak better reflects the later rise.
window = group[(group.index.hour >= start_hour) & (group.index.hour < end_hour)]
if window.empty:
return pd.NaT
values = window.to_numpy(dtype=float)
if np.all(np.isnan(values)):
return pd.NaT
valid_mask = ~np.isnan(values)
if not valid_mask.any():
return pd.NaT
valid_positions = np.where(valid_mask)[0]
valid_values = values[valid_mask]
peak_pos_among_valid = int(np.argmax(valid_values))
peak_pos = int(valid_positions[peak_pos_among_valid])
if len(valid_values) == 1:
final_pos = peak_pos
else:
min_pos_among_valid = int(np.argmin(valid_values))
min_pos = int(valid_positions[min_pos_among_valid])
if min_pos > peak_pos:
narrowed_values = values[min_pos:]
narrowed_valid = ~np.isnan(narrowed_values)
if narrowed_valid.any():
narrowed_valid_positions = np.where(narrowed_valid)[0]
narrowed_valid_values = narrowed_values[narrowed_valid]
new_peak_among_valid = int(np.argmax(narrowed_valid_values))
final_pos = min_pos + int(narrowed_valid_positions[new_peak_among_valid])
else:
final_pos = peak_pos
else:
final_pos = peak_pos
return window.index[final_pos]
def find_daily_peaks_for_series(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Return one row per calendar day with both window peak timestamps.

    Columns are named ``PeakHour1_<column_name>`` and ``PeakHour2_<column_name>``
    and the index is the day as a Timestamp. Returns an empty DataFrame when
    *df* has no rows to group.
    """
    series = df[column_name]
    daily_records = []
    for day, day_series in series.groupby(df.index.date):
        record = {'Date': pd.Timestamp(day)}
        for window_name in ('PeakHour1', 'PeakHour2'):
            start_hour, end_hour = PEAK_WINDOWS[window_name]
            record[f'{window_name}_{column_name}'] = _find_peak_index_with_min_logic(
                day_series, start_hour, end_hour
            )
        daily_records.append(record)
    if not daily_records:
        return pd.DataFrame()
    return pd.DataFrame(daily_records).set_index('Date')
def build_peak_report(df: pd.DataFrame, project: str, models: list) -> pd.DataFrame:
    """Assemble the per-day peak-accuracy report for *project*.

    Starting from the actual series' daily peak timestamps, joins each model's
    peak timestamps and derives, per model and per peak window:

    - ``<peak>Diff_<model>``: absolute gap, in hours, between the actual and
      model peak timestamps;
    - ``<peak>_<model>_APE``: the model's APE sampled at the actual peak hour
      (only when a ``<model>_APE`` column exists in *df*);
    - ``<peak>KW_<model>`` / ``<peak>_ActualKW_<project>``: model and actual
      values sampled at the actual peak hour;
    - ``<peak>_RMSE_<model>``: RMSE over those value pairs, stored as a
      constant (broadcast) column.

    Raises KeyError if the ``Actual<project>`` column is missing from *df*.
    """
    actual_col = f'Actual{project}'
    if actual_col not in df.columns:
        raise KeyError(f'Missing actual column: {actual_col}')
    peakdf = find_daily_peaks_for_series(df, actual_col)
    for mdl in models:
        mdl_peakdf = find_daily_peaks_for_series(df, mdl)
        peakdf = peakdf.join(mdl_peakdf, how='left')
        for peak_name in PEAK_WINDOWS.keys():
            actual_peak_col = f'{peak_name}_{actual_col}'
            model_peak_col = f'{peak_name}_{mdl}'
            diff_col = f'{peak_name}Diff_{mdl}'
            # NEW OUTPUT INTEGRATED: use KW naming in the data columns because the
            # source line series is kW. Only the plotted line is divided by 1000 and
            # shown as MW, matching the email clarification.
            ape_col = f'{peak_name}_{mdl}_APE'
            model_kw_col = f'{peak_name}KW_{mdl}'
            actual_kw_col = f'{peak_name}_ActualKW_{project}'
            rmse_col = f'{peak_name}_RMSE_{mdl}'
            # Timestamp subtraction yields Timedeltas; convert to hours.
            peakdf[diff_col] = (
                peakdf[actual_peak_col] - peakdf[model_peak_col]
            ).abs().dt.total_seconds() / 3600
            actual_peak_times = peakdf[actual_peak_col].dropna()
            if len(actual_peak_times) == 0:
                continue
            model_ape_source = f'{mdl}_APE'
            # Sample the hourly frame at the ACTUAL peak timestamps so every
            # model is evaluated at the same moment as the real peak. The
            # timestamps came from df itself, so the lookups cannot miss.
            if model_ape_source in df.columns:
                peakdf.loc[actual_peak_times.index, ape_col] = df.loc[
                    actual_peak_times.values,
                    model_ape_source,
                ].to_numpy()
            peakdf.loc[actual_peak_times.index, model_kw_col] = df.loc[
                actual_peak_times.values,
                mdl,
            ].to_numpy()
            peakdf.loc[actual_peak_times.index, actual_kw_col] = df.loc[
                actual_peak_times.values,
                actual_col,
            ].to_numpy()
            # One RMSE over all days, broadcast to every row of rmse_col.
            metric_df = peakdf[[actual_kw_col, model_kw_col]].dropna()
            if not metric_df.empty:
                rmse_kw = sqrt(mean_squared_error(metric_df[actual_kw_col], metric_df[model_kw_col]))
                peakdf[rmse_col] = rmse_kw
    return peakdf
def save_project_outputs(df: pd.DataFrame, peakdf: pd.DataFrame, project: str) -> None:
    """Write the peak report and the hourly results to CSVs in the working directory."""
    out_dir = Path(os.getcwd())
    peak_path = out_dir / f'PeakResults_{project}.csv'
    results_path = out_dir / f'Results_{project}.csv'
    peakdf.to_csv(peak_path)
    df.to_csv(results_path)
    for written in (peak_path, results_path):
        print(f'Saved {written.name}')
def plot_project_peak_accuracy(peakdf: pd.DataFrame, project: str, models: list) -> None:
    """Render and save the peak-accuracy PDFs for *project*.

    For each peak window two files are written to the working directory:

    - ``<project>_<peak>Acc.pdf``: a 4x2 grid — peak-hour difference lines and
      their means, APE lines and MAPE bars, kW series plotted as MW plus RMSE
      bars, and a dedicated bottom-row legend;
    - ``<project>_<peak>_PeakComparison.pdf``: actual versus model peak hour
      by day.
    """
    for peak_name in PEAK_WINDOWS.keys():
        # Wide left column for time series, narrow right column for summary
        # bars; the two middle rows get extra height for the line charts.
        fig, axs = plt.subplots(
            4,
            2,
            figsize=(16, 11),
            gridspec_kw={'width_ratios': [3, 1], 'height_ratios': [1, 2, 2, 1]},
            constrained_layout=True,
        )
        labels = sorted(models)
        # Row 0: daily |actual peak hour - model peak hour| and its mean.
        diff_cols = [f'{peak_name}Diff_{m}' for m in labels if f'{peak_name}Diff_{m}' in peakdf.columns]
        diff_df = peakdf[diff_cols].copy() if diff_cols else pd.DataFrame(index=peakdf.index)
        # NOTE(review): diff_handles/diff_legend_labels are collected but never
        # used — the bottom legend is built from the KW chart handles below.
        diff_handles = []
        diff_legend_labels = []
        if not diff_df.empty:
            for m in labels:
                col = f'{peak_name}Diff_{m}'
                if col in diff_df.columns:
                    line, = axs[0, 0].plot(diff_df.index, diff_df[col], label=m)
                    diff_handles.append(line)
                    diff_legend_labels.append(m)
            axs[0, 1].bar(labels, [round(diff_df[f'{peak_name}Diff_{m}'].mean(), 3) if f'{peak_name}Diff_{m}' in diff_df.columns else np.nan for m in labels])
        axs[0, 0].set_title(f'{peak_name}_Diff')
        axs[0, 0].set_ylabel('Hours')
        axs[0, 0].tick_params(axis='x', rotation=20, labelsize=6)
        axs[0, 1].set_title(f'{peak_name}_Diff_Mean')
        axs[0, 1].tick_params(axis='x', rotation=20, labelsize=5)
        # Row 1: APE at the actual peak hour, plus mean absolute APE (MAPE).
        ape_cols = [f'{peak_name}_{m}_APE' for m in labels if f'{peak_name}_{m}_APE' in peakdf.columns]
        ape_df = peakdf[ape_cols].copy() if ape_cols else pd.DataFrame(index=peakdf.index)
        # NOTE(review): ape_handles/ape_legend_labels are likewise unused.
        ape_handles = []
        ape_legend_labels = []
        if not ape_df.empty:
            for m in labels:
                col = f'{peak_name}_{m}_APE'
                if col in ape_df.columns:
                    line, = axs[1, 0].plot(ape_df.index, ape_df[col], label=m)
                    ape_handles.append(line)
                    ape_legend_labels.append(m)
            axs[1, 1].bar(labels, [round(abs(ape_df[f'{peak_name}_{m}_APE']).mean(), 2) if f'{peak_name}_{m}_APE' in ape_df.columns else np.nan for m in labels])
        axs[1, 0].set_title(f'{peak_name}_APE')
        axs[1, 0].set_ylabel('APE')
        # NOTE(review): limits are reversed (bottom=10, top=-10), which flips
        # the APE axis upside down — confirm this inversion is intended.
        axs[1, 0].set_ylim(10, -10)
        axs[1, 0].axhline(y=0, color='black', linestyle='--')
        axs[1, 0].tick_params(axis='x', rotation=20, labelsize=6)
        axs[1, 1].set_title(f'{peak_name}_MAPE')
        axs[1, 1].tick_params(axis='x', rotation=20, labelsize=5)
        # Row 2: kW values at the actual peak hour (plotted as MW) + RMSE bars.
        kw_cols = [f'{peak_name}KW_{m}' for m in labels if f'{peak_name}KW_{m}' in peakdf.columns]
        actual_kw_col = f'{peak_name}_ActualKW_{project}'
        kw_df = peakdf[kw_cols].copy() if kw_cols else pd.DataFrame(index=peakdf.index)
        if actual_kw_col in peakdf.columns:
            kw_df[actual_kw_col] = peakdf[actual_kw_col]
        kw_handles = []
        kw_legend_labels = []
        if not kw_df.empty:
            # NEW LABELING INTEGRATED: line chart values are stored in kW but plotted
            # in MW by dividing by 1000, per the requested report correction.
            for m in labels:
                col = f'{peak_name}KW_{m}'
                if col in kw_df.columns:
                    line, = axs[2, 0].plot(kw_df.index, kw_df[col] / 1000, label=m)
                    kw_handles.append(line)
                    kw_legend_labels.append(m)
            if actual_kw_col in kw_df.columns:
                actual_line, = axs[2, 0].plot(kw_df.index, kw_df[actual_kw_col] / 1000, label=f'ActualKW_{project}')
                kw_handles.append(actual_line)
                kw_legend_labels.append(f'ActualKW_{project}')
        axs[2, 0].set_title(f'{peak_name}_KW')
        axs[2, 0].set_ylabel('MW')
        axs[2, 0].tick_params(axis='x', rotation=20, labelsize=6)
        rmse_cols = [f'{peak_name}_RMSE_{m}' for m in labels if f'{peak_name}_RMSE_{m}' in peakdf.columns]
        # NEW LABELING INTEGRATED: RMSE is computed on kW values, then divided by
        # 1000 for the bar chart so the RMSE chart is labeled in MW as requested.
        # The RMSE column is constant per model, so .dropna().iloc[0] reads it.
        rmse_vals = [round(peakdf[f'{peak_name}_RMSE_{m}'].dropna().iloc[0] / 1000, 2) if f'{peak_name}_RMSE_{m}' in peakdf.columns and not peakdf[f'{peak_name}_RMSE_{m}'].dropna().empty else np.nan for m in labels]
        if any(not pd.isna(v) for v in rmse_vals):
            axs[2, 1].bar(labels, rmse_vals)
        axs[2, 1].set_title(f'{peak_name}_RMSE')
        axs[2, 1].set_ylabel('MW')
        axs[2, 1].tick_params(axis='x', rotation=20, labelsize=5)
        # NEW CHART FORMAT INTEGRATED: restore the dedicated bottom legend row so
        # each colored model line is identified in the exported PDF.
        axs[3, 0].axis('off')
        if kw_handles:
            # NEW LEGEND INTEGRATED: use the MW/KW chart handles for the bottom legend
            # because that chart now includes all forecast models plus the Actual line.
            axs[3, 0].legend(kw_handles, kw_legend_labels, fontsize=8, bbox_to_anchor=(1, 1), loc='upper right')
        axs[3, 1].axis('off')
        out_file = Path(os.getcwd()) / f'{project}_{peak_name}Acc.pdf'
        fig.savefig(out_file)
        plt.close(fig)
        print(f'Saved {out_file.name}')
        # NEW PEAK HOUR COMPARISON GRAPH INTEGRATED: keep the existing accuracy
        # chart unchanged and add a separate chart that compares actual peak hour
        # versus each forecast model peak hour by day.
        fig2, ax2 = plt.subplots(figsize=(14, 5), constrained_layout=True)
        comparison_handles = []
        comparison_labels = []
        actual_peak_col = f'{peak_name}_Actual{project}'
        if actual_peak_col in peakdf.columns:
            # Fractional hour of day (e.g. 13.5 == 13:30) for the actual peak.
            actual_peak_hour = peakdf[actual_peak_col].dt.hour + (peakdf[actual_peak_col].dt.minute / 60.0)
            actual_line, = ax2.plot(
                peakdf.index,
                actual_peak_hour,
                color='black',
                linestyle='--',
                linewidth=1.5,
                label='Actual Peak Hour'
            )
            comparison_handles.append(actual_line)
            comparison_labels.append('Actual Peak Hour')
        for m in labels:
            model_peak_col = f'{peak_name}_{m}'
            if model_peak_col in peakdf.columns:
                model_peak_hour = peakdf[model_peak_col].dt.hour + (peakdf[model_peak_col].dt.minute / 60.0)
                line, = ax2.plot(
                    peakdf.index,
                    model_peak_hour,
                    linewidth=1.2,
                    label=m
                )
                comparison_handles.append(line)
                comparison_labels.append(m)
        ax2.set_title(f'{peak_name}_PeakHour_Comparison')
        ax2.set_ylabel('Hour of Day')
        ax2.tick_params(axis='x', rotation=20, labelsize=7)
        if comparison_handles:
            ax2.legend(comparison_handles, comparison_labels, fontsize=8, loc='best')
        comparison_file = Path(os.getcwd()) / f'{project}_{peak_name}_PeakComparison.pdf'
        fig2.savefig(comparison_file)
        plt.close(fig2)
        print(f'Saved {comparison_file.name}')
def main() -> None:
    """Run the full peak-accuracy pipeline for every configured project."""
    for project in PROJECTS:
        banner = '=' * 60
        print(banner)
        print(f'Project: {project}')
        print(banner)
        df = load_and_prepare_project(project)
        print('Columns:', list(df.columns))
        models = get_project_models(df, project)
        print('Models:', models)
        if not models:
            # Nothing to report on for this project; move to the next one.
            print(f'No model columns found for {project}')
            continue
        peakdf = build_peak_report(df, project, models)
        save_project_outputs(df, peakdf, project)
        plot_project_peak_accuracy(peakdf, project, models)
    print('Done.')
# Script entry point: run the pipeline only when executed directly, not on import.
if __name__ == '__main__':
    main()



