Source code for thermostat.stats

import pandas as pd
import numpy as np
from scipy.stats import norm

from collections import OrderedDict
from collections import defaultdict
from itertools import chain
from warnings import warn
import json
from functools import reduce
from pkg_resources import resource_stream
import logging

from thermostat import get_version

QUANTILE = [1, 2.5, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 98, 99]
IQR_FILTER_PARAMETER = 1.5
TOP_ONLY_PERCENTILE_FILTER = .05  # Filters top 5 percent for RHU2 calculation
UNFILTERED_PERCENTILE = 1 - TOP_ONLY_PERCENTILE_FILTER

logger = logging.getLogger('epathermostat')


REAL_OR_INTEGER_VALUED_COLUMNS_HEATING = [
    'n_days_in_inputfile_date_range',
    'n_days_both_heating_and_cooling',
    'n_days_insufficient_data',
    'n_core_heating_days',

    'baseline_percentile_core_heating_comfort_temperature',
    'regional_average_baseline_heating_comfort_temperature',

    'percent_savings_baseline_percentile',
    'avoided_daily_mean_core_day_runtime_baseline_percentile',
    'avoided_total_core_day_runtime_baseline_percentile',
    'baseline_daily_mean_core_day_runtime_baseline_percentile',
    'baseline_total_core_day_runtime_baseline_percentile',
    '_daily_mean_core_day_demand_baseline_baseline_percentile',
    'percent_savings_baseline_regional',
    'avoided_daily_mean_core_day_runtime_baseline_regional',
    'avoided_total_core_day_runtime_baseline_regional',
    'baseline_daily_mean_core_day_runtime_baseline_regional',
    'baseline_total_core_day_runtime_baseline_regional',
    '_daily_mean_core_day_demand_baseline_baseline_regional',
    'mean_demand',
    'alpha',
    'tau',
    'mean_sq_err',
    'root_mean_sq_err',
    'cv_root_mean_sq_err',
    'mean_abs_err',
    'mean_abs_pct_err',

    'total_core_heating_runtime',
    'total_auxiliary_heating_core_day_runtime',
    'total_emergency_heating_core_day_runtime',

    'daily_mean_core_heating_runtime',

    'core_heating_days_mean_indoor_temperature',
    'core_heating_days_mean_outdoor_temperature',
    'core_mean_indoor_temperature',
    'core_mean_outdoor_temperature',

    'rhu1_aux_duty_cycle',
    'rhu1_emg_duty_cycle',
    'rhu1_compressor_duty_cycle',

    'rhu1_00F_to_05F',
    'rhu1_05F_to_10F',
    'rhu1_10F_to_15F',
    'rhu1_15F_to_20F',
    'rhu1_20F_to_25F',
    'rhu1_25F_to_30F',
    'rhu1_30F_to_35F',
    'rhu1_35F_to_40F',
    'rhu1_40F_to_45F',
    'rhu1_45F_to_50F',
    'rhu1_50F_to_55F',
    'rhu1_55F_to_60F',

    'rhu1_05F_to_10F_aux_duty_cycle',
    'rhu1_10F_to_15F_aux_duty_cycle',
    'rhu1_15F_to_20F_aux_duty_cycle',
    'rhu1_20F_to_25F_aux_duty_cycle',
    'rhu1_25F_to_30F_aux_duty_cycle',
    'rhu1_30F_to_35F_aux_duty_cycle',
    'rhu1_35F_to_40F_aux_duty_cycle',
    'rhu1_40F_to_45F_aux_duty_cycle',
    'rhu1_45F_to_50F_aux_duty_cycle',
    'rhu1_50F_to_55F_aux_duty_cycle',
    'rhu1_55F_to_60F_aux_duty_cycle',

    'rhu1_00F_to_05F_emg_duty_cycle',
    'rhu1_05F_to_10F_emg_duty_cycle',
    'rhu1_10F_to_15F_emg_duty_cycle',
    'rhu1_15F_to_20F_emg_duty_cycle',
    'rhu1_20F_to_25F_emg_duty_cycle',
    'rhu1_25F_to_30F_emg_duty_cycle',
    'rhu1_30F_to_35F_emg_duty_cycle',
    'rhu1_35F_to_40F_emg_duty_cycle',
    'rhu1_40F_to_45F_emg_duty_cycle',
    'rhu1_45F_to_50F_emg_duty_cycle',
    'rhu1_50F_to_55F_emg_duty_cycle',
    'rhu1_55F_to_60F_emg_duty_cycle',

    'rhu1_00F_to_05F_compressor_duty_cycle',
    'rhu1_05F_to_10F_compressor_duty_cycle',
    'rhu1_10F_to_15F_compressor_duty_cycle',
    'rhu1_15F_to_20F_compressor_duty_cycle',
    'rhu1_20F_to_25F_compressor_duty_cycle',
    'rhu1_25F_to_30F_compressor_duty_cycle',
    'rhu1_30F_to_35F_compressor_duty_cycle',
    'rhu1_35F_to_40F_compressor_duty_cycle',
    'rhu1_40F_to_45F_compressor_duty_cycle',
    'rhu1_45F_to_50F_compressor_duty_cycle',
    'rhu1_50F_to_55F_compressor_duty_cycle',
    'rhu1_55F_to_60F_compressor_duty_cycle',

    'rhu1_less10F',
    'rhu1_10F_to_20F',
    'rhu1_20F_to_30F',
    'rhu1_30F_to_40F',
    'rhu1_40F_to_50F',
    'rhu1_50F_to_60F',
    'rhu1_00F_to_05F_aux_duty_cycle',

    'rhu1_less10F_aux_duty_cycle',
    'rhu1_10F_to_20F_aux_duty_cycle',
    'rhu1_20F_to_30F_aux_duty_cycle',
    'rhu1_30F_to_40F_aux_duty_cycle',
    'rhu1_40F_to_50F_aux_duty_cycle',
    'rhu1_50F_to_60F_aux_duty_cycle',

    'rhu1_less10F_emg_duty_cycle',
    'rhu1_10F_to_20F_emg_duty_cycle',
    'rhu1_20F_to_30F_emg_duty_cycle',
    'rhu1_30F_to_40F_emg_duty_cycle',
    'rhu1_40F_to_50F_emg_duty_cycle',
    'rhu1_50F_to_60F_emg_duty_cycle',

    'rhu1_less10F_compressor_duty_cycle',
    'rhu1_10F_to_20F_compressor_duty_cycle',
    'rhu1_20F_to_30F_compressor_duty_cycle',
    'rhu1_30F_to_40F_compressor_duty_cycle',
    'rhu1_40F_to_50F_compressor_duty_cycle',
    'rhu1_50F_to_60F_compressor_duty_cycle',

    'rhu2_00F_to_05F',
    'rhu2_05F_to_10F',
    'rhu2_10F_to_15F',
    'rhu2_15F_to_20F',
    'rhu2_20F_to_25F',
    'rhu2_25F_to_30F',
    'rhu2_30F_to_35F',
    'rhu2_35F_to_40F',
    'rhu2_40F_to_45F',
    'rhu2_45F_to_50F',
    'rhu2_50F_to_55F',
    'rhu2_55F_to_60F',

    'rhu2_00F_to_05F_aux_duty_cycle',
    'rhu2_05F_to_10F_aux_duty_cycle',
    'rhu2_10F_to_15F_aux_duty_cycle',
    'rhu2_15F_to_20F_aux_duty_cycle',
    'rhu2_20F_to_25F_aux_duty_cycle',
    'rhu2_25F_to_30F_aux_duty_cycle',
    'rhu2_30F_to_35F_aux_duty_cycle',
    'rhu2_35F_to_40F_aux_duty_cycle',
    'rhu2_40F_to_45F_aux_duty_cycle',
    'rhu2_45F_to_50F_aux_duty_cycle',
    'rhu2_50F_to_55F_aux_duty_cycle',
    'rhu2_55F_to_60F_aux_duty_cycle',

    'rhu2_00F_to_05F_emg_duty_cycle',
    'rhu2_05F_to_10F_emg_duty_cycle',
    'rhu2_10F_to_15F_emg_duty_cycle',
    'rhu2_15F_to_20F_emg_duty_cycle',
    'rhu2_20F_to_25F_emg_duty_cycle',
    'rhu2_25F_to_30F_emg_duty_cycle',
    'rhu2_30F_to_35F_emg_duty_cycle',
    'rhu2_35F_to_40F_emg_duty_cycle',
    'rhu2_40F_to_45F_emg_duty_cycle',
    'rhu2_45F_to_50F_emg_duty_cycle',
    'rhu2_50F_to_55F_emg_duty_cycle',
    'rhu2_55F_to_60F_emg_duty_cycle',

    'rhu2_00F_to_05F_compressor_duty_cycle',
    'rhu2_05F_to_10F_compressor_duty_cycle',
    'rhu2_10F_to_15F_compressor_duty_cycle',
    'rhu2_15F_to_20F_compressor_duty_cycle',
    'rhu2_20F_to_25F_compressor_duty_cycle',
    'rhu2_25F_to_30F_compressor_duty_cycle',
    'rhu2_30F_to_35F_compressor_duty_cycle',
    'rhu2_35F_to_40F_compressor_duty_cycle',
    'rhu2_40F_to_45F_compressor_duty_cycle',
    'rhu2_45F_to_50F_compressor_duty_cycle',
    'rhu2_50F_to_55F_compressor_duty_cycle',
    'rhu2_55F_to_60F_compressor_duty_cycle',

    'rhu2_less10F',
    'rhu2_10F_to_20F',
    'rhu2_20F_to_30F',
    'rhu2_30F_to_40F',
    'rhu2_40F_to_50F',
    'rhu2_50F_to_60F',

    'rhu2_less10F_aux_duty_cycle',
    'rhu2_10F_to_20F_aux_duty_cycle',
    'rhu2_20F_to_30F_aux_duty_cycle',
    'rhu2_30F_to_40F_aux_duty_cycle',
    'rhu2_40F_to_50F_aux_duty_cycle',
    'rhu2_50F_to_60F_aux_duty_cycle',

    'rhu2_less10F_emg_duty_cycle',
    'rhu2_10F_to_20F_emg_duty_cycle',
    'rhu2_20F_to_30F_emg_duty_cycle',
    'rhu2_30F_to_40F_emg_duty_cycle',
    'rhu2_40F_to_50F_emg_duty_cycle',
    'rhu2_50F_to_60F_emg_duty_cycle',

    'rhu2_less10F_compressor_duty_cycle',
    'rhu2_10F_to_20F_compressor_duty_cycle',
    'rhu2_20F_to_30F_compressor_duty_cycle',
    'rhu2_30F_to_40F_compressor_duty_cycle',
    'rhu2_40F_to_50F_compressor_duty_cycle',
    'rhu2_50F_to_60F_compressor_duty_cycle',

]

REAL_OR_INTEGER_VALUED_COLUMNS_COOLING = [
    'n_days_in_inputfile_date_range',
    'n_days_both_heating_and_cooling',
    'n_days_insufficient_data',
    'n_core_cooling_days',

    'baseline_percentile_core_cooling_comfort_temperature',
    'regional_average_baseline_cooling_comfort_temperature',

    'percent_savings_baseline_percentile',
    'avoided_daily_mean_core_day_runtime_baseline_percentile',
    'avoided_total_core_day_runtime_baseline_percentile',
    'baseline_daily_mean_core_day_runtime_baseline_percentile',
    'baseline_total_core_day_runtime_baseline_percentile',
    '_daily_mean_core_day_demand_baseline_baseline_percentile',
    'percent_savings_baseline_regional',
    'avoided_daily_mean_core_day_runtime_baseline_regional',
    'avoided_total_core_day_runtime_baseline_regional',
    'baseline_daily_mean_core_day_runtime_baseline_regional',
    'baseline_total_core_day_runtime_baseline_regional',
    '_daily_mean_core_day_demand_baseline_baseline_regional',
    'mean_demand',
    'alpha',
    'tau',
    'mean_sq_err',
    'root_mean_sq_err',
    'cv_root_mean_sq_err',
    'mean_abs_err',
    'mean_abs_pct_err',

    'total_core_cooling_runtime',

    'daily_mean_core_cooling_runtime',

    'core_cooling_days_mean_indoor_temperature',
    'core_cooling_days_mean_outdoor_temperature',
    'core_mean_indoor_temperature',
    'core_mean_outdoor_temperature',
]

REAL_OR_INTEGER_VALUED_COLUMNS_ALL = [
    'n_days_in_inputfile_date_range',
    'n_days_both_heating_and_cooling',
    'n_days_insufficient_data',
    'n_core_cooling_days',
    'n_core_heating_days',

    'baseline_percentile_core_cooling_comfort_temperature',
    'baseline_percentile_core_heating_comfort_temperature',
    'regional_average_baseline_cooling_comfort_temperature',
    'regional_average_baseline_heating_comfort_temperature',

    'percent_savings_baseline_percentile',
    'avoided_daily_mean_core_day_runtime_baseline_percentile',
    'avoided_total_core_day_runtime_baseline_percentile',
    'baseline_daily_mean_core_day_runtime_baseline_percentile',
    'baseline_total_core_day_runtime_baseline_percentile',
    '_daily_mean_core_day_demand_baseline_baseline_percentile',
    'percent_savings_baseline_regional',
    'avoided_daily_mean_core_day_runtime_baseline_regional',
    'avoided_total_core_day_runtime_baseline_regional',
    'baseline_daily_mean_core_day_runtime_baseline_regional',
    'baseline_total_core_day_runtime_baseline_regional',
    '_daily_mean_core_day_demand_baseline_baseline_regional',
    'mean_demand',
    'alpha',
    'tau',
    'mean_sq_err',
    'root_mean_sq_err',
    'cv_root_mean_sq_err',
    'mean_abs_err',
    'mean_abs_pct_err',

    'total_core_cooling_runtime',
    'total_core_heating_runtime',
    'total_auxiliary_heating_core_day_runtime',
    'total_emergency_heating_core_day_runtime',

    'daily_mean_core_cooling_runtime',
    'daily_mean_core_heating_runtime',

    'core_mean_indoor_temperature',
    'core_mean_outdoor_temperature',

    'rhu1_aux_duty_cycle',
    'rhu1_emg_duty_cycle',
    'rhu1_compressor_duty_cycle',

    'rhu1_00F_to_05F',
    'rhu1_05F_to_10F',
    'rhu1_10F_to_15F',
    'rhu1_15F_to_20F',
    'rhu1_20F_to_25F',
    'rhu1_25F_to_30F',
    'rhu1_30F_to_35F',
    'rhu1_35F_to_40F',
    'rhu1_40F_to_45F',
    'rhu1_45F_to_50F',
    'rhu1_50F_to_55F',
    'rhu1_55F_to_60F',

    'rhu1_05F_to_10F_aux_duty_cycle',
    'rhu1_10F_to_15F_aux_duty_cycle',
    'rhu1_15F_to_20F_aux_duty_cycle',
    'rhu1_20F_to_25F_aux_duty_cycle',
    'rhu1_25F_to_30F_aux_duty_cycle',
    'rhu1_30F_to_35F_aux_duty_cycle',
    'rhu1_35F_to_40F_aux_duty_cycle',
    'rhu1_40F_to_45F_aux_duty_cycle',
    'rhu1_45F_to_50F_aux_duty_cycle',
    'rhu1_50F_to_55F_aux_duty_cycle',
    'rhu1_55F_to_60F_aux_duty_cycle',

    'rhu1_00F_to_05F_emg_duty_cycle',
    'rhu1_05F_to_10F_emg_duty_cycle',
    'rhu1_10F_to_15F_emg_duty_cycle',
    'rhu1_15F_to_20F_emg_duty_cycle',
    'rhu1_20F_to_25F_emg_duty_cycle',
    'rhu1_25F_to_30F_emg_duty_cycle',
    'rhu1_30F_to_35F_emg_duty_cycle',
    'rhu1_35F_to_40F_emg_duty_cycle',
    'rhu1_40F_to_45F_emg_duty_cycle',
    'rhu1_45F_to_50F_emg_duty_cycle',
    'rhu1_50F_to_55F_emg_duty_cycle',
    'rhu1_55F_to_60F_emg_duty_cycle',

    'rhu1_00F_to_05F_compressor_duty_cycle',
    'rhu1_05F_to_10F_compressor_duty_cycle',
    'rhu1_10F_to_15F_compressor_duty_cycle',
    'rhu1_15F_to_20F_compressor_duty_cycle',
    'rhu1_20F_to_25F_compressor_duty_cycle',
    'rhu1_25F_to_30F_compressor_duty_cycle',
    'rhu1_30F_to_35F_compressor_duty_cycle',
    'rhu1_35F_to_40F_compressor_duty_cycle',
    'rhu1_40F_to_45F_compressor_duty_cycle',
    'rhu1_45F_to_50F_compressor_duty_cycle',
    'rhu1_50F_to_55F_compressor_duty_cycle',
    'rhu1_55F_to_60F_compressor_duty_cycle',

    'rhu1_less10F',
    'rhu1_10F_to_20F',
    'rhu1_20F_to_30F',
    'rhu1_30F_to_40F',
    'rhu1_40F_to_50F',
    'rhu1_50F_to_60F',
    'rhu1_00F_to_05F_aux_duty_cycle',

    'rhu1_less10F_aux_duty_cycle',
    'rhu1_10F_to_20F_aux_duty_cycle',
    'rhu1_20F_to_30F_aux_duty_cycle',
    'rhu1_30F_to_40F_aux_duty_cycle',
    'rhu1_40F_to_50F_aux_duty_cycle',
    'rhu1_50F_to_60F_aux_duty_cycle',

    'rhu1_less10F_emg_duty_cycle',
    'rhu1_10F_to_20F_emg_duty_cycle',
    'rhu1_20F_to_30F_emg_duty_cycle',
    'rhu1_30F_to_40F_emg_duty_cycle',
    'rhu1_40F_to_50F_emg_duty_cycle',
    'rhu1_50F_to_60F_emg_duty_cycle',

    'rhu1_less10F_compressor_duty_cycle',
    'rhu1_10F_to_20F_compressor_duty_cycle',
    'rhu1_20F_to_30F_compressor_duty_cycle',
    'rhu1_30F_to_40F_compressor_duty_cycle',
    'rhu1_40F_to_50F_compressor_duty_cycle',
    'rhu1_50F_to_60F_compressor_duty_cycle',

    'rhu2_00F_to_05F',
    'rhu2_05F_to_10F',
    'rhu2_10F_to_15F',
    'rhu2_15F_to_20F',
    'rhu2_20F_to_25F',
    'rhu2_25F_to_30F',
    'rhu2_30F_to_35F',
    'rhu2_35F_to_40F',
    'rhu2_40F_to_45F',
    'rhu2_45F_to_50F',
    'rhu2_50F_to_55F',
    'rhu2_55F_to_60F',

    'rhu2_00F_to_05F_aux_duty_cycle',
    'rhu2_05F_to_10F_aux_duty_cycle',
    'rhu2_10F_to_15F_aux_duty_cycle',
    'rhu2_15F_to_20F_aux_duty_cycle',
    'rhu2_20F_to_25F_aux_duty_cycle',
    'rhu2_25F_to_30F_aux_duty_cycle',
    'rhu2_30F_to_35F_aux_duty_cycle',
    'rhu2_35F_to_40F_aux_duty_cycle',
    'rhu2_40F_to_45F_aux_duty_cycle',
    'rhu2_45F_to_50F_aux_duty_cycle',
    'rhu2_50F_to_55F_aux_duty_cycle',
    'rhu2_55F_to_60F_aux_duty_cycle',

    'rhu2_00F_to_05F_emg_duty_cycle',
    'rhu2_05F_to_10F_emg_duty_cycle',
    'rhu2_10F_to_15F_emg_duty_cycle',
    'rhu2_15F_to_20F_emg_duty_cycle',
    'rhu2_20F_to_25F_emg_duty_cycle',
    'rhu2_25F_to_30F_emg_duty_cycle',
    'rhu2_30F_to_35F_emg_duty_cycle',
    'rhu2_35F_to_40F_emg_duty_cycle',
    'rhu2_40F_to_45F_emg_duty_cycle',
    'rhu2_45F_to_50F_emg_duty_cycle',
    'rhu2_50F_to_55F_emg_duty_cycle',
    'rhu2_55F_to_60F_emg_duty_cycle',

    'rhu2_00F_to_05F_compressor_duty_cycle',
    'rhu2_05F_to_10F_compressor_duty_cycle',
    'rhu2_10F_to_15F_compressor_duty_cycle',
    'rhu2_15F_to_20F_compressor_duty_cycle',
    'rhu2_20F_to_25F_compressor_duty_cycle',
    'rhu2_25F_to_30F_compressor_duty_cycle',
    'rhu2_30F_to_35F_compressor_duty_cycle',
    'rhu2_35F_to_40F_compressor_duty_cycle',
    'rhu2_40F_to_45F_compressor_duty_cycle',
    'rhu2_45F_to_50F_compressor_duty_cycle',
    'rhu2_50F_to_55F_compressor_duty_cycle',
    'rhu2_55F_to_60F_compressor_duty_cycle',

    'rhu2_less10F',
    'rhu2_10F_to_20F',
    'rhu2_20F_to_30F',
    'rhu2_30F_to_40F',
    'rhu2_40F_to_50F',
    'rhu2_50F_to_60F',

    'rhu2_less10F_aux_duty_cycle',
    'rhu2_10F_to_20F_aux_duty_cycle',
    'rhu2_20F_to_30F_aux_duty_cycle',
    'rhu2_30F_to_40F_aux_duty_cycle',
    'rhu2_40F_to_50F_aux_duty_cycle',
    'rhu2_50F_to_60F_aux_duty_cycle',

    'rhu2_less10F_emg_duty_cycle',
    'rhu2_10F_to_20F_emg_duty_cycle',
    'rhu2_20F_to_30F_emg_duty_cycle',
    'rhu2_30F_to_40F_emg_duty_cycle',
    'rhu2_40F_to_50F_emg_duty_cycle',
    'rhu2_50F_to_60F_emg_duty_cycle',

    'rhu2_less10F_compressor_duty_cycle',
    'rhu2_10F_to_20F_compressor_duty_cycle',
    'rhu2_20F_to_30F_compressor_duty_cycle',
    'rhu2_30F_to_40F_compressor_duty_cycle',
    'rhu2_40F_to_50F_compressor_duty_cycle',
    'rhu2_50F_to_60F_compressor_duty_cycle',
]


[docs]def combine_output_dataframes(dfs): """ Combines output dataframes. Useful when combining output from batches. Parameters ---------- dfs : list of pd.DataFrame Output dataFrames to combine into one. Returns ------- out : pd.DataFrame Dataframe with combined output metadata. """ return pd.concat(dfs, ignore_index=True)
[docs]def get_filtered_stats( df, row_filter, label, heating_or_cooling, target_columns, target_baseline_method): n_rows_total = df.shape[0] filtered_df = df[[row_filter(row, df) for i, row in df.iterrows()]] n_rows_kept = filtered_df.shape[0] n_rows_discarded = n_rows_total - n_rows_kept stats = OrderedDict() stats["label"] = "{}_{}".format(label, heating_or_cooling) stats["sw_version"] = get_version() stats["n_thermostat_core_day_sets_total"] = n_rows_total stats["n_thermostat_core_day_sets_kept"] = n_rows_kept stats["n_thermostat_core_day_sets_discarded"] = n_rows_discarded if n_rows_total > 0: for column_name in target_columns: column = filtered_df[column_name].replace([np.inf, -np.inf], np.nan).dropna() # calculate quantiles and statistics mean = np.nanmean(pd.to_numeric(column)) if column.count() != 0: sem = np.nanstd(column) / (column.count() ** .5) else: sem = np.nan lower_bound = mean - (1.96 * sem) upper_bound = mean + (1.96 * sem) stats["{}_n".format(column_name)] = column.count() stats["{}_upper_bound_95_perc_conf".format(column_name)] = upper_bound stats["{}_mean".format(column_name)] = mean stats["{}_lower_bound_95_perc_conf".format(column_name)] = lower_bound stats["{}_sem".format(column_name)] = sem for quantile in QUANTILE: stats["{}_q{}".format(column_name, quantile)] = column.quantile(quantile / 100.) # Calculate IQR for RHU2 and filter outliers if 'rhu2' in column_name: iqr_filter = (column < column.quantile(UNFILTERED_PERCENTILE)) if bool(iqr_filter.any()) is False: iqr_filter = (column == column) warn("RHU filtering 5% and min Runtime filtering removed entire dataset from statistics summary for bin. Disabling filter.") iqr_filtered_column = column.loc[iqr_filter] # calculate quantiles and statistics for RHU2 IQR (IQFLT) and # non-IQR filtering (NOIQ) iqr_mean = np.nanmean(pd.to_numeric(iqr_filtered_column)) iqr_sem = np.nanstd(iqr_filtered_column) / (iqr_filtered_column.count() ** .5) iqr_lower_bound = iqr_mean - (1.96 * iqr_sem) iqr_upper_bound = iqr_mean + (1.96 * iqr_sem) noiq_mean = np.nanmean(pd.to_numeric(column)) noiq_sem = np.nanstd(column) / (column.count() ** .5) noiq_lower_bound = noiq_mean - (1.96 * noiq_sem) noiq_upper_bound = noiq_mean + (1.96 * noiq_sem) stats["{}_n_IQFLT".format(column_name)] = iqr_filtered_column.count() stats["{}_upper_bound_95_perc_conf_IQFLT".format(column_name)] = iqr_upper_bound stats["{}_mean_IQFLT".format(column_name)] = iqr_mean stats["{}_lower_bound_95_perc_conf_IQFLT".format(column_name)] = iqr_lower_bound stats["{}_sem_IQFLT".format(column_name)] = iqr_sem stats["{}_n_NOIQ".format(column_name)] = column.count() stats["{}_upper_bound_95_perc_conf_NOIQ".format(column_name)] = noiq_upper_bound stats["{}_mean_NOIQ".format(column_name)] = noiq_mean stats["{}_lower_bound_95_perc_conf_NOIQ".format(column_name)] = noiq_lower_bound stats["{}_sem_NOIQ".format(column_name)] = noiq_sem for quantile in QUANTILE: stats["{}_q{}_IQFLT".format(column_name, quantile)] = iqr_filtered_column.quantile(quantile / 100.) stats["{}_q{}_NOIQ".format(column_name, quantile)] = column.quantile(quantile / 100.) return [stats] else: warn( "Not enough data to compute summary_statistics ({}_{})" .format(label, heating_or_cooling) ) return []
[docs]def compute_summary_statistics( metrics_df, target_baseline_method="baseline_percentile", advanced_filtering=False): """ Computes summary statistics for the output dataframe. Computes the following statistics for each real-valued or integer valued column in the output dataframe: mean, standard error of the mean, and deciles. Parameters ---------- df : pd.DataFrame Output for which to compute summary statistics. label : str Name for this set of thermostat outputs. target_baseline_method : {"baseline_percentile", "baseline_regional"}, default "baseline_percentile" Baselining method by which samples will be filtered according to bad fits. Returns ------- stats : collections.OrderedDict An ordered dict containing the summary statistics. Column names are as follows, in which ### is a placeholder for the name of the column: - mean: ###_mean - standard error of the mean: ###_sem - 1st quantile: ###_1q - 2.5th quantile:###_2.5q - 5th quantile: ###_5q - 10th quantile: ###_10q - 15th quantile: ###_15q - 20th quantile: ###_20q - 25th quantile: ###_25q - 30th quantile: ###_30q - 35th quantile: ###_35q - 40th quantile: ###_40q - 45th quantile: ###_45q - 50th quantile: ###_50q - 55th quantile: ###_55q - 60th quantile: ###_60q - 65th quantile: ###_65q - 70th quantile: ###_70q - 75th quantile: ###_75q - 80th quantile: ###_80q - 85th quantile: ###_85q - 90th quantile: ###_90q - 95th quantile: ###_95q - 98th quantile: ###_98q - 99th quantile: ###_99q - number of non-null core day sets: ###_n The following general values are also output: - label: label - number of total core day sets: n_total_core_day_sets """ if target_baseline_method not in ["baseline_percentile", "baseline_regional"]: message = ( 'Baseline method not supported - please use one of' ' "baseline_percentile" or "baseline_regional"' ) raise ValueError(message) def _identity_filter(row, df): return True def _range_filter(row, column_name, heating_or_cooling, lower_bound=-np.inf, upper_bound=np.inf, target_baseline=False): if target_baseline: full_column_selector = "{}_{}".format(column_name, target_baseline_method) else: full_column_selector = column_name column_value = row[full_column_selector] return lower_bound < column_value < upper_bound def _percentile_range_filter(row, column_name, heating_or_cooling, df, quantile=0.0, target_baseline=False): if target_baseline: full_column_selector = "{}_{}".format(column_name, target_baseline_method) else: full_column_selector = column_name lower_bound = df[full_column_selector].dropna().quantile(0.0 + quantile) upper_bound = df[full_column_selector].dropna().quantile(1.0 - quantile) return _range_filter(row, column_name, heating_or_cooling, lower_bound, upper_bound, target_baseline) def _tau_filter_heating(row, df): return _range_filter(row, "tau", "heating", 0, 25) def _tau_filter_cooling(row, df): return _range_filter(row, "tau", "cooling", 0, 25) def _cvrmse_filter_heating(row, df): return _range_filter(row, "cv_root_mean_sq_err", "heating", upper_bound=0.6) def _cvrmse_filter_cooling(row, df): return _range_filter(row, "cv_root_mean_sq_err", "cooling", upper_bound=0.6) def _savings_filter_p01_heating(row, df): return _percentile_range_filter(row, "percent_savings", "heating", df, 0.01, True) def _savings_filter_p01_cooling(row, df): return _percentile_range_filter(row, "percent_savings", "cooling", df, 0.01, True) def _combine_filters(filters): def _new_filter(row, df): return reduce(lambda x, y: x and y(row, df), filters, True) return _new_filter def heating_stats(df, filter_, label): heating_df = df[["heating" in name for name in df["heating_or_cooling"]]] return get_filtered_stats( heating_df, filter_, label, "heating", REAL_OR_INTEGER_VALUED_COLUMNS_HEATING, target_baseline_method) def cooling_stats(df, filter_, label): cooling_df = df[["cooling" in name for name in df["heating_or_cooling"]]] return get_filtered_stats( cooling_df, filter_, label, "cooling", REAL_OR_INTEGER_VALUED_COLUMNS_COOLING, target_baseline_method) very_cold_cold_df = metrics_df[[ (cz is not None) and "Very-Cold/Cold" in cz for cz in metrics_df["climate_zone"] ]] mixed_humid_df = metrics_df[[ (cz is not None) and "Mixed-Humid" in cz for cz in metrics_df["climate_zone"] ]] mixed_dry_hot_dry_df = metrics_df[[ (cz is not None) and "Mixed-Dry/Hot-Dry" in cz for cz in metrics_df["climate_zone"] ]] hot_humid_df = metrics_df[[ (cz is not None) and "Hot-Humid" in cz for cz in metrics_df["climate_zone"] ]] marine_df = metrics_df[[ (cz is not None) and "Marine" in cz for cz in metrics_df["climate_zone"] ]] filter_0 = _identity_filter filter_1_heating = _combine_filters([_tau_filter_heating]) filter_1_cooling = _combine_filters([_tau_filter_cooling]) filter_2_heating = _combine_filters([_tau_filter_heating, _cvrmse_filter_heating]) filter_2_cooling = _combine_filters([_tau_filter_cooling, _cvrmse_filter_cooling]) filter_3_heating = _combine_filters([_tau_filter_heating, _cvrmse_filter_heating, _savings_filter_p01_heating]) filter_3_cooling = _combine_filters([_tau_filter_cooling, _cvrmse_filter_cooling, _savings_filter_p01_cooling]) if advanced_filtering: stats = list(chain.from_iterable([ heating_stats(metrics_df, filter_0, "all_no_filter"), cooling_stats(metrics_df, filter_0, "all_no_filter"), heating_stats(very_cold_cold_df, filter_0, "very-cold_cold_no_filter"), cooling_stats(very_cold_cold_df, filter_0, "very-cold_cold_no_filter"), heating_stats(mixed_humid_df, filter_0, "mixed-humid_no_filter"), cooling_stats(mixed_humid_df, filter_0, "mixed-humid_no_filter"), heating_stats(mixed_dry_hot_dry_df, filter_0, "mixed-dry_hot-dry_no_filter"), cooling_stats(mixed_dry_hot_dry_df, filter_0, "mixed-dry_hot-dry_no_filter"), heating_stats(hot_humid_df, filter_0, "hot-humid_no_filter"), cooling_stats(hot_humid_df, filter_0, "hot-humid_no_filter"), heating_stats(marine_df, filter_0, "marine_no_filter"), cooling_stats(marine_df, filter_0, "marine_no_filter"), heating_stats(metrics_df, filter_1_heating, "all_tau_filter"), cooling_stats(metrics_df, filter_1_cooling, "all_tau_filter"), heating_stats(very_cold_cold_df, filter_1_heating, "very-cold_cold_tau_filter"), cooling_stats(very_cold_cold_df, filter_1_cooling, "very-cold_cold_tau_filter"), heating_stats(mixed_humid_df, filter_1_heating, "mixed-humid_tau_filter"), cooling_stats(mixed_humid_df, filter_1_cooling, "mixed-humid_tau_filter"), heating_stats(mixed_dry_hot_dry_df, filter_1_heating, "mixed-dry_hot-dry_tau_filter"), cooling_stats(mixed_dry_hot_dry_df, filter_1_cooling, "mixed-dry_hot-dry_tau_filter"), heating_stats(hot_humid_df, filter_1_heating, "hot-humid_tau_filter"), cooling_stats(hot_humid_df, filter_1_cooling, "hot-humid_tau_filter"), heating_stats(marine_df, filter_1_heating, "marine_tau_filter"), cooling_stats(marine_df, filter_1_cooling, "marine_tau_filter"), heating_stats(metrics_df, filter_2_heating, "all_tau_cvrmse_filter"), cooling_stats(metrics_df, filter_2_cooling, "all_tau_cvrmse_filter"), heating_stats(very_cold_cold_df, filter_2_heating, "very-cold_cold_tau_cvrmse_filter"), cooling_stats(very_cold_cold_df, filter_2_cooling, "very-cold_cold_tau_cvrmse_filter"), heating_stats(mixed_humid_df, filter_2_heating, "mixed-humid_tau_cvrmse_filter"), cooling_stats(mixed_humid_df, filter_2_cooling, "mixed-humid_tau_cvrmse_filter"), heating_stats(mixed_dry_hot_dry_df, filter_2_heating, "mixed-dry_hot-dry_tau_cvrmse_filter"), cooling_stats(mixed_dry_hot_dry_df, filter_2_cooling, "mixed-dry_hot-dry_tau_cvrmse_filter"), heating_stats(hot_humid_df, filter_2_heating, "hot-humid_tau_cvrmse_filter"), cooling_stats(hot_humid_df, filter_2_cooling, "hot-humid_tau_cvrmse_filter"), heating_stats(marine_df, filter_2_heating, "marine_tau_cvrmse_filter"), cooling_stats(marine_df, filter_2_cooling, "marine_tau_cvrmse_filter"), heating_stats(metrics_df, filter_3_heating, "all_tau_cvrmse_savings_p01_filter"), cooling_stats(metrics_df, filter_3_cooling, "all_tau_cvrmse_savings_p01_filter"), heating_stats(very_cold_cold_df, filter_3_heating, "very-cold_cold_tau_cvrmse_savings_p01_filter"), cooling_stats(very_cold_cold_df, filter_3_cooling, "very-cold_cold_tau_cvrmse_savings_p01_filter"), heating_stats(mixed_humid_df, filter_3_heating, "mixed-humid_tau_cvrmse_savings_p01_filter"), cooling_stats(mixed_humid_df, filter_3_cooling, "mixed-humid_tau_cvrmse_savings_p01_filter"), heating_stats(mixed_dry_hot_dry_df, filter_3_heating, "mixed-dry_hot-dry_tau_cvrmse_savings_p01_filter"), cooling_stats(mixed_dry_hot_dry_df, filter_3_cooling, "mixed-dry_hot-dry_tau_cvrmse_savings_p01_filter"), heating_stats(hot_humid_df, filter_3_heating, "hot-humid_tau_cvrmse_savings_p01_filter"), cooling_stats(hot_humid_df, filter_3_cooling, "hot-humid_tau_cvrmse_savings_p01_filter"), heating_stats(marine_df, filter_3_heating, "marine_tau_cvrmse_savings_p01_filter"), cooling_stats(marine_df, filter_3_cooling, "marine_tau_cvrmse_savings_p01_filter"), ])) else: stats = list(chain.from_iterable([ heating_stats(metrics_df, filter_0, "all_no_filter"), cooling_stats(metrics_df, filter_0, "all_no_filter"), heating_stats(very_cold_cold_df, filter_0, "very-cold_cold_no_filter"), cooling_stats(very_cold_cold_df, filter_0, "very-cold_cold_no_filter"), heating_stats(mixed_humid_df, filter_0, "mixed-humid_no_filter"), cooling_stats(mixed_humid_df, filter_0, "mixed-humid_no_filter"), heating_stats(mixed_dry_hot_dry_df, filter_0, "mixed-dry_hot-dry_no_filter"), cooling_stats(mixed_dry_hot_dry_df, filter_0, "mixed-dry_hot-dry_no_filter"), heating_stats(hot_humid_df, filter_0, "hot-humid_no_filter"), cooling_stats(hot_humid_df, filter_0, "hot-humid_no_filter"), heating_stats(marine_df, filter_0, "marine_no_filter"), cooling_stats(marine_df, filter_0, "marine_no_filter"), heating_stats(metrics_df, filter_3_heating, "all_tau_cvrmse_savings_p01_filter"), cooling_stats(metrics_df, filter_3_cooling, "all_tau_cvrmse_savings_p01_filter"), heating_stats(very_cold_cold_df, filter_3_heating, "very-cold_cold_tau_cvrmse_savings_p01_filter"), cooling_stats(very_cold_cold_df, filter_3_cooling, "very-cold_cold_tau_cvrmse_savings_p01_filter"), heating_stats(mixed_humid_df, filter_3_heating, "mixed-humid_tau_cvrmse_savings_p01_filter"), cooling_stats(mixed_humid_df, filter_3_cooling, "mixed-humid_tau_cvrmse_savings_p01_filter"), heating_stats(mixed_dry_hot_dry_df, filter_3_heating, "mixed-dry_hot-dry_tau_cvrmse_savings_p01_filter"), cooling_stats(mixed_dry_hot_dry_df, filter_3_cooling, "mixed-dry_hot-dry_tau_cvrmse_savings_p01_filter"), heating_stats(hot_humid_df, filter_3_heating, "hot-humid_tau_cvrmse_savings_p01_filter"), cooling_stats(hot_humid_df, filter_3_cooling, "hot-humid_tau_cvrmse_savings_p01_filter"), heating_stats(marine_df, filter_3_heating, "marine_tau_cvrmse_savings_p01_filter"), cooling_stats(marine_df, filter_3_cooling, "marine_tau_cvrmse_savings_p01_filter"), ])) stats_dict = {stat["label"]: stat for stat in stats} def _load_climate_zone_weights(filename_or_buffer): climate_zone_keys = { "Very-Cold/Cold": "very-cold_cold", "Mixed-Humid": "mixed-humid", "Mixed-Dry/Hot-Dry": "mixed-dry_hot-dry", "Hot-Humid": "hot-humid", "Marine": "marine", } df = pd.read_csv( filename_or_buffer, usecols=["climate_zone", "heating_weight", "cooling_weight"], ).set_index("climate_zone") heating_weights = {climate_zone_keys[cz]: weight for cz, weight in df["heating_weight"].iteritems()} cooling_weights = {climate_zone_keys[cz]: weight for cz, weight in df["cooling_weight"].iteritems()} return heating_weights, cooling_weights with resource_stream('thermostat.resources', 'NationalAverageClimateZoneWeightings.csv') as f: heating_weights, cooling_weights = _load_climate_zone_weights(f) def _compute_national_weightings(stats_by_climate_zone, keys, weights): def _national_weight(key): results = [] for cz, weight in weights.items(): stat_cz = stats_by_climate_zone.get(cz) if stat_cz is None: value = None else: value = stat_cz.get(key) if pd.notnull(weight) and pd.notnull(value): results.append((weight, value)) if len(results) == 0: return None else: weighted_sum = sum([weight * value for weight, value in results]) sum_of_weights = sum([weight for weight, _ in results]) return weighted_sum / sum_of_weights stats = [ "mean", "q1", "q2.5", "q5", "q10", "q15", "q20", "q25", "q30", "q35", "q40", "q45", "q50", "q55", "q60", "q65", "q70", "q75", "q80", "q85", "q90", "q95", "q98", "q99", ] key_stats = [ "{}_{}".format(key, stat) for key in keys for stat in stats ] return { "{}_{}".format(key_stat, "national_weighted_mean"): _national_weight(key_stat) for key_stat in key_stats } def _compute_national_weighting_lower_and_upper_bounds( stats_by_climate_zone, keys, weights): def _compute_bounds(key): # compute sem savings means, sems, weights_ = [], [], [] for cz, weight in weights.items(): stat_cz = stats_by_climate_zone.get(cz) if stat_cz is None: mean, sem = None, None else: mean = stat_cz.get("{}_mean".format(key), None) sem = stat_cz.get("{}_sem".format(key), None) if pd.notnull(weight) and pd.notnull(mean) and pd.notnull(sem): weights_.append(weight) means.append(mean) sems.append(sem) if len(weights_) == 0: return {} else: weighted_sum = sum([ weight * mean for weight, mean in zip(weights_, means) ]) weighted_mean = weighted_sum / sum(weights_) # renormalize weighted_sem = sum([ (weight*sem) ** 2 for weight, sem in zip(weights_, sems) ]) ** 0.5 lower_bound = weighted_mean - (1.96 * weighted_sem) upper_bound = weighted_mean + (1.96 * weighted_sem) return { "{}_lower_bound_95_perc_conf_national_weighted_mean".format(key): lower_bound, "{}_upper_bound_95_perc_conf_national_weighted_mean".format(key): upper_bound } items = {} for key in keys: items.update(_compute_bounds(key)) return items national_weighting_stats = [] if advanced_filtering: filters = [ "no_filter", "tau_filter", "tau_cvrmse_filter", "tau_cvrmse_savings_p01_filter", ] else: filters = [ "no_filter", "tau_cvrmse_savings_p01_filter", ] climate_zones = [ "mixed-humid", "mixed-dry_hot-dry", "marine", "hot-humid", "very-cold_cold" ] methods = [ "baseline_percentile", "baseline_regional", ] for season_type in ["heating", "cooling"]: if season_type == "heating": weights = heating_weights else: weights = cooling_weights for filter_ in filters: stats_by_climate_zone = { cz: stats_dict.get("{}_{}_{}".format(cz, filter_, season_type)) for cz in climate_zones } keys = ["percent_savings_{}".format(method) for method in methods] national_weightings = _compute_national_weightings( stats_by_climate_zone, keys, weights) bounds = _compute_national_weighting_lower_and_upper_bounds( stats_by_climate_zone, keys, weights) national_weightings.update(bounds) national_weightings.update( {"label": "national_weighted_mean_{}_{}".format(season_type, filter_)} ) national_weighting_stats.append(national_weightings) stats = national_weighting_stats + stats return stats
[docs]def summary_statistics_to_csv(stats, filepath, product_id): """ Write metric statistics to CSV file. Parameters ---------- stats : list of dict List of outputs from thermostat.stats.compute_summary_statistics() filepath : str Filepath at which to save the suppary statistics product_id : str A combination of the connected thermostat service plus one or more connected thermostat device models that comprises the data set. Returns ------- df : pandas.DataFrame A pandas dataframe containing the output data. """ columns = [ "label", "product_id", "sw_version", ] methods = [ "baseline_percentile", "baseline_regional", ] national_weighting_columns = list(chain.from_iterable([ [ "percent_savings_{}_mean_national_weighted_mean".format(method), "percent_savings_{}_q1_national_weighted_mean".format(method), "percent_savings_{}_q2.5_national_weighted_mean".format(method), "percent_savings_{}_q5_national_weighted_mean".format(method), "percent_savings_{}_q10_national_weighted_mean".format(method), "percent_savings_{}_q15_national_weighted_mean".format(method), "percent_savings_{}_q20_national_weighted_mean".format(method), "percent_savings_{}_q25_national_weighted_mean".format(method), "percent_savings_{}_q30_national_weighted_mean".format(method), "percent_savings_{}_q35_national_weighted_mean".format(method), "percent_savings_{}_q40_national_weighted_mean".format(method), "percent_savings_{}_q45_national_weighted_mean".format(method), "percent_savings_{}_q50_national_weighted_mean".format(method), "percent_savings_{}_q55_national_weighted_mean".format(method), "percent_savings_{}_q60_national_weighted_mean".format(method), "percent_savings_{}_q65_national_weighted_mean".format(method), "percent_savings_{}_q70_national_weighted_mean".format(method), "percent_savings_{}_q75_national_weighted_mean".format(method), "percent_savings_{}_q80_national_weighted_mean".format(method), "percent_savings_{}_q85_national_weighted_mean".format(method), "percent_savings_{}_q90_national_weighted_mean".format(method), "percent_savings_{}_q95_national_weighted_mean".format(method), "percent_savings_{}_q98_national_weighted_mean".format(method), "percent_savings_{}_q99_national_weighted_mean".format(method), "percent_savings_{}_lower_bound_95_perc_conf_national_weighted_mean".format(method), "percent_savings_{}_upper_bound_95_perc_conf_national_weighted_mean".format(method), ] for method in methods ])) columns.extend(national_weighting_columns) columns.extend([ "n_thermostat_core_day_sets_total", "n_thermostat_core_day_sets_kept", "n_thermostat_core_day_sets_discarded", ]) for column_name in REAL_OR_INTEGER_VALUED_COLUMNS_ALL: columns.append("{}_n".format(column_name)) columns.append("{}_upper_bound_95_perc_conf".format(column_name)) columns.append("{}_mean".format(column_name)) columns.append("{}_lower_bound_95_perc_conf".format(column_name)) columns.append("{}_sem".format(column_name)) for quantile in QUANTILE: columns.append("{}_q{}".format(column_name, quantile)) for rhu_column in [column for column in columns if 'rhu2' in column]: columns.append(rhu_column + '_IQFLT') for rhu_column in [column for column in columns if 'IQFLT' in column]: columns.append(rhu_column.replace('IQFLT', 'NOIQ')) # add product_id for row in stats: row["product_id"] = product_id # transpose for readability. stats_dataframe = pd.DataFrame(stats, columns=columns).set_index('label').transpose() stats_dataframe.to_csv(filepath) return stats_dataframe