Source code for thermostat.stats

import pandas as pd
import numpy as np
from scipy.stats import norm

from collections import OrderedDict
from collections import defaultdict
from itertools import chain
from warnings import warn
import json
from functools import reduce
from pkg_resources import resource_stream
import logging

from thermostat import get_version

QUANTILE = [1, 2.5, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 98, 99]
IQR_FILTER_PARAMETER = 1.5
TOP_ONLY_PERCENTILE_FILTER = .05  # Filters top 5 percent for RHU2 calculation
UNFILTERED_PERCENTILE = 1 - TOP_ONLY_PERCENTILE_FILTER

logger = logging.getLogger('epathermostat')


REAL_OR_INTEGER_VALUED_COLUMNS_HEATING = [
    'n_days_in_inputfile_date_range',
    'n_days_both_heating_and_cooling',
    'n_days_insufficient_data',
    'n_core_heating_days',

    'baseline_percentile_core_heating_comfort_temperature',
    'regional_average_baseline_heating_comfort_temperature',

    'percent_savings_baseline_percentile',
    'avoided_daily_mean_core_day_runtime_baseline_percentile',
    'avoided_total_core_day_runtime_baseline_percentile',
    'baseline_daily_mean_core_day_runtime_baseline_percentile',
    'baseline_total_core_day_runtime_baseline_percentile',
    '_daily_mean_core_day_demand_baseline_baseline_percentile',
    'percent_savings_baseline_regional',
    'avoided_daily_mean_core_day_runtime_baseline_regional',
    'avoided_total_core_day_runtime_baseline_regional',
    'baseline_daily_mean_core_day_runtime_baseline_regional',
    'baseline_total_core_day_runtime_baseline_regional',
    '_daily_mean_core_day_demand_baseline_baseline_regional',
    'mean_demand',
    'alpha',
    'tau',
    'mean_sq_err',
    'root_mean_sq_err',
    'cv_root_mean_sq_err',
    'mean_abs_err',
    'mean_abs_pct_err',

    'total_core_heating_runtime',
    'total_auxiliary_heating_core_day_runtime',
    'total_emergency_heating_core_day_runtime',

    'daily_mean_core_heating_runtime',

    'core_heating_days_mean_indoor_temperature',
    'core_heating_days_mean_outdoor_temperature',
    'core_mean_indoor_temperature',
    'core_mean_outdoor_temperature',

    'rhu1_aux_duty_cycle',
    'rhu1_emg_duty_cycle',
    'rhu1_compressor_duty_cycle',

    'rhu1_00F_to_05F',
    'rhu1_05F_to_10F',
    'rhu1_10F_to_15F',
    'rhu1_15F_to_20F',
    'rhu1_20F_to_25F',
    'rhu1_25F_to_30F',
    'rhu1_30F_to_35F',
    'rhu1_35F_to_40F',
    'rhu1_40F_to_45F',
    'rhu1_45F_to_50F',
    'rhu1_50F_to_55F',
    'rhu1_55F_to_60F',

    'rhu1_05F_to_10F_aux_duty_cycle',
    'rhu1_10F_to_15F_aux_duty_cycle',
    'rhu1_15F_to_20F_aux_duty_cycle',
    'rhu1_20F_to_25F_aux_duty_cycle',
    'rhu1_25F_to_30F_aux_duty_cycle',
    'rhu1_30F_to_35F_aux_duty_cycle',
    'rhu1_35F_to_40F_aux_duty_cycle',
    'rhu1_40F_to_45F_aux_duty_cycle',
    'rhu1_45F_to_50F_aux_duty_cycle',
    'rhu1_50F_to_55F_aux_duty_cycle',
    'rhu1_55F_to_60F_aux_duty_cycle',

    'rhu1_00F_to_05F_emg_duty_cycle',
    'rhu1_05F_to_10F_emg_duty_cycle',
    'rhu1_10F_to_15F_emg_duty_cycle',
    'rhu1_15F_to_20F_emg_duty_cycle',
    'rhu1_20F_to_25F_emg_duty_cycle',
    'rhu1_25F_to_30F_emg_duty_cycle',
    'rhu1_30F_to_35F_emg_duty_cycle',
    'rhu1_35F_to_40F_emg_duty_cycle',
    'rhu1_40F_to_45F_emg_duty_cycle',
    'rhu1_45F_to_50F_emg_duty_cycle',
    'rhu1_50F_to_55F_emg_duty_cycle',
    'rhu1_55F_to_60F_emg_duty_cycle',

    'rhu1_00F_to_05F_compressor_duty_cycle',
    'rhu1_05F_to_10F_compressor_duty_cycle',
    'rhu1_10F_to_15F_compressor_duty_cycle',
    'rhu1_15F_to_20F_compressor_duty_cycle',
    'rhu1_20F_to_25F_compressor_duty_cycle',
    'rhu1_25F_to_30F_compressor_duty_cycle',
    'rhu1_30F_to_35F_compressor_duty_cycle',
    'rhu1_35F_to_40F_compressor_duty_cycle',
    'rhu1_40F_to_45F_compressor_duty_cycle',
    'rhu1_45F_to_50F_compressor_duty_cycle',
    'rhu1_50F_to_55F_compressor_duty_cycle',
    'rhu1_55F_to_60F_compressor_duty_cycle',

    'rhu1_less10F',
    'rhu1_10F_to_20F',
    'rhu1_20F_to_30F',
    'rhu1_30F_to_40F',
    'rhu1_40F_to_50F',
    'rhu1_50F_to_60F',
    'rhu1_00F_to_05F_aux_duty_cycle',

    'rhu1_less10F_aux_duty_cycle',
    'rhu1_10F_to_20F_aux_duty_cycle',
    'rhu1_20F_to_30F_aux_duty_cycle',
    'rhu1_30F_to_40F_aux_duty_cycle',
    'rhu1_40F_to_50F_aux_duty_cycle',
    'rhu1_50F_to_60F_aux_duty_cycle',

    'rhu1_less10F_emg_duty_cycle',
    'rhu1_10F_to_20F_emg_duty_cycle',
    'rhu1_20F_to_30F_emg_duty_cycle',
    'rhu1_30F_to_40F_emg_duty_cycle',
    'rhu1_40F_to_50F_emg_duty_cycle',
    'rhu1_50F_to_60F_emg_duty_cycle',

    'rhu1_less10F_compressor_duty_cycle',
    'rhu1_10F_to_20F_compressor_duty_cycle',
    'rhu1_20F_to_30F_compressor_duty_cycle',
    'rhu1_30F_to_40F_compressor_duty_cycle',
    'rhu1_40F_to_50F_compressor_duty_cycle',
    'rhu1_50F_to_60F_compressor_duty_cycle',

    'rhu2_00F_to_05F',
    'rhu2_05F_to_10F',
    'rhu2_10F_to_15F',
    'rhu2_15F_to_20F',
    'rhu2_20F_to_25F',
    'rhu2_25F_to_30F',
    'rhu2_30F_to_35F',
    'rhu2_35F_to_40F',
    'rhu2_40F_to_45F',
    'rhu2_45F_to_50F',
    'rhu2_50F_to_55F',
    'rhu2_55F_to_60F',

    'rhu2_00F_to_05F_aux_duty_cycle',
    'rhu2_05F_to_10F_aux_duty_cycle',
    'rhu2_10F_to_15F_aux_duty_cycle',
    'rhu2_15F_to_20F_aux_duty_cycle',
    'rhu2_20F_to_25F_aux_duty_cycle',
    'rhu2_25F_to_30F_aux_duty_cycle',
    'rhu2_30F_to_35F_aux_duty_cycle',
    'rhu2_35F_to_40F_aux_duty_cycle',
    'rhu2_40F_to_45F_aux_duty_cycle',
    'rhu2_45F_to_50F_aux_duty_cycle',
    'rhu2_50F_to_55F_aux_duty_cycle',
    'rhu2_55F_to_60F_aux_duty_cycle',

    'rhu2_00F_to_05F_emg_duty_cycle',
    'rhu2_05F_to_10F_emg_duty_cycle',
    'rhu2_10F_to_15F_emg_duty_cycle',
    'rhu2_15F_to_20F_emg_duty_cycle',
    'rhu2_20F_to_25F_emg_duty_cycle',
    'rhu2_25F_to_30F_emg_duty_cycle',
    'rhu2_30F_to_35F_emg_duty_cycle',
    'rhu2_35F_to_40F_emg_duty_cycle',
    'rhu2_40F_to_45F_emg_duty_cycle',
    'rhu2_45F_to_50F_emg_duty_cycle',
    'rhu2_50F_to_55F_emg_duty_cycle',
    'rhu2_55F_to_60F_emg_duty_cycle',

    'rhu2_00F_to_05F_compressor_duty_cycle',
    'rhu2_05F_to_10F_compressor_duty_cycle',
    'rhu2_10F_to_15F_compressor_duty_cycle',
    'rhu2_15F_to_20F_compressor_duty_cycle',
    'rhu2_20F_to_25F_compressor_duty_cycle',
    'rhu2_25F_to_30F_compressor_duty_cycle',
    'rhu2_30F_to_35F_compressor_duty_cycle',
    'rhu2_35F_to_40F_compressor_duty_cycle',
    'rhu2_40F_to_45F_compressor_duty_cycle',
    'rhu2_45F_to_50F_compressor_duty_cycle',
    'rhu2_50F_to_55F_compressor_duty_cycle',
    'rhu2_55F_to_60F_compressor_duty_cycle',

    'rhu2_less10F',
    'rhu2_10F_to_20F',
    'rhu2_20F_to_30F',
    'rhu2_30F_to_40F',
    'rhu2_40F_to_50F',
    'rhu2_50F_to_60F',

    'rhu2_less10F_aux_duty_cycle',
    'rhu2_10F_to_20F_aux_duty_cycle',
    'rhu2_20F_to_30F_aux_duty_cycle',
    'rhu2_30F_to_40F_aux_duty_cycle',
    'rhu2_40F_to_50F_aux_duty_cycle',
    'rhu2_50F_to_60F_aux_duty_cycle',

    'rhu2_less10F_emg_duty_cycle',
    'rhu2_10F_to_20F_emg_duty_cycle',
    'rhu2_20F_to_30F_emg_duty_cycle',
    'rhu2_30F_to_40F_emg_duty_cycle',
    'rhu2_40F_to_50F_emg_duty_cycle',
    'rhu2_50F_to_60F_emg_duty_cycle',

    'rhu2_less10F_compressor_duty_cycle',
    'rhu2_10F_to_20F_compressor_duty_cycle',
    'rhu2_20F_to_30F_compressor_duty_cycle',
    'rhu2_30F_to_40F_compressor_duty_cycle',
    'rhu2_40F_to_50F_compressor_duty_cycle',
    'rhu2_50F_to_60F_compressor_duty_cycle',

]

REAL_OR_INTEGER_VALUED_COLUMNS_COOLING = [
    'n_days_in_inputfile_date_range',
    'n_days_both_heating_and_cooling',
    'n_days_insufficient_data',
    'n_core_cooling_days',

    'baseline_percentile_core_cooling_comfort_temperature',
    'regional_average_baseline_cooling_comfort_temperature',

    'percent_savings_baseline_percentile',
    'avoided_daily_mean_core_day_runtime_baseline_percentile',
    'avoided_total_core_day_runtime_baseline_percentile',
    'baseline_daily_mean_core_day_runtime_baseline_percentile',
    'baseline_total_core_day_runtime_baseline_percentile',
    '_daily_mean_core_day_demand_baseline_baseline_percentile',
    'percent_savings_baseline_regional',
    'avoided_daily_mean_core_day_runtime_baseline_regional',
    'avoided_total_core_day_runtime_baseline_regional',
    'baseline_daily_mean_core_day_runtime_baseline_regional',
    'baseline_total_core_day_runtime_baseline_regional',
    '_daily_mean_core_day_demand_baseline_baseline_regional',
    'mean_demand',
    'alpha',
    'tau',
    'mean_sq_err',
    'root_mean_sq_err',
    'cv_root_mean_sq_err',
    'mean_abs_err',
    'mean_abs_pct_err',

    'total_core_cooling_runtime',

    'daily_mean_core_cooling_runtime',

    'core_cooling_days_mean_indoor_temperature',
    'core_cooling_days_mean_outdoor_temperature',
    'core_mean_indoor_temperature',
    'core_mean_outdoor_temperature',
]

REAL_OR_INTEGER_VALUED_COLUMNS_ALL = [
    'n_days_in_inputfile_date_range',
    'n_days_both_heating_and_cooling',
    'n_days_insufficient_data',
    'n_core_cooling_days',
    'n_core_heating_days',

    'baseline_percentile_core_cooling_comfort_temperature',
    'baseline_percentile_core_heating_comfort_temperature',
    'regional_average_baseline_cooling_comfort_temperature',
    'regional_average_baseline_heating_comfort_temperature',

    'percent_savings_baseline_percentile',
    'avoided_daily_mean_core_day_runtime_baseline_percentile',
    'avoided_total_core_day_runtime_baseline_percentile',
    'baseline_daily_mean_core_day_runtime_baseline_percentile',
    'baseline_total_core_day_runtime_baseline_percentile',
    '_daily_mean_core_day_demand_baseline_baseline_percentile',
    'percent_savings_baseline_regional',
    'avoided_daily_mean_core_day_runtime_baseline_regional',
    'avoided_total_core_day_runtime_baseline_regional',
    'baseline_daily_mean_core_day_runtime_baseline_regional',
    'baseline_total_core_day_runtime_baseline_regional',
    '_daily_mean_core_day_demand_baseline_baseline_regional',
    'mean_demand',
    'alpha',
    'tau',
    'mean_sq_err',
    'root_mean_sq_err',
    'cv_root_mean_sq_err',
    'mean_abs_err',
    'mean_abs_pct_err',

    'total_core_cooling_runtime',
    'total_core_heating_runtime',
    'total_auxiliary_heating_core_day_runtime',
    'total_emergency_heating_core_day_runtime',

    'daily_mean_core_cooling_runtime',
    'daily_mean_core_heating_runtime',

    'core_mean_indoor_temperature',
    'core_mean_outdoor_temperature',

    'rhu1_aux_duty_cycle',
    'rhu1_emg_duty_cycle',
    'rhu1_compressor_duty_cycle',

    'rhu1_00F_to_05F',
    'rhu1_05F_to_10F',
    'rhu1_10F_to_15F',
    'rhu1_15F_to_20F',
    'rhu1_20F_to_25F',
    'rhu1_25F_to_30F',
    'rhu1_30F_to_35F',
    'rhu1_35F_to_40F',
    'rhu1_40F_to_45F',
    'rhu1_45F_to_50F',
    'rhu1_50F_to_55F',
    'rhu1_55F_to_60F',

    'rhu1_05F_to_10F_aux_duty_cycle',
    'rhu1_10F_to_15F_aux_duty_cycle',
    'rhu1_15F_to_20F_aux_duty_cycle',
    'rhu1_20F_to_25F_aux_duty_cycle',
    'rhu1_25F_to_30F_aux_duty_cycle',
    'rhu1_30F_to_35F_aux_duty_cycle',
    'rhu1_35F_to_40F_aux_duty_cycle',
    'rhu1_40F_to_45F_aux_duty_cycle',
    'rhu1_45F_to_50F_aux_duty_cycle',
    'rhu1_50F_to_55F_aux_duty_cycle',
    'rhu1_55F_to_60F_aux_duty_cycle',

    'rhu1_00F_to_05F_emg_duty_cycle',
    'rhu1_05F_to_10F_emg_duty_cycle',
    'rhu1_10F_to_15F_emg_duty_cycle',
    'rhu1_15F_to_20F_emg_duty_cycle',
    'rhu1_20F_to_25F_emg_duty_cycle',
    'rhu1_25F_to_30F_emg_duty_cycle',
    'rhu1_30F_to_35F_emg_duty_cycle',
    'rhu1_35F_to_40F_emg_duty_cycle',
    'rhu1_40F_to_45F_emg_duty_cycle',
    'rhu1_45F_to_50F_emg_duty_cycle',
    'rhu1_50F_to_55F_emg_duty_cycle',
    'rhu1_55F_to_60F_emg_duty_cycle',

    'rhu1_00F_to_05F_compressor_duty_cycle',
    'rhu1_05F_to_10F_compressor_duty_cycle',
    'rhu1_10F_to_15F_compressor_duty_cycle',
    'rhu1_15F_to_20F_compressor_duty_cycle',
    'rhu1_20F_to_25F_compressor_duty_cycle',
    'rhu1_25F_to_30F_compressor_duty_cycle',
    'rhu1_30F_to_35F_compressor_duty_cycle',
    'rhu1_35F_to_40F_compressor_duty_cycle',
    'rhu1_40F_to_45F_compressor_duty_cycle',
    'rhu1_45F_to_50F_compressor_duty_cycle',
    'rhu1_50F_to_55F_compressor_duty_cycle',
    'rhu1_55F_to_60F_compressor_duty_cycle',

    'rhu1_less10F',
    'rhu1_10F_to_20F',
    'rhu1_20F_to_30F',
    'rhu1_30F_to_40F',
    'rhu1_40F_to_50F',
    'rhu1_50F_to_60F',
    'rhu1_00F_to_05F_aux_duty_cycle',

    'rhu1_less10F_aux_duty_cycle',
    'rhu1_10F_to_20F_aux_duty_cycle',
    'rhu1_20F_to_30F_aux_duty_cycle',
    'rhu1_30F_to_40F_aux_duty_cycle',
    'rhu1_40F_to_50F_aux_duty_cycle',
    'rhu1_50F_to_60F_aux_duty_cycle',

    'rhu1_less10F_emg_duty_cycle',
    'rhu1_10F_to_20F_emg_duty_cycle',
    'rhu1_20F_to_30F_emg_duty_cycle',
    'rhu1_30F_to_40F_emg_duty_cycle',
    'rhu1_40F_to_50F_emg_duty_cycle',
    'rhu1_50F_to_60F_emg_duty_cycle',

    'rhu1_less10F_compressor_duty_cycle',
    'rhu1_10F_to_20F_compressor_duty_cycle',
    'rhu1_20F_to_30F_compressor_duty_cycle',
    'rhu1_30F_to_40F_compressor_duty_cycle',
    'rhu1_40F_to_50F_compressor_duty_cycle',
    'rhu1_50F_to_60F_compressor_duty_cycle',

    'rhu2_00F_to_05F',
    'rhu2_05F_to_10F',
    'rhu2_10F_to_15F',
    'rhu2_15F_to_20F',
    'rhu2_20F_to_25F',
    'rhu2_25F_to_30F',
    'rhu2_30F_to_35F',
    'rhu2_35F_to_40F',
    'rhu2_40F_to_45F',
    'rhu2_45F_to_50F',
    'rhu2_50F_to_55F',
    'rhu2_55F_to_60F',

    'rhu2_00F_to_05F_aux_duty_cycle',
    'rhu2_05F_to_10F_aux_duty_cycle',
    'rhu2_10F_to_15F_aux_duty_cycle',
    'rhu2_15F_to_20F_aux_duty_cycle',
    'rhu2_20F_to_25F_aux_duty_cycle',
    'rhu2_25F_to_30F_aux_duty_cycle',
    'rhu2_30F_to_35F_aux_duty_cycle',
    'rhu2_35F_to_40F_aux_duty_cycle',
    'rhu2_40F_to_45F_aux_duty_cycle',
    'rhu2_45F_to_50F_aux_duty_cycle',
    'rhu2_50F_to_55F_aux_duty_cycle',
    'rhu2_55F_to_60F_aux_duty_cycle',

    'rhu2_00F_to_05F_emg_duty_cycle',
    'rhu2_05F_to_10F_emg_duty_cycle',
    'rhu2_10F_to_15F_emg_duty_cycle',
    'rhu2_15F_to_20F_emg_duty_cycle',
    'rhu2_20F_to_25F_emg_duty_cycle',
    'rhu2_25F_to_30F_emg_duty_cycle',
    'rhu2_30F_to_35F_emg_duty_cycle',
    'rhu2_35F_to_40F_emg_duty_cycle',
    'rhu2_40F_to_45F_emg_duty_cycle',
    'rhu2_45F_to_50F_emg_duty_cycle',
    'rhu2_50F_to_55F_emg_duty_cycle',
    'rhu2_55F_to_60F_emg_duty_cycle',

    'rhu2_00F_to_05F_compressor_duty_cycle',
    'rhu2_05F_to_10F_compressor_duty_cycle',
    'rhu2_10F_to_15F_compressor_duty_cycle',
    'rhu2_15F_to_20F_compressor_duty_cycle',
    'rhu2_20F_to_25F_compressor_duty_cycle',
    'rhu2_25F_to_30F_compressor_duty_cycle',
    'rhu2_30F_to_35F_compressor_duty_cycle',
    'rhu2_35F_to_40F_compressor_duty_cycle',
    'rhu2_40F_to_45F_compressor_duty_cycle',
    'rhu2_45F_to_50F_compressor_duty_cycle',
    'rhu2_50F_to_55F_compressor_duty_cycle',
    'rhu2_55F_to_60F_compressor_duty_cycle',

    'rhu2_less10F',
    'rhu2_10F_to_20F',
    'rhu2_20F_to_30F',
    'rhu2_30F_to_40F',
    'rhu2_40F_to_50F',
    'rhu2_50F_to_60F',

    'rhu2_less10F_aux_duty_cycle',
    'rhu2_10F_to_20F_aux_duty_cycle',
    'rhu2_20F_to_30F_aux_duty_cycle',
    'rhu2_30F_to_40F_aux_duty_cycle',
    'rhu2_40F_to_50F_aux_duty_cycle',
    'rhu2_50F_to_60F_aux_duty_cycle',

    'rhu2_less10F_emg_duty_cycle',
    'rhu2_10F_to_20F_emg_duty_cycle',
    'rhu2_20F_to_30F_emg_duty_cycle',
    'rhu2_30F_to_40F_emg_duty_cycle',
    'rhu2_40F_to_50F_emg_duty_cycle',
    'rhu2_50F_to_60F_emg_duty_cycle',

    'rhu2_less10F_compressor_duty_cycle',
    'rhu2_10F_to_20F_compressor_duty_cycle',
    'rhu2_20F_to_30F_compressor_duty_cycle',
    'rhu2_30F_to_40F_compressor_duty_cycle',
    'rhu2_40F_to_50F_compressor_duty_cycle',
    'rhu2_50F_to_60F_compressor_duty_cycle',
]


[docs]def combine_output_dataframes(dfs):
    """ Combines output dataframes. Useful when combining output from batches.

    Parameters
    ----------
    dfs : list of pd.DataFrame
        Output dataFrames to combine into one.

    Returns
    -------
    out : pd.DataFrame
        Dataframe with combined output metadata.

    """
    return pd.concat(dfs, ignore_index=True)


[docs]def get_filtered_stats(
        df, row_filter, label, heating_or_cooling, target_columns,
        target_baseline_method):

    n_rows_total = df.shape[0]

    filtered_df = df[[row_filter(row, df) for i, row in df.iterrows()]]

    n_rows_kept = filtered_df.shape[0]
    n_rows_discarded = n_rows_total - n_rows_kept

    stats = OrderedDict()
    stats["label"] = "{}_{}".format(label, heating_or_cooling)
    stats["sw_version"] = get_version()
    stats["n_thermostat_core_day_sets_total"] = n_rows_total
    stats["n_thermostat_core_day_sets_kept"] = n_rows_kept
    stats["n_thermostat_core_day_sets_discarded"] = n_rows_discarded

    if n_rows_total > 0:

        for column_name in target_columns:
            column = filtered_df[column_name].replace([np.inf, -np.inf], np.nan).dropna()

            # calculate quantiles and statistics
            mean = np.nanmean(pd.to_numeric(column))

            if column.count() != 0:
                sem = np.nanstd(column) / (column.count() ** .5)
            else:
                sem = np.nan
            lower_bound = mean - (1.96 * sem)
            upper_bound = mean + (1.96 * sem)
            stats["{}_n".format(column_name)] = column.count()
            stats["{}_upper_bound_95_perc_conf".format(column_name)] = upper_bound
            stats["{}_mean".format(column_name)] = mean
            stats["{}_lower_bound_95_perc_conf".format(column_name)] = lower_bound
            stats["{}_sem".format(column_name)] = sem

            for quantile in QUANTILE:
                stats["{}_q{}".format(column_name, quantile)] = column.quantile(quantile / 100.)

            # Calculate IQR for RHU2 and filter outliers
            if 'rhu2' in column_name:
                iqr_filter = (column < column.quantile(UNFILTERED_PERCENTILE))
                if bool(iqr_filter.any()) is False:
                    iqr_filter = (column == column)
                    warn("RHU filtering 5% and min Runtime filtering removed entire dataset from statistics summary for bin. Disabling filter.")
                iqr_filtered_column = column.loc[iqr_filter]

                # calculate quantiles and statistics for RHU2 IQR (IQFLT) and
                # non-IQR filtering (NOIQ)
                iqr_mean = np.nanmean(pd.to_numeric(iqr_filtered_column))
                iqr_sem = np.nanstd(iqr_filtered_column) / (iqr_filtered_column.count() ** .5)
                iqr_lower_bound = iqr_mean - (1.96 * iqr_sem)
                iqr_upper_bound = iqr_mean + (1.96 * iqr_sem)

                noiq_mean = np.nanmean(pd.to_numeric(column))
                noiq_sem = np.nanstd(column) / (column.count() ** .5)
                noiq_lower_bound = noiq_mean - (1.96 * noiq_sem)
                noiq_upper_bound = noiq_mean + (1.96 * noiq_sem)

                stats["{}_n_IQFLT".format(column_name)] = iqr_filtered_column.count()
                stats["{}_upper_bound_95_perc_conf_IQFLT".format(column_name)] = iqr_upper_bound
                stats["{}_mean_IQFLT".format(column_name)] = iqr_mean
                stats["{}_lower_bound_95_perc_conf_IQFLT".format(column_name)] = iqr_lower_bound
                stats["{}_sem_IQFLT".format(column_name)] = iqr_sem
                stats["{}_n_NOIQ".format(column_name)] = column.count()
                stats["{}_upper_bound_95_perc_conf_NOIQ".format(column_name)] = noiq_upper_bound
                stats["{}_mean_NOIQ".format(column_name)] = noiq_mean
                stats["{}_lower_bound_95_perc_conf_NOIQ".format(column_name)] = noiq_lower_bound
                stats["{}_sem_NOIQ".format(column_name)] = noiq_sem

                for quantile in QUANTILE:
                    stats["{}_q{}_IQFLT".format(column_name, quantile)] = iqr_filtered_column.quantile(quantile / 100.)
                    stats["{}_q{}_NOIQ".format(column_name, quantile)] = column.quantile(quantile / 100.)

        return [stats]
    else:
        warn(
            "Not enough data to compute summary_statistics ({}_{})"
            .format(label, heating_or_cooling)
        )
        return []


[docs]def compute_summary_statistics(
        metrics_df,
        target_baseline_method="baseline_percentile",
        advanced_filtering=False):
    """ Computes summary statistics for the output dataframe. Computes the
    following statistics for each real-valued or integer valued column in
    the output dataframe: mean, standard error of the mean, and deciles.

    Parameters
    ----------
    df : pd.DataFrame
        Output for which to compute summary statistics.
    label : str
        Name for this set of thermostat outputs.
    target_baseline_method : {"baseline_percentile", "baseline_regional"}, default "baseline_percentile"
        Baselining method by which samples will be filtered according to bad fits.

    Returns
    -------
    stats : collections.OrderedDict
        An ordered dict containing the summary statistics. Column names are as
        follows, in which ### is a placeholder for the name of the column:

          - mean: ###_mean
          - standard error of the mean: ###_sem
          - 1st quantile:  ###_1q
          - 2.5th quantile:###_2.5q
          - 5th quantile:  ###_5q
          - 10th quantile: ###_10q
          - 15th quantile: ###_15q
          - 20th quantile: ###_20q
          - 25th quantile: ###_25q
          - 30th quantile: ###_30q
          - 35th quantile: ###_35q
          - 40th quantile: ###_40q
          - 45th quantile: ###_45q
          - 50th quantile: ###_50q
          - 55th quantile: ###_55q
          - 60th quantile: ###_60q
          - 65th quantile: ###_65q
          - 70th quantile: ###_70q
          - 75th quantile: ###_75q
          - 80th quantile: ###_80q
          - 85th quantile: ###_85q
          - 90th quantile: ###_90q
          - 95th quantile: ###_95q
          - 98th quantile: ###_98q
          - 99th quantile: ###_99q
          - number of non-null core day sets: ###_n

        The following general values are also output:

          - label: label
          - number of total core day sets: n_total_core_day_sets

    """

    if target_baseline_method not in ["baseline_percentile", "baseline_regional"]:
        message = (
            'Baseline method not supported - please use one of'
            ' "baseline_percentile" or "baseline_regional"'
        )
        raise ValueError(message)

    def _identity_filter(row, df):
        return True

    def _range_filter(row, column_name, heating_or_cooling, lower_bound=-np.inf, upper_bound=np.inf, target_baseline=False):
        if target_baseline:
            full_column_selector = "{}_{}".format(column_name, target_baseline_method)
        else:
            full_column_selector = column_name
        column_value = row[full_column_selector]
        return lower_bound < column_value < upper_bound

    def _percentile_range_filter(row, column_name, heating_or_cooling, df, quantile=0.0, target_baseline=False):
        if target_baseline:
            full_column_selector = "{}_{}".format(column_name, target_baseline_method)
        else:
            full_column_selector = column_name
        lower_bound = df[full_column_selector].dropna().quantile(0.0 + quantile)
        upper_bound = df[full_column_selector].dropna().quantile(1.0 - quantile)
        return _range_filter(row, column_name, heating_or_cooling, lower_bound, upper_bound, target_baseline)

    def _tau_filter_heating(row, df):
        return _range_filter(row, "tau", "heating", 0, 25)

    def _tau_filter_cooling(row, df):
        return _range_filter(row, "tau", "cooling", 0, 25)

    def _cvrmse_filter_heating(row, df):
        return _range_filter(row, "cv_root_mean_sq_err", "heating", upper_bound=0.6)

    def _cvrmse_filter_cooling(row, df):
        return _range_filter(row, "cv_root_mean_sq_err", "cooling", upper_bound=0.6)

    def _savings_filter_p01_heating(row, df):
        return _percentile_range_filter(row, "percent_savings", "heating", df, 0.01, True)

    def _savings_filter_p01_cooling(row, df):
        return _percentile_range_filter(row, "percent_savings", "cooling", df, 0.01, True)

    def _combine_filters(filters):
        def _new_filter(row, df):
            return reduce(lambda x, y: x and y(row, df), filters, True)
        return _new_filter

    def heating_stats(df, filter_, label):
        heating_df = df[["heating" in name for name in df["heating_or_cooling"]]]
        return get_filtered_stats(
            heating_df, filter_, label,
            "heating", REAL_OR_INTEGER_VALUED_COLUMNS_HEATING,
            target_baseline_method)

    def cooling_stats(df, filter_, label):
        cooling_df = df[["cooling" in name for name in df["heating_or_cooling"]]]
        return get_filtered_stats(
            cooling_df, filter_, label,
            "cooling", REAL_OR_INTEGER_VALUED_COLUMNS_COOLING,
            target_baseline_method)

    very_cold_cold_df = metrics_df[[
        (cz is not None) and "Very-Cold/Cold" in cz
        for cz in metrics_df["climate_zone"]
    ]]
    mixed_humid_df = metrics_df[[
        (cz is not None) and "Mixed-Humid" in cz
        for cz in metrics_df["climate_zone"]
    ]]
    mixed_dry_hot_dry_df = metrics_df[[
        (cz is not None) and "Mixed-Dry/Hot-Dry" in cz
        for cz in metrics_df["climate_zone"]
    ]]
    hot_humid_df = metrics_df[[
        (cz is not None) and "Hot-Humid" in cz
        for cz in metrics_df["climate_zone"]
    ]]
    marine_df = metrics_df[[
        (cz is not None) and "Marine" in cz
        for cz in metrics_df["climate_zone"]
    ]]

    filter_0 = _identity_filter
    filter_1_heating = _combine_filters([_tau_filter_heating])
    filter_1_cooling = _combine_filters([_tau_filter_cooling])
    filter_2_heating = _combine_filters([_tau_filter_heating, _cvrmse_filter_heating])
    filter_2_cooling = _combine_filters([_tau_filter_cooling, _cvrmse_filter_cooling])
    filter_3_heating = _combine_filters([_tau_filter_heating, _cvrmse_filter_heating, _savings_filter_p01_heating])
    filter_3_cooling = _combine_filters([_tau_filter_cooling, _cvrmse_filter_cooling, _savings_filter_p01_cooling])

    if advanced_filtering:
        stats = list(chain.from_iterable([
            heating_stats(metrics_df, filter_0, "all_no_filter"),
            cooling_stats(metrics_df, filter_0, "all_no_filter"),
            heating_stats(very_cold_cold_df, filter_0, "very-cold_cold_no_filter"),
            cooling_stats(very_cold_cold_df, filter_0, "very-cold_cold_no_filter"),
            heating_stats(mixed_humid_df, filter_0, "mixed-humid_no_filter"),
            cooling_stats(mixed_humid_df, filter_0, "mixed-humid_no_filter"),
            heating_stats(mixed_dry_hot_dry_df, filter_0, "mixed-dry_hot-dry_no_filter"),
            cooling_stats(mixed_dry_hot_dry_df, filter_0, "mixed-dry_hot-dry_no_filter"),
            heating_stats(hot_humid_df, filter_0, "hot-humid_no_filter"),
            cooling_stats(hot_humid_df, filter_0, "hot-humid_no_filter"),
            heating_stats(marine_df, filter_0, "marine_no_filter"),
            cooling_stats(marine_df, filter_0, "marine_no_filter"),

            heating_stats(metrics_df, filter_1_heating, "all_tau_filter"),
            cooling_stats(metrics_df, filter_1_cooling, "all_tau_filter"),
            heating_stats(very_cold_cold_df, filter_1_heating, "very-cold_cold_tau_filter"),
            cooling_stats(very_cold_cold_df, filter_1_cooling, "very-cold_cold_tau_filter"),
            heating_stats(mixed_humid_df, filter_1_heating, "mixed-humid_tau_filter"),
            cooling_stats(mixed_humid_df, filter_1_cooling, "mixed-humid_tau_filter"),
            heating_stats(mixed_dry_hot_dry_df, filter_1_heating, "mixed-dry_hot-dry_tau_filter"),
            cooling_stats(mixed_dry_hot_dry_df, filter_1_cooling, "mixed-dry_hot-dry_tau_filter"),
            heating_stats(hot_humid_df, filter_1_heating, "hot-humid_tau_filter"),
            cooling_stats(hot_humid_df, filter_1_cooling, "hot-humid_tau_filter"),
            heating_stats(marine_df, filter_1_heating, "marine_tau_filter"),
            cooling_stats(marine_df, filter_1_cooling, "marine_tau_filter"),

            heating_stats(metrics_df, filter_2_heating, "all_tau_cvrmse_filter"),
            cooling_stats(metrics_df, filter_2_cooling, "all_tau_cvrmse_filter"),
            heating_stats(very_cold_cold_df, filter_2_heating, "very-cold_cold_tau_cvrmse_filter"),
            cooling_stats(very_cold_cold_df, filter_2_cooling, "very-cold_cold_tau_cvrmse_filter"),
            heating_stats(mixed_humid_df, filter_2_heating, "mixed-humid_tau_cvrmse_filter"),
            cooling_stats(mixed_humid_df, filter_2_cooling, "mixed-humid_tau_cvrmse_filter"),
            heating_stats(mixed_dry_hot_dry_df, filter_2_heating, "mixed-dry_hot-dry_tau_cvrmse_filter"),
            cooling_stats(mixed_dry_hot_dry_df, filter_2_cooling, "mixed-dry_hot-dry_tau_cvrmse_filter"),
            heating_stats(hot_humid_df, filter_2_heating, "hot-humid_tau_cvrmse_filter"),
            cooling_stats(hot_humid_df, filter_2_cooling, "hot-humid_tau_cvrmse_filter"),
            heating_stats(marine_df, filter_2_heating, "marine_tau_cvrmse_filter"),
            cooling_stats(marine_df, filter_2_cooling, "marine_tau_cvrmse_filter"),

            heating_stats(metrics_df, filter_3_heating, "all_tau_cvrmse_savings_p01_filter"),
            cooling_stats(metrics_df, filter_3_cooling, "all_tau_cvrmse_savings_p01_filter"),
            heating_stats(very_cold_cold_df, filter_3_heating, "very-cold_cold_tau_cvrmse_savings_p01_filter"),
            cooling_stats(very_cold_cold_df, filter_3_cooling, "very-cold_cold_tau_cvrmse_savings_p01_filter"),
            heating_stats(mixed_humid_df, filter_3_heating, "mixed-humid_tau_cvrmse_savings_p01_filter"),
            cooling_stats(mixed_humid_df, filter_3_cooling, "mixed-humid_tau_cvrmse_savings_p01_filter"),
            heating_stats(mixed_dry_hot_dry_df, filter_3_heating, "mixed-dry_hot-dry_tau_cvrmse_savings_p01_filter"),
            cooling_stats(mixed_dry_hot_dry_df, filter_3_cooling, "mixed-dry_hot-dry_tau_cvrmse_savings_p01_filter"),
            heating_stats(hot_humid_df, filter_3_heating, "hot-humid_tau_cvrmse_savings_p01_filter"),
            cooling_stats(hot_humid_df, filter_3_cooling, "hot-humid_tau_cvrmse_savings_p01_filter"),
            heating_stats(marine_df, filter_3_heating, "marine_tau_cvrmse_savings_p01_filter"),
            cooling_stats(marine_df, filter_3_cooling, "marine_tau_cvrmse_savings_p01_filter"),
        ]))
    else:
        stats = list(chain.from_iterable([
            heating_stats(metrics_df, filter_0, "all_no_filter"),
            cooling_stats(metrics_df, filter_0, "all_no_filter"),
            heating_stats(very_cold_cold_df, filter_0, "very-cold_cold_no_filter"),
            cooling_stats(very_cold_cold_df, filter_0, "very-cold_cold_no_filter"),
            heating_stats(mixed_humid_df, filter_0, "mixed-humid_no_filter"),
            cooling_stats(mixed_humid_df, filter_0, "mixed-humid_no_filter"),
            heating_stats(mixed_dry_hot_dry_df, filter_0, "mixed-dry_hot-dry_no_filter"),
            cooling_stats(mixed_dry_hot_dry_df, filter_0, "mixed-dry_hot-dry_no_filter"),
            heating_stats(hot_humid_df, filter_0, "hot-humid_no_filter"),
            cooling_stats(hot_humid_df, filter_0, "hot-humid_no_filter"),
            heating_stats(marine_df, filter_0, "marine_no_filter"),
            cooling_stats(marine_df, filter_0, "marine_no_filter"),

            heating_stats(metrics_df, filter_3_heating, "all_tau_cvrmse_savings_p01_filter"),
            cooling_stats(metrics_df, filter_3_cooling, "all_tau_cvrmse_savings_p01_filter"),
            heating_stats(very_cold_cold_df, filter_3_heating, "very-cold_cold_tau_cvrmse_savings_p01_filter"),
            cooling_stats(very_cold_cold_df, filter_3_cooling, "very-cold_cold_tau_cvrmse_savings_p01_filter"),
            heating_stats(mixed_humid_df, filter_3_heating, "mixed-humid_tau_cvrmse_savings_p01_filter"),
            cooling_stats(mixed_humid_df, filter_3_cooling, "mixed-humid_tau_cvrmse_savings_p01_filter"),
            heating_stats(mixed_dry_hot_dry_df, filter_3_heating, "mixed-dry_hot-dry_tau_cvrmse_savings_p01_filter"),
            cooling_stats(mixed_dry_hot_dry_df, filter_3_cooling, "mixed-dry_hot-dry_tau_cvrmse_savings_p01_filter"),
            heating_stats(hot_humid_df, filter_3_heating, "hot-humid_tau_cvrmse_savings_p01_filter"),
            cooling_stats(hot_humid_df, filter_3_cooling, "hot-humid_tau_cvrmse_savings_p01_filter"),
            heating_stats(marine_df, filter_3_heating, "marine_tau_cvrmse_savings_p01_filter"),
            cooling_stats(marine_df, filter_3_cooling, "marine_tau_cvrmse_savings_p01_filter"),
        ]))

    stats_dict = {stat["label"]: stat for stat in stats}

    def _load_climate_zone_weights(filename_or_buffer):
        climate_zone_keys = {
            "Very-Cold/Cold": "very-cold_cold",
            "Mixed-Humid": "mixed-humid",
            "Mixed-Dry/Hot-Dry": "mixed-dry_hot-dry",
            "Hot-Humid": "hot-humid",
            "Marine": "marine",
        }
        df = pd.read_csv(
            filename_or_buffer,
            usecols=["climate_zone", "heating_weight", "cooling_weight"],
        ).set_index("climate_zone")

        heating_weights = {climate_zone_keys[cz]: weight for cz, weight in df["heating_weight"].iteritems()}
        cooling_weights = {climate_zone_keys[cz]: weight for cz, weight in df["cooling_weight"].iteritems()}

        return heating_weights, cooling_weights

    with resource_stream('thermostat.resources', 'NationalAverageClimateZoneWeightings.csv') as f:
        heating_weights, cooling_weights = _load_climate_zone_weights(f)

    def _compute_national_weightings(stats_by_climate_zone, keys, weights):
        def _national_weight(key):
            results = []
            for cz, weight in weights.items():
                stat_cz = stats_by_climate_zone.get(cz)
                if stat_cz is None:
                    value = None
                else:
                    value = stat_cz.get(key)
                if pd.notnull(weight) and pd.notnull(value):
                    results.append((weight, value))
            if len(results) == 0:
                return None
            else:
                weighted_sum = sum([weight * value for weight, value in results])
                sum_of_weights = sum([weight for weight, _ in results])
                return weighted_sum / sum_of_weights

        stats = [
            "mean",
            "q1",
            "q2.5",
            "q5",
            "q10",
            "q15",
            "q20",
            "q25",
            "q30",
            "q35",
            "q40",
            "q45",
            "q50",
            "q55",
            "q60",
            "q65",
            "q70",
            "q75",
            "q80",
            "q85",
            "q90",
            "q95",
            "q98",
            "q99",
        ]

        key_stats = [
            "{}_{}".format(key, stat)
            for key in keys for stat in stats
        ]

        return {
            "{}_{}".format(key_stat, "national_weighted_mean"): _national_weight(key_stat)
            for key_stat in key_stats
        }

    def _compute_national_weighting_lower_and_upper_bounds(
            stats_by_climate_zone, keys, weights):

        def _compute_bounds(key):

            # compute sem savings
            means, sems, weights_ = [], [], []
            for cz, weight in weights.items():
                stat_cz = stats_by_climate_zone.get(cz)
                if stat_cz is None:
                    mean, sem = None, None
                else:
                    mean = stat_cz.get("{}_mean".format(key), None)
                    sem = stat_cz.get("{}_sem".format(key), None)

                if pd.notnull(weight) and pd.notnull(mean) and pd.notnull(sem):
                    weights_.append(weight)
                    means.append(mean)
                    sems.append(sem)

            if len(weights_) == 0:
                return {}
            else:
                weighted_sum = sum([
                    weight * mean for weight, mean in zip(weights_, means)
                ])
                weighted_mean = weighted_sum / sum(weights_)  # renormalize

                weighted_sem = sum([
                    (weight*sem) ** 2 for weight, sem in zip(weights_, sems)
                ]) ** 0.5

                lower_bound = weighted_mean - (1.96 * weighted_sem)
                upper_bound = weighted_mean + (1.96 * weighted_sem)

                return {
                    "{}_lower_bound_95_perc_conf_national_weighted_mean".format(key): lower_bound,
                    "{}_upper_bound_95_perc_conf_national_weighted_mean".format(key): upper_bound
                }

        items = {}
        for key in keys:
            items.update(_compute_bounds(key))
        return items

    national_weighting_stats = []

    if advanced_filtering:
        filters = [
            "no_filter",
            "tau_filter",
            "tau_cvrmse_filter",
            "tau_cvrmse_savings_p01_filter",
        ]
    else:
        filters = [
            "no_filter",
            "tau_cvrmse_savings_p01_filter",
        ]

    climate_zones = [
        "mixed-humid",
        "mixed-dry_hot-dry",
        "marine",
        "hot-humid",
        "very-cold_cold"
    ]
    methods = [
        "baseline_percentile",
        "baseline_regional",
    ]
    for season_type in ["heating", "cooling"]:
        if season_type == "heating":
            weights = heating_weights
        else:
            weights = cooling_weights

        for filter_ in filters:
            stats_by_climate_zone = {
                cz: stats_dict.get("{}_{}_{}".format(cz, filter_, season_type))
                for cz in climate_zones
            }

            keys = ["percent_savings_{}".format(method) for method in methods]

            national_weightings = _compute_national_weightings(
                stats_by_climate_zone, keys, weights)

            bounds = _compute_national_weighting_lower_and_upper_bounds(
                stats_by_climate_zone, keys, weights)
            national_weightings.update(bounds)

            national_weightings.update(
                {"label": "national_weighted_mean_{}_{}".format(season_type, filter_)}
            )

            national_weighting_stats.append(national_weightings)

    stats = national_weighting_stats + stats
    return stats


[docs]def summary_statistics_to_csv(stats, filepath, product_id):
    """ Write metric statistics to CSV file.

    Parameters
    ----------
    stats : list of dict
        List of outputs from thermostat.stats.compute_summary_statistics()
    filepath : str
        Filepath at which to save the suppary statistics
    product_id : str
        A combination of the connected thermostat service plus one or more
        connected thermostat device models that comprises the data set.

    Returns
    -------
    df : pandas.DataFrame
        A pandas dataframe containing the output data.

    """

    columns = [
        "label",
        "product_id",
        "sw_version",
    ]

    methods = [
        "baseline_percentile",
        "baseline_regional",
    ]

    national_weighting_columns = list(chain.from_iterable([
        [
            "percent_savings_{}_mean_national_weighted_mean".format(method),
            "percent_savings_{}_q1_national_weighted_mean".format(method),
            "percent_savings_{}_q2.5_national_weighted_mean".format(method),
            "percent_savings_{}_q5_national_weighted_mean".format(method),
            "percent_savings_{}_q10_national_weighted_mean".format(method),
            "percent_savings_{}_q15_national_weighted_mean".format(method),
            "percent_savings_{}_q20_national_weighted_mean".format(method),
            "percent_savings_{}_q25_national_weighted_mean".format(method),
            "percent_savings_{}_q30_national_weighted_mean".format(method),
            "percent_savings_{}_q35_national_weighted_mean".format(method),
            "percent_savings_{}_q40_national_weighted_mean".format(method),
            "percent_savings_{}_q45_national_weighted_mean".format(method),
            "percent_savings_{}_q50_national_weighted_mean".format(method),
            "percent_savings_{}_q55_national_weighted_mean".format(method),
            "percent_savings_{}_q60_national_weighted_mean".format(method),
            "percent_savings_{}_q65_national_weighted_mean".format(method),
            "percent_savings_{}_q70_national_weighted_mean".format(method),
            "percent_savings_{}_q75_national_weighted_mean".format(method),
            "percent_savings_{}_q80_national_weighted_mean".format(method),
            "percent_savings_{}_q85_national_weighted_mean".format(method),
            "percent_savings_{}_q90_national_weighted_mean".format(method),
            "percent_savings_{}_q95_national_weighted_mean".format(method),
            "percent_savings_{}_q98_national_weighted_mean".format(method),
            "percent_savings_{}_q99_national_weighted_mean".format(method),
            "percent_savings_{}_lower_bound_95_perc_conf_national_weighted_mean".format(method),
            "percent_savings_{}_upper_bound_95_perc_conf_national_weighted_mean".format(method),
        ] for method in methods
    ]))

    columns.extend(national_weighting_columns)

    columns.extend([
        "n_thermostat_core_day_sets_total",
        "n_thermostat_core_day_sets_kept",
        "n_thermostat_core_day_sets_discarded",
    ])
    for column_name in REAL_OR_INTEGER_VALUED_COLUMNS_ALL:
        columns.append("{}_n".format(column_name))
        columns.append("{}_upper_bound_95_perc_conf".format(column_name))
        columns.append("{}_mean".format(column_name))
        columns.append("{}_lower_bound_95_perc_conf".format(column_name))
        columns.append("{}_sem".format(column_name))
        for quantile in QUANTILE:
            columns.append("{}_q{}".format(column_name, quantile))

    for rhu_column in [column for column in columns if 'rhu2' in column]:
        columns.append(rhu_column + '_IQFLT')

    for rhu_column in [column for column in columns if 'IQFLT' in column]:
        columns.append(rhu_column.replace('IQFLT', 'NOIQ'))

    # add product_id
    for row in stats:
        row["product_id"] = product_id

    # transpose for readability.
    stats_dataframe = pd.DataFrame(stats, columns=columns).set_index('label').transpose()
    stats_dataframe.to_csv(filepath)
    return stats_dataframe
Source code for thermostat.stats

thermostat

Navigation

Related Topics