import logging
import numpy as np
import pandas as pd
logging.getLogger(__name__)
__author__ = "Rayid Ghani, Pedro Saleiro <saleiro@uchicago.edu>, Loren Hinkson"
__copyright__ = "Copyright \xa9 2018. The University of Chicago. All Rights Reserved."
class Fairness(object):
"""
"""
def __init__(self, fair_eval=None, tau=None, fair_measures_depend=None,
type_parity_depend=None, high_level_fairness_depend=None):
"""
:param fair_eval: a lambda function that is used to assess fairness
(e.g. 80% rule)
:param tau: the threshold for fair/unfair evaluation
:param fair_measures_depend: a dictionary containing fairness measures
as keys and the corresponding input bias disparity as values
:param type_parity_depend: a dictionary with Type I, Type II, and
Equalized Odds fairness measures as keys and lists of their
underlying bias metric parities as values
:param high_level_fairness_depend: a dictionary with supervised and
unsupervised fairness as keys and lists of their underlying metric
parities as values.
"""
        if not fair_eval:
            self.fair_eval = lambda tau: lambda x: np.nan if np.isnan(x) else \
                (True if tau <= x <= 1 / tau else False)
        else:
            self.fair_eval = fair_eval
        # tau is the fairness threshold and should be a real number in the interval (0.0, 1.0]
if not tau:
self.tau = 0.8
else:
self.tau = tau
# Set high-level fairness evaluation to NA (undefined) if both
# underlying parity determinations are NA. If only one parity is NA,
# evaluation is determined by the defined parity.
        self.high_level_pair_eval = lambda col1, col2: lambda x: np.nan \
            if (np.isnan(x[col1]) and np.isnan(x[col2])) \
            else (x[col1] is True and x[col2] is True)
        self.high_level_single_eval = lambda col: lambda x: np.nan \
            if np.isnan(x[col]) else (x[col] is True)
        # fair_measures_depend defines the bias disparity metric that serves as input to each
        # fairness measure. These are the fairness measures supported by the current version of aequitas.
if not fair_measures_depend:
self.fair_measures_depend = {'Statistical Parity': 'ppr_disparity',
'Impact Parity': 'pprev_disparity',
'FDR Parity': 'fdr_disparity',
'FPR Parity': 'fpr_disparity',
'FOR Parity': 'for_disparity',
'FNR Parity': 'fnr_disparity',
'TPR Parity': 'tpr_disparity',
'TNR Parity': 'tnr_disparity',
'NPV Parity': 'npv_disparity',
'Precision Parity': 'precision_disparity'
}
else:
self.fair_measures_depend = fair_measures_depend
        # fair_measures_supported is the list of fairness measures calculated by default
self.fair_measures_supported = self.fair_measures_depend.keys()
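        # type_parity_depend defines which underlying parity measures feed each composite
        # measure (Type I, Type II, Equalized Odds).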
if not type_parity_depend:
self.type_parity_depend = {'TypeI Parity': ['FDR Parity', 'FPR Parity'],
'TypeII Parity': ['FOR Parity', 'FNR Parity'],
'Equalized Odds': ['FPR Parity', 'TPR Parity']}
else:
self.type_parity_depend = type_parity_depend
        # high_level_fairness_depend defines which input fairness measures are used to calculate the high-level ones
if not high_level_fairness_depend:
self.high_level_fairness_depend = {
'Unsupervised Fairness': ['Statistical Parity', 'Impact Parity'],
'Supervised Fairness': ['TypeI Parity', 'TypeII Parity']
}
else:
self.high_level_fairness_depend = high_level_fairness_depend
    def get_fairness_measures_supported(self, input_df):
"""
Determine fairness measures supported based on columns in data frame.
"""
if 'label_value' not in input_df.columns:
self.fair_measures_supported = ['Statistical Parity', 'Impact Parity']
return self.fair_measures_supported
    def get_group_value_fairness(self, bias_df, tau=None, fair_measures_requested=None):
"""
        Calculates the fairness measures defined in the fair_measures_requested
        list and adds them as columns to the input bias_df.
        :param bias_df: the output dataframe of the bias/disparity calculation methods.
        :param tau: optional, the threshold for the fair/unfair evaluation.
        :param fair_measures_requested: optional, a list of the fairness measures
            to evaluate (keys of fair_measures_depend).
        :return: the bias_df dataframe with an additional column for each
            of the fairness measures defined in the fair_measures dictionary.
"""
logging.info('get_group_value_fairness...')
if not tau:
tau = self.tau
if not fair_measures_requested:
fair_measures_requested = self.fair_measures_supported
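        # Evaluate each requested parity measure by applying fair_eval(tau) to its disparity
        # column (e.g. 'FDR Parity' is computed from the 'fdr_disparity' column).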
for fair, input in self.fair_measures_depend.items():
if fair in fair_measures_requested:
bias_df[fair] = bias_df[input].apply(self.fair_eval(tau))
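        # Composite parities (Type I, Type II, Equalized Odds) are True only when both underlying
        # parities are True; if only one of the two input columns exists, the determination
        # falls back to that single parity.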
for fair, input in self.type_parity_depend.items():
if input[0] in bias_df.columns:
if input[1] in bias_df.columns:
bias_df[fair] = bias_df.apply(self.high_level_pair_eval(input[0], input[1]), axis=1)
else:
bias_df[fair] = bias_df.apply(self.high_level_single_eval(input[0]), axis=1)
elif input[1] in bias_df.columns:
bias_df[fair] = bias_df.apply(self.high_level_single_eval(input[1]), axis=1)
else:
                logging.info('get_group_value_fairness: No Parity measure input found on bias_df')
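        # High-level measures (Unsupervised/Supervised Fairness) aggregate the parity columns
        # computed above in the same way: NaN when all inputs are NaN, otherwise True only
        # if every available input is True.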
for fair, input in self.high_level_fairness_depend.items():
if input[0] in bias_df.columns:
if input[1] in bias_df.columns:
bias_df[fair] = bias_df.apply(self.high_level_pair_eval(input[0], input[1]), axis=1)
else:
bias_df[fair] = bias_df.apply(self.high_level_single_eval(input[0]), axis=1)
elif input[1] in bias_df.columns:
bias_df[fair] = bias_df.apply(self.high_level_single_eval(input[1]), axis=1)
if 'Unsupervised Fairness' not in bias_df.columns and 'Supervised Fairness' not in bias_df.columns:
            logging.info('get_group_value_fairness: No high-level measure input found on bias_df')
return bias_df
def _fill_groupby_attribute_fairness(self, groupby_obj, key_columns,
group_attribute_df, measures):
"""
        Returns a dataframe with one row per (model_id, score_threshold, attribute_name)
        group and one column per requested measure; a group's value is False whenever
        any of its rows is False.
"""
logging.info('fill_groupby_attribute_fairness')
for key in measures:
rows = []
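            # For each group, keep a single representative row: if every value for this measure
            # is NaN, keep the first row; otherwise pick the row with the minimum boolean value
            # (astype(bool).idxmin()), so any False within the group wins.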
            for group, group_df in groupby_obj:
if group_df[key].isnull().all():
row = group_df.iloc[0][key_columns + [key]]
else:
group_df = group_df[group_df[key].notnull()][key_columns + [key]]
row = group_df.loc[group_df[key].astype(bool).idxmin()]
rows.append(row)
key_df = pd.DataFrame(rows)
if group_attribute_df.empty:
group_attribute_df = key_df
else:
group_attribute_df = group_attribute_df.merge(key_df, on=key_columns)
return group_attribute_df
    def get_group_attribute_fairness(self, group_value_df, fair_measures_requested=None):
"""
        Aggregates each fairness measure in fair_measures_requested to the
        attribute level: if any attribute_value of a given attribute_name has a
        'False' determination, the attribute-level determination is False for
        that fairness measure.
        :param group_value_df: the output dataframe of the get_group_value_fairness() method.
        :param fair_measures_requested: optional, a list of the fairness measures to aggregate.
        :return: a dataframe of fairness measures at the attribute level (no attribute_values).
"""
logging.info('get_group_attribute_fairness')
if not fair_measures_requested:
fair_measures_requested = self.fair_measures_supported
group_attribute_df = pd.DataFrame()
key_columns = ['model_id', 'score_threshold', 'attribute_name']
groupby_variable = group_value_df.groupby(key_columns)
        # We need the helper below because of NaNs: pandas idxmin() raises a KeyError if there is a NaN.
group_attribute_df = self._fill_groupby_attribute_fairness(groupby_variable, key_columns, group_attribute_df,
fair_measures_requested)
if group_attribute_df.empty:
raise Exception('get_group_attribute_fairness: no fairness measures requested found on input group_value_df columns')
parity_cols = [col for col in self.type_parity_depend if col in group_value_df.columns]
group_attribute_df = self._fill_groupby_attribute_fairness(groupby_variable, key_columns, group_attribute_df, parity_cols)
highlevel_cols = [col for col in self.high_level_fairness_depend if col in group_value_df.columns]
group_attribute_df = self._fill_groupby_attribute_fairness(groupby_variable, key_columns, group_attribute_df,
highlevel_cols)
return group_attribute_df
    def get_overall_fairness(self, group_attribute_df):
"""
Calculates overall fairness regardless of the group_attributes.
Searches for 'False' parity determinations across group_attributes and
outputs 'True' determination if all group_attributes are fair.
        :param group_attribute_df: the output df of the get_group_attribute_fairness() method
:return: A dictionary of overall, unsupervised, and supervised fairness determinations
"""
overall_fairness = {}
if 'Unsupervised Fairness' in group_attribute_df.columns:
overall_fairness['Unsupervised Fairness'] = False if \
group_attribute_df['Unsupervised Fairness'].min() == False else True
if 'Supervised Fairness' in group_attribute_df.columns:
            overall_fairness['Supervised Fairness'] = False if \
                group_attribute_df['Supervised Fairness'].min() == False else True
        fair_vals = list(overall_fairness.values())
if False in fair_vals:
overall_fairness['Overall Fairness'] = False
elif True in fair_vals:
overall_fairness['Overall Fairness'] = True
else:
overall_fairness['Overall Fairness'] = 'Undefined'
return overall_fairness
    def list_parities(self, df):
"""
View list of all parity determinations in df
"""
all_fairness = self.type_parity_depend.keys() | \
self.high_level_fairness_depend.keys() | \
self.fair_measures_depend.keys()
return list(all_fairness & set(df.columns))
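

if __name__ == "__main__":
    # Minimal, illustrative usage sketch (not part of the library API). The dataframe
    # below is hand-made and only carries the two disparity columns needed for
    # 'Statistical Parity' and 'Impact Parity'; the column names follow
    # fair_measures_depend and the key columns expected by get_group_attribute_fairness,
    # while all values are made up for demonstration.
    example_bias_df = pd.DataFrame({
        'model_id': [1, 1],
        'score_threshold': ['binary 0/1', 'binary 0/1'],
        'attribute_name': ['race', 'race'],
        'attribute_value': ['ref_group', 'other_group'],
        'ppr_disparity': [1.0, 0.55],
        'pprev_disparity': [1.0, 0.92],
    })
    requested = ['Statistical Parity', 'Impact Parity']
    f = Fairness()
    group_value_df = f.get_group_value_fairness(example_bias_df, fair_measures_requested=requested)
    group_attribute_df = f.get_group_attribute_fairness(group_value_df, fair_measures_requested=requested)
    # The 0.55 ppr_disparity fails the default 80% rule, so the overall determination is False.
    print(f.get_overall_fairness(group_attribute_df))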