Source code for corradjust.metrics

import sys
import os
import io
import parallel_sort

import numpy as np

from datetime import datetime
from numba import njit, prange
from scipy.stats import false_discovery_control

from corradjust.utils import *



[docs]
class CorrScorer:
    """
    Class that contains data structures
    for features and reference feature sets, and computes
    different scores given feature-feature correlations.
    See documentation of CorrAdjust class constructor
    for detailed description of constructor parameters (they are the same).

    Parameters
    ----------
    df_feature_ann : pandas.DataFrame with index and 2 columns
        A data frame providing annotation of features.
    ref_feature_colls : dict
        A dictionary with information about reference feature set collections.
    shuffle_feature_names : bool or list, optional, default=False
        Whether (and how) to permute feature names.
    metric : {"enrichment-based", "BP@K"}, optional, default="enrichment-based"
        Metric for evaluating feature correlations.
    min_pairs_to_score : int, optional, default=1000
        Minimum number of total pairs containing a feature
        to include the feature in the enrichment score computations.
    random_seed : int, optional, default=17
        Random seed used by shuffling procedure.
    verbose : bool, optional, default=True
        Whether to print progress messages to stderr.
    
    Attributes
    ----------
    data : dict
        The main data structure holding information about features
        and feature sets. Populated by calling `compute_index` method.
        Keys of `data` are names of reference
        feature set collections, and values are dicts with the following
        structure:

        | {
        |     ``"sign"`` : ``{"positive", "negative", "absolute"}``
        |         Taken directly from the input data.
        |     ``"mask"`` : `numpy.array`
        |         Array that says whether each feature pair
                  belongs to at least one shared reference set.
                  Feature pairs are packed into a flat array
                  with ``n_features * (n_features - 1) // 2`` elements (see `numpy.triu_indices`).
                  There are 3 possible values:
        |             ``mask == 1``: features belong to the same reference set.
        |             ``mask == 0``: features do not belong to the same reference set,
                      though they form allowed pair type and both are present in some reference sets.
        |             ``mask == -1``: otherwise.
        |     ``"train_val_mask"`` : `numpy.array`
        |         Array that says whether each feature pair
                  belongs to training or validation set.
                  Same order of pairs as in ``mask``.
                  There are 3 possible values:
        |             ``train_val_mask == 0``: pair belongs to training set.
        |             ``train_val_mask == 1``: pair belongs to validation set.
        |             ``train_val_mask == -1``: all other pairs
        |     ``"features_total_pos"`` : `numpy.ndarray`
        |         2D array of shape ``(3, n_features)``. The first index
                  refers to feature pair type: ``0`` is ``training``,
                  ``1`` is ``validation``, and ``2`` is ``all``.
                  The second index stands for feature index,
                  and the array value says how many reference pairs
                  the feature has. These are ``n_j`` in the paper's terminology.
        |     ``"features_total_neg"`` : `numpy.ndarray`
        |         Same as ``"features_total_pos"`` but for
                  non-reference pairs. These are ``N_j - n_j`` in the paper's terminology.
        |     ``"scoring_features_mask"`` : `numpy.array`
        |         Binary array with ``n_features`` elements that says
                  whether each feature should participate in the computation of
                  enrichment scores. The value is determined based on
                  `min_pairs_to_score` parameter.
        |     ``"high_corr_frac"`` : `float`
        |         Taken directly from the input data.
        |     ``"high_corr_pairs_all"`` : `int`
        |         Absolute number of highly ranked feature pairs (all pairs).
        |     ``"high_corr_pairs_training"`` : `int`
        |         Absolute number of highly ranked feature pairs (training pairs).
        |     ``"high_corr_pairs_validation"`` : `int`
        |         Absolute number of highly ranked feature pairs (validation pairs).
        | }
        
        feature_ids, feature_names, feature_types : np.array
            Arrays of feature IDs, names, and types.
            Populated by `compute_index` method.
        
        metric : {"enrichment-based", "BP@K"}
            Metric for evaluating feature correlations.
            Populated by constructor.
        
        verbose : bool
            Whether to print progress messages.
            Populated by constructor.
    """

    def __init__(
        self,
        df_feature_ann,
        ref_feature_colls,
        shuffle_feature_names=False,
        metric="enrichment-based",
        min_pairs_to_score=1000,
        random_seed=17,
        verbose=True
    ):
        # Do some input checks
        assert (
            len(df_feature_ann) == len(set(df_feature_ann.index))
        ), "Feature annotation data shouldn't contain duplicate feature IDs."

        assert (
            set(df_feature_ann.columns) == {"feature_name", "feature_type"}
        ), "Columns of feature annotation data should be 'feature_name' and 'feature_type'."

        for feature_type in df_feature_ann["feature_type"].unique():
            assert (
                "-" not in feature_type
            ), "Feature types should not contain '-' character."
        
        assert (
            metric in {"enrichment-based", "BP@K"}
        ), f"Metric should be either 'enrichment-based' or 'BP@K', but '{metric}' was found"
        
        assert isinstance(min_pairs_to_score, int), "min_pairs_to_score must be int."
        assert isinstance(random_seed, int), "random_seed must be int."
        assert verbose in {True, False}, "verbose must be True or False."

        self.metric = metric
        self.verbose = verbose

        if shuffle_feature_names is not False:
            # We shuffle feature names within each feature type
            # This way we fully preserve structure of reference sets,
            # but destroy association between feature name and data
            np.random.seed(random_seed)

            if shuffle_feature_names is True:
                feature_types_to_shuffle = df_feature_ann["feature_type"].unique()
            else:
                feature_types_to_shuffle = shuffle_feature_names
            
            for feature_type in feature_types_to_shuffle:
                assert (
                    feature_type in set(df_feature_ann["feature_type"])
                ), f"{feature_type} is present in shuffle_feature_names but not in the feature annotation data."

                type_mask = df_feature_ann["feature_type"] == feature_type
                shuffled_feature_names = df_feature_ann.loc[type_mask, "feature_name"].sample(n=type_mask.sum()).to_list()
                df_feature_ann.loc[type_mask, "feature_name"] = shuffled_feature_names

        self.feature_ids = np.array(df_feature_ann.index.to_list())
        self.feature_names = np.array(df_feature_ann["feature_name"].to_list())
        self.feature_types = np.array(df_feature_ann["feature_type"].to_list())

        # Make dicts to quickly convert between feature *names* and feature indices
        # One name can have many indices, but one index have only one name
        self.feature_name_to_idxs = {}
        self.feature_idx_to_name, self.feature_idx_to_type = [], []
        for feature_idx, feature_id in enumerate(self.feature_ids):
            feature_name = df_feature_ann.at[feature_id, "feature_name"]
            feature_type = df_feature_ann.at[feature_id, "feature_type"]
            self.feature_name_to_idxs[feature_name] = self.feature_name_to_idxs.get(feature_name, []) + [feature_idx]
            self.feature_idx_to_name.append(feature_name)
            self.feature_idx_to_type.append(feature_type)
        self.feature_idx_to_name = np.array(self.feature_idx_to_name)
        self.feature_idx_to_type = np.array(self.feature_idx_to_type)

        # Load reference sets and store them in a nice data structure 
        self.compute_index(ref_feature_colls, min_pairs_to_score)


[docs]
    def compute_index(self, ref_feature_colls, min_pairs_to_score):
        """
        Populate `data` attribute.
        Input arguments are the same as in the constructor.

        Parameters
        ----------
        ref_feature_colls : dict
            A dictionary with information about reference feature set collections.
        min_pairs_to_score : int
            Minimum number of total pairs containing a feature
            to include the feature in the enrichment score computations.
        """

        n_features = len(self.feature_ids)
        # These arrays contain feature indices for (feature1, feature2) pairs in mask
        self.feature_idxs1, self.feature_idxs2 = np.triu_indices(n_features, k=1)

        self.data = {}

        for ref_feature_coll in ref_feature_colls:
            if self.verbose:
                print(
                    f"{datetime.now()} | Loading {ref_feature_coll} reference collection...",
                    file=sys.stderr
                )

            assert (
                os.path.isfile(ref_feature_colls[ref_feature_coll]["path"])
            ), f"File {ref_feature_colls[ref_feature_coll]['path']} not found."

            db_file = io.BufferedReader(open(ref_feature_colls[ref_feature_coll]["path"], "rb"))
            # Each line has reference set name (field 0) and list of features (fields 2+)
            # n features listed on a line will result in n*(n-1)/2 feature pairs
            
            # Features from all lines are stored in a flat 1D arrays
            # ref_feature_sets_boundaries arrays stores the indices separating
            # features from different lines
            feature_idxs_in_ref_sets = []
            ref_feature_sets_boundaries = [0]

            # These features are present in at least one reference set
            unique_feature_idxs_in_db = set()
            
            for line in db_file:
                splitted = line.decode().strip("\n").split("\t")
                assert (
                    len(splitted) >= 4
                ), f"This line of GMT file contains < 4 fields: '{line.decode()}'."
                feature_names = splitted[2:]
                
                feature_idxs = [
                    feature_idx
                    for feature_name in feature_names if feature_name in self.feature_name_to_idxs
                    for feature_idx in self.feature_name_to_idxs[feature_name]
                ]
                unique_feature_idxs_in_db |= set(feature_idxs)
                if len(feature_idxs) <= 1:
                    continue

                feature_idxs_in_ref_sets += feature_idxs
                ref_feature_sets_boundaries.append(ref_feature_sets_boundaries[-1] + len(feature_idxs))

            db_file.close()

            feature_idxs_in_ref_sets = np.array(feature_idxs_in_ref_sets, dtype="int64")
            ref_feature_sets_boundaries = np.array(ref_feature_sets_boundaries, dtype="int64")
            unique_feature_idxs_in_db = np.array(sorted(list(unique_feature_idxs_in_db)), dtype="int64")
            
            # Numba cannot work with strings and tuples normally
            # So, we encode feature types by numbers
            unique_types, inverse_idx = np.unique(self.feature_idx_to_type, return_inverse=True)
            type_ids = np.arange(len(unique_types))
            feature_idx_to_type_id = type_ids[inverse_idx]

            # We hash type_id tuples using Cantor pairing function
            type_to_id = dict(zip(unique_types, type_ids))
            allowed_feature_pair_types = []
            for pair_type in ref_feature_colls[ref_feature_coll]["feature_pair_types"]:
                assert (
                    "-" in pair_type
                ), f"Allowed pair type must contain '-' character but none is found in '{pair_type}'."
                assert (
                    pair_type.split("-")[0] in type_to_id and
                    pair_type.split("-")[1] in type_to_id
                ), f"One of the allowed feature pair types is not present in feature annotation data: '{pair_type}'."

                k1 = type_to_id[pair_type.split("-")[0]]
                k2 = type_to_id[pair_type.split("-")[1]]
                cantor = (k1 + k2) * (k1 + k2 + 1) // 2 + k2
                allowed_feature_pair_types.append(cantor)
            
            allowed_feature_pair_types = np.array(allowed_feature_pair_types, dtype="int64")
            
            self.data[ref_feature_coll] = {}

            assert (
                ref_feature_colls[ref_feature_coll]["sign"] in {"positive", "negative", "absolute"}
            ), f"Sign can be 'positive', 'negative', or 'absolute', but '{ref_feature_colls[ref_feature_coll]['sign']}' was found."
            self.data[ref_feature_coll]["sign"] = ref_feature_colls[ref_feature_coll]["sign"]

            # Effective data structure for ground truth flag 
            mask = self._compute_ref_feature_coll_mask(
                feature_idxs_in_ref_sets, ref_feature_sets_boundaries, unique_feature_idxs_in_db,
                allowed_feature_pair_types, feature_idx_to_type_id,
                n_features
            )
            assert (
                np.any(mask == 0) and np.any(mask == 1)
            ), (
                f"There are no reference or non-reference feature pairs found in {ref_feature_coll}. "
                "Check that 1) feature names match between feature annotation data "
                "and the GMT file, 2) you correctly specified allowed feature pair types."
            )

            # Get labels of training and validation sets
            train_val_mask = self._compute_train_val_mask(mask)

            # Count total number of reference and non-reference pairs for each feature
            # Counts are training/validation/all-specific
            features_total_pos, features_total_neg, scoring_features_mask = self._compute_feature_totals(
                mask, train_val_mask, self.feature_idxs1, self.feature_idxs2, n_features, min_pairs_to_score
            )
            assert (
                np.any(scoring_features_mask)
            ), (
                f"None of the features can be used for scoring in {ref_feature_coll}. "
                "The value of min_pairs_to_score might be too high for your data."
            )

            self.data[ref_feature_coll]["mask"] = mask
            self.data[ref_feature_coll]["train_val_mask"] = train_val_mask

            self.data[ref_feature_coll]["features_total_pos"] = features_total_pos
            self.data[ref_feature_coll]["features_total_neg"] = features_total_neg
            self.data[ref_feature_coll]["scoring_features_mask"] = scoring_features_mask

            # A fraction of all participating pairs are "highly correlated"
            high_corr_frac = ref_feature_colls[ref_feature_coll]["high_corr_frac"]
            assert (
                isinstance(high_corr_frac, float) and 0 <= high_corr_frac <= 1
            ), "high_corr_frac must be float between 0 and 1."
            total_num_pairs = np.sum(mask != -1)
            high_corr_pairs_all = int(total_num_pairs * high_corr_frac)
            
            n_pairs_all = np.sum(train_val_mask != -1)
            n_pairs_train = np.sum(train_val_mask == 0)
            n_pairs_val = np.sum(train_val_mask == 1)

            self.data[ref_feature_coll]["high_corr_frac"] = high_corr_frac
            self.data[ref_feature_coll]["high_corr_pairs_all"] = high_corr_pairs_all
            self.data[ref_feature_coll]["high_corr_pairs_training"] = int(high_corr_pairs_all * n_pairs_train / n_pairs_all)
            self.data[ref_feature_coll]["high_corr_pairs_validation"] = int(high_corr_pairs_all * n_pairs_val / n_pairs_all)

    
    

[docs]
    def compute_corr_scores(self, corrs, full_output=False):
        """
        Evaluate feature-feature correlations with respect to the
        reference feature sets.
        
        Parameters
        ----------
        corrs : numpy.array
            Array of feature-feature correlations (flat format).
        full_output : bool, optional, default=False
            Whether to return large arrays with detailed scores
            (not only final scores).

        Returns
        -------
        res: dict
            If `full_output` is ``False``, dict has the following structure:

            | {
            |     ``ref_feature_coll`` : {
            |         ``"score {subset}"`` : `float`
            |     }
            | }

            Here, subset is ``"training"``, ``"validation"``, and ``"all"`` (feature pair subsets).
            Score is defined according to the `metric` attribute.
            If `full_output` is ``True``, the following keys are added
            to the dict corresponding to each ``ref_feature_coll``:

            | {
            |     ``"BPs_at_K {subset}"`` : `numpy.array`
            |         Balanced precisions for each feature.
            |     ``"enrichments {subset}"`` : `numpy.array`
            |         Enrichments for each feature.
            |     ``"pvalues {subset}"`` : `numpy.array`
            |         p-values for each feature.
            |     ``"pvalues_adj {subset}"`` : `numpy.array`
            |         Adjusted p-values for each feature.
            |     ``"TPs_at_K {subset}"`` : `numpy.array`
            |         True positives for each feature (``k_j`` in the notation of paper).
            |     ``"num_pairs {subset}"`` : `numpy.array`
            |         True positives + false positives for each feature (``K_j`` in the notation of paper).
            |     ``"scoring_features_mask"`` : `numpy.array`
            |         Binary array of whether each feature participates in score computation.
            |     ``"corrs"`` : `numpy.array`
            |         Feature-feature correlations.
            |     ``"mask"`` : `numpy.array`
            |         Whether each feature pair belong to at least one shared reference set.
            |     ``"train_val_mask"`` : `numpy.array`
            |         Whether each feature pair belong to training or validation set.
            | }

            The last four arrays are sorted by correlation value
            in a direction specified by the ``"sign"`` value of the `data` attribute.
        """

        res = {}
        # Different reference collections may use the same sorting sign
        # So the sorting will be done only once for each sign and cached
        sort_cache = {}

        for ref_feature_coll in self.data:
            sign = self.data[ref_feature_coll]["sign"]

            if sign in sort_cache:
                corrs_sorted_args = sort_cache[sign]
            else:
                if sign == "negative":
                    corrs_sorted_args = parallel_sort.argsort(corrs)
                elif sign == "positive":
                    corrs_sorted_args = parallel_sort.argsort(-corrs)
                elif sign == "absolute":
                    corrs_sorted_args = parallel_sort.argsort(-np.abs(corrs))
                
                sort_cache[sign] = corrs_sorted_args
            
            (
                BPs_at_K, enrichments, pvalues, TPs_at_K, FPs_at_K,
                corrs_sorted, mask_sorted, train_val_mask_sorted
            ) = self._compute_scores_for_ref_feature_coll(
                corrs, corrs_sorted_args,
                self.data[ref_feature_coll]["mask"],
                self.data[ref_feature_coll]["train_val_mask"],
                self.data[ref_feature_coll]["high_corr_pairs_all"],
                self.data[ref_feature_coll]["high_corr_pairs_training"],
                self.data[ref_feature_coll]["high_corr_pairs_validation"],
                self.data[ref_feature_coll]["features_total_pos"],
                self.data[ref_feature_coll]["features_total_neg"],
                self.data[ref_feature_coll]["scoring_features_mask"],
                self.feature_idxs1,
                self.feature_idxs2,
            )

            # Need to adjust p-values outside of the previous method
            # because numba doesn't support this function
            scoring_mask = self.data[ref_feature_coll]["scoring_features_mask"]
            pvalues_adj = np.zeros(pvalues.shape, dtype="float64")
            for mode in range(3):
                pvalues_adj[mode, scoring_mask] = false_discovery_control(pvalues[mode, scoring_mask])
                pvalues_adj[mode, ~scoring_mask] = np.nan

            neg_log_pvalues_adj = -np.log10(pvalues_adj)
            if not np.all(np.isnan(neg_log_pvalues_adj)):
                avg_log_padj = np.nanmean(neg_log_pvalues_adj, axis=1)
            else:
                avg_log_padj = np.array([0.0, 0.0, 0.0])

            # This is aggregated BP@K score (a single number)
            scoring_features_mask = self.data[ref_feature_coll]["scoring_features_mask"]
            agg_total_pos = np.nansum(
                self.data[ref_feature_coll]["features_total_pos"][:, scoring_features_mask],
                axis=1
            )
            agg_total_neg = np.nansum(
                self.data[ref_feature_coll]["features_total_neg"][:, scoring_features_mask],
                axis=1
            )
            agg_TP_at_K = np.nansum(TPs_at_K[:, scoring_features_mask], axis=1)
            agg_FP_at_K = np.nansum(FPs_at_K[:, scoring_features_mask], axis=1)
            agg_BP_at_K_TP = agg_TP_at_K / (agg_TP_at_K + agg_FP_at_K) * agg_total_neg
            agg_BP_at_K_FP = agg_FP_at_K / (agg_TP_at_K + agg_FP_at_K) * agg_total_pos
            agg_BP_at_K = agg_BP_at_K_TP / (agg_BP_at_K_TP + agg_BP_at_K_FP)

            final_scores = avg_log_padj if self.metric == "enrichment-based" else agg_BP_at_K

            res[ref_feature_coll] = dict(zip(
                ["score training", "score validation", "score all"],
                final_scores
            ))
            
            if full_output:
                for mode, subset in enumerate(["training", "validation", "all"]):
                    res[ref_feature_coll][f"BPs_at_K {subset}"] = BPs_at_K[mode]
                    res[ref_feature_coll][f"enrichments {subset}"] = enrichments[mode]
                    res[ref_feature_coll][f"pvalues {subset}"] = pvalues[mode]
                    res[ref_feature_coll][f"pvalues_adj {subset}"] = pvalues_adj[mode]
                    res[ref_feature_coll][f"TPs_at_K {subset}"] = TPs_at_K[mode]
                    res[ref_feature_coll][f"num_pairs {subset}"] = (TPs_at_K + FPs_at_K)[mode]
                
                res[ref_feature_coll]["scoring_features_mask"] = scoring_features_mask
                res[ref_feature_coll]["corrs"] = corrs_sorted
                res[ref_feature_coll]["mask"] = mask_sorted
                res[ref_feature_coll]["train_val_mask"] = train_val_mask_sorted

        return res

    
    

[docs]
    @staticmethod
    @njit(parallel=True)
    def _compute_ref_feature_coll_mask(
        feature_idxs_in_ref_sets, ref_feature_sets_boundaries, unique_feature_idxs_in_db,
        allowed_feature_pair_types, feature_idx_to_type_id,
        n_features
    ):
        """
        Efficiently create a flat array that answers
        whether each feature pair belong at least one shared
        reference feature set.

        Parameters
        ----------
        feature_idxs_in_ref_sets : numpy.array
            Array with indices of features that belong to
            each reference sets. Each reference set is
            represented by a consecutive block of feature
            indices.
        ref_feature_sets_boundaries : numpy.array
            Array with indices of `feature_idxs_in_ref_sets`, which
            tell where each referense set starts and ends.
        unique_feature_idxs_in_db : numpy.array
            Array of feature indices that belong to
            at least one reference set.
        allowed_feature_pair_types : numpy.array
            Array of feature pair types that are participating
            in the score computations. Since Numba cannot work
            with string and tuples normally, these should be
            integers computes using Cantor's pairing function,
            see `compute_index` code.
        feature_idx_to_type_id : numpy.array
            Array mapping feature indices to feature types.
            Indices of the array stand for feature indices,
            and values stand for feature type indices.
        n_features : int
            Total number of features.

        Returns
        -------
        mask : numpy.array
            See `data` attribute description.
        """

        # Allocate memory
        n_pairs = n_features * (n_features - 1) // 2
        mask = np.empty((n_pairs,), dtype="int8")
        # Initialize everything with -1
        mask[:] = -1

        # Set 0s for all pairs composed of features from reference sets file
        # the only constraint: feature types
        for p1 in prange(unique_feature_idxs_in_db.shape[0]):
            for p2 in range(p1 + 1, unique_feature_idxs_in_db.shape[0]):
                i = unique_feature_idxs_in_db[p1]
                j = unique_feature_idxs_in_db[p2]

                k1, k2 = feature_idx_to_type_id[i], feature_idx_to_type_id[j]
                cantor1 = (k1 + k2) * (k1 + k2 + 1) // 2 + k1
                cantor2 = (k1 + k2) * (k1 + k2 + 1) // 2 + k2
                if (
                    cantor1 in allowed_feature_pair_types or
                    cantor2 in allowed_feature_pair_types
                ):
                    mask_idx = j - 1 + i * (n_features - 1) - i * (i + 1) // 2
                    mask[mask_idx] = 0
        
        # Then, set 1s
        # We cannot use parallel processing here because of possible race conditions
        for p in range(1, ref_feature_sets_boundaries.shape[0]):
            for idx_pos1 in range(ref_feature_sets_boundaries[p - 1], ref_feature_sets_boundaries[p]):
                for idx_pos2 in range(idx_pos1 + 1, ref_feature_sets_boundaries[p]):
                    i = min(feature_idxs_in_ref_sets[idx_pos1], feature_idxs_in_ref_sets[idx_pos2])
                    j = max(feature_idxs_in_ref_sets[idx_pos1], feature_idxs_in_ref_sets[idx_pos2])

                    k1, k2 = feature_idx_to_type_id[i], feature_idx_to_type_id[j]
                    cantor1 = (k1 + k2) * (k1 + k2 + 1) // 2 + k1
                    cantor2 = (k1 + k2) * (k1 + k2 + 1) // 2 + k2
                    if (
                        cantor1 in allowed_feature_pair_types or
                        cantor2 in allowed_feature_pair_types
                    ):
                        mask_idx = j - 1 + i * (n_features - 1) - i * (i + 1) // 2
                        mask[mask_idx] = 1

        return mask

    

[docs]
    @staticmethod
    @njit(parallel=True)
    def _compute_train_val_mask(mask):
        """
        Efficiently create a flat array that answers
        whether each feature pair belong to training or
        validation set. The function is deterministic,
        as training and validation set are derived by
        taking each 2nd element.

        Parameters
        ----------
        mask : numpy.array
            Array returned by the `_compute_ref_feature_coll_mask` method.

        Returns
        -------
        train_val_mask : numpy.array
            See `data` attribute description.
        """

        train_val_mask = np.empty(mask.shape, dtype="int8")

        # Assign training and validation sets
        # = 0 stands for train, = 1 stands for validation, = -1 stands for others
        # This code will make sure that
        # a) train/validation separation is deterministic
        # b) it is the same for different reference collections
        train_val_mask[::2] = 1
        train_val_mask[1::2] = 0
        train_val_mask[np.where(mask == -1)[0]] = -1

        return train_val_mask

    

[docs]
    @staticmethod
    @njit(parallel=True)
    def _compute_feature_totals(
        mask, train_val_mask,
        feature_idxs1, feature_idxs2, n_features,
        min_pairs_to_score
    ):
        """
        Efficiently compute statistics on number
        of feature pairs that involve each individual feature.

        Parameters
        ----------
        mask : numpy.array
            Array returned by the `_compute_ref_feature_coll_mask` method.
        train_val_mask : numpy.array
            Array returned by the `_compute_train_val_mask` method.
        feature_idxs1 : numpy.array
            Indices of the first features in the flat array of pairs.
        feature_idxs2 : numpy.array
            Indices of the second features in the flat array of pairs.
        n_features : int
            Number of features.
        min_pairs_to_score : int
            Same as in the constructor.

        Returns
        -------
        features_total_pos : numpy.ndarray
            See `data` attribute description.
        features_total_neg : numpy.ndarray
            See `data` attribute description.
        scoring_features_mask : numpy.array
            See `data` attribute description.
        """

        # Feature counts are computed separately for
        # trainining (0), validation (1), and all pairs (2)
        features_total_pos = np.zeros((3, n_features), dtype="int64")
        features_total_neg = np.zeros((3, n_features), dtype="int64")
        # These are the features that are involved in at least min_pairs_to_score pairs,
        # including at least one reference and one non-reference pair
        scoring_features_mask = np.ones((n_features,), dtype="bool")

        # train, validaion, all 
        # No prange because of the race condition on scoring_features_mask
        for mode in range(3):
            if mode != 2:
                idxs = np.where(train_val_mask == mode)[0]
            else:
                idxs = np.where(mask != -1)[0]

            # Non-parallel because of race conditions
            # But it is very fast anyway
            for mask_idx in idxs:
                feature1_idx = feature_idxs1[mask_idx]
                feature2_idx = feature_idxs2[mask_idx]
                TP_flag = mask[mask_idx]

                features_total_pos[mode, feature1_idx] += TP_flag
                features_total_pos[mode, feature2_idx] += TP_flag
                features_total_neg[mode, feature1_idx] += 1 - TP_flag
                features_total_neg[mode, feature2_idx] += 1 - TP_flag

            scoring_features_mask = (
                scoring_features_mask & 
                ((features_total_pos[mode] + features_total_neg[mode]) >= min_pairs_to_score)
            )
        
        return features_total_pos, features_total_neg, scoring_features_mask

    
    

[docs]
    @staticmethod
    @njit(parallel=True)
    def _compute_scores_for_ref_feature_coll(
        corrs, corrs_sorted_args,
        mask, train_val_mask,
        high_corr_pairs_all, high_corr_pairs_train, high_corr_pairs_val,
        features_total_pos, features_total_neg, scoring_features_mask,
        feature_idxs1, feature_idxs2,
        num_regularization_pairs=5,
    ):
        """
        Evaluate feature-feature correlations with respect to
        one reference features set collection.
        
        Parameters
        ----------
        corrs : numpy.array
            Array of feature-feature correlations (flat format).
        corrs_sorted_args : numpy.array
            Array of indices that sort corrs array in needed way.
        mask : numpy.array
            Array returned by the `_compute_ref_feature_coll_mask` method.
        train_val_mask : numpy.array
            Array returned by the `_compute_train_val_mask` method.
        high_corr_pairs_all : int
            See `data` attribute description.
        high_corr_pairs_train : int
            See `data` attribute description.
        high_corr_pairs_val : int
            See `data` attribute description.
        features_total_pos : numpy.ndarray
            Array returned by the `_compute_feature_totals` method.
        features_total_neg : numpy.ndarray
            Array returned by the `_compute_feature_totals` method.
        scoring_features_mask : numpy.array
            Array returned by the `_compute_feature_totals` method.
        feature_idxs1 : numpy.array
            Indices of the first features in the flat array of pairs.
        feature_idxs2 : numpy.array
            Indices of the second features in the flat array of pairs.
        num_regularization_pairs : int, optional, default=5
            Number of feature pairs to add to each ``K_j`` for
            Bayesian regularization purposes.

        Returns
        -------
        BPs_at_K : numpy.array
            Balanced precisions for each feature.
        enrichments : numpy.array
            Enrichments for each feature.
        pvalues : numpy.array
            p-values for each feature.
        TPs_at_K : numpy.array
            True positives for each feature.
        FPs_at_K : numpy.array
            False positives for each feature.
        corrs_filtered : numpy.array
            Sorted feature-feature correlations with ``mask != -1``.
        mask_filtered : numpy.array
            Mask sorted in the same way as `corrs_filtered`.
        train_val_mask_filtered : numpy.array
            Training/validation mask sorted in the same way as `corrs_filtered`.
        """
    
        # Parallel re-ordering of arrays according to order in corrs_sorted_args
        # Numba can't auto parallelize this, so we need to write for loop with prange
        
        # Allocate memory
        corrs_sorted = np.zeros(corrs_sorted_args.shape, dtype="float32")
        mask_sorted = np.zeros(corrs_sorted_args.shape, dtype="int8")
        train_val_mask_sorted = np.zeros(corrs_sorted_args.shape, dtype="int8")
        feature_idxs1_sorted = np.zeros(corrs_sorted_args.shape, dtype="int64")
        feature_idxs2_sorted = np.zeros(corrs_sorted_args.shape, dtype="int64")

        # Re-order
        for i in prange(corrs_sorted_args.shape[0]):
            corrs_sorted[i] = corrs[corrs_sorted_args[i]]
            mask_sorted[i] = mask[corrs_sorted_args[i]]
            train_val_mask_sorted[i] = train_val_mask[corrs_sorted_args[i]]
            feature_idxs1_sorted[i] = feature_idxs1[corrs_sorted_args[i]]
            feature_idxs2_sorted[i] = feature_idxs2[corrs_sorted_args[i]]
        
        # Next, we need to keep only those elements that corresponds to mask != -1
        # Again, we need explicit prange loop to make it parallel

        # Allocate memory
        mask_non_empty_idxs = np.where(mask_sorted != -1)[0]
        corrs_filtered = np.zeros(mask_non_empty_idxs.shape, dtype="float32")
        mask_filtered = np.zeros(mask_non_empty_idxs.shape, dtype="int8")
        train_val_mask_filtered = np.zeros(mask_non_empty_idxs.shape, dtype="int8")
        feature_idxs1_filtered = np.zeros(mask_non_empty_idxs.shape, dtype="int64")
        feature_idxs2_filtered = np.zeros(mask_non_empty_idxs.shape, dtype="int64")
        
        # Select subset of elements
        for i in prange(mask_non_empty_idxs.shape[0]):
            corrs_filtered[i] = corrs_sorted[mask_non_empty_idxs[i]]
            mask_filtered[i] = mask_sorted[mask_non_empty_idxs[i]]
            train_val_mask_filtered[i] = train_val_mask_sorted[mask_non_empty_idxs[i]]
            feature_idxs1_filtered[i] = feature_idxs1_sorted[mask_non_empty_idxs[i]]
            feature_idxs2_filtered[i] = feature_idxs2_sorted[mask_non_empty_idxs[i]]
        
        # Next, we calculate precision for neighborhood of each feature
        n_features1 = np.max(feature_idxs1) + 1
        n_features2 = np.max(feature_idxs2) + 1
        n_features = max(n_features1, n_features2)
        scoring_features_idxs = np.where(scoring_features_mask)[0]
        
        # We calculate precision separately for
        # train (0), validation (1), and all pairs (2)
        TPs_at_K = np.zeros((3, n_features), dtype="int64")
        FPs_at_K = np.zeros((3, n_features), dtype="int64")

        BPs_at_K = np.zeros((3, n_features), dtype="float32")
        enrichments = np.zeros((3, n_features), dtype="float32")
        pvalues = np.zeros((3, n_features), dtype="float64")
        BPs_at_K[:, :] = np.nan
        enrichments[:, :] = np.nan
        pvalues[:, :] = np.nan

        # This is an array to store mean values across features
        #MBPs_at_K = np.zeros((3,), dtype="float32")

        # train, validaion, all 
        for mode in range(3):
            if mode != 2:
                idxs = np.where(train_val_mask_filtered == mode)[0]
                if mode == 0:
                    top_K = high_corr_pairs_train
                else:
                    top_K = high_corr_pairs_val
            else:
                idxs = np.arange(mask_filtered.shape[0])
                top_K = high_corr_pairs_all

            # Non-parallel because of race conditions
            # But it is very fast anyway
            for mask_idx in idxs[:top_K]:
                feature1_idx = feature_idxs1_filtered[mask_idx]
                feature2_idx = feature_idxs2_filtered[mask_idx]
                TP_flag = mask_filtered[mask_idx]

                TPs_at_K[mode, feature1_idx] += TP_flag
                TPs_at_K[mode, feature2_idx] += TP_flag
                FPs_at_K[mode, feature1_idx] += 1 - TP_flag
                FPs_at_K[mode, feature2_idx] += 1 - TP_flag

            ##########################################
            # Balanced precision at K and enrichment #
            ##########################################
        
            # Weights are inverse-proportional to the fraction of positives/negatives
            # Factor of 0.5 is needed to bring TP * TP_weight and FP * FP_weight
            # to the scale needed for regularization
            features_total = features_total_pos[mode, scoring_features_mask] + features_total_neg[mode, scoring_features_mask]
            frac_pos = features_total_pos[mode, scoring_features_mask] / features_total
            frac_neg = features_total_neg[mode, scoring_features_mask] / features_total
            # Bayesian regularization (pseudo-counts)
            regularization_TPs = num_regularization_pairs * frac_pos
            regularization_FPs = num_regularization_pairs * frac_neg

            # Compute BPs_at_K and enrichments
            # j goes over indices of scoring features only
            # feature_idx points to the global index of feature
            for j in prange(scoring_features_idxs.shape[0]):
                feature_idx = scoring_features_idxs[j]
                # For features without any pairs within top-K, this will set
                # BP to 0.5 because of regularization
                # For features with frac_pos or frac_neg == 0, we just set value of 0.5
                if frac_pos[j] != 0 and frac_neg[j] != 0:
                    BPs_at_K[mode, feature_idx] = (
                        ((TPs_at_K[mode, feature_idx] + regularization_TPs[j]) / frac_pos[j]) /
                        (
                            (TPs_at_K[mode, feature_idx] + regularization_TPs[j]) / frac_pos[j] +
                            (FPs_at_K[mode, feature_idx] + regularization_FPs[j]) / frac_neg[j]
                        )
                    )
                else:
                    BPs_at_K[mode, feature_idx] = 0.5

                # Set enrichment to 1 if expected = 0
                if frac_pos[j] != 0:
                    expected = frac_pos[j] * (
                        TPs_at_K[mode, feature_idx] + FPs_at_K[mode, feature_idx] +
                        num_regularization_pairs
                    )
                    enrichments[mode, feature_idx] = (
                        (TPs_at_K[mode, feature_idx] + regularization_TPs[j]) /
                        expected
                    )
                else:
                    enrichments[mode, feature_idx] = 1.0

            ##################################
            # p-values (hypergeometric test) #
            ##################################

            pvalues[mode, scoring_features_mask] = hypergeom_pvalues(
                features_total + num_regularization_pairs,
                features_total_pos[mode, scoring_features_mask] + regularization_TPs.astype("int64"),
                TPs_at_K[mode, scoring_features_mask] + FPs_at_K[mode, scoring_features_mask] + num_regularization_pairs,
                TPs_at_K[mode, scoring_features_mask] + regularization_TPs.astype("int64")
            )

        return (
            BPs_at_K, enrichments, pvalues,
            TPs_at_K, FPs_at_K,
            corrs_filtered, mask_filtered, train_val_mask_filtered
        )