LoadControls

LoadControls ¶

Bases: Task

Task for loading single, multi and no-protein controls from a list of files.

REQUIRED PIPELINE DATA¶

None

PRODUCED PIPELINE DATA¶

controls_values: an array of control values (total events x channels)
controls_masks: an array of masks telling which proteins were present, for each event (total events x proteins)
channel_names: the ordered list of channel names
protein_names: the ordered list of protein names
saturation_thresholds: {'upper': (channels), 'lower': (channels)} the upper and lower saturation thresholds for each channel
reference_channels: an array of integer indices of the reference channels for each protein (proteins)

Functions¶

pick_reference_channels ¶

pick_reference_channels(max_sat_proportion=0.001, min_dyn_range=3, **_)

Pick reference channels for each protein using multiple quality metrics.

The channel selection uses a weighted scoring system that considers: - Dynamic range (log10) - Signal-to-noise ratio - Specificity (signal vs crosstalk) - Saturation proportion (penalizes channels with too many saturated events)

Source code in calibrie/loadcontrols.py

def pick_reference_channels(self, max_sat_proportion=0.001, min_dyn_range=3, **_):
    """
    Pick reference channels for each protein using multiple quality metrics.

    The channel selection uses a weighted scoring system that considers:
    - Dynamic range (log10)
    - Signal-to-noise ratio
    - Specificity (signal vs crosstalk)
    - Saturation proportion (penalizes channels with too many saturated events)
    """
    if self.use_reference_channels is not None:
        user_provided = []
        for p in self._protein_names:
            if p in self.use_reference_channels.keys():
                ch = self.use_reference_channels[p]
                assert ch in self._channel_names, f'Unknown channel {ch} in reference_channels'
                user_provided.append(self._channel_names.index(ch))
            else:
                user_provided.append(None)
        unknown_provided = [
            p for p in self.use_reference_channels if p not in self._protein_names
        ]
        if len(unknown_provided) > 0:
            self._log.warning(
                f'Ignoring proteins {unknown_provided}: declared a reference channel but are absent from controls.'
            )
    else:
        user_provided = [None] * len(self._protein_names)

    # compute quality metrics for selection
    metrics = ChannelMetrics(
        self._controls_values, self._controls_masks, self._channel_names, self._protein_names
    )

    single_masks = self._controls_masks.sum(axis=1) == 1
    ref_channels = []

    for pid, prot in enumerate(self._protein_names):
        if user_provided[pid] is not None:
            ref_channels.append(user_provided[pid])
            continue

        # get protein mask and values
        prot_mask = np.logical_and(single_masks, self._controls_masks[:, pid])
        if not np.any(prot_mask):
            ref_channels.append(None)
            continue

        prot_values = self._controls_values[prot_mask]

        # compute saturation proportions
        prot_sat = np.logical_or(
            prot_values <= self._saturarion_thresholds[:, 0],
            prot_values >= self._saturarion_thresholds[:, 1],
        )
        sat_proportions = prot_sat.sum(axis=0) / prot_sat.shape[0]

        # get other metrics for this protein
        channel_scores = []
        for cid, chan in enumerate(self._channel_names):
            # skip channels with too much saturation
            if sat_proportions[cid] > max_sat_proportion:
                channel_scores.append(-np.inf)
                continue

            # compile scores from different metrics
            scores = {
                'dynamic_range': metrics.dynamic_ranges[prot][chan],
                'snr': metrics.signal_to_noise[prot][chan],
                'specificity': metrics.specificities[prot][chan],
                'signal': metrics.signal_strengths[prot][chan],
            }

            # skip channels with insufficient dynamic range
            if scores['dynamic_range'] < min_dyn_range:
                channel_scores.append(-np.inf)
                continue

            # normalize each metric to 0-1 range for this protein
            metric_weights = {
                'dynamic_range': 1.0,
                'specificity': 0.8,
                'snr': 0.5,
                'signal': 0.4,
            }

            scores = normalized_metrics(scores)

            total_score = 0
            for metric, weight in metric_weights.items():
                score = scores[metric]
                total_score += weight * score

            channel_scores.append(total_score)

        channel_scores = np.array(channel_scores)

        if np.all(channel_scores == -np.inf):
            ref_channels.append(None)
            raise ValueError(
                f"""
                No reference channel found for {prot}!
                Try increasing max_sat_proportion and/or decreasing min_dyn_range
                Saturation proportions per channel: {sat_proportions}
                """
            )
        else:
            best_channel = int(np.argmax(channel_scores))
            ref_channels.append(best_channel)
            self._log.debug(
                f"Reference channel for {prot} is {self._channel_names[best_channel]} "
                f"with score {channel_scores[best_channel]:.2f}"
            )

    return ref_channels