Module mimir.attacks.min_k_plus_plus

Min-K%++ Attack: https://github.com/zjysteven/mink-plus-plus

Classes

class MinKPlusPlusAttack (config: ExperimentConfig, model: Model)
Expand source code
class MinKPlusPlusAttack(Attack):

    def __init__(self, config: ExperimentConfig, model: Model):
        super().__init__(config, model, ref_model=None)

    @ch.no_grad()
    def _attack(self, document, probs, tokens=None, **kwargs):
        """
        Min-K%++ Attack. 
        Gets token probabilties, normalize with the mean and std over the whole categorical distribution,
        and returns normalized likelihood when computed over top k% of ngrams.
        """
        # Hyper-params specific to min-k attack
        k: float = kwargs.get("k", 0.2)
        all_probs = kwargs.get("all_probs", None)

        # these are all log probabilites
        target_prob, all_probs = (
            (probs, all_probs)
            if (probs is not None and all_probs is not None)
            else self.model.get_probabilities(document, tokens=tokens, return_all_probs=True)
        )
        
        mu = (ch.exp(all_probs) * all_probs).sum(-1)
        sigma = (ch.exp(all_probs) * ch.square(all_probs)).sum(-1) - ch.square(mu)
        scores = (np.array(target_prob) - mu.numpy()) / sigma.sqrt().numpy()
        
        return -np.mean(sorted(scores)[:int(len(scores) * k)])

Ancestors

Inherited members