Module mimir.attacks.gradnorm
Gradient-norm attack. Proposed for membership inference (MIA) in multiple settings, and evaluated in particular on LLMs and their pre-training data in https://arxiv.org/abs/2402.17012
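To make the computation concrete, here is a minimal self-contained sketch of the score this module computes, written against a plain HuggingFace causal LM rather than mimir's Model wrapper. The gradnorm_score helper and the "gpt2" checkpoint in the usage comment are illustrative assumptions, not part of this module.

import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def gradnorm_score(model, tokenizer, text: str, p: float = np.inf) -> float:
    """Return the negated mean of per-parameter p-norms of d(loss)/d(theta)."""
    model.zero_grad()
    enc = tokenizer(text, return_tensors="pt")
    # Causal-LM loss over the document (labels = input ids), with grads enabled
    loss = model(**enc, labels=enc["input_ids"]).loss
    loss.backward()
    # One p-norm per parameter tensor, averaged into a single scalar
    norms = [param.grad.detach().norm(p)
             for param in model.parameters() if param.grad is not None]
    score = torch.stack(norms).mean()
    model.zero_grad()
    # Negate so that a smaller gradient norm yields a larger membership score
    return -score.item()

# Usage (assumed checkpoint name):
# model = AutoModelForCausalLM.from_pretrained("gpt2")
# tokenizer = AutoTokenizer.from_pretrained("gpt2")
# print(gradnorm_score(model, tokenizer, "Example document text.", p=2))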
Classes
class GradNormAttack (config: ExperimentConfig, model: Model)
Source code
class GradNormAttack(Attack):

    def __init__(self, config: ExperimentConfig, model: Model):
        super().__init__(config, model, ref_model=None, is_blackbox=False)

    def _attack(self, document, probs, tokens=None, **kwargs):
        """
        Gradient Norm Attack. Computes p-norm of gradients w.r.t. model parameters.
        """
        # We ignore probs here, since they are computed in the general case
        # without gradient-tracking (to save memory)

        # Hyper-parameter specific to the gradient-norm attack: which p-norm to use
        p: float = kwargs.get("p", np.inf)
        if p not in [1, 2, np.inf]:
            raise ValueError(f"Invalid p-norm value: {p}.")

        # Make sure model params require gradients
        # for name, param in self.target_model.model.named_parameters():
        #     param.requires_grad = True

        # Get gradients for model parameters
        self.target_model.model.zero_grad()
        all_prob = self.target_model.get_probabilities(
            document, tokens=tokens, no_grads=False
        )
        loss = -ch.mean(all_prob)
        loss.backward()

        # Compute p-norm of gradients (for all model params where grad exists)
        grad_norms = []
        for param in self.target_model.model.parameters():
            if param.grad is not None:
                grad_norms.append(param.grad.detach().norm(p))
        grad_norm = ch.stack(grad_norms).mean()

        # Zero out gradients again
        self.target_model.model.zero_grad()

        # Negate so that a smaller gradient norm yields a larger membership score
        return -grad_norm.cpu().numpy()
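A hypothetical usage sketch follows. The construction of the ExperimentConfig and the Model wrapper is elided because it is mimir-specific; the lines below only illustrate the call pattern, not a runnable recipe.

config = ...                      # a populated ExperimentConfig (mimir-specific)
target_model = ...                # a mimir Model wrapping the target LLM
attack = GradNormAttack(config, target_model)

# probs is unused by this attack: log-probabilities are recomputed internally
# with gradient tracking enabled
score = attack._attack(document="Some candidate text.", probs=None, p=2)
# Scores closer to zero (smaller gradient norm) suggest the document was
# part of the training data

Averaging per-parameter norms, rather than taking one norm over all gradients concatenated, keeps the score from being dominated by the largest parameter tensors.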
Ancestors
mimir.attacks.all_attacks.Attack
Inherited members