Module mimir.attacks.dc_pdd

DC-PDD attack (https://aclanthology.org/2024.emnlp-main.300/). Based on the official implementation: https://github.com/zhang-wei-chao/DC-PDD
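
A minimal usage sketch (hypothetical: construction of `config` and `target_model` via mimir's ExperimentConfig and model utilities is elided; `probs` is unused by this attack's `_attack`, so None can be passed):

    # hypothetical usage; `config` and `target_model` must be built elsewhere
    attack = DC_PDDAttack(config, target_model)
    score = attack._attack(document="Some candidate text", probs=None)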

Functions

def ensure_parent_directory_exists(filename)
def ensure_parent_directory_exists(filename):
    """Create the parent directory of `filename` if it does not already exist."""
    # Get the parent directory from the given filename
    parent_dir = os.path.dirname(filename)

    # exist_ok=True makes the call safe even if the directory is created concurrently
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
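
For example, the helper guards any cache write (a small sketch; the path and tensor are illustrative):

    import torch as ch

    cache_file = "cache/DC_PDD_freq_dis/C4/example-model.pt"  # illustrative path
    ensure_parent_directory_exists(cache_file)  # creates cache/DC_PDD_freq_dis/C4 if needed
    ch.save(ch.zeros(4), cache_file)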

Classes

class DC_PDDAttack (config: ExperimentConfig, model: Model)
class DC_PDDAttack(Attack):

    def __init__(self, config: ExperimentConfig, model: Model):
        super().__init__(config, model, ref_model=None)
        # Token frequency distribution over the target model's vocabulary,
        # estimated from a subset of C4
        self.fre_dis = ch.zeros(len(model.tokenizer))
        # Cache entries are keyed by model name, since vocabularies differ across models
        model_name = model.name

        # Load the distribution from cache if available; compute and save it otherwise
        cached_file_path = os.path.join(get_cache_path(), "DC_PDD_freq_dis", "C4", f"{model_name}.pt")

        if os.path.exists(cached_file_path):
            self.fre_dis = ch.load(cached_file_path)
            print(f"Loaded frequency distribution from cache for {model_name}")
        else:
            # Make sure the cache directory exists
            ensure_parent_directory_exists(cached_file_path)
            # Count token frequencies over the reference corpus
            self._collect_frequency_data()
            ch.save(self.fre_dis, cached_file_path)
            print(f"Saved frequency distribution to cache for {model_name}")

        # Laplace smoothing: turn raw counts into a probability distribution
        # with no zero entries
        self.fre_dis = (1 + self.fre_dis) / (ch.sum(self.fre_dis) + len(self.fre_dis))
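
    # Worked example of the smoothing step above (illustrative numbers): with a
    # vocabulary of size 4 and raw counts [3, 0, 1, 0], the result is
    # (1 + counts) / (4 + 4) = [0.500, 0.125, 0.250, 0.125], so tokens never seen
    # in the reference corpus still get non-zero probability and log(1 / x_fre)
    # in _attack stays finite.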

    def _fre_dis(self, ref_data, max_tok: int = 1024):
        """
        Update the token frequency distribution with counts from a reference dataset.

        ref_data: reference dataset (an iterable of strings)
        max_tok: maximum number of tokens to keep per document
        """
        for text in tqdm(ref_data):
            input_ids = self.target_model.tokenizer(text, truncation=True, max_length=max_tok).input_ids
            # Note: advanced-indexing assignment increments each unique token id
            # once per document (duplicates within a document are not accumulated)
            self.fre_dis[input_ids] += 1
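
    # A vectorized alternative (hypothetical, not in the original code) that would
    # count every occurrence of a token, unlike the indexing assignment above:
    #   self.fre_dis += ch.bincount(ch.tensor(input_ids), minlength=len(self.fre_dis))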

    def _collect_frequency_data(self, fil_num: int = 15):
        """Download `fil_num` shards of the C4 (en) training split and accumulate token frequencies."""
        for i in tqdm(range(fil_num), desc="Downloading and processing dataset"):
            # Download one shard of the dataset
            url = f"https://huggingface.co/datasets/allenai/c4/resolve/main/en/c4-train.{i:05}-of-01024.json.gz"
            response = requests.get(url)
            response.raise_for_status()  # Check for download errors

            # The shard is gzipped JSON Lines: one JSON object per line
            with gzip.GzipFile(fileobj=io.BytesIO(response.content)) as gz_file:
                sub_dataset = gz_file.readlines()
                examples = [json.loads(example)["text"] for example in sub_dataset]

            # Update the frequency distribution with this shard
            self._fre_dis(examples)
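
    # Alternative sketch (hypothetical, not in the original code): the same corpus
    # can be streamed with the `datasets` library instead of raw HTTP downloads:
    #   from datasets import load_dataset
    #   ds = load_dataset("allenai/c4", "en", split="train", streaming=True)
    #   self._fre_dis(ex["text"] for ex in itertools.islice(ds, 100_000))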

    @ch.no_grad()
    def _attack(self, document, probs, tokens=None, **kwargs):
        """
        DC-PDD attack: use the token frequency distribution of a large reference
        corpus to calibrate the target model's token probabilities and compute
        a membership score.
        """
        # Hyper-parameter specific to DC-PDD: upper bound used to clip calibrated scores
        a: float = kwargs.get("a", 0.01)

        # Tokenize the document (processed slightly differently from other attacks)
        tokens_og = self.target_model.tokenizer(document, return_tensors="pt").input_ids
        # Prepend the EOS token so the first real token also receives a probability
        tokens = ch.cat([ch.tensor([[self.target_model.tokenizer.eos_token_id]]), tokens_og], dim=1).numpy()

        # These are log-probabilities; exponentiate to recover probabilities
        probs_with_start_token = self.target_model.get_probabilities(document, tokens=tokens)
        x_pro = np.exp(probs_with_start_token)

        # Keep only the first occurrence of each token id
        indexes = []
        current_ids = []
        input_ids = tokens_og[0]
        for i, input_id in enumerate(input_ids):
            if input_id not in current_ids:
                indexes.append(i)
                current_ids.append(input_id)

        x_pro = x_pro[indexes]
        x_fre = self.fre_dis[input_ids[indexes]].numpy()

        # Calibrate: alpha_i = p(x_i) * log(1 / q(x_i)), where p comes from the
        # target model and q from the reference-corpus frequency distribution
        alpha = x_pro * np.log(1 / x_fre)

        # Clip large values at the upper bound a, then average to get the score
        alpha[alpha > a] = a

        beta = -np.mean(alpha)

        return beta
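
The scoring step can be reproduced standalone on toy values (a minimal sketch; the probabilities and frequencies below are made up):

    import numpy as np

    x_pro = np.array([0.20, 0.05, 0.70])   # target-model token probabilities p(x_i)
    x_fre = np.array([1e-4, 1e-2, 1e-6])   # smoothed corpus frequencies q(x_i)
    a = 0.01                               # clipping bound, as in _attack

    alpha = x_pro * np.log(1 / x_fre)  # calibrated scores
    alpha[alpha > a] = a               # clip at the upper bound
    beta = -np.mean(alpha)             # membership score (here -0.01, since all values clip)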

Ancestors

Attack

Inherited members