Module mimir.config

Definitions for configurations.

Classes

class EnvironmentConfig (cache_dir: Optional[str] = None, data_source: Optional[str] = None, device: Optional[str] = 'cuda:0', device_map: Optional[str] = None, device_aux: Optional[str] = 'cuda:1', compile: Optional[bool] = True, int8: Optional[bool] = False, half: Optional[bool] = False, results: Optional[str] = 'results', tmp_results: Optional[str] = 'tmp_results')

Config for environment-specific parameters

@dataclass
class EnvironmentConfig(Serializable):
    """
    Config for environment-specific parameters
    """
    cache_dir: Optional[str] = None
    """Path to cache directory"""
    data_source: Optional[str] = None
    """Path where data is stored"""
    device: Optional[str] = 'cuda:0'
    """Device (GPU) to load main model on"""
    device_map: Optional[str] = None
    """Configuration for device map if needing to split model across gpus"""
    device_aux: Optional[str] = "cuda:1"
    """Device (GPU) to load any auxiliary model(s) on"""
    compile: Optional[bool] = True
    """Compile models?"""
    int8: Optional[bool] = False
    """Use int8 quantization?"""
    half: Optional[bool] = False
    """Use half precision?"""
    results: Optional[str] = "results"
    """Path for saving final results"""
    tmp_results: Optional[str] = "tmp_results"
    """Path for saving temporary results"""

    def __post_init__(self):
        if self.cache_dir is None:
            self.cache_dir = get_cache_path()
        if self.data_source is None:
            self.data_source = get_data_source()

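A minimal construction sketch (field values are illustrative; when cache_dir and data_source are left unset, __post_init__ resolves them via the module's get_cache_path() / get_data_source() helpers, assuming those can find paths in your environment):

from mimir.config import EnvironmentConfig

env = EnvironmentConfig(
    device="cuda:0",      # GPU for the main model
    device_aux="cuda:1",  # GPU for auxiliary model(s)
    half=True,            # load model weights in fp16
    compile=False,        # skip model compilation
)
# cache_dir and data_source were left as None, so __post_init__
# filled them in via get_cache_path() / get_data_source()
print(env.cache_dir, env.data_source)
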
Ancestors

  • simple_parsing.helpers.serialization.serializable.Serializable
  • simple_parsing.helpers.serialization.serializable.SerializableMixin

Class variables

var cache_dir : Optional[str]

Path to cache directory

var compile : Optional[bool]

Compile models?

var data_source : Optional[str]

Path where data is stored

var decode_into_subclasses : ClassVar[bool]
var device : Optional[str]

Device (GPU) to load main model on

var device_aux : Optional[str]

Device (GPU) to load any auxiliary model(s) on

var device_map : Optional[str]

Configuration for device map if the model needs to be split across GPUs

var half : Optional[bool]

Use half precision?

var int8 : Optional[bool]

Use int8 quantization?

var results : Optional[str]

Path for saving final results

var tmp_results : Optional[str]

Path for saving temporary results

class ExperimentConfig (experiment_name: str, base_model: str, dataset_member: str, dataset_nonmember: str, output_name: Optional[str] = None, dataset_nonmember_other_sources: Optional[List[str]] = <factory>, pretokenized: Optional[bool] = False, revision: Optional[str] = None, presampled_dataset_member: Optional[str] = None, presampled_dataset_nonmember: Optional[str] = None, token_frequency_map: Optional[str] = None, dataset_key: Optional[str] = None, specific_source: Optional[str] = None, full_doc: Optional[bool] = False, max_substrs: Optional[int] = 20, dump_cache: Optional[bool] = False, load_from_cache: Optional[bool] = False, load_from_hf: Optional[bool] = True, blackbox_attacks: Optional[List[str]] = <factory>, tokenization_attack: Optional[bool] = False, quantile_attack: Optional[bool] = False, n_samples: Optional[int] = 200, max_tokens: Optional[int] = 512, max_data: Optional[int] = 5000, min_words: Optional[int] = 100, max_words: Optional[int] = 200, max_words_cutoff: Optional[bool] = True, batch_size: Optional[int] = 50, chunk_size: Optional[int] = 20, scoring_model_name: Optional[str] = None, top_k: Optional[int] = 40, do_top_k: Optional[bool] = False, top_p: Optional[float] = 0.96, do_top_p: Optional[bool] = False, pre_perturb_pct: Optional[float] = 0.0, pre_perturb_span_length: Optional[int] = 5, tok_by_tok: Optional[bool] = False, fpr_list: Optional[List[float]] = <factory>, random_seed: Optional[int] = 0, ref_config: Optional[ReferenceConfig] = None, recall_config: Optional[ReCaLLConfig] = None, neighborhood_config: Optional[NeighborhoodConfig] = None, env_config: Optional[EnvironmentConfig] = None, openai_config: Optional[OpenAIConfig] = None)

Config for attacks

@dataclass
class ExperimentConfig(Serializable):
    """
    Config for attacks
    """
    experiment_name: str
    """Name for the experiment"""
    base_model: str
    """Base model name"""
    dataset_member: str
    """Dataset source for members"""
    dataset_nonmember: str
    """Dataset source for nonmembers"""
    output_name: Optional[str] = None
    """Output name for sub-directory."""
    dataset_nonmember_other_sources: Optional[List[str]] = field(
        default_factory=lambda: None
    )
    """Dataset sources for nonmembers for which metrics will be computed, using the thresholds derived from the main member/nonmember datasets"""
    pretokenized: Optional[bool] = False
    """Is the data already pretokenized"""
    revision: Optional[str] = None
    """Model revision to use"""
    presampled_dataset_member: Optional[str] = None
    """Path to presampled dataset source for members"""
    presampled_dataset_nonmember: Optional[str] = None
    """Path to presampled dataset source for non-members"""
    token_frequency_map: Optional[
        str
    ] = None  # TODO: Handling auxiliary data structures
    """Path to a pre-computed token frequency map"""
    dataset_key: Optional[str] = None
    """Dataset key"""
    specific_source: Optional[str] = None
    """Specific sub-source to focus on. Only valid for the_pile"""
    full_doc: Optional[bool] = False  # TODO: refactor full_doc design?
    """Determines whether MIA will be performed over the entire document or not"""
    max_substrs: Optional[int] = 20
    """If full_doc, determines the maximum number of sample substrings to evaluate on"""
    dump_cache: Optional[bool] = False
    """Dump data to cache? Exits program after dumping"""
    load_from_cache: Optional[bool] = False
    """Load data from cache?"""
    load_from_hf: Optional[bool] = True
    """Load data from HuggingFace?"""
    blackbox_attacks: Optional[List[str]] = field(
        default_factory=lambda: None
    )  # Can replace with "default" attacks if we want
    """List of attacks to evaluate"""
    tokenization_attack: Optional[bool] = False
    """Run tokenization attack?"""
    quantile_attack: Optional[bool] = False
    """Run quantile attack?"""
    n_samples: Optional[int] = 200
    """Number of records (member and non-member each) to run the attack(s) for"""
    max_tokens: Optional[int] = 512
    """Consider samples with at most these many tokens"""
    max_data: Optional[int] = 5_000
    """Maximum samples to load from data before processing. Helps with efficiency"""
    min_words: Optional[int] = 100
    """Consider documents with at least these many words"""
    max_words: Optional[int] = 200
    """Consider documents with at most these many words"""
    max_words_cutoff: Optional[bool] = True
    """Is max_words a selection criteria (False), or a cutoff added on text (True)?"""
    batch_size: Optional[int] = 50
    """Batch size"""
    chunk_size: Optional[int] = 20
    """Chunk size"""
    scoring_model_name: Optional[str] = None
    """Scoring model (if different from base model)"""
    top_k: Optional[int] = 40
    """Consider only top-k tokens"""
    do_top_k: Optional[bool] = False
    """Use top-k sampling?"""
    top_p: Optional[float] = 0.96
    """Use tokens (minimal set) with cumulative probability of <=top_p"""
    do_top_p: Optional[bool] = False
    """Use top-p sampling?"""
    pre_perturb_pct: Optional[float] = 0.0
    """Percentage of tokens to perturb before attack"""
    pre_perturb_span_length: Optional[int] = 5
    """Span length for pre-perturbation"""
    tok_by_tok: Optional[bool] = False
    """Process data token-wise?"""
    fpr_list: Optional[List[float]] = field(default_factory=lambda: [0.001, 0.01])
    """FPRs at which to compute TPR"""
    random_seed: Optional[int] = 0
    """Random seed"""
    ref_config: Optional[ReferenceConfig] = None
    """Reference model config"""
    recall_config: Optional[ReCaLLConfig] = None
    """ReCaLL attack config"""
    neighborhood_config: Optional[NeighborhoodConfig] = None
    """Neighborhood attack config"""
    env_config: Optional[EnvironmentConfig] = None
    """Environment config"""
    openai_config: Optional[OpenAIConfig] = None
    """OpenAI config"""

    def __post_init__(self):
        if self.dump_cache and (self.load_from_cache or self.load_from_hf):
            raise ValueError("Cannot dump and load cache at the same time")

        if self.neighborhood_config:
            if (
                self.neighborhood_config.dump_cache
                or self.neighborhood_config.load_from_cache
            ) and not (self.load_from_cache or self.dump_cache or self.load_from_hf):
                raise ValueError(
                    "Using dump/load for neighborhood cache without dumping/loading main cache does not make sense"
                )

            if self.neighborhood_config.dump_cache and (
                self.neighborhood_config.load_from_cache or self.load_from_hf
            ):
                raise ValueError(
                    "Cannot dump and load neighborhood cache at the same time"
                )

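A sketch of assembling an experiment config; the model ids and attack names below are placeholders, not a canonical list:

from mimir.config import ExperimentConfig, ReferenceConfig

config = ExperimentConfig(
    experiment_name="pile_mia_demo",
    base_model="EleutherAI/pythia-1.4b",  # placeholder model id
    dataset_member="the_pile",
    dataset_nonmember="the_pile",
    n_samples=200,
    blackbox_attacks=["loss", "ref"],     # placeholder attack names
    ref_config=ReferenceConfig(models=["gpt2"]),
)

# __post_init__ rejects contradictory cache flags; here dump_cache=True
# conflicts with the load_from_hf=True default:
try:
    ExperimentConfig(
        experiment_name="bad",
        base_model="gpt2",
        dataset_member="m",
        dataset_nonmember="n",
        dump_cache=True,
    )
except ValueError as err:
    print(err)  # Cannot dump and load cache at the same time

Since the class subclasses simple_parsing's Serializable, the same config can also be populated from the command line or a serialized file rather than constructed by hand.
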
Ancestors

  • simple_parsing.helpers.serialization.serializable.Serializable
  • simple_parsing.helpers.serialization.serializable.SerializableMixin

Class variables

var base_model : str

Base model name

var batch_size : Optional[int]

Batch size

var blackbox_attacks : Optional[List[str]]

List of attacks to evaluate

var chunk_size : Optional[int]

Chunk size

var dataset_key : Optional[str]

Dataset key

var dataset_member : str

Dataset source for members

var dataset_nonmember : str

Dataset source for nonmembers

var dataset_nonmember_other_sources : Optional[List[str]]

Dataset sources for nonmembers for which metrics will be computed, using the thresholds derived from the main member/nonmember datasets

var decode_into_subclasses : ClassVar[bool]
var do_top_k : Optional[bool]

Use top-k sampling?

var do_top_p : Optional[bool]

Use top-p sampling?

var dump_cache : Optional[bool]

Dump data to cache? Exits program after dumping

var env_config : Optional[EnvironmentConfig]

Environment config

var experiment_name : str

Name for the experiment

var fpr_list : Optional[List[float]]

FPRs at which to compute TPR

var full_doc : Optional[bool]

Determines whether MIA will be performed over the entire document or not

var load_from_cache : Optional[bool]

Load data from cache?

var load_from_hf : Optional[bool]

Load data from HuggingFace?

var max_data : Optional[int]

Maximum samples to load from data before processing. Helps with efficiency

var max_substrs : Optional[int]

If full_doc, determines the maximum number of sample substrings to evaluate on

var max_tokens : Optional[int]

Consider samples with at most this many tokens

var max_words : Optional[int]

Consider documents with at most this many words

var max_words_cutoff : Optional[bool]

Is max_words a selection criterion (False), or a cutoff applied to the text (True)?

var min_words : Optional[int]

Consider documents with at least this many words

var n_samples : Optional[int]

Number of records (member and non-member each) to run the attack(s) for

var neighborhood_config : Optional[NeighborhoodConfig]

Neighborhood attack config

var openai_config : Optional[OpenAIConfig]

OpenAI config

var output_name : Optional[str]

Output name for sub-directory.

var pre_perturb_pct : Optional[float]

Percentage of tokens to perturb before attack

var pre_perturb_span_length : Optional[int]

Span length for pre-perturbation

var presampled_dataset_member : Optional[str]

Path to presampled dataset source for members

var presampled_dataset_nonmember : Optional[str]

Path to presampled dataset source for non-members

var pretokenized : Optional[bool]

Is the data already pretokenized

var quantile_attack : Optional[bool]

Run quantile attack?

var random_seed : Optional[int]

Random seed

var recall_config : Optional[ReCaLLConfig]

ReCaLL attack config

var ref_config : Optional[ReferenceConfig]

Reference model config

var revision : Optional[str]

Model revision to use

var scoring_model_name : Optional[str]

Scoring model (if different from base model)

var specific_source : Optional[str]

Specific sub-source to focus on. Only valid for the_pile

var tok_by_tok : Optional[bool]

Process data token-wise?

var token_frequency_map : Optional[str]

Path to a pre-computed token frequency map

var tokenization_attack : Optional[bool]

Run tokenization attack?

var top_k : Optional[int]

Consider only top-k tokens

var top_p : Optional[float]

Use the minimal set of tokens with cumulative probability <= top_p

class NeighborhoodConfig (model: str, n_perturbation_list: List[int] = <factory>, dump_cache: Optional[bool] = False, load_from_cache: Optional[bool] = False, original_tokenization_swap: Optional[bool] = True, pct_swap_bert: Optional[float] = 0.05, neighbor_strategy: Optional[str] = 'deterministic', span_length: Optional[int] = 2, random_fills_tokens: Optional[bool] = False, random_fills: Optional[bool] = False, pct_words_masked: Optional[float] = 0.3, buffer_size: Optional[int] = 1, top_p: Optional[float] = 1.0, max_tries: Optional[int] = 100, ceil_pct: Optional[bool] = False)

Config for neighborhood attack

@dataclass
class NeighborhoodConfig(Serializable):
    """
    Config for neighborhood attack
    """
    model: str
    """Mask-filling model"""
    n_perturbation_list: List[int] = field(default_factory=lambda: [1, 10])
    """List of n_neighbors to try."""
    dump_cache: Optional[bool] = False
    "Dump neighbors data to cache? Exits program after dumping"
    load_from_cache: Optional[bool] = False
    """Load neighbors data from cache?"""
    # BERT-specific param
    original_tokenization_swap: Optional[bool] = True
    """Swap out token in original text with neighbor token, instead of re-generating text"""
    pct_swap_bert: Optional[float] = 0.05
    """Percentage of tokens per neighbor that are different from the original text"""
    neighbor_strategy: Optional[str] = "deterministic"
    """Strategy for generating neighbors. One of ['deterministic', 'random']. Deterministic uses only one-word neighbors"""
    # T-5 specific hyper-parameters
    span_length: Optional[int] = 2
    """Span length for neighborhood attack"""
    random_fills_tokens: Optional[bool] = False
    """Randomly fill tokens?"""
    random_fills: Optional[bool] = False
    """Randomly fill?"""
    pct_words_masked: Optional[float] = 0.3
    """Percentage masked is actually pct_words_masked * (span_length / (span_length + 2 * buffer_size))"""
    buffer_size: Optional[int] = 1
    """Buffer size"""
    top_p: Optional[float] = 1.0
    """Use tokens (minimal set) with cumulative probability of <=top_p"""
    max_tries: Optional[int] = 100
    """Maximum number of trials in finding replacements for masked tokens"""
    ceil_pct: Optional[bool] = False
    """Apply ceil operation on span length calculation?"""

    def __post_init__(self):
        if self.dump_cache and self.load_from_cache:
            raise ValueError("Cannot dump and load cache at the same time")

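A short sketch of the effective masking rate implied by the pct_words_masked docstring (the mask-filling model id is a placeholder):

from mimir.config import NeighborhoodConfig

nb = NeighborhoodConfig(
    model="t5-base",  # placeholder mask-filling model
    n_perturbation_list=[1, 10, 25],
    span_length=2,
    buffer_size=1,
    pct_words_masked=0.3,
)

# Effective rate = pct_words_masked * (span_length / (span_length + 2 * buffer_size))
effective = nb.pct_words_masked * (
    nb.span_length / (nb.span_length + 2 * nb.buffer_size)
)
print(f"{effective:.3f}")  # 0.3 * (2 / 4) = 0.150
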
Ancestors

  • simple_parsing.helpers.serialization.serializable.Serializable
  • simple_parsing.helpers.serialization.serializable.SerializableMixin

Class variables

var buffer_size : Optional[int]

Buffer size

var ceil_pct : Optional[bool]

Apply ceil operation on span length calculation?

var decode_into_subclasses : ClassVar[bool]
var dump_cache : Optional[bool]

Dump neighbors data to cache? Exits program after dumping

var load_from_cache : Optional[bool]

Load neighbors data from cache?

var max_tries : Optional[int]

Maximum number of trials in finding replacements for masked tokens

var model : str

Mask-filling model

var n_perturbation_list : List[int]

List of n_neighbors to try.

var neighbor_strategy : Optional[str]

Strategy for generating neighbors. One of ['deterministic', 'random']. Deterministic uses only one-word neighbors

var original_tokenization_swap : Optional[bool]

Swap out token in original text with neighbor token, instead of re-generating text

var pct_swap_bert : Optional[float]

Percentage of tokens per neighbor that are different from the original text

var pct_words_masked : Optional[float]

Percentage masked is actually pct_words_masked * (span_length / (span_length + 2 * buffer_size))

var random_fills : Optional[bool]

Randomly fill?

var random_fills_tokens : Optional[bool]

Randomly fill tokens?

var span_length : Optional[int]

Span length for neighborhood attack

var top_p : Optional[float]

Use the minimal set of tokens with cumulative probability <= top_p

class OpenAIConfig (key: str, model: str)

Config for OpenAI calls

@dataclass
class OpenAIConfig(Serializable):
    """
    Config for OpenAI calls
    """
    key: str
    """OpenAI API key"""
    model: str
    """Model name"""

Ancestors

  • simple_parsing.helpers.serialization.serializable.Serializable
  • simple_parsing.helpers.serialization.serializable.SerializableMixin

Class variables

var decode_into_subclasses : ClassVar[bool]
var key : str

OpenAI API key

var model : str

Model name

class ReCaLLConfig (num_shots: Optional[int] = 1)

Config for ReCaLL attack

@dataclass
class ReCaLLConfig(Serializable):
    """
    Config for ReCaLL attack
    """
    num_shots: Optional[int] = 1
    """Number of shots for ReCaLL Attacks"""

Ancestors

  • simple_parsing.helpers.serialization.serializable.Serializable
  • simple_parsing.helpers.serialization.serializable.SerializableMixin

Class variables

var decode_into_subclasses : ClassVar[bool]
var num_shots : Optional[int]

Number of shots for the ReCaLL attack

class ReferenceConfig (models: List[str])

Config for attacks that use reference models.

@dataclass
class ReferenceConfig(Serializable):
    """
    Config for attacks that use reference models.
    """
    models: List[str]
    """Reference model names"""

Ancestors

  • simple_parsing.helpers.serialization.serializable.Serializable
  • simple_parsing.helpers.serialization.serializable.SerializableMixin

Class variables

var decode_into_subclasses : ClassVar[bool]
var models : List[str]

Reference model names
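
Because every class here subclasses simple_parsing's Serializable, configs round-trip cleanly between objects and dictionaries. A stdlib-only sketch (Serializable's own to_dict()/save() helpers are assumed to be available but are not shown):

from dataclasses import asdict

from mimir.config import ReferenceConfig

ref = ReferenceConfig(models=["gpt2", "distilgpt2"])
d = asdict(ref)                     # {'models': ['gpt2', 'distilgpt2']}
assert ReferenceConfig(**d) == ref  # dataclass equality holds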