Module mimir.custom_datasets
Helper functions for data processing (ultimately used for membership inference evaluation).
Functions
def dump_to_cache(data: List, cache_dir, path, filename: str, min_length: int, max_length: int, n_samples: int, max_tokens: int)
-
Cache a file (one sample per line)
def load(name, cache_dir, **kwargs)
def load_cached(cache_dir, data_split: str, filename: str, min_length: int, max_length: int, n_samples: int, max_tokens: int, load_from_hf: bool = False)
-
Read from cache if available. Used for certain Pile sources and XSum to ensure fairness in comparison across attacks/runs.
def load_data(file_path)
-
Load data from a given filepath (.jsonl)
def load_english(cache_dir)
def load_german(cache_dir)
def load_language(language, cache_dir)
def load_pubmed(cache_dir)
def load_writing(cache_dir=None)
def process_prompt(prompt)
def process_spaces(story)
def save_data(file_path, data)