import logging

from datasets import load_dataset

logger = logging.getLogger(__name__)


class DataLoader:
    """Thin wrapper around ``datasets.load_dataset`` for the MS MARCO v2.1 data."""

    def __init__(self, cache_dir: str = "./cache") -> None:
        """Remember where downloaded datasets should be cached.

        Args:
            cache_dir: Directory handed to ``load_dataset`` as its cache
                location.
        """
        self.cache_dir = cache_dir

    def load_msmarco_passage(self, split: str = "train"):
        """Load the ``ms_marco`` dataset, configuration ``v2.1``, from Hugging Face.

        Args:
            split: Split name forwarded unchanged to ``load_dataset``.

        Returns:
            Whatever ``load_dataset`` returns for the requested split.

        Raises:
            Exception: Any error raised by ``load_dataset`` is logged once
                and re-raised unchanged.
        """
        logger.info(
            "Downloading MS MARCO Passage Ranking %s (v2.1) from Hugging Face",
            split,
        )
        try:
            # Forward the configured cache directory; previously it was stored
            # on the instance but never used, so the setting had no effect.
            return load_dataset(
                "ms_marco",
                "v2.1",
                split=split,
                cache_dir=self.cache_dir,
            )
        except Exception as e:
            logger.error("Failed to load MS MARCO Passage Ranking: %s", e)
            raise

    def get_passage_dataset(self, split: str = "train"):
        """Load the MS MARCO dataset and log a message on success.

        Args:
            split: Split name forwarded to ``load_msmarco_passage``.

        Returns:
            The dataset object returned by ``load_msmarco_passage``.

        Raises:
            Exception: Propagated unchanged from ``load_msmarco_passage``.
        """
        # No try/except here: load_msmarco_passage already logs the failure
        # before re-raising, and catching it again only duplicated the record.
        ds = self.load_msmarco_passage(split)
        logger.info("MS MARCO Passage Ranking loaded successfully")
        return ds