arxiv-rag-mvp / huggingface_dataset_manager.py
donb-hf's picture
update services
84deff7
raw
history blame contribute delete
No virus
1.48 kB
# huggingface_dataset_manager.py
from datasets import load_dataset, Dataset
from typing import List, Dict, Any
import logging
# Configure the root logger once at import time: INFO and above, each record
# prefixed with a timestamp and its level name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
class HuggingFaceDatasetManager:
    """Append batches of records to a dataset stored on the Hugging Face Hub.

    Both public methods load one split of ``dataset_name``, append the given
    records as new rows, and push the combined split back to the same
    repository. Failures are logged and swallowed (best-effort), so callers
    never see an exception from these methods.
    """

    def __init__(self, dataset_name: str, split: str = "train"):
        """Remember which Hub dataset (and which split of it) to maintain.

        Args:
            dataset_name: Repository id passed to ``load_dataset`` and
                ``push_to_hub``.
            split: Name of the split that new rows are appended to.
        """
        self.dataset_name = dataset_name
        self.split = split

    def persist_to_dataset(self, metadata_list: List[Dict[str, Any]]):
        """Append ``metadata_list`` to the Hub dataset and push the result.

        Args:
            metadata_list: Records to append. The column names are taken from
                the first record, so every record must contain those keys.

        Returns:
            None. An empty list is logged as a warning and skipped; any error
            while loading, merging, or pushing is logged and not re-raised.
        """
        if not metadata_list:
            logging.warning("No metadata to persist.")
            return
        try:
            self._append_and_push(metadata_list)
            logging.info(f"Updated and pushed dataset: {self.dataset_name}")
        except Exception as e:
            # Best-effort by design: report the failure, do not crash callers.
            logging.error(f"Error persisting to dataset: {str(e)}")

    def update_dataset(self, new_data: List[Dict[str, Any]]):
        """Append ``new_data`` to the Hub dataset and push the result.

        Args:
            new_data: Records to append. The column names are taken from the
                first record, so every record must contain those keys.

        Returns:
            None. An empty list is logged as a warning and skipped; any error
            while loading, merging, or pushing is logged and not re-raised.
        """
        if not new_data:
            # Skip before touching the Hub: there is nothing to append, and
            # indexing new_data[0] below would otherwise raise IndexError.
            logging.warning("No data to update.")
            return
        try:
            self._append_and_push(new_data)
            logging.info(f"Updated Hugging Face dataset: {self.dataset_name}")
        except Exception as e:
            # Best-effort by design: report the failure, do not crash callers.
            logging.error(f"Error updating Hugging Face dataset: {str(e)}")

    def _append_and_push(self, records: List[Dict[str, Any]]) -> None:
        """Load the configured split, append ``records``, push it back.

        ``records`` must be non-empty. Exceptions propagate to the caller.
        """
        # Local import keeps this class self-contained with respect to the
        # module's existing top-level imports.
        from datasets import concatenate_datasets

        # Requesting an explicit split yields a single Dataset; without it
        # load_dataset returns a DatasetDict, which cannot take new rows.
        existing = load_dataset(self.dataset_name, split=self.split)

        # Pivot the list of row dicts into the column mapping from_dict expects.
        columns = {key: [record[key] for record in records] for key in records[0]}
        addition = Dataset.from_dict(columns)

        # add_item only accepts one row dict; concatenation appends a batch.
        # NOTE(review): assumes the new columns are compatible with the
        # existing schema; otherwise concatenation raises and is logged.
        combined = concatenate_datasets([existing, addition])
        combined.push_to_hub(self.dataset_name, split=self.split)