initial commit

Files added in this commit:
- .gitignore (+3 lines)
- .python-version (+1 line)
- app.py (+26 lines)
- arxiv_fetcher.py (+38 lines)
- arxiv_metadata_service.py (+36 lines)
- config.py (+15 lines)
- requirements.txt (+3 lines)
.gitignore
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
.venv/
|
2 |
+
.env
|
3 |
+
__pycache__/
|
.python-version
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
3.10
|
app.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from arxiv_metadata_service import ArxivMetadataService
|
3 |
+
import traceback
|
4 |
+
|
5 |
+
# One service instance shared by every Gradio request.
arxiv_service = ArxivMetadataService()


def extract_metadata(query: str, max_results: int):
    """Run the extraction pipeline; on failure, return the error text instead of raising."""
    try:
        return arxiv_service.extract_and_update(query, max_results)
    except Exception as e:
        # Surface the full traceback in the UI so failures are debuggable.
        return f"An error occurred: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"


# UI definition: a query box plus a result-count slider, plain-text output.
_inputs = [
    gr.Textbox(label="ArXiv Query"),
    gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Max Results"),
]

demo = gr.Interface(
    fn=extract_metadata,
    inputs=_inputs,
    outputs="text",
    title="ArXiv Metadata Extractor",
    description="Extract metadata from ArXiv papers and update the dataset.",
)

if __name__ == "__main__":
    demo.launch()
|
arxiv_fetcher.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# arxiv_fetcher.py
|
2 |
+
|
3 |
+
import arxiv
|
4 |
+
from typing import List, Dict, Any
|
5 |
+
import logging
|
6 |
+
|
7 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def _paper_record(result) -> Dict[str, Any]:
    """Flatten one arxiv result object into a plain metadata dict."""
    short_id = result.entry_id.split('/')[-1]
    return {
        "title": result.title,
        "authors": [author.name for author in result.authors],
        "published": result.published.isoformat(),
        "updated": result.updated.isoformat(),
        "pdf_url": result.pdf_url,
        "entry_id": result.entry_id,
        "summary": result.summary,
        "categories": result.categories,
        "primary_category": result.primary_category,
        "html_url": f"http://arxiv.org/abs/{short_id}",
    }


def fetch_arxiv_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
    """Search arXiv for *query* and return up to *max_results* metadata dicts.

    A blank/whitespace-only query short-circuits to an empty list. API
    failures are logged, and any papers collected before the failure are
    still returned (best-effort, never raises).
    """
    logging.info(f"Fetching arXiv metadata for query: {query}")
    if not query.strip():
        logging.warning("Empty or whitespace-only query provided")
        return []

    # Polite client settings: paged requests, 3s delay, 3 retries.
    client = arxiv.Client(page_size=max_results, delay_seconds=3, num_retries=3)
    search = arxiv.Search(query=query, max_results=max_results, sort_by=arxiv.SortCriterion.SubmittedDate)

    results: List[Dict[str, Any]] = []
    try:
        for item in client.results(search):
            results.append(_paper_record(item))
        logging.info(f"Fetched metadata for {len(results)} papers")
    except Exception as e:
        logging.error(f"Error fetching metadata: {str(e)}")

    return results
|
arxiv_metadata_service.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from arxiv_fetcher import fetch_arxiv_metadata
|
2 |
+
from datasets import load_dataset, Dataset
|
3 |
+
from config import DATASET_NAME
|
4 |
+
import logging
|
5 |
+
from typing import List, Dict, Any
|
6 |
+
|
7 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
8 |
+
|
9 |
+
class ArxivMetadataService:
    """Fetch arXiv paper metadata and sync it into a Hugging Face dataset.

    BUG FIX: the original ``update_dataset`` matched papers on
    ``paper['id']``, but ``fetch_arxiv_metadata`` never produces an ``'id'``
    field — only ``'entry_id'`` — so every call raised KeyError (swallowed
    into a "Failed to update" message). Papers are now keyed on
    ``'entry_id'``. The merge also pads missing cells with ``None`` so all
    columns stay the same length, which ``Dataset.from_dict`` requires.
    """

    def extract_and_update(self, query: str, max_results: int = 10) -> str:
        """Fetch metadata for *query* and merge it into the hub dataset.

        Returns a human-readable status message (success or failure).
        """
        metadata_list = fetch_arxiv_metadata(query, max_results)
        return self.update_dataset(metadata_list)

    @staticmethod
    def _merge_metadata(current_data: Dict[str, List[Any]],
                        metadata_list: List[Dict[str, Any]]) -> Dict[str, List[Any]]:
        """Merge paper dicts into a column-oriented dict, keyed on 'entry_id'.

        Existing rows (matched on 'entry_id') are updated in place; new
        papers are appended. Cells with no value are filled with None so
        every column has the same length. Pure function — no I/O.
        """
        # Column dict -> list of row dicts (guard against ragged input columns).
        keys = list(current_data.keys())
        n_rows = len(next(iter(current_data.values()))) if current_data else 0
        rows = [
            {k: current_data[k][i] for k in keys if i < len(current_data[k])}
            for i in range(n_rows)
        ]
        index_by_id = {row.get('entry_id'): i for i, row in enumerate(rows)}

        for paper in metadata_list:
            paper_id = paper['entry_id']  # KeyError here is caught by update_dataset
            if paper_id in index_by_id:
                rows[index_by_id[paper_id]].update(paper)
            else:
                index_by_id[paper_id] = len(rows)
                rows.append(dict(paper))

        # Rows -> columns, preserving first-seen key order, padding with None.
        all_keys: List[str] = []
        for row in rows:
            for k in row:
                if k not in all_keys:
                    all_keys.append(k)
        return {k: [row.get(k) for row in rows] for k in all_keys}

    def update_dataset(self, metadata_list: List[Dict[str, Any]]) -> str:
        """Load the hub dataset, merge *metadata_list* in, and push it back.

        Never raises: any failure is logged and reported in the return string.
        """
        try:
            dataset = load_dataset(DATASET_NAME, split="train")
            current_data = dataset.to_dict()
            merged = self._merge_metadata(current_data, metadata_list)
            updated_dataset = Dataset.from_dict(merged)
            updated_dataset.push_to_hub(DATASET_NAME, split="train")
            return f"Successfully updated dataset with {len(metadata_list)} papers"
        except Exception as e:
            logging.error(f"Failed to update dataset: {str(e)}")
            return f"Failed to update dataset: {str(e)}"
|
config.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# File: config.py
import os

# Credentials — read from environment variables (kept out of the repo via .env).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
QDRANT_API_URL = os.getenv("QDRANT_API_URL")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")

# Storage targets.
COLLECTION_NAME = "arxiv_papers"
DATASET_NAME = "dwb2023/arxiv-papers-dataset"

# LangChain / LangSmith settings.
LANGCHAIN_PROJECT = "arxiv_papers"
LANGCHAIN_ENDPOINT = "https://api.smith.langchain.com"
LANGCHAIN_TRACING_V2 = "true"
LANGCHAIN_HUB_PROMPT = "rlm/rag-prompt-llama3"
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
arxiv
|
2 |
+
datasets
|
3 |
+
gradio
|