donb-hf committed
Commit
edd8809
1 Parent(s): 74a845d

initial commit

Files changed (7)
  1. .gitignore +3 -0
  2. .python-version +1 -0
  3. app.py +26 -0
  4. arxiv_fetcher.py +38 -0
  5. arxiv_metadata_service.py +36 -0
  6. config.py +15 -0
  7. requirements.txt +3 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+ .venv/
+ .env
+ __pycache__/
.python-version ADDED
@@ -0,0 +1 @@
+ 3.10
app.py ADDED
@@ -0,0 +1,26 @@
+ import gradio as gr
+ from arxiv_metadata_service import ArxivMetadataService
+ import traceback
+
+ arxiv_service = ArxivMetadataService()
+
+ def extract_metadata(query: str, max_results: int):
+     try:
+         return arxiv_service.extract_and_update(query, max_results)
+     except Exception as e:
+         error_msg = f"An error occurred: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
+         return error_msg
+
+ demo = gr.Interface(
+     fn=extract_metadata,
+     inputs=[
+         gr.Textbox(label="ArXiv Query"),
+         gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Max Results")
+     ],
+     outputs="text",
+     title="ArXiv Metadata Extractor",
+     description="Extract metadata from ArXiv papers and update the dataset."
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
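For reference, a minimal sketch of exercising this entry point outside the Gradio UI; the sample query is illustrative, not part of the commit:

    # Hypothetical smoke test: call the handler app.py wires into the UI.
    from app import extract_metadata

    # Any arXiv query string works here; "electron" is just an example.
    print(extract_metadata("electron", 2))

Because the handler wraps everything in try/except, failures (for example, missing Hub credentials) come back as an error string rather than raising.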
arxiv_fetcher.py ADDED
@@ -0,0 +1,38 @@
+ # arxiv_fetcher.py
+
+ import arxiv
+ from typing import List, Dict, Any
+ import logging
+
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ def fetch_arxiv_metadata(query: str, max_results: int = 10) -> List[Dict[str, Any]]:
+     logging.info(f"Fetching arXiv metadata for query: {query}")
+     if not query.strip():
+         logging.warning("Empty or whitespace-only query provided")
+         return []
+
+     client = arxiv.Client(page_size=max_results, delay_seconds=3, num_retries=3)
+     search = arxiv.Search(query=query, max_results=max_results, sort_by=arxiv.SortCriterion.SubmittedDate)
+
+     results = []
+     try:
+         for result in client.results(search):
+             metadata = {
+                 "title": result.title,
+                 "authors": [author.name for author in result.authors],
+                 "published": result.published.isoformat(),
+                 "updated": result.updated.isoformat(),
+                 "pdf_url": result.pdf_url,
+                 "entry_id": result.entry_id,
+                 "summary": result.summary,
+                 "categories": result.categories,
+                 "primary_category": result.primary_category,
+                 "html_url": f"http://arxiv.org/abs/{result.entry_id.split('/')[-1]}"
+             }
+             results.append(metadata)
+         logging.info(f"Fetched metadata for {len(results)} papers")
+     except Exception as e:
+         logging.error(f"Error fetching metadata: {str(e)}")
+
+     return results
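A sketch of using the fetcher on its own, assuming the arxiv package from requirements.txt is installed; the query below is illustrative:

    from arxiv_fetcher import fetch_arxiv_metadata

    # Returns a list of plain dicts, newest submissions first.
    papers = fetch_arxiv_metadata("cat:cs.CL AND ti:transformer", max_results=3)
    for paper in papers:
        print(paper["entry_id"], "-", paper["title"])

Note the function swallows fetch errors and returns whatever was collected, so an empty list can mean either no matches or a failed request; the log output distinguishes the two.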
arxiv_metadata_service.py ADDED
@@ -0,0 +1,36 @@
+ from arxiv_fetcher import fetch_arxiv_metadata
+ from datasets import load_dataset, Dataset
+ from config import DATASET_NAME
+ import logging
+ from typing import List, Dict, Any
+
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ class ArxivMetadataService:
+     def extract_and_update(self, query: str, max_results: int = 10) -> str:
+         metadata_list = fetch_arxiv_metadata(query, max_results)
+         return self.update_dataset(metadata_list)
+
+     def update_dataset(self, metadata_list: List[Dict[str, Any]]) -> str:
+         try:
+             dataset = load_dataset(DATASET_NAME, split="train")
+             current_data = dataset.to_dict()
+
+             for paper in metadata_list:
+                 if paper['entry_id'] not in current_data.get('entry_id', []):  # fetcher keys records by "entry_id", not "id"
+                     for key, value in paper.items():
+                         if key not in current_data:
+                             current_data[key] = []
+                         current_data[key].append(value)
+                 else:
+                     index = current_data['entry_id'].index(paper['entry_id'])
+                     for key, value in paper.items():
+                         current_data[key][index] = value
+
+             updated_dataset = Dataset.from_dict(current_data)
+             updated_dataset.push_to_hub(DATASET_NAME, split="train")
+
+             return f"Successfully updated dataset with {len(metadata_list)} papers"
+         except Exception as e:
+             logging.error(f"Failed to update dataset: {str(e)}")
+             return f"Failed to update dataset: {str(e)}"
config.py ADDED
@@ -0,0 +1,15 @@
+ # File: config.py
+ import os
+
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+ QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
+ QDRANT_API_URL = os.getenv("QDRANT_API_URL")
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+ COLLECTION_NAME = "arxiv_papers"
+ DATASET_NAME = "dwb2023/arxiv-papers-dataset"
+
+ LANGCHAIN_PROJECT = "arxiv_papers"
+ LANGCHAIN_ENDPOINT = "https://api.smith.langchain.com"
+ LANGCHAIN_TRACING_V2 = "true"
+ LANGCHAIN_HUB_PROMPT = "rlm/rag-prompt-llama3"
+ LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")
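One caveat: the LANGCHAIN_* lines above define ordinary module constants, while LangSmith tracing is normally switched on via process environment variables, so importing config alone would not enable it. A sketch of mirroring the constants into the environment, if that behavior is intended:

    import os
    import config

    # Export the tracing settings so LangChain picks them up at runtime.
    os.environ["LANGCHAIN_PROJECT"] = config.LANGCHAIN_PROJECT
    os.environ["LANGCHAIN_ENDPOINT"] = config.LANGCHAIN_ENDPOINT
    os.environ["LANGCHAIN_TRACING_V2"] = config.LANGCHAIN_TRACING_V2
    if config.LANGCHAIN_API_KEY:
        os.environ["LANGCHAIN_API_KEY"] = config.LANGCHAIN_API_KEY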
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ arxiv
+ datasets
+ gradio
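The three dependencies are left unpinned, so installs track the latest releases; a typical setup inside the .venv/ that .gitignore anticipates would be pip install -r requirements.txt.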