oroszgy's picture
refactor(keyphrases): moving examples around
9193ed5 unverified
raw
history blame contribute delete
No virus
815 Bytes
import math
from pathlib import Path
from typing import Dict, List
import spacy
from spacy import Language
# Shared spaCy pipeline, loaded once as a side effect of importing this module.
# NOTE(review): "hu_core_news_trf" is presumably a Hungarian transformer model
# that has to be installed separately -- confirm against the project setup.
NLP: Language = spacy.load("hu_core_news_trf")
def _compute_idf(freq_file: Path) -> Dict[str, float]:
    """Build a word -> IDF-style weight table from a frequency list.

    Every non-blank line of ``freq_file`` is split on whitespace; the first
    token is taken as the word and the last token as its integer frequency.
    If a word occurs on several lines, the last occurrence wins.

    The weight of a word is ``log2(max_freq / (freq + 1)) + 1``, where
    ``max_freq`` is the frequency recorded for the word ``"a"``. When the
    list has no entry for ``"a"``, the largest frequency in the list is used
    instead.

    Args:
        freq_file: Path to the frequency list, read as UTF-8 text.

    Returns:
        Mapping from each word to its weight; empty when the file contains
        no entries.

    Raises:
        ValueError: If the last token of a non-blank line is not an integer.
    """
    freqs: Dict[str, int] = {}
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with freq_file.open(encoding="utf-8") as lines:
        for line in lines:
            data: List[str] = line.split()
            if not data:
                continue  # blank or whitespace-only line
            # NOTE(review): the original guarded this store with
            # `if not line.isalpha()`, which is always true once the count
            # has parsed (the line then contains a digit). It may have been
            # meant as a filter on the word itself -- confirm before adding one.
            freqs[data[0]] = int(data[-1])

    if not freqs:
        return {}

    # "a" is used as the reference (maximum) frequency; fall back to the real
    # maximum when the list does not contain it.
    max_freq: int = freqs["a"] if "a" in freqs else max(freqs.values())
    return {
        word: math.log2(max_freq / (float(freq) + 1)) + 1
        for word, freq in freqs.items()
    }
# Word -> IDF weight table, computed once at import time from
# "resources/freq.list" under the parent of this file's directory.
IDF: Dict[str, float] = _compute_idf(Path(__file__).parent.parent / "resources" / "freq.list")