# this is the configuration file for the GROBID instance
grobid:
# where all the Grobid resources are stored (models, lexicon, native libraries, etc.), normally no need to change
grobidHome: "grobid-home"
# path relative to the grobid-home path (e.g. tmp for grobid-home/tmp) or absolute path (/tmp)
temp: "tmp"
# normally nothing to change here, path relative to the grobid-home path (e.g. grobid-home/lib)
nativelibrary: "lib"
pdf:
pdfalto:
# path relative to the grobid-home path (e.g. grobid-home/pdfalto), you don't want to change this normally
path: "pdfalto"
# security for PDF parsing
memoryLimitMb: 6096
timeoutSec: 120
# security relative to the PDF parsing result
blocksMax: 200000
tokensMax: 1000000
consolidation:
# define the bibliographical data consolidation service to be used, either "crossref" for CrossRef REST API or
# "glutton" for https://github.com/kermitt2/biblio-glutton
#service: "crossref"
service: "glutton"
glutton:
url: "http://sciencialab.ddns.net/glutton"
#url: "http://localhost:8080"
crossref:
mailto: luca@sciencialab.com
# to use the crossref web API politely, you normally need to indicate a contact email address here, e.g.
#mailto: "toto@titi.tutu"
token:
# to use the Crossref Metadata Plus service (available by subscription)
#token: "yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere"
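# Illustration only: switching consolidation from glutton to the CrossRef REST API would amount to
# something along these lines (the token is only needed for the subscription-based Metadata Plus
# service; the email address shown is a placeholder):
#   service: "crossref"
#   crossref:
#     mailto: "you@example.org"
#     token: "your-metadata-plus-token"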
proxy:
# proxy to be used when making external calls to the consolidation service (see the commented example below)
host:
port:
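# Illustration only (hypothetical values): if the consolidation calls have to go through an HTTP proxy,
# host and port would be filled in along these lines:
#   host: "proxy.example.org"
#   port: 3128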
# CORS configuration for the GROBID web API service
corsAllowedOrigins: "*"
corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"
# the actual implementation to be used for language recognition
languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory"
# the actual implementation to be used for optional sentence segmentation (PragmaticSegmenter or OpenNLP)
sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory"
# sentenceDetectorFactory: "org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory"
# maximum concurrency allowed to the GROBID server for processing parallel requests - change it according to your CPU/GPU capacities
# for a production server running only GROBID, set the value slightly above the number of threads available on the server
# to get the best performance and security
concurrency: 10
# when the pool is full, this is the maximum time (in seconds) a query will wait for a Grobid engine to become
# available - normally never change it
poolMaxWait: 1
delft:
# DeLFT global parameters
# DeLFT installation path, used when Deep Learning architectures implement one of the sequence labeling models;
# embeddings are usually compiled as lmdb under delft/data (this parameter is ignored if only feature-engineered CRF models are used)
install: "../delft"
pythonVirtualEnv:
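# Illustration only (hypothetical paths): when DeLFT runs inside a Python virtual environment,
# these two entries would typically be set together, for example:
#   install: "../delft"
#   pythonVirtualEnv: "../delft/env"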
wapiti:
# Wapiti global parameters
# number of threads for training the wapiti models (0 to use all available processors)
nbThreads: 0
models:
# we configure here how each sequence labeling model should be implemented
# for feature-engineered CRF, use "wapiti"; possible training parameters are window, epsilon and nbMaxIterations
# for Deep Learning, use "delft" and select the target DL architecture (see the DeLFT library); the training
# parameters then depend on the selected DL architecture
- name: "segmentation"
# at this time, this must always be CRF Wapiti: the input sequence size is too large for a Deep Learning implementation
engine: "wapiti"
#engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.0000001
window: 50
nbMaxIterations: 2000
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
useELMo: false
runtime:
# parameters used at runtime/prediction
max_sequence_length: 3000
batch_size: 1
training:
# parameters used for training
max_sequence_length: 3000
batch_size: 10
- name: "fulltext"
# at this time, this must always be CRF Wapiti: the input sequence size is too large for a Deep Learning implementation
engine: "wapiti"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.0001
window: 20
nbMaxIterations: 1500
- name: "header"
#engine: "wapiti"
engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.000001
window: 30
nbMaxIterations: 1500
delft:
# deep learning parameters
architecture: "BidLSTM_ChainCRF_FEATURES"
#transformer: "allenai/scibert_scivocab_cased"
useELMo: false
runtime:
# parameters used at runtime/prediction
#max_sequence_length: 510
max_sequence_length: 3000
batch_size: 1
training:
# parameters used for training
#max_sequence_length: 510
#batch_size: 6
max_sequence_length: 3000
batch_size: 9
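# Illustration only: based on the commented values above, running the header model with a transformer
# instead of the RNN architecture would look roughly like this (the sequence length must stay within
# the transformer's 512-token window; values are indicative):
#   delft:
#     architecture: "BERT_CRF"
#     transformer: "allenai/scibert_scivocab_cased"
#     runtime:
#       max_sequence_length: 510
#       batch_size: 6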
- name: "reference-segmenter"
#engine: "wapiti"
engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 20
delft:
# deep learning parameters
architecture: "BidLSTM_ChainCRF_FEATURES"
useELMo: false
runtime:
# parameters used at runtime/prediction (for this model, use the same max_sequence_length as for training)
max_sequence_length: 3000
batch_size: 2
training:
# parameters used for training
max_sequence_length: 3000
batch_size: 10
- name: "name-header"
engine: "wapiti"
#engine: "delft"
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
- name: "name-citation"
engine: "wapiti"
#engine: "delft"
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
- name: "date"
engine: "wapiti"
#engine: "delft"
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
- name: "figure"
engine: "wapiti"
#engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 20
delft:
# deep learning parameters
architecture: "BidLSTM_CRF"
- name: "table"
engine: "wapiti"
#engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 20
delft:
# deep learning parameters
architecture: "BidLSTM_CRF"
- name: "affiliation-address"
#engine: "wapiti"
engine: "delft"
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
- name: "citation"
#engine: "wapiti"
engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 50
nbMaxIterations: 3000
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
#architecture: "BERT_CRF"
#transformer: "michiyasunaga/LinkBERT-base"
useELMo: false
runtime:
# parameters used at runtime/prediction
max_sequence_length: 500
batch_size: 30
training:
# parameters used for training
max_sequence_length: 500
batch_size: 50
- name: "patent-citation"
engine: "wapiti"
#engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.0001
window: 20
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
#architecture: "BERT_CRF"
runtime:
# parameters used at runtime/prediction
max_sequence_length: 800
batch_size: 20
training:
# parameters used for training
max_sequence_length: 1000
batch_size: 40
- name: "funding-acknowledgement"
engine: "wapiti"
#engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 50
nbMaxIterations: 2000
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
#architecture: "BERT_CRF"
#transformer: "michiyasunaga/LinkBERT-base"
useELMo: false
runtime:
# parameters used at runtime/prediction
max_sequence_length: 800
batch_size: 20
training:
# parameters used for training
max_sequence_length: 500
batch_size: 40
- name: "copyright"
# at this time, only a DeLFT implementation is available;
# use "wapiti" if the deep learning library JNI is not available, and the model will then simply be ignored
#engine: "delft"
engine: "wapiti"
delft:
# deep learning parameters
architecture: "gru"
#architecture: "bert"
#transformer: "allenai/scibert_scivocab_cased"
- name: "license"
# at this time, this model must use DeLFT to be active; no other implementation is available
# use "wapiti" if the deep learning library JNI is not available, and the model will then simply be ignored
#engine: "delft"
engine: "wapiti"
delft:
# deep learning parameters
architecture: "gru"
#architecture: "bert"
#transformer: "allenai/scibert_scivocab_cased"
# for **service only**: how to load the models
# false -> models are loaded only when needed, which avoids keeping unused models in memory (only in the case of CRF)
# but significantly slows down the service at the first call
# true -> all the models are loaded into memory at server startup (default); this slows the start of the service
# and unused models take some more memory (only in the case of CRF), but the server is immediately warm and ready
modelPreload: true
server:
type: custom
applicationConnectors:
- type: http
port: 8070
adminConnectors:
- type: http
port: 8071
registerDefaultExceptionMappers: false
# change the following to have all HTTP requests logged (an example appender is sketched below)
requestLog:
appenders: []
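# Illustration only: to actually log every HTTP request to the console, the empty appender list above
# could be replaced by a standard Dropwizard appender declaration, e.g.:
#   requestLog:
#     appenders:
#       - type: console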
# these logging settings apply to the Grobid service usage mode
logging:
level: INFO
loggers:
org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF"
org.glassfish.jersey.internal: "OFF"
com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF"
appenders:
- type: console
threshold: INFO
timeZone: UTC
# uncomment to have the logs in json format
# layout:
# type: json