# this is the configuration file for the GROBID instance
grobid:
# where all the Grobid resources are stored (models, lexicon, native libraries, etc.), normally no need to change
grobidHome: "grobid-home"
# path relative to the grobid-home path (e.g. tmp for grobid-home/tmp) or absolute path (/tmp)
temp: "tmp"
# normally nothing to change here, path relative to the grobid-home path (e.g. grobid-home/lib)
nativelibrary: "lib"
pdf:
pdfalto:
# path relative to the grobid-home path (e.g. grobid-home/pdfalto), you don't want to change this normally
path: "pdfalto"
# security for PDF parsing
memoryLimitMb: 6096
timeoutSec: 120
# security relative to the PDF parsing result
blocksMax: 200000
tokensMax: 1000000
consolidation:
# define the bibliographical data consolidation service to be used, either "crossref" for CrossRef REST API or
# "glutton" for https://github.com/kermitt2/biblio-glutton
#service: "crossref"
service: "glutton"
glutton:
url: "http://sciencialab.ddns.net/glutton"
#url: "http://localhost:8080"
crossref:
mailto: luca@sciencialab.com
# to use the crossref web API politely, you normally need to indicate a contact email address here, e.g.
#mailto: "toto@titi.tutu"
token:
# to use the Crossref Metadata Plus service (available by subscription)
#token: "yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere"
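# Illustration only: switching consolidation from glutton to the CrossRef REST API would amount to
# something along these lines (the token is only needed for the subscription-based Metadata Plus
# service; the email address shown is a placeholder):
#   service: "crossref"
#   crossref:
#     mailto: "you@example.org"
#     token: "your-metadata-plus-token"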
proxy:
# proxy to be used when making external calls to the consolidation service (see the commented example below)
host:
port:
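# Illustration only (hypothetical values): if the consolidation calls have to go through an HTTP proxy,
# host and port would be filled in along these lines:
#   host: "proxy.example.org"
#   port: 3128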
# CORS configuration for the GROBID web API service
corsAllowedOrigins: "*"
corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"
# the actual implementation to be used for language recognition
languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory"
# the actual implementation to be used for optional sentence segmentation (PragmaticSegmenter or OpenNLP)
sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory"
# sentenceDetectorFactory: "org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory"
# maximum concurrency allowed to the GROBID server for processing parallel requests - change it according to your CPU/GPU capacities
# for a production server running only GROBID, set the value slightly above the number of threads available on the server
# to get the best performance and security
concurrency: 10
# when the pool is full, this is the maximum time (in seconds) a query will wait for a Grobid engine to become
# available - normally never change it
poolMaxWait: 1
delft:
# DeLFT global parameters
# DeLFT installation path, used when Deep Learning architectures implement one of the sequence labeling models;
# embeddings are usually compiled as lmdb under delft/data (this parameter is ignored if only feature-engineered CRF models are used)
install: "../delft"
pythonVirtualEnv:
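# Illustration only (hypothetical paths): when DeLFT runs inside a Python virtual environment,
# these two entries would typically be set together, for example:
#   install: "../delft"
#   pythonVirtualEnv: "../delft/env"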
wapiti:
# Wapiti global parameters
# number of threads for training the wapiti models (0 to use all available processors)
nbThreads: 0
models:
# we configure here how each sequence labeling model should be implemented
# for feature-engineered CRF, use "wapiti"; possible training parameters are window, epsilon and nbMaxIterations
# for Deep Learning, use "delft" and select the target DL architecture (see the DeLFT library); the training
# parameters then depend on the selected DL architecture
- name: "segmentation"
# at this time, this must always be CRF Wapiti: the input sequence size is too large for a Deep Learning implementation
engine: "wapiti"
#engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.0000001
window: 50
nbMaxIterations: 2000
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
useELMo: false
runtime:
# parameters used at runtime/prediction
max_sequence_length: 3000
batch_size: 1
training:
# parameters used for training
max_sequence_length: 3000
batch_size: 10
- name: "fulltext"
# at this time, this must always be CRF Wapiti: the input sequence size is too large for a Deep Learning implementation
engine: "wapiti"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.0001
window: 20
nbMaxIterations: 1500
- name: "header"
#engine: "wapiti"
engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.000001
window: 30
nbMaxIterations: 1500
delft:
# deep learning parameters
architecture: "BidLSTM_ChainCRF_FEATURES"
#transformer: "allenai/scibert_scivocab_cased"
useELMo: false
runtime:
# parameters used at runtime/prediction
#max_sequence_length: 510
max_sequence_length: 3000
batch_size: 1
training:
# parameters used for training
#max_sequence_length: 510
#batch_size: 6
max_sequence_length: 3000
batch_size: 9
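# Illustration only: based on the commented values above, running the header model with a transformer
# instead of the RNN architecture would look roughly like this (the sequence length must stay within
# the transformer's 512-token window; values are indicative):
#   delft:
#     architecture: "BERT_CRF"
#     transformer: "allenai/scibert_scivocab_cased"
#     runtime:
#       max_sequence_length: 510
#       batch_size: 6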
- name: "reference-segmenter"
#engine: "wapiti"
engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 20
delft:
# deep learning parameters
architecture: "BidLSTM_ChainCRF_FEATURES"
useELMo: false
runtime:
# parameters used at runtime/prediction (for this model, use the same max_sequence_length as for training)
max_sequence_length: 3000
batch_size: 2
training:
# parameters used for training
max_sequence_length: 3000
batch_size: 10
- name: "name-header"
engine: "wapiti"
#engine: "delft"
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
- name: "name-citation"
engine: "wapiti"
#engine: "delft"
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
- name: "date"
engine: "wapiti"
#engine: "delft"
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
- name: "figure"
engine: "wapiti"
#engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 20
delft:
# deep learning parameters
architecture: "BidLSTM_CRF"
- name: "table"
engine: "wapiti"
#engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 20
delft:
# deep learning parameters
architecture: "BidLSTM_CRF"
- name: "affiliation-address"
#engine: "wapiti"
engine: "delft"
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
- name: "citation"
#engine: "wapiti"
engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 50
nbMaxIterations: 3000
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
#architecture: "BERT_CRF"
#transformer: "michiyasunaga/LinkBERT-base"
useELMo: false
runtime:
# parameters used at runtime/prediction
max_sequence_length: 500
batch_size: 30
training:
# parameters used for training
max_sequence_length: 500
batch_size: 50
- name: "patent-citation"
engine: "wapiti"
#engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.0001
window: 20
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
#architecture: "BERT_CRF"
runtime:
# parameters used at runtime/prediction
max_sequence_length: 800
batch_size: 20
training:
# parameters used for training
max_sequence_length: 1000
batch_size: 40
- name: "funding-acknowledgement"
engine: "wapiti"
#engine: "delft"
wapiti:
# wapiti training parameters, they will be used at training time only
epsilon: 0.00001
window: 50
nbMaxIterations: 2000
delft:
# deep learning parameters
architecture: "BidLSTM_CRF_FEATURES"
#architecture: "BERT_CRF"
#transformer: "michiyasunaga/LinkBERT-base"
useELMo: false
runtime:
# parameters used at runtime/prediction
max_sequence_length: 800
batch_size: 20
training:
# parameters used for training
max_sequence_length: 500
batch_size: 40
- name: "copyright"
# at this time, only a DeLFT implementation is available;
# use "wapiti" if the deep learning library JNI is not available, and the model will then simply be ignored
#engine: "delft"
engine: "wapiti"
delft:
# deep learning parameters
architecture: "gru"
#architecture: "bert"
#transformer: "allenai/scibert_scivocab_cased"
- name: "license"
# at this time, this model must use DeLFT to be active; no other implementation is available
# use "wapiti" if the deep learning library JNI is not available, and the model will then simply be ignored
#engine: "delft"
engine: "wapiti"
delft:
# deep learning parameters
architecture: "gru"
#architecture: "bert"
#transformer: "allenai/scibert_scivocab_cased"
# for **service only**: how to load the models
# false -> models are loaded only when needed, which avoids keeping unused models in memory (only in the case of CRF)
# but significantly slows down the service at the first call
# true -> all the models are loaded into memory at server startup (default); this slows the start of the service
# and unused models take some more memory (only in the case of CRF), but the server is immediately warm and ready
modelPreload: true
server:
type: custom
applicationConnectors:
- type: http
port: 8070
adminConnectors:
- type: http
port: 8071
registerDefaultExceptionMappers: false
# change the following to have all HTTP requests logged (an example appender is sketched below)
requestLog:
appenders: []
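# Illustration only: to actually log every HTTP request to the console, the empty appender list above
# could be replaced by a standard Dropwizard appender declaration, e.g.:
#   requestLog:
#     appenders:
#       - type: console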
# these logging settings apply to the Grobid service usage mode
logging:
level: INFO
loggers:
org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF"
org.glassfish.jersey.internal: "OFF"
com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF"
appenders:
- type: console
threshold: INFO
timeZone: UTC
# uncomment to have the logs in json format
# layout:
# type: json