# this is the configuration file for the GROBID instance

grobid:
  # where all the Grobid resources are stored (models, lexicon, native libraries, etc.), normally no need to change
  grobidHome: "grobid-home"

  # path relative to the grobid-home path (e.g. tmp for grobid-home/tmp) or absolute path (/tmp)
  temp: "tmp"
  
  # normally nothing to change here, path relative to the grobid-home path (e.g. grobid-home/lib)
  nativelibrary: "lib"

  pdf:
    pdfalto:
      # path relative to the grobid-home path (e.g. grobid-home/pdfalto), you normally don't want to change this
      path: "pdfalto"
      # security for PDF parsing
      memoryLimitMb: 6096
      timeoutSec: 120

    # security relative to the PDF parsing result
    blocksMax: 200000
    tokensMax: 1000000

  consolidation:
    # define the bibliographical data consolidation service to be used, either "crossref" for CrossRef REST API or 
    # "glutton" for https://github.com/kermitt2/biblio-glutton
    #service: "crossref"
    service: "glutton"
    glutton:
      url: "http://sciencialab.ddns.net/glutton"
      #url: "http://localhost:8080" 
    crossref:
      mailto: luca@sciencialab.com
      # to use the crossref web API, you normally need to use it politely and indicate an email address here, e.g.
      #mailto: "toto@titi.tutu"
      token:
      # to use the Crossref Metadata Plus service (available by subscription)
      #token: "yourmysteriouscrossrefmetadataplusauthorizationtokentobeputhere"

  proxy:
    # proxy to be used when doing external calls to the consolidation service
    host: 
    port: 
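    # example (hypothetical values, not part of the default config): a proxy would be declared roughly like this
    #host: "proxy.example.org"
    #port: 3128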

  # CORS configuration for the GROBID web API service
  corsAllowedOrigins: "*"
  corsAllowedMethods: "OPTIONS,GET,PUT,POST,DELETE,HEAD"
  corsAllowedHeaders: "X-Requested-With,Content-Type,Accept,Origin"

  # the actual implementation for language recognition to be used
  languageDetectorFactory: "org.grobid.core.lang.impl.CybozuLanguageDetectorFactory"

  # the actual implementation for optional sentence segmentation to be used (PragmaticSegmenter or OpenNLP)
  sentenceDetectorFactory: "org.grobid.core.lang.impl.PragmaticSentenceDetectorFactory"
  # sentenceDetectorFactory: "org.grobid.core.lang.impl.OpenNLPSentenceDetectorFactory"
  
  # maximum concurrency allowed to the GROBID server for processing parallel requests - change it according to your CPU/GPU capacities
  # for a production server running only GROBID, set the value slightly above the number of threads available on the server
  # to get the best performance and security
  concurrency: 10
  # when the pool is full, for queries waiting for the availability of a Grobid engine, this is the maximum time to wait
  # for an engine (in seconds) - normally never change it
  poolMaxWait: 1

  delft:
    # DeLFT global parameters
    # delft installation path if Deep Learning architectures are used to implement one of the sequence labeling models,
    # embeddings are usually compiled as lmdb under delft/data (this parameter is ignored if only feature-engineered CRF models are used)
    install: "../delft"
    pythonVirtualEnv:
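    # example (hypothetical path): if DeLFT is installed inside a Python virtual environment, point to it here, e.g.
    #pythonVirtualEnv: "../delft/env"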

  wapiti:
    # Wapiti global parameters
    # number of threads for training the wapiti models (0 to use all available processors)
    nbThreads: 0

  models:
    # we configure here how each sequence labeling model should be implemented
    # for feature-engineered CRF, use "wapiti" and possible training parameters are window, epsilon and nbMaxIterations
    # for Deep Learning, use "delft" and select the target DL architecture (see the DeLFT library); the training
    # parameters then depend on the selected DL architecture
    
    - name: "segmentation"
      # at this time, this must always be CRF wapiti; the input sequence size is too large for a Deep Learning implementation
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.0000001
        window: 50
        nbMaxIterations: 2000
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction
          max_sequence_length: 3000
          batch_size: 1
        training:
          # parameters used for training
          max_sequence_length: 3000
          batch_size: 10

    - name: "fulltext"
      # at this time, this must always be CRF wapiti; the input sequence size is too large for a Deep Learning implementation
      engine: "wapiti"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.0001
        window: 20
        nbMaxIterations: 1500

    - name: "header"
      #engine: "wapiti"
      engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only  
        epsilon: 0.000001
        window: 30
        nbMaxIterations: 1500
      delft:
        # deep learning parameters
        architecture: "BidLSTM_ChainCRF_FEATURES"
        #transformer: "allenai/scibert_scivocab_cased"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction
          #max_sequence_length: 510
          max_sequence_length: 3000
          batch_size: 1
        training:
          # parameters used for training
          #max_sequence_length: 510
          #batch_size: 6
          max_sequence_length: 3000
          batch_size: 9

    - name: "reference-segmenter"
      #engine: "wapiti"
      engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.00001
        window: 20
      delft:
        # deep learning parameters
        architecture: "BidLSTM_ChainCRF_FEATURES"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction (for this model, use same max_sequence_length as training)
          max_sequence_length: 3000
          batch_size: 2
        training:
          # parameters used for training
          max_sequence_length: 3000
          batch_size: 10

    - name: "name-header"
      engine: "wapiti"
      #engine: "delft"
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"

    - name: "name-citation"
      engine: "wapiti"
      #engine: "delft"
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"

    - name: "date"
      engine: "wapiti"
      #engine: "delft"
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"

    - name: "figure"
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.00001
        window: 20
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF"

    - name: "table"
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.00001
        window: 20
      delft:  
        # deep learning parameters
        architecture: "BidLSTM_CRF"

    - name: "affiliation-address"
      #engine: "wapiti"
      engine: "delft"
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"

    - name: "citation"
      #engine: "wapiti"
      engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.00001
        window: 50
        nbMaxIterations: 3000
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"
        #architecture: "BERT_CRF"
        #transformer: "michiyasunaga/LinkBERT-base"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction
          max_sequence_length: 500
          batch_size: 30
        training:
          # parameters used for training
          max_sequence_length: 500  
          batch_size: 50

    - name: "patent-citation"
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.0001
        window: 20
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"
        #architecture: "BERT_CRF"
        runtime:
          # parameters used at runtime/prediction
          max_sequence_length: 800
          batch_size: 20
        training:
          # parameters used for training
          max_sequence_length: 1000
          batch_size: 40

    - name: "funding-acknowledgement"
      engine: "wapiti"
      #engine: "delft"
      wapiti:
        # wapiti training parameters, they will be used at training time only
        epsilon: 0.00001
        window: 50
        nbMaxIterations: 2000
      delft:
        # deep learning parameters
        architecture: "BidLSTM_CRF_FEATURES"
        #architecture: "BERT_CRF"
        #transformer: "michiyasunaga/LinkBERT-base"
        useELMo: false
        runtime:
          # parameters used at runtime/prediction
          max_sequence_length: 800
          batch_size: 20
        training:
          # parameters used for training
          max_sequence_length: 500  
          batch_size: 40

    - name: "copyright"
      # at this time, only a DeLFT implementation is available;
      # use "wapiti" if the deep learning library JNI is not available, and the model will then be ignored
      #engine: "delft"
      engine: "wapiti"
      delft:
        # deep learning parameters
        architecture: "gru"
        #architecture: "bert"
        #transformer: "allenai/scibert_scivocab_cased"

    - name: "license"
      # at this time, to be active this must be DeLFT, no other implementation is available;
      # use "wapiti" if the deep learning library JNI is not available, and the model will then be ignored
      #engine: "delft"
      engine: "wapiti"
      delft:
        # deep learning parameters
        architecture: "gru"
        #architecture: "bert"
        #transformer: "allenai/scibert_scivocab_cased"

  # for **service only**: how to load the models,
  # false -> models are loaded only when needed, keeping unused models out of memory (only in the case of CRF),
  #          but this significantly slows down the service at the first call
  # true -> all the models are loaded into memory at server startup (default); this slows the start of the service
  #         and unused models will take some more memory (only in the case of CRF), but the server is immediately warm and ready
  modelPreload: true

server:
    type: custom
    applicationConnectors:
    - type: http
      port: 8070
    adminConnectors:
    - type: http
      port: 8071
    registerDefaultExceptionMappers: false
    # change the following to have all HTTP requests logged
    requestLog:
      appenders: []
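      # example (illustrative only, filenames are placeholders): a standard Dropwizard file appender
      # for the request log would look roughly like this instead of the empty list above
      #appenders:
      #  - type: file
      #    currentLogFilename: logs/access.log
      #    archivedLogFilenamePattern: logs/access-%d.log.gz
      #    archivedFileCount: 5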

# these logging settings apply to the Grobid service usage mode
logging:
  level: INFO
  loggers:
    org.apache.pdfbox.pdmodel.font.PDSimpleFont: "OFF"
    org.glassfish.jersey.internal: "OFF"
    com.squarespace.jersey2.guice.JerseyGuiceUtils: "OFF"
  appenders:
    - type: console
      threshold: INFO
      timeZone: UTC
      # uncomment to have the logs in json format
      # layout:
       # type: json