This repository has been archived by the owner on Mar 30, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
grobid_pipeline.py
136 lines (112 loc) · 4.35 KB
/
grobid_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import argparse
import logging
from configparser import ConfigParser
from typing import Dict, List, Optional
from sciencebeam_pipelines.transformers.grobid_service import (
grobid_service,
GrobidApiPaths
)
from sciencebeam_pipelines.transformers.xslt import xslt_transformer_from_file
from sciencebeam_pipelines.utils.mime_type_constants import MimeTypes
from . import Pipeline, FunctionPipelineStep, FieldNames, StepDataProps
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)
# Grobid API path constant for header processing.
# NOTE(review): not referenced by the code visible in this module, where the
# per-request default comes from get_default_grobid_action_for_fields --
# confirm whether anything else still imports it.
DEFAULT_GROBID_ACTION = GrobidApiPaths.PROCESS_HEADER_DOCUMENT
# Default value of the --grobid-xslt-path command line option.
DEFAULT_GROBID_XSLT_PATH = 'xslt/grobid-jats.xsl'
# Grobid API base URL used by GrobidPipeline.get_steps when no --grobid-url is given.
LOCAL_GROBID_API_URL = 'http://localhost:8080/api'
# Field names checked by has_only_header_fields: a non-empty request limited
# to these fields selects GrobidApiPaths.PROCESS_HEADER_DOCUMENT.
HEADER_FIELDS = {
    FieldNames.TITLE,
    FieldNames.ABSTRACT,
    FieldNames.AUTHORS,
    FieldNames.AFFILIATIONS
}
def has_only_header_fields(fields):
    """Tell whether every requested field is one of ``HEADER_FIELDS``.

    Args:
        fields: Collection of requested field names. May be ``None`` or
            empty when no explicit selection was made.

    Returns:
        bool: ``True`` when ``fields`` is non-empty and contains nothing
        outside ``HEADER_FIELDS``; ``False`` otherwise, including for
        ``None`` or an empty collection.
    """
    if not fields:
        # Previously the falsy input itself (``None``, ``[]``) leaked out as
        # the result; always answer with a real boolean instead.
        return False
    return set(fields).issubset(HEADER_FIELDS)
def get_default_grobid_action_for_fields(fields):
    """Choose the Grobid API path to use for the requested fields.

    Args:
        fields: Collection of requested field names (may be ``None`` or empty).

    Returns:
        ``GrobidApiPaths.PROCESS_HEADER_DOCUMENT`` when
        ``has_only_header_fields(fields)`` is truthy, otherwise
        ``GrobidApiPaths.PROCESS_FULL_TEXT_DOCUMENT``.
    """
    if has_only_header_fields(fields):
        return GrobidApiPaths.PROCESS_HEADER_DOCUMENT
    # No selection, or at least one non-header field was requested.
    return GrobidApiPaths.PROCESS_FULL_TEXT_DOCUMENT
def get_xslt_template_parameters(config: ConfigParser) -> Dict[str, str]:
    """Read the XSLT template parameters from the configuration.

    Args:
        config: Parsed configuration. Parameters are taken from its
            ``xslt_template_parameters`` section.

    Returns:
        Mapping of parameter name to value for every option in that section
        whose value is non-empty. An empty dict is returned when the section
        does not exist.
    """
    section_name = 'xslt_template_parameters'
    if not config.has_section(section_name):
        # The section is optional: without it there is simply nothing to pass
        # on, rather than failing with configparser.NoSectionError.
        return {}
    return {
        name: value
        for name, value in config.items(section_name)
        if value
    }
class GrobidPipeline(Pipeline):
    """Pipeline that converts PDF content to TEI XML via the Grobid service.

    Unless disabled on the command line, a second step transforms the TEI XML
    with an XSLT stylesheet and labels the result as JATS XML.
    """

    def add_arguments(
        self,
        parser: argparse.ArgumentParser,
        config: ConfigParser,
        argv: Optional[List[str]] = None
    ):
        """Register the Grobid command line options on ``parser``.

        Args:
            parser: Argument parser that receives a ``Grobid`` argument group.
            config: Accepted as part of the signature; not used here.
            argv: Accepted as part of the signature; not used here.
        """
        group = parser.add_argument_group('Grobid')
        # (option flag, keyword arguments) pairs, registered in this order.
        option_specs = [
            ('--grobid-url', dict(
                required=False,
                default=None,
                help='Base URL to the Grobid service'
            )),
            ('--grobid-action', dict(
                required=False,
                default=None,
                help='Name of the Grobid action'
                ' (by default determined depending on the requested fields)'
            )),
            ('--no-grobid-xslt', dict(
                action='store_true',
                help='Disable translation using XSLT'
            )),
            ('--grobid-xslt-path', dict(
                default=DEFAULT_GROBID_XSLT_PATH,
                help='Path to XSLT file translating results to JATS'
            )),
            ('--no-grobid-pretty-print', dict(
                action='store_true',
                help='Disable pretty print of XSLT output'
            )),
        ]
        for flag, options in option_specs:
            group.add_argument(flag, **options)

    def get_steps(self, config: ConfigParser, args: argparse.Namespace) -> list:
        """Build the pipeline steps for the parsed command line arguments.

        Args:
            config: Configuration handed to ``get_xslt_template_parameters``
                when the XSLT step is enabled.
            args: Parsed arguments carrying the options registered by
                ``add_arguments``.

        Returns:
            A list with the "Convert to TEI" step, followed by the
            "TEI to JATS" step unless ``--no-grobid-xslt`` was given.
        """
        configured_url = args.grobid_url
        # Without an explicit URL, fall back to the local API URL and pass
        # start_service=True to the Grobid service wrapper.
        start_local_service = not configured_url
        service_url = configured_url or LOCAL_GROBID_API_URL
        request_grobid = grobid_service(
            service_url, args.grobid_action, start_service=start_local_service
        )

        def convert_to_tei(pdf_filename, pdf_content, includes):
            # An explicitly configured action wins; otherwise the action is
            # derived from the fields requested for this document.
            grobid_path = (
                args.grobid_action
                or get_default_grobid_action_for_fields(includes)
            )
            service_result = request_grobid(  # pylint: disable=redundant-keyword-arg
                (pdf_filename, pdf_content),
                path=grobid_path
            )
            # Only the second item of the service result is used as content.
            return service_result[1]

        def pdf_to_tei_step(data, **_):
            return {
                StepDataProps.CONTENT: convert_to_tei(
                    pdf_filename=data[StepDataProps.FILENAME],
                    pdf_content=data[StepDataProps.CONTENT],
                    includes=data.get(StepDataProps.INCLUDES)
                ),
                StepDataProps.TYPE: MimeTypes.TEI_XML
            }

        steps = [
            FunctionPipelineStep(pdf_to_tei_step, {MimeTypes.PDF}, 'Convert to TEI')
        ]
        if args.no_grobid_xslt:
            return steps

        xslt_transformer = xslt_transformer_from_file(
            args.grobid_xslt_path,
            pretty_print=not args.no_grobid_pretty_print
        )
        template_parameters = get_xslt_template_parameters(config)
        LOGGER.info(
            'grobid_xslt_path=%r (xslt_template_parameters=%r)',
            args.grobid_xslt_path, template_parameters
        )

        def tei_to_jats_step(d, **_):
            return {
                StepDataProps.CONTENT: xslt_transformer(
                    d[StepDataProps.CONTENT],
                    xslt_template_parameters=template_parameters
                ),
                StepDataProps.TYPE: MimeTypes.JATS_XML
            }

        steps.append(
            FunctionPipelineStep(tei_to_jats_step, {MimeTypes.TEI_XML}, 'TEI to JATS')
        )
        return steps
# Module-level GrobidPipeline instance, created when this module is imported.
# NOTE(review): presumably looked up by name by the pipeline loader -- confirm against the caller.
PIPELINE = GrobidPipeline()