|
1 | | -import json |
2 | 1 | import glob |
3 | 2 | import re |
4 | 3 | import os |
|
14 | 13 | from ocrd_models.ocrd_page import parse |
15 | 14 | from ocrd_utils import bbox_from_points |
16 | 15 |
|
17 | | -from .ned import ned |
18 | | -from .ner import ner |
19 | | -from .tsv import read_tsv, write_tsv, extract_doc_links |
| 16 | +from qurator.utils.tsv import read_tsv, write_tsv, extract_doc_links |
20 | 17 | from .ocr import get_conf_color |
21 | 18 |
|
22 | | - |
23 | 19 | @click.command() |
24 | 20 | @click.argument('tsv-file', type=click.Path(exists=True), required=True, nargs=1) |
25 | 21 | @click.argument('url-file', type=click.Path(exists=False), required=True, nargs=1) |
@@ -218,59 +214,6 @@ def tsv2page(output_filename, keep_words, page_file, tsv_file): |
218 | 214 | f.write(ET.tostring(tree, pretty_print=True).decode('utf-8')) |
219 | 215 |
|
220 | 216 |
|
221 | | -@click.command() |
222 | | -@click.argument('tsv-file', type=click.Path(exists=True), required=True, nargs=1) |
223 | | -@click.argument('tsv-out-file', type=click.Path(), required=True, nargs=1) |
224 | | -@click.option('--ner-rest-endpoint', type=str, default=None, |
225 | | - help="REST endpoint of sbb_ner service. See https://github.com/qurator-spk/sbb_ner for details.") |
226 | | -@click.option('--ned-rest-endpoint', type=str, default=None, |
227 | | - help="REST endpoint of sbb_ned service. See https://github.com/qurator-spk/sbb_ned for details.") |
228 | | -@click.option('--ned-json-file', type=str, default=None) |
229 | | -@click.option('--noproxy', type=bool, is_flag=True, help='disable proxy. default: proxy is enabled.') |
230 | | -@click.option('--ned-threshold', type=float, default=None) |
231 | | -@click.option('--ned-priority', type=int, default=1) |
232 | | -def find_entities(tsv_file, tsv_out_file, ner_rest_endpoint, ned_rest_endpoint, ned_json_file, noproxy, ned_threshold, |
233 | | - ned_priority): |
234 | | - |
235 | | - if noproxy: |
236 | | - os.environ['no_proxy'] = '*' |
237 | | - |
238 | | - tsv, urls = read_tsv(tsv_file) |
239 | | - |
240 | | - try: |
241 | | - if ner_rest_endpoint is not None: |
242 | | - |
243 | | - tsv, ner_result = ner(tsv, ner_rest_endpoint) |
244 | | - |
245 | | - elif os.path.exists(tsv_file): |
246 | | - |
247 | | - print('Using NER information that is already contained in file: {}'.format(tsv_file)) |
248 | | - |
249 | | - tmp = tsv.copy() |
250 | | - tmp['sen'] = (tmp['No.'] == 0).cumsum() |
251 | | - tmp.loc[~tmp['NE-TAG'].isin(['O', 'B-PER', 'B-LOC', 'B-ORG', 'I-PER', 'I-LOC', 'I-ORG']), 'NE-TAG'] = 'O' |
252 | | - |
253 | | - ner_result = [[{'word': str(row.TOKEN), 'prediction': row['NE-TAG']} for _, row in sen.iterrows()] |
254 | | - for _, sen in tmp.groupby('sen')] |
255 | | - else: |
256 | | - raise RuntimeError("Either NER rest endpoint or NER-TAG information within tsv_file required.") |
257 | | - |
258 | | - if ned_rest_endpoint is not None: |
259 | | - |
260 | | - tsv, ned_result = ned(tsv, ner_result, ned_rest_endpoint, json_file=ned_json_file, threshold=ned_threshold, |
261 | | - priority=ned_priority) |
262 | | - |
263 | | - if ned_json_file is not None and not os.path.exists(ned_json_file): |
264 | | - |
265 | | - with open(ned_json_file, "w") as fp_json: |
266 | | - json.dump(ned_result, fp_json, indent=2, separators=(',', ': ')) |
267 | | - |
268 | | - write_tsv(tsv, urls, tsv_out_file) |
269 | | - |
270 | | - except requests.HTTPError as e: |
271 | | - print(e) |
272 | | - |
273 | | - |
274 | 217 | @click.command() |
275 | 218 | @click.option('--xls-file', type=click.Path(exists=True), default=None, |
276 | 219 | help="Read parameters from xls-file. Expected columns: Filename, iiif_url, scale_factor.") |
|
0 commit comments