Skip to content

Commit 60a07c6

Browse files
committed
drop support for scaling, not necessary for SBB use case anymore
1 parent fe4a1ea commit 60a07c6

File tree

5 files changed

+31
-19
lines changed

5 files changed

+31
-19
lines changed

.gitmodules

Whitespace-only changes.

Makefile

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
deps:
2+
pip install -r requirements.txt
3+
4+
deps-test:
5+
pip install -r requirements-test.txt
6+
7+
test:
8+
pytest tests
9+
10+
install:
11+
pip install .
12+
13+
install-dev:
14+
pip install -e .
15+
16+
.PHONY: test
17+

requirements-test.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
pytest

tsvtools/ocrd-tool.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
"parameters": {
1313
"iiif_url_template": {
1414
"type": "string",
15-
"description": "URL template for lookup of images via IIIF based on {{ unique_identifier }}, {{ page_id }}, {{ page_no }} and {{ image_width }}, or {{ PPN }}. 'left', 'top', 'right', 'bottom', 'width', and 'height' are replaced by the neat JS.",
16-
"default": "https://content.staatsbibliothek-berlin.de/dc/{{ PPN }}-{{ page_no }}/left,top,width,height/{{ image_width }}/0/default.jpg"
15+
"description": "URL template for lookup of images via IIIF based on {{ unique_identifier }}, {{ page_id }}, {{ page_no }} and {{ PPN }}. 'left', 'top', 'right', 'bottom', 'width', and 'height' are replaced by the neat JS.",
16+
"default": "https://content.staatsbibliothek-berlin.de/dc/{{ PPN }}-{{ page_no }}/left,top,width,height/full/0/default.jpg"
1717
},
1818
"scale_filegrp": {
1919
"type": "string",

tsvtools/ocrd_processors.py

Lines changed: 11 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77
from PIL import Image
88

99
from ocrd import Processor
10-
from ocrd_models import OcrdExif
1110
from ocrd_utils import getLogger, make_file_id, assert_file_grp_cardinality, MIMETYPE_PAGE
11+
from ocrd_models import OcrdExif
1212
from ocrd_models.constants import NAMESPACES as NS
1313
from ocrd_models.ocrd_page import TextEquivType, to_xml
1414
from ocrd_modelfactory import page_from_file
@@ -32,35 +32,29 @@ def process(self):
3232
assert_file_grp_cardinality(self.input_file_grp, 1)
3333
assert_file_grp_cardinality(self.output_file_grp, 1)
3434
iiif_url_template = self.parameter['iiif_url_template']
35-
scale_filegrp = self.parameter['scale_filegrp']
3635
noproxy = self.parameter['noproxy']
36+
37+
ppn_found = self.workspace.mets._tree.find('//mods:recordIdentifier[@source="gbv-ppn"]', NS)
38+
print(ppn_found)
39+
if ppn_found is not None:
40+
ppn = ppn_found.text
41+
else:
42+
ppn = ''
3743
for n, input_file in enumerate(self.input_files):
3844
page_id = input_file.pageId or input_file.ID
3945
log.info('Processing: %d / %s of %d', n, page_id, len(list(self.input_files)))
4046
file_id = make_file_id(input_file, self.output_file_grp)
4147
pcgts = page_from_file(self.workspace.download_file(input_file))
4248
page = pcgts.get_Page()
43-
scale_factor = 1.0
44-
iiif_width = f',{page.imageHeight}'
45-
ppn = self.workspace.mets.unique_identifier
46-
el_recordIdentifier = self.workspace.mets._tree.getroot().find(".//mods:recordIdentifier[@source='gbv-ppn']", NS)
47-
if el_recordIdentifier is not None:
48-
ppn = el_recordIdentifier.text
49-
if scale_filegrp:
50-
scaled_img_ocrd_file = self.workspace.download_file(next(
51-
self.workspace.mets.find_files(fileGrp=scale_filegrp, pageId=page_id)))
52-
scaled_img_pil = Image.open(scaled_img_ocrd_file.local_filename)
53-
scale_factor = scaled_img_pil.width / page.imageWidth
54-
iiif_width = 'full'
49+
5550
iiif_url = iiif_url_template\
5651
.replace('{{ unique_identifier }}', self.workspace.mets.unique_identifier)\
5752
.replace('{{ PPN }}', ppn)\
5853
.replace('{{ page_id }}', page_id)\
59-
.replace('{{ page_no }}', re_sub('[^0-9]', '', page_id))\
60-
.replace('{{ image_width }}', str(iiif_width))
54+
.replace('{{ page_no }}', re_sub('[^0-9]', '', page_id))
6155
Path(self.output_file_grp).mkdir(exist_ok=True)
6256
tsv_filepath = Path(self.output_file_grp, file_id + '.tsv')
63-
page2tsv(input_file.local_filename, tsv_filepath, 'OCR', iiif_url, None, None, noproxy, scale_factor, None, None, None, 1)
57+
page2tsv(input_file.local_filename, tsv_filepath, 'OCR', iiif_url, None, None, noproxy, 1.0, None, None, None, 1)
6458

6559
self.workspace.add_file(
6660
ID=file_id,

0 commit comments

Comments
 (0)