Skip to content

Commit 81ba7cf

Browse files
committed
tests
1 parent 60a07c6 commit 81ba7cf

24 files changed

+3684
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,4 @@
11
*.egg-info
22
__pycache__
3+
tests/**/*.jpg
4+
tests/**/*.png

tests/test_imexport.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
from pathlib import Path
2+
from shutil import copytree
3+
from pytest import fixture
4+
5+
from ocrd_utils import pushd_popd
6+
from ocrd_models.ocrd_page import parse
7+
from ocrd import Resolver
8+
9+
from tsvtools.ocrd_processors import OcrdNeatExportProcessor, OcrdNeatImportProcessor
10+
11+
@fixture
def testws(tmpdir):
    """Provide a disposable copy of the bundled ``testws`` workspace.

    The ``testws`` directory is copied to ``<tmpdir>/ws`` so that a test can
    write into the workspace without touching the checked-in data, and the
    copy's ``mets.xml`` is then opened as a workspace.

    Args:
        tmpdir: pytest's per-test temporary directory.

    Returns:
        Whatever ``Resolver().workspace_from_url`` returns for the copied
        ``mets.xml``.
    """
    # Anchor the source on this module's location rather than on the current
    # working directory, so the fixture does not depend on where pytest was
    # started. Assumes this module lives in ``tests/`` next to ``testws``,
    # which is what the original relative path 'tests/testws' implies.
    source_dir = Path(__file__).resolve().parent / 'testws'
    copytree(source_dir, f'{tmpdir}/ws')
    return Resolver().workspace_from_url(f'{tmpdir}/ws/mets.xml')
15+
16+
def test_imexport(testws):
    """Round-trip a page through TSV export, a manual fix, and re-import.

    Steps:
      1. Run the export processor on file group ``TESS`` into ``OUT`` and
         check the TSV written for ``FILE_0005``.
      2. Replace ``Stantenbund`` with ``Staatenbund`` in that TSV.
      3. Run the import processor on ``TESS,OUT`` into ``TESS-CORRECTED``.
      4. Check that the corrected PAGE file carries the fix while the
         original PAGE file still has the old text.

    Args:
        testws: workspace supplied by the ``testws`` fixture.
    """

    def second_line_text(pcgts):
        # Text of the first TextEquiv of the second line in the first region.
        first_region = pcgts.get_Page().get_TextRegion()[0]
        return first_region.get_TextLine()[1].get_TextEquiv()[0].Unicode

    ws_root = testws.directory

    # --- export PAGE -> TSV -------------------------------------------------
    export_step = OcrdNeatExportProcessor(workspace=testws, input_file_grp='TESS', output_file_grp='OUT')
    export_step.process()

    tsv_path = Path(ws_root) / 'OUT/FILE_0005_OUT.tsv'
    assert tsv_path.exists()

    tsv_text = tsv_path.read_text()
    # NOTE(review): the export is a .tsv file; confirm that the separators
    # inside this literal match what the exporter writes (tabs vs. spaces).
    assert 'Ein Welt-Stantenbund 0 174 1116 169 280 region0000_line0001' in tsv_text
    assert tsv_text.splitlines()[1] == '# https://content.staatsbibliothek-berlin.de/dc/PPN680203753-0005/left,top,width,height/full/0/default.jpg'

    # --- simulate a manual correction in the TSV ----------------------------
    tsv_path.write_text(tsv_text.replace('Stantenbund', 'Staatenbund'))

    # --- import TSV -> PAGE -------------------------------------------------
    import_step = OcrdNeatImportProcessor(workspace=testws, input_file_grp='TESS,OUT', output_file_grp='TESS-CORRECTED')
    import_step.process()

    page_before = Path(ws_root) / 'TESS/FILE_0005_TESS.xml'
    page_after = Path(ws_root) / 'TESS-CORRECTED/FILE_0005_TESS-CORRECTED.xml'

    assert page_before.exists()
    assert page_after.exists()

    pcgts_before = parse(page_before)
    pcgts_after = parse(page_after)

    text_before = second_line_text(pcgts_before)
    text_after = second_line_text(pcgts_after)

    # The misspelling must survive only in the untouched input ...
    assert 'Stantenbund' in text_before
    assert 'Stantenbund' not in text_after

    # ... and the correction must appear only in the imported output.
    assert 'Staatenbund' not in text_before
    assert 'Staatenbund' in text_after
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<pc:PcGts xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd" pcGtsId="FILE_0001_TESS">
3+
<pc:Metadata>
4+
<pc:Creator>OCR-D/core 2.34.0</pc:Creator>
5+
<pc:Created>2022-05-30T16:41:01.833765</pc:Created>
6+
<pc:LastChange>2022-05-30T16:41:01.833765</pc:LastChange>
7+
<pc:MetadataItem type="processingStep" name="layout/segmentation/region" value="ocrd-tesserocr-recognize">
8+
<pc:Labels externalModel="ocrd-tool" externalId="parameters">
9+
<pc:Label value="False" type="find_tables"/>
10+
<pc:Label value="line" type="textequiv_level"/>
11+
<pc:Label value="region" type="segmentation_level"/>
12+
<pc:Label value="deu" type="model"/>
13+
<pc:Label value="0" type="dpi"/>
14+
<pc:Label value="0" type="padding"/>
15+
<pc:Label value="False" type="overwrite_segments"/>
16+
<pc:Label value="True" type="overwrite_text"/>
17+
<pc:Label value="False" type="shrink_polygons"/>
18+
<pc:Label value="False" type="block_polygons"/>
19+
<pc:Label value="False" type="find_staves"/>
20+
<pc:Label value="False" type="sparse_text"/>
21+
<pc:Label value="False" type="raw_lines"/>
22+
<pc:Label value="" type="char_whitelist"/>
23+
<pc:Label value="" type="char_blacklist"/>
24+
<pc:Label value="" type="char_unblacklist"/>
25+
<pc:Label value="{}" type="tesseract_parameters"/>
26+
<pc:Label value="{}" type="xpath_parameters"/>
27+
<pc:Label value="{}" type="xpath_model"/>
28+
<pc:Label value="False" type="auto_model"/>
29+
<pc:Label value="DEFAULT" type="oem"/>
30+
</pc:Labels>
31+
<pc:Labels externalModel="ocrd-tool" externalId="version">
32+
<pc:Label value="0.13.6 (tesseract 5.1.0)" type="ocrd-tesserocr-recognize"/>
33+
<pc:Label value="2.34.0" type="ocrd/core"/>
34+
</pc:Labels>
35+
</pc:MetadataItem>
36+
</pc:Metadata>
37+
<pc:Page imageFilename="DEFAULT/FILE_0001_DEFAULT.jpg" imageWidth="1485" imageHeight="2124">
38+
<pc:AlternativeImage filename="TESS/FILE_0001_TESS.IMG-BIN.png" comments=",binarized,clipped"/>
39+
<pc:ReadingOrder>
40+
<pc:OrderedGroup id="reading-order">
41+
<pc:RegionRefIndexed index="0" regionRef="region0000"/>
42+
</pc:OrderedGroup>
43+
</pc:ReadingOrder>
44+
<pc:ImageRegion id="region0000" orientation="0.">
45+
<pc:Coords points="0,0 1485,0 1485,2124 0,2124"/>
46+
</pc:ImageRegion>
47+
</pc:Page>
48+
</pc:PcGts>
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<pc:PcGts xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd" pcGtsId="FILE_0002_TESS">
3+
<pc:Metadata>
4+
<pc:Creator>OCR-D/core 2.34.0</pc:Creator>
5+
<pc:Created>2022-05-30T16:41:03.031763</pc:Created>
6+
<pc:LastChange>2022-05-30T16:41:03.031763</pc:LastChange>
7+
<pc:MetadataItem type="processingStep" name="layout/segmentation/region" value="ocrd-tesserocr-recognize">
8+
<pc:Labels externalModel="ocrd-tool" externalId="parameters">
9+
<pc:Label value="False" type="find_tables"/>
10+
<pc:Label value="line" type="textequiv_level"/>
11+
<pc:Label value="region" type="segmentation_level"/>
12+
<pc:Label value="deu" type="model"/>
13+
<pc:Label value="0" type="dpi"/>
14+
<pc:Label value="0" type="padding"/>
15+
<pc:Label value="False" type="overwrite_segments"/>
16+
<pc:Label value="True" type="overwrite_text"/>
17+
<pc:Label value="False" type="shrink_polygons"/>
18+
<pc:Label value="False" type="block_polygons"/>
19+
<pc:Label value="False" type="find_staves"/>
20+
<pc:Label value="False" type="sparse_text"/>
21+
<pc:Label value="False" type="raw_lines"/>
22+
<pc:Label value="" type="char_whitelist"/>
23+
<pc:Label value="" type="char_blacklist"/>
24+
<pc:Label value="" type="char_unblacklist"/>
25+
<pc:Label value="{}" type="tesseract_parameters"/>
26+
<pc:Label value="{}" type="xpath_parameters"/>
27+
<pc:Label value="{}" type="xpath_model"/>
28+
<pc:Label value="False" type="auto_model"/>
29+
<pc:Label value="DEFAULT" type="oem"/>
30+
</pc:Labels>
31+
<pc:Labels externalModel="ocrd-tool" externalId="version">
32+
<pc:Label value="0.13.6 (tesseract 5.1.0)" type="ocrd-tesserocr-recognize"/>
33+
<pc:Label value="2.34.0" type="ocrd/core"/>
34+
</pc:Labels>
35+
</pc:MetadataItem>
36+
</pc:Metadata>
37+
<pc:Page imageFilename="DEFAULT/FILE_0002_DEFAULT.jpg" imageWidth="1462" imageHeight="2080">
38+
<pc:AlternativeImage filename="TESS/FILE_0002_TESS.IMG-BIN.png" comments=",binarized,clipped"/>
39+
<pc:ReadingOrder>
40+
<pc:OrderedGroup id="reading-order">
41+
<pc:RegionRefIndexed index="0" regionRef="region0000"/>
42+
</pc:OrderedGroup>
43+
</pc:ReadingOrder>
44+
<pc:ImageRegion id="region0000" orientation="0.">
45+
<pc:Coords points="0,0 1462,0 1462,2080 0,2080"/>
46+
</pc:ImageRegion>
47+
</pc:Page>
48+
</pc:PcGts>
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<pc:PcGts xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd" pcGtsId="FILE_0003_TESS">
3+
<pc:Metadata>
4+
<pc:Creator>OCR-D/core 2.34.0</pc:Creator>
5+
<pc:Created>2022-05-30T16:41:04.040750</pc:Created>
6+
<pc:LastChange>2022-05-30T16:41:04.040750</pc:LastChange>
7+
<pc:MetadataItem type="processingStep" name="layout/segmentation/region" value="ocrd-tesserocr-recognize">
8+
<pc:Labels externalModel="ocrd-tool" externalId="parameters">
9+
<pc:Label value="False" type="find_tables"/>
10+
<pc:Label value="line" type="textequiv_level"/>
11+
<pc:Label value="region" type="segmentation_level"/>
12+
<pc:Label value="deu" type="model"/>
13+
<pc:Label value="0" type="dpi"/>
14+
<pc:Label value="0" type="padding"/>
15+
<pc:Label value="False" type="overwrite_segments"/>
16+
<pc:Label value="True" type="overwrite_text"/>
17+
<pc:Label value="False" type="shrink_polygons"/>
18+
<pc:Label value="False" type="block_polygons"/>
19+
<pc:Label value="False" type="find_staves"/>
20+
<pc:Label value="False" type="sparse_text"/>
21+
<pc:Label value="False" type="raw_lines"/>
22+
<pc:Label value="" type="char_whitelist"/>
23+
<pc:Label value="" type="char_blacklist"/>
24+
<pc:Label value="" type="char_unblacklist"/>
25+
<pc:Label value="{}" type="tesseract_parameters"/>
26+
<pc:Label value="{}" type="xpath_parameters"/>
27+
<pc:Label value="{}" type="xpath_model"/>
28+
<pc:Label value="False" type="auto_model"/>
29+
<pc:Label value="DEFAULT" type="oem"/>
30+
</pc:Labels>
31+
<pc:Labels externalModel="ocrd-tool" externalId="version">
32+
<pc:Label value="0.13.6 (tesseract 5.1.0)" type="ocrd-tesserocr-recognize"/>
33+
<pc:Label value="2.34.0" type="ocrd/core"/>
34+
</pc:Labels>
35+
</pc:MetadataItem>
36+
</pc:Metadata>
37+
<pc:Page imageFilename="DEFAULT/FILE_0003_DEFAULT.jpg" imageWidth="1461" imageHeight="2124">
38+
<pc:AlternativeImage filename="TESS/FILE_0003_TESS.IMG-BIN.png" comments=",binarized,clipped"/>
39+
<pc:ReadingOrder>
40+
<pc:OrderedGroup id="reading-order">
41+
<pc:RegionRefIndexed index="0" regionRef="region0000"/>
42+
</pc:OrderedGroup>
43+
</pc:ReadingOrder>
44+
<pc:ImageRegion id="region0000" orientation="0.">
45+
<pc:Coords points="0,0 1461,0 1461,2124 0,2124"/>
46+
</pc:ImageRegion>
47+
</pc:Page>
48+
</pc:PcGts>
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<pc:PcGts xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd" pcGtsId="FILE_0004_TESS">
3+
<pc:Metadata>
4+
<pc:Creator>OCR-D/core 2.34.0</pc:Creator>
5+
<pc:Created>2022-05-30T16:41:05.028576</pc:Created>
6+
<pc:LastChange>2022-05-30T16:41:05.028576</pc:LastChange>
7+
<pc:MetadataItem type="processingStep" name="layout/segmentation/region" value="ocrd-tesserocr-recognize">
8+
<pc:Labels externalModel="ocrd-tool" externalId="parameters">
9+
<pc:Label value="False" type="find_tables"/>
10+
<pc:Label value="line" type="textequiv_level"/>
11+
<pc:Label value="region" type="segmentation_level"/>
12+
<pc:Label value="deu" type="model"/>
13+
<pc:Label value="0" type="dpi"/>
14+
<pc:Label value="0" type="padding"/>
15+
<pc:Label value="False" type="overwrite_segments"/>
16+
<pc:Label value="True" type="overwrite_text"/>
17+
<pc:Label value="False" type="shrink_polygons"/>
18+
<pc:Label value="False" type="block_polygons"/>
19+
<pc:Label value="False" type="find_staves"/>
20+
<pc:Label value="False" type="sparse_text"/>
21+
<pc:Label value="False" type="raw_lines"/>
22+
<pc:Label value="" type="char_whitelist"/>
23+
<pc:Label value="" type="char_blacklist"/>
24+
<pc:Label value="" type="char_unblacklist"/>
25+
<pc:Label value="{}" type="tesseract_parameters"/>
26+
<pc:Label value="{}" type="xpath_parameters"/>
27+
<pc:Label value="{}" type="xpath_model"/>
28+
<pc:Label value="False" type="auto_model"/>
29+
<pc:Label value="DEFAULT" type="oem"/>
30+
</pc:Labels>
31+
<pc:Labels externalModel="ocrd-tool" externalId="version">
32+
<pc:Label value="0.13.6 (tesseract 5.1.0)" type="ocrd-tesserocr-recognize"/>
33+
<pc:Label value="2.34.0" type="ocrd/core"/>
34+
</pc:Labels>
35+
</pc:MetadataItem>
36+
</pc:Metadata>
37+
<pc:Page imageFilename="DEFAULT/FILE_0004_DEFAULT.jpg" imageWidth="1323" imageHeight="1959">
38+
<pc:AlternativeImage filename="TESS/FILE_0004_TESS.IMG-BIN.png" comments=",binarized,clipped"/>
39+
<pc:ReadingOrder>
40+
<pc:OrderedGroup id="reading-order">
41+
<pc:RegionRefIndexed index="0" regionRef="region0000"/>
42+
</pc:OrderedGroup>
43+
</pc:ReadingOrder>
44+
<pc:ImageRegion id="region0000" orientation="0.">
45+
<pc:Coords points="0,0 1320,0 1320,1959 0,1959"/>
46+
</pc:ImageRegion>
47+
</pc:Page>
48+
</pc:PcGts>

0 commit comments

Comments
 (0)