Skip to content
Snippets Groups Projects
Commit dcaddfe3 authored by Erwan Rouchet's avatar Erwan Rouchet Committed by Bastien Abadie
Browse files

Transkribus PAGE imports

parent 4f7d30f1
No related branches found
No related tags found
No related merge requests found
Showing
with 384 additions and 279 deletions
from django.conf import settings
from django.shortcuts import get_object_or_404
from rest_framework import status
from rest_framework.mixins import CreateModelMixin
from rest_framework.generics import CreateAPIView, UpdateAPIView
from rest_framework.exceptions import ValidationError
from rest_framework.response import Response
from arkindex.documents.models import Classification, DataSource, Transcription, TranscriptionType
from rest_framework.views import APIView
from arkindex.documents.models import \
Classification, DataSource, Transcription, TranscriptionType, Page, Corpus
from arkindex.documents.serializers.ml import \
ClassificationsSerializer, TranscriptionsSerializer, TranscriptionCreateSerializer
from arkindex.documents.indexer import Indexer
from arkindex.documents.pagexml import PageXmlParser
from arkindex.images.models import Zone
from arkindex.images.importer import build_transcriptions, save_transcriptions, index_transcriptions
from arkindex.project.parsers import XMLParser
from arkindex.project.permissions import IsVerified
from arkindex.project.polygon import Polygon
......@@ -160,3 +166,33 @@ class ClassificationBulk(CreateAPIView):
)
for cl in serializer.validated_data['classifications']
])
class PageXmlTranscriptionsImport(CreateModelMixin, APIView):
    """
    Import transcriptions onto a Page from a PAGE XML document
    posted as the request body.
    """
    parser_classes = (XMLParser, )
    permission_classes = (IsVerified, )

    def get_queryset(self):
        """Restrict pages to corpora the user has write access to."""
        writable_corpora = Corpus.objects.writable(self.request.user)
        return Page.objects.filter(corpus__in=writable_corpora)

    def get_object(self):
        """
        Since we are inheriting from APIView, because GenericAPIView would break OpenAPI,
        we have to rewrite get_object ourselves.
        """
        instance = get_object_or_404(self.get_queryset(), pk=self.kwargs['pk'])
        self.check_object_permissions(self.request, instance)
        return instance

    def post(self, request, *args, **kwargs):
        """Parse the posted PAGE XML and save its transcriptions on the page."""
        target_page = self.get_object()
        try:
            pagexml = PageXmlParser(request.data)
        except AssertionError as err:
            raise ValidationError('Could not parse PAGE XML document: {!s}'.format(err))
        pagexml.save_transcriptions(target_page)
        return Response(
            status=status.HTTP_201_CREATED,
            headers=self.get_success_headers(None),
        )
from django.utils.functional import cached_property
from arkindex_common.ml_tool import MLToolType
from arkindex_common.pagexml import PageXmlPage
from arkindex.project.polygon import Polygon
from arkindex.documents.models import DataSource, TranscriptionType, ElementType, Page
import logging
logger = logging.getLogger(__name__)
class PageXmlParser(object):
    """
    Parses a PAGE XML document (e.g. a Transkribus export) and imports its
    text regions into Arkindex as surfaces and transcriptions.
    """

    def __init__(self, path_or_xml):
        # PageXmlPage handles either a file path or already-loaded XML
        self.pagexml_page = PageXmlPage(path_or_xml)

    @cached_property
    def source(self):
        """
        DataSource that imported transcriptions are attached to.
        The revision matches the PAGE XML schema version used by Transkribus.
        """
        ds, _ = DataSource.objects.get_or_create(
            type=MLToolType.Recognizer,
            slug='transkribus',
            revision='2013-07-15',
            internal=False,
        )
        return ds

    def get_zone(self, region, page):
        """
        Fetch or create a Zone on the page's image matching the region polygon.
        """
        poly = Polygon(region.points)
        # filter().first() instead of get_or_create to tolerate duplicate zones
        z = page.zone.image.zones.filter(polygon=poly).first()
        if not z:
            z = page.zone.image.zones.create(polygon=poly)
        return z

    def create_surface(self, region, page):
        """
        Create a Surface element from a text region, attached as a child of `page`.
        Returns (surface, created); (None, False) when the region has no points.
        """
        if region.points is None:
            logger.warning('No points in region {}'.format(region.id))
            return None, False
        surface, created = self.get_zone(region, page).elements.get_or_create(
            type=ElementType.Surface,
            corpus=page.corpus,
            defaults={
                'name': 'Surface {}'.format(region.id),
            },
        )
        surface.add_parent(page)
        return surface, created

    def save_surfaces(self, page):
        """
        Create surfaces for every text region of the page, in reading order.
        Returns (parsed region count, created surface count).
        """
        regions = self.pagexml_page.page.text_regions
        # Guard against text_regions being None, consistently with
        # save_transcriptions; `not len(None)` would raise a TypeError.
        if regions is None or not len(regions):
            logger.warning('No surfaces to save')
            return 0, 0
        region_count, created_count = 0, 0
        for region in self.pagexml_page.page.sort_regions(regions):
            region_count += 1
            _, created = self.create_surface(region, page)
            created_count += created
        return region_count, created_count

    def create_transcription(self, region, page, type=TranscriptionType.Line):
        """
        Create a transcription of the given type from a text region or line.
        Returns (transcription, created); (None, False) when the region is
        missing its text or points.
        """
        if region.text is None:
            logger.warning('No text in region {}'.format(region.id))
            return None, False
        if region.points is None:
            logger.warning('No points in region {}'.format(region.id))
            return None, False
        return self.get_zone(region, page).transcriptions.get_or_create(
            type=type,
            element=page,
            text=region.text,
            source=self.source,
        )

    def save_transcriptions(self, page):
        """
        Import every text region as a Paragraph transcription and each of its
        lines as a Line transcription on the given Arkindex page.
        """
        assert isinstance(page, Page), 'Page should be an Arkindex page'
        if self.pagexml_page.page.text_regions is None or not len(self.pagexml_page.page.text_regions):
            logger.warning('No transcriptions to save')
            return
        region_count, line_count, region_ts_count, line_ts_count = 0, 0, 0, 0
        for region in self.pagexml_page.page.text_regions:
            region_count += 1
            _, created = self.create_transcription(region, page, type=TranscriptionType.Paragraph)
            region_ts_count += created
            for line in region.lines:
                line_count += 1
                _, created = self.create_transcription(line, page)
                line_ts_count += created
        logger.info('Parsed {} regions and {} lines and created {} paragraph and {} line transcriptions'.format(
            region_count,
            line_count,
            region_ts_count,
            line_ts_count,
        ))
from arkindex.project.polygon import Polygon
from arkindex.documents.models import Corpus, ElementType, Element, Page
from arkindex.documents.pagexml import PageXmlParser
from arkindex.documents.importer import parse_folio
from arkindex.images.models import Zone
from pathlib import Path
import xml.etree.ElementTree as ElementTree
import re
import logging
logger = logging.getLogger(__name__)
# PAGE XML namespace (2013-07-15 schema, as produced by Transkribus exports)
NS_PAGE = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
# Namespace map for ElementTree find()/findall() lookups
NS = {'page': NS_PAGE}
# Clark-notation prefix ('{namespace}') to strip from qualified tag names
NS_DELETE = "{%s}" % NS_PAGE
# e.g. 'JJ42B' -> groups ('42', 'B')
REGEX_VOLUME_NAME = re.compile(r'(?:JJ)?([0-9]+)([A-Z]?)')
# e.g. '12 bis' -> groups ('12', 'bis')
REGEX_ACT_NUMBER = re.compile(r'([0-9]+)\s*([a-z]*)')
# e.g. 'fol. 3 bis r' -> groups ('3', 'bis', 'r')
REGEX_FOLIO = re.compile(r'(?:fol\.?\s*)?([0-9]+)\s*(bis)?\s*(r|v)?', re.IGNORECASE)
# Matches runs of characters between '-', '_' or 'à' separators (folio ranges)
REGEX_FOLIO_GROUP = re.compile(r'([^-_à]+)', re.IGNORECASE)
class SurfaceParser(object):
    """Extracts surface polygons and document metadata from a PAGE XML file."""

    def __init__(self, path):
        self.path = str(path)
        self.tree = ElementTree.parse(self.path)
        # A Page element is mandatory in PAGE XML
        self.page = self.tree.find('page:Page', NS)
        assert self.page is not None, 'Missing page'

    def get_metadata(self):
        """Return a dict merging <Metadata> children with <Page> attributes."""
        metadata = self.tree.find('page:Metadata', NS)
        assert metadata is not None, 'Missing metadata'
        # Strip the namespace prefix and lowercase each child tag name
        result = {}
        for child in metadata:
            result[child.tag.replace(NS_DELETE, "").lower()] = child.text
        # Page attributes (image filename, dimensions) are merged on top
        result.update(self.page.attrib)
        return result

    def list_surfaces(self):
        """
        Return the page's text regions as {'index', 'id', 'polygon'} dicts,
        sorted by reading-order index (document order when no ReadingOrder).
        """
        indexes = None
        reading_order = self.page.find('page:ReadingOrder', NS)
        if reading_order is not None:
            group = reading_order.find('page:OrderedGroup', NS)
            assert group is not None, 'Missing ordered group'
            # Map each region ID to its declared reading-order index
            indexes = {
                ref.get('regionRef'): int(ref.get('index'))
                for ref in group.findall('page:RegionRefIndexed', NS)
            }
        surfaces = []
        for position, region in enumerate(self.page.findall('page:TextRegion', NS)):
            coords = region.find('page:Coords', NS)
            assert coords is not None, 'Missing coords'
            points = Polygon([
                (int(x), int(y))
                for x, y in re.findall(r'(\d+),(\d+)', coords.get('points'))
            ])
            region_id = region.get('id')
            if indexes is not None:
                # A ReadingOrder exists: every region must appear in it
                order_index = indexes.get(region_id)
                assert order_index is not None, 'Missing index in region_indexes'
            else:
                # No ReadingOrder: fall back to document order
                order_index = position
            surfaces.append({
                'index': order_index,
                'id': region_id,
                'polygon': points,
            })
        return sorted(surfaces, key=lambda surface: surface['index'])
class SurfaceImporter(object):
def __init__(self, xmlpath=None, basepath=None, corpus=None, **kwargs):
......@@ -167,36 +94,15 @@ class SurfaceImporter(object):
for page, path in self.get_xml_pages():
xml_count += 1
s = SurfaceParser(self.basepath / path)
image = page.zone.image
parser = PageXmlParser(self.basepath / path)
if dry: # Handle dry run
logger.info("{}\t{}".format(path, image.path))
logger.info("{}\t{}".format(path, page.zone.image.path))
continue
for surfacedata in s.list_surfaces():
poly = surfacedata['polygon']
try:
z, _ = image.zones.get_or_create(polygon=poly)
except ValueError as e:
logger.warning(
"Could not import zone with polygon '{}' on image '{}' from file '{}': {}".format(
poly, image.id, path, e,
)
)
except Zone.MultipleObjectsReturned:
logger.warning('Multiple zones found, picking the first one')
z = image.zones.filter(polygon=poly).first()
surface, created = Element.objects.get_or_create(
type=ElementType.Surface,
zone=z,
corpus=page.corpus,
defaults={'name': "Surface {}".format(surfacedata['id'])},
)
surface.add_parent(page)
surfaces_count += 1
created_surfaces_count += created
region_count, created_count = parser.save_surfaces(page)
surfaces_count += region_count
created_surfaces_count += created_count
logger.info("Parsed {} and created {} surfaces from {} XML files".format(
surfaces_count, created_surfaces_count, xml_count,
......
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd">
<Metadata>
<Creator>TRP</Creator>
<Created>2018-10-01T09:25:16.139-04:00</Created>
<LastChange>2019-02-19T19:45:19.222+01:00</LastChange>
</Metadata>
<Page imageFilename="01R_CE101S01_1907_005.tif" imageWidth="2415" imageHeight="3936">
<ReadingOrder>
<OrderedGroup id="ro_1550601919253" caption="Regions reading order">
<RegionRefIndexed index="0" regionRef="TextRegion_1540299380975_9"/>
<RegionRefIndexed index="1" regionRef="TextRegion_1540299473514_23"/>
</OrderedGroup>
</ReadingOrder>
<Relations>
<Relation type="link">
<RegionRef regionRef="TextRegion_1540299380975_9"/>
<RegionRef regionRef="TextRegion_1540299473514_23"/>
</Relation>
</Relations>
<TextRegion orientation="0.0" type="marginalia" id="TextRegion_1540299380975_9" custom="readingOrder {index:0;} structure {type:marginalia;}">
<Coords points="12,34 56,78 910,1112"/>
<TextLine id="r1l6" custom="readingOrder {index:0;} structure {type:ref;}">
<Coords points="12,34 56,78 910,1112"/>
<Baseline points="13,37 42,42 37,13"/>
<TextEquiv>
<Unicode>B .1</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="r1l7" custom="readingOrder {index:1;} _prenom {offset:0; length:12; continued:true;_role:sujet;}">
<Coords points="12,34 56,78 910,1112"/>
<Baseline points="13,37 42,42 37,13"/>
<TextEquiv>
<Unicode>Louis Joseph</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="r1l8" custom="readingOrder {index:2;} _prenom {offset:0; length:13; continued:true;_role:sujet;}">
<Coords points="12,34 56,78 910,1112"/>
<Baseline points="13,37 42,42 37,13"/>
<TextEquiv>
<Unicode>Pierre Siméon</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="r1l9" custom="readingOrder {index:3;} _nom {offset:0; length:7;}">
<Coords points="12,34 56,78 910,1112"/>
<Baseline points="13,37 42,42 37,13"/>
<TextEquiv>
<Unicode>Lemieux</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>B .1
Louis Joseph
Pierre Siméon
Lemieux</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion orientation="0.0" id="TextRegion_1540299473514_23" custom="readingOrder {index:1;}">
<Coords points="12,34 56,78 910,1112"/>
<TextLine id="r2l12" custom="readingOrder {index:0;} _date {offset:3; length:30;_enregistrement:1;}">
<Coords points="12,34 56,78 910,1112"/>
<Baseline points="13,37 42,42 37,13"/>
<TextEquiv>
<Unicode>Le onze janvier mil neuf centsept</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="r2l13" custom="readingOrder {index:1;} _prenom {offset:36; length:5; continued:true;_role:sujet;}">
<Coords points="12,34 56,78 910,1112"/>
<Baseline points="13,37 42,42 37,13"/>
<TextEquiv>
<Unicode>nous prêtre soussigné avons baptisé Louis</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>Le onze janvier mil neuf centsept
nous prêtre soussigné avons baptisé Louis</Unicode>
</TextEquiv>
</TextRegion>
</Page>
</PcGts>
../../surface_samples/simple.xml
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd">
<Metadata>
<Creator>TRP</Creator>
<Created>2017-07-19T13:58:10.738+02:00</Created>
<LastChange>2017-07-19T14:04:22.502+02:00</LastChange>
</Metadata>
<Page imageFilename="FRAN_0021_0023_L.jpg" imageWidth="3195" imageHeight="3731">
<ReadingOrder>
<OrderedGroup id="ro_1500465862580" caption="Regions reading order">
<RegionRefIndexed index="0" regionRef="TextRegion_1500465748446_12"/>
</OrderedGroup>
</ReadingOrder>
<TextRegion id="TextRegion_1500465748446_12" custom="readingOrder {index:0;}">
<Coords points="2974,1270 16,1270 18,105 2976,105"/>
</TextRegion>
</Page>
</PcGts>
../../surface_samples/2_regions.xml
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd">
<Metadata>
<Creator>TRP</Creator>
<Created>2018-03-27T13:03:11.218+02:00</Created>
<LastChange>2018-03-27T15:02:50.831+02:00</LastChange>
</Metadata>
<Page imageFilename="FRAN_0021_0316_L.jpg" imageWidth="3509" imageHeight="4569">
<ReadingOrder>
<OrderedGroup id="ro_1522155770972" caption="Regions reading order">
<RegionRefIndexed index="0" regionRef="TextRegion_1522155769847_482"/>
<RegionRefIndexed index="1" regionRef="TextRegion_1522155769847_481"/>
</OrderedGroup>
</ReadingOrder>
<TextRegion id="TextRegion_1522155769847_482" custom="readingOrder {index:0;}">
<Coords points="3509,1675 0,1675 0,0 3509,0"/>
</TextRegion>
<TextRegion id="TextRegion_1522155769847_481" custom="readingOrder {index:1;}">
<Coords points="3509,4569 0,4569 0,1675 3509,1675"/>
</TextRegion>
</Page>
</PcGts>
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd">
<Metadata>
<Creator>TRP</Creator>
<Created>2018-03-27T13:03:11.218+02:00</Created>
<LastChange>2018-03-27T15:02:50.831+02:00</LastChange>
</Metadata>
<Page imageFilename="FRAN_0021_0316_L.jpg" imageWidth="3509" imageHeight="4569">
<ReadingOrder>
<OrderedGroup id="ro_1522155770972" caption="Regions reading order">
<RegionRefIndexed index="0" regionRef="TextRegion_1522155769847_482"/>
<RegionRefIndexed index="1" regionRef="TextRegion_1522155769847_481"/>
</OrderedGroup>
</ReadingOrder>
<TextRegion id="TextRegion_1522155769847_482" custom="readingOrder {index:0;}">
<Coords points="3509,1675 0,1675 0,0 3509,0"/>
</TextRegion>
<TextRegion id="TextRegion_1522155769847_481" custom="readingOrder {index:1;}">
<Coords points="3509,4569 0,4569 0,1675 3509,1675"/>
</TextRegion>
</Page>
</PcGts>
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd">
<Metadata>
<Creator>TRP</Creator>
<Created>2018-03-27T13:03:11.218+02:00</Created>
<LastChange>2018-03-27T15:02:50.831+02:00</LastChange>
</Metadata>
<Page imageFilename="FRAN_0021_0316_L.jpg" imageWidth="3509" imageHeight="4569">
<TextRegion id="TextRegion_1522155769847_482" custom="readingOrder {index:0;}">
<Coords points="3509,1675 0,1675 0,0 3509,0"/>
</TextRegion>
<TextRegion id="TextRegion_1522155769847_481" custom="readingOrder {index:1;}">
<Coords points="3509,4569 0,4569 0,1675 3509,1675"/>
</TextRegion>
<TextRegion id="TextRegion_1522155769847_484" custom="readingOrder {index:1;}">
<Coords points="3509,4569 0,4569 0,1675 3509,1675"/>
</TextRegion>
</Page>
</PcGts>
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd">
<Metadata>
<Creator>TRP</Creator>
<Created>2017-07-19T13:58:10.738+02:00</Created>
<LastChange>2017-07-19T14:04:22.502+02:00</LastChange>
</Metadata>
<Page imageFilename="FRAN_0021_0023_L.jpg" imageWidth="3195" imageHeight="3731">
<ReadingOrder>
<OrderedGroup id="ro_1500465862580" caption="Regions reading order">
<RegionRefIndexed index="0" regionRef="TextRegion_1500465748446_12"/>
</OrderedGroup>
</ReadingOrder>
<TextRegion id="TextRegion_1500465748446_12" custom="readingOrder {index:0;}">
<Coords points="2974,1270 16,1270 18,105 2976,105"/>
</TextRegion>
</Page>
</PcGts>
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd">
<Metadata>
<Creator>TRP</Creator>
<Created>2018-03-27T13:03:11.218+02:00</Created>
<LastChange>2018-03-27T15:02:50.831+02:00</LastChange>
</Metadata>
<Page imageFilename="FRAN_0021_0316_L.jpg" imageWidth="3509" imageHeight="4569">
<ReadingOrder>
<OrderedGroup id="ro_1522155770972" caption="Regions reading order">
<RegionRefIndexed index="1" regionRef="TextRegion_1522155769847_481"/>
<RegionRefIndexed index="0" regionRef="TextRegion_1522155769847_482"/>
<RegionRefIndexed index="2" regionRef="TextRegion_1522155769847_484"/>
</OrderedGroup>
</ReadingOrder>
<TextRegion id="TextRegion_1522155769847_482" custom="readingOrder {index:0;}">
<Coords points="3509,1675 0,1675 0,0 3509,0"/>
</TextRegion>
<TextRegion id="TextRegion_1522155769847_481" custom="readingOrder {index:1;}">
<Coords points="3509,4569 0,4569 0,1675 3509,1675"/>
</TextRegion>
<TextRegion id="TextRegion_1522155769847_484" custom="readingOrder {index:1;}">
<Coords points="3509,4569 0,4569 0,1675 3509,1675"/>
</TextRegion>
</Page>
</PcGts>
from pathlib import Path
from django.urls import reverse
from rest_framework import status
from arkindex.project.tests import FixtureAPITestCase
from arkindex.documents.models import Page, TranscriptionType
FIXTURES = Path(__file__).absolute().parent / 'pagexml_samples'
class TestPageXml(FixtureAPITestCase):
    """API tests for the PAGE XML transcriptions import endpoint."""

    @classmethod
    def setUpTestData(cls):
        super().setUpTestData()
        cls.page = Page.objects.get(corpus=cls.corpus, name='Volume 2, page 1r')

    def _post_transcript(self):
        """POST the transcript.xml fixture to the import endpoint."""
        with (FIXTURES / 'transcript.xml').open() as f:
            return self.client.post(
                reverse('api:pagexml-transcriptions', kwargs={'pk': str(self.page.id)}),
                data=f.read(),
                content_type='application/xml',
            )

    def test_pagexml_import_requires_login(self):
        response = self._post_transcript()
        self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)

    def test_pagexml_import(self):
        self.assertFalse(self.page.transcriptions.exists())
        self.client.force_login(self.user)
        response = self._post_transcript()
        self.assertEqual(response.status_code, status.HTTP_201_CREATED)
        self.maxDiff = None
        self.assertCountEqual(self.page.transcriptions.values_list('type', 'text'), [
            (TranscriptionType.Paragraph, 'B .1\nLouis Joseph\nPierre Siméon\nLemieux'),
            (TranscriptionType.Paragraph, 'Le onze janvier mil neuf centsept\n'
                                          'nous prêtre soussigné avons baptisé Louis'),
            (TranscriptionType.Line, 'B .1'),
            (TranscriptionType.Line, 'Louis Joseph'),
            (TranscriptionType.Line, 'Pierre Siméon'),
            (TranscriptionType.Line, 'Lemieux'),
            (TranscriptionType.Line, 'Le onze janvier mil neuf centsept'),
            (TranscriptionType.Line, 'nous prêtre soussigné avons baptisé Louis'),
        ])
from unittest import TestCase
from arkindex.documents.surface import SurfaceParser
from arkindex.project.polygon import Polygon
import os.path
# Directory holding the PAGE XML sample files used by these tests
FIXTURES = os.path.join(
    os.path.dirname(os.path.realpath(__file__)),
    'surface_samples',
)
class TestSurfaceParser(TestCase):
    """
    Tests for SurfaceParser against the XML samples in surface_samples/.
    The repeated assertion blocks are factored into private helpers so each
    test reads as data rather than boilerplate.
    """

    def _assert_metadata(self, meta, filename, width, height, created, lastchange):
        # Common metadata checks: image attributes plus TRP creator fields
        self.assertEqual(meta['imageFilename'], filename)
        self.assertEqual(meta['imageWidth'], width)
        self.assertEqual(meta['imageHeight'], height)
        self.assertEqual(meta['creator'], 'TRP')
        self.assertEqual(meta['created'], created)
        self.assertEqual(meta['lastchange'], lastchange)

    def _assert_surface(self, surface, surface_id, index, polygon):
        # Checks one surface dict returned by SurfaceParser.list_surfaces
        self.assertEqual(surface['id'], surface_id)
        self.assertEqual(surface['index'], index)
        self.assertEqual(surface['polygon'], polygon)

    def test_surface_simple(self):
        s = SurfaceParser(os.path.join(FIXTURES, 'simple.xml'))
        self._assert_metadata(
            s.get_metadata(),
            'FRAN_0021_0023_L.jpg', '3195', '3731',
            '2017-07-19T13:58:10.738+02:00', '2017-07-19T14:04:22.502+02:00',
        )
        surfaces = s.list_surfaces()
        self._assert_surface(
            surfaces[0], 'TextRegion_1500465748446_12', 0,
            Polygon([(2974, 1270), (16, 1270), (18, 105), (2976, 105)]),
        )

    def test_surface_2_regions(self):
        s = SurfaceParser(os.path.join(FIXTURES, '2_regions.xml'))
        self._assert_metadata(
            s.get_metadata(),
            'FRAN_0021_0316_L.jpg', '3509', '4569',
            '2018-03-27T13:03:11.218+02:00', '2018-03-27T15:02:50.831+02:00',
        )
        surfaces = s.list_surfaces()
        self._assert_surface(
            surfaces[0], 'TextRegion_1522155769847_482', 0,
            Polygon([(3509, 1675), (0, 1675), (0, 0), (3509, 0)]),
        )
        self._assert_surface(
            surfaces[1], 'TextRegion_1522155769847_481', 1,
            Polygon([(3509, 4569), (0, 4569), (0, 1675), (3509, 1675)]),
        )

    def test_surface_unordered(self):
        # Regions listed out of order in the XML must come back sorted by index
        s = SurfaceParser(os.path.join(FIXTURES, 'unordered.xml'))
        self._assert_metadata(
            s.get_metadata(),
            'FRAN_0021_0316_L.jpg', '3509', '4569',
            '2018-03-27T13:03:11.218+02:00', '2018-03-27T15:02:50.831+02:00',
        )
        surfaces = s.list_surfaces()
        self._assert_surface(
            surfaces[0], 'TextRegion_1522155769847_482', 0,
            Polygon([(3509, 1675), (0, 1675), (0, 0), (3509, 0)]),
        )
        self._assert_surface(
            surfaces[1], 'TextRegion_1522155769847_481', 1,
            Polygon([(3509, 4569), (0, 4569), (0, 1675), (3509, 1675)]),
        )
        self._assert_surface(
            surfaces[2], 'TextRegion_1522155769847_484', 2,
            Polygon([(3509, 4569), (0, 4569), (0, 1675), (3509, 1675)]),
        )

    def test_surface_no_reading_order(self):
        "Test SurfaceParser with no ReadingOrder tag in XML file"
        surfaces = SurfaceParser(os.path.join(FIXTURES, 'no_reading_order.xml')).list_surfaces()
        self._assert_surface(
            surfaces[0], 'TextRegion_1522155769847_482', 0,
            Polygon([(3509, 1675), (0, 1675), (0, 0), (3509, 0)]),
        )
        self._assert_surface(
            surfaces[1], 'TextRegion_1522155769847_481', 1,
            Polygon([(3509, 4569), (0, 4569), (0, 1675), (3509, 1675)]),
        )
        self._assert_surface(
            surfaces[2], 'TextRegion_1522155769847_484', 2,
            Polygon([(3509, 4569), (0, 4569), (0, 1675), (3509, 1675)]),
        )
......@@ -7,7 +7,8 @@ from arkindex.documents.api.elements import (
ElementTranscriptions, ElementsCreate,
)
from arkindex.documents.api.search import PageSearch, ActSearch
from arkindex.documents.api.ml import ClassificationBulk, TranscriptionCreate, TranscriptionBulk
from arkindex.documents.api.ml import \
ClassificationBulk, TranscriptionCreate, TranscriptionBulk, PageXmlTranscriptionsImport
from arkindex.documents.api.iiif import (
VolumeManifest, ActManifest, PageAnnotationList, PageActAnnotationList, SurfaceAnnotationList,
TranscriptionSearchAnnotationList,
......@@ -36,6 +37,11 @@ api = [
path('element/<uuid:pk>/history/', ElementHistory.as_view(), name='element-history'),
path('element/<uuid:pk>/transcriptions/', ElementTranscriptions.as_view(), name='element-transcriptions'),
path('page/<uuid:pk>/', PageDetails.as_view(), name='page-details'),
path(
'page/<uuid:pk>/transcriptions/xml/',
PageXmlTranscriptionsImport.as_view(),
name='pagexml-transcriptions',
),
path('surface/<uuid:pk>/', SurfaceDetails.as_view(), name='surface-details'),
path('corpus/', CorpusList.as_view(), name='corpus'),
path('corpus/<uuid:pk>/', CorpusRetrieve.as_view(), name='corpus-retrieve'),
......
from lxml import etree
from rest_framework.exceptions import ParseError
from rest_framework.parsers import BaseParser
class XMLParser(BaseParser):
    """
    A basic XML parser without serializer support.

    Parses an application/xml request body into an lxml Element so that
    views can process the XML tree themselves.
    """
    media_type = 'application/xml'

    def parse(self, stream, media_type=None, parser_context=None):
        """
        Parse the request body into a lxml Element.

        Raises a DRF ParseError (HTTP 400) on malformed XML instead of
        letting lxml's XMLSyntaxError bubble up as an HTTP 500.
        """
        try:
            return etree.parse(stream).getroot()
        except etree.XMLSyntaxError as e:
            raise ParseError('XML parse error: {!s}'.format(e))
......@@ -112,7 +112,7 @@ def update_schema(schema, patches):
for method, operation in methods.items():
if 'requestBody' not in operation:
continue
if not operation['requestBody']['content']['application/json']['schema']:
if not operation['requestBody']['content'].get('application/json', {}).get('schema'):
# Ignore empty schemas
continue
......
......@@ -437,6 +437,23 @@ paths:
security: []
tags:
- elements
/api/v1/page/{id}/transcriptions/xml/:
post:
operationId: ImportPageXmlTranscriptions
description: Import transcriptions into Arkindex from region data in the PAGE XML format.
requestBody:
required: true
description: >-
A PAGE XML document.
TextRegion tags will be imported as Paragraph transcriptions
and TextLine tags will become Line transcriptions.
See https://github.com/PRImA-Research-Lab/PAGE-XML for more info
about the PAGE XML format.
content:
application/xml:
schema: {}
tags:
- ml
/api/v1/pages/:
get:
operationId: SearchPages
......
git+https://github.com/encode/django-rest-framework.git@ac64c0a536b0ae21b81d86c3c2a37bc0c70f932e#egg=djangorestframework
git+https://github.com/encode/django-rest-framework.git@bb0db35680dd85cd33dade83b6cbd1039995b9db#egg=djangorestframework
coreapi==2.3.3
apistar>=0.7.2
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment