diff --git a/arkindex/documents/api/ml.py b/arkindex/documents/api/ml.py index 489b5326a74d88c1aca91116df03670959d5f4f9..9881ca98bf76f699eb3b3403ed594c536d259bc9 100644 --- a/arkindex/documents/api/ml.py +++ b/arkindex/documents/api/ml.py @@ -1,14 +1,20 @@ from django.conf import settings +from django.shortcuts import get_object_or_404 from rest_framework import status +from rest_framework.mixins import CreateModelMixin from rest_framework.generics import CreateAPIView, UpdateAPIView from rest_framework.exceptions import ValidationError from rest_framework.response import Response -from arkindex.documents.models import Classification, DataSource, Transcription, TranscriptionType +from rest_framework.views import APIView +from arkindex.documents.models import \ + Classification, DataSource, Transcription, TranscriptionType, Page, Corpus from arkindex.documents.serializers.ml import \ ClassificationsSerializer, TranscriptionsSerializer, TranscriptionCreateSerializer from arkindex.documents.indexer import Indexer +from arkindex.documents.pagexml import PageXmlParser from arkindex.images.models import Zone from arkindex.images.importer import build_transcriptions, save_transcriptions, index_transcriptions +from arkindex.project.parsers import XMLParser from arkindex.project.permissions import IsVerified from arkindex.project.polygon import Polygon @@ -160,3 +166,33 @@ class ClassificationBulk(CreateAPIView): ) for cl in serializer.validated_data['classifications'] ]) + + +class PageXmlTranscriptionsImport(CreateModelMixin, APIView): + parser_classes = (XMLParser, ) + permission_classes = (IsVerified, ) + + def get_queryset(self): + return Page.objects.filter(corpus__in=Corpus.objects.writable(self.request.user)) + + def get_object(self): + """ + Since we are inheriting from APIView, because GenericAPIView would break OpenAPI, + we have to rewrite get_object ourselves. + """ + obj = get_object_or_404(self.get_queryset(), pk=self.kwargs['pk']) + self.check_object_permissions(self.request, obj) + return obj + + def post(self, request, *args, **kwargs): + page = self.get_object() + try: + parser = PageXmlParser(request.data) + except AssertionError as e: + raise ValidationError('Could not parse PAGE XML document: {!s}'.format(e)) + + parser.save_transcriptions(page) + return Response( + status=status.HTTP_201_CREATED, + headers=self.get_success_headers(None), + ) diff --git a/arkindex/documents/pagexml.py b/arkindex/documents/pagexml.py new file mode 100644 index 0000000000000000000000000000000000000000..624841894eea34fc291661de88c75cfb196ef62f --- /dev/null +++ b/arkindex/documents/pagexml.py @@ -0,0 +1,93 @@ +from django.utils.functional import cached_property +from arkindex_common.ml_tool import MLToolType +from arkindex_common.pagexml import PageXmlPage +from arkindex.project.polygon import Polygon +from arkindex.documents.models import DataSource, TranscriptionType, ElementType, Page +import logging + +logger = logging.getLogger(__name__) + + +class PageXmlParser(object): + + def __init__(self, path_or_xml): + self.pagexml_page = PageXmlPage(path_or_xml) + + @cached_property + def source(self): + ds, _ = DataSource.objects.get_or_create( + type=MLToolType.Recognizer, + slug='transkribus', + revision='2013-07-15', + internal=False, + ) + return ds + + def get_zone(self, region, page): + poly = Polygon(region.points) + z = page.zone.image.zones.filter(polygon=poly).first() + if not z: + z = page.zone.image.zones.create(polygon=poly) + return z + + def create_surface(self, region, page): + if region.points is None: + logger.warning('No points in region {}'.format(region.id)) + return None, False + surface, created = self.get_zone(region, page).elements.get_or_create( + type=ElementType.Surface, + corpus=page.corpus, + defaults={ + 'name': 'Surface {}'.format(region.id), + }, + ) + surface.add_parent(page) + return surface, created + + def save_surfaces(self, page): + if not len(self.pagexml_page.page.text_regions): + logger.warning('No surfaces to save') + return 0, 0 + region_count, created_count = 0, 0 + for region in self.pagexml_page.page.sort_regions(self.pagexml_page.page.text_regions): + region_count += 1 + _, created = self.create_surface(region, page) + created_count += created + return region_count, created_count + + def create_transcription(self, region, page, type=TranscriptionType.Line): + if region.text is None: + logger.warning('No text in region {}'.format(region.id)) + return None, False + + if region.points is None: + logger.warning('No points in region {}'.format(region.id)) + return None, False + + return self.get_zone(region, page).transcriptions.get_or_create( + type=type, + element=page, + text=region.text, + source=self.source, + ) + + def save_transcriptions(self, page): + assert isinstance(page, Page), 'Page should be an Arkindex page' + if self.pagexml_page.page.text_regions is None or not len(self.pagexml_page.page.text_regions): + logger.warning('No transcriptions to save') + return + region_count, line_count, region_ts_count, line_ts_count = 0, 0, 0, 0 + for region in self.pagexml_page.page.text_regions: + region_count += 1 + _, created = self.create_transcription(region, page, type=TranscriptionType.Paragraph) + region_ts_count += created + for line in region.lines: + line_count += 1 + _, created = self.create_transcription(line, page) + line_ts_count += created + logger.info('Parsed {} regions and {} lines and created {} paragraph and {} line transcriptions'.format( + region_count, + line_count, + region_ts_count, + line_ts_count, + )) diff --git a/arkindex/documents/surface.py b/arkindex/documents/surface.py index fb1e66c00f967369ac5f7b1955a7a758b888b570..459558d79ca68f3c88a4b09741adf9c1b97cd6e3 100644 --- a/arkindex/documents/surface.py +++ b/arkindex/documents/surface.py @@ -1,91 +1,18 @@ -from arkindex.project.polygon import Polygon from arkindex.documents.models import Corpus, ElementType, Element, Page +from arkindex.documents.pagexml import PageXmlParser from arkindex.documents.importer import parse_folio -from arkindex.images.models import Zone from pathlib import Path -import xml.etree.ElementTree as ElementTree import re import logging logger = logging.getLogger(__name__) -NS_PAGE = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15' -NS = {'page': NS_PAGE} -NS_DELETE = "{%s}" % NS_PAGE - REGEX_VOLUME_NAME = re.compile(r'(?:JJ)?([0-9]+)([A-Z]?)') REGEX_ACT_NUMBER = re.compile(r'([0-9]+)\s*([a-z]*)') REGEX_FOLIO = re.compile(r'(?:fol\.?\s*)?([0-9]+)\s*(bis)?\s*(r|v)?', re.IGNORECASE) REGEX_FOLIO_GROUP = re.compile(r'([^-_à ]+)', re.IGNORECASE) -class SurfaceParser(object): - def __init__(self, path): - - self.path = str(path) - self.tree = ElementTree.parse(self.path) - - # Check we have a page - self.page = self.tree.find('page:Page', NS) - assert self.page is not None, 'Missing page' - - def get_metadata(self): - - # From metadata - metadata = self.tree.find('page:Metadata', NS) - assert metadata is not None, 'Missing metadata' - dictMetadata = {node.tag.replace(NS_DELETE, "").lower(): node.text for node in metadata} - - # From page - dictMetadata.update(self.page.attrib) - - return dictMetadata - - def list_surfaces(self): - region_indexes = None - - # Load base elements from page - read = self.page.find('page:ReadingOrder', NS) - if read is not None: - order = read.find('page:OrderedGroup', NS) - assert order is not None, 'Missing ordered group' - - # Find indexes for each region - region_indexes = { - refindex.get('regionRef'): int(refindex.get('index')) - for refindex in order.findall('page:RegionRefIndexed', NS) - } - - out = [] - for i, textRegion in enumerate(self.page.findall('page:TextRegion', NS)): - - # Build coordinates polygon - coords = textRegion.find('page:Coords', NS) - assert coords is not None, 'Missing coords' - polygon = Polygon([ - (int(x), int(y)) - for x, y in re.findall(r'(\d+),(\d+)', coords.get('points')) - ]) - - text_id = textRegion.get('id') - if region_indexes is not None: # If ReadingOrder exists - # Find index for this text region - index = region_indexes.get(text_id) - assert index is not None, 'Missing index in region_indexes' - else: - index = i - - # Build output structure - out.append({ - 'index': index, - 'id': text_id, - 'polygon': polygon, - }) - - # Sort the surfaces by index - return sorted(out, key=lambda x: x['index']) - - class SurfaceImporter(object): def __init__(self, xmlpath=None, basepath=None, corpus=None, **kwargs): @@ -167,36 +94,15 @@ class SurfaceImporter(object): for page, path in self.get_xml_pages(): xml_count += 1 - s = SurfaceParser(self.basepath / path) - image = page.zone.image + parser = PageXmlParser(self.basepath / path) if dry: # Handle dry run - logger.info("{}\t{}".format(path, image.path)) + logger.info("{}\t{}".format(path, page.zone.image.path)) continue - for surfacedata in s.list_surfaces(): - poly = surfacedata['polygon'] - try: - z, _ = image.zones.get_or_create(polygon=poly) - except ValueError as e: - logger.warning( - "Could not import zone with polygon '{}' on image '{}' from file '{}': {}".format( - poly, image.id, path, e, - ) - ) - except Zone.MultipleObjectsReturned: - logger.warning('Multiple zones found, picking the first one') - z = image.zones.filter(polygon=poly).first() - - surface, created = Element.objects.get_or_create( - type=ElementType.Surface, - zone=z, - corpus=page.corpus, - defaults={'name': "Surface {}".format(surfacedata['id'])}, - ) - surface.add_parent(page) - surfaces_count += 1 - created_surfaces_count += created + region_count, created_count = parser.save_surfaces(page) + surfaces_count += region_count + created_surfaces_count += created_count logger.info("Parsed {} and created {} surfaces from {} XML files".format( surfaces_count, created_surfaces_count, xml_count, diff --git a/arkindex/documents/tests/pagexml_samples/transcript.xml b/arkindex/documents/tests/pagexml_samples/transcript.xml new file mode 100644 index 0000000000000000000000000000000000000000..d1ba6bb54d20de65b04666bdde86fc5147413ec3 --- /dev/null +++ b/arkindex/documents/tests/pagexml_samples/transcript.xml @@ -0,0 +1,80 @@ +<?xml version="1.0" encoding="UTF-8" standalone="yes"?> +<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd"> + <Metadata> + <Creator>TRP</Creator> + <Created>2018-10-01T09:25:16.139-04:00</Created> + <LastChange>2019-02-19T19:45:19.222+01:00</LastChange> + </Metadata> + <Page imageFilename="01R_CE101S01_1907_005.tif" imageWidth="2415" imageHeight="3936"> + <ReadingOrder> + <OrderedGroup id="ro_1550601919253" caption="Regions reading order"> + <RegionRefIndexed index="0" regionRef="TextRegion_1540299380975_9"/> + <RegionRefIndexed index="1" regionRef="TextRegion_1540299473514_23"/> + </OrderedGroup> + </ReadingOrder> + <Relations> + <Relation type="link"> + <RegionRef regionRef="TextRegion_1540299380975_9"/> + <RegionRef regionRef="TextRegion_1540299473514_23"/> + </Relation> + </Relations> + <TextRegion orientation="0.0" type="marginalia" id="TextRegion_1540299380975_9" custom="readingOrder {index:0;} structure {type:marginalia;}"> + <Coords points="12,34 56,78 910,1112"/> + <TextLine id="r1l6" custom="readingOrder {index:0;} structure {type:ref;}"> + <Coords points="12,34 56,78 910,1112"/> + <Baseline points="13,37 42,42 37,13"/> + <TextEquiv> + <Unicode>B .1</Unicode> + </TextEquiv> + </TextLine> + <TextLine id="r1l7" custom="readingOrder {index:1;} _prenom {offset:0; length:12; continued:true;_role:sujet;}"> + <Coords points="12,34 56,78 910,1112"/> + <Baseline points="13,37 42,42 37,13"/> + <TextEquiv> + <Unicode>Louis Joseph</Unicode> + </TextEquiv> + </TextLine> + <TextLine id="r1l8" custom="readingOrder {index:2;} _prenom {offset:0; length:13; continued:true;_role:sujet;}"> + <Coords points="12,34 56,78 910,1112"/> + <Baseline points="13,37 42,42 37,13"/> + <TextEquiv> + <Unicode>Pierre Siméon</Unicode> + </TextEquiv> + </TextLine> + <TextLine id="r1l9" custom="readingOrder {index:3;} _nom {offset:0; length:7;}"> + <Coords points="12,34 56,78 910,1112"/> + <Baseline points="13,37 42,42 37,13"/> + <TextEquiv> + <Unicode>Lemieux</Unicode> + </TextEquiv> + </TextLine> + <TextEquiv> + <Unicode>B .1 +Louis Joseph +Pierre Siméon +Lemieux</Unicode> + </TextEquiv> + </TextRegion> + <TextRegion orientation="0.0" id="TextRegion_1540299473514_23" custom="readingOrder {index:1;}"> + <Coords points="12,34 56,78 910,1112"/> + <TextLine id="r2l12" custom="readingOrder {index:0;} _date {offset:3; length:30;_enregistrement:1;}"> + <Coords points="12,34 56,78 910,1112"/> + <Baseline points="13,37 42,42 37,13"/> + <TextEquiv> + <Unicode>Le onze janvier mil neuf centsept</Unicode> + </TextEquiv> + </TextLine> + <TextLine id="r2l13" custom="readingOrder {index:1;} _prenom {offset:36; length:5; continued:true;_role:sujet;}"> + <Coords points="12,34 56,78 910,1112"/> + <Baseline points="13,37 42,42 37,13"/> + <TextEquiv> + <Unicode>nous prêtre soussigné avons baptisé Louis</Unicode> + </TextEquiv> + </TextLine> + <TextEquiv> + <Unicode>Le onze janvier mil neuf centsept +nous prêtre soussigné avons baptisé Louis</Unicode> + </TextEquiv> + </TextRegion> + </Page> +</PcGts> diff --git a/arkindex/documents/tests/surface_importer_samples/volume 1/1r.xml b/arkindex/documents/tests/surface_importer_samples/volume 1/1r.xml deleted file mode 120000 index bedd858aea72d8de44d772f1c99cdb55ac2eb67a..0000000000000000000000000000000000000000 --- a/arkindex/documents/tests/surface_importer_samples/volume 1/1r.xml +++ /dev/null @@ -1 +0,0 @@ -../../surface_samples/simple.xml \ No newline at end of file diff --git a/arkindex/documents/tests/surface_importer_samples/volume 1/1r.xml b/arkindex/documents/tests/surface_importer_samples/volume 1/1r.xml new file mode 100644 index 0000000000000000000000000000000000000000..2c7658cbe8eac1740aa54cd0739c6a512e638cd7 --- /dev/null +++ b/arkindex/documents/tests/surface_importer_samples/volume 1/1r.xml @@ -0,0 +1,18 @@ +<?xml version="1.0" encoding="UTF-8" standalone="yes"?> +<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd"> + <Metadata> + <Creator>TRP</Creator> + <Created>2017-07-19T13:58:10.738+02:00</Created> + <LastChange>2017-07-19T14:04:22.502+02:00</LastChange> + </Metadata> + <Page imageFilename="FRAN_0021_0023_L.jpg" imageWidth="3195" imageHeight="3731"> + <ReadingOrder> + <OrderedGroup id="ro_1500465862580" caption="Regions reading order"> + <RegionRefIndexed index="0" regionRef="TextRegion_1500465748446_12"/> + </OrderedGroup> + </ReadingOrder> + <TextRegion id="TextRegion_1500465748446_12" custom="readingOrder {index:0;}"> + <Coords points="2974,1270 16,1270 18,105 2976,105"/> + </TextRegion> + </Page> +</PcGts> diff --git a/arkindex/documents/tests/surface_importer_samples/volume 1/1v.xml b/arkindex/documents/tests/surface_importer_samples/volume 1/1v.xml deleted file mode 120000 index 61bca024c852b9de31f489b88b3ea2ad26cb2f20..0000000000000000000000000000000000000000 --- a/arkindex/documents/tests/surface_importer_samples/volume 1/1v.xml +++ /dev/null @@ -1 +0,0 @@ -../../surface_samples/2_regions.xml \ No newline at end of file diff --git a/arkindex/documents/tests/surface_importer_samples/volume 1/1v.xml b/arkindex/documents/tests/surface_importer_samples/volume 1/1v.xml new file mode 100644 index 0000000000000000000000000000000000000000..411b329aaae03102e9fadec7d5c55dcf2b9b5a22 --- /dev/null +++ b/arkindex/documents/tests/surface_importer_samples/volume 1/1v.xml @@ -0,0 +1,22 @@ +<?xml version="1.0" encoding="UTF-8" standalone="yes"?> +<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd"> + <Metadata> + <Creator>TRP</Creator> + <Created>2018-03-27T13:03:11.218+02:00</Created> + <LastChange>2018-03-27T15:02:50.831+02:00</LastChange> + </Metadata> + <Page imageFilename="FRAN_0021_0316_L.jpg" imageWidth="3509" imageHeight="4569"> + <ReadingOrder> + <OrderedGroup id="ro_1522155770972" caption="Regions reading order"> + <RegionRefIndexed index="0" regionRef="TextRegion_1522155769847_482"/> + <RegionRefIndexed index="1" regionRef="TextRegion_1522155769847_481"/> + </OrderedGroup> + </ReadingOrder> + <TextRegion id="TextRegion_1522155769847_482" custom="readingOrder {index:0;}"> + <Coords points="3509,1675 0,1675 0,0 3509,0"/> + </TextRegion> + <TextRegion id="TextRegion_1522155769847_481" custom="readingOrder {index:1;}"> + <Coords points="3509,4569 0,4569 0,1675 3509,1675"/> + </TextRegion> + </Page> +</PcGts> diff --git a/arkindex/documents/tests/surface_samples/2_regions.xml b/arkindex/documents/tests/surface_samples/2_regions.xml deleted file mode 100644 index 411b329aaae03102e9fadec7d5c55dcf2b9b5a22..0000000000000000000000000000000000000000 --- a/arkindex/documents/tests/surface_samples/2_regions.xml +++ /dev/null @@ -1,22 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="yes"?> -<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd"> - <Metadata> - <Creator>TRP</Creator> - <Created>2018-03-27T13:03:11.218+02:00</Created> - <LastChange>2018-03-27T15:02:50.831+02:00</LastChange> - </Metadata> - <Page imageFilename="FRAN_0021_0316_L.jpg" imageWidth="3509" imageHeight="4569"> - <ReadingOrder> - <OrderedGroup id="ro_1522155770972" caption="Regions reading order"> - <RegionRefIndexed index="0" regionRef="TextRegion_1522155769847_482"/> - <RegionRefIndexed index="1" regionRef="TextRegion_1522155769847_481"/> - </OrderedGroup> - </ReadingOrder> - <TextRegion id="TextRegion_1522155769847_482" custom="readingOrder {index:0;}"> - <Coords points="3509,1675 0,1675 0,0 3509,0"/> - </TextRegion> - <TextRegion id="TextRegion_1522155769847_481" custom="readingOrder {index:1;}"> - <Coords points="3509,4569 0,4569 0,1675 3509,1675"/> - </TextRegion> - </Page> -</PcGts> diff --git a/arkindex/documents/tests/surface_samples/no_reading_order.xml b/arkindex/documents/tests/surface_samples/no_reading_order.xml deleted file mode 100644 index c13da57b9ad6b73ed319baebdcf1ecffc66a73aa..0000000000000000000000000000000000000000 --- a/arkindex/documents/tests/surface_samples/no_reading_order.xml +++ /dev/null @@ -1,19 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="yes"?> -<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd"> - <Metadata> - <Creator>TRP</Creator> - <Created>2018-03-27T13:03:11.218+02:00</Created> - <LastChange>2018-03-27T15:02:50.831+02:00</LastChange> - </Metadata> - <Page imageFilename="FRAN_0021_0316_L.jpg" imageWidth="3509" imageHeight="4569"> - <TextRegion id="TextRegion_1522155769847_482" custom="readingOrder {index:0;}"> - <Coords points="3509,1675 0,1675 0,0 3509,0"/> - </TextRegion> - <TextRegion id="TextRegion_1522155769847_481" custom="readingOrder {index:1;}"> - <Coords points="3509,4569 0,4569 0,1675 3509,1675"/> - </TextRegion> - <TextRegion id="TextRegion_1522155769847_484" custom="readingOrder {index:1;}"> - <Coords points="3509,4569 0,4569 0,1675 3509,1675"/> - </TextRegion> - </Page> -</PcGts> diff --git a/arkindex/documents/tests/surface_samples/simple.xml b/arkindex/documents/tests/surface_samples/simple.xml deleted file mode 100644 index 2c7658cbe8eac1740aa54cd0739c6a512e638cd7..0000000000000000000000000000000000000000 --- a/arkindex/documents/tests/surface_samples/simple.xml +++ /dev/null @@ -1,18 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="yes"?> -<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd"> - <Metadata> - <Creator>TRP</Creator> - <Created>2017-07-19T13:58:10.738+02:00</Created> - <LastChange>2017-07-19T14:04:22.502+02:00</LastChange> - </Metadata> - <Page imageFilename="FRAN_0021_0023_L.jpg" imageWidth="3195" imageHeight="3731"> - <ReadingOrder> - <OrderedGroup id="ro_1500465862580" caption="Regions reading order"> - <RegionRefIndexed index="0" regionRef="TextRegion_1500465748446_12"/> - </OrderedGroup> - </ReadingOrder> - <TextRegion id="TextRegion_1500465748446_12" custom="readingOrder {index:0;}"> - <Coords points="2974,1270 16,1270 18,105 2976,105"/> - </TextRegion> - </Page> -</PcGts> diff --git a/arkindex/documents/tests/surface_samples/unordered.xml b/arkindex/documents/tests/surface_samples/unordered.xml deleted file mode 100644 index 235f166ddf7fe4edcd2fc1aa41941e107428f5e9..0000000000000000000000000000000000000000 --- a/arkindex/documents/tests/surface_samples/unordered.xml +++ /dev/null @@ -1,26 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" standalone="yes"?> -<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd"> - <Metadata> - <Creator>TRP</Creator> - <Created>2018-03-27T13:03:11.218+02:00</Created> - <LastChange>2018-03-27T15:02:50.831+02:00</LastChange> - </Metadata> - <Page imageFilename="FRAN_0021_0316_L.jpg" imageWidth="3509" imageHeight="4569"> - <ReadingOrder> - <OrderedGroup id="ro_1522155770972" caption="Regions reading order"> - <RegionRefIndexed index="1" regionRef="TextRegion_1522155769847_481"/> - <RegionRefIndexed index="0" regionRef="TextRegion_1522155769847_482"/> - <RegionRefIndexed index="2" regionRef="TextRegion_1522155769847_484"/> - </OrderedGroup> - </ReadingOrder> - <TextRegion id="TextRegion_1522155769847_482" custom="readingOrder {index:0;}"> - <Coords points="3509,1675 0,1675 0,0 3509,0"/> - </TextRegion> - <TextRegion id="TextRegion_1522155769847_481" custom="readingOrder {index:1;}"> - <Coords points="3509,4569 0,4569 0,1675 3509,1675"/> - </TextRegion> - <TextRegion id="TextRegion_1522155769847_484" custom="readingOrder {index:1;}"> - <Coords points="3509,4569 0,4569 0,1675 3509,1675"/> - </TextRegion> - </Page> -</PcGts> diff --git a/arkindex/documents/tests/test_pagexml.py b/arkindex/documents/tests/test_pagexml.py new file mode 100644 index 0000000000000000000000000000000000000000..95ea930233d8ea8c6a8a46e0728838f03338c098 --- /dev/null +++ b/arkindex/documents/tests/test_pagexml.py @@ -0,0 +1,47 @@ +from pathlib import Path +from django.urls import reverse +from rest_framework import status +from arkindex.project.tests import FixtureAPITestCase +from arkindex.documents.models import Page, TranscriptionType + +FIXTURES = Path(__file__).absolute().parent / 'pagexml_samples' + + +class TestPageXml(FixtureAPITestCase): + + @classmethod + def setUpTestData(cls): + super().setUpTestData() + cls.page = Page.objects.get(corpus=cls.corpus, name='Volume 2, page 1r') + + def test_pagexml_import_requires_login(self): + with (FIXTURES / 'transcript.xml').open() as f: + resp = self.client.post( + reverse('api:pagexml-transcriptions', kwargs={'pk': str(self.page.id)}), + data=f.read(), + content_type='application/xml', + ) + self.assertEqual(resp.status_code, status.HTTP_403_FORBIDDEN) + + def test_pagexml_import(self): + self.assertFalse(self.page.transcriptions.exists()) + self.client.force_login(self.user) + with (FIXTURES / 'transcript.xml').open() as f: + resp = self.client.post( + reverse('api:pagexml-transcriptions', kwargs={'pk': str(self.page.id)}), + data=f.read(), + content_type='application/xml', + ) + self.assertEqual(resp.status_code, status.HTTP_201_CREATED) + self.maxDiff = None + self.assertCountEqual(self.page.transcriptions.values_list('type', 'text'), [ + (TranscriptionType.Paragraph, 'B .1\nLouis Joseph\nPierre Siméon\nLemieux'), + (TranscriptionType.Paragraph, 'Le onze janvier mil neuf centsept\n' + 'nous prêtre soussigné avons baptisé Louis'), + (TranscriptionType.Line, 'B .1'), + (TranscriptionType.Line, 'Louis Joseph'), + (TranscriptionType.Line, 'Pierre Siméon'), + (TranscriptionType.Line, 'Lemieux'), + (TranscriptionType.Line, 'Le onze janvier mil neuf centsept'), + (TranscriptionType.Line, 'nous prêtre soussigné avons baptisé Louis'), + ]) diff --git a/arkindex/documents/tests/test_surface_parser.py b/arkindex/documents/tests/test_surface_parser.py deleted file mode 100644 index dfd0a5c53831d0010b35feb2ffdbbdfbc67b4cf9..0000000000000000000000000000000000000000 --- a/arkindex/documents/tests/test_surface_parser.py +++ /dev/null @@ -1,86 +0,0 @@ -from unittest import TestCase -from arkindex.documents.surface import SurfaceParser -from arkindex.project.polygon import Polygon -import os.path - -FIXTURES = os.path.join( - os.path.dirname(os.path.realpath(__file__)), - 'surface_samples', -) - - -class TestSurfaceParser(TestCase): - # TODO: Make these tests readable! - - def test_surface_simple(self): - s = SurfaceParser(os.path.join(FIXTURES, 'simple.xml')) - - # Check metadata - meta = s.get_metadata() - self.assertEqual(meta['imageFilename'], 'FRAN_0021_0023_L.jpg') - self.assertEqual(meta['imageWidth'], '3195') - self.assertEqual(meta['imageHeight'], '3731') - self.assertEqual(meta['creator'], 'TRP') - self.assertEqual(meta['created'], '2017-07-19T13:58:10.738+02:00') - self.assertEqual(meta['lastchange'], '2017-07-19T14:04:22.502+02:00') - # Check surfaces - surfaces = s.list_surfaces() - self.assertEqual(surfaces[0]['id'], 'TextRegion_1500465748446_12') - self.assertEqual(surfaces[0]['index'], 0) - self.assertEqual(surfaces[0]['polygon'], Polygon([(2974, 1270), (16, 1270), (18, 105), (2976, 105)])) - - def test_surface_2_regions(self): - s = SurfaceParser(os.path.join(FIXTURES, '2_regions.xml')) - - # Check metadata - meta = s.get_metadata() - self.assertEqual(meta['imageFilename'], 'FRAN_0021_0316_L.jpg') - self.assertEqual(meta['imageWidth'], '3509') - self.assertEqual(meta['imageHeight'], '4569') - self.assertEqual(meta['creator'], 'TRP') - self.assertEqual(meta['created'], '2018-03-27T13:03:11.218+02:00') - self.assertEqual(meta['lastchange'], '2018-03-27T15:02:50.831+02:00') - # Check surfaces - surfaces = s.list_surfaces() - self.assertEqual(surfaces[0]['id'], 'TextRegion_1522155769847_482') - self.assertEqual(surfaces[0]['index'], 0) - self.assertEqual(surfaces[0]['polygon'], Polygon([(3509, 1675), (0, 1675), (0, 0), (3509, 0)])) - self.assertEqual(surfaces[1]['id'], 'TextRegion_1522155769847_481') - self.assertEqual(surfaces[1]['index'], 1) - self.assertEqual(surfaces[1]['polygon'], Polygon([(3509, 4569), (0, 4569), (0, 1675), (3509, 1675)])) - - def test_surface_unordered(self): - s = SurfaceParser(os.path.join(FIXTURES, 'unordered.xml')) - - # Check metadata - meta = s.get_metadata() - self.assertEqual(meta['imageFilename'], 'FRAN_0021_0316_L.jpg') - self.assertEqual(meta['imageWidth'], '3509') - self.assertEqual(meta['imageHeight'], '4569') - self.assertEqual(meta['creator'], 'TRP') - self.assertEqual(meta['created'], '2018-03-27T13:03:11.218+02:00') - self.assertEqual(meta['lastchange'], '2018-03-27T15:02:50.831+02:00') - # Check surfaces - surfaces = s.list_surfaces() - self.assertEqual(surfaces[0]['id'], 'TextRegion_1522155769847_482') - self.assertEqual(surfaces[0]['index'], 0) - self.assertEqual(surfaces[0]['polygon'], Polygon([(3509, 1675), (0, 1675), (0, 0), (3509, 0)])) - self.assertEqual(surfaces[1]['id'], 'TextRegion_1522155769847_481') - self.assertEqual(surfaces[1]['index'], 1) - self.assertEqual(surfaces[1]['polygon'], Polygon([(3509, 4569), (0, 4569), (0, 1675), (3509, 1675)])) - self.assertEqual(surfaces[2]['id'], "TextRegion_1522155769847_484") - self.assertEqual(surfaces[2]['index'], 2) - self.assertEqual(surfaces[2]['polygon'], Polygon([(3509, 4569), (0, 4569), (0, 1675), (3509, 1675)])) - - def test_surface_no_reading_order(self): - "Test SurfaceParser with no ReadingOrder tag in XML file" - surfaces = SurfaceParser(os.path.join(FIXTURES, 'no_reading_order.xml')).list_surfaces() - self.assertEqual(surfaces[0]['id'], 'TextRegion_1522155769847_482') - self.assertEqual(surfaces[0]['index'], 0) - self.assertEqual(surfaces[0]['polygon'], Polygon([(3509, 1675), (0, 1675), (0, 0), (3509, 0)])) - self.assertEqual(surfaces[1]['id'], 'TextRegion_1522155769847_481') - self.assertEqual(surfaces[1]['index'], 1) - self.assertEqual(surfaces[1]['polygon'], Polygon([(3509, 4569), (0, 4569), (0, 1675), (3509, 1675)])) - self.assertEqual(surfaces[2]['id'], "TextRegion_1522155769847_484") - self.assertEqual(surfaces[2]['index'], 2) - self.assertEqual(surfaces[2]['polygon'], Polygon([(3509, 4569), (0, 4569), (0, 1675), (3509, 1675)])) diff --git a/arkindex/project/api_v1.py b/arkindex/project/api_v1.py index 1dae673dcc46d058272d92cc93d892924d47b2d3..6f34541cfa7c61baa5a3a81d3b20d513c2447a9f 100644 --- a/arkindex/project/api_v1.py +++ b/arkindex/project/api_v1.py @@ -7,7 +7,8 @@ from arkindex.documents.api.elements import ( ElementTranscriptions, ElementsCreate, ) from arkindex.documents.api.search import PageSearch, ActSearch -from arkindex.documents.api.ml import ClassificationBulk, TranscriptionCreate, TranscriptionBulk +from arkindex.documents.api.ml import \ + ClassificationBulk, TranscriptionCreate, TranscriptionBulk, PageXmlTranscriptionsImport from arkindex.documents.api.iiif import ( VolumeManifest, ActManifest, PageAnnotationList, PageActAnnotationList, SurfaceAnnotationList, TranscriptionSearchAnnotationList, @@ -36,6 +37,11 @@ api = [ path('element/<uuid:pk>/history/', ElementHistory.as_view(), name='element-history'), path('element/<uuid:pk>/transcriptions/', ElementTranscriptions.as_view(), name='element-transcriptions'), path('page/<uuid:pk>/', PageDetails.as_view(), name='page-details'), + path( + 'page/<uuid:pk>/transcriptions/xml/', + PageXmlTranscriptionsImport.as_view(), + name='pagexml-transcriptions', + ), path('surface/<uuid:pk>/', SurfaceDetails.as_view(), name='surface-details'), path('corpus/', CorpusList.as_view(), name='corpus'), path('corpus/<uuid:pk>/', CorpusRetrieve.as_view(), name='corpus-retrieve'), diff --git a/arkindex/project/parsers.py b/arkindex/project/parsers.py new file mode 100644 index 0000000000000000000000000000000000000000..d55f8f757977476959dfffbc6a46a5e925ae0292 --- /dev/null +++ b/arkindex/project/parsers.py @@ -0,0 +1,15 @@ +from rest_framework.parsers import BaseParser +from lxml import etree + + +class XMLParser(BaseParser): + """ + A basic XML parser without serializer support + """ + media_type = 'application/xml' + + def parse(self, stream, media_type=None, parser_context=None): + """ + Parse the request body into a lxml Element + """ + return etree.parse(stream).getroot() diff --git a/openapi/patch.py b/openapi/patch.py index 4e711e616eff2b0543106637f6f886436055d103..09593b106951381026a9c39fb925d00a79ef5faa 100755 --- a/openapi/patch.py +++ b/openapi/patch.py @@ -112,7 +112,7 @@ def update_schema(schema, patches): for method, operation in methods.items(): if 'requestBody' not in operation: continue - if not operation['requestBody']['content']['application/json']['schema']: + if not operation['requestBody']['content'].get('application/json', {}).get('schema'): # Ignore empty schemas continue diff --git a/openapi/patch.yml b/openapi/patch.yml index 62a3aee36d959d4b30a96abfda5c825219082fbe..8538e9c5da8313c5d0af9f0427c1e50aca7a2135 100644 --- a/openapi/patch.yml +++ b/openapi/patch.yml @@ -437,6 +437,23 @@ paths: security: [] tags: - elements + /api/v1/page/{id}/transcriptions/xml/: + post: + operationId: ImportPageXmlTranscriptions + description: Import transcriptions into Arkindex from region data in the PAGE XML format. + requestBody: + required: true + description: >- + A PAGE XML document. + TextRegion tags will be imported as Paragraph transcriptions + and TextLine tags will become Line transcriptions. + See https://github.com/PRImA-Research-Lab/PAGE-XML for more info + about the PAGE XML format. + content: + application/xml: + schema: {} + tags: + - ml /api/v1/pages/: get: operationId: SearchPages diff --git a/openapi/requirements.txt b/openapi/requirements.txt index e9cc0800b3edbd01b8e8825c5b9b9c13046741c2..1149dbc6dbb0e6a06c485a49a36c80f4d44381a2 100644 --- a/openapi/requirements.txt +++ b/openapi/requirements.txt @@ -1,3 +1,3 @@ -git+https://github.com/encode/django-rest-framework.git@ac64c0a536b0ae21b81d86c3c2a37bc0c70f932e#egg=djangorestframework +git+https://github.com/encode/django-rest-framework.git@bb0db35680dd85cd33dade83b6cbd1039995b9db#egg=djangorestframework coreapi==2.3.3 apistar>=0.7.2