Skip to content
Snippets Groups Projects
Commit dcaddfe3 authored by Erwan Rouchet's avatar Erwan Rouchet Committed by Bastien Abadie
Browse files

Transkribus PAGE imports

parent 4f7d30f1
No related branches found
No related tags found
No related merge requests found
Showing
with 384 additions and 279 deletions
from django.conf import settings
from django.shortcuts import get_object_or_404
from rest_framework import status
from rest_framework.mixins import CreateModelMixin
from rest_framework.generics import CreateAPIView, UpdateAPIView
from rest_framework.exceptions import ValidationError
from rest_framework.response import Response
from arkindex.documents.models import Classification, DataSource, Transcription, TranscriptionType
from rest_framework.views import APIView
from arkindex.documents.models import \
Classification, DataSource, Transcription, TranscriptionType, Page, Corpus
from arkindex.documents.serializers.ml import \
ClassificationsSerializer, TranscriptionsSerializer, TranscriptionCreateSerializer
from arkindex.documents.indexer import Indexer
from arkindex.documents.pagexml import PageXmlParser
from arkindex.images.models import Zone
from arkindex.images.importer import build_transcriptions, save_transcriptions, index_transcriptions
from arkindex.project.parsers import XMLParser
from arkindex.project.permissions import IsVerified
from arkindex.project.polygon import Polygon
......@@ -160,3 +166,33 @@ class ClassificationBulk(CreateAPIView):
)
for cl in serializer.validated_data['classifications']
])
class PageXmlTranscriptionsImport(CreateModelMixin, APIView):
    """
    Import transcriptions onto a Page from a PAGE XML document
    posted as the request body.
    """
    parser_classes = (XMLParser, )
    permission_classes = (IsVerified, )

    def get_queryset(self):
        """Restrict pages to corpora the user has write access to."""
        writable_corpora = Corpus.objects.writable(self.request.user)
        return Page.objects.filter(corpus__in=writable_corpora)

    def get_object(self):
        """
        Since we are inheriting from APIView, because GenericAPIView would break OpenAPI,
        we have to rewrite get_object ourselves.
        """
        instance = get_object_or_404(self.get_queryset(), pk=self.kwargs['pk'])
        self.check_object_permissions(self.request, instance)
        return instance

    def post(self, request, *args, **kwargs):
        """Parse the posted PAGE XML and save its transcriptions on the page."""
        target_page = self.get_object()
        try:
            pagexml = PageXmlParser(request.data)
        except AssertionError as err:
            raise ValidationError('Could not parse PAGE XML document: {!s}'.format(err))
        pagexml.save_transcriptions(target_page)
        return Response(
            status=status.HTTP_201_CREATED,
            headers=self.get_success_headers(None),
        )
from django.utils.functional import cached_property
from arkindex_common.ml_tool import MLToolType
from arkindex_common.pagexml import PageXmlPage
from arkindex.project.polygon import Polygon
from arkindex.documents.models import DataSource, TranscriptionType, ElementType, Page
import logging
logger = logging.getLogger(__name__)
class PageXmlParser(object):
    """
    Parses a PAGE XML document (e.g. a Transkribus export) and imports its
    text regions into Arkindex as surfaces and transcriptions.
    """

    def __init__(self, path_or_xml):
        # PageXmlPage handles either a file path or already-loaded XML
        self.pagexml_page = PageXmlPage(path_or_xml)

    @cached_property
    def source(self):
        """
        DataSource that imported transcriptions are attached to.
        The revision matches the PAGE XML schema version used by Transkribus.
        """
        ds, _ = DataSource.objects.get_or_create(
            type=MLToolType.Recognizer,
            slug='transkribus',
            revision='2013-07-15',
            internal=False,
        )
        return ds

    def get_zone(self, region, page):
        """
        Fetch or create a Zone on the page's image matching the region polygon.
        """
        poly = Polygon(region.points)
        # filter().first() instead of get_or_create to tolerate duplicate zones
        z = page.zone.image.zones.filter(polygon=poly).first()
        if not z:
            z = page.zone.image.zones.create(polygon=poly)
        return z

    def create_surface(self, region, page):
        """
        Create a Surface element from a text region, attached as a child of `page`.
        Returns (surface, created); (None, False) when the region has no points.
        """
        if region.points is None:
            logger.warning('No points in region {}'.format(region.id))
            return None, False
        surface, created = self.get_zone(region, page).elements.get_or_create(
            type=ElementType.Surface,
            corpus=page.corpus,
            defaults={
                'name': 'Surface {}'.format(region.id),
            },
        )
        surface.add_parent(page)
        return surface, created

    def save_surfaces(self, page):
        """
        Create surfaces for every text region of the page, in reading order.
        Returns (parsed region count, created surface count).
        """
        regions = self.pagexml_page.page.text_regions
        # Guard against text_regions being None, consistently with
        # save_transcriptions; `not len(None)` would raise a TypeError.
        if regions is None or not len(regions):
            logger.warning('No surfaces to save')
            return 0, 0
        region_count, created_count = 0, 0
        for region in self.pagexml_page.page.sort_regions(regions):
            region_count += 1
            _, created = self.create_surface(region, page)
            created_count += created
        return region_count, created_count

    def create_transcription(self, region, page, type=TranscriptionType.Line):
        """
        Create a transcription of the given type from a text region or line.
        Returns (transcription, created); (None, False) when the region is
        missing its text or points.
        """
        if region.text is None:
            logger.warning('No text in region {}'.format(region.id))
            return None, False
        if region.points is None:
            logger.warning('No points in region {}'.format(region.id))
            return None, False
        return self.get_zone(region, page).transcriptions.get_or_create(
            type=type,
            element=page,
            text=region.text,
            source=self.source,
        )

    def save_transcriptions(self, page):
        """
        Import every text region as a Paragraph transcription and each of its
        lines as a Line transcription on the given Arkindex page.
        """
        assert isinstance(page, Page), 'Page should be an Arkindex page'
        if self.pagexml_page.page.text_regions is None or not len(self.pagexml_page.page.text_regions):
            logger.warning('No transcriptions to save')
            return
        region_count, line_count, region_ts_count, line_ts_count = 0, 0, 0, 0
        for region in self.pagexml_page.page.text_regions:
            region_count += 1
            _, created = self.create_transcription(region, page, type=TranscriptionType.Paragraph)
            region_ts_count += created
            for line in region.lines:
                line_count += 1
                _, created = self.create_transcription(line, page)
                line_ts_count += created
        logger.info('Parsed {} regions and {} lines and created {} paragraph and {} line transcriptions'.format(
            region_count,
            line_count,
            region_ts_count,
            line_ts_count,
        ))
from arkindex.project.polygon import Polygon
from arkindex.documents.models import Corpus, ElementType, Element, Page
from arkindex.documents.pagexml import PageXmlParser
from arkindex.documents.importer import parse_folio
from arkindex.images.models import Zone
from pathlib import Path
import xml.etree.ElementTree as ElementTree
import re
import logging
logger = logging.getLogger(__name__)
# PAGE XML namespace (2013-07-15 schema, as produced by Transkribus exports)
NS_PAGE = 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'
# Namespace map for ElementTree find()/findall() lookups
NS = {'page': NS_PAGE}
# Clark-notation prefix ('{namespace}') to strip from qualified tag names
NS_DELETE = "{%s}" % NS_PAGE
# e.g. 'JJ42B' -> groups ('42', 'B')
REGEX_VOLUME_NAME = re.compile(r'(?:JJ)?([0-9]+)([A-Z]?)')
# e.g. '12 bis' -> groups ('12', 'bis')
REGEX_ACT_NUMBER = re.compile(r'([0-9]+)\s*([a-z]*)')
# e.g. 'fol. 3 bis r' -> groups ('3', 'bis', 'r')
REGEX_FOLIO = re.compile(r'(?:fol\.?\s*)?([0-9]+)\s*(bis)?\s*(r|v)?', re.IGNORECASE)
# Matches runs of characters between '-', '_' or 'à' separators (folio ranges)
REGEX_FOLIO_GROUP = re.compile(r'([^-_à]+)', re.IGNORECASE)
class SurfaceParser(object):
    """Extracts surface polygons and document metadata from a PAGE XML file."""

    def __init__(self, path):
        self.path = str(path)
        self.tree = ElementTree.parse(self.path)
        # A Page element is mandatory in PAGE XML
        self.page = self.tree.find('page:Page', NS)
        assert self.page is not None, 'Missing page'

    def get_metadata(self):
        """Return a dict merging <Metadata> children with <Page> attributes."""
        metadata = self.tree.find('page:Metadata', NS)
        assert metadata is not None, 'Missing metadata'
        # Strip the namespace prefix and lowercase each child tag name
        result = {}
        for child in metadata:
            result[child.tag.replace(NS_DELETE, "").lower()] = child.text
        # Page attributes (image filename, dimensions) are merged on top
        result.update(self.page.attrib)
        return result

    def list_surfaces(self):
        """
        Return the page's text regions as {'index', 'id', 'polygon'} dicts,
        sorted by reading-order index (document order when no ReadingOrder).
        """
        indexes = None
        reading_order = self.page.find('page:ReadingOrder', NS)
        if reading_order is not None:
            group = reading_order.find('page:OrderedGroup', NS)
            assert group is not None, 'Missing ordered group'
            # Map each region ID to its declared reading-order index
            indexes = {
                ref.get('regionRef'): int(ref.get('index'))
                for ref in group.findall('page:RegionRefIndexed', NS)
            }
        surfaces = []
        for position, region in enumerate(self.page.findall('page:TextRegion', NS)):
            coords = region.find('page:Coords', NS)
            assert coords is not None, 'Missing coords'
            points = Polygon([
                (int(x), int(y))
                for x, y in re.findall(r'(\d+),(\d+)', coords.get('points'))
            ])
            region_id = region.get('id')
            if indexes is not None:
                # A ReadingOrder exists: every region must appear in it
                order_index = indexes.get(region_id)
                assert order_index is not None, 'Missing index in region_indexes'
            else:
                # No ReadingOrder: fall back to document order
                order_index = position
            surfaces.append({
                'index': order_index,
                'id': region_id,
                'polygon': points,
            })
        return sorted(surfaces, key=lambda surface: surface['index'])
class SurfaceImporter(object):
def __init__(self, xmlpath=None, basepath=None, corpus=None, **kwargs):
......@@ -167,36 +94,15 @@ class SurfaceImporter(object):
for page, path in self.get_xml_pages():
xml_count += 1
s = SurfaceParser(self.basepath / path)
image = page.zone.image
parser = PageXmlParser(self.basepath / path)
if dry: # Handle dry run
logger.info("{}\t{}".format(path, image.path))
logger.info("{}\t{}".format(path, page.zone.image.path))
continue
for surfacedata in s.list_surfaces():
poly = surfacedata['polygon']
try:
z, _ = image.zones.get_or_create(polygon=poly)
except ValueError as e:
logger.warning(
"Could not import zone with polygon '{}' on image '{}' from file '{}': {}".format(
poly, image.id, path, e,
)
)
except Zone.MultipleObjectsReturned:
logger.warning('Multiple zones found, picking the first one')
z = image.zones.filter(polygon=poly).first()
surface, created = Element.objects.get_or_create(
type=ElementType.Surface,
zone=z,
corpus=page.corpus,
defaults={'name': "Surface {}".format(surfacedata['id'])},
)
surface.add_parent(page)
surfaces_count += 1
created_surfaces_count += created
region_count, created_count = parser.save_surfaces(page)
surfaces_count += region_count
created_surfaces_count += created_count
logger.info("Parsed {} and created {} surfaces from {} XML files".format(
surfaces_count, created_surfaces_count, xml_count,
......
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd">
<Metadata>
<Creator>TRP</Creator>
<Created>2018-10-01T09:25:16.139-04:00</Created>
<LastChange>2019-02-19T19:45:19.222+01:00</LastChange>
</Metadata>
<Page imageFilename="01R_CE101S01_1907_005.tif" imageWidth="2415" imageHeight="3936">
<ReadingOrder>
<OrderedGroup id="ro_1550601919253" caption="Regions reading order">
<RegionRefIndexed index="0" regionRef="TextRegion_1540299380975_9"/>
<RegionRefIndexed index="1" regionRef="TextRegion_1540299473514_23"/>
</OrderedGroup>
</ReadingOrder>
<Relations>
<Relation type="link">
<RegionRef regionRef="TextRegion_1540299380975_9"/>
<RegionRef regionRef="TextRegion_1540299473514_23"/>
</Relation>
</Relations>
<TextRegion orientation="0.0" type="marginalia" id="TextRegion_1540299380975_9" custom="readingOrder {index:0;} structure {type:marginalia;}">
<Coords points="12,34 56,78 910,1112"/>
<TextLine id="r1l6" custom="readingOrder {index:0;} structure {type:ref;}">
<Coords points="12,34 56,78 910,1112"/>
<Baseline points="13,37 42,42 37,13"/>
<TextEquiv>
<Unicode>B .1</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="r1l7" custom="readingOrder {index:1;} _prenom {offset:0; length:12; continued:true;_role:sujet;}">
<Coords points="12,34 56,78 910,1112"/>
<Baseline points="13,37 42,42 37,13"/>
<TextEquiv>
<Unicode>Louis Joseph</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="r1l8" custom="readingOrder {index:2;} _prenom {offset:0; length:13; continued:true;_role:sujet;}">
<Coords points="12,34 56,78 910,1112"/>
<Baseline points="13,37 42,42 37,13"/>
<TextEquiv>
<Unicode>Pierre Siméon</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="r1l9" custom="readingOrder {index:3;} _nom {offset:0; length:7;}">
<Coords points="12,34 56,78 910,1112"/>
<Baseline points="13,37 42,42 37,13"/>
<TextEquiv>
<Unicode>Lemieux</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>B .1
Louis Joseph
Pierre Siméon
Lemieux</Unicode>
</TextEquiv>
</TextRegion>
<TextRegion orientation="0.0" id="TextRegion_1540299473514_23" custom="readingOrder {index:1;}">
<Coords points="12,34 56,78 910,1112"/>
<TextLine id="r2l12" custom="readingOrder {index:0;} _date {offset:3; length:30;_enregistrement:1;}">
<Coords points="12,34 56,78 910,1112"/>
<Baseline points="13,37 42,42 37,13"/>
<TextEquiv>
<Unicode>Le onze janvier mil neuf centsept</Unicode>
</TextEquiv>
</TextLine>
<TextLine id="r2l13" custom="readingOrder {index:1;} _prenom {offset:36; length:5; continued:true;_role:sujet;}">
<Coords points="12,34 56,78 910,1112"/>
<Baseline points="13,37 42,42 37,13"/>
<TextEquiv>
<Unicode>nous prêtre soussigné avons baptisé Louis</Unicode>
</TextEquiv>
</TextLine>
<TextEquiv>
<Unicode>Le onze janvier mil neuf centsept
nous prêtre soussigné avons baptisé Louis</Unicode>
</TextEquiv>
</TextRegion>
</Page>
</PcGts>
../../surface_samples/simple.xml
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd">
<Metadata>
<Creator>TRP</Creator>
<Created>2017-07-19T13:58:10.738+02:00</Created>
<LastChange>2017-07-19T14:04:22.502+02:00</LastChange>
</Metadata>
<Page imageFilename="FRAN_0021_0023_L.jpg" imageWidth="3195" imageHeight="3731">
<ReadingOrder>
<OrderedGroup id="ro_1500465862580" caption="Regions reading order">
<RegionRefIndexed index="0" regionRef="TextRegion_1500465748446_12"/>
</OrderedGroup>
</ReadingOrder>
<TextRegion id="TextRegion_1500465748446_12" custom="readingOrder {index:0;}">
<Coords points="2974,1270 16,1270 18,105 2976,105"/>
</TextRegion>
</Page>
</PcGts>
../../surface_samples/2_regions.xml
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd">
<Metadata>
<Creator>TRP</Creator>
<Created>2018-03-27T13:03:11.218+02:00</Created>
<LastChange>2018-03-27T15:02:50.831+02:00</LastChange>
</Metadata>
<Page imageFilename="FRAN_0021_0316_L.jpg" imageWidth="3509" imageHeight="4569">
<ReadingOrder>
<OrderedGroup id="ro_1522155770972" caption="Regions reading order">
<RegionRefIndexed index="0" regionRef="TextRegion_1522155769847_482"/>
<RegionRefIndexed index="1" regionRef="TextRegion_1522155769847_481"/>
</OrderedGroup>
</ReadingOrder>
<TextRegion id="TextRegion_1522155769847_482" custom="readingOrder {index:0;}">
<Coords points="3509,1675 0,1675 0,0 3509,0"/>
</TextRegion>
<TextRegion id="TextRegion_1522155769847_481" custom="readingOrder {index:1;}">
<Coords points="3509,4569 0,4569 0,1675 3509,1675"/>
</TextRegion>
</Page>
</PcGts>
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd">
<Metadata>
<Creator>TRP</Creator>
<Created>2018-03-27T13:03:11.218+02:00</Created>
<LastChange>2018-03-27T15:02:50.831+02:00</LastChange>
</Metadata>
<Page imageFilename="FRAN_0021_0316_L.jpg" imageWidth="3509" imageHeight="4569">
<ReadingOrder>
<OrderedGroup id="ro_1522155770972" caption="Regions reading order">
<RegionRefIndexed index="0" regionRef="TextRegion_1522155769847_482"/>
<RegionRefIndexed index="1" regionRef="TextRegion_1522155769847_481"/>
</OrderedGroup>
</ReadingOrder>
<TextRegion id="TextRegion_1522155769847_482" custom="readingOrder {index:0;}">
<Coords points="3509,1675 0,1675 0,0 3509,0"/>
</TextRegion>
<TextRegion id="TextRegion_1522155769847_481" custom="readingOrder {index:1;}">
<Coords points="3509,4569 0,4569 0,1675 3509,1675"/>
</TextRegion>
</Page>
</PcGts>
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd">
<Metadata>
<Creator>TRP</Creator>
<Created>2018-03-27T13:03:11.218+02:00</Created>
<LastChange>2018-03-27T15:02:50.831+02:00</LastChange>
</Metadata>
<Page imageFilename="FRAN_0021_0316_L.jpg" imageWidth="3509" imageHeight="4569">
<TextRegion id="TextRegion_1522155769847_482" custom="readingOrder {index:0;}">
<Coords points="3509,1675 0,1675 0,0 3509,0"/>
</TextRegion>
<TextRegion id="TextRegion_1522155769847_481" custom="readingOrder {index:1;}">
<Coords points="3509,4569 0,4569 0,1675 3509,1675"/>
</TextRegion>
<TextRegion id="TextRegion_1522155769847_484" custom="readingOrder {index:1;}">
<Coords points="3509,4569 0,4569 0,1675 3509,1675"/>
</TextRegion>
</Page>
</PcGts>
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd">
<Metadata>
<Creator>TRP</Creator>
<Created>2017-07-19T13:58:10.738+02:00</Created>
<LastChange>2017-07-19T14:04:22.502+02:00</LastChange>
</Metadata>
<Page imageFilename="FRAN_0021_0023_L.jpg" imageWidth="3195" imageHeight="3731">
<ReadingOrder>
<OrderedGroup id="ro_1500465862580" caption="Regions reading order">
<RegionRefIndexed index="0" regionRef="TextRegion_1500465748446_12"/>
</OrderedGroup>
</ReadingOrder>
<TextRegion id="TextRegion_1500465748446_12" custom="readingOrder {index:0;}">
<Coords points="2974,1270 16,1270 18,105 2976,105"/>
</TextRegion>
</Page>
</PcGts>
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<PcGts xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15/pagecontent.xsd">
<Metadata>
<Creator>TRP</Creator>
<Created>2018-03-27T13:03:11.218+02:00</Created>
<LastChange>2018-03-27T15:02:50.831+02:00</LastChange>
</Metadata>
<Page imageFilename="FRAN_0021_0316_L.jpg" imageWidth="3509" imageHeight="4569">
<ReadingOrder>
<OrderedGroup id="ro_1522155770972" caption="Regions reading order">
<RegionRefIndexed index="1" regionRef="TextRegion_1522155769847_481"/>
<RegionRefIndexed index="0" regionRef="TextRegion_1522155769847_482"/>
<RegionRefIndexed index="2" regionRef="TextRegion_1522155769847_484"/>
</OrderedGroup>
</ReadingOrder>
<TextRegion id="TextRegion_1522155769847_482" custom="readingOrder {index:0;}">
<Coords points="3509,1675 0,1675 0,0 3509,0"/>
</TextRegion>
<TextRegion id="TextRegion_1522155769847_481" custom="readingOrder {index:1;}">
<Coords points="3509,4569 0,4569 0,1675 3509,1675"/>
</TextRegion>
<TextRegion id="TextRegion_1522155769847_484" custom="readingOrder {index:1;}">
<Coords points="3509,4569 0,4569 0,1675 3509,1675"/>
</TextRegion>
</Page>
</PcGts>
from pathlib import Path
from django.urls import reverse
from rest_framework import status
from arkindex.project.tests import FixtureAPITestCase
from arkindex.documents.models import Page, TranscriptionType
FIXTURES = Path(__file__).absolute().parent / 'pagexml_samples'
class TestPageXml(FixtureAPITestCase):
    """API tests for the PAGE XML transcriptions import endpoint."""

    @classmethod
    def setUpTestData(cls):
        super().setUpTestData()
        cls.page = Page.objects.get(corpus=cls.corpus, name='Volume 2, page 1r')

    def _post_transcript(self):
        """POST the transcript.xml fixture to the import endpoint."""
        with (FIXTURES / 'transcript.xml').open() as f:
            return self.client.post(
                reverse('api:pagexml-transcriptions', kwargs={'pk': str(self.page.id)}),
                data=f.read(),
                content_type='application/xml',
            )

    def test_pagexml_import_requires_login(self):
        response = self._post_transcript()
        self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)

    def test_pagexml_import(self):
        self.assertFalse(self.page.transcriptions.exists())
        self.client.force_login(self.user)
        response = self._post_transcript()
        self.assertEqual(response.status_code, status.HTTP_201_CREATED)
        self.maxDiff = None
        self.assertCountEqual(self.page.transcriptions.values_list('type', 'text'), [
            (TranscriptionType.Paragraph, 'B .1\nLouis Joseph\nPierre Siméon\nLemieux'),
            (TranscriptionType.Paragraph, 'Le onze janvier mil neuf centsept\n'
                                          'nous prêtre soussigné avons baptisé Louis'),
            (TranscriptionType.Line, 'B .1'),
            (TranscriptionType.Line, 'Louis Joseph'),
            (TranscriptionType.Line, 'Pierre Siméon'),
            (TranscriptionType.Line, 'Lemieux'),
            (TranscriptionType.Line, 'Le onze janvier mil neuf centsept'),
            (TranscriptionType.Line, 'nous prêtre soussigné avons baptisé Louis'),
        ])
from unittest import TestCase
from arkindex.documents.surface import SurfaceParser
from arkindex.project.polygon import Polygon
import os.path
# Directory holding the PAGE XML sample files used by these tests
FIXTURES = os.path.join(
    os.path.dirname(os.path.realpath(__file__)),
    'surface_samples',
)
class TestSurfaceParser(TestCase):
    """
    Tests for SurfaceParser against the XML samples in surface_samples/.
    The repeated assertion blocks are factored into private helpers so each
    test reads as data rather than boilerplate.
    """

    def _assert_metadata(self, meta, filename, width, height, created, lastchange):
        # Common metadata checks: image attributes plus TRP creator fields
        self.assertEqual(meta['imageFilename'], filename)
        self.assertEqual(meta['imageWidth'], width)
        self.assertEqual(meta['imageHeight'], height)
        self.assertEqual(meta['creator'], 'TRP')
        self.assertEqual(meta['created'], created)
        self.assertEqual(meta['lastchange'], lastchange)

    def _assert_surface(self, surface, surface_id, index, polygon):
        # Checks one surface dict returned by SurfaceParser.list_surfaces
        self.assertEqual(surface['id'], surface_id)
        self.assertEqual(surface['index'], index)
        self.assertEqual(surface['polygon'], polygon)

    def test_surface_simple(self):
        s = SurfaceParser(os.path.join(FIXTURES, 'simple.xml'))
        self._assert_metadata(
            s.get_metadata(),
            'FRAN_0021_0023_L.jpg', '3195', '3731',
            '2017-07-19T13:58:10.738+02:00', '2017-07-19T14:04:22.502+02:00',
        )
        surfaces = s.list_surfaces()
        self._assert_surface(
            surfaces[0], 'TextRegion_1500465748446_12', 0,
            Polygon([(2974, 1270), (16, 1270), (18, 105), (2976, 105)]),
        )

    def test_surface_2_regions(self):
        s = SurfaceParser(os.path.join(FIXTURES, '2_regions.xml'))
        self._assert_metadata(
            s.get_metadata(),
            'FRAN_0021_0316_L.jpg', '3509', '4569',
            '2018-03-27T13:03:11.218+02:00', '2018-03-27T15:02:50.831+02:00',
        )
        surfaces = s.list_surfaces()
        self._assert_surface(
            surfaces[0], 'TextRegion_1522155769847_482', 0,
            Polygon([(3509, 1675), (0, 1675), (0, 0), (3509, 0)]),
        )
        self._assert_surface(
            surfaces[1], 'TextRegion_1522155769847_481', 1,
            Polygon([(3509, 4569), (0, 4569), (0, 1675), (3509, 1675)]),
        )

    def test_surface_unordered(self):
        # Regions listed out of order in the XML must come back sorted by index
        s = SurfaceParser(os.path.join(FIXTURES, 'unordered.xml'))
        self._assert_metadata(
            s.get_metadata(),
            'FRAN_0021_0316_L.jpg', '3509', '4569',
            '2018-03-27T13:03:11.218+02:00', '2018-03-27T15:02:50.831+02:00',
        )
        surfaces = s.list_surfaces()
        self._assert_surface(
            surfaces[0], 'TextRegion_1522155769847_482', 0,
            Polygon([(3509, 1675), (0, 1675), (0, 0), (3509, 0)]),
        )
        self._assert_surface(
            surfaces[1], 'TextRegion_1522155769847_481', 1,
            Polygon([(3509, 4569), (0, 4569), (0, 1675), (3509, 1675)]),
        )
        self._assert_surface(
            surfaces[2], 'TextRegion_1522155769847_484', 2,
            Polygon([(3509, 4569), (0, 4569), (0, 1675), (3509, 1675)]),
        )

    def test_surface_no_reading_order(self):
        "Test SurfaceParser with no ReadingOrder tag in XML file"
        surfaces = SurfaceParser(os.path.join(FIXTURES, 'no_reading_order.xml')).list_surfaces()
        self._assert_surface(
            surfaces[0], 'TextRegion_1522155769847_482', 0,
            Polygon([(3509, 1675), (0, 1675), (0, 0), (3509, 0)]),
        )
        self._assert_surface(
            surfaces[1], 'TextRegion_1522155769847_481', 1,
            Polygon([(3509, 4569), (0, 4569), (0, 1675), (3509, 1675)]),
        )
        self._assert_surface(
            surfaces[2], 'TextRegion_1522155769847_484', 2,
            Polygon([(3509, 4569), (0, 4569), (0, 1675), (3509, 1675)]),
        )
......@@ -7,7 +7,8 @@ from arkindex.documents.api.elements import (
ElementTranscriptions, ElementsCreate,
)
from arkindex.documents.api.search import PageSearch, ActSearch
from arkindex.documents.api.ml import ClassificationBulk, TranscriptionCreate, TranscriptionBulk
from arkindex.documents.api.ml import \
ClassificationBulk, TranscriptionCreate, TranscriptionBulk, PageXmlTranscriptionsImport
from arkindex.documents.api.iiif import (
VolumeManifest, ActManifest, PageAnnotationList, PageActAnnotationList, SurfaceAnnotationList,
TranscriptionSearchAnnotationList,
......@@ -36,6 +37,11 @@ api = [
path('element/<uuid:pk>/history/', ElementHistory.as_view(), name='element-history'),
path('element/<uuid:pk>/transcriptions/', ElementTranscriptions.as_view(), name='element-transcriptions'),
path('page/<uuid:pk>/', PageDetails.as_view(), name='page-details'),
path(
'page/<uuid:pk>/transcriptions/xml/',
PageXmlTranscriptionsImport.as_view(),
name='pagexml-transcriptions',
),
path('surface/<uuid:pk>/', SurfaceDetails.as_view(), name='surface-details'),
path('corpus/', CorpusList.as_view(), name='corpus'),
path('corpus/<uuid:pk>/', CorpusRetrieve.as_view(), name='corpus-retrieve'),
......
from lxml import etree
from rest_framework.exceptions import ParseError
from rest_framework.parsers import BaseParser
class XMLParser(BaseParser):
    """
    A basic XML parser without serializer support.

    Parses an application/xml request body into an lxml Element so that
    views can process the XML tree themselves.
    """
    media_type = 'application/xml'

    def parse(self, stream, media_type=None, parser_context=None):
        """
        Parse the request body into a lxml Element.

        Raises a DRF ParseError (HTTP 400) on malformed XML instead of
        letting lxml's XMLSyntaxError bubble up as an HTTP 500.
        """
        try:
            return etree.parse(stream).getroot()
        except etree.XMLSyntaxError as e:
            raise ParseError('XML parse error: {!s}'.format(e))
......@@ -112,7 +112,7 @@ def update_schema(schema, patches):
for method, operation in methods.items():
if 'requestBody' not in operation:
continue
if not operation['requestBody']['content']['application/json']['schema']:
if not operation['requestBody']['content'].get('application/json', {}).get('schema'):
# Ignore empty schemas
continue
......
......@@ -437,6 +437,23 @@ paths:
security: []
tags:
- elements
/api/v1/page/{id}/transcriptions/xml/:
post:
operationId: ImportPageXmlTranscriptions
description: Import transcriptions into Arkindex from region data in the PAGE XML format.
requestBody:
required: true
description: >-
A PAGE XML document.
TextRegion tags will be imported as Paragraph transcriptions
and TextLine tags will become Line transcriptions.
See https://github.com/PRImA-Research-Lab/PAGE-XML for more info
about the PAGE XML format.
content:
application/xml:
schema: {}
tags:
- ml
/api/v1/pages/:
get:
operationId: SearchPages
......
git+https://github.com/encode/django-rest-framework.git@ac64c0a536b0ae21b81d86c3c2a37bc0c70f932e#egg=djangorestframework
git+https://github.com/encode/django-rest-framework.git@bb0db35680dd85cd33dade83b6cbd1039995b9db#egg=djangorestframework
coreapi==2.3.3
apistar>=0.7.2
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment