From 447a22bbaf4b39fe15391f484be37c11db6409b1 Mon Sep 17 00:00:00 2001
From: manon blanco <blanco@teklia.com>
Date: Fri, 19 Mar 2021 10:35:41 +0000
Subject: [PATCH] Convert HTML metadatas as markdown

---
 .isort.cfg                                    |  2 +-
 .../0030_convert_html_metadata_as_markdown.py | 33 +++++++
 arkindex/documents/models.py                  |  2 +-
 .../documents/serializers/iiif/manifests.py   |  2 +-
 arkindex/documents/serializers/light.py       | 15 +++
 arkindex/documents/tests/test_manifest.py     |  2 +-
 arkindex/documents/tests/test_metadata.py     | 97 ++++++++++++++++++-
 requirements.txt                              |  1 +
 8 files changed, 149 insertions(+), 5 deletions(-)
 create mode 100644 arkindex/documents/migrations/0030_convert_html_metadata_as_markdown.py

diff --git a/.isort.cfg b/.isort.cfg
index 6bae546cc0..6ed26f7127 100644
--- a/.isort.cfg
+++ b/.isort.cfg
@@ -8,4 +8,4 @@ line_length = 120
 
 default_section=FIRSTPARTY
 known_first_party = ponos,transkribus
-known_third_party = boto3,botocore,corsheaders,django,django_admin_hstore_widget,django_rq,drf_spectacular,elasticsearch,elasticsearch_dsl,enumfields,gitlab,psycopg2,requests,responses,rest_framework,rq,setuptools,sqlparse,teklia_toolbox,tenacity,tripoli,yaml
+known_third_party = bleach,boto3,botocore,corsheaders,django,django_admin_hstore_widget,django_rq,drf_spectacular,elasticsearch,elasticsearch_dsl,enumfields,gitlab,psycopg2,requests,responses,rest_framework,rq,setuptools,sqlparse,teklia_toolbox,tenacity,tripoli,yaml
diff --git a/arkindex/documents/migrations/0030_convert_html_metadata_as_markdown.py b/arkindex/documents/migrations/0030_convert_html_metadata_as_markdown.py
new file mode 100644
index 0000000000..87d49a4fe7
--- /dev/null
+++ b/arkindex/documents/migrations/0030_convert_html_metadata_as_markdown.py
@@ -0,0 +1,48 @@
+# Generated by Django 3.1.7 on 2021-03-17 08:26
+from django.db import migrations
+
+
+def _replace_type(apps, schema_editor, old, new):
+    """
+    Rewrite the raw `type` column from `old` to `new` on both metadata tables.
+
+    Plain SQL with literal values is used on purpose: 'html' is not a MetaType
+    member anymore, so the reverse operation cannot reference `MetaType.HTML`,
+    and a migration should not depend on the current state of the application enum.
+    """
+    quote = schema_editor.connection.ops.quote_name
+    with schema_editor.connection.cursor() as cursor:
+        for model_name in ('MetaData', 'AllowedMetaData'):
+            opts = apps.get_model('documents', model_name)._meta
+            cursor.execute(
+                'UPDATE {table} SET {column} = %s WHERE {column} = %s'.format(
+                    table=quote(opts.db_table),
+                    column=quote(opts.get_field('type').column),
+                ),
+                [new, old],
+            )
+
+
+def convert_html_metadata_to_markdown(apps, schema_editor):
+    """Forward operation: turn every HTML metadata into a Markdown one."""
+    _replace_type(apps, schema_editor, 'html', 'markdown')
+
+
+def convert_markdown_metadata_to_html(apps, schema_editor):
+    """Reverse operation: turn every Markdown metadata back into an HTML one."""
+    _replace_type(apps, schema_editor, 'markdown', 'html')
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('documents', '0029_corpus_top_level_type'),
+    ]
+
+    operations = [
+        migrations.RunPython(
+            convert_html_metadata_to_markdown,
+            reverse_code=convert_markdown_metadata_to_html,
+            elidable=True,
+        ),
+    ]
diff --git a/arkindex/documents/models.py b/arkindex/documents/models.py
index 4cd6bed613..3fa88944d7 100644
--- a/arkindex/documents/models.py
+++ b/arkindex/documents/models.py
@@ -515,7 +515,7 @@ class Classification(models.Model):
 
 class MetaType(Enum):
     Text = 'text'
-    HTML = 'html'
+    Markdown = 'markdown'
     Date = 'date'
     Location = 'location'
     # Element's original structure reference (intended to be indexed)
diff --git a/arkindex/documents/serializers/iiif/manifests.py b/arkindex/documents/serializers/iiif/manifests.py
index 7ab2a1aa48..05a2d33a4f 100644
--- a/arkindex/documents/serializers/iiif/manifests.py
+++ b/arkindex/documents/serializers/iiif/manifests.py
@@ -164,7 +164,7 @@ class FolderManifestSerializer(serializers.Serializer):
             "viewingDirection": "left-to-right",
             "service": services,
             "metadata": ManifestMetadataSerializer(
-                element.metadatas.exclude(type=MetaType.HTML),
+                element.metadatas.exclude(type=MetaType.Markdown),
                 context=self.context,
                 many=True,
             ).data,
diff --git a/arkindex/documents/serializers/light.py b/arkindex/documents/serializers/light.py
index cc1062f9f8..49ec882319 100644
--- a/arkindex/documents/serializers/light.py
+++ b/arkindex/documents/serializers/light.py
@@ -1,3 +1,4 @@
+import bleach
 from django.db.models import Max
 from rest_framework import serializers
 from rest_framework.exceptions import APIException, ValidationError
@@ -101,6 +102,20 @@ class MetaDataLightSerializer(serializers.ModelSerializer):
     revision = RevisionSerializer(read_only=True)
     dates = InterpretedDateSerializer(many=True, source='get_dates', read_only=True)
 
+    def to_representation(self, instance):
+        """Serialize the metadata with its value sanitized, without mutating the model instance."""
+        # Only Markdown metadata may embed this HTML subset; other types get every tag escaped
+        tags = [
+            'a', 'b', 'blockquote', 'body', 'br', 'div', 'em', 'h1', 'h2', 'h3', 'html', 'i', 'iframe',
+            'img', 'li', 'marquee', 'ol', 'p', 'sup', 'table', 'tbody', 'td', 'th', 'thead', 'tr', 'ul',
+        ] if instance.type == MetaType.Markdown else []
+        data = super().to_representation(instance)
+        # Clean the serialized copy only: writing the escaped text back on the instance
+        # would let a later save() persist it and hide the raw value from other fields
+        if data.get('value') is not None:
+            data['value'] = bleach.clean(data['value'], tags=tags)
+        return data
+
     def reindex_element(self, elt):
         reindex_start(element=elt, elements=True)
 
diff --git a/arkindex/documents/tests/test_manifest.py b/arkindex/documents/tests/test_manifest.py
index 701559d37b..44fab622b3 100644
--- a/arkindex/documents/tests/test_manifest.py
+++ b/arkindex/documents/tests/test_manifest.py
@@ -19,7 +19,7 @@ class TestFolderManifestSerializer(FixtureAPITestCase):
         cls.vol.metadatas.create(name='test 1', type=MetaType.Text, value='Blah')
         cls.vol.metadatas.create(name='test 2', type=MetaType.Date, value='1337-01-01')
         cls.vol.metadatas.create(name='test 3', type=MetaType.Location, value='Somewhere')
-        cls.vol.metadatas.create(name='test 4', type=MetaType.HTML, value='<p>oh no</p>')
+        cls.vol.metadatas.create(name='test 4', type=MetaType.Markdown, value='<p>oh no</p>')
         cls.page = Element.objects.get(name='Volume 1, page 1r')
 
     def setUp(self):
diff --git a/arkindex/documents/tests/test_metadata.py b/arkindex/documents/tests/test_metadata.py
index 77c7efc9da..a626322516 100644
--- a/arkindex/documents/tests/test_metadata.py
+++ b/arkindex/documents/tests/test_metadata.py
@@ -35,7 +35,7 @@ class TestMetaData(FixtureAPITestCase):
     def setUp(self):
         super().setUp()
         self.metadata = self.vol.metadatas.create(type=MetaType.Text, name='folio', value='123')
-        self.private_metadata = self.private_vol.metadatas.create(type=MetaType.HTML, name='leet', value='1337')
+        self.private_metadata = self.private_vol.metadatas.create(type=MetaType.Markdown, name='leet', value='1337')
 
     def test_metadata_forbidden_methods(self):
         """
@@ -563,3 +563,98 @@ class TestMetaData(FixtureAPITestCase):
         self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST)
         metadata.refresh_from_db()
         self.assertIsNone(metadata.entity)
+
+    def test_render_markdown_metadata(self):
+        """A Markdown value without any HTML tag is returned unchanged by the API."""
+        markdown = '# Title\n## Subtitle\nbla'
+        metadata = self.vol.metadatas.create(type=MetaType.Markdown, name='Some text', value=markdown)
+        self.client.force_login(self.user)
+        url = reverse('api:metadata-edit', kwargs={'pk': str(metadata.id)})
+        response = self.client.get(url)
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.assertDictEqual(response.json(), {
+            'id': str(metadata.id),
+            'type': 'markdown',
+            'name': 'Some text',
+            'value': markdown,
+            'entity': None,
+            'dates': [],
+            'worker_version': None,
+        })
+
+    def test_render_html_metadata(self):
+        """An <h1> tag stored in a Markdown metadata is returned unescaped by the API."""
+        html = '<h1>Title</h1>'
+        metadata = self.vol.metadatas.create(type=MetaType.Markdown, name='Some text', value=html)
+        self.client.force_login(self.user)
+        url = reverse('api:metadata-edit', kwargs={'pk': str(metadata.id)})
+        response = self.client.get(url)
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.assertDictEqual(response.json(), {
+            'id': str(metadata.id),
+            'type': 'markdown',
+            'name': 'Some text',
+            'value': html,
+            'entity': None,
+            'dates': [],
+            'worker_version': None,
+        })
+
+    def test_unrender_html_metadata(self):
+        """A <style> tag stored in a Markdown metadata is returned escaped by the API."""
+        raw_value = '<style type="text/css">* { display: none !important; }</style>'
+        metadata = self.vol.metadatas.create(type=MetaType.Markdown, name='Some text', value=raw_value)
+        self.client.force_login(self.user)
+        url = reverse('api:metadata-edit', kwargs={'pk': str(metadata.id)})
+        response = self.client.get(url)
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.assertDictEqual(response.json(), {
+            'id': str(metadata.id),
+            'type': 'markdown',
+            'name': 'Some text',
+            'value': '&lt;style type="text/css"&gt;* { display: none !important; }&lt;/style&gt;',
+            'entity': None,
+            'dates': [],
+            'worker_version': None,
+        })
+
+    def test_create_markdown_metadata(self):
+        """A Markdown metadata created through the API is stored and returned as sent."""
+        markdown = '# Title\n## Subtitle\nbla'
+        AllowedMetaData.objects.create(corpus=self.corpus, type=MetaType.Markdown, name='text')
+        self.client.force_login(self.user)
+        create_url = reverse('api:element-metadata', kwargs={'pk': str(self.vol.id)})
+        created = self.client.post(create_url, data={'type': 'markdown', 'name': 'text', 'value': markdown})
+        self.assertEqual(created.status_code, status.HTTP_201_CREATED)
+        md = self.vol.metadatas.get(type=MetaType.Markdown, name='text')
+        self.assertEqual(md.value, markdown)
+        fetched = self.client.get(reverse('api:metadata-edit', kwargs={'pk': str(md.id)}))
+        self.assertEqual(fetched.status_code, status.HTTP_200_OK)
+        self.assertEqual(fetched.json()['value'], markdown)
+
+    def test_create_html_metadata(self):
+        """A <style> tag is stored as sent in the database but escaped when read through the API."""
+        raw_value = '<style type="text/css">* { display: none !important; }</style>'
+        AllowedMetaData.objects.create(corpus=self.corpus, type=MetaType.Markdown, name='text')
+        self.client.force_login(self.user)
+        create_url = reverse('api:element-metadata', kwargs={'pk': str(self.vol.id)})
+        created = self.client.post(create_url, data={'type': 'markdown', 'name': 'text', 'value': raw_value})
+        self.assertEqual(created.status_code, status.HTTP_201_CREATED)
+        md = self.vol.metadatas.get(type=MetaType.Markdown, name='text')
+        self.assertEqual(md.value, raw_value)
+        fetched = self.client.get(reverse('api:metadata-edit', kwargs={'pk': str(md.id)}))
+        self.assertEqual(fetched.status_code, status.HTTP_200_OK)
+        self.assertEqual(fetched.json()['value'], '&lt;style type="text/css"&gt;* { display: none !important; }&lt;/style&gt;')
+
+    def test_create_html_metadata_with_other_type(self):
+        """An <a> tag in a date metadata is stored as sent but escaped when read through the API."""
+        raw_value = '<a href="lalaland">oops</a>'
+        self.client.force_login(self.user)
+        create_url = reverse('api:element-metadata', kwargs={'pk': str(self.vol.id)})
+        created = self.client.post(create_url, data={'type': 'date', 'name': 'edition', 'value': raw_value})
+        self.assertEqual(created.status_code, status.HTTP_201_CREATED)
+        md = self.vol.metadatas.get(type=MetaType.Date, name='edition')
+        self.assertEqual(md.value, raw_value)
+        fetched = self.client.get(reverse('api:metadata-edit', kwargs={'pk': str(md.id)}))
+        self.assertEqual(fetched.status_code, status.HTTP_200_OK)
+        self.assertEqual(fetched.json()['value'], '&lt;a href="lalaland"&gt;oops&lt;/a&gt;')
diff --git a/requirements.txt b/requirements.txt
index 2f15387e52..40ec288a87 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@
 
 apistar==0.7.2
 git+https://gitlab.com/teklia/apistar.git#egg=apistar
+bleach==3.3.0
 django-admin-hstore-widget==1.1.0
 django-cors-headers==3.7.0
 django-enumfields==2.1.1
-- 
GitLab