Compare revisions: atr/dan
Commits on Source (14)
Showing with 216 additions and 133 deletions
@@ -3,10 +3,6 @@ stages:
   - build
   - deploy
 
-variables:
-  # Submodule clone
-  GIT_SUBMODULE_STRATEGY: recursive
-
 lint:
   image: python:3.10
   stage: test
@@ -46,6 +42,14 @@ test:
     - apt-get update -q
     - apt-get install -q -y libgl1
+    # Add netrc file
+    - |
+      echo "
+      machine gitlab.teklia.com
+      login gitlab-ci-token
+      password $CI_JOB_TOKEN
+      " > ~/.netrc
   except:
     - schedules
@@ -64,6 +68,15 @@ docker-build:
   except:
     - schedules
+  before_script:
+    # Add netrc file
+    - |
+      echo "
+      machine gitlab.teklia.com
+      login gitlab-ci-token
+      password $CI_JOB_TOKEN
+      " > ~/.netrc
   script:
     - ci/build.sh
@@ -75,7 +88,15 @@ docker-build:
     - public
   before_script:
-    - pip install -e .[docs]
+    # Add netrc file
+    - |
+      echo "
+      machine gitlab.teklia.com
+      login gitlab-ci-token
+      password $CI_JOB_TOKEN
+      " > ~/.netrc
+    - pip install --index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple -e .[docs]
   script:
     - mkdocs build --strict --verbose
@@ -149,6 +170,25 @@ docs-stop-surge:
   script:
     - surge teardown ${CI_ENVIRONMENT_URL}
 
+deploy-pypi:
+  stage: deploy
+  image: python:3.10
+  only:
+    - tags
+  variables:
+    TWINE_USERNAME: gitlab-ci-token
+    TWINE_PASSWORD: ${CI_JOB_TOKEN}
+    TWINE_REPOSITORY_URL: ${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/pypi
+  before_script:
+    - pip install twine
+  script:
+    - python setup.py sdist bdist_wheel
+    - twine upload --repository-url ${TWINE_REPOSITORY_URL} dist/*
+
 bump-python-deps:
   stage: deploy
   image: registry.gitlab.teklia.com/infra/devops:latest
......
-[submodule "nerval"]
-	path = nerval
-	url = ../../ner/nerval.git
@@ -7,12 +7,10 @@ RUN apt-get -y update && \
 WORKDIR /src
 
-# Copy submodule data
-COPY nerval nerval
-
 # Copy DAN data
 COPY dan dan
 COPY requirements.txt *-requirements.txt setup.py VERSION README.md ./
 
-# Install DAN as a package
-RUN pip install . --no-cache-dir
+# Install DAN as a package with GitLab package registry
+RUN --mount=type=secret,id=netrc,target=/root/.netrc \
+    pip install . --no-cache-dir --index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple
@@ -10,9 +10,20 @@ This is an open-source project, licensed using [the MIT license](https://opensou
 For development and testing purposes it may be useful to install the project as an editable package with pip.
 
-- Use a virtualenv (e.g. with virtualenvwrapper `mkvirtualenv -a . dan`)
-- Initialize the [`Nerval`](https://gitlab.teklia.com/ner/nerval) submodule (e.g. `git submodule update --init --recursive`)
-- Install `dan` as a package (e.g. `pip install -e .`)
+This package relies on a GitLab package registry that hosts the nerval source code.
+You need [a personal access token](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html) and access to the [nerval repository](https://gitlab.teklia.com/ner/nerval) to install this module. Add the following to your `~/.netrc` file:
+
+```shell
+machine gitlab.teklia.com
+login __token__
+password <YOUR_PERSONAL_TOKEN>
+```
+
+Then install the package as an editable package with pip:
+
+```shell
+pip3 install --index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple -e .
+```
 
 ### Linter
......
-0.2.0-dev4
+0.2.0-dev6
@@ -17,7 +17,7 @@ fi
 IMAGE_TAG="$CI_REGISTRY_IMAGE:$VERSION"
 
 cd $CI_PROJECT_DIR
-docker build -f Dockerfile . -t "$IMAGE_TAG"
+docker build -f Dockerfile . -t "$IMAGE_TAG" --secret id=netrc,src=$HOME/.netrc
 
 # Publish the image on the main branch or on a tag
 if [ "$CI_COMMIT_REF_NAME" = "main" -o -n "$CI_COMMIT_TAG" ]; then
......
@@ -17,10 +17,13 @@ from dan.utils import EntityType, LMTokenMapping
 logger = logging.getLogger(__name__)
 
-# replace \t with regular space and consecutive spaces
+# Replace \t with regular space and consecutive spaces
 TRIM_SPACE_REGEX = re.compile(r"[\t ]+")
 TRIM_RETURN_REGEX = re.compile(r"[\r\n]+")
 
+# Remove invalid characters to build a valid XML tag name
+SLUG_PATTERN = re.compile(r"[\W]+")
+
 # Some characters are encoded in XML but we don't want them encoded in the end
 ENCODING_MAP = {
     "&#13;": "\r",
@@ -174,9 +177,9 @@ class Tokenizer:
 def slugify(text: str):
     """
-    Replace space in text to underscores to use it as XML tag.
+    Replace invalid characters in text with underscores to use it as an XML tag.
     """
-    return text.replace(" ", "_")
+    return SLUG_PATTERN.sub("_", text)
 
 def get_translation_map(tokens: Dict[str, EntityType]) -> Dict[str, str] | None:
......
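Aside: a minimal, self-contained sketch of the new `slugify` behaviour, assuming only the `SLUG_PATTERN` definition from the hunk above. The expected tag names are taken from the `special-chars` test fixtures added later in this diff.

```python
import re

# Any run of non-word characters collapses into a single underscore,
# so arbitrary entity names become valid XML tag names.
SLUG_PATTERN = re.compile(r"[\W]+")


def slugify(text: str) -> str:
    """Replace invalid characters in text with underscores to use it as an XML tag."""
    return SLUG_PATTERN.sub("_", text)


# Expected values match the special-chars test fixtures below.
assert slugify("Arkindex's entity") == "Arkindex_s_entity"
assert slugify('"Name" (1)') == "_Name_1_"
assert slugify("Person /!\\") == "Person_"
```

Note that distinct entity types can collide after slugging (e.g. `Person /!\` and `Person ?` would both map to `Person_`); the fixtures below only exercise non-colliding names.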
@@ -15,6 +15,9 @@ import numpy as np
 import torch
 import torch.multiprocessing as mp
 from edlib import align, getNiceAlignment
+from nerval.evaluate import evaluate
+from nerval.parse import parse_bio
+from nerval.utils import print_results
 from prettytable import MARKDOWN, PrettyTable
 
 from dan.bio import convert
@@ -22,9 +25,6 @@ from dan.ocr.manager.metrics import Inference
 from dan.ocr.manager.training import Manager
 from dan.ocr.utils import add_metrics_table_row, create_metrics_table, update_config
 from dan.utils import parse_tokens, read_json
-from nerval.evaluate import evaluate
-from nerval.parse import parse_bio
-from nerval.utils import print_results
 
 logger = logging.getLogger(__name__)
......
@@ -62,9 +62,11 @@ Add the `docs` extra when installing `teklia-dan`:
 ```shell
 # In a clone of the Git repository
-pip install .[docs]
+pip install --index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple .[docs]
 ```
 
+The `--index-url` argument is required to find the `nerval` package.
+
 Build the documentation using `mkdocs serve -v`. You can then write in [Markdown](https://www.markdownguide.org/) in the relevant `docs/*.md` files, and see live output on http://localhost:8000.
 
 ### Linter
......
@@ -2,41 +2,43 @@
 ## Installation
 
-To use DAN in your own environment, you need to install it as a dependency or manually.
+DAN relies on a GitLab package registry that hosts the nerval source code.
+You need [a personal access token](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html) and access to the [nerval repository](https://gitlab.teklia.com/ner/nerval) to install this module. Add the following to your `~/.netrc` file:
+
+```shell
+machine gitlab.teklia.com
+login __token__
+password <YOUR_PERSONAL_TOKEN>
+```
+
+If you install DAN as a dependency, the host must have access to this configuration file to be able to download the [nerval](https://gitlab.teklia.com/ner/nerval) package.
+
 ### As a dependency
 
-To install DAN as a dependency, you need to first add the following line to your `requirements.txt` file:
+To install DAN as a dependency, first add the following lines to your `requirements.txt` file:
 
 ```shell
-teklia-dan @ git+ssh://git@gitlab.teklia.com/atr/dan.git
+--index-url https://gitlab.teklia.com/api/v4/projects/98/packages/pypi/simple --extra-index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple
+teklia-dan
 ```
 
+The `--index-url` argument is required to find the `DAN` package; the `--extra-index-url` argument is needed to find the `nerval` dependency.
+
 Then you can install it via pip:
 
 ```shell
 pip install -r requirements.txt
 ```
 
-### Manually
+### Directly
 
-To install DAN manually, you need to first clone via:
+To install DAN directly, install it via pip:
 
 ```shell
-git clone git@gitlab.teklia.com:atr/dan.git
+pip3 install --index-url https://gitlab.teklia.com/api/v4/projects/98/packages/pypi/simple --extra-index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple teklia-dan
 ```
 
-Then you can initialize the [`Nerval`](https://gitlab.teklia.com/ner/nerval) submodule:
-
-```shell
-git submodule update --init --recursive
-```
-
-Then you can install it via pip:
-
-```shell
-pip install .
-```
+The `--index-url` argument is required to find the `DAN` package; the `--extra-index-url` argument is needed to find the `nerval` dependency.
 
 ---
......
@@ -242,9 +242,11 @@ To log your experiment on MLFlow, you need to:
 - install the extra requirements via
 
   ```shell
-  $ pip install .[mlflow]
+  $ pip install --index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple .[mlflow]
   ```
 
+  The `--index-url` argument is required to find the `nerval` package.
+
 - update the following arguments:
 
 | Name | Description | Type | Default |
......
-Subproject commit 525c1a9e6d5a33075669085148247e2604dd092f
@@ -24,6 +24,7 @@ known-third-party = [
     "cv2",
     "editdistance",
     "imageio",
+    "nerval",
     "numpy",
     "scipy",
     "tensorboard",
......
--e ./nerval
 albumentations==1.3.1
 arkindex-export==0.1.9
 boto3==1.26.124
@@ -13,6 +12,7 @@ PyYAML==6.0
 scipy==1.10.1
 sentencepiece==0.1.99
 teklia-line-image-extractor==0.2.8rc5
+teklia-nerval==0.3.1
 tenacity==8.2.3
 tensorboard==2.12.2
 torch==2.0.0
......
@@ -213,6 +213,7 @@ def mock_database(tmp_path_factory):
         element=Element.select().first(),
     )
 
+    # Create worker version
     WorkerVersion.bulk_create(
         [
             WorkerVersion(
@@ -223,11 +224,12 @@ def mock_database(tmp_path_factory):
                 revision="main",
                 type="worker",
             )
-            for nestation in ("nested", "non-nested")
+            for nestation in ("nested", "non-nested", "special-chars")
         ]
     )
 
-    entities = [
+    # Create entities
+    for entity in [
         # Non-nested entities
         {
             "worker_version": "non-nested-id",
@@ -266,8 +268,26 @@ def mock_database(tmp_path_factory):
             "name": "us",
             "offset": 43,
         },
-    ]
-    for entity in entities:
+        # Special characters
+        {
+            "worker_version": "special-chars-id",
+            "type": "Arkindex's entity",
+            "name": "great",
+            "offset": 4,
+        },
+        {
+            "worker_version": "special-chars-id",
+            "type": '"Name" (1)',
+            "name": "Charles",
+            "offset": 15,
+        },
+        {
+            "worker_version": "special-chars-id",
+            "type": "Person /!\\",
+            "name": "us",
+            "offset": 43,
+        },
+    ]:
         create_transcription_entity(transcription=transcription, **entity)
 
     return database_path
......
 ---
 entities:
+  - '"Name" (1)'
+  - Arkindex's entity
+  - Person /!\
   - adj
   - birthdate
   - firstname
......
 ---
+'"Name" (1)':
+  start:
+  end:
+Arkindex's entity:
+  start:
+  end:
+Person /!\:
+  start:
+  end:
 adj:
   start:
   end:
 birthdate:
   start:
   end:
 firstname:
   start:
   end:
 fullname:
   start:
   end:
 name:
   start:
   end:
 person:
   start:
   end:
 surname:
   start:
   end:
 ---
+'"Name" (1)':
+  start:
+  end: ''
+Arkindex's entity:
+  start:
+  end: ''
+Person /!\:
+  start:
+  end: ''
 adj:
   start:
   end: ''
 birthdate:
   start:
   end: ''
 firstname:
   start:
   end: ''
 fullname:
   start:
   end: ''
 name:
   start:
   end: ''
 person:
   start:
   end: ''
 surname:
   start:
   end: ''
@@ -28,80 +28,53 @@ def full_statistics():
     return MdUtils(file_name="").read_md_file(str(FIXTURES / "analyze" / "stats"))
 
-@pytest.mark.parametrize(
-    "im_paths, expected_summary",
-    (
-        (
-            [
-                "tests/data/training/training_dataset/images/0a34e13a-4ab0-4a91-8d7c-b1d8fee32628.png",
-                "tests/data/training/training_dataset/images/0a70e14f-feda-4607-989c-36cf581ddff5.png",
-                "tests/data/training/training_dataset/images/0a576062-303c-4893-a729-c09c92865d31.png",
-                "tests/data/training/training_dataset/images/0b2457c8-81f1-4600-84d9-f8bf2822a991.png",
-                "tests/data/training/training_dataset/images/fb3edb59-3678-49f8-8e16-8e32e3b0f051.png",
-                "tests/data/training/training_dataset/images/fe498de2-ece4-4fbe-8b53-edfce1b820f0.png",
-            ],
-            pytest.lazy_fixture("image_statistics"),
-        ),
-    ),
-)
-def test_display_image_statistics(im_paths, expected_summary, tmp_path):
+def test_display_image_statistics(image_statistics, tmp_path):
     stats = Statistics(filename=tmp_path)
-    stats.create_image_statistics(images=im_paths)
-    assert stats.document.get_md_text() == expected_summary
+    stats.create_image_statistics(
+        images=[
+            "tests/data/training/training_dataset/images/0a34e13a-4ab0-4a91-8d7c-b1d8fee32628.png",
+            "tests/data/training/training_dataset/images/0a70e14f-feda-4607-989c-36cf581ddff5.png",
+            "tests/data/training/training_dataset/images/0a576062-303c-4893-a729-c09c92865d31.png",
+            "tests/data/training/training_dataset/images/0b2457c8-81f1-4600-84d9-f8bf2822a991.png",
+            "tests/data/training/training_dataset/images/fb3edb59-3678-49f8-8e16-8e32e3b0f051.png",
+            "tests/data/training/training_dataset/images/fe498de2-ece4-4fbe-8b53-edfce1b820f0.png",
+        ]
+    )
+    assert stats.document.get_md_text() == image_statistics
 
-@pytest.mark.parametrize(
-    "texts, expected_summary",
-    (
-        (
-            [
-                "Teklia’s expertise is to develop document analysis\nand processing solutions using, among other things,\nOCR technology.",
-                "Our software combines image analysis, printed and\nhandwritten text recognition, text segmentation with\na document classification and indexation system.",
-                "Our objective is to deliver to our clients an automated\ndocument processing tool easy-to-use and adapted\nto their needs.",
-                "With the same state of mind, we developed additional solutions to\nenhance both security and business-process.",
-            ],
-            pytest.lazy_fixture("labels_statistics"),
-        ),
-    ),
-)
-def test_display_label_statistics(texts, expected_summary, tmp_path):
+def test_display_label_statistics(labels_statistics, tmp_path):
     filename = tmp_path / "labels.md"
     stats = Statistics(filename=str(filename))
-    stats.create_label_statistics(labels=texts)
-    assert stats.document.get_md_text() == expected_summary
+    stats.create_label_statistics(
+        labels=[
+            "Teklia’s expertise is to develop document analysis\nand processing solutions using, among other things,\nOCR technology.",
+            "Our software combines image analysis, printed and\nhandwritten text recognition, text segmentation with\na document classification and indexation system.",
+            "Our objective is to deliver to our clients an automated\ndocument processing tool easy-to-use and adapted\nto their needs.",
+            "With the same state of mind, we developed additional solutions to\nenhance both security and business-process.",
+        ]
+    )
+    assert stats.document.get_md_text() == labels_statistics
 
-@pytest.mark.parametrize(
-    "texts, expected_summary",
-    (
-        (
-            [
-                "ⓈDayon ⒻFernand Ⓐ6\nⓈDayen ⒻMaurice Ⓐ2\nⓈTottelier ⒻJean Baptiste Ⓐ59",
-                "ⓈPeryro ⒻEtienne Ⓐ33\nⓈJeannot ⒻCaroline Ⓐ24\nⓈMouline ⒻPierre Ⓐ32",
-            ],
-            pytest.lazy_fixture("ner_statistics"),
-        ),
-    ),
-)
-def test_display_ner_statistics(texts, expected_summary, tmp_path):
+def test_display_ner_statistics(ner_statistics, tmp_path):
     tokens = read_yaml(FIXTURES / "training" / "training_dataset" / "tokens.yaml")
     stats = Statistics(filename=tmp_path)
-    stats.create_ner_statistics(labels=texts, ner_tokens=tokens)
-    assert stats.document.get_md_text() == expected_summary
+    stats.create_ner_statistics(
+        labels=[
+            "ⓈDayon ⒻFernand Ⓐ6\nⓈDayen ⒻMaurice Ⓐ2\nⓈTottelier ⒻJean Baptiste Ⓐ59",
+            "ⓈPeryro ⒻEtienne Ⓐ33\nⓈJeannot ⒻCaroline Ⓐ24\nⓈMouline ⒻPierre Ⓐ32",
+        ],
+        ner_tokens=tokens,
+    )
+    assert stats.document.get_md_text() == ner_statistics
 
-@pytest.mark.parametrize(
-    "labels, tokens, expected_summary",
-    (
-        (
-            FIXTURES / "training" / "training_dataset" / "labels.json",
-            FIXTURES / "training" / "training_dataset" / "tokens.yaml",
-            pytest.lazy_fixture("full_statistics"),
-        ),
-    ),
-)
-def test_run(labels, tokens, expected_summary, tmp_path):
+def test_run(full_statistics, tmp_path):
     output_file = tmp_path / "stats.md"
     stats = Statistics(filename=str(output_file))
-    stats.run(labels=read_json(labels), tokens=read_yaml(tokens))
-    assert output_file.read_text() == expected_summary
+    stats.run(
+        labels=read_json(FIXTURES / "training" / "training_dataset" / "labels.json"),
+        tokens=read_yaml(FIXTURES / "training" / "training_dataset" / "tokens.yaml"),
+    )
+    assert output_file.read_text() == full_statistics
@@ -478,6 +478,12 @@ def test_extract_transcription_no_translation(mock_database, tokens):
             "<root><fullname><name>Charles</name> III</fullname>\n<person>us</person></root>",
             ["\n", " "],
         ),
+        # Special characters in entities
+        (
+            "special-chars-id",
+            "<root>The <Arkindex_s_entity>great</Arkindex_s_entity> king <_Name_1_>Charles</_Name_1_> III has eaten \nwith <Person_>us</Person_>.</root>",
+            None,
+        ),
     ),
 )
 def test_entities_to_xml(mock_database, nestation, xml_output, separators):
@@ -488,7 +494,15 @@ def test_entities_to_xml(mock_database, nestation, xml_output, separators):
         predictions=get_transcription_entities(
             transcription_id="tr-with-entities",
             entity_worker_versions=[nestation],
-            supported_types=["name", "fullname", "person", "adj"],
+            supported_types=[
+                "name",
+                "fullname",
+                "person",
+                "adj",
+                "Arkindex's entity",
+                '"Name" (1)',
+                "Person /!\\",
+            ],
         ),
         entity_separators=separators,
     )
......