Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: atr/dan
Commits on Source (14)
Showing with 216 additions and 133 deletions
......@@ -3,10 +3,6 @@ stages:
- build
- deploy
variables:
# Submodule clone
GIT_SUBMODULE_STRATEGY: recursive
lint:
image: python:3.10
stage: test
......@@ -46,6 +42,14 @@ test:
- apt-get update -q
- apt-get install -q -y libgl1
# Add netrc file
- |
echo "
machine gitlab.teklia.com
login gitlab-ci-token
password $CI_JOB_TOKEN
" > ~/.netrc
except:
- schedules
......@@ -64,6 +68,15 @@ docker-build:
except:
- schedules
before_script:
# Add netrc file
- |
echo "
machine gitlab.teklia.com
login gitlab-ci-token
password $CI_JOB_TOKEN
" > ~/.netrc
script:
- ci/build.sh
......@@ -75,7 +88,15 @@ docker-build:
- public
before_script:
- pip install -e .[docs]
# Add netrc file
- |
echo "
machine gitlab.teklia.com
login gitlab-ci-token
password $CI_JOB_TOKEN
" > ~/.netrc
- pip install --index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple -e .[docs]
script:
- mkdocs build --strict --verbose
......@@ -149,6 +170,25 @@ docs-stop-surge:
script:
- surge teardown ${CI_ENVIRONMENT_URL}
deploy-pypi:
stage: deploy
image: python:3.10
only:
- tags
variables:
TWINE_USERNAME: gitlab-ci-token
TWINE_PASSWORD: ${CI_JOB_TOKEN}
TWINE_REPOSITORY_URL: ${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/pypi
before_script:
- pip install twine
script:
- python setup.py sdist bdist_wheel
- twine upload --repository-url ${TWINE_REPOSITORY_URL} dist/*
bump-python-deps:
stage: deploy
image: registry.gitlab.teklia.com/infra/devops:latest
......
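Once the new `deploy-pypi` job has published a tagged release to the project's PyPI registry, the package can be consumed directly; a minimal sketch, using the same index URLs as the installation documentation further down:

```shell
# Install the published package from the GitLab package registries
pip3 install --index-url https://gitlab.teklia.com/api/v4/projects/98/packages/pypi/simple --extra-index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple teklia-dan
```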
[submodule "nerval"]
path = nerval
url = ../../ner/nerval.git
......@@ -7,12 +7,10 @@ RUN apt-get -y update && \
WORKDIR /src
# Copy submodule data
COPY nerval nerval
# Copy DAN data
COPY dan dan
COPY requirements.txt *-requirements.txt setup.py VERSION README.md ./
# Install DAN as a package
RUN pip install . --no-cache-dir
# Install DAN as a package with GitLab package registry
RUN --mount=type=secret,id=netrc,target=/root/.netrc \
pip install . --no-cache-dir --index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple
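The secret mount keeps the registry credentials out of the image layers. `--mount=type=secret` requires BuildKit, so a local build would look something like the sketch below (the `dan` image tag is illustrative; the `--secret` flag matches the one added to `ci/build.sh` further down):

```shell
# BuildKit is required for secret mounts
DOCKER_BUILDKIT=1 docker build . -t dan --secret id=netrc,src=$HOME/.netrc
```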
......@@ -10,9 +10,20 @@ This is an open-source project, licensed using [the MIT license](https://opensou
For development and testing purposes, it may be useful to install the project as an editable package with pip.
- Use a virtualenv (e.g. with virtualenvwrapper `mkvirtualenv -a . dan`)
- Initialize the [`Nerval`](https://gitlab.teklia.com/ner/nerval) submodule (e.g. `git submodule update --init --recursive`)
- Install `dan` as a package (e.g. `pip install -e .`)
This package relies on a GitLab package registry that hosts the nerval source code.
You need [a personal access token](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html) with access to the [nerval repository](https://gitlab.teklia.com/ner/nerval) to install this module. Add the following to your `~/.netrc` file:
```shell
machine gitlab.teklia.com
login __token__
password <YOUR_PERSONAL_TOKEN>
```
Then you can install the package as an editable package with pip:
```shell
pip3 install --index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple -e .
```
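To check that the token and the registry are reachable before installing everything, you can first resolve `nerval` on its own (a sketch; the pinned `teklia-nerval` version comes from `requirements.txt`):

```shell
# Should succeed if ~/.netrc is set up correctly
pip3 install --index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple teklia-nerval==0.3.1
```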
### Linter
......
0.2.0-dev4
0.2.0-dev6
......@@ -17,7 +17,7 @@ fi
IMAGE_TAG="$CI_REGISTRY_IMAGE:$VERSION"
cd $CI_PROJECT_DIR
docker build -f Dockerfile . -t "$IMAGE_TAG"
docker build -f Dockerfile . -t "$IMAGE_TAG" --secret id=netrc,src=$HOME/.netrc
# Publish the image on the main branch or on a tag
if [ "$CI_COMMIT_REF_NAME" = "main" -o -n "$CI_COMMIT_TAG" ]; then
......
......@@ -17,10 +17,13 @@ from dan.utils import EntityType, LMTokenMapping
logger = logging.getLogger(__name__)
# replace \t with regular space and consecutive spaces
# Replace \t with regular space and consecutive spaces
TRIM_SPACE_REGEX = re.compile(r"[\t ]+")
TRIM_RETURN_REGEX = re.compile(r"[\r\n]+")
# Remove invalid characters to build valid XML tag name
SLUG_PATTERN = re.compile(r"[\W]+")
# Some characters are encoded in XML, but we don't want them encoded in the final output
ENCODING_MAP = {
"&#13;": "\r",
......@@ -174,9 +177,9 @@ class Tokenizer:
def slugify(text: str):
"""
Replace space in text to underscores to use it as XML tag.
Replace invalid characters in text with underscores to use it as an XML tag.
"""
return text.replace(" ", "_")
return SLUG_PATTERN.sub("_", text)
def get_translation_map(tokens: Dict[str, EntityType]) -> Dict[str, str] | None:
......
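As a quick illustration (a standalone sketch using only the standard library), the new pattern turns the special-character entity types from the test fixtures below into valid XML tag names:

```python
import re

# Remove invalid characters to build a valid XML tag name
SLUG_PATTERN = re.compile(r"[\W]+")


def slugify(text: str) -> str:
    """Replace invalid characters in text with underscores to use it as an XML tag."""
    return SLUG_PATTERN.sub("_", text)


# Entity types with special characters, as used in the test fixtures below
assert slugify("Arkindex's entity") == "Arkindex_s_entity"
assert slugify('"Name" (1)') == "_Name_1_"
assert slugify("Person /!\\") == "Person_"
```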
......@@ -15,6 +15,9 @@ import numpy as np
import torch
import torch.multiprocessing as mp
from edlib import align, getNiceAlignment
from nerval.evaluate import evaluate
from nerval.parse import parse_bio
from nerval.utils import print_results
from prettytable import MARKDOWN, PrettyTable
from dan.bio import convert
......@@ -22,9 +25,6 @@ from dan.ocr.manager.metrics import Inference
from dan.ocr.manager.training import Manager
from dan.ocr.utils import add_metrics_table_row, create_metrics_table, update_config
from dan.utils import parse_tokens, read_json
from nerval.evaluate import evaluate
from nerval.parse import parse_bio
from nerval.utils import print_results
logger = logging.getLogger(__name__)
......
......@@ -62,9 +62,11 @@ Add the `docs` extra when installing `teklia-dan`:
```shell
# In a clone of the Git repository
pip install .[docs]
pip install --index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple .[docs]
```
The `--index-url` argument is required to find the `nerval` package.
Build the documentation using `mkdocs serve -v`. You can then write in [Markdown](https://www.markdownguide.org/) in the relevant `docs/*.md` files, and see live output on http://localhost:8000.
### Linter
......
......@@ -2,41 +2,43 @@
## Installation
To use DAN in your own environment, you need to install it as a dependency or directly.
DAN relies on a GitLab package registry that hosts the nerval source code.
You need [a personal access token](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html) with access to the [nerval repository](https://gitlab.teklia.com/ner/nerval) to install this module. Add the following to your `~/.netrc` file:
```shell
machine gitlab.teklia.com
login __token__
password <YOUR_PERSONAL_TOKEN>
```
If you install DAN as a dependency, the host must have access to this configuration file to download the [nerval](https://gitlab.teklia.com/ner/nerval) package.
### As a dependency
To install DAN as a dependency, first add the following lines to your `requirements.txt` file:
```shell
teklia-dan @ git+ssh://git@gitlab.teklia.com/atr/dan.git
--index-url https://gitlab.teklia.com/api/v4/projects/98/packages/pypi/simple --extra-index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple
teklia-dan
```
The `--index-url` argument is required to find the `DAN` package, and the `--extra-index-url` argument is needed to find the `nerval` dependency.
Then you can install it via pip:
```shell
pip install -r requirements.txt
```
### Manually
### Directly
To install DAN manually, you need to first clone via:
To install DAN directly, you can use pip:
```shell
git clone git@gitlab.teklia.com:atr/dan.git
pip3 install --index-url https://gitlab.teklia.com/api/v4/projects/98/packages/pypi/simple --extra-index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple teklia-dan
```
Then you can initialize the [`Nerval`](https://gitlab.teklia.com/ner/nerval) submodule:
```shell
git submodule update --init --recursive
```
Then you can install it via pip:
```shell
pip install .
```
The `--index-url` argument is required to find the `DAN` package, and the `--extra-index-url` argument is needed to find the `nerval` dependency.
---
......
......@@ -242,9 +242,11 @@ To log your experiment on MLFlow, you need to:
- install the extra requirements via
```shell
$ pip install .[mlflow]
$ pip install --index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple .[mlflow]
```
The `--index-url` argument is required to find the `nerval` package.
- update the following arguments:
| Name | Description | Type | Default |
......
Subproject commit 525c1a9e6d5a33075669085148247e2604dd092f
......@@ -24,6 +24,7 @@ known-third-party = [
"cv2",
"editdistance",
"imageio",
"nerval",
"numpy",
"scipy",
"tensorboard",
......
-e ./nerval
albumentations==1.3.1
arkindex-export==0.1.9
boto3==1.26.124
......@@ -13,6 +12,7 @@ PyYAML==6.0
scipy==1.10.1
sentencepiece==0.1.99
teklia-line-image-extractor==0.2.8rc5
teklia-nerval==0.3.1
tenacity==8.2.3
tensorboard==2.12.2
torch==2.0.0
......
......@@ -213,6 +213,7 @@ def mock_database(tmp_path_factory):
element=Element.select().first(),
)
# Create worker version
WorkerVersion.bulk_create(
[
WorkerVersion(
......@@ -223,11 +224,12 @@ def mock_database(tmp_path_factory):
revision="main",
type="worker",
)
for nestation in ("nested", "non-nested")
for nestation in ("nested", "non-nested", "special-chars")
]
)
entities = [
# Create entities
for entity in [
# Non-nested entities
{
"worker_version": "non-nested-id",
......@@ -266,8 +268,26 @@ def mock_database(tmp_path_factory):
"name": "us",
"offset": 43,
},
]
for entity in entities:
# Special characters
{
"worker_version": "special-chars-id",
"type": "Arkindex's entity",
"name": "great",
"offset": 4,
},
{
"worker_version": "special-chars-id",
"type": '"Name" (1)',
"name": "Charles",
"offset": 15,
},
{
"worker_version": "special-chars-id",
"type": "Person /!\\",
"name": "us",
"offset": 43,
},
]:
create_transcription_entity(transcription=transcription, **entity)
return database_path
......
---
entities:
- '"Name" (1)'
- Arkindex's entity
- Person /!\
- adj
- birthdate
- firstname
......
---
adj:
'"Name" (1)':
start:
end:
birthdate:
Arkindex's entity:
start:
end:
firstname:
Person /!\:
start:
end:
fullname:
adj:
start:
end:
name:
birthdate:
start:
end:
person:
firstname:
start:
end:
surname:
fullname:
start:
end:
name:
start:
end:
person:
start:
end:
surname:
start:
end:
---
adj:
'"Name" (1)':
start:
end: ''
birthdate:
Arkindex's entity:
start:
end: ''
firstname:
Person /!\:
start:
end: ''
fullname:
adj:
start:
end: ''
name:
birthdate:
start:
end: ''
person:
firstname:
start:
end: ''
surname:
fullname:
start:
end: ''
name:
start:
end: ''
person:
start:
end: ''
surname:
start:
end: ''
......@@ -28,80 +28,53 @@ def full_statistics():
return MdUtils(file_name="").read_md_file(str(FIXTURES / "analyze" / "stats"))
@pytest.mark.parametrize(
"im_paths, expected_summary",
(
(
[
"tests/data/training/training_dataset/images/0a34e13a-4ab0-4a91-8d7c-b1d8fee32628.png",
"tests/data/training/training_dataset/images/0a70e14f-feda-4607-989c-36cf581ddff5.png",
"tests/data/training/training_dataset/images/0a576062-303c-4893-a729-c09c92865d31.png",
"tests/data/training/training_dataset/images/0b2457c8-81f1-4600-84d9-f8bf2822a991.png",
"tests/data/training/training_dataset/images/fb3edb59-3678-49f8-8e16-8e32e3b0f051.png",
"tests/data/training/training_dataset/images/fe498de2-ece4-4fbe-8b53-edfce1b820f0.png",
],
pytest.lazy_fixture("image_statistics"),
),
),
)
def test_display_image_statistics(im_paths, expected_summary, tmp_path):
def test_display_image_statistics(image_statistics, tmp_path):
stats = Statistics(filename=tmp_path)
stats.create_image_statistics(images=im_paths)
assert stats.document.get_md_text() == expected_summary
@pytest.mark.parametrize(
"texts, expected_summary",
(
(
[
"Teklia’s expertise is to develop document analysis\nand processing solutions using, among other things,\nOCR technology.",
"Our software combines image analysis, printed and\nhandwritten text recognition, text segmentation with\na document classification and indexation system.",
"Our objective is to deliver to our clients an automated\ndocument processing tool easy-to-use and adapted\nto their needs.",
"With the same state of mind, we developed additional solutions to\nenhance both security and business-process.",
],
pytest.lazy_fixture("labels_statistics"),
),
),
)
def test_display_label_statistics(texts, expected_summary, tmp_path):
stats.create_image_statistics(
images=[
"tests/data/training/training_dataset/images/0a34e13a-4ab0-4a91-8d7c-b1d8fee32628.png",
"tests/data/training/training_dataset/images/0a70e14f-feda-4607-989c-36cf581ddff5.png",
"tests/data/training/training_dataset/images/0a576062-303c-4893-a729-c09c92865d31.png",
"tests/data/training/training_dataset/images/0b2457c8-81f1-4600-84d9-f8bf2822a991.png",
"tests/data/training/training_dataset/images/fb3edb59-3678-49f8-8e16-8e32e3b0f051.png",
"tests/data/training/training_dataset/images/fe498de2-ece4-4fbe-8b53-edfce1b820f0.png",
]
)
assert stats.document.get_md_text() == image_statistics
def test_display_label_statistics(labels_statistics, tmp_path):
filename = tmp_path / "labels.md"
stats = Statistics(filename=str(filename))
stats.create_label_statistics(labels=texts)
assert stats.document.get_md_text() == expected_summary
@pytest.mark.parametrize(
"texts, expected_summary",
(
(
[
"ⓈDayon ⒻFernand Ⓐ6\nⓈDayen ⒻMaurice Ⓐ2\nⓈTottelier ⒻJean Baptiste Ⓐ59",
"ⓈPeryro ⒻEtienne Ⓐ33\nⓈJeannot ⒻCaroline Ⓐ24\nⓈMouline ⒻPierre Ⓐ32",
],
pytest.lazy_fixture("ner_statistics"),
),
),
)
def test_display_ner_statistics(texts, expected_summary, tmp_path):
stats.create_label_statistics(
labels=[
"Teklia’s expertise is to develop document analysis\nand processing solutions using, among other things,\nOCR technology.",
"Our software combines image analysis, printed and\nhandwritten text recognition, text segmentation with\na document classification and indexation system.",
"Our objective is to deliver to our clients an automated\ndocument processing tool easy-to-use and adapted\nto their needs.",
"With the same state of mind, we developed additional solutions to\nenhance both security and business-process.",
]
)
assert stats.document.get_md_text() == labels_statistics
def test_display_ner_statistics(ner_statistics, tmp_path):
tokens = read_yaml(FIXTURES / "training" / "training_dataset" / "tokens.yaml")
stats = Statistics(filename=tmp_path)
stats.create_ner_statistics(labels=texts, ner_tokens=tokens)
assert stats.document.get_md_text() == expected_summary
@pytest.mark.parametrize(
"labels, tokens, expected_summary",
(
(
FIXTURES / "training" / "training_dataset" / "labels.json",
FIXTURES / "training" / "training_dataset" / "tokens.yaml",
pytest.lazy_fixture("full_statistics"),
),
),
)
def test_run(labels, tokens, expected_summary, tmp_path):
stats.create_ner_statistics(
labels=[
"ⓈDayon ⒻFernand Ⓐ6\nⓈDayen ⒻMaurice Ⓐ2\nⓈTottelier ⒻJean Baptiste Ⓐ59",
"ⓈPeryro ⒻEtienne Ⓐ33\nⓈJeannot ⒻCaroline Ⓐ24\nⓈMouline ⒻPierre Ⓐ32",
],
ner_tokens=tokens,
)
assert stats.document.get_md_text() == ner_statistics
def test_run(full_statistics, tmp_path):
output_file = tmp_path / "stats.md"
stats = Statistics(filename=str(output_file))
stats.run(labels=read_json(labels), tokens=read_yaml(tokens))
assert output_file.read_text() == expected_summary
stats.run(
labels=read_json(FIXTURES / "training" / "training_dataset" / "labels.json"),
tokens=read_yaml(FIXTURES / "training" / "training_dataset" / "tokens.yaml"),
)
assert output_file.read_text() == full_statistics
......@@ -478,6 +478,12 @@ def test_extract_transcription_no_translation(mock_database, tokens):
"<root><fullname><name>Charles</name> III</fullname>\n<person>us</person></root>",
["\n", " "],
),
# Special characters in entities
(
"special-chars-id",
"<root>The <Arkindex_s_entity>great</Arkindex_s_entity> king <_Name_1_>Charles</_Name_1_> III has eaten \nwith <Person_>us</Person_>.</root>",
None,
),
),
)
def test_entities_to_xml(mock_database, nestation, xml_output, separators):
......@@ -488,7 +494,15 @@ def test_entities_to_xml(mock_database, nestation, xml_output, separators):
predictions=get_transcription_entities(
transcription_id="tr-with-entities",
entity_worker_versions=[nestation],
supported_types=["name", "fullname", "person", "adj"],
supported_types=[
"name",
"fullname",
"person",
"adj",
"Arkindex's entity",
'"Name" (1)',
"Person /!\\",
],
),
entity_separators=separators,
)
......