Compare revisions: atr/dan
Commits on Source (14)
Showing with 216 additions and 133 deletions
@@ -3,10 +3,6 @@ stages:
   - build
   - deploy
 
-variables:
-  # Submodule clone
-  GIT_SUBMODULE_STRATEGY: recursive
-
 lint:
   image: python:3.10
   stage: test
@@ -46,6 +42,14 @@ test:
     - apt-get update -q
     - apt-get install -q -y libgl1
+    # Add netrc file
+    - |
+      echo "
+      machine gitlab.teklia.com
+      login gitlab-ci-token
+      password $CI_JOB_TOKEN
+      " > ~/.netrc
   except:
     - schedules
@@ -64,6 +68,15 @@ docker-build:
   except:
     - schedules
+  before_script:
+    # Add netrc file
+    - |
+      echo "
+      machine gitlab.teklia.com
+      login gitlab-ci-token
+      password $CI_JOB_TOKEN
+      " > ~/.netrc
   script:
     - ci/build.sh
@@ -75,7 +88,15 @@ docker-build:
     - public
   before_script:
-    - pip install -e .[docs]
+    # Add netrc file
+    - |
+      echo "
+      machine gitlab.teklia.com
+      login gitlab-ci-token
+      password $CI_JOB_TOKEN
+      " > ~/.netrc
+    - pip install --index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple -e .[docs]
   script:
     - mkdocs build --strict --verbose
@@ -149,6 +170,25 @@ docs-stop-surge:
   script:
     - surge teardown ${CI_ENVIRONMENT_URL}
 
+deploy-pypi:
+  stage: deploy
+  image: python:3.10
+  only:
+    - tags
+  variables:
+    TWINE_USERNAME: gitlab-ci-token
+    TWINE_PASSWORD: ${CI_JOB_TOKEN}
+    TWINE_REPOSITORY_URL: ${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/pypi
+  before_script:
+    - pip install twine
+  script:
+    - python setup.py sdist bdist_wheel
+    - twine upload --repository-url ${TWINE_REPOSITORY_URL} dist/*
+
 bump-python-deps:
   stage: deploy
   image: registry.gitlab.teklia.com/infra/devops:latest
......
-[submodule "nerval"]
-	path = nerval
-	url = ../../ner/nerval.git
@@ -7,12 +7,10 @@ RUN apt-get -y update && \
 WORKDIR /src
 
-# Copy submodule data
-COPY nerval nerval
-
 # Copy DAN data
 COPY dan dan
 COPY requirements.txt *-requirements.txt setup.py VERSION README.md ./
 
-# Install DAN as a package
-RUN pip install . --no-cache-dir
+# Install DAN as a package with GitLab package registry
+RUN --mount=type=secret,id=netrc,target=/root/.netrc \
+    pip install . --no-cache-dir --index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple
@@ -10,9 +10,20 @@ This is an open-source project, licensed using [the MIT license](https://opensou
 For development and testing purposes it may be useful to install the project as an editable package with pip.
 
-- Use a virtualenv (e.g. with virtualenvwrapper `mkvirtualenv -a . dan`)
-- Initialize the [`Nerval`](https://gitlab.teklia.com/ner/nerval) submodule (e.g. `git submodule update --init --recursive`)
-- Install `dan` as a package (e.g. `pip install -e .`)
+This package relies on a GitLab package registry that hosts the nerval source code.
+You need [a personal access token](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html) and access to the [nerval repository](https://gitlab.teklia.com/ner/nerval) to install this module. Add the following to your `~/.netrc` file:
+
+```shell
+machine gitlab.teklia.com
+login __token__
+password <YOUR_PERSONAL_TOKEN>
+```
+
+Then install the package as an editable package with pip:
+
+```shell
+pip3 install --index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple -e .
+```
 
 ### Linter
......
-0.2.0-dev4
+0.2.0-dev6
@@ -17,7 +17,7 @@ fi
 IMAGE_TAG="$CI_REGISTRY_IMAGE:$VERSION"
 
 cd $CI_PROJECT_DIR
-docker build -f Dockerfile . -t "$IMAGE_TAG"
+docker build -f Dockerfile . -t "$IMAGE_TAG" --secret id=netrc,src=$HOME/.netrc
 
 # Publish the image on the main branch or on a tag
 if [ "$CI_COMMIT_REF_NAME" = "main" -o -n "$CI_COMMIT_TAG" ]; then
......
@@ -17,10 +17,13 @@ from dan.utils import EntityType, LMTokenMapping
 logger = logging.getLogger(__name__)
 
-# replace \t with regular space and consecutive spaces
+# Replace \t with regular space and consecutive spaces
 TRIM_SPACE_REGEX = re.compile(r"[\t ]+")
 TRIM_RETURN_REGEX = re.compile(r"[\r\n]+")
 
+# Remove invalid characters to build a valid XML tag name
+SLUG_PATTERN = re.compile(r"[\W]+")
+
 # Some characters are encoded in XML but we don't want them encoded in the end
 ENCODING_MAP = {
     "&#13;": "\r",
@@ -174,9 +177,9 @@ class Tokenizer:
 def slugify(text: str):
     """
-    Replace space in text to underscores to use it as XML tag.
+    Replace invalid characters in text with underscores to use it as an XML tag.
     """
-    return text.replace(" ", "_")
+    return SLUG_PATTERN.sub("_", text)
 
 def get_translation_map(tokens: Dict[str, EntityType]) -> Dict[str, str] | None:
......
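Aside: a minimal, self-contained sketch of the new `slugify` behaviour, assuming only the `SLUG_PATTERN` definition from the hunk above. The expected tag names are taken from the `special-chars` test fixtures added later in this diff.

```python
import re

# Any run of non-word characters collapses into a single underscore,
# so arbitrary entity names become valid XML tag names.
SLUG_PATTERN = re.compile(r"[\W]+")


def slugify(text: str) -> str:
    """Replace invalid characters in text with underscores to use it as an XML tag."""
    return SLUG_PATTERN.sub("_", text)


# Expected values match the special-chars test fixtures below.
assert slugify("Arkindex's entity") == "Arkindex_s_entity"
assert slugify('"Name" (1)') == "_Name_1_"
assert slugify("Person /!\\") == "Person_"
```

Note that distinct entity types can collide after slugging (e.g. `Person /!\` and `Person ?` would both map to `Person_`); the fixtures below only exercise non-colliding names.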
@@ -15,6 +15,9 @@ import numpy as np
 import torch
 import torch.multiprocessing as mp
 from edlib import align, getNiceAlignment
+from nerval.evaluate import evaluate
+from nerval.parse import parse_bio
+from nerval.utils import print_results
 from prettytable import MARKDOWN, PrettyTable
 
 from dan.bio import convert
@@ -22,9 +25,6 @@ from dan.ocr.manager.metrics import Inference
 from dan.ocr.manager.training import Manager
 from dan.ocr.utils import add_metrics_table_row, create_metrics_table, update_config
 from dan.utils import parse_tokens, read_json
-from nerval.evaluate import evaluate
-from nerval.parse import parse_bio
-from nerval.utils import print_results
 
 logger = logging.getLogger(__name__)
......
@@ -62,9 +62,11 @@ Add the `docs` extra when installing `teklia-dan`:
 ```shell
 # In a clone of the Git repository
-pip install .[docs]
+pip install --index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple .[docs]
 ```
 
+The `--index-url` argument is required to find the `nerval` package.
+
 Build the documentation using `mkdocs serve -v`. You can then write in [Markdown](https://www.markdownguide.org/) in the relevant `docs/*.md` files, and see live output on http://localhost:8000.
 
 ### Linter
......
@@ -2,41 +2,43 @@
 ## Installation
 
-To use DAN in your own environment, you need to install it as a dependency or manually.
+DAN relies on a GitLab package registry that hosts the nerval source code.
+You need [a personal access token](https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html) and access to the [nerval repository](https://gitlab.teklia.com/ner/nerval) to install this module. Add the following to your `~/.netrc` file:
+
+```shell
+machine gitlab.teklia.com
+login __token__
+password <YOUR_PERSONAL_TOKEN>
+```
+
+If you install DAN as a dependency, the host must have access to this configuration file to be able to download the [nerval](https://gitlab.teklia.com/ner/nerval) package.
+
 ### As a dependency
 
-To install DAN as a dependency, you need to first add the following line to your `requirements.txt` file:
+To install DAN as a dependency, first add the following lines to your `requirements.txt` file:
 
 ```shell
-teklia-dan @ git+ssh://git@gitlab.teklia.com/atr/dan.git
+--index-url https://gitlab.teklia.com/api/v4/projects/98/packages/pypi/simple --extra-index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple
+teklia-dan
 ```
 
+The `--index-url` argument is required to find the `DAN` package; the `--extra-index-url` argument is needed to find the `nerval` dependency.
+
 Then you can install it via pip:
 
 ```shell
 pip install -r requirements.txt
 ```
 
-### Manually
+### Directly
 
-To install DAN manually, you need to first clone via:
+To install DAN directly, install it via pip:
 
 ```shell
-git clone git@gitlab.teklia.com:atr/dan.git
+pip3 install --index-url https://gitlab.teklia.com/api/v4/projects/98/packages/pypi/simple --extra-index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple teklia-dan
 ```
 
-Then you can initialize the [`Nerval`](https://gitlab.teklia.com/ner/nerval) submodule:
-
-```shell
-git submodule update --init --recursive
-```
-
-Then you can install it via pip:
-
-```shell
-pip install .
-```
+The `--index-url` argument is required to find the `DAN` package; the `--extra-index-url` argument is needed to find the `nerval` dependency.
 
 ---
......
@@ -242,9 +242,11 @@ To log your experiment on MLFlow, you need to:
 - install the extra requirements via
 
   ```shell
-  $ pip install .[mlflow]
+  $ pip install --index-url https://gitlab.teklia.com/api/v4/projects/210/packages/pypi/simple .[mlflow]
   ```
 
+  The `--index-url` argument is required to find the `nerval` package.
+
 - update the following arguments:
 
 | Name | Description | Type | Default |
......
-Subproject commit 525c1a9e6d5a33075669085148247e2604dd092f
@@ -24,6 +24,7 @@ known-third-party = [
     "cv2",
     "editdistance",
     "imageio",
+    "nerval",
     "numpy",
     "scipy",
     "tensorboard",
......
--e ./nerval
 albumentations==1.3.1
 arkindex-export==0.1.9
 boto3==1.26.124
@@ -13,6 +12,7 @@ PyYAML==6.0
 scipy==1.10.1
 sentencepiece==0.1.99
 teklia-line-image-extractor==0.2.8rc5
+teklia-nerval==0.3.1
 tenacity==8.2.3
 tensorboard==2.12.2
 torch==2.0.0
......
@@ -213,6 +213,7 @@ def mock_database(tmp_path_factory):
         element=Element.select().first(),
     )
 
+    # Create worker version
     WorkerVersion.bulk_create(
         [
             WorkerVersion(
@@ -223,11 +224,12 @@ def mock_database(tmp_path_factory):
                 revision="main",
                 type="worker",
             )
-            for nestation in ("nested", "non-nested")
+            for nestation in ("nested", "non-nested", "special-chars")
         ]
     )
 
-    entities = [
+    # Create entities
+    for entity in [
         # Non-nested entities
         {
             "worker_version": "non-nested-id",
@@ -266,8 +268,26 @@ def mock_database(tmp_path_factory):
             "name": "us",
             "offset": 43,
         },
-    ]
-    for entity in entities:
+        # Special characters
+        {
+            "worker_version": "special-chars-id",
+            "type": "Arkindex's entity",
+            "name": "great",
+            "offset": 4,
+        },
+        {
+            "worker_version": "special-chars-id",
+            "type": '"Name" (1)',
+            "name": "Charles",
+            "offset": 15,
+        },
+        {
+            "worker_version": "special-chars-id",
+            "type": "Person /!\\",
+            "name": "us",
+            "offset": 43,
+        },
+    ]:
         create_transcription_entity(transcription=transcription, **entity)
 
     return database_path
......
 ---
 entities:
+  - '"Name" (1)'
+  - Arkindex's entity
+  - Person /!\
   - adj
   - birthdate
   - firstname
......
 ---
+'"Name" (1)':
+  start:
+  end:
+Arkindex's entity:
+  start:
+  end:
+Person /!\:
+  start:
+  end:
 adj:
   start:
   end:
 birthdate:
   start:
   end:
 firstname:
   start:
   end:
 fullname:
   start:
   end:
 name:
   start:
   end:
 person:
   start:
   end:
 surname:
   start:
   end:
 ---
+'"Name" (1)':
+  start:
+  end: ''
+Arkindex's entity:
+  start:
+  end: ''
+Person /!\:
+  start:
+  end: ''
 adj:
   start:
   end: ''
 birthdate:
   start:
   end: ''
 firstname:
   start:
   end: ''
 fullname:
   start:
   end: ''
 name:
   start:
   end: ''
 person:
   start:
   end: ''
 surname:
   start:
   end: ''
@@ -28,80 +28,53 @@ def full_statistics():
     return MdUtils(file_name="").read_md_file(str(FIXTURES / "analyze" / "stats"))
 
-@pytest.mark.parametrize(
-    "im_paths, expected_summary",
-    (
-        (
-            [
-                "tests/data/training/training_dataset/images/0a34e13a-4ab0-4a91-8d7c-b1d8fee32628.png",
-                "tests/data/training/training_dataset/images/0a70e14f-feda-4607-989c-36cf581ddff5.png",
-                "tests/data/training/training_dataset/images/0a576062-303c-4893-a729-c09c92865d31.png",
-                "tests/data/training/training_dataset/images/0b2457c8-81f1-4600-84d9-f8bf2822a991.png",
-                "tests/data/training/training_dataset/images/fb3edb59-3678-49f8-8e16-8e32e3b0f051.png",
-                "tests/data/training/training_dataset/images/fe498de2-ece4-4fbe-8b53-edfce1b820f0.png",
-            ],
-            pytest.lazy_fixture("image_statistics"),
-        ),
-    ),
-)
-def test_display_image_statistics(im_paths, expected_summary, tmp_path):
+def test_display_image_statistics(image_statistics, tmp_path):
     stats = Statistics(filename=tmp_path)
-    stats.create_image_statistics(images=im_paths)
-    assert stats.document.get_md_text() == expected_summary
+    stats.create_image_statistics(
+        images=[
+            "tests/data/training/training_dataset/images/0a34e13a-4ab0-4a91-8d7c-b1d8fee32628.png",
+            "tests/data/training/training_dataset/images/0a70e14f-feda-4607-989c-36cf581ddff5.png",
+            "tests/data/training/training_dataset/images/0a576062-303c-4893-a729-c09c92865d31.png",
+            "tests/data/training/training_dataset/images/0b2457c8-81f1-4600-84d9-f8bf2822a991.png",
+            "tests/data/training/training_dataset/images/fb3edb59-3678-49f8-8e16-8e32e3b0f051.png",
+            "tests/data/training/training_dataset/images/fe498de2-ece4-4fbe-8b53-edfce1b820f0.png",
+        ]
+    )
+    assert stats.document.get_md_text() == image_statistics
 
-@pytest.mark.parametrize(
-    "texts, expected_summary",
-    (
-        (
-            [
-                "Teklia’s expertise is to develop document analysis\nand processing solutions using, among other things,\nOCR technology.",
-                "Our software combines image analysis, printed and\nhandwritten text recognition, text segmentation with\na document classification and indexation system.",
-                "Our objective is to deliver to our clients an automated\ndocument processing tool easy-to-use and adapted\nto their needs.",
-                "With the same state of mind, we developed additional solutions to\nenhance both security and business-process.",
-            ],
-            pytest.lazy_fixture("labels_statistics"),
-        ),
-    ),
-)
-def test_display_label_statistics(texts, expected_summary, tmp_path):
+def test_display_label_statistics(labels_statistics, tmp_path):
     filename = tmp_path / "labels.md"
     stats = Statistics(filename=str(filename))
-    stats.create_label_statistics(labels=texts)
-    assert stats.document.get_md_text() == expected_summary
+    stats.create_label_statistics(
+        labels=[
+            "Teklia’s expertise is to develop document analysis\nand processing solutions using, among other things,\nOCR technology.",
+            "Our software combines image analysis, printed and\nhandwritten text recognition, text segmentation with\na document classification and indexation system.",
+            "Our objective is to deliver to our clients an automated\ndocument processing tool easy-to-use and adapted\nto their needs.",
+            "With the same state of mind, we developed additional solutions to\nenhance both security and business-process.",
+        ]
+    )
+    assert stats.document.get_md_text() == labels_statistics
 
-@pytest.mark.parametrize(
-    "texts, expected_summary",
-    (
-        (
-            [
-                "ⓈDayon ⒻFernand Ⓐ6\nⓈDayen ⒻMaurice Ⓐ2\nⓈTottelier ⒻJean Baptiste Ⓐ59",
-                "ⓈPeryro ⒻEtienne Ⓐ33\nⓈJeannot ⒻCaroline Ⓐ24\nⓈMouline ⒻPierre Ⓐ32",
-            ],
-            pytest.lazy_fixture("ner_statistics"),
-        ),
-    ),
-)
-def test_display_ner_statistics(texts, expected_summary, tmp_path):
+def test_display_ner_statistics(ner_statistics, tmp_path):
     tokens = read_yaml(FIXTURES / "training" / "training_dataset" / "tokens.yaml")
     stats = Statistics(filename=tmp_path)
-    stats.create_ner_statistics(labels=texts, ner_tokens=tokens)
-    assert stats.document.get_md_text() == expected_summary
+    stats.create_ner_statistics(
+        labels=[
+            "ⓈDayon ⒻFernand Ⓐ6\nⓈDayen ⒻMaurice Ⓐ2\nⓈTottelier ⒻJean Baptiste Ⓐ59",
+            "ⓈPeryro ⒻEtienne Ⓐ33\nⓈJeannot ⒻCaroline Ⓐ24\nⓈMouline ⒻPierre Ⓐ32",
+        ],
+        ner_tokens=tokens,
+    )
+    assert stats.document.get_md_text() == ner_statistics
 
-@pytest.mark.parametrize(
-    "labels, tokens, expected_summary",
-    (
-        (
-            FIXTURES / "training" / "training_dataset" / "labels.json",
-            FIXTURES / "training" / "training_dataset" / "tokens.yaml",
-            pytest.lazy_fixture("full_statistics"),
-        ),
-    ),
-)
-def test_run(labels, tokens, expected_summary, tmp_path):
+def test_run(full_statistics, tmp_path):
     output_file = tmp_path / "stats.md"
     stats = Statistics(filename=str(output_file))
-    stats.run(labels=read_json(labels), tokens=read_yaml(tokens))
-    assert output_file.read_text() == expected_summary
+    stats.run(
+        labels=read_json(FIXTURES / "training" / "training_dataset" / "labels.json"),
+        tokens=read_yaml(FIXTURES / "training" / "training_dataset" / "tokens.yaml"),
+    )
+    assert output_file.read_text() == full_statistics
@@ -478,6 +478,12 @@ def test_extract_transcription_no_translation(mock_database, tokens):
             "<root><fullname><name>Charles</name> III</fullname>\n<person>us</person></root>",
             ["\n", " "],
         ),
+        # Special characters in entities
+        (
+            "special-chars-id",
+            "<root>The <Arkindex_s_entity>great</Arkindex_s_entity> king <_Name_1_>Charles</_Name_1_> III has eaten \nwith <Person_>us</Person_>.</root>",
+            None,
+        ),
     ),
 )
 def test_entities_to_xml(mock_database, nestation, xml_output, separators):
@@ -488,7 +494,15 @@ def test_entities_to_xml(mock_database, nestation, xml_output, separators):
         predictions=get_transcription_entities(
             transcription_id="tr-with-entities",
             entity_worker_versions=[nestation],
-            supported_types=["name", "fullname", "person", "adj"],
+            supported_types=[
+                "name",
+                "fullname",
+                "person",
+                "adj",
+                "Arkindex's entity",
+                '"Name" (1)',
+                "Person /!\\",
+            ],
         ),
         entity_separators=separators,
     )
......