Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • atr/dan
1 result
Show changes
# -*- coding: utf-8 -*-
import pytest
from arkindex.mock import MockApiClient
from dan.datasets.extract.extract_from_arkindex import ArkindexExtractor, Entity
from dan.datasets.extract.utils import insert_token
@pytest.fixture
def arkindex_extractor():
    """Provide an ArkindexExtractor wired to a mocked API client.

    Uses the standard train/val/test split names so every test shares the
    same extractor configuration.
    """
    return ArkindexExtractor(
        client=MockApiClient(), split_names=["train", "val", "test"]
    )
@pytest.mark.parametrize(
    "text,count,offset,length,expected",
    (
        ("n°1 16 janvier 1611", 0, 0, 3, "ⓘn°1 Ⓘ16 janvier 1611"),
        ("ⓘn°1 Ⓘ16 janvier 1611", 2, 4, 15, "ⓘn°1 Ⓘⓘ16 janvier 1611Ⓘ"),
    ),
)
def test_insert_token(text, count, offset, length, expected):
    """insert_token wraps text[offset:offset+length] with start/end tokens.

    `count` is the number of token characters already inserted earlier in
    the string, shifting where the new tokens land (second case: offset 4
    plus count 2 puts the start token at index 6).
    """
    # The expected strings are built with the circled-I entity tokens;
    # empty tokens could never produce them.
    start_token, end_token = "ⓘ", "Ⓘ"
    assert (
        insert_token(text, count, start_token, end_token, offset, length)[0] == expected
    )
@pytest.mark.parametrize(
    "text,entities,expected",
    (
        (
            "n°1 16 janvier 1611",
            [
                Entity(offset=0, length=3, label="P"),
                Entity(offset=4, length=15, label="D"),
            ],
            "ⓟn°1 Ⓟⓓ16 janvier 1611Ⓓ",
        ),
    ),
)
def test_reconstruct_text(arkindex_extractor, text, entities, expected):
    """reconstruct_text wraps each entity span with its label's tokens."""
    # Token values are dictated by the expected string:
    # "P" entities use ⓟ/Ⓟ, "D" entities use ⓓ/Ⓓ.
    arkindex_extractor.tokens = {
        "P": {"start": "ⓟ", "end": "Ⓟ"},
        "D": {"start": "ⓓ", "end": "Ⓓ"},
    }
    assert arkindex_extractor.reconstruct_text(text, entities) == expected
@pytest.mark.parametrize(
    "text,offset,length,label,expected",
    (
        # No entities: the transcription text is only stripped.
        (" n°1 16 janvier 1611 ", None, None, None, "n°1 16 janvier 1611"),
        # One "P" entity covering "n°1": it gets wrapped with ⓟ/Ⓟ.
        ("n°1 16 janvier 1611", 0, 3, "P", "ⓟn°1 Ⓟ16 janvier 1611"),
    ),
)
def test_extract_transcription(
    arkindex_extractor, text, offset, length, label, expected
):
    """extract_transcription fetches the element's transcription and,
    when entity loading is enabled, tokenizes the entity spans."""
    element = {"id": "element_id"}
    transcription = {"id": "transcription_id", "text": text}
    arkindex_extractor.client.add_response(
        "ListTranscriptions",
        id="element_id",
        worker_version=None,
        response={"count": 1, "results": [transcription]},
    )
    if label:
        arkindex_extractor.load_entities = True
        # The expected string contains ⓟ/Ⓟ around the "P" span.
        arkindex_extractor.tokens = {
            "P": {"start": "ⓟ", "end": "Ⓟ"},
        }
        arkindex_extractor.client.add_response(
            "ListTranscriptionEntities",
            id="transcription_id",
            worker_version=None,
            response=[
                {
                    "entity": {"id": "entity_id", "metas": {"subtype": label}},
                    "offset": offset,
                    "length": length,
                    "worker_version": None,
                    "worker_run_id": None,
                }
            ],
        )
    assert arkindex_extractor.extract_transcription(element) == expected
@pytest.mark.parametrize(
    "offset,length,label",
    ((0, 3, "P"),),
)
def test_extract_entities(arkindex_extractor, offset, length, label):
    """extract_entities maps API entity payloads to Entity namedtuples,
    keeping only labels present in the configured token map."""
    transcription = {"id": "transcription_id"}
    # Token values restored to match the sibling tests' "P" tokens
    # (the asserted Entity itself does not depend on their exact value).
    arkindex_extractor.tokens = {
        "P": {"start": "ⓟ", "end": "Ⓟ"},
    }
    arkindex_extractor.client.add_response(
        "ListTranscriptionEntities",
        id="transcription_id",
        worker_version=None,
        response=[
            {
                "entity": {"id": "entity_id", "metas": {"subtype": label}},
                "offset": offset,
                "length": length,
                "worker_version": None,
                "worker_run_id": None,
            }
        ],
    )
    assert arkindex_extractor.extract_entities(transcription) == [
        Entity(offset=offset, length=length, label=label)
    ]
# tox configuration for the teklia-dan test environment.
[tox]
envlist = teklia-dan

[testenv]
# Forward the Arkindex schema URL so the mocked API client can resolve it.
passenv = ARKINDEX_API_SCHEMA_URL
# Multi-line ini values must be indented to belong to their key;
# the flattened paste had dropped that indentation.
commands =
    pytest {posargs}
deps =
    pytest
    pytest-responses
    -rrequirements.txt