Skip to content
Snippets Groups Projects

Filter entities by name when extracting data from Arkindex

Merged Manon Blanco requested to merge allow-unknown-entities into main
1 file
+ 7
4
Compare changes
  • Side-by-side
  • Inline
+ 7
4
@@ -170,9 +170,12 @@ def test_reconstruct_text_joined_entities(entity_separators, text_before, text_a
)
@pytest.mark.parametrize("joined", (True, False))
@pytest.mark.parametrize("text_before", ("", "text before "))
@pytest.mark.parametrize("text_after", ("", " text after"))
def test_reconstruct_text_only_start_token(text_before, text_after):
def test_reconstruct_text_only_start_token(joined, text_before, text_after):
separator = " " if not joined else ""
arkindex_extractor = ArkindexExtractor(entity_separators=[" ", "\n"])
arkindex_extractor.tokens = {
"P": EntityType(start="ⓟ"),
@@ -180,7 +183,7 @@ def test_reconstruct_text_only_start_token(text_before, text_after):
}
assert (
arkindex_extractor.reconstruct_text(
text_before + "LouisXIV" + text_after,
text_before + "Louis" + separator + "XIV" + text_after,
[
Entity(
offset=0 + len(text_before),
@@ -189,12 +192,12 @@ def test_reconstruct_text_only_start_token(text_before, text_after):
value="Louis",
),
Entity(
offset=5 + len(text_before),
offset=5 + len(separator) + len(text_before),
length=3,
type="I",
value="XIV",
),
],
)
== "ⓟLouisⓘXIV"
== "ⓟLouis" + separator + "ⓘXIV"
)
Loading