From fe0f6eb46f57dbd17ca0bc288f8a6521d9f8c166 Mon Sep 17 00:00:00 2001
From: Yoann Schneider <yschneider@teklia.com>
Date: Wed, 4 Oct 2023 09:33:14 +0000
Subject: [PATCH] Add default values for entity separators

---
 dan/datasets/extract/__init__.py |  1 +
 dan/datasets/extract/extract.py  |  4 ++--
 docs/usage/datasets/extract.md   | 34 ++++++++++++++++----------------
 tests/test_extract.py            |  2 +-
 4 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/dan/datasets/extract/__init__.py b/dan/datasets/extract/__init__.py
index efd50c9a..6ea479c6 100644
--- a/dan/datasets/extract/__init__.py
+++ b/dan/datasets/extract/__init__.py
@@ -107,6 +107,7 @@ def add_extract_parser(subcommands) -> None:
             Do not give any arguments to keep the whole text.
         """,
         required=False,
+        default=list(map(validate_char, ("\n", " "))),
     )
     parser.add_argument(
         "--tokens",
diff --git a/dan/datasets/extract/extract.py b/dan/datasets/extract/extract.py
index 89869e6b..440986ec 100644
--- a/dan/datasets/extract/extract.py
+++ b/dan/datasets/extract/extract.py
@@ -63,7 +63,7 @@ class ArkindexExtractor:
         element_type: List[str] = [],
         parent_element_type: str = None,
         output: Path = None,
-        entity_separators: list = [],
+        entity_separators: List[str] = ["\n", " "],
         tokens: Path = None,
         transcription_worker_version: Optional[Union[str, bool]] = None,
         entity_worker_version: Optional[Union[str, bool]] = None,
@@ -389,7 +389,7 @@ def run(
     element_type: List[str],
     parent_element_type: str,
     output: Path,
-    entity_separators: list,
+    entity_separators: List[str],
     tokens: Path,
     train_folder: UUID,
     val_folder: UUID,
diff --git a/docs/usage/datasets/extract.md b/docs/usage/datasets/extract.md
index f050a12f..ae0c9059 100644
--- a/docs/usage/datasets/extract.md
+++ b/docs/usage/datasets/extract.md
@@ -10,23 +10,23 @@ Use the `teklia-dan dataset extract` command to extract a dataset from an Arkind
 
 If an image download fails for whatever reason, it won't appear in the transcriptions file. The reason will be printed to stdout at the end of the process. Before trying to download the image, it checks that it wasn't downloaded previously. It is thus safe to run this command twice if a few images failed.
 
-| Parameter                        | Description                                                                                                                                                                                                                          | Type            | Default                              |
-| -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------- | ------------------------------------ |
-| `database`                       | Path to an Arkindex export database in SQLite format.                                                                                                                                                                                | `Path`          |                                      |
-| `--element-type`                 | Type of the elements to extract. You may specify multiple types.                                                                                                                                                                     | `str`           |                                      |
-| `--parent-element-type`          | Type of the parent element containing the data.                                                                                                                                                                                      | `str`           | `page`                               |
-| `--output`                       | Folder where the data will be generated.                                                                                                                                                                                             | `Path`          |                                      |
-| `--entity-separators`            | Removes all text that does not appear in an entity or in the list of given ordered characters. If several separators follow each other, keep only the first to appear in the list. Do not give any arguments to keep the whole text. | `str`           | (see [dedicated section](#examples)) |
-| `--tokens`                       | Mapping between starting tokens and end tokens to extract text with their entities.                                                                                                                                                  | `Path`          |                                      |
-| `--train-folder`                 | ID of the training folder to import from Arkindex.                                                                                                                                                                                   | `uuid`          |                                      |
-| `--val-folder`                   | ID of the validation folder to import from Arkindex.                                                                                                                                                                                 | `uuid`          |                                      |
-| `--test-folder`                  | ID of the training folder to import from Arkindex.                                                                                                                                                                                   | `uuid`          |                                      |
-| `--transcription-worker-version` | Filter transcriptions by worker_version. Use `manual` for manual filtering.                                                                                                                                                          | `str` or `uuid` |                                      |
-| `--entity-worker-version`        | Filter transcriptions entities by worker_version. Use `manual` for manual filtering                                                                                                                                                  | `str` or `uuid` |                                      |
-| `--max-width`                    | Images larger than this width will be resized to this width.                                                                                                                                                                         | `int`           |                                      |
-| `--max-height`                   | Images larger than this height will be resized to this height.                                                                                                                                                                       | `int`           |                                      |
-| `--keep-spaces`                  | Transcriptions are trimmed by default. Use this flag to disable this behaviour.                                                                                                                                                      | `bool`          | False                                |
-| `--image-format`                 | Images will be saved under this format.                                                                                                                                                                                              | `str`           | `.jpg`                               |
+| Parameter                        | Description                                                                                                                                                                                                                          | Type            | Default                                            |
+| -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | --------------- | -------------------------------------------------- |
+| `database`                       | Path to an Arkindex export database in SQLite format.                                                                                                                                                                                | `Path`          |                                                    |
+| `--element-type`                 | Type of the elements to extract. You may specify multiple types.                                                                                                                                                                     | `str`           |                                                    |
+| `--parent-element-type`          | Type of the parent element containing the data.                                                                                                                                                                                      | `str`           | `page`                                             |
+| `--output`                       | Folder where the data will be generated.                                                                                                                                                                                             | `Path`          |                                                    |
+| `--entity-separators`            | Removes all text that does not appear in an entity or in the list of given ordered characters. If several separators follow each other, keep only the first to appear in the list. Do not give any arguments to keep the whole text. | `str`           | `["\n", " "]` (see [dedicated section](#examples)) |
+| `--tokens`                       | Mapping between starting tokens and end tokens to extract text with their entities.                                                                                                                                                  | `Path`          |                                                    |
+| `--train-folder`                 | ID of the training folder to import from Arkindex.                                                                                                                                                                                   | `uuid`          |                                                    |
+| `--val-folder`                   | ID of the validation folder to import from Arkindex.                                                                                                                                                                                 | `uuid`          |                                                    |
+| `--test-folder`                  | ID of the training folder to import from Arkindex.                                                                                                                                                                                   | `uuid`          |                                                    |
+| `--transcription-worker-version` | Filter transcriptions by worker_version. Use `manual` for manual filtering.                                                                                                                                                          | `str` or `uuid` |                                                    |
+| `--entity-worker-version`        | Filter transcriptions entities by worker_version. Use `manual` for manual filtering                                                                                                                                                  | `str` or `uuid` |                                                    |
+| `--max-width`                    | Images larger than this width will be resized to this width.                                                                                                                                                                         | `int`           |                                                    |
+| `--max-height`                   | Images larger than this height will be resized to this height.                                                                                                                                                                       | `int`           |                                                    |
+| `--keep-spaces`                  | Transcriptions are trimmed by default. Use this flag to disable this behaviour.                                                                                                                                                      | `bool`          | False                                              |
+| `--image-format`                 | Images will be saved under this format.                                                                                                                                                                                              | `str`           | `.jpg`                                             |
 
 The `--tokens` argument expects a YAML-formatted file with a specific format. A list of entries with each entry describing a NER entity. The label of the entity is the key to a dict mapping the starting and ending tokens respectively.
 
diff --git a/tests/test_extract.py b/tests/test_extract.py
index 6e980877..24ec2905 100644
--- a/tests/test_extract.py
+++ b/tests/test_extract.py
@@ -69,7 +69,7 @@ def test_insert_token(text, offset, length, expected):
 
 
 def test_reconstruct_text_no_end_token_error():
-    arkindex_extractor = ArkindexExtractor()
+    arkindex_extractor = ArkindexExtractor(entity_separators=[])
     arkindex_extractor.tokens = {
         "X": EntityType(start="ⓧ"),
     }
-- 
GitLab