diff --git a/dan/datasets/extract/__init__.py b/dan/datasets/extract/__init__.py
index 232f445887cf2197997668ba32d5c68ffdfea011..82b82a754ec0dfc56a8dc7ac5377e97029b5d7b5 100644
--- a/dan/datasets/extract/__init__.py
+++ b/dan/datasets/extract/__init__.py
@@ -90,7 +90,7 @@ def add_extract_parser(subcommands) -> None:
     parser.add_argument(
         "--only-entities",
         action="store_true",
-        help="Extract text with their entities and remove all text that does not belong to the tokens.",
+        help="Remove all text that does not belong to the tokens.",
     )
     parser.add_argument(
         "--allow-unknown-entities",
diff --git a/docs/usage/datasets/extract.md b/docs/usage/datasets/extract.md
index edced49fb0d97346f816aff36316eee117962643..fb65fcbbb77e51e7d2e5a6e839b1bd9e04361f0d 100644
--- a/docs/usage/datasets/extract.md
+++ b/docs/usage/datasets/extract.md
@@ -12,6 +12,7 @@ Use the `teklia-dan dataset extract` command to extract a dataset from an Arkind
 | `--parent-element-type` | Type of the parent element containing the data. | `str` | `page` |
 | `--output` | Folder where the data will be generated. | `Path` | |
 | `--load-entities` | Extract text with their entities. Needed for NER tasks. | `bool` | `False` |
+| `--only-entities` | Remove all text that does not belong to the tokens. | `bool` | `False` |
 | `--allow-unknown-entities` | Ignore entities that do not appear in the list of tokens. | `bool` | `False` |
 | `--tokens` | Mapping between starting tokens and end tokens. Needed for NER tasks. | `Path` | |
 | `--use-existing-split` | Use the specified folder IDs for the dataset split. | `bool` | |
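
For reviewers, here is a minimal standalone sketch of what the `--only-entities` behaviour amounts to, based only on the new help text ("Remove all text that does not belong to the tokens"): every span not enclosed by a start token and its matching end token (the mapping otherwise supplied via `--tokens`) is dropped. The token characters, the sample sentence, and the helper name below are illustrative assumptions, not `dan`'s actual implementation.

```python
# Hypothetical, standalone illustration of the --only-entities behaviour
# (an assumption about the flag's effect, not dan's implementation).
# Entities are delimited by a start token and its matching end token,
# mirroring the mapping normally passed via --tokens.

TOKENS = {"Ⓟ": "Ⓞ"}  # example mapping: start token -> end token (placeholder values)


def keep_only_entities(text: str, tokens: dict) -> str:
    """Return only the spans delimited by a start token and its end token."""
    kept = []
    i = 0
    while i < len(text):
        start = text[i]
        if start in tokens:
            end = text.find(tokens[start], i + 1)
            if end != -1:
                # Keep the entity text together with its surrounding tokens.
                kept.append(text[i : end + 1])
                i = end + 1
                continue
        i += 1
    return " ".join(kept)


print(keep_only_entities("Ⓟ Paris Ⓞ was visited in spring", TOKENS))
# -> "Ⓟ Paris Ⓞ"
```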