Commit fc8cfef4 authored by Yoann Schneider

Merge branch 'remove-files-of-old-repo' of gitlab.com:teklia/atr/dan into remove-files-of-old-repo

parents 4650fed2 a09db0f5
@@ -8,7 +8,7 @@ repos:
     rev: 22.6.0
     hooks:
       - id: black
-  - repo: https://gitlab.com/pycqa/flake8
+  - repo: https://github.com/pycqa/flake8
     rev: 3.9.2
     hooks:
       - id: flake8
@@ -104,19 +104,19 @@ The available arguments are

 | Parameter                        | Description                                                                         | Type       | Default |
 | -------------------------------- | ----------------------------------------------------------------------------------- | ---------- | ------- |
-| `--parent`                       | UUID of the folder to import from Arkindex. You may specify multiple UUIDs.         | str/uuid   |         |
-| `--element-type`                 | Type of the elements to extract. You may specify multiple types.                    | str        |         |
-| `--output`                       | Folder where the data will be generated. Must exist.                                | Path       |         |
-| `--load-entities`                | Extract text with their entities. Needed for NER tasks.                             | bool       | False   |
-| `--tokens`                       | Mapping between starting tokens and end tokens. Needed for NER tasks.               | Path       |         |
-| `--use-existing-split`           | Use the specified folder IDs for the dataset split.                                 | bool       |         |
-| `--train-folder`                 | ID of the training folder to import from Arkindex.                                  | uuid       |         |
-| `--val-folder`                   | ID of the validation folder to import from Arkindex.                                | uuid       |         |
-| `--test-folder`                  | ID of the training folder to import from Arkindex.                                  | uuid       |         |
-| `--transcription-worker-version` | Filter transcriptions by worker_version. Use ‘manual’ for manual filtering.         | str/uuid   |         |
-| `--entity-worker-version`        | Filter transcriptions entities by worker_version. Use ‘manual’ for manual filtering | str/uuid   |         |
-| `--train-prob`                   | Training set split size                                                             | float      | 0,7     |
-| `--val-prob`                     | Validation set split size                                                           | float      | 0,15    |
+| `--parent`                       | UUID of the folder to import from Arkindex. You may specify multiple UUIDs.         | `str/uuid` |         |
+| `--element-type`                 | Type of the elements to extract. You may specify multiple types.                    | `str`      |         |
+| `--output`                       | Folder where the data will be generated. Must exist.                                | `Path`     |         |
+| `--load-entities`                | Extract text with their entities. Needed for NER tasks.                             | `bool`     | `False` |
+| `--tokens`                       | Mapping between starting tokens and end tokens. Needed for NER tasks.               | `Path`     |         |
+| `--use-existing-split`           | Use the specified folder IDs for the dataset split.                                 | `bool`     |         |
+| `--train-folder`                 | ID of the training folder to import from Arkindex.                                  | `uuid`     |         |
+| `--val-folder`                   | ID of the validation folder to import from Arkindex.                                | `uuid`     |         |
+| `--test-folder`                  | ID of the testing folder to import from Arkindex.                                   | `uuid`     |         |
+| `--transcription-worker-version` | Filter transcriptions by worker_version. Use `manual` for manual filtering.         | `str/uuid` |         |
+| `--entity-worker-version`        | Filter transcription entities by worker_version. Use `manual` for manual filtering. | `str/uuid` |         |
+| `--train-prob`                   | Training set split proportion.                                                      | `float`    | `0.7`   |
+| `--val-prob`                     | Validation set split proportion.                                                    | `float`    | `0.15`  |

 The `--tokens` argument expects a file with the following format.

 ```yaml
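# NOTE: the body of this YAML example was collapsed by the diff viewer
# and is not part of this commit. A hypothetical sketch of a start/end
# token mapping (entity names and token characters are assumptions):
PERSON:
  start: Ⓟ
  end: Ⓠ
DATE:
  start: Ⓓ
  end: Ⓔ
```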
@@ -171,7 +171,7 @@ To use the data from three folders as **training**, **validation** and **testing
 ```shell
 teklia-dan extract \
     --use-existing-split \
-    --train-folder 2275529a-1ec5-40ce-a516-42ea7ada858c
+    --train-folder 2275529a-1ec5-40ce-a516-42ea7ada858c \
     --val-folder af9b38b5-5d95-417d-87ec-730537cb1898 \
     --test-folder 6ff44957-0e65-48c5-9d77-a178116405b2 \
     --element-type page \
@@ -193,4 +193,3 @@ teklia-dan extract \
 #### Synthetic data generation

 `teklia-dan generate` with multiple arguments
-
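The `--train-prob`/`--val-prob` options documented above split the dataset by proportion. A minimal, self-contained sketch of a deterministic version of such a split (the function name and seeding strategy are assumptions, not the project's actual implementation):

```python
import random

def split_dataset(ids, train_prob=0.7, val_prob=0.15, seed=42):
    """Deterministically shuffle the IDs, then cut train/val/test slices."""
    rng = random.Random(seed)          # fixed seed -> reproducible split
    ids = sorted(ids)                  # order-independent input
    rng.shuffle(ids)
    n_train = int(len(ids) * train_prob)
    n_val = int(len(ids) * val_prob)
    return (
        ids[:n_train],                 # training set
        ids[n_train:n_train + n_val],  # validation set
        ids[n_train + n_val:],         # remainder is the test set
    )

train, val, test = split_dataset([f"page-{i:03d}" for i in range(100)])
print(len(train), len(val), len(test))  # 70 15 15
```

Sorting before shuffling makes the split independent of the order in which elements were fetched from the API, so repeated extractions yield the same partition.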
@@ -4,17 +4,18 @@
 Extract dataset from Arkindex using API.
 """

-from collections import defaultdict
 import logging
 import os
 import pathlib
 import random
 import uuid
+from collections import defaultdict

 import imageio.v2 as iio
 from arkindex import ArkindexClient, options_from_env
 from tqdm import tqdm

+from dan import logger
 from dan.datasets.extract.utils import (
     insert_token,
     parse_tokens,
@@ -23,9 +24,6 @@ from dan.datasets.extract.utils import (
     save_text,
 )

-
-from dan import logger
-
 IMAGES_DIR = "images"  # Subpath to the images directory.
 LABELS_DIR = "labels"  # Subpath to the labels directory.
 MANUAL_SOURCE = "manual"
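The `IMAGES_DIR`/`LABELS_DIR` constants suggest the on-disk layout of the extracted data. A minimal sketch of how such output paths might be assembled (the split subfolder and file extensions are assumptions, not taken from this commit):

```python
import pathlib

IMAGES_DIR = "images"  # Subpath to the images directory.
LABELS_DIR = "labels"  # Subpath to the labels directory.

def output_paths(output: pathlib.Path, split: str, element_id: str):
    """Build the image and label file paths for one extracted element."""
    image = output / IMAGES_DIR / split / f"{element_id}.png"
    label = output / LABELS_DIR / split / f"{element_id}.txt"
    return image, label

image, label = output_paths(pathlib.Path("data"), "train", "example-id")
print(image.as_posix())  # data/images/train/example-id.png
```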
 # -*- coding: utf-8 -*-
-import yaml
 import json
 import random
 import cv2
+import yaml

 random.seed(42)