diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3703391f03074a9efb507832eeba643a8e901b62..13d006c50a5e63a20844abf496da099f270eb336 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,7 +8,7 @@ repos: rev: 22.6.0 hooks: - id: black - - repo: https://gitlab.com/pycqa/flake8 + - repo: https://github.com/pycqa/flake8 rev: 3.9.2 hooks: - id: flake8 diff --git a/README.md b/README.md index af2f58c55727ef243b41b2a261bc03f5498504f9..e66d25e031e0cb4b79d073a886ad973095721ee4 100644 --- a/README.md +++ b/README.md @@ -104,19 +104,19 @@ The available arguments are | Parameter | Description | Type | Default | | ------------------------------ | ----------------------------------------------------------------------------------- | -------- | ------- | -| `--parent` | UUID of the folder to import from Arkindex. You may specify multiple UUIDs. | str/uuid | | -| `--element-type` | Type of the elements to extract. You may specify multiple types. | str | | -| `--output` | Folder where the data will be generated. Must exist. | Path | | -| `--load-entities` | Extract text with their entities. Needed for NER tasks. | bool | False | -| `--tokens` | Mapping between starting tokens and end tokens. Needed for NER tasks. | Path | | -| `--use-existing-split` | Use the specified folder IDs for the dataset split. | bool | | -| `--train-folder` | ID of the training folder to import from Arkindex. | uuid | | -| `--val-folder` | ID of the validation folder to import from Arkindex. | uuid | | -| `--test-folder` | ID of the training folder to import from Arkindex. | uuid | | -| `--transcription-worker-version` | Filter transcriptions by worker_version. Use ‘manual’ for manual filtering. | str/uuid | | -| `--entity-worker-version` | Filter transcriptions entities by worker_version. 
Use ‘manual’ for manual filtering | str/uuid | | -| `--train-prob` | Training set split size | float | 0,7 | -| `--val-prob` | Validation set split size | float | 0,15 | +| `--parent` | UUID of the folder to import from Arkindex. You may specify multiple UUIDs. | `str/uuid` | | +| `--element-type` | Type of the elements to extract. You may specify multiple types. | `str` | | +| `--output` | Folder where the data will be generated. Must exist. | `Path` | | +| `--load-entities` | Extract text with their entities. Needed for NER tasks. | `bool` | `False` | +| `--tokens` | Mapping between starting tokens and end tokens. Needed for NER tasks. | `Path` | | +| `--use-existing-split` | Use the specified folder IDs for the dataset split. | `bool` | | +| `--train-folder` | ID of the training folder to import from Arkindex. | `uuid` | | +| `--val-folder` | ID of the validation folder to import from Arkindex. | `uuid` | | +| `--test-folder` | ID of the testing folder to import from Arkindex. | `uuid` | | +| `--transcription-worker-version` | Filter transcriptions by worker_version. Use ‘manual’ for manual filtering. | `str/uuid` | | +| `--entity-worker-version` | Filter transcriptions entities by worker_version. Use ‘manual’ for manual filtering. | `str/uuid` | | +| `--train-prob` | Training set split size | `float` | `0.7` | +| `--val-prob` | Validation set split size | `float` | `0.15` | The `--tokens` argument expects a file with the following format. 
```yaml @@ -171,7 +171,7 @@ To use the data from three folders as **training**, **validation** and **testing ```shell teklia-dan extract \ --use-existing-split \ - --train-folder 2275529a-1ec5-40ce-a516-42ea7ada858c + --train-folder 2275529a-1ec5-40ce-a516-42ea7ada858c \ --val-folder af9b38b5-5d95-417d-87ec-730537cb1898 \ --test-folder 6ff44957-0e65-48c5-9d77-a178116405b2 \ --element-type page \ @@ -193,4 +193,3 @@ teklia-dan extract \ #### Synthetic data generation `teklia-dan generate` with multiple arguments - diff --git a/dan/datasets/extract/extract_from_arkindex.py b/dan/datasets/extract/extract_from_arkindex.py index 5c3818be99b27ed43bb803fac02c4c0da4a9995b..b0274f9859c45dbaa50416a8305cb130a885c1dc 100644 --- a/dan/datasets/extract/extract_from_arkindex.py +++ b/dan/datasets/extract/extract_from_arkindex.py @@ -4,17 +4,18 @@ Extract dataset from Arkindex using API. """ -from collections import defaultdict import logging import os import pathlib import random import uuid +from collections import defaultdict import imageio.v2 as iio from arkindex import ArkindexClient, options_from_env from tqdm import tqdm +from dan import logger from dan.datasets.extract.utils import ( insert_token, parse_tokens, @@ -23,9 +24,6 @@ from dan.datasets.extract.utils import ( save_text, ) -from dan import logger - - IMAGES_DIR = "images" # Subpath to the images directory. LABELS_DIR = "labels" # Subpath to the labels directory. MANUAL_SOURCE = "manual" diff --git a/dan/datasets/extract/utils.py b/dan/datasets/extract/utils.py index 4b1aa93f07d042aa2d895083d3590adfa995af75..6d47943fb8fb2ce72b4567348368d9e3e9c28952 100644 --- a/dan/datasets/extract/utils.py +++ b/dan/datasets/extract/utils.py @@ -1,9 +1,9 @@ # -*- coding: utf-8 -*- -import yaml import json import random import cv2 +import yaml random.seed(42)