Commit 82d7946e authored by Manon Blanco, committed by Yoann Schneider

Add a new configuration to test the training command

parent 1591b8ef
1 merge request: !301 Add a new configuration to test the training command
{
"dataset": {
"datasets": {
"training": "tests/data/training/training_dataset"
},
"train": {
"name": "training-train",
"datasets": [
["training", "train"]
]
},
"val": {
"training-val": [
["training", "val"]
]
},
"test": {
"training-test": [
["training", "test"]
]
},
"max_char_prediction": 30,
"tokens": null
},
"model": {
"transfered_charset": true,
"additional_tokens": 1,
"encoder": {
"dropout": 0.5,
"nb_layers": 5
},
"h_max": 500,
"w_max": 1000,
"decoder": {
"l_max": 15000,
"dec_num_layers": 8,
"dec_num_heads": 4,
"dec_res_dropout": 0.1,
"dec_pred_dropout": 0.1,
"dec_att_dropout": 0.1,
"dec_dim_feedforward": 256,
"attention_win": 100,
"enc_dim": 256
}
},
"training": {
"data": {
"batch_size": 2,
"load_in_memory": true,
"worker_per_gpu": 4,
"preprocessings": [
{
"type": "max_resize",
"max_width": 2000,
"max_height": 2000
}
],
"augmentation": true
},
"device": {
"use_ddp": false,
"ddp_port": "20027",
"use_amp": true,
"nb_gpu": 0,
"force": "cpu"
},
"metrics": {
"train": [
"loss_ce",
"cer",
"wer",
"wer_no_punct"
],
"eval": [
"cer",
"wer",
"wer_no_punct"
]
},
"validation": {
"eval_on_valid": true,
"eval_on_valid_interval": 2,
"set_name_focus_metric": "training-val"
},
"output_folder": "dan_trained_model",
"gradient_clipping": {},
"max_nb_epochs": 4,
"load_epoch": "last",
"optimizers": {
"all": {
"args": {
"lr": 0.0001,
"amsgrad": false
}
}
},
"lr_schedulers": null,
"label_noise_scheduler": {
"min_error_rate": 0.2,
"max_error_rate": 0.2,
"total_num_steps": 5e4
},
"transfer_learning": null
}
}
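
The test fixture further down in this commit loads this file with `json.loads` and passes it through `update_config`. As a minimal sketch (assuming it is run from the repository root; the override values are purely illustrative), the configuration can also be loaded and tweaked directly:

```python
import json
from pathlib import Path

# Load the shared test configuration shipped with this commit.
config = json.loads(Path("configs/tests.json").read_text())

# Illustrative overrides for a quick local run (hypothetical values).
config["training"]["max_nb_epochs"] = 1
config["training"]["output_folder"] = "tmp_trained_model"

# The dataset path is relative to the repository root.
print(config["dataset"]["datasets"]["training"])
```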
@@ -13,7 +13,9 @@ pip install pre-commit
pre-commit run -a
```
## Run tests
## Tests
### Unit tests
Tests are executed with [tox](https://tox.wiki) using [pytest](https://pytest.org).
@@ -24,6 +26,16 @@ tox
To recreate the tox virtual environment (e.g. after a dependency update), you may run `tox -r`.
### Commands
As unit tests do not cover *everything*, it is sometimes necessary to run DAN commands directly to test new developments.
The library already includes all the documents needed to run the [training command](../usage/train/index.md) on a minimal dataset. You can use the configuration available at `configs/tests.json`, which is already populated with the parameters used in the unit tests.
```shell
teklia-dan train --config configs/tests.json
```
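
If you want to run the same command on your own data, one option (not part of this commit) is to derive a local configuration from `configs/tests.json`, changing only the dataset path and output folder; the paths below are placeholders:

```python
import json
from pathlib import Path

# Start from the test configuration and point it at your own dataset.
config = json.loads(Path("configs/tests.json").read_text())
config["dataset"]["datasets"]["training"] = "path/to/your/dataset"  # placeholder
config["training"]["output_folder"] = "my_trained_model"  # placeholder

# Write the derived configuration next to the original one.
Path("my_config.json").write_text(json.dumps(config, indent=4))
```

You can then pass `my_config.json` to `teklia-dan train --config`.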
## Documentation
This documentation uses [Sphinx](http://www.sphinx-doc.org/)-style docstrings and is generated with [MkDocs](https://mkdocs.org/) and [mkdocstrings](https://mkdocstrings.github.io/).
@@ -184,112 +184,7 @@ def mock_database(tmp_path_factory):
@pytest.fixture
def training_config():
config = {
"dataset": {
"datasets": {
"training": str(FIXTURES / "training" / "training_dataset"),
},
"train": {
"name": "training-train",
"datasets": [
("training", "train"),
],
},
"val": {
"training-val": [
("training", "val"),
],
},
"test": {
"training-test": [
("training", "test"),
],
},
"max_char_prediction": 30, # max number of token prediction
"tokens": None,
},
"model": {
"transfered_charset": True, # Transfer learning of the decision layer based on charset of the line HTR model
"additional_tokens": 1, # for decision layer = [<eot>, ], only for transferred charset
"encoder": {
"dropout": 0.5, # dropout rate for encoder
"nb_layers": 5, # encoder
},
"h_max": 500, # maximum height for encoder output (for 2D positional embedding)
"w_max": 1000, # maximum width for encoder output (for 2D positional embedding)
"decoder": {
"l_max": 15000, # max predicted sequence (for 1D positional embedding)
"dec_num_layers": 8, # number of transformer decoder layers
"dec_num_heads": 4, # number of heads in transformer decoder layers
"dec_res_dropout": 0.1, # dropout in transformer decoder layers
"dec_pred_dropout": 0.1, # dropout rate before decision layer
"dec_att_dropout": 0.1, # dropout rate in multi head attention
"dec_dim_feedforward": 256, # number of dimension for feedforward layer in transformer decoder layers
"attention_win": 100, # length of attention window
"enc_dim": 256, # dimension of extracted features
},
},
"training": {
"data": {
"batch_size": 2, # mini-batch size for training
"load_in_memory": True, # Load all images in CPU memory
"worker_per_gpu": 4, # Num of parallel processes per gpu for data loading
"preprocessings": [
{
"type": "max_resize",
"max_width": 2000,
"max_height": 2000,
}
],
"augmentation": True,
},
"device": {
"use_ddp": False, # Use DistributedDataParallel
"ddp_port": "20027",
"use_amp": True, # Enable automatic mix-precision
"nb_gpu": 0,
"force": "cpu", # `cpu` for debug purposes
},
"metrics": {
"train": [
"loss_ce",
"cer",
"wer",
"wer_no_punct",
], # Metrics name for training
"eval": [
"cer",
"wer",
"wer_no_punct",
], # Metrics name for evaluation on validation set during training
},
"validation": {
"eval_on_valid": True, # Whether to eval and logs metrics on validation set during training or not
"eval_on_valid_interval": 2, # Interval (in epochs) to evaluate during training
"set_name_focus_metric": "training-val",
},
"output_folder": "dan_trained_model", # folder name for checkpoint and results
"gradient_clipping": {},
"max_nb_epochs": 4, # maximum number of epochs before to stop
"load_epoch": "last", # ["best", "last"]: last to continue training, best to evaluate
"optimizers": {
"all": {
"args": {
"lr": 0.0001,
"amsgrad": False,
},
},
},
"lr_schedulers": None, # Learning rate schedulers
# Keep teacher forcing rate to 20% during whole training
"label_noise_scheduler": {
"min_error_rate": 0.2,
"max_error_rate": 0.2,
"total_num_steps": 5e4,
},
"transfer_learning": None,
},
}
config = json.loads((FIXTURES.parent.parent / "configs" / "tests.json").read_text())
update_config(config)
return config
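
As a hypothetical usage example (not part of this commit, and assuming `update_config` leaves these fields untouched), a test can request the fixture and override values locally:

```python
def test_training_config_defaults(training_config):
    # Values taken directly from configs/tests.json.
    assert training_config["training"]["device"]["force"] == "cpu"
    assert training_config["dataset"]["max_char_prediction"] == 30

    # Per-test override, e.g. to keep runs short.
    training_config["training"]["max_nb_epochs"] = 1
```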