From a93bc055f54ff4e6cc67e077bf76cd09de567763 Mon Sep 17 00:00:00 2001
From: manonBlanco <blanco@teklia.com>
Date: Tue, 13 Jun 2023 10:06:54 +0200
Subject: [PATCH] Remove the remove_linebreaks constraint from the training
 configuration

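The linebreak-removal constraint is no longer applied when building the
charset or when converting sample labels: "\n" stays in the charset if it
is present, and labels are tokenized as-is. With no remaining uses, the
"constraints" key is also dropped from the default training configuration,
the parameter documentation and the test fixture.

If linebreak removal is still needed, it can be applied to the labels as a
preprocessing step before training. A minimal sketch (hypothetical helper,
not part of this patch) that reproduces the removed behaviour:

    # Hypothetical helper (not part of this patch): mirrors the removed
    # convert_sample_labels logic.
    def remove_linebreaks(label: str) -> str:
        """Replace linebreaks with spaces and collapse double spaces."""
        return label.replace("\n", " ").replace("  ", " ")

    assert remove_linebreaks("line one\nline two") == "line one line two"
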
---
 dan/manager/ocr.py             | 13 ++-----------
 dan/ocr/document/train.py      |  1 -
 docs/usage/train/parameters.md |  1 -
 tests/conftest.py              |  1 -
 4 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/dan/manager/ocr.py b/dan/manager/ocr.py
index 30ca845f..533d074c 100644
--- a/dan/manager/ocr.py
+++ b/dan/manager/ocr.py
@@ -44,11 +44,6 @@ class OCRDatasetManager(DatasetManager):
         for key in datasets.keys():
             with open(os.path.join(datasets[key], "charset.pkl"), "rb") as f:
                 charset = charset.union(set(pickle.load(f)))
-        if (
-            "\n" in charset
-            and "remove_linebreaks" in self.params["config"]["constraints"]
-        ):
-            charset.remove("\n")
         if "" in charset:
             charset.remove("")
         return sorted(list(charset))
@@ -167,13 +162,9 @@ class OCRDataset(GenericDataset):
 
     def convert_sample_labels(self, sample):
         label = sample["label"]
-        if "remove_linebreaks" in self.params["config"]["constraints"]:
-            full_label = label.replace("\n", " ").replace("  ", " ")
-        else:
-            full_label = label
 
-        sample["label"] = full_label
-        sample["token_label"] = token_to_ind(self.charset, full_label)
+        sample["label"] = label
+        sample["token_label"] = token_to_ind(self.charset, label)
         sample["token_label"].append(self.tokens["end"])
         sample["label_len"] = len(sample["token_label"])
         sample["token_label"].insert(0, self.tokens["start"])
diff --git a/dan/ocr/document/train.py b/dan/ocr/document/train.py
index 7d755e71..d0a5fc47 100644
--- a/dan/ocr/document/train.py
+++ b/dan/ocr/document/train.py
@@ -109,7 +109,6 @@ def get_config():
                 "height_divisor": 32,  # Image height will be divided by 32
                 "padding_value": 0,  # Image padding value
                 "padding_token": None,  # Label padding value
-                "constraints": [],
                 "preprocessings": [
                     {
                         "type": "to_RGB",
diff --git a/docs/usage/train/parameters.md b/docs/usage/train/parameters.md
index 9674f06c..8d97ae63 100644
--- a/docs/usage/train/parameters.md
+++ b/docs/usage/train/parameters.md
@@ -18,7 +18,6 @@ All hyperparameters are specified and editable in the training scripts (meaning
 | `dataset_params.config.width_divisor`   | Factor to reduce the height of the feature vector before feeding the decoder.          | `int`        | `32`                                           |
 | `dataset_params.config.padding_value`   | Image padding value.                                                                   | `int`        | `0`                                            |
 | `dataset_params.config.padding_token`   | Transcription padding value.                                                           | `int`        | `None`                                         |
-| `dataset_params.config.constraints`     | Whether to add end-of-transcription and start-of-transcription tokens in labels.       | `list`       | `[]`                                           |
 | `dataset_params.config.preprocessings`  | List of pre-processing functions to apply to input images.                             | `list`       | (see [dedicated section](#data-preprocessing)) |
 | `dataset_params.config.augmentation`    | Configuration for data augmentation.                                                   | `dict`       | (see [dedicated section](#data-augmentation))  |
 
diff --git a/tests/conftest.py b/tests/conftest.py
index fb83a186..e660869c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -72,7 +72,6 @@ def training_config():
                 "height_divisor": 32,  # Image height will be divided by 32
                 "padding_value": 0,  # Image padding value
                 "padding_token": None,  # Label padding value
-                "constraints": [],
                 "preprocessings": [
                     {
                         "type": "to_RGB",
-- 
GitLab