diff --git a/tests/conftest.py b/tests/conftest.py index 50f1348c60268206f780a8188d9754429ebd73e9..53bf4aae33aae318169e0539df533a80e369c01f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -101,9 +101,7 @@ def mock_database(tmp_path_factory): element_json = json.loads(element_path.read_text()) element_type = element_json["type"] - image_path = ( - FIXTURES / "extraction" / "images" / element_type / id - ).with_suffix(".jpg") + image_path = (FIXTURES / "extraction" / "images" / id).with_suffix(".jpg") polygon = element_json.get("polygon") # Always use page images because polygons are based on the full image diff --git a/tests/data/entities.yml b/tests/data/entities.yml index 4bf7513a40cbeec0b6ce5367fcf0925848277d79..a10f027eaa31efa33796d0f678d096c09e003a41 100644 --- a/tests/data/entities.yml +++ b/tests/data/entities.yml @@ -4,7 +4,7 @@ entities: - Arkindex's entity - Person /!\ - adj -- birthdate +- age - firstname - fullname - name diff --git a/tests/data/extraction/elements/test-page_1-line_1.json b/tests/data/extraction/elements/test-page_1-line_1.json index 716bf50aa37442665d7ebc7c9246ac3b0bf01739..de3fb8f02e86a6263d6835eb47047bf98130935f 100644 --- a/tests/data/extraction/elements/test-page_1-line_1.json +++ b/tests/data/extraction/elements/test-page_1-line_1.json @@ -1,27 +1,27 @@ { "type": "text_line", "polygon": [ - [37, 191], - [37, 339], - [767, 339], - [767, 191], - [37, 191] + [0, 0], + [0, 83], + [1900, 83], + [1900, 0], + [0, 0] ], "transcription_entities": [ { - "name": "Coupez", + "name": "Leunaut", "type": "surname", "offset": 0 }, { - "name": "Bouis", + "name": "Claude", "type": "firstname", - "offset": 7 + "offset": 8 }, { - "name": "7.12.14", - "type": "birthdate", - "offset": 13 + "name": "49", + "type": "age", + "offset": 15 } ] } diff --git a/tests/data/extraction/elements/test-page_1-line_2.json b/tests/data/extraction/elements/test-page_1-line_2.json index c3d79c187b6879135757cea304b1fb78cdf9be49..bbc58881db63b48d2d00abaef877cd4ae200bc24 100644 --- a/tests/data/extraction/elements/test-page_1-line_2.json +++ b/tests/data/extraction/elements/test-page_1-line_2.json @@ -1,27 +1,27 @@ { "type": "text_line", "polygon": [ - [28, 339], - [28, 464], - [767, 464], - [767, 339], - [28, 339] + [0, 0], + [0, 83], + [1900, 83], + [1900, 0], + [0, 0] ], "transcription_entities": [ { - "name": "Poutrain", + "name": "Bauracho", "type": "surname", "offset": 0 }, { - "name": "Adolphe", + "name": "Claudine", "type": "firstname", "offset": 9 }, { - "name": "9.4.13", - "type": "birthdate", - "offset": 17 + "name": "39", + "type": "age", + "offset": 18 } ] } diff --git a/tests/data/extraction/elements/test-page_1-line_3.json b/tests/data/extraction/elements/test-page_1-line_3.json index 78fbe7865e47e062dc7b2b9682edc337b3294e53..61a71912e665fecc77ce032623654e63b91c888f 100644 --- a/tests/data/extraction/elements/test-page_1-line_3.json +++ b/tests/data/extraction/elements/test-page_1-line_3.json @@ -1,26 +1,26 @@ { "type": "text_line", "polygon": [ - [28, 464], - [28, 614], - [767, 614], - [767, 464], - [28, 464] + [0, 0], + [0, 83], + [1900, 83], + [1900, 0], + [0, 0] ], "transcription_entities": [ { - "name": "Gabale", + "name": "Laurent", "type": "surname", "offset": 0 }, { - "name": "Français", + "name": "Jacquse", "type": "firstname", - "offset": 7 + "offset": 8 }, { - "name": "26.3.11", - "type": "birthdate", + "name": "21", + "type": "age", "offset": 16 } ] diff --git a/tests/data/extraction/elements/test-page_2-line_1.json b/tests/data/extraction/elements/test-page_2-line_1.json index 9d5c131b90c8b1bd52cd6e365899c8f43afb070e..dfa3554e4e5037b6846dcc01d5d3e88299f52b86 100644 --- a/tests/data/extraction/elements/test-page_2-line_1.json +++ b/tests/data/extraction/elements/test-page_2-line_1.json @@ -1,27 +1,27 @@ { "type": "text_line", "polygon": [ - [14, 199], - [14, 330], - [767, 330], - [767, 199], - [14, 199] + [0, 0], + [0, 83], + [1900, 83], + [1900, 0], + [0, 0] ], "transcription_entities": [ { - "name": "Durosoy", + "name": "Valette", "type": "surname", "offset": 0 }, { - "name": "Bouis", + "name": "Elisabeth", "type": "firstname", "offset": 8 }, { - "name": "22-4-18", - "type": "birthdate", - "offset": 14 + "name": "76", + "type": "age", + "offset": 18 } ] } diff --git a/tests/data/extraction/elements/test-page_2-line_2.json b/tests/data/extraction/elements/test-page_2-line_2.json index 9eda1ee8b10d4e98be32548b1d3b30caa3c3ce98..af3d79cec4b43a3fb1709c61adff61ae0502c02c 100644 --- a/tests/data/extraction/elements/test-page_2-line_2.json +++ b/tests/data/extraction/elements/test-page_2-line_2.json @@ -1,27 +1,27 @@ { "type": "text_line", "polygon": [ - [16, 330], - [16, 471], - [765, 471], - [765, 330], - [16, 330] + [0, 0], + [0, 84], + [1900, 84], + [1900, 0], + [0, 0] ], "transcription_entities": [ { - "name": "Colaiani", + "name": "Tanbol", "type": "surname", "offset": 0 }, { - "name": "Angels", + "name": "Jean", "type": "firstname", - "offset": 9 + "offset": 7 }, { - "name": "28.11.17", - "type": "birthdate", - "offset": 16 + "name": "76", + "type": "age", + "offset": 12 } ] } diff --git a/tests/data/extraction/elements/test-page_2-line_3.json b/tests/data/extraction/elements/test-page_2-line_3.json index e372adcd8a65c853c8f1ced78d9f352003fda4a4..52c5b510eb7e3ab6fc8a258d53de69433b96b061 100644 --- a/tests/data/extraction/elements/test-page_2-line_3.json +++ b/tests/data/extraction/elements/test-page_2-line_3.json @@ -1,27 +1,27 @@ { "type": "text_line", "polygon": [ - [11, 473], - [11, 598], - [772, 598], - [772, 473], - [11, 473] + [0, 0], + [0, 83], + [1900, 83], + [1900, 0], + [0, 0] ], "transcription_entities": [ { - "name": "Renouard", + "name": "Vauret", "type": "surname", "offset": 0 }, { - "name": "Maurice", + "name": "Jean", "type": "firstname", - "offset": 9 + "offset": 7 }, { - "name": "25.7.04", - "type": "birthdate", - "offset": 17 + "name": "64", + "type": "age", + "offset": 12 } ] } diff --git a/tests/data/extraction/elements/train-page_1-line_1.json b/tests/data/extraction/elements/train-page_1-line_1.json index 907dd2a040b31fb96997a9aff856cf7d0a77ad1a..4a39286284ab35f1f2691a1eead776468b982269 100644 --- a/tests/data/extraction/elements/train-page_1-line_1.json +++ b/tests/data/extraction/elements/train-page_1-line_1.json @@ -1,27 +1,27 @@ { "type": "text_line", "polygon": [ - [27, 187], - [27, 327], - [754, 327], - [754, 187], - [27, 187] + [0, 0], + [0, 84], + [1900, 84], + [1900, 0], + [0, 0] ], "transcription_entities": [ { - "name": "Caillet", + "name": "Laulont", "type": "surname", "offset": 0 }, { - "name": "Maurice", + "name": "Francois", "type": "firstname", "offset": 8 }, { - "name": "28.9.06", - "type": "birthdate", - "offset": 16 + "name": "8", + "type": "age", + "offset": 17 } ] } diff --git a/tests/data/extraction/elements/train-page_1-line_2.json b/tests/data/extraction/elements/train-page_1-line_2.json index 080806424555c66cce1af9a38ed064d3332795cf..cdc9ec5a754ccb84d2fc8e5ec6286a4780da5502 100644 --- a/tests/data/extraction/elements/train-page_1-line_2.json +++ b/tests/data/extraction/elements/train-page_1-line_2.json @@ -1,27 +1,27 @@ { "type": "text_line", "polygon": [ - [28, 328], - [28, 465], - [755, 465], - [755, 328], - [28, 328] + [0, 0], + [0, 84], + [1900, 84], + [1900, 0], + [0, 0] ], "transcription_entities": [ { - "name": "Reboul", + "name": "Ciret", "type": "surname", "offset": 0 }, { - "name": "Jean", + "name": "Antoine", "type": "firstname", - "offset": 7 + "offset": 6 }, { - "name": "30.9.02", - "type": "birthdate", - "offset": 12 + "name": "27", + "type": "age", + "offset": 14 } ] } diff --git a/tests/data/extraction/elements/train-page_1-line_3.json b/tests/data/extraction/elements/train-page_1-line_3.json index 63b38d499ba311497acd8100454c9d1074cd478f..70387cc1590e49cbaff2032982d749239e4fdb44 100644 --- a/tests/data/extraction/elements/train-page_1-line_3.json +++ b/tests/data/extraction/elements/train-page_1-line_3.json @@ -1,27 +1,27 @@ { "type": "text_line", "polygon": [ - [23, 463], - [23, 604], - [803, 604], - [803, 463], - [23, 463] + [0, 0], + [0, 85], + [1900, 85], + [1900, 0], + [0, 0] ], "transcription_entities": [ { - "name": "Bareyre", + "name": "Ciret", "type": "surname", "offset": 0 }, { - "name": "Jean", + "name": "Marie", "type": "firstname", - "offset": 8 + "offset": 6 }, { - "name": "28.3.11", - "type": "birthdate", - "offset": 13 + "name": "28", + "type": "age", + "offset": 12 } ] } diff --git a/tests/data/extraction/elements/train-page_1-line_4.json b/tests/data/extraction/elements/train-page_1-line_4.json index eb348b19e2bd57869d73a89778fb1d1e9879c221..0efa34f9a517785a473f7d63b28a8154bf793f6b 100644 --- a/tests/data/extraction/elements/train-page_1-line_4.json +++ b/tests/data/extraction/elements/train-page_1-line_4.json @@ -1,26 +1,26 @@ { "type": "text_line", "polygon": [ - [21, 604], - [21, 743], - [812, 743], - [812, 604], - [21, 604] + [0, 0], + [0, 84], + [1900, 84], + [1900, 0], + [0, 0] ], "transcription_entities": [ { - "name": "Roussy", + "name": "Ciret", "type": "surname", "offset": 0 }, { - "name": "Jean", + "name": "Marie", "type": "firstname", - "offset": 7 + "offset": 6 }, { - "name": "4.11.14", - "type": "birthdate", + "name": "2", + "type": "age", "offset": 12 } ] diff --git a/tests/data/extraction/elements/train-page_2-line_1.json b/tests/data/extraction/elements/train-page_2-line_1.json index e7f6663b4faeb70b677f67332d44c2e28d6f605e..586d9492bac257406fb101996547e777e396d900 100644 --- a/tests/data/extraction/elements/train-page_2-line_1.json +++ b/tests/data/extraction/elements/train-page_2-line_1.json @@ -1,27 +1,27 @@ { "type": "text_line", "polygon": [ - [18, 197], - [18, 340], - [751, 340], - [751, 197], - [18, 197] + [0, 0], + [0, 83], + [1900, 83], + [1900, 0], + [0, 0] ], "transcription_entities": [ { - "name": "Marin", + "name": "Eureston", "type": "surname", "offset": 0 }, { - "name": "Marcel", + "name": "Solange", "type": "firstname", - "offset": 6 + "offset": 9 }, { - "name": "10.8.06", - "type": "birthdate", - "offset": 13 + "name": "10", + "type": "age", + "offset": 17 } ] } diff --git a/tests/data/extraction/elements/train-page_2-line_2.json b/tests/data/extraction/elements/train-page_2-line_2.json index bc6829dab843ea9b68a2bbdac8df91c575ebc96d..450c1ebc67f95b4d5b855752d59cb95fc82c53a2 100644 --- a/tests/data/extraction/elements/train-page_2-line_2.json +++ b/tests/data/extraction/elements/train-page_2-line_2.json @@ -1,27 +1,27 @@ { "type": "text_line", "polygon": [ - [18, 340], - [18, 476], - [751, 476], - [751, 340], - [18, 340] + [0, 0], + [0, 83], + [1900, 83], + [1900, 0], + [0, 0] ], "transcription_entities": [ { - "name": "Amical", + "name": "Terontussieux", "type": "surname", "offset": 0 }, { - "name": "Eloi", + "name": "Jean", "type": "firstname", - "offset": 7 + "offset": 14 }, { - "name": "11.10.04", - "type": "birthdate", - "offset": 12 + "name": "2", + "type": "age", + "offset": 19 } ] } diff --git a/tests/data/extraction/elements/train-page_2-line_3.json b/tests/data/extraction/elements/train-page_2-line_3.json index 90432163e7449c6c228eb25d1024fc23eec988ca..fd3390582ffda15e92ed1673c469f114f74cf7e3 100644 --- a/tests/data/extraction/elements/train-page_2-line_3.json +++ b/tests/data/extraction/elements/train-page_2-line_3.json @@ -1,27 +1,27 @@ { "type": "text_line", "polygon": [ - [21, 476], - [21, 615], - [746, 615], - [746, 476], - [21, 476] + [0, 0], + [0, 84], + [1900, 84], + [1900, 0], + [0, 0] ], "transcription_entities": [ { - "name": "Biros", + "name": "Pressonet", "type": "surname", "offset": 0 }, { - "name": "Mael", + "name": "Marie", "type": "firstname", - "offset": 6 + "offset": 10 }, { - "name": "30.10.10", - "type": "birthdate", - "offset": 11 + "name": "12", + "type": "age", + "offset": 16 } ] } diff --git a/tests/data/extraction/elements/val-page_1-line_1.json b/tests/data/extraction/elements/val-page_1-line_1.json index d255644499a7858d72b4e636f0471b61abd19c93..be1ab12473937cc5f848340d5d8a30465c4c19e5 100644 --- a/tests/data/extraction/elements/val-page_1-line_1.json +++ b/tests/data/extraction/elements/val-page_1-line_1.json @@ -1,27 +1,27 @@ { "type": "text_line", "polygon": [ - [14, 211], - [14, 347], - [755, 347], - [755, 211], - [14, 211] + [0, 0], + [0, 84], + [1900, 84], + [1900, 0], + [0, 0] ], "transcription_entities": [ { - "name": "Monard", + "name": "Ciraud", "type": "surname", "offset": 0 }, { - "name": "Bouis", + "name": "Antoine", "type": "firstname", "offset": 7 }, { - "name": "29-7-04", - "type": "birthdate", - "offset": 13 + "name": "34", + "type": "age", + "offset": 15 } ] } diff --git a/tests/data/extraction/elements/val-page_1-line_2.json b/tests/data/extraction/elements/val-page_1-line_2.json index 633ccf9b3047dc85bab971adb0dafdc03765d044..a51724d6cff364e7fc4bf158408d6acc641006e0 100644 --- a/tests/data/extraction/elements/val-page_1-line_2.json +++ b/tests/data/extraction/elements/val-page_1-line_2.json @@ -1,26 +1,26 @@ { "type": "text_line", "polygon": [ - [14, 350], - [14, 484], - [748, 484], - [748, 350], - [14, 350] + [0, 0], + [0, 84], + [1900, 84], + [1900, 0], + [0, 0] ], "transcription_entities": [ { - "name": "Astier", + "name": "Ciraud", "type": "surname", "offset": 0 }, { - "name": "Arthur", + "name": "Priser", "type": "firstname", "offset": 7 }, { - "name": "11-2-13", - "type": "birthdate", + "name": "34", + "type": "age", "offset": 14 } ] diff --git a/tests/data/extraction/elements/val-page_1-line_3.json b/tests/data/extraction/elements/val-page_1-line_3.json index 951431c82753e689558577571a84fcf396cf903c..fd868e76abd506b1f0b2ce4cb19198a90e21bad1 100644 --- a/tests/data/extraction/elements/val-page_1-line_3.json +++ b/tests/data/extraction/elements/val-page_1-line_3.json @@ -1,26 +1,26 @@ { "type": "text_line", "polygon": [ - [11, 484], - [11, 622], - [751, 622], - [751, 484], - [11, 484] + [0, 0], + [0, 84], + [1900, 84], + [1900, 0], + [0, 0] ], "transcription_entities": [ { - "name": "De Vlieger", + "name": "Ciraud", "type": "surname", "offset": 0 }, { - "name": "Jules", + "name": "Elisabeth", "type": "firstname", - "offset": 11 + "offset": 7 }, { - "name": "21-11-11", - "type": "birthdate", + "name": "34", + "type": "age", "offset": 17 } ] diff --git a/tests/data/extraction/images/double_page/test-page_1.jpg b/tests/data/extraction/images/double_page/test-page_1.jpg deleted file mode 100644 index bb873b5cf0dba6d7cf4d81b388799e6079d5d0ac..0000000000000000000000000000000000000000 Binary files a/tests/data/extraction/images/double_page/test-page_1.jpg and /dev/null differ diff --git a/tests/data/extraction/images/double_page/test-page_2.jpg b/tests/data/extraction/images/double_page/test-page_2.jpg deleted file mode 100644 index d5282347582f881a5bdc1071638afe9403e26689..0000000000000000000000000000000000000000 Binary files a/tests/data/extraction/images/double_page/test-page_2.jpg and /dev/null differ diff --git a/tests/data/extraction/images/double_page/train-page_1.jpg b/tests/data/extraction/images/double_page/train-page_1.jpg deleted file mode 100644 index ee1a8d546f9633930b51e105bc77bfd7d0524fd6..0000000000000000000000000000000000000000 Binary files a/tests/data/extraction/images/double_page/train-page_1.jpg and /dev/null differ diff --git a/tests/data/extraction/images/double_page/train-page_2.jpg b/tests/data/extraction/images/double_page/train-page_2.jpg deleted file mode 100644 index 6f9b6499f240e00c67e6204c9e6453dc70d684fe..0000000000000000000000000000000000000000 Binary files a/tests/data/extraction/images/double_page/train-page_2.jpg and /dev/null differ diff --git a/tests/data/extraction/images/double_page/val-page_1.jpg b/tests/data/extraction/images/double_page/val-page_1.jpg deleted file mode 100644 index 07a1912cd67099f64e052cbc1a4e89ebb3a83cce..0000000000000000000000000000000000000000 Binary files a/tests/data/extraction/images/double_page/val-page_1.jpg and /dev/null differ diff --git a/tests/data/extraction/images/test-page_1-line_1.jpg b/tests/data/extraction/images/test-page_1-line_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a2c1e672403f61c530782d644dc6af1c1de19e0a Binary files /dev/null and b/tests/data/extraction/images/test-page_1-line_1.jpg differ diff --git a/tests/data/extraction/images/test-page_1-line_2.jpg b/tests/data/extraction/images/test-page_1-line_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9df6239780ac6cde37c3c7327f7183ea481eb02b Binary files /dev/null and b/tests/data/extraction/images/test-page_1-line_2.jpg differ diff --git a/tests/data/extraction/images/test-page_1-line_3.jpg b/tests/data/extraction/images/test-page_1-line_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b31308f68422bd2a94007ce215f7deddf95d80be Binary files /dev/null and b/tests/data/extraction/images/test-page_1-line_3.jpg differ diff --git a/tests/data/extraction/images/test-page_2-line_1.jpg b/tests/data/extraction/images/test-page_2-line_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8f26a3a2def5ad45b3f7e94a0093ac26b84204aa Binary files /dev/null and b/tests/data/extraction/images/test-page_2-line_1.jpg differ diff --git a/tests/data/extraction/images/test-page_2-line_2.jpg b/tests/data/extraction/images/test-page_2-line_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8639a8f4fcc67b8b986ed8877141a96decf5f169 Binary files /dev/null and b/tests/data/extraction/images/test-page_2-line_2.jpg differ diff --git a/tests/data/extraction/images/test-page_2-line_3.jpg b/tests/data/extraction/images/test-page_2-line_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..abe1e6fcedc735e6248503deddb23a93f79642de Binary files /dev/null and b/tests/data/extraction/images/test-page_2-line_3.jpg differ diff --git a/tests/data/extraction/images/text_line/test-page_1-line_1.jpg b/tests/data/extraction/images/text_line/test-page_1-line_1.jpg deleted file mode 100644 index 59b4173da998b2a6c9a00c6fde7a550920e06b47..0000000000000000000000000000000000000000 Binary files a/tests/data/extraction/images/text_line/test-page_1-line_1.jpg and /dev/null differ diff --git a/tests/data/extraction/images/text_line/test-page_1-line_2.jpg b/tests/data/extraction/images/text_line/test-page_1-line_2.jpg deleted file mode 100644 index aa4c1f0f923882eebca307773d1099864de0622a..0000000000000000000000000000000000000000 Binary files a/tests/data/extraction/images/text_line/test-page_1-line_2.jpg and /dev/null differ diff --git a/tests/data/extraction/images/text_line/test-page_1-line_3.jpg b/tests/data/extraction/images/text_line/test-page_1-line_3.jpg deleted file mode 100644 index bee0d3159564d31de40a21c553d1486cbe296333..0000000000000000000000000000000000000000 Binary files a/tests/data/extraction/images/text_line/test-page_1-line_3.jpg and /dev/null differ diff --git a/tests/data/extraction/images/text_line/test-page_2-line_1.jpg b/tests/data/extraction/images/text_line/test-page_2-line_1.jpg deleted file mode 100644 index 44d4f37898a62d2f4bee272cb35add685b06dfdf..0000000000000000000000000000000000000000 Binary files a/tests/data/extraction/images/text_line/test-page_2-line_1.jpg and /dev/null differ diff --git a/tests/data/extraction/images/text_line/test-page_2-line_2.jpg b/tests/data/extraction/images/text_line/test-page_2-line_2.jpg deleted file mode 100644 index c19ec187bbf254e934cd344422ef32731885dba6..0000000000000000000000000000000000000000 Binary files a/tests/data/extraction/images/text_line/test-page_2-line_2.jpg and /dev/null differ diff --git a/tests/data/extraction/images/text_line/test-page_2-line_3.jpg b/tests/data/extraction/images/text_line/test-page_2-line_3.jpg deleted file mode 100644 index 4e05ada4d1c1ce67a85847782cf030245a21b3e2..0000000000000000000000000000000000000000 Binary files a/tests/data/extraction/images/text_line/test-page_2-line_3.jpg and /dev/null differ diff --git a/tests/data/extraction/images/text_line/train-page_1-line_1.jpg b/tests/data/extraction/images/text_line/train-page_1-line_1.jpg deleted file mode 100644 index 6768c8183e841c5326facfce7bb4b1bdf5801eff..0000000000000000000000000000000000000000 Binary files a/tests/data/extraction/images/text_line/train-page_1-line_1.jpg and /dev/null differ diff --git a/tests/data/extraction/images/text_line/train-page_1-line_2.jpg b/tests/data/extraction/images/text_line/train-page_1-line_2.jpg deleted file mode 100644 index b787d7072945c314c0917c63b3329a78b12bbf97..0000000000000000000000000000000000000000 Binary files a/tests/data/extraction/images/text_line/train-page_1-line_2.jpg and /dev/null differ diff --git a/tests/data/extraction/images/text_line/train-page_1-line_3.jpg b/tests/data/extraction/images/text_line/train-page_1-line_3.jpg deleted file mode 100644 index b8ae4811a419bb25d27bd1c684522eeb71e314fa..0000000000000000000000000000000000000000 Binary files a/tests/data/extraction/images/text_line/train-page_1-line_3.jpg and /dev/null differ diff --git a/tests/data/extraction/images/text_line/train-page_1-line_4.jpg b/tests/data/extraction/images/text_line/train-page_1-line_4.jpg deleted file mode 100644 index 41ee3f51abf4926f2d9c930c7fb3e339171687bc..0000000000000000000000000000000000000000 Binary files a/tests/data/extraction/images/text_line/train-page_1-line_4.jpg and /dev/null differ diff --git a/tests/data/extraction/images/text_line/train-page_2-line_1.jpg b/tests/data/extraction/images/text_line/train-page_2-line_1.jpg deleted file mode 100644 index a1d15adc504778437929c9e3f244b5b6bcacde77..0000000000000000000000000000000000000000 Binary files a/tests/data/extraction/images/text_line/train-page_2-line_1.jpg and /dev/null differ diff --git a/tests/data/extraction/images/text_line/train-page_2-line_2.jpg b/tests/data/extraction/images/text_line/train-page_2-line_2.jpg deleted file mode 100644 index bc7ff4c07f9f6d4765f1d145e6b6e4544a4c31b1..0000000000000000000000000000000000000000 Binary files a/tests/data/extraction/images/text_line/train-page_2-line_2.jpg and /dev/null differ diff --git a/tests/data/extraction/images/text_line/train-page_2-line_3.jpg b/tests/data/extraction/images/text_line/train-page_2-line_3.jpg deleted file mode 100644 index 90e737d16d6cf982cbfe25242985c0da7ee7bc6a..0000000000000000000000000000000000000000 Binary files a/tests/data/extraction/images/text_line/train-page_2-line_3.jpg and /dev/null differ diff --git a/tests/data/extraction/images/text_line/val-page_1-line_1.jpg b/tests/data/extraction/images/text_line/val-page_1-line_1.jpg deleted file mode 100644 index 5937da29f438c8e98494b29a5f549b0b398e3723..0000000000000000000000000000000000000000 Binary files a/tests/data/extraction/images/text_line/val-page_1-line_1.jpg and /dev/null differ diff --git a/tests/data/extraction/images/text_line/val-page_1-line_2.jpg b/tests/data/extraction/images/text_line/val-page_1-line_2.jpg deleted file mode 100644 index 758e2c89b1cd4b37ba2c03a984b417970c384daa..0000000000000000000000000000000000000000 Binary files a/tests/data/extraction/images/text_line/val-page_1-line_2.jpg and /dev/null differ diff --git a/tests/data/extraction/images/text_line/val-page_1-line_3.jpg b/tests/data/extraction/images/text_line/val-page_1-line_3.jpg deleted file mode 100644 index 30e2e4319acdb5038fc1da831091018ff871c329..0000000000000000000000000000000000000000 Binary files a/tests/data/extraction/images/text_line/val-page_1-line_3.jpg and /dev/null differ diff --git a/tests/data/extraction/images/train-page_1-line_1.jpg b/tests/data/extraction/images/train-page_1-line_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..579006eeadeda909af6b156b3abcc79be39ad389 Binary files /dev/null and b/tests/data/extraction/images/train-page_1-line_1.jpg differ diff --git a/tests/data/extraction/images/train-page_1-line_2.jpg b/tests/data/extraction/images/train-page_1-line_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c043342cf169f18f4a937b7cb26953111c730c0b Binary files /dev/null and b/tests/data/extraction/images/train-page_1-line_2.jpg differ diff --git a/tests/data/extraction/images/train-page_1-line_3.jpg b/tests/data/extraction/images/train-page_1-line_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..36140494757daf7717ab616219c30551d146dddc Binary files /dev/null and b/tests/data/extraction/images/train-page_1-line_3.jpg differ diff --git a/tests/data/extraction/images/train-page_1-line_4.jpg b/tests/data/extraction/images/train-page_1-line_4.jpg new file mode 100644 index 0000000000000000000000000000000000000000..42c688826ae22837cd1b3b5d00704721b899e7fc Binary files /dev/null and b/tests/data/extraction/images/train-page_1-line_4.jpg differ diff --git a/tests/data/extraction/images/train-page_2-line_1.jpg b/tests/data/extraction/images/train-page_2-line_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1ce1c370792b5df8144ee98a9b66df68a8383069 Binary files /dev/null and b/tests/data/extraction/images/train-page_2-line_1.jpg differ diff --git a/tests/data/extraction/images/train-page_2-line_2.jpg b/tests/data/extraction/images/train-page_2-line_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..bfaa7425d60d6b9afd8d56f0a6c115bb92e53edd Binary files /dev/null and b/tests/data/extraction/images/train-page_2-line_2.jpg differ diff --git a/tests/data/extraction/images/train-page_2-line_3.jpg b/tests/data/extraction/images/train-page_2-line_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..eab65d4b9b02348ff9bfd7b17d4c079835238d8e Binary files /dev/null and b/tests/data/extraction/images/train-page_2-line_3.jpg differ diff --git a/tests/data/extraction/images/val-page_1-line_1.jpg b/tests/data/extraction/images/val-page_1-line_1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..daac765839dcb6a04269dbeee8cd05fac889dc16 Binary files /dev/null and b/tests/data/extraction/images/val-page_1-line_1.jpg differ diff --git a/tests/data/extraction/images/val-page_1-line_2.jpg b/tests/data/extraction/images/val-page_1-line_2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0e2399b4b59449de2596f4aabc731a468ae53c9f Binary files /dev/null and b/tests/data/extraction/images/val-page_1-line_2.jpg differ diff --git a/tests/data/extraction/images/val-page_1-line_3.jpg b/tests/data/extraction/images/val-page_1-line_3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3580fdb558916ddb0be8b2344ac9dc83da807eb1 Binary files /dev/null and b/tests/data/extraction/images/val-page_1-line_3.jpg differ diff --git a/tests/data/extraction/split.json b/tests/data/extraction/split.json index e264f689b2e4299e263971869452bb4b7e67544e..143073abfea01d2f351cbc030dbbc905e886b01d 100644 --- a/tests/data/extraction/split.json +++ b/tests/data/extraction/split.json @@ -3,470 +3,470 @@ "test-page_1-line_1": { "dataset_id": "dataset_id", "image": { - "iiif_url": "{FIXTURES}/extraction/images/text_line/test-page_1-line_1.jpg", + "iiif_url": "{FIXTURES}/extraction/images/test-page_1-line_1.jpg", "polygon": [ [ - 37, - 191 + 0, + 0 ], [ - 37, - 339 + 0, + 83 ], [ - 767, - 339 + 1900, + 83 ], [ - 767, - 191 + 1900, + 0 ], [ - 37, - 191 + 0, + 0 ] ] }, - "text": "â“¢Couâ‡e⇠ⓕBouis â“‘â‡.12.14" + "text": "â“¢Leunaut â“•Clauâ‡e â“‘â‡â‡" }, "test-page_1-line_2": { "dataset_id": "dataset_id", "image": { - "iiif_url": "{FIXTURES}/extraction/images/text_line/test-page_1-line_2.jpg", + "iiif_url": "{FIXTURES}/extraction/images/test-page_1-line_2.jpg", "polygon": [ [ - 28, - 339 + 0, + 0 ], [ - 28, - 464 + 0, + 83 ], [ - 767, - 464 + 1900, + 83 ], [ - 767, - 339 + 1900, + 0 ], [ - 28, - 339 + 0, + 0 ] ] }, - "text": "â“¢â‡outrain â“•Aâ‡olâ‡â‡e â“‘9.4.13" + "text": "â“¢â‡auracâ‡o â“•Clauâ‡ine â“‘â‡â‡" }, "test-page_1-line_3": { "dataset_id": "dataset_id", "image": { - "iiif_url": "{FIXTURES}/extraction/images/text_line/test-page_1-line_3.jpg", + "iiif_url": "{FIXTURES}/extraction/images/test-page_1-line_3.jpg", "polygon": [ [ - 28, - 464 + 0, + 0 ], [ - 28, - 614 + 0, + 83 ], [ - 767, - 614 + 1900, + 83 ], [ - 767, - 464 + 1900, + 0 ], [ - 28, - 464 + 0, + 0 ] ] }, - "text": "â“¢â‡abale â“•â‡ranâ‡ais â“‘26.3.11" + "text": "â“¢Laurent â“•Jacâ‡use â“‘21" }, "test-page_2-line_1": { "dataset_id": "dataset_id", "image": { - "iiif_url": "{FIXTURES}/extraction/images/text_line/test-page_2-line_1.jpg", + "iiif_url": "{FIXTURES}/extraction/images/test-page_2-line_1.jpg", "polygon": [ [ - 14, - 199 + 0, + 0 ], [ - 14, - 330 + 0, + 83 ], [ - 767, - 330 + 1900, + 83 ], [ - 767, - 199 + 1900, + 0 ], [ - 14, - 199 + 0, + 0 ] ] }, - "text": "â“¢â‡urosoy â“•Bouis â“‘22â‡4â‡18" + "text": "â“¢â‡alette â“•Elisaâ‡et⇠ⓑ7â‡" }, "test-page_2-line_2": { "dataset_id": "dataset_id", "image": { - "iiif_url": "{FIXTURES}/extraction/images/text_line/test-page_2-line_2.jpg", + "iiif_url": "{FIXTURES}/extraction/images/test-page_2-line_2.jpg", "polygon": [ [ - 16, - 330 + 0, + 0 ], [ - 16, - 471 + 0, + 84 ], [ - 765, - 471 + 1900, + 84 ], [ - 765, - 330 + 1900, + 0 ], [ - 16, - 330 + 0, + 0 ] ] }, - "text": "â“¢Colaiani â“•Anâ‡els â“‘28.11.1â‡" + "text": "â“¢Tanâ‡ol â“•Jean â“‘7â‡" }, "test-page_2-line_3": { "dataset_id": "dataset_id", "image": { - "iiif_url": "{FIXTURES}/extraction/images/text_line/test-page_2-line_3.jpg", + "iiif_url": "{FIXTURES}/extraction/images/test-page_2-line_3.jpg", "polygon": [ [ - 11, - 473 + 0, + 0 ], [ - 11, - 598 + 0, + 83 ], [ - 772, - 598 + 1900, + 83 ], [ - 772, - 473 + 1900, + 0 ], [ - 11, - 473 + 0, + 0 ] ] }, - "text": "â“¢Renouar⇠ⓕMaurice â“‘2â‡.â‡.04" + "text": "â“¢â‡auret â“•Jean â“‘â‡â‡" } }, "train": { "train-page_1-line_1": { "dataset_id": "dataset_id", "image": { - "iiif_url": "{FIXTURES}/extraction/images/text_line/train-page_1-line_1.jpg", + "iiif_url": "{FIXTURES}/extraction/images/train-page_1-line_1.jpg", "polygon": [ [ - 27, - 187 + 0, + 0 ], [ - 27, - 327 + 0, + 84 ], [ - 754, - 327 + 1900, + 84 ], [ - 754, - 187 + 1900, + 0 ], [ - 27, - 187 + 0, + 0 ] ] }, - "text": "â“¢Caillet â“•Maurice â“‘28.9.06" + "text": "â“¢Laulont â“•Francois â“‘8" }, "train-page_1-line_2": { "dataset_id": "dataset_id", "image": { - "iiif_url": "{FIXTURES}/extraction/images/text_line/train-page_1-line_2.jpg", + "iiif_url": "{FIXTURES}/extraction/images/train-page_1-line_2.jpg", "polygon": [ [ - 28, - 328 + 0, + 0 ], [ - 28, - 465 + 0, + 84 ], [ - 755, - 465 + 1900, + 84 ], [ - 755, - 328 + 1900, + 0 ], [ - 28, - 328 + 0, + 0 ] ] }, - "text": "â“¢Reboul â“•Jean â“‘30.9.02" + "text": "â“¢Ciret â“•Antoine â“‘27" }, "train-page_1-line_3": { "dataset_id": "dataset_id", "image": { - "iiif_url": "{FIXTURES}/extraction/images/text_line/train-page_1-line_3.jpg", + "iiif_url": "{FIXTURES}/extraction/images/train-page_1-line_3.jpg", "polygon": [ [ - 23, - 463 + 0, + 0 ], [ - 23, - 604 + 0, + 85 ], [ - 803, - 604 + 1900, + 85 ], [ - 803, - 463 + 1900, + 0 ], [ - 23, - 463 + 0, + 0 ] ] }, - "text": "â“¢Bareyre â“•Jean â“‘28.3.11" + "text": "â“¢Ciret â“•Marie â“‘28" }, "train-page_1-line_4": { "dataset_id": "dataset_id", "image": { - "iiif_url": "{FIXTURES}/extraction/images/text_line/train-page_1-line_4.jpg", + "iiif_url": "{FIXTURES}/extraction/images/train-page_1-line_4.jpg", "polygon": [ [ - 21, - 604 + 0, + 0 ], [ - 21, - 743 + 0, + 84 ], [ - 812, - 743 + 1900, + 84 ], [ - 812, - 604 + 1900, + 0 ], [ - 21, - 604 + 0, + 0 ] ] }, - "text": "â“¢Roussy â“•Jean â“‘4.11.14" + "text": "â“¢Ciret â“•Marie â“‘2" }, "train-page_2-line_1": { "dataset_id": "dataset_id", "image": { - "iiif_url": "{FIXTURES}/extraction/images/text_line/train-page_2-line_1.jpg", + "iiif_url": "{FIXTURES}/extraction/images/train-page_2-line_1.jpg", "polygon": [ [ - 18, - 197 + 0, + 0 ], [ - 18, - 340 + 0, + 83 ], [ - 751, - 340 + 1900, + 83 ], [ - 751, - 197 + 1900, + 0 ], [ - 18, - 197 + 0, + 0 ] ] }, - "text": "â“¢Marin â“•Marcel â“‘10.8.06" + "text": "â“¢Eureston â“•Solange â“‘10" }, "train-page_2-line_2": { "dataset_id": "dataset_id", "image": { - "iiif_url": "{FIXTURES}/extraction/images/text_line/train-page_2-line_2.jpg", + "iiif_url": "{FIXTURES}/extraction/images/train-page_2-line_2.jpg", "polygon": [ [ - 18, - 340 + 0, + 0 ], [ - 18, - 476 + 0, + 83 ], [ - 751, - 476 + 1900, + 83 ], [ - 751, - 340 + 1900, + 0 ], [ - 18, - 340 + 0, + 0 ] ] }, - "text": "â“¢Amical â“•Eloi â“‘11.10.04" + "text": "â“¢Terontussieux â“•Jean â“‘2" }, "train-page_2-line_3": { "dataset_id": "dataset_id", "image": { - "iiif_url": "{FIXTURES}/extraction/images/text_line/train-page_2-line_3.jpg", + "iiif_url": "{FIXTURES}/extraction/images/train-page_2-line_3.jpg", "polygon": [ [ - 21, - 476 + 0, + 0 ], [ - 21, - 615 + 0, + 84 ], [ - 746, - 615 + 1900, + 84 ], [ - 746, - 476 + 1900, + 0 ], [ - 21, - 476 + 0, + 0 ] ] }, - "text": "â“¢Biros â“•Mael â“‘30.10.10" + "text": "â“¢Pressonet â“•Marie â“‘12" } }, "val": { "val-page_1-line_1": { "dataset_id": "dataset_id", "image": { - "iiif_url": "{FIXTURES}/extraction/images/text_line/val-page_1-line_1.jpg", + "iiif_url": "{FIXTURES}/extraction/images/val-page_1-line_1.jpg", "polygon": [ [ - 14, - 211 + 0, + 0 ], [ - 14, - 347 + 0, + 84 ], [ - 755, - 347 + 1900, + 84 ], [ - 755, - 211 + 1900, + 0 ], [ - 14, - 211 + 0, + 0 ] ] }, - "text": "â“¢Monar⇠ⓕBouis â“‘29â‡â‡â‡04" + "text": "â“¢Cirau⇠ⓕAntoine â“‘â‡â‡" }, "val-page_1-line_2": { "dataset_id": "dataset_id", "image": { - "iiif_url": "{FIXTURES}/extraction/images/text_line/val-page_1-line_2.jpg", + "iiif_url": "{FIXTURES}/extraction/images/val-page_1-line_2.jpg", "polygon": [ [ - 14, - 350 + 0, + 0 ], [ - 14, - 484 + 0, + 84 ], [ - 748, - 484 + 1900, + 84 ], [ - 748, - 350 + 1900, + 0 ], [ - 14, - 350 + 0, + 0 ] ] }, - "text": "â“¢Astier â“•Artâ‡ur â“‘11â‡2â‡13" + "text": "â“¢Cirau⇠ⓕPriser â“‘â‡â‡" }, "val-page_1-line_3": { "dataset_id": "dataset_id", "image": { - "iiif_url": "{FIXTURES}/extraction/images/text_line/val-page_1-line_3.jpg", + "iiif_url": "{FIXTURES}/extraction/images/val-page_1-line_3.jpg", "polygon": [ [ - 11, - 484 + 0, + 0 ], [ - 11, - 622 + 0, + 84 ], [ - 751, - 622 + 1900, + 84 ], [ - 751, - 484 + 1900, + 0 ], [ - 11, - 484 + 0, + 0 ] ] }, - "text": "â“¢â‡e â‡lieâ‡er â“•Jules â“‘21â‡11â‡11" + "text": "â“¢Cirau⇠ⓕElisaâ‡et⇠ⓑâ‡â‡" } } } diff --git a/tests/data/extraction/tokens.yml b/tests/data/extraction/tokens.yml index 23c24db674bb882f940b8e995e3a9cac75c8b231..c8b75e54ca249b10d9d31029652b82531b09d40b 100644 --- a/tests/data/extraction/tokens.yml +++ b/tests/data/extraction/tokens.yml @@ -5,6 +5,6 @@ surname: firstname: start: "â“•" end: "" -birthdate: +age: start: "â“‘" end: "" diff --git a/tests/data/tokens/end_tokens.yml b/tests/data/tokens/end_tokens.yml index c660b7788fc5fafd48d197be29a55b5dd10e21fa..5f42b7e94b6b0942d2fa46884662c50531239db2 100644 --- a/tests/data/tokens/end_tokens.yml +++ b/tests/data/tokens/end_tokens.yml @@ -11,7 +11,7 @@ Person /!\: adj: start: â’¼ end: â’½ -birthdate: +age: start: â’¾ end: â’¿ firstname: diff --git a/tests/data/tokens/no_end_tokens.yml b/tests/data/tokens/no_end_tokens.yml index a19597267d1952aad03b58c66dafa5fe43e5f52b..f840973a099749cc663612bb6f9c407e73104a59 100644 --- a/tests/data/tokens/no_end_tokens.yml +++ b/tests/data/tokens/no_end_tokens.yml @@ -11,7 +11,7 @@ Person /!\: adj: start: â’¹ end: '' -birthdate: +age: start: â’º end: '' firstname: diff --git a/tests/test_db.py b/tests/test_db.py index 875add45c44d3e395296c5c9f0f5aad90fdf38e4..088cab36cf3d07aa1546e1f0ef49f14d904a7d51 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -84,7 +84,7 @@ def test_get_transcriptions(sources, mock_database): if not sources or False in sources: expected_transcriptions.append( { - "text": "Caillet Maurice 28.9.06", + "text": "Laulont Francois 8", "worker_version_id": None, "worker_run_id": None, } @@ -93,7 +93,7 @@ def test_get_transcriptions(sources, mock_database): if not sources or "id" in sources: expected_transcriptions.append( { - "text": "caillet maurice 28.9.06", + "text": "laulont francois 8", "worker_version_id": "worker_version_id", "worker_run_id": "worker_run_id", } @@ -121,7 +121,7 @@ def test_get_transcriptions(sources, mock_database): @pytest.mark.parametrize("source", (False, "id", None)) @pytest.mark.parametrize( - "supported_types", (["surname"], ["surname", "firstname", "birthdate"]) + "supported_types", (["surname"], ["surname", "firstname", "age"]) ) def test_get_transcription_entities(source, mock_database, supported_types): worker_version = f"worker_version_{source}" if isinstance(source, str) else source @@ -137,22 +137,22 @@ def test_get_transcription_entities(source, mock_database, supported_types): expected_entities = [ { - "name": "Caillet", + "name": "Laulont", "type": "surname", "offset": 0, "length": 7, }, { - "name": "Maurice", + "name": "Francois", "type": "firstname", "offset": 9, - "length": 7, + "length": 8, }, { - "name": "28.9.06", - "type": "birthdate", - "offset": 18, - "length": 7, + "name": "8", + "type": "age", + "offset": 19, + "length": 1, }, ] diff --git a/tests/test_download.py b/tests/test_download.py index 8dd39fc49397e3b0a8bf0cb2a0f528ecfc06e498..1c1879fb56ef6d664564ccf5e11bfc264385452b 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -93,26 +93,26 @@ def test_download(split_content, monkeypatch, tmp_path): # Check "labels.json" expected_labels = { "test": { - "images/test/dataset_id/test-page_1-line_1.jpg": "â“¢Couâ‡e⇠ⓕBouis â“‘â‡.12.14", - "images/test/dataset_id/test-page_1-line_2.jpg": "â“¢â‡outrain â“•Aâ‡olâ‡â‡e â“‘9.4.13", - "images/test/dataset_id/test-page_1-line_3.jpg": "â“¢â‡abale â“•â‡ranâ‡ais â“‘26.3.11", - "images/test/dataset_id/test-page_2-line_1.jpg": "â“¢â‡urosoy â“•Bouis â“‘22â‡4â‡18", - "images/test/dataset_id/test-page_2-line_2.jpg": "â“¢Colaiani â“•Anâ‡els â“‘28.11.1â‡", - "images/test/dataset_id/test-page_2-line_3.jpg": "â“¢Renouar⇠ⓕMaurice â“‘2â‡.â‡.04", + "images/test/dataset_id/test-page_1-line_1.jpg": "â“¢Leunaut â“•Clauâ‡e â“‘â‡â‡", + "images/test/dataset_id/test-page_1-line_2.jpg": "â“¢â‡auracâ‡o â“•Clauâ‡ine â“‘â‡â‡", + "images/test/dataset_id/test-page_1-line_3.jpg": "â“¢Laurent â“•Jacâ‡use â“‘21", + "images/test/dataset_id/test-page_2-line_1.jpg": "â“¢â‡alette â“•Elisaâ‡et⇠ⓑ7â‡", + "images/test/dataset_id/test-page_2-line_2.jpg": "â“¢Tanâ‡ol â“•Jean â“‘7â‡", + "images/test/dataset_id/test-page_2-line_3.jpg": "â“¢â‡auret â“•Jean â“‘â‡â‡", }, "train": { - "images/train/dataset_id/train-page_1-line_1.jpg": "â“¢Caillet â“•Maurice â“‘28.9.06", - "images/train/dataset_id/train-page_1-line_2.jpg": "â“¢Reboul â“•Jean â“‘30.9.02", - "images/train/dataset_id/train-page_1-line_3.jpg": "â“¢Bareyre â“•Jean â“‘28.3.11", - "images/train/dataset_id/train-page_1-line_4.jpg": "â“¢Roussy â“•Jean â“‘4.11.14", - "images/train/dataset_id/train-page_2-line_1.jpg": "â“¢Marin â“•Marcel â“‘10.8.06", - "images/train/dataset_id/train-page_2-line_2.jpg": "â“¢Amical â“•Eloi â“‘11.10.04", - "images/train/dataset_id/train-page_2-line_3.jpg": "â“¢Biros â“•Mael â“‘30.10.10", + "images/train/dataset_id/train-page_1-line_1.jpg": "â“¢Laulont â“•Francois â“‘8", + "images/train/dataset_id/train-page_1-line_2.jpg": "â“¢Ciret â“•Antoine â“‘27", + "images/train/dataset_id/train-page_1-line_3.jpg": "â“¢Ciret â“•Marie â“‘28", + "images/train/dataset_id/train-page_1-line_4.jpg": "â“¢Ciret â“•Marie â“‘2", + "images/train/dataset_id/train-page_2-line_1.jpg": "â“¢Eureston â“•Solange â“‘10", + "images/train/dataset_id/train-page_2-line_2.jpg": "â“¢Terontussieux â“•Jean â“‘2", + "images/train/dataset_id/train-page_2-line_3.jpg": "â“¢Pressonet â“•Marie â“‘12", }, "val": { - "images/val/dataset_id/val-page_1-line_1.jpg": "â“¢Monar⇠ⓕBouis â“‘29â‡â‡â‡04", - "images/val/dataset_id/val-page_1-line_2.jpg": "â“¢Astier â“•Artâ‡ur â“‘11â‡2â‡13", - "images/val/dataset_id/val-page_1-line_3.jpg": "â“¢â‡e â‡lieâ‡er â“•Jules â“‘21â‡11â‡11", + "images/val/dataset_id/val-page_1-line_1.jpg": "â“¢Cirau⇠ⓕAntoine â“‘â‡â‡", + "images/val/dataset_id/val-page_1-line_2.jpg": "â“¢Cirau⇠ⓕPriser â“‘â‡â‡", + "images/val/dataset_id/val-page_1-line_3.jpg": "â“¢Cirau⇠ⓕElisaâ‡et⇠ⓑâ‡â‡", }, } @@ -124,9 +124,7 @@ def test_download(split_content, monkeypatch, tmp_path): continue assert ImageChops.difference( - Image.open( - EXTRACTION_DATA_PATH / "images" / "text_line" / expected_path.name - ), + Image.open(EXTRACTION_DATA_PATH / "images" / expected_path.name), Image.open(expected_path), ) diff --git a/tests/test_extract.py b/tests/test_extract.py index 368d99984dace2f1de58708b8fc06c340e6151c6..922d1026fe95d0f47b09b748fad5e3202ef10e3c 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -114,117 +114,117 @@ def test_process_element_unknown_token_in_text_error(mock_database, tmp_path): True, True, "worker_version_id", - """â– â“¢ c a i l l e t â– â“• m a u r i c e â– â“‘ 28. 9.0 6 -â– â“¢ re b ou l â– â“• j e a n â– â“‘ 30. 9.0 2 -â– â“¢ b a re y re â– â“• j e a n â– â“‘ 28. 3 . 1 1 -â– â“¢ r ou s s y â– â“• j e a n â– â“‘ 4 . 1 1 . 1 4 -â– â“¢ m a r i n â– â“• m a r c e l â– â“‘ 1 0 . 8 . 0 6 -â– â“¢ a m i c a l â– â“• e l o i â– â“‘ 1 1 . 1 0 . 0 4 -â– â“¢ b i r o s â– â“• m a e l â– â“‘ 30. 1 0 . 1 0""", + """â– â“¢ l a u l ont â– â“• f r an c oi s â– â“‘ 8 +â– â“¢ c i re t â– â“• an t oi ne â– â“‘ 2 7 +â– â“¢ c i re t â– â“• m a r ie â– â“‘ 2 8 +â– â“¢ c i re t â– â“• m a r ie â– â“‘ 2 +â– â“¢ e u re s t on â– â“• so l an g e â– â“‘ 1 0 +â– â“¢ t e r ont u s s ie u x â– â“• j e an â– â“‘ 2 +â– â“¢ p re s s on e t â– â“• m a r ie â– â“‘ 1 2""", 40, ), ( True, False, "worker_version_id", - """â– â“¢ c a i l l e t â– â“• m a u r i c e â– â“‘ 28. 9.0 6 -â– â“¢ re b ou l â– â“• j e a n â– â“‘ 30. 9.0 2 -â– â“¢ b a re y re â– â“• j e a n â– â“‘ 28. 3 . 1 1 -â– â“¢ r ou s s y â– â“• j e a n â– â“‘ 4 . 1 1 . 1 4 -â– â“¢ m a r i n â– â“• m a r c e l â– â“‘ 1 0 . 8 . 0 6 -â– â“¢ a m i c a l â– â“• e l o i â– â“‘ 1 1 . 1 0 . 0 4 -â– â“¢ b i r o s â– â“• m a e l â– â“‘ 30. 1 0 . 1 0""", + """â– â“¢ l a u l ont â– â“• f r an c oi s â– â“‘ 8 +â– â“¢ c i re t â– â“• an t oi ne â– â“‘ 2 7 +â– â“¢ c i re t â– â“• m a r ie â– â“‘ 2 8 +â– â“¢ c i re t â– â“• m a r ie â– â“‘ 2 +â– â“¢ e u re s t on â– â“• so l an g e â– â“‘ 1 0 +â– â“¢ t e r ont u s s ie u x â– â“• j e an â– â“‘ 2 +â– â“¢ p re s s on e t â– â“• m a r ie â– â“‘ 1 2""", 40, ), ( False, True, "worker_version_id", - """â– ca i l l e t â– ma u r i ce â– 28. 9.0 6 -â– re b o u l â– j e a n â– 30. 9.0 2 -â– b a re y re â– j e a n â– 28. 3 . 1 1 -â– r o u s s y â– j e a n â– 4 . 11.1 4 -â– ma r i n â– ma r ce l â– 10. 8 . 0 6 -â– a m i ca l â– el o i â– 11.1 0 . 0 4 -â– b i r o s â– ma el â– 30. 10. 1 0""", + """â– la u l ont â– f r an c oi s â– 8 +â– c i re t â– an t oi ne â– 2 7 +â– c i re t â– m a r ie â– 2 8 +â– c i re t â– m a r ie â– 2 +â– e u res t on â– so l an g e â– 1 0 +â– t e r ont u ss ie u x â– j e an â– 2 +â– p res so ne t â– m a r ie â– 1 2""", 40, ), ( False, False, "worker_version_id", - """â– ca i l l e t â– ma u r i ce â– 28. 9.0 6 -â– re b o u l â– j e a n â– 30. 9.0 2 -â– b a re y re â– j e a n â– 28. 3 . 1 1 -â– r o u s s y â– j e a n â– 4 . 11.1 4 -â– ma r i n â– ma r ce l â– 10. 8 . 0 6 -â– a m i ca l â– el o i â– 11.1 0 . 0 4 -â– b i r o s â– ma el â– 30. 10. 1 0""", + """â– la u l ont â– f r an c oi s â– 8 +â– c i re t â– an t oi ne â– 2 7 +â– c i re t â– m a r ie â– 2 8 +â– c i re t â– m a r ie â– 2 +â– e u res t on â– so l an g e â– 1 0 +â– t e r ont u ss ie u x â– j e an â– 2 +â– p res so ne t â– m a r ie â– 1 2""", 40, ), ( True, True, False, - """â– â“¢ C a i l l e t â– â“• M a u r i c e â– â“‘ 2 8 . 9 . 0 6 -â– â“¢ R e b o u l â– â“• J e a n â– â“‘ 3 0 . 9 . 0 2 -â– â“¢ B a r e y r e â– â“• J e a n â– â“‘ 2 8 . 3 . 1 1 -â– â“¢ R o u s s y â– â“• J e a n â– â“‘ 4 . 1 1 . 1 4 -â– â“¢ M a r i n â– â“• M a r c e l â– â“‘ 1 0 . 8 . 0 6 -â– â“¢ A m i c a l â– â“• E l o i â– â“‘ 1 1 . 1 0 . 0 4 -â– â“¢ B i r o s â– â“• M a e l â– â“‘ 3 0 . 1 0 . 1 0""", + """â– â“¢ L a u l o n t â– â“• F r a n c o i s â– â“‘ 8 +â– â“¢ C i r e t â– â“• A n t o i n e â– â“‘ 2 7 +â– â“¢ C i r e t â– â“• M a r ie â– â“‘ 2 8 +â– â“¢ C i r e t â– â“• M a r ie â– â“‘ 2 +â– â“¢ E u r e s t o n â– â“• S o l a n g e â– â“‘ 1 0 +â– â“¢ T e r o n t u s s ie u x â– â“• J e a n â– â“‘ 2 +â– â“¢ P r e s s o n e t â– â“• M a r ie â– â“‘ 1 2""", 40, ), ( True, True, False, - """â– â“¢ C a i l l e t â– â“• M a u ri ce â– â“‘ 28. 9.0 6 -â– â“¢ R e b ou l â– â“• J e a n â– â“‘ 30. 9.0 2 -â– â“¢ B a re y re â– â“• J e a n â– â“‘ 28. 3 . 1 1 -â– â“¢ R ou s s y â– â“• J e a n â– â“‘ 4 . 11.1 4 -â– â“¢ Mar i n â– â“• Mar ce l â– â“‘ 10. 8 . 0 6 -â– â“¢ A m ic a l â– â“• E l o i â– â“‘ 11.1 0 . 0 4 -â– â“¢ B i r o s â– â“• M a e l â– â“‘ 30. 10. 10""", - 55, + """â– â“¢ L a u l ont â– â“• F r an c oi s â– â“‘ 8 +â– â“¢ C i re t â– â“• A n t oi n e â– â“‘ 2 7 +â– â“¢ C i re t â– â“• M a r ie â– â“‘ 2 8 +â– â“¢ C i re t â– â“• M a r ie â– â“‘ 2 +â– â“¢ E u re s t on â– â“• S o l an g e â– â“‘ 1 0 +â– â“¢ T e r ont u s s ie u x â– â“• J e an â– â“‘ 2 +â– â“¢ P re s s on e t â– â“• M a r ie â– â“‘ 1 2""", + 45, ), ( True, False, False, - """â– â“¢ C a i l l e t â– â“• M a u r i c e â– â“‘ 2 8 . 9 . 0 6 -â– â“¢ R e b o u l â– â“• J e a n â– â“‘ 3 0 . 9 . 0 2 -â– â“¢ B a r e y r e â– â“• J e a n â– â“‘ 2 8 . 3 . 1 1 -â– â“¢ R o u s s y â– â“• J e a n â– â“‘ 4 . 1 1 . 1 4 -â– â“¢ M a r i n â– â“• M a r c e l â– â“‘ 1 0 . 8 . 0 6 -â– â“¢ A m i c a l â– â“• E l o i â– â“‘ 1 1 . 1 0 . 0 4 -â– â“¢ B i r o s â– â“• M a e l â– â“‘ 3 0 . 1 0 . 1 0""", + """â– â“¢ L a u l o n t â– â“• F r a n c o i s â– â“‘ 8 +â– â“¢ C i r e t â– â“• A n t o i n e â– â“‘ 2 7 +â– â“¢ C i r e t â– â“• M a r ie â– â“‘ 2 8 +â– â“¢ C i r e t â– â“• M a r ie â– â“‘ 2 +â– â“¢ E u r e s t o n â– â“• S o l a n g e â– â“‘ 1 0 +â– â“¢ T e r o n t u s s ie u x â– â“• J e a n â– â“‘ 2 +â– â“¢ P r e s s o n e t â– â“• M a r ie â– â“‘ 1 2""", 40, ), ( False, True, False, - """â– C a i l l e t â– Ma u r i c e â– 28. 9.0 6 -â– R e b o u l â– J e a n â– 30. 9.0 2 -â– B a r e y r e â– J e a n â– 28. 3 . 1 1 -â– R o u s s y â– J e a n â– 4 . 1 1 . 1 4 -â– Ma r i n â– Ma r c e l â– 1 0 . 8 . 0 6 -â– A m i c a l â– E l o i â– 1 1 . 1 0 . 0 4 -â– B i r o s â– Ma e l â– 30. 1 0 . 1 0""", + """â– L a u l ont â– F r an c oi s â– 8 +â– C i re t â– A n t oi n e â– 2 7 +â– C i re t â– M a r ie â– 2 8 +â– C i re t â– M a r ie â– 2 +â– E u re s t on â– S o l an g e â– 1 0 +â– T e r ont u s s ie u x â– J e an â– 2 +â– P re s s on e t â– M a r ie â– 1 2""", 40, ), ( False, False, False, - """â– C a i l l e t â– Ma u r i c e â– 28. 9.0 6 -â– R e b o u l â– J e a n â– 30. 9.0 2 -â– B a r e y r e â– J e a n â– 28. 3 . 1 1 -â– R o u s s y â– J e a n â– 4 . 1 1 . 1 4 -â– Ma r i n â– Ma r c e l â– 1 0 . 8 . 0 6 -â– A m i c a l â– E l o i â– 1 1 . 1 0 . 0 4 -â– B i r o s â– Ma e l â– 30. 1 0 . 1 0""", + """â– L a u l ont â– F r an c oi s â– 8 +â– C i re t â– A n t oi n e â– 2 7 +â– C i re t â– M a r ie â– 2 8 +â– C i re t â– M a r ie â– 2 +â– E u re s t on â– S o l an g e â– 1 0 +â– T e r ont u s s ie u x â– J e an â– 2 +â– P re s s on e t â– M a r ie â– 1 2""", 40, ), ), @@ -366,21 +366,21 @@ def test_extract( assert set(pickle.loads((output / "charset.pkl").read_bytes())) == expected_charset # Check "language_corpus.txt" - expected_char_language_corpus = """â“¢ C a i l l e t â– â– â“• M a u r i c e â– â– â“‘ 2 8 . 9 . 0 6 -â“¢ R e b o u l â– â– â“• J e a n â– â– â“‘ 3 0 . 9 . 0 2 -â“¢ B a r e y r e â– â– â“• J e a n â– â– â“‘ 2 8 . 3 . 1 1 -â“¢ R o u s s y â– â– â“• J e a n â– â– â“‘ 4 . 1 1 . 1 4 -â“¢ M a r i n â– â– â“• M a r c e l â– â– â“‘ 1 0 . 8 . 0 6 -â“¢ A m i c a l â– â– â“• E l o i â– â– â“‘ 1 1 . 1 0 . 0 4 -â“¢ B i r o s â– â– â“• M a e l â– â– â“‘ 3 0 . 1 0 . 1 0""" - - expected_word_language_corpus = """â“¢ Caillet â– â“• Maurice â– â“‘ 28 â– . â– 9 â– . â– 06 -â“¢ Reboul â– â“• Jean â– â“‘ 30 â– . â– 9 â– . â– 02 -â“¢ Bareyre â– â“• Jean â– â“‘ 28 â– . â– 3 â– . â– 11 -â“¢ Roussy â– â“• Jean â– â“‘ 4 â– . â– 11 â– . â– 14 -â“¢ Marin â– â“• Marcel â– â“‘ 10 â– . â– 8 â– . â– 06 -â“¢ Amical â– â“• Eloi â– â“‘ 11 â– . â– 10 â– . â– 04 -â“¢ Biros â– â“• Mael â– â“‘ 30 â– . â– 10 â– . â– 10""" + expected_char_language_corpus = """â“¢ L a u l o n t â– â– â“• F r a n c o i s â– â– â“‘ 8 +â“¢ C i r e t â– â– â“• A n t o i n e â– â– â“‘ 2 7 +â“¢ C i r e t â– â– â“• M a r i e â– â– â“‘ 2 8 +â“¢ C i r e t â– â– â“• M a r i e â– â– â“‘ 2 +â“¢ E u r e s t o n â– â– â“• S o l a n g e â– â– â“‘ 1 0 +â“¢ T e r o n t u s s i e u x â– â– â“• J e a n â– â– â“‘ 2 +â“¢ P r e s s o n e t â– â– â“• M a r i e â– â– â“‘ 1 2""" + + expected_word_language_corpus = """â“¢ Laulont â– â“• Francois â– â“‘ 8 +â“¢ Ciret â– â“• Antoine â– â“‘ 27 +â“¢ Ciret â– â“• Marie â– â“‘ 28 +â“¢ Ciret â– â“• Marie â– â“‘ 2 +â“¢ Eureston â– â“• Solange â– â“‘ 10 +â“¢ Terontussieux â– â“• Jean â– â“‘ 2 +â“¢ Pressonet â– â“• Marie â– â“‘ 12""" if existing: expected_char_language_corpus = ( @@ -511,7 +511,7 @@ def test_extract_transcription_no_translation(mock_database, tokens, tmp_path): ).execute() # Early return with only the element transcription text instead of a translation - assert extractor.extract_transcription(element) == "Coupez Bouis 7.12.14" + assert extractor.extract_transcription(element) == "Leunaut Claude 49" @pytest.mark.parametrize(