Skip to content
Snippets Groups Projects
Verified Commit 4ba0e43f authored by Yoann Schneider's avatar Yoann Schneider :tennis:
Browse files

refactoring + packaging

parent f3e30e2d
No related branches found
No related tags found
No related merge requests found
Showing
with 0 additions and 1974 deletions
# Copyright Université de Rouen Normandie (1), INSA Rouen (2),
# tutelles du laboratoire LITIS (1 et 2)
# contributors :
# - Denis Coquenet
#
#
# This software is a computer program written in Python whose purpose is to
# provide public implementation of deep learning works, in pytorch.
#
# This software is governed by the CeCILL-C license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL-C
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL-C license and that you accept its terms.
import os import os
import shutil import shutil
import tarfile import tarfile
......
This diff is collapsed.
# Copyright Université de Rouen Normandie (1), INSA Rouen (2),
# tutelles du laboratoire LITIS (1 et 2)
# contributors :
# - Denis Coquenet
#
#
# This software is a computer program written in Python whose purpose is to
# provide public implementation of deep learning works, in pytorch.
#
# This software is governed by the CeCILL-C license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL-C
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL-C license and that you accept its terms.
from Datasets.dataset_formatters.generic_dataset_formatter import OCRDatasetFormatter
import os
import numpy as np
from Datasets.dataset_formatters.utils_dataset import natural_sort
from PIL import Image
import xml.etree.ElementTree as ET
import re
# Layout string to token: maps a RIMES region type (the text of the <type>
# element read in preformat_rimes_page) to the begin-token that sem_label()
# puts in front of the region text.
# NOTE(review): every token value reads as an empty string in this copy of the
# file; the trailing comments suggest one distinct marker per layout class was
# intended -- confirm against the upstream source.
SEM_MATCHING_TOKENS_STR = {
'Ouverture': "", # opening
'Corps de texte': "", # body
'PS/PJ': "", # post scriptum
'Coordonnées Expéditeur': "", # sender
'Reference': "", # also counted as sender information
'Objet': "", # why
'Date, Lieu': "", # where, when
'Coordonnées Destinataire': "", # recipient
}
# Layout begin-token to end-token: sem_label() uses the begin-token of a region
# as the key to find the token appended after the region text.
# NOTE(review): all seven keys read as the same empty string in this copy, so
# the literal evaluates to the single entry {"": ""}; the distinct begin/end
# characters were presumably lost -- confirm against the upstream source.
SEM_MATCHING_TOKENS = {
"": "",
"": "",
"": "",
"": "",
"": "",
"": "",
"": ""
}
class RIMESDatasetFormatter(OCRDatasetFormatter):
def __init__(self, level, set_names=None, dpi=150, sem_token=True):
    """
    Set up the RIMES formatter

    :param level: formatting level, forwarded to the parent formatter; only "page" is registered below
    :param set_names: split names, forwarded to the parent formatter; None selects ["train", "valid", "test"]
    :param dpi: target resolution used when the pages are resized
    :param sem_token: if True, the page text written to the ground truth includes the layout tokens
    """
    # Build the default list per call: a list literal in the signature is a
    # single object shared (and possibly mutated) by every instance.
    if set_names is None:
        set_names = ["train", "valid", "test"]
    super(RIMESDatasetFormatter, self).__init__("RIMES", level, "_sem" if sem_token else "", set_names)

    self.source_fold_path = os.path.join("../raw", "RIMES")
    self.dpi = dpi
    self.sem_token = sem_token
    self.map_datasets_files.update({
        "RIMES": {
            # (1,050 for train, 100 for validation and 100 for test)
            "page": {
                "arx_files": ["RIMES_page.tar.gz", ],
                "needed_files": [],
                "format_function": self.format_rimes_page,
            },
        }
    })

    # Layout token tables and reading-order function used by sem_label / preformat_rimes_page
    self.matching_tokens_str = SEM_MATCHING_TOKENS_STR
    self.matching_tokens = SEM_MATCHING_TOKENS
    self.ordering_function = order_text_regions
def preformat_rimes_page(self):
    """
    Parse the RIMES page annotations, patch known annotation errors and group the pages by split

    Returns a dict with the keys "train", "valid" and "test"; each value is a list of pages holding the
    ordered text regions, the image path, the plain text label and the label with layout tokens.
    """
    images_dir = os.path.join(self.temp_fold, "RIMES page", "Images")
    annotations_dir = os.path.join(self.temp_fold, "RIMES page", "XML")
    annotation_paths = natural_sort([os.path.join(annotations_dir, name) for name in os.listdir(annotations_dir)])
    # Fixed split of the naturally sorted files: first 1050, next 100, then the remainder
    splits = [annotation_paths[:1050], annotation_paths[1050:1150], annotation_paths[1150:]]
    pages = {"train": [], "valid": [], "test": []}
    # Hand-written transcriptions that replace the annotated letter body of two pages
    body_fixes = {
        "01094_L.png": "Suite à la tempête du 19.11.06, un\narbre est tombé sur mon toît et l'a endommagé.\nJe d'eplore une cinquantaine de tuiles à changer,\nune poutre à réparer et une gouttière à\nremplacer. Veuillez trouver ci-joint le devis\nde réparation. Merci de m'envoyer votre\nexpert le plus rapidement possible.\nEn esperant une réponse rapide de votre\npart, veuillez accepter, madame, monsieur,\nmes salutations distinguées.",
        "01111_L.png": "Je vous ai envoyé un courrier le 20 octobre 2006\nvous signalant un sinistre survenu dans ma\nmaison, un dégât des eaux consécutif aux\nfortes pluis.\nVous deviez envoyer un expert pour constater\nles dégâts. Personne n'est venu à ce jour\nJe vous prie donc de faire le nécessaire\nafin que les réparations nécessaires puissent\nêtre commencés.\nDans l'attente, veuillez agréer, Monsieur,\nmes sincères salutations",
    }

    for set_name, split_paths in zip(self.set_names, splits):
        for xml_path in split_paths:
            root = ET.parse(xml_path).getroot()
            img_name = root.find("source").text
            regions = []
            if img_name == "01160_L.png":
                # Region added by hand for this page instead of being read from its XML
                regions.append({
                    "label": "LETTRE RECOMMANDEE\nAVEC ACCUSE DE RECEPTION",
                    "type": "",
                    "coords": {"left": 88, "right": 1364, "top": 1224, "bottom": 1448},
                })
            for box in root.findall("box"):
                region_type = box.find("type").text
                label = box.find("text").text
                # Boxes without any visible text are dropped
                if label is None or not label.strip():
                    continue
                if label == "Ref : QVLCP¨65":
                    label = label.replace("¨", "")
                if region_type == "Corps de texte" and img_name in body_fixes:
                    label = body_fixes[img_name]
                label = self.convert_label_accent(label)
                label = self.convert_label(label)
                label = self.format_text_label(label)
                corners = box.attrib
                regions.append({
                    "label": label,
                    "type": region_type,
                    "coords": {
                        "left": int(corners["top_left_x"]),
                        "right": int(corners["bottom_right_x"]),
                        "top": int(corners["top_left_y"]),
                        "bottom": int(corners["bottom_right_y"]),
                    },
                })
            regions = self.ordering_function(regions)
            pages[set_name].append({
                "text_regions": regions,
                "img_path": os.path.join(images_dir, img_name),
                "label": "\n".join([region["label"] for region in regions]),
                "sem_label": "".join([self.sem_label(region["label"], region["type"]) for region in regions]),
            })
    return pages
def convert_label_accent(self, label):
"""
Repair encoding artefacts in a raw annotation string

Applies a fixed chain of str.replace calls to label and returns the result:
the two-character sequence backslash + "n" becomes a real newline, the
non-breaking space U+00A0 becomes a plain space, and a list of markup
fragments and mis-encoded character sequences is substituted.
NOTE(review): in this copy several pairs replace a string with itself and
one call replaces the empty string with the empty string, which are no-ops;
the original mis-encoded sequences were presumably lost -- confirm against
the upstream file before relying on this table.
"""
# The calls run left to right, so a later pair sees the output of the earlier ones
return label.replace("\\n", "\n").replace("<euro>", "").replace(">euro>", "").replace(">fligne>", " ")\
.replace("¤", "¤").replace("û", "û").replace("", "").replace("ï¿©", "é").replace("ç", "ç")\
.replace("é", "é").replace("ô", "ô").replace(u'\xa0', " ").replace("è", "è").replace("°", "°")\
.replace("À", "À").replace("ì", "À").replace("ê", "ê").replace("î", "î").replace("â", "â")\
.replace("²", "²").replace("ù", "ù").replace("Ã", "à").replace("¬", "")
def format_rimes_page(self):
    """
    Write the resized page images and fill the ground truth of every split
    """
    pages_by_set = self.preformat_rimes_page()
    for set_name in self.set_names:
        set_dir = os.path.join(self.target_fold_path, set_name)
        for sample in pages_by_set[set_name]:
            # The index in the file name is the number of entries already present in the split folder
            file_name = "{}_{}.png".format(set_name, len(os.listdir(set_dir)))
            self.load_resize_save(sample["img_path"], os.path.join(set_dir, file_name), 300, self.dpi)
            regions = sample["text_regions"]
            # Region coordinates follow the image from 300 dpi to the target dpi
            for region in regions:
                region["coords"] = self.adjust_coord_ratio(region["coords"], self.dpi / 300)
            text = sample["sem_label"] if self.sem_token else sample["label"]
            self.charset = self.charset.union(set(text))
            self.gt[set_name][file_name] = {
                "text": text,
                "paragraphs": regions,
                "nb_cols": 1,
            }
def convert_label(self, label):
    """
    Reduce annotated alternatives to one reading

    Some annotations give several options for a text part, delimited with "¤" markers;
    the patterns below keep the first captured option and drop the markers.
    """
    # A label without marker is returned as is (no space collapsing, no strip)
    if "¤" not in label:
        return label
    # Order matters: each pattern runs on the output of the previous one
    option_patterns = (
        '¤{([^¤]*)[/|]([^¤]*)}¤',
        '¤{([^¤]*)[/|]([^¤]*)[/|]([^¤]*)>',
        '¤([^¤]*)[/|]([^¤]*)¤',
        '¤{}¤([^¤]*)[/|]([^ ]*)',
        '¤{/([^¤]*)/([^ ]*)',
        '¤{([^¤]*)[/|]([^ ]*)',
        '([^¤]*)/(.*)[¤}{]+',
        '[¤}{]+([^¤}{]*)[¤}{]+',
        '¤([^¤]*)¤',
    )
    for pattern in option_patterns:
        label = re.sub(pattern, r'\1', label, flags=re.DOTALL)
    # Removing markers can leave runs of spaces and dangling spaces at both ends
    single_spaced = re.sub('[ ]+', " ", label, flags=re.DOTALL)
    return single_spaced.strip()
def sem_label(self, label, type):
    """
    Surround a region text with the begin/end layout tokens of its type
    """
    # Regions with an empty type get no token: their text is returned unchanged
    if type != "":
        opening = self.matching_tokens_str[type]
        closing = self.matching_tokens[opening]
        return opening + label + closing
    return label
def order_text_regions(text_regions):
    """
    Put text regions in reading order using their bounding boxes

    Works as an insertion sort: each region is inserted in front of the first already
    placed region it has to precede, or appended when there is none. Returns a new list.
    """

    def must_precede(candidate, placed):
        # Decide from the two boxes whether candidate is read before placed
        new_box, old_box = candidate["coords"], placed["coords"]
        new_top, new_bottom = new_box["top"], new_box["bottom"]
        old_top, old_bottom = old_box["top"], old_box["bottom"]
        new_height = new_bottom - new_top
        old_height = old_bottom - old_top
        smaller_height = min(new_height, old_height)

        totally_above = new_bottom < old_top
        starts_above = new_top < old_top
        # True as soon as one horizontal edge of a box lies within the vertical span of the other
        overlaps = old_top <= new_bottom <= old_bottom or \
            old_top <= new_top <= old_bottom or \
            new_top <= old_bottom <= new_bottom or \
            new_top <= old_top <= new_bottom
        shared_height = new_bottom - old_top if starts_above else old_bottom - new_top
        shares_line = shared_height > 0.3 * smaller_height
        further_left = new_box["left"] < old_box["left"]
        similar_height = abs(new_height - old_height) < 0.3 * smaller_height
        centre_above = np.mean([new_top, new_bottom]) < old_top

        if totally_above:
            return True
        if not overlaps:
            return False
        if similar_height:
            # Enough shared height: left to right; otherwise: the one starting higher first
            return further_left if shares_line else starts_above
        return centre_above

    ordered = []
    for region in text_regions:
        for position, placed in enumerate(ordered):
            if must_precede(region, placed):
                ordered.insert(position, region)
                break
        else:
            # Precedes nothing placed so far (or nothing placed yet): goes last
            ordered.append(region)
    return ordered
if __name__ == "__main__":
    # Format the page-level dataset twice: first with layout tokens, then without
    for use_sem_token in (True, False):
        RIMESDatasetFormatter("page", sem_token=use_sem_token).format()
# Copyright Université de Rouen Normandie (1), INSA Rouen (2),
# tutelles du laboratoire LITIS (1 et 2)
# contributors :
# - Denis Coquenet
#
#
# This software is a computer program written in Python whose purpose is to
# provide public implementation of deep learning works, in pytorch.
#
# This software is governed by the CeCILL-C license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL-C
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL-C license and that you accept its terms.
from Datasets.dataset_formatters.generic_dataset_formatter import OCRDatasetFormatter from Datasets.dataset_formatters.generic_dataset_formatter import OCRDatasetFormatter
import os import os
import numpy as np import numpy as np
......
# Copyright Université de Rouen Normandie (1), INSA Rouen (2),
# tutelles du laboratoire LITIS (1 et 2)
# contributors :
# - Denis Coquenet
#
#
# This software is a computer program written in Python whose purpose is to
# provide public implementation of deep learning works, in pytorch.
#
# This software is governed by the CeCILL-C license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL-C
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL-C license and that you accept its terms.
import re import re
import random import random
import cv2 import cv2
......
"../../../Fonts/lato/Lato-HairlineItalic.ttf",
"../../../Fonts/lato/Lato-HeavyItalic.ttf",
"../../../Fonts/lato/Lato-BoldItalic.ttf",
"../../../Fonts/lato/Lato-Black.ttf",
"../../../Fonts/lato/Lato-Heavy.ttf",
"../../../Fonts/lato/Lato-Regular.ttf",
"../../../Fonts/lato/Lato-LightItalic.ttf",
"../../../Fonts/lato/Lato-Italic.ttf",
"../../../Fonts/lato/Lato-ThinItalic.ttf",
"../../../Fonts/lato/Lato-Bold.ttf",
"../../../Fonts/lato/Lato-Hairline.ttf",
"../../../Fonts/lato/Lato-Medium.ttf",
"../../../Fonts/lato/Lato-SemiboldItalic.ttf",
"../../../Fonts/lato/Lato-BlackItalic.ttf",
"../../../Fonts/lato/Lato-MediumItalic.ttf",
"../../../Fonts/lato/Lato-Semibold.ttf",
"../../../Fonts/lato/Lato-Thin.ttf",
"../../../Fonts/lato/Lato-Light.ttf",
"../../../Fonts/gentiumplus/GentiumPlus-I.ttf",
"../../../Fonts/gentiumplus/GentiumPlus-R.ttf",
"../../../Fonts/dejavu/DejaVuSansMono-BoldOblique.ttf",
"../../../Fonts/dejavu/DejaVuSerifCondensed.ttf",
"../../../Fonts/dejavu/DejaVuSans-BoldOblique.ttf",
"../../../Fonts/dejavu/DejaVuSans-ExtraLight.ttf",
"../../../Fonts/dejavu/DejaVuSansCondensed-Oblique.ttf",
"../../../Fonts/dejavu/DejaVuSerifCondensed-BoldItalic.ttf",
"../../../Fonts/dejavu/DejaVuSansCondensed-Bold.ttf",
"../../../Fonts/dejavu/DejaVuSerif-Italic.ttf",
"../../../Fonts/dejavu/DejaVuSansCondensed.ttf",
"../../../Fonts/dejavu/DejaVuSerifCondensed-Italic.ttf",
"../../../Fonts/dejavu/DejaVuSerifCondensed-Bold.ttf",
"../../../Fonts/dejavu/DejaVuSansMono.ttf",
"../../../Fonts/dejavu/DejaVuSerif-Bold.ttf",
"../../../Fonts/dejavu/DejaVuSans-Bold.ttf",
"../../../Fonts/dejavu/DejaVuSerif.ttf",
"../../../Fonts/dejavu/DejaVuSansMono-Bold.ttf",
"../../../Fonts/dejavu/DejaVuSerif-BoldItalic.ttf",
"../../../Fonts/dejavu/DejaVuSansMono-Oblique.ttf",
"../../../Fonts/dejavu/DejaVuSans.ttf",
"../../../Fonts/dejavu/DejaVuSans-Oblique.ttf",
"../../../Fonts/dejavu/DejaVuSansCondensed-BoldOblique.ttf"
\ No newline at end of file
"../../../Fonts/handwritten-mix/Parisienne-Regular.ttf",
"../../../Fonts/handwritten-mix/A little sunshine.ttf",
"../../../Fonts/handwritten-mix/Massillo.ttf",
"../../../Fonts/handwritten-mix/Cursive standard Bold.ttf",
"../../../Fonts/handwritten-mix/Merveille-mj8j.ttf",
"../../../Fonts/handwritten-mix/Cursive standard.ttf",
"../../../Fonts/handwritten-mix/Roustel.ttf",
"../../../Fonts/handwritten-mix/Baby Doll.ttf",
"../../../Fonts/handwritten-mix/flashback Demo.ttf",
"../../../Fonts/handwritten-mix/CreamShoes.ttf",
"../../../Fonts/handwritten-mix/Gentle Remind.ttf",
"../../../Fonts/handwritten-mix/Alexandria Rose.ttf",
"../../../Fonts/lato/Lato-HairlineItalic.ttf",
"../../../Fonts/lato/Lato-HeavyItalic.ttf",
"../../../Fonts/lato/Lato-BoldItalic.ttf",
"../../../Fonts/lato/Lato-Black.ttf",
"../../../Fonts/lato/Lato-Heavy.ttf",
"../../../Fonts/lato/Lato-Regular.ttf",
"../../../Fonts/lato/Lato-LightItalic.ttf",
"../../../Fonts/lato/Lato-Italic.ttf",
"../../../Fonts/lato/Lato-ThinItalic.ttf",
"../../../Fonts/lato/Lato-Bold.ttf",
"../../../Fonts/lato/Lato-Hairline.ttf",
"../../../Fonts/lato/Lato-Medium.ttf",
"../../../Fonts/lato/Lato-SemiboldItalic.ttf",
"../../../Fonts/lato/Lato-BlackItalic.ttf",
"../../../Fonts/lato/Lato-MediumItalic.ttf",
"../../../Fonts/lato/Lato-Semibold.ttf",
"../../../Fonts/lato/Lato-Thin.ttf",
"../../../Fonts/lato/Lato-Light.ttf",
"../../../Fonts/gentiumplus/GentiumPlus-I.ttf",
"../../../Fonts/gentiumplus/GentiumPlus-R.ttf",
"../../../Fonts/dejavu/DejaVuSansMono-BoldOblique.ttf",
"../../../Fonts/dejavu/DejaVuSerifCondensed.ttf",
"../../../Fonts/dejavu/DejaVuSans-BoldOblique.ttf",
"../../../Fonts/dejavu/DejaVuSans-ExtraLight.ttf",
"../../../Fonts/dejavu/DejaVuSansCondensed-Oblique.ttf",
"../../../Fonts/dejavu/DejaVuSerifCondensed-BoldItalic.ttf",
"../../../Fonts/dejavu/DejaVuSansCondensed-Bold.ttf",
"../../../Fonts/dejavu/DejaVuSerif-Italic.ttf",
"../../../Fonts/dejavu/DejaVuSansCondensed.ttf",
"../../../Fonts/dejavu/DejaVuSerifCondensed-Italic.ttf",
"../../../Fonts/dejavu/DejaVuSerifCondensed-Bold.ttf",
"../../../Fonts/dejavu/DejaVuSansMono.ttf",
"../../../Fonts/dejavu/DejaVuSerif-Bold.ttf",
"../../../Fonts/dejavu/DejaVuSans-Bold.ttf",
"../../../Fonts/dejavu/DejaVuSerif.ttf",
"../../../Fonts/dejavu/DejaVuSansMono-Bold.ttf",
"../../../Fonts/dejavu/DejaVuSerif-BoldItalic.ttf",
"../../../Fonts/dejavu/DejaVuSansMono-Oblique.ttf",
"../../../Fonts/dejavu/DejaVuSans.ttf",
"../../../Fonts/dejavu/DejaVuSans-Oblique.ttf",
"../../../Fonts/dejavu/DejaVuSansCondensed-BoldOblique.ttf",
"../../../Fonts/open-sans/OpenSans-SemiboldItalic.ttf",
"../../../Fonts/open-sans/OpenSans-CondLight.ttf",
"../../../Fonts/open-sans/OpenSans-Light.ttf",
"../../../Fonts/open-sans/OpenSans-Italic.ttf",
"../../../Fonts/open-sans/OpenSans-CondBold.ttf",
"../../../Fonts/open-sans/OpenSans-Bold.ttf",
"../../../Fonts/open-sans/OpenSans-CondLightItalic.ttf",
"../../../Fonts/open-sans/OpenSans-ExtraBold.ttf",
"../../../Fonts/open-sans/OpenSans-Semibold.ttf",
"../../../Fonts/open-sans/OpenSans-Regular.ttf",
"../../../Fonts/open-sans/OpenSans-BoldItalic.ttf",
"../../../Fonts/open-sans/OpenSans-LightItalic.ttf",
"../../../Fonts/open-sans/OpenSans-ExtraBoldItalic.ttf",
"../../../Fonts/msttcorefonts/Arial.ttf",
"../../../Fonts/msttcorefonts/Verdana_Italic.ttf",
"../../../Fonts/msttcorefonts/Georgia_Bold_Italic.ttf",
"../../../Fonts/msttcorefonts/Andale_Mono.ttf",
"../../../Fonts/msttcorefonts/Courier_New_Italic.ttf",
"../../../Fonts/msttcorefonts/Georgia_Italic.ttf",
"../../../Fonts/msttcorefonts/Arial_Black.ttf",
"../../../Fonts/msttcorefonts/Trebuchet_MS_Italic.ttf",
"../../../Fonts/msttcorefonts/Verdana.ttf",
"../../../Fonts/msttcorefonts/Courier_New.ttf",
"../../../Fonts/msttcorefonts/Verdana_Bold.ttf",
"../../../Fonts/msttcorefonts/Arial_Bold_Italic.ttf",
"../../../Fonts/msttcorefonts/Georgia.ttf",
"../../../Fonts/msttcorefonts/Trebuchet_MS_Bold_Italic.ttf",
"../../../Fonts/msttcorefonts/Impact.ttf",
"../../../Fonts/msttcorefonts/Courier_New_Bold.ttf",
"../../../Fonts/msttcorefonts/Times_New_Roman_Italic.ttf",
"../../../Fonts/msttcorefonts/Georgia_Bold.ttf",
"../../../Fonts/msttcorefonts/Times_New_Roman_Bold.ttf",
"../../../Fonts/msttcorefonts/Times_New_Roman.ttf",
"../../../Fonts/msttcorefonts/Comic_Sans_MS.ttf",
"../../../Fonts/msttcorefonts/Trebuchet_MS_Bold.ttf",
"../../../Fonts/msttcorefonts/Trebuchet_MS.ttf",
"../../../Fonts/msttcorefonts/Arial_Italic.ttf",
"../../../Fonts/msttcorefonts/Courier_New_Bold_Italic.ttf",
"../../../Fonts/msttcorefonts/Verdana_Bold_Italic.ttf",
"../../../Fonts/msttcorefonts/Arial_Bold.ttf",
"../../../Fonts/msttcorefonts/Times_New_Roman_Bold_Italic.ttf",
"../../../Fonts/msttcorefonts/Comic_Sans_MS_Bold.ttf"
\ No newline at end of file
This diff is collapsed.
# Copyright Université de Rouen Normandie (1), INSA Rouen (2),
# tutelles du laboratoire LITIS (1 et 2)
# contributors :
# - Denis Coquenet
#
#
# This software is a computer program written in Python whose purpose is to
# provide public implementation of deep learning works, in pytorch.
#
# This software is governed by the CeCILL-C license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL-C
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL-C license and that you accept its terms.
import os import os
import sys import sys
......
# Copyright Université de Rouen Normandie (1), INSA Rouen (2),
# tutelles du laboratoire LITIS (1 et 2)
# contributors :
# - Denis Coquenet
#
#
# This software is a computer program written in Python whose purpose is to
# provide public implementation of deep learning works, in pytorch.
#
# This software is governed by the CeCILL-C license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL-C
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL-C license and that you accept its terms.
import os import os
import sys import sys
......
# Copyright Université de Rouen Normandie (1), INSA Rouen (2),
# tutelles du laboratoire LITIS (1 et 2)
# contributors :
# - Denis Coquenet
#
#
# This software is a computer program written in Python whose purpose is to
# provide public implementation of deep learning works, in pytorch.
#
# This software is governed by the CeCILL-C license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL-C
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL-C license and that you accept its terms.
from torch.nn.functional import log_softmax from torch.nn.functional import log_softmax
from torch.nn import AdaptiveMaxPool2d, Conv1d from torch.nn import AdaptiveMaxPool2d, Conv1d
......
# Copyright Université de Rouen Normandie (1), INSA Rouen (2),
# tutelles du laboratoire LITIS (1 et 2)
# contributors :
# - Denis Coquenet
#
#
# This software is a computer program written in Python whose purpose is to
# provide public implementation of deep learning works, in pytorch.
#
# This software is governed by the CeCILL-C license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL-C
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL-C license and that you accept its terms.
from basic.metric_manager import MetricManager from basic.metric_manager import MetricManager
from OCR.ocr_manager import OCRManager from OCR.ocr_manager import OCRManager
......
# Copyright Université de Rouen Normandie (1), INSA Rouen (2),
# tutelles du laboratoire LITIS (1 et 2)
# contributors :
# - Denis Coquenet
#
#
# This software is a computer program written in Python whose purpose is to
# provide public implementation of deep learning works, in pytorch.
#
# This software is governed by the CeCILL-C license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL-C
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL-C license and that you accept its terms.
from basic.generic_training_manager import GenericTrainingManager from basic.generic_training_manager import GenericTrainingManager
import os import os
from PIL import Image from PIL import Image
......
...@@ -49,68 +49,6 @@ Install the dependencies: ...@@ -49,68 +49,6 @@ Install the dependencies:
pip install -r requirements.txt pip install -r requirements.txt
``` ```
## Datasets
This section is dedicated to the datasets used in the paper: download and formatting instructions are provided
for experiment replication purposes.
The RIMES dataset at page level was distributed during the [evaluation campaign of 2009](https://ieeexplore.ieee.org/document/5277557).
READ 2016 dataset corresponds to the one used in the [ICFHR 2016 competition on handwritten text recognition](https://ieeexplore.ieee.org/document/7814136).
It can be found [here](https://zenodo.org/record/1164045#.YiINkBvjKEA)
Raw dataset files must be placed in Datasets/raw/{dataset_name} \
where dataset name is "READ 2016" or "RIMES"
## Training And Evaluation
### Step 1: Download the dataset
### Step 2: Format the dataset
```
python3 Datasets/dataset_formatters/read2016_formatter.py
python3 Datasets/dataset_formatters/rimes_formatter.py
```
### Step 3: Add any font you want as .ttf file in the folder Fonts
### Step 4 : Generate synthetic line dataset for pre-training
```
python3 OCR/line_OCR/ctc/main_syn_line.py
```
Two lines in this script must be adapted to the dataset used:
```
model.generate_syn_line_dataset("READ_2016_syn_line")
dataset_name = "READ_2016"
```
### Step 5 : Pre-training on synthetic lines
```
python3 OCR/line_OCR/ctc/main_line_ctc.py
```
Two lines in this script must be adapted to the dataset used:
```
dataset_name = "READ_2016"
"output_folder": "FCN_read_line_syn"
```
Weights and evaluation results are stored in OCR/line_OCR/ctc/outputs
### Step 6 : Training the DAN
```
python3 OCR/document_OCR/dan/main_dan.py
```
The following lines must be adapted to the dataset used and pre-training folder names:
```
dataset_name = "READ_2016"
"transfer_learning": {
# model_name: [state_dict_name, checkpoint_path, learnable, strict]
"encoder": ["encoder", "../../line_OCR/ctc/outputs/FCN_read_2016_line_syn/checkpoints/best.pt", True, True],
"decoder": ["decoder", "../../line_OCR/ctc/outputs/FCN_read_2016_line_syn/best.pt", True, False],
},
```
Weights and evaluation results are stored in OCR/document_OCR/dan/outputs
### Remarks (for pre-training and training) ### Remarks (for pre-training and training)
All hyperparameters are specified and editable in the training scripts (meaning are in comments).\ All hyperparameters are specified and editable in the training scripts (meaning are in comments).\
Evaluation is performed just after training ending (training is stopped when the maximum elapsed time is reached or after a maximum number of epoch as specified in the training script).\ Evaluation is performed just after training ending (training is stopped when the maximum elapsed time is reached or after a maximum number of epoch as specified in the training script).\
...@@ -154,20 +92,3 @@ To run the inference on a GPU, one can replace `cpu` by the name of the GPU. In ...@@ -154,20 +92,3 @@ To run the inference on a GPU, one can replace `cpu` by the name of the GPU. In
```python ```python
text, confidence_scores = model.predict(image, confidences=True) text, confidence_scores = model.predict(image, confidences=True)
``` ```
## Citation
```bibtex
@misc{Coquenet2022b,
author = {Coquenet, Denis and Chatelain, Clément and Paquet, Thierry},
title = {DAN: a Segmentation-free Document Attention Network for Handwritten Document Recognition},
doi = {10.48550/ARXIV.2203.12273},
url = {https://arxiv.org/abs/2203.12273},
publisher = {arXiv},
year = {2022},
}
```
## License
This whole project is under the CeCILL-C license.
# Copyright Université de Rouen Normandie (1), INSA Rouen (2),
# tutelles du laboratoire LITIS (1 et 2)
# contributors :
# - Denis Coquenet
#
#
# This software is a computer program written in Python whose purpose is to
# provide public implementation of deep learning works, in pytorch.
#
# This software is governed by the CeCILL-C license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL-C
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL-C license and that you accept its terms.
import torch import torch
import random import random
from torch.utils.data import Dataset, DataLoader from torch.utils.data import Dataset, DataLoader
......
# Copyright Université de Rouen Normandie (1), INSA Rouen (2),
# tutelles du laboratoire LITIS (1 et 2)
# contributors :
# - Denis Coquenet
#
#
# This software is a computer program written in Python whose purpose is to
# provide public implementation of deep learning works, in pytorch.
#
# This software is governed by the CeCILL-C license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL-C
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL-C license and that you accept its terms.
import torch import torch
import os import os
import sys import sys
......
# Copyright Université de Rouen Normandie (1), INSA Rouen (2),
# tutelles du laboratoire LITIS (1 et 2)
# contributors :
# - Denis Coquenet
#
#
# This software is a computer program written in Python whose purpose is to
# provide public implementation of deep learning works, in pytorch.
#
# This software is governed by the CeCILL-C license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL-C
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL-C license and that you accept its terms.
from Datasets.dataset_formatters.rimes_formatter import SEM_MATCHING_TOKENS as RIMES_MATCHING_TOKENS from Datasets.dataset_formatters.rimes_formatter import SEM_MATCHING_TOKENS as RIMES_MATCHING_TOKENS
from Datasets.dataset_formatters.read2016_formatter import SEM_MATCHING_TOKENS as READ_MATCHING_TOKENS from Datasets.dataset_formatters.read2016_formatter import SEM_MATCHING_TOKENS as READ_MATCHING_TOKENS
......
# Copyright Université de Rouen Normandie (1), INSA Rouen (2),
# tutelles du laboratoire LITIS (1 et 2)
# contributors :
# - Denis Coquenet
#
#
# This software is a computer program written in Python whose purpose is to
# provide public implementation of deep learning works, in pytorch.
#
# This software is governed by the CeCILL-C license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL-C
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL-C license and that you accept its terms.
from torch.nn import Dropout, Dropout2d from torch.nn import Dropout, Dropout2d
import numpy as np import numpy as np
......
# Copyright Université de Rouen Normandie (1), INSA Rouen (2),
# tutelles du laboratoire LITIS (1 et 2)
# contributors :
# - Denis Coquenet
#
#
# This software is a computer program written in Python whose purpose is to
# provide public implementation of deep learning works, in pytorch.
#
# This software is governed by the CeCILL-C license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL-C
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL-C license and that you accept its terms.
import numpy as np import numpy as np
from numpy import random from numpy import random
......
# Copyright Université de Rouen Normandie (1), INSA Rouen (2),
# tutelles du laboratoire LITIS (1 et 2)
# contributors :
# - Denis Coquenet
#
#
# This software is a computer program written in Python whose purpose is to
# provide public implementation of deep learning works, in pytorch.
#
# This software is governed by the CeCILL-C license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL-C
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL-C license and that you accept its terms.
import numpy as np import numpy as np
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment