From aa2867dcba3b81f538549f66249c917495e472bb Mon Sep 17 00:00:00 2001 From: Bastien Abadie <abadie@teklia.com> Date: Wed, 28 Aug 2024 09:46:36 +0000 Subject: [PATCH] Import model pylaia_fraktur from pylaia --- pylaia/fraktur/README.md | 16 +++ pylaia/fraktur/language_model.arpa.gz | 3 + pylaia/fraktur/lexicon.txt | 160 ++++++++++++++++++++++++++ pylaia/fraktur/model | Bin 0 -> 1515 bytes pylaia/fraktur/syms.txt | 160 ++++++++++++++++++++++++++ pylaia/fraktur/tokens.txt | 160 ++++++++++++++++++++++++++ pylaia/fraktur/weights.ckpt | 3 + 7 files changed, 502 insertions(+) create mode 100644 pylaia/fraktur/README.md create mode 100644 pylaia/fraktur/language_model.arpa.gz create mode 100644 pylaia/fraktur/lexicon.txt create mode 100644 pylaia/fraktur/model create mode 100644 pylaia/fraktur/syms.txt create mode 100644 pylaia/fraktur/tokens.txt create mode 100644 pylaia/fraktur/weights.ckpt diff --git a/pylaia/fraktur/README.md b/pylaia/fraktur/README.md new file mode 100644 index 0000000..ee4c8bd --- /dev/null +++ b/pylaia/fraktur/README.md @@ -0,0 +1,16 @@ +# PyLaia Austrian Newspaper (fraktur) + +## Datasets + +Trained on horizontal text-lines from the [Austrian Newspaper](https://demo.arkindex.org/browse/4dc4af87-20d0-4101-8ce9-6e427517c2b2?top_level=true&folder=true) corpus. + +## Results + +* Fixed line height of 128 pixels +* LM = kenlm 6-gram character model trained on the training set + +| Model | Split | CER (%) | WER (%) | Support | +|------------|---------|-----------|-----------|-----------| +| PyLaia | train | 1.62 | 5.63 | 38891 | +| PyLaia | val | 1.82 | 7.77 | 3282 | +| PyLaia+LM | val | 1.77 | 7.01 | 3282 | diff --git a/pylaia/fraktur/language_model.arpa.gz b/pylaia/fraktur/language_model.arpa.gz new file mode 100644 index 0000000..d49869f --- /dev/null +++ b/pylaia/fraktur/language_model.arpa.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22cc1a2cba477088a94908c5ab9ae9482a3641a87b6a7a79909b1ce8544c4d42 +size 28302572 diff --git a/pylaia/fraktur/lexicon.txt b/pylaia/fraktur/lexicon.txt new file mode 100644 index 0000000..ad60e5b --- /dev/null +++ b/pylaia/fraktur/lexicon.txt @@ -0,0 +1,160 @@ +<ctc> <ctc> +! ! +" " +# # +% % +& & +' ' +( ( +) ) +* * ++ + +, , +- - +. . +/ / +0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +: : +; ; += = +? ? +A A +B B +C C +D D +E E +F F +G G +H H +I I +J J +K K +L L +M M +N N +O O +P P +Q Q +R R +S S +T T +U U +V V +W W +X X +Y Y +Z Z +[ [ +] ] +_ _ +a a +b b +c c +d d +e e +f f +g g +h h +i i +j j +k k +l l +m m +n n +o o +p p +q q +r r +s s +t t +u u +v v +w w +x x +y y +z z +| | +~ ~ +§ § +« « +¬ ¬ +° ° +± ± +´ ´ +· · +» » +¼ ¼ +½ ½ +¾ ¾ +Ä Ä +È È +É É +Ê Ê +Ó Ó +Ô Ô +Ö Ö +Ãœ Ãœ +ß ß +à à +á á +â â +ã ã +ä ä +æ æ +ç ç +è è +é é +ê ê +ë ë +ï ï +ñ ñ +ò ò +ó ó +ô ô +ö ö +û û +ü ü +Å™ Å™ +Å Å +Ž Ž +ž ž +Ë™ Ë™ +· · +Δ Δ +Ο Ο +– – +— — +‘ ‘ +’ ’ +‚ ‚ +“ “ +††+„ „ +††+• • +… … +â…“ â…“ +â…• â…• +â…™ â…™ +â…› â…› +â…œ â…œ +â… â… +â…ž â…ž +∆ ∆ +≅ ≅ +â• â• +â–„ â–„ +â–¡ â–¡ +â—‹ â—‹ +â— â— +<unk> <unk> +<space> <space> diff --git a/pylaia/fraktur/model b/pylaia/fraktur/model new file mode 100644 index 0000000000000000000000000000000000000000..75fb6dd19d327331f8ad400d8e8f7d2fbbb4dd1c GIT binary patch literal 1515 zcmZ`(|8^5a6yByZZI`x`S`kF6C~B(hmLCd=BG_uNZW>~tx}xhk-3*x}o7sH(qgc;D zoj-C8eF=|`;v0B;10TgZn<;GoCnslTzWd#8?#$eKo9^f_%bJ+5o*6UNywy!wJYH$B zoH@NtyuLbX-Fs^MUU>wo=2gj^$VJRC;+>&I%p&HL?wKR<)Y0o8Or=D{qYW*|x*VgP z-nUMswXH@2$4U!J+Zo0dOUcf|a#ekV@tQZM-HL7y$zG8Mp<q(-IKxEvAWJ0idMwSj zlkfE+*|wWW7Uj_{-;lYDNq3|VTj97jO&YsA&7+XTLC$~9F}3f_5*SJuXz}};<weRf zO#7IriAi^=j}r>BG&|kLNrgF@UFhQlh55nkMTJw`J7F|BJf*0DEc%7xB_F42KF(;F z(LP>QI4jO_AFmkTRfPqyK)`DTcwOO~I7h%626$89yg1LfXB+t}PooycTRz?<<##lV zj#g7x6pQ5dt^wXtSQ1MFEE~X4SP?4(Trj{zg-hZRDNh;YRur>5N+d4(c%QUCFvKeg zSH)HG{Lla&DO?lR2>93lpD0`x*9rL405=qFikqZ8W3*XFPccyc%*Sdi%#$<}j+8n> zC}%R~kYU$3>+Egu`nHWVQT`5t>?KKDUhU^TzMvr2YtMcBMj{)$O&gl`zvWi?_)=j* zY*3hA)kF!n3UyH@U{eED9}R`u;x++I!}wZZOKj0Wjvri7n)H$)#~mMcpTG55oMgPU zEu)+=kEgpVwy}MS+{@9&Hwxd1qkyOI-4Jk3;rk)r2L+#(nX8j@hvlJ=N!rDaK7!g~ z8r+*C^&;azmt~y{Kkb+KLd6!uY>%fI*dPOU6vC3LO$Ei6p(Q3aElMG-aKAjJHnq$# z+Z4x*e-A<z3Q<|GnnGp9@!;8qB%58ob9d85r<9AML%DcJuB#xulO(RojxJRfb<B2j zNhj`~WkE$<*5*ORQE(aZ6g_WT^ApLJFJ1wP!fCqW6Tkj-YW_FsDf-|1-*z@_RZTmi z`p+oQ*H<gl%`V$o;-5d-+ZB@5|E4>lKTmq%QF)!v!(+HZKJ9x{SGS(S@izrK@HOEy v-D-8o*cV2ZpD@c<Xm#*?Xe@JMg9-WROR@INdW7apcT6{A@~K#wKj8icAIzI^ literal 0 HcmV?d00001 diff --git a/pylaia/fraktur/syms.txt b/pylaia/fraktur/syms.txt new file mode 100644 index 0000000..3b82158 --- /dev/null +++ b/pylaia/fraktur/syms.txt @@ -0,0 +1,160 @@ +<ctc> 0 +! 1 +" 2 +# 3 +% 4 +& 5 +' 6 +( 7 +) 8 +* 9 ++ 10 +, 11 +- 12 +. 13 +/ 14 +0 15 +1 16 +2 17 +3 18 +4 19 +5 20 +6 21 +7 22 +8 23 +9 24 +: 25 +; 26 += 27 +? 28 +A 29 +B 30 +C 31 +D 32 +E 33 +F 34 +G 35 +H 36 +I 37 +J 38 +K 39 +L 40 +M 41 +N 42 +O 43 +P 44 +Q 45 +R 46 +S 47 +T 48 +U 49 +V 50 +W 51 +X 52 +Y 53 +Z 54 +[ 55 +] 56 +_ 57 +a 58 +b 59 +c 60 +d 61 +e 62 +f 63 +g 64 +h 65 +i 66 +j 67 +k 68 +l 69 +m 70 +n 71 +o 72 +p 73 +q 74 +r 75 +s 76 +t 77 +u 78 +v 79 +w 80 +x 81 +y 82 +z 83 +| 84 +~ 85 +§ 86 +« 87 +¬ 88 +° 89 +± 90 +´ 91 +· 92 +» 93 +¼ 94 +½ 95 +¾ 96 +Ä 97 +È 98 +É 99 +Ê 100 +Ó 101 +Ô 102 +Ö 103 +Ãœ 104 +ß 105 +à 106 +á 107 +â 108 +ã 109 +ä 110 +æ 111 +ç 112 +è 113 +é 114 +ê 115 +ë 116 +ï 117 +ñ 118 +ò 119 +ó 120 +ô 121 +ö 122 +û 123 +ü 124 +Å™ 125 +Å 126 +Ž 127 +ž 128 +Ë™ 129 +· 130 +Δ 131 +Ο 132 +– 133 +— 134 +‘ 135 +’ 136 +‚ 137 +“ 138 +†139 +„ 140 +†141 +• 142 +… 143 +â…“ 144 +â…• 145 +â…™ 146 +â…› 147 +â…œ 148 +â… 149 +â…ž 150 +∆ 151 +≅ 152 +â• 153 +â–„ 154 +â–¡ 155 +â—‹ 156 +â— 157 +<unk> 158 +<space> 159 \ No newline at end of file diff --git a/pylaia/fraktur/tokens.txt b/pylaia/fraktur/tokens.txt new file mode 100644 index 0000000..3a3678b --- /dev/null +++ b/pylaia/fraktur/tokens.txt @@ -0,0 +1,160 @@ +<ctc> +! +" +# +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; += +? +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +] +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +| +~ +§ +« +¬ +° +± +´ +· +» +¼ +½ +¾ +Ä +È +É +Ê +Ó +Ô +Ö +Ãœ +ß +à +á +â +ã +ä +æ +ç +è +é +ê +ë +ï +ñ +ò +ó +ô +ö +û +ü +Å™ +Å +Ž +ž +Ë™ +· +Δ +Ο +– +— +‘ +’ +‚ +“ +†+„ +†+• +… +â…“ +â…• +â…™ +â…› +â…œ +â… +â…ž +∆ +≅ +â• +â–„ +â–¡ +â—‹ +â— +<unk> +<space> \ No newline at end of file diff --git a/pylaia/fraktur/weights.ckpt b/pylaia/fraktur/weights.ckpt new file mode 100644 index 0000000..409bb4c --- /dev/null +++ b/pylaia/fraktur/weights.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:263563dfd71bce1b6e42156a8f9b70dc09795f585a28c4d098d17952f4b76225 +size 42996316 -- GitLab