From 1060ba090780690a94861bedf6e068aa7f3c9305 Mon Sep 17 00:00:00 2001 From: Bastien Abadie <abadie@teklia.com> Date: Wed, 28 Aug 2024 07:24:32 +0000 Subject: [PATCH] Import model pylaia_alcar from pylaia --- pylaia/alcar/README.md | 24 +++++ pylaia/alcar/language_model.arpa.gz | 3 + pylaia/alcar/lexicon.txt | 130 ++++++++++++++++++++++++++++ pylaia/alcar/model | Bin 0 -> 1519 bytes pylaia/alcar/syms.txt | 130 ++++++++++++++++++++++++++++ pylaia/alcar/tokens.txt | 130 ++++++++++++++++++++++++++++ pylaia/alcar/weights.ckpt | 3 + 7 files changed, 420 insertions(+) create mode 100644 pylaia/alcar/README.md create mode 100644 pylaia/alcar/language_model.arpa.gz create mode 100644 pylaia/alcar/lexicon.txt create mode 100644 pylaia/alcar/model create mode 100644 pylaia/alcar/syms.txt create mode 100644 pylaia/alcar/tokens.txt create mode 100644 pylaia/alcar/weights.ckpt diff --git a/pylaia/alcar/README.md b/pylaia/alcar/README.md new file mode 100644 index 0000000..9e4fe74 --- /dev/null +++ b/pylaia/alcar/README.md @@ -0,0 +1,24 @@ +# PyLaia Alcar + +## Datasets + +* Trained on [HOME Alcar](https://demo.arkindex.org/browse/46b9b1f4-baeb-4342-a501-e2f15472a276?top_level=true&folder=true) and [Himanis](https://arkindex.teklia.com/browse/2f6e26b0-5fdd-4193-bb30-a3162b96280c?top_level=true&folder=true). +* Text-lines are resized to a fixed height of 128 pixels. +* The language model is a 6-gram character model trained only on the training set of HOME-Alcar. + +| split | N lines Alcar | N lines Himanis | Total | +| ----- | ------------: | --------------: | -----: | +| train | 59,969 | 18,504 | 78,473 | +| val | 7,905 | 2,367 | 10,272 | +| test | 6,932 | 2,241 | 9,173 | + +## Results + +* Evaluation on the test set of HOME-Alcar + +| Split | LM | Evaluation method | CER (%) | WER (%) | Support | +| ----- | --- | ------------------ | -------- | --------- | ------- | +| test | No | basic | 8.35 | 26.15 | 6932 | +| test | No | escape punctuation | 8.35 | 24.6 | 6932 | +| test | Yes | basic | **7.85** | **23.2** | 6932 | +| test | Yes | escape punctuation | **7.85** | **21.76** | 6932 | \ No newline at end of file diff --git a/pylaia/alcar/language_model.arpa.gz b/pylaia/alcar/language_model.arpa.gz new file mode 100644 index 0000000..8cb2589 --- /dev/null +++ b/pylaia/alcar/language_model.arpa.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b025c7b810bf208f4efed66a87cdff5dda9cfd6f1c2a128f69df337ab7ca7db4 +size 19663009 diff --git a/pylaia/alcar/lexicon.txt b/pylaia/alcar/lexicon.txt new file mode 100644 index 0000000..cd67081 --- /dev/null +++ b/pylaia/alcar/lexicon.txt @@ -0,0 +1,130 @@ +<ctc> <ctc> +! ! +& & +# # +' ' +( ( +) ) +* * ++ + +, , +- - +. . +/ / +0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +: : +; ; += = +? ? +A A +B B +C C +D D +E E +F F +G G +H H +I I +J J +K K +L L +M M +N N +O O +P P +Q Q +R R +S S +T T +U U +V V +W W +X X +Y Y +Z Z +[ [ +] ] +a a +b b +c c +d d +e e +f f +g g +h h +i i +j j +k k +l l +m m +n n +o o +p p +q q +r r +s s +t t +u u +v v +w w +x x +y y +z z +| | +~ ~ +’ ’ +© © +§ § +ª ª +« « +¬ ¬ +¯ ¯ +° ° +¶ ¶ +º º +» » +¿ ¿ +À À +  +à à +Ç Ç +É É +à à +Ü Ü +à à +á á +â â +æ æ +ç ç +è è +é é +ë ë +ì ì +à à +î î +ï ï +ñ ñ +ú ú +ù ù +û û +ÿ ÿ +ę ę +ŠŠ+œ œ +ȩ ȩ +— — +‘ ‘ +’ ’ +… … +††+<unk> <unk> +<space> <space> diff --git a/pylaia/alcar/model b/pylaia/alcar/model new file mode 100644 index 0000000000000000000000000000000000000000..9aa1a804be6c535a90c03fcde5c18ebcb7f5eac4 GIT binary patch literal 1519 zcmZ`(YjYGu6rF6|Lr6k=foK#(m}s)$p`a*&i3Y=LmL(7!(Q)j|HrvC__UxHQG+LB4 zRa+$={0n~Z2U-3DtNbJ0p6xuAM{jLS_qpeuzTJKAZMefDmNhY9JvC;nIg6#Ch<5mL zljW@1ZO3aX)7HI#@mu8~EXNxo_fFC*VvZs21TA6~u__@v&Z;6$t9l)Tsg$UAw4o(g zhhx~&`_{;`x>>Jdq_nWKm0@(Dlx+W4uEri>-0|kLThR$3*)8%Q6iiA|$3*xbOC<5S zEX{Z|-|a@SWjB&6$|>}9ncJ9jhkCFTj(A5&V~3}C6tXzT`A<2f9(%I{hEfJv{61%S zk@5^jeM~!I(w*vIMq!p_$9p)YFh{d<J-ndsVt@9M!pq#7F&b^2QdB_}{mk)-kK>My z6Iy1thgTI&ij&;OYX*2-VP4D=@P+~2R5&G05%87)-c~p*PIK<rMn22asLAn;k29ov zR@3Ncj>3XiAisAF@Seh=SR`P{09A!$u}r`@1Dsd5ATE&dlu>R*G0USw;-ZiDN&5ps zyrghhTqe&C4e*h|6>)`tj}7pN!c}pVfKLr@P2sw@PTJE(n}zfgef7_LtT<twP-0c3 z)EPoKlT{BGc8#<4?k2Bo*;p0j?;yx-lEmfJe(vK73UbYP=Hu5BS?4X<(7gW*x6;Fx z3hQE>!o2B-5^xo2qDH`m2CN?H3b({90vd*KTVYdd(m{^wT~eBKlOo3*A9tU<^=h1C zytyT#oHCE6J1n-bb%Wf?(Zg2?UyH+lr|``Ha8Kde0pL3YpO=}dlXRQqp^!=1!S_A_ z=XV<1n<Vuj<3WdI?F>IWF7t(oEr{7JPcyK725u{aC0CmYiZMe|Om0||LR{f~c}#6; znq#&oj%)w!g)S7LvS2lZ%8cW|(+^2DyLRX9hK+V97e@zj@qk=ML3+nXT$dePsxIo7 zZR?Uw+(%_WMIF}SLB>&V8S)feZ(Q>e$(PPw0*a#AaK|U6{=5CxuhdiYzwz95He-$K zw=?Ph8KwH#ie+82wZuPvwzeuHt^W;oNPqTw;$eB+DF(-r9r8)<!^*lx9E`sy+&ym- xP{SP?TQrRM;iY|U84s=YzP}jD+-QG7e)?jpKg@cF<_&jLH)ZmvSeoDG{tv@!o-qIb literal 0 HcmV?d00001 diff --git a/pylaia/alcar/syms.txt b/pylaia/alcar/syms.txt new file mode 100644 index 0000000..9cfa4ce --- /dev/null +++ b/pylaia/alcar/syms.txt @@ -0,0 +1,130 @@ +<ctc> 0 +! 1 +& 2 +# 3 +' 4 +( 5 +) 6 +* 7 ++ 8 +, 9 +- 10 +. 11 +/ 12 +0 13 +1 14 +2 15 +3 16 +4 17 +5 18 +6 19 +7 20 +8 21 +9 22 +: 23 +; 24 += 25 +? 26 +A 27 +B 28 +C 29 +D 30 +E 31 +F 32 +G 33 +H 34 +I 35 +J 36 +K 37 +L 38 +M 39 +N 40 +O 41 +P 42 +Q 43 +R 44 +S 45 +T 46 +U 47 +V 48 +W 49 +X 50 +Y 51 +Z 52 +[ 53 +] 54 +a 55 +b 56 +c 57 +d 58 +e 59 +f 60 +g 61 +h 62 +i 63 +j 64 +k 65 +l 66 +m 67 +n 68 +o 69 +p 70 +q 71 +r 72 +s 73 +t 74 +u 75 +v 76 +w 77 +x 78 +y 79 +z 80 +| 81 +~ 82 +’ 83 +© 84 +§ 85 +ª 86 +« 87 +¬ 88 +¯ 89 +° 90 +¶ 91 +º 92 +» 93 +¿ 94 +À 95 + 96 +à 97 +Ç 98 +É 99 +à 100 +Ü 101 +à 102 +á 103 +â 104 +æ 105 +ç 106 +è 107 +é 108 +ë 109 +ì 110 +à 111 +î 112 +ï 113 +ñ 114 +ú 115 +ù 116 +û 117 +ÿ 118 +ę 119 +Š120 +œ 121 +ȩ 122 +— 123 +‘ 124 +’ 125 +… 126 +†127 +<unk> 128 +<space> 129 diff --git a/pylaia/alcar/tokens.txt b/pylaia/alcar/tokens.txt new file mode 100644 index 0000000..97a2fa4 --- /dev/null +++ b/pylaia/alcar/tokens.txt @@ -0,0 +1,130 @@ +<ctc> +! +& +# +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; += +? +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +] +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +| +~ +’ +© +§ +ª +« +¬ +¯ +° +¶ +º +» +¿ +À + +à +Ç +É +à +Ü +à +á +â +æ +ç +è +é +ë +ì +à +î +ï +ñ +ú +ù +û +ÿ +ę +Š+œ +ȩ +— +‘ +’ +… +†+<unk> +<space> diff --git a/pylaia/alcar/weights.ckpt b/pylaia/alcar/weights.ckpt new file mode 100644 index 0000000..6fde10d --- /dev/null +++ b/pylaia/alcar/weights.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d11c5c5b6b01a8a45ab48f5433e6f86a5266b8c55fc82c68ac05cd3fe2f9c2a7 +size 42863420 -- GitLab