From 0f7789f131735197685db0280309758f71d3be90 Mon Sep 17 00:00:00 2001 From: Bastien Abadie <abadie@teklia.com> Date: Wed, 28 Aug 2024 09:46:19 +0000 Subject: [PATCH] Import model pylaia_rimes from pylaia --- pylaia/rimes/README.md | 21 ++++++ pylaia/rimes/language_model.arpa.gz | 3 + pylaia/rimes/lexicon.txt | 100 ++++++++++++++++++++++++++++ pylaia/rimes/model | Bin 0 -> 1515 bytes pylaia/rimes/syms.txt | 100 ++++++++++++++++++++++++++++ pylaia/rimes/tokens.txt | 100 ++++++++++++++++++++++++++++ pylaia/rimes/weights.ckpt | 3 + 7 files changed, 327 insertions(+) create mode 100644 pylaia/rimes/README.md create mode 100644 pylaia/rimes/language_model.arpa.gz create mode 100644 pylaia/rimes/lexicon.txt create mode 100644 pylaia/rimes/model create mode 100644 pylaia/rimes/syms.txt create mode 100644 pylaia/rimes/tokens.txt create mode 100644 pylaia/rimes/weights.ckpt diff --git a/pylaia/rimes/README.md b/pylaia/rimes/README.md new file mode 100644 index 0000000..4b87701 --- /dev/null +++ b/pylaia/rimes/README.md @@ -0,0 +1,21 @@ +# PyLaia Rimes + +## Datasets + +Trained on text-lines from the [Rimes 2011 dataset](https://teklia.com/research/rimes-database/). + +| split | N lines | +|--------|--------:| +| train | 10,188 | +| val | 1,138 | +| test | 778 | + +## Results + +* Fixed line height: 128 pixels +* Language model: 6-gram character model trained on the training set with KenLM + +| Model | val CER | test CER | val WER | test WER | +|:--------------------------------|--------:|---------:|--------:|---------:| +| Model without LM | 4.55 | 4.53 | 14.39 | 15.06 | +| Model with LM (`weight = 1.5`) | 3.68 | 3.47 | 10.01 | 10.20 | diff --git a/pylaia/rimes/language_model.arpa.gz b/pylaia/rimes/language_model.arpa.gz new file mode 100644 index 0000000..fae3587 --- /dev/null +++ b/pylaia/rimes/language_model.arpa.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:119d9433498a3c17f934be86db2dbc7794cc5ff3861c8d0a4fa8e197f10658d5 +size 5140408 diff --git a/pylaia/rimes/lexicon.txt b/pylaia/rimes/lexicon.txt new file mode 100644 index 0000000..b33b289 --- /dev/null +++ b/pylaia/rimes/lexicon.txt @@ -0,0 +1,100 @@ +<ctc> <ctc> +! ! +" " +% % +' ' +( ( +) ) +, , +- - +. . +/ / +0 0 +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +7 7 +8 8 +9 9 +: : +; ; += = +? ? +A A +B B +C C +D D +E E +F F +G G +H H +I I +J J +K K +L L +M M +N N +O O +P P +Q Q +R R +S S +T T +U U +V V +W W +X X +Y Y +Z Z +_ _ +a a +b b +c c +d d +e e +f f +g g +h h +i i +j j +k k +l l +m m +n n +o o +p p +q q +r r +s s +t t +u u +v v +w w +x x +y y +z z +{ { +} } +¤ ¤ +° ° +² ² +À À +É É +à à +â â +ç ç +è è +é é +ê ê +ë ë +î î +ô ô +ù ù +û û +œ œ +€ € +<unk> <unk> +<space> <space> diff --git a/pylaia/rimes/model b/pylaia/rimes/model new file mode 100644 index 0000000000000000000000000000000000000000..d4cd1bf7472e430655c1a7be353b957b0f99af95 GIT binary patch literal 1515 zcmZ`(e|Hl_6y2sYZI`x`3JRiC1ZAsj$`1ua5p1<kHw~dsUD0)&ZidW~&1~-eDAsd0 zI)8*?zl2}O<2Uf5_+~Su4dCSD?96@l-FY+f-fOy}M=Wb%!g^`USSPG*(&F)Qi{-4= z>%?m-v)02G#vhfZupF;S?nEwPjuG!PEn*h2TIrtEM4s04ItWuKQSoR)OR_G<sHgX> zlWBFc(ZG??!qRqzv4v8y^Q2r=pJLqc=CoVU4I<eq@*osUN*-sJ2p?yOBwmlD8L#EL zy-2q0W|BpDw8PhBZe!9N>BCky>P?fz4o~waWO0!5Uvo_Dd9wtDQU+T55odXk@(j~H zW*jl;PW5q2VU}j6`Z%sIN3)B4yrytsFgvO6I`@tljSf#KsvwJg<9NfzDaXfYEi>B3 zn+j*d8SdjP1H7#;FXjn&#{lmtoE2vYc+UXm6wZtDoO`yB&+;^CalG&215&=AX>>G4 zVL>dA--ibHNMTVd60l@|n!>VJCg7q0E-73Vmq~fbD7T`R<xwJW#mC2_{fQx7Rk$Xu zk>{rd_)OutxK6<52KYkZhPXk%mj<}0a7)}G?HQxZLVAjU`d2<yoG?$)P}HQ<8A3Ue z)eabTowLsFCa-VVSQX{(Ajn>l#O2j~?c*B?a?N?=<2Mr7;BDH_y#H;t(#IWzb+Jxi ze(Q)5a24vJPQZo+tUekFcg0--nuc*tVN-0<L5}WUQkwLVBFB9n4_>|XYMf-ewI!pR zGLNS_EVi+Ao7~IM$9D?fi^G7Y@WT-BP~pcR;3oy2mzk@RberX&kV)Fb&pra@Pa52t zB=sWWL6>En48QD^`9j4O#B7(R8Q35Lw-v&Yt4#&Pn4u*mH!MmauJEWlrZ%<AG20Z! z&42ep7Yb2Xu$n?;#_{;&ha{U_zyDyvMyHgEqXW5kNUp0Oz2hXV%Z@Hp7j?|Gbx9{~ zt1PIf%i281I0`O9o}%ZCYkng6(xt0FQPi65_{8$FlbzqGr|5t4f7{u#RW<F5>OZ4I zUt6hAH@jkMiGTiTZ&gTI|C{cJ{yguAhvjub50Bvv`J(S(UEO*P#@`g|z}JM+bgR`x rW1k;gdd@6kq1FEPiLuO$4JPELFU8t3>k*na-7(#e$){pz{($=*Yj~Q^ literal 0 HcmV?d00001 diff --git a/pylaia/rimes/syms.txt b/pylaia/rimes/syms.txt new file mode 100644 index 0000000..bb2efde --- /dev/null +++ b/pylaia/rimes/syms.txt @@ -0,0 +1,100 @@ +<ctc> 0 +! 1 +" 2 +% 3 +' 4 +( 5 +) 6 +, 7 +- 8 +. 9 +/ 10 +0 11 +1 12 +2 13 +3 14 +4 15 +5 16 +6 17 +7 18 +8 19 +9 20 +: 21 +; 22 += 23 +? 24 +A 25 +B 26 +C 27 +D 28 +E 29 +F 30 +G 31 +H 32 +I 33 +J 34 +K 35 +L 36 +M 37 +N 38 +O 39 +P 40 +Q 41 +R 42 +S 43 +T 44 +U 45 +V 46 +W 47 +X 48 +Y 49 +Z 50 +_ 51 +a 52 +b 53 +c 54 +d 55 +e 56 +f 57 +g 58 +h 59 +i 60 +j 61 +k 62 +l 63 +m 64 +n 65 +o 66 +p 67 +q 68 +r 69 +s 70 +t 71 +u 72 +v 73 +w 74 +x 75 +y 76 +z 77 +{ 78 +} 79 +¤ 80 +° 81 +² 82 +À 83 +É 84 +à 85 +â 86 +ç 87 +è 88 +é 89 +ê 90 +ë 91 +î 92 +ô 93 +ù 94 +û 95 +œ 96 +€ 97 +<unk> 98 +<space> 99 diff --git a/pylaia/rimes/tokens.txt b/pylaia/rimes/tokens.txt new file mode 100644 index 0000000..143fa1a --- /dev/null +++ b/pylaia/rimes/tokens.txt @@ -0,0 +1,100 @@ +<ctc> +! +" +% +' +( +) +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; += +? +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +} +¤ +° +² +À +É +à +â +ç +è +é +ê +ë +î +ô +ù +û +œ +€ +<unk> +<space> diff --git a/pylaia/rimes/weights.ckpt b/pylaia/rimes/weights.ckpt new file mode 100644 index 0000000..27cf064 --- /dev/null +++ b/pylaia/rimes/weights.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b24aa78625c3c5cfbd07926aee3d5890dec2d7aab931a1c55bdbe6770236a8aa +size 42750044 -- GitLab