From 052453b944da83cdb0ffc4d8cda5f583b0300cc5 Mon Sep 17 00:00:00 2001
From: Solene Tarride <starride@teklia.com>
Date: Thu, 21 Dec 2023 16:30:47 +0000
Subject: [PATCH] Add the model trained on HOME-Alcar

---
 pylaia/alcar/README.md              |  24 +++++
 pylaia/alcar/language_model.arpa.gz |   3 +
 pylaia/alcar/lexicon.txt            | 130 ++++++++++++++++++++++++++++
 pylaia/alcar/model                  | Bin 0 -> 1519 bytes
 pylaia/alcar/syms.txt               | 130 ++++++++++++++++++++++++++++
 pylaia/alcar/tokens.txt             | 130 ++++++++++++++++++++++++++++
 pylaia/alcar/weights.ckpt           |   3 +
 7 files changed, 420 insertions(+)
 create mode 100644 pylaia/alcar/README.md
 create mode 100644 pylaia/alcar/language_model.arpa.gz
 create mode 100644 pylaia/alcar/lexicon.txt
 create mode 100644 pylaia/alcar/model
 create mode 100644 pylaia/alcar/syms.txt
 create mode 100644 pylaia/alcar/tokens.txt
 create mode 100644 pylaia/alcar/weights.ckpt

diff --git a/pylaia/alcar/README.md b/pylaia/alcar/README.md
new file mode 100644
index 0000000..9e4fe74
--- /dev/null
+++ b/pylaia/alcar/README.md
@@ -0,0 +1,24 @@
+# PyLaia Alcar
+
+## Datasets
+
+* Trained on [HOME-Alcar](https://demo.arkindex.org/browse/46b9b1f4-baeb-4342-a501-e2f15472a276?top_level=true&folder=true) and [Himanis](https://arkindex.teklia.com/browse/2f6e26b0-5fdd-4193-bb30-a3162b96280c?top_level=true&folder=true).
+* Text-lines are resized to a fixed height of 128 pixels.
+* The language model is a 6-gram character model trained only on the training set of HOME-Alcar.
+
+| split | N lines Alcar | N lines Himanis |  Total |
+| ----- | ------------: | --------------: | -----: |
+| train |        59,969 |          18,504 | 78,473 |
+| val   |         7,905 |           2,367 | 10,272 |
+| test  |         6,932 |           2,241 |  9,173 |
+
+## Results
+
+* Evaluation on the test set of HOME-Alcar only (6,932 lines; the Himanis test lines are not included)
+
+| Split | LM  | Evaluation method  | CER (%)  | WER (%)   | Support |
+| ----- | --- | ------------------ | -------- | --------- | ------- |
+| test  | No  | basic              | 8.35     | 26.15     | 6932    |
+| test  | No  | escape punctuation | 8.35     | 24.6      | 6932    |
+| test  | Yes | basic              | **7.85** | **23.2**  | 6932    |
+| test  | Yes | escape punctuation | **7.85** | **21.76** | 6932    |
\ No newline at end of file
diff --git a/pylaia/alcar/language_model.arpa.gz b/pylaia/alcar/language_model.arpa.gz
new file mode 100644
index 0000000..8cb2589
--- /dev/null
+++ b/pylaia/alcar/language_model.arpa.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b025c7b810bf208f4efed66a87cdff5dda9cfd6f1c2a128f69df337ab7ca7db4
+size 19663009
diff --git a/pylaia/alcar/lexicon.txt b/pylaia/alcar/lexicon.txt
new file mode 100644
index 0000000..cd67081
--- /dev/null
+++ b/pylaia/alcar/lexicon.txt
@@ -0,0 +1,130 @@
+<ctc> <ctc>
+! !
+& &
+# #
+' '
+( (
+) )
+* *
++ +
+, ,
+- -
+. .
+/ /
+0 0
+1 1
+2 2
+3 3
+4 4
+5 5
+6 6
+7 7
+8 8
+9 9
+: :
+; ;
+= =
+? ?
+A A
+B B
+C C
+D D
+E E
+F F
+G G
+H H
+I I
+J J
+K K
+L L
+M M
+N N
+O O
+P P
+Q Q
+R R
+S S
+T T
+U U
+V V
+W W
+X X
+Y Y
+Z Z
+[ [
+] ]
+a a
+b b
+c c
+d d
+e e
+f f
+g g
+h h
+i i
+j j
+k k
+l l
+m m
+n n
+o o
+p p
+q q
+r r
+s s
+t t
+u u
+v v
+w w
+x x
+y y
+z z
+| |
+~ ~
+Â’ Â’
+© ©
+§ §
+ª ª
+« «
+¬ ¬
+¯ ¯
+° °
+¶ ¶
+º º
+» »
+¿ ¿
+À À
+Â Â
+Ã Ã
+Ç Ç
+É É
+Ï Ï
+Ü Ü
+à à
+á á
+â â
+æ æ
+ç ç
+è è
+é é
+ë ë
+ì ì
+í í
+î î
+ï ï
+ñ ñ
+ú ú
+ù ù
+û û
+ÿ ÿ
+Ä™ Ä™
+ō ō
+Å“ Å“
+È© È©
+— —
+‘ ‘
+’ ’
+… …
+† †
+<unk> <unk>
+<space> <space>
diff --git a/pylaia/alcar/model b/pylaia/alcar/model
new file mode 100644
index 0000000000000000000000000000000000000000..9aa1a804be6c535a90c03fcde5c18ebcb7f5eac4
GIT binary patch
literal 1519
zcmZ`(YjYGu6rF6|Lr6k=foK#(m}s)$p`a*&i3Y=LmL(7!(Q)j|HrvC__UxHQG+LB4
zRa+$={0n~Z2U-3DtNbJ0p6xuAM{jLS_qpeuzTJKAZMefDmNhY9JvC;nIg6#Ch<5mL
zljW@1ZO3aX)7HI#@mu8~EXNxo_fFC*VvZs21TA6~u__@v&Z;6$t9l)Tsg$UAw4o(g
zhhx~&`_{;`x>>Jdq_nWKm0@(Dlx+W4uEri>-0|kLThR$3*)8%Q6iiA|$3*xbOC<5S
zEX{Z|-|a@SWjB&6$|>}9ncJ9jhkCFTj(A5&V~3}C6tXzT`A<2f9(%I{hEfJv{61%S
zk@5^jeM~!I(w*vIMq!p_$9p)YFh{d<J-ndsVt@9M!pq#7F&b^2QdB_}{mk)-kK>My
z6Iy1thgTI&ij&;OYX*2-VP4D=@P+~2R5&G05%87)-c~p*PIK<rMn22asLAn;k29ov
zR@3Ncj>3XiAisAF@Seh=SR`P{09A!$u}r`@1Dsd5ATE&dlu>R*G0USw;-ZiDN&5ps
zyrghhTqe&C4e*h|6>)`tj}7pN!c}pVfKLr@P2sw@PTJE(n}zfgef7_LtT<twP-0c3
z)EPoKlT{BGc8#<4?k2Bo*;p0j?;yx-lEmfJe(vK73UbYP=Hu5BS?4X<(7gW*x6;Fx
z3hQE>!o2B-5^xo2qDH`m2CN?H3b({90vd*KTVYdd(m{^wT~eBKlOo3*A9tU<^=h1C
zytyT#oHCE6J1n-bb%Wf?(Zg2?UyH+lr|``Ha8Kde0pL3YpO=}dlXRQqp^!=1!S_A_
z=XV<1n<Vuj<3WdI?F>IWF7t(oEr{7JPcyK725u{aC0CmYiZMe|Om0||LR{f~c}#6;
znq#&oj%)w!g)S7LvS2lZ%8cW|(+^2DyLRX9hK+V97e@zj@qk=ML3+nXT$dePsxIo7
zZR?Uw+(%_WMIF}SLB>&V8S)feZ(Q>e$(PPw0*a#AaK|U6{=5CxuhdiYzwz95He-$K
zw=?Ph8KwH#ie+82wZuPvwzeuHt^W;oNPqTw;$eB+DF(-r9r8)<!^*lx9E`sy+&ym-
xP{SP?TQrRM;iY|U84s=YzP}jD+-QG7e)?jpKg@cF<_&jLH)ZmvSeoDG{tv@!o-qIb

literal 0
HcmV?d00001

diff --git a/pylaia/alcar/syms.txt b/pylaia/alcar/syms.txt
new file mode 100644
index 0000000..9cfa4ce
--- /dev/null
+++ b/pylaia/alcar/syms.txt
@@ -0,0 +1,130 @@
+<ctc> 0
+! 1
+& 2
+# 3
+' 4
+( 5
+) 6
+* 7
++ 8
+, 9
+- 10
+. 11
+/ 12
+0 13
+1 14
+2 15
+3 16
+4 17
+5 18
+6 19
+7 20
+8 21
+9 22
+: 23
+; 24
+= 25
+? 26
+A 27
+B 28
+C 29
+D 30
+E 31
+F 32
+G 33
+H 34
+I 35
+J 36
+K 37
+L 38
+M 39
+N 40
+O 41
+P 42
+Q 43
+R 44
+S 45
+T 46
+U 47
+V 48
+W 49
+X 50
+Y 51
+Z 52
+[ 53
+] 54
+a 55
+b 56
+c 57
+d 58
+e 59
+f 60
+g 61
+h 62
+i 63
+j 64
+k 65
+l 66
+m 67
+n 68
+o 69
+p 70
+q 71
+r 72
+s 73
+t 74
+u 75
+v 76
+w 77
+x 78
+y 79
+z 80
+| 81
+~ 82
+Â’ 83
+© 84
+§ 85
+ª 86
+« 87
+¬ 88
+¯ 89
+° 90
+¶ 91
+º 92
+» 93
+¿ 94
+À 95
+Â 96
+Ã 97
+Ç 98
+É 99
+Ï 100
+Ü 101
+à 102
+á 103
+â 104
+æ 105
+ç 106
+è 107
+é 108
+ë 109
+ì 110
+í 111
+î 112
+ï 113
+ñ 114
+ú 115
+ù 116
+û 117
+ÿ 118
+Ä™ 119
+ō 120
+Å“ 121
+È© 122
+— 123
+‘ 124
+’ 125
+… 126
+† 127
+<unk> 128
+<space> 129
diff --git a/pylaia/alcar/tokens.txt b/pylaia/alcar/tokens.txt
new file mode 100644
index 0000000..97a2fa4
--- /dev/null
+++ b/pylaia/alcar/tokens.txt
@@ -0,0 +1,130 @@
+<ctc>
+!
+&
+#
+'
+(
+)
+*
++
+,
+-
+.
+/
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+:
+;
+=
+?
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+[
+]
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+|
+~
+Â’
+©
+§
+ª
+«
+¬
+¯
+°
+¶
+º
+»
+¿
+À
+Â
+Ã
+Ç
+É
+Ï
+Ü
+à
+á
+â
+æ
+ç
+è
+é
+ë
+ì
+í
+î
+ï
+ñ
+ú
+ù
+û
+ÿ
+Ä™
+ō
+Å“
+È©
+—
+‘
+’
+…
+†
+<unk>
+<space>
diff --git a/pylaia/alcar/weights.ckpt b/pylaia/alcar/weights.ckpt
new file mode 100644
index 0000000..6fde10d
--- /dev/null
+++ b/pylaia/alcar/weights.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d11c5c5b6b01a8a45ab48f5433e6f86a5266b8c55fc82c68ac05cd3fe2f9c2a7
+size 42863420
-- 
GitLab