From be84500231cd2cb5b2cb9ddb4a1aa00e18fac415 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sol=C3=A8ne=20Tarride?= <starride@teklia.com>
Date: Fri, 27 Oct 2023 10:49:38 +0200
Subject: [PATCH] Update documentation

---
 docs/usage/train/language_model.md | 39 +++++++++++++++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/docs/usage/train/language_model.md b/docs/usage/train/language_model.md
index f7063216..f484bb01 100644
--- a/docs/usage/train/language_model.md
+++ b/docs/usage/train/language_model.md
@@ -20,9 +20,20 @@ At character-level, we recommend building a 6-gram model. Use the following comm
 ```sh
 bin/lmplz --order 6 \
     --text my_dataset/language_model/corpus_characters.txt \
-    --arpa my_dataset/language_model/model_characters.arpa
+    --arpa my_dataset/language_model/model_characters.arpa \
+    --discount_fallback
 ```
 
+Note that the `--discount_fallback` option is only needed for small corpora: without it, `lmplz` aborts when there is not enough data to estimate the modified Kneser-Ney discounts. You can safely remove this option if your corpus is large.
+
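+If you are unsure whether your corpus is large enough, a quick size check can help (this uses the standard `wc` tool rather than KenLM, and any threshold you pick is a rule of thumb rather than an official limit):
+
+```sh
+# Count lines and words in the training corpus; if lmplz aborts because
+# it cannot estimate the discounts, keep --discount_fallback.
+wc -lw my_dataset/language_model/corpus_characters.txt
+```
+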
 The following message should be displayed if the language model was built successfully:
 
 ```sh
@@ -62,16 +73,26 @@ Chain sizes: 1:1308 2:27744 3:159140 4:412536 5:717920 6:1028896
 Name:lmplz	VmPeak:12643224 kB	VmRSS:6344 kB	RSSMax:1969316 kB	user:0.196445	sys:0.514686	CPU:0.711161	real:0.682693
 ```
 
-### Subord-level
+### Subword-level
 
 At subword-level, we recommend building a 6-gram model. Use the following command:
 
 ```sh
 bin/lmplz --order 6 \
     --text my_dataset/language_model/corpus_subwords.txt \
-    --arpa my_dataset/language_model/model_subwords.arpa
+    --arpa my_dataset/language_model/model_subwords.arpa \
+    --discount_fallback
 ```
 
+As in the character-level example, the `--discount_fallback` option can be removed if your corpus is large.
+
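+To double-check the result, remember that ARPA models are plain text: the file starts with a `\data\` section listing the n-gram counts, so a quick look at the header confirms that the model was written:
+
+```sh
+# The ARPA format is plain text; its header lists the n-gram counts.
+head my_dataset/language_model/model_subwords.arpa
+```
+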
 ### Word-level
 
 At word-level, we recommend building a 3-gram model. Use the following command:
@@ -79,9 +100,19 @@ At word-level, we recommend building a 3-gram model. Use the following command:
 ```sh
 bin/lmplz --order 3 \
     --text my_dataset/language_model/corpus_words.txt \
-    --arpa my_dataset/language_model/model_words.arpa
+    --arpa my_dataset/language_model/model_words.arpa \
+    --discount_fallback
 ```
 
+Again, the `--discount_fallback` option can be removed if your corpus is large.
+
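+As an optional sanity check (not part of the training pipeline itself), KenLM's `query` tool scores text read from standard input against the model:
+
+```sh
+# Score a sample sentence with the word-level model.
+echo "a sample sentence" | bin/query my_dataset/language_model/model_words.arpa
+```
+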
 ## Predict with a language model
 
 See the [dedicated example](../predict/index.md#predict-with-an-external-n-gram-language-model).
-- 
GitLab