Commit 7308466c authored by Yoann Schneider, committed by Solene Tarride

Setup documentation with mkdocs

parent 5d8eed59
1 merge request: !25 Setup documentation with mkdocs
Showing with 222 additions and 77 deletions
@@ -127,9 +127,3 @@ dmypy.json
# Pyre type checker
.pyre/
Datasets/formatted/*
Datasets/raw/*
**/outputs
Fonts/*
.idea
@@ -52,6 +52,86 @@ test:
script:
- tox
# Make sure docs still build correctly
.docs:
image: python:3.10
artifacts:
paths:
- public
before_script:
- pip install -e .[docs]
script:
- mkdocs build --strict --verbose
docs-build:
extends: .docs
stage: build
# Test job outside of tags to ensure the docs still can build before merging
# Does not use the `pages` name, therefore will be ignored by GitLab Pages
except:
- tags
- schedules
pages:
extends: .docs
stage: deploy
only:
- master
- tags
docs-deploy:
image: node:18
stage: deploy
dependencies:
- docs-build
before_script:
- npm install -g surge
except:
- master
- tags
- schedules
environment:
name: ${CI_COMMIT_REF_SLUG}
url: https://${CI_COMMIT_REF_SLUG}-teklia-atr-dan.surge.sh
on_stop: docs-stop-surge
script:
- surge public ${CI_ENVIRONMENT_URL}
docs-stop-surge:
image: node:18
stage: deploy
when: manual
# Do not try to checkout the branch if it was deleted
variables:
GIT_STRATEGY: none
except:
- master
- tags
- schedules
environment:
name: ${CI_COMMIT_REF_SLUG}
url: https://${CI_COMMIT_REF_SLUG}-base-worker-arkindex.surge.sh
action: stop
before_script:
- npm install -g surge
script:
- surge teardown ${CI_ENVIRONMENT_URL}
bump-python-deps:
stage: deploy
image: registry.gitlab.com/teklia/devops:latest
@@ -60,7 +140,7 @@ bump-python-deps:
- schedules
script:
- devops python-deps requirements.txt
- devops python-deps requirements.txt doc-requirements.txt
release-notes:
stage: deploy
@@ -5,19 +5,18 @@ repos:
- id: isort
args: ["--profile", "black"]
- repo: https://github.com/ambv/black
rev: 22.6.0
rev: 22.10.0
hooks:
- id: black
- repo: https://github.com/pycqa/flake8
rev: 3.9.2
rev: 6.0.0
hooks:
- id: flake8
additional_dependencies:
- 'flake8-coding==1.3.1'
- 'flake8-copyright==0.2.2'
- 'flake8-debugger==4.0.0'
- 'flake8-coding==1.3.2'
- 'flake8-debugger==4.1.2'
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
rev: v4.4.0
hooks:
- id: check-ast
- id: check-docstring-first
@@ -40,6 +39,10 @@ repos:
hooks:
- id: codespell
args: ['--write-changes']
- repo: https://github.com/PyCQA/doc8
rev: v1.0.0
hooks:
- id: doc8
- repo: meta
hooks:
- id: check-useless-excludes
include requirements.txt
include doc-requirements.txt
include VERSION
# DAN: a Segmentation-free Document Attention Network for Handwritten Document Recognition
This repository is a public implementation of the paper: "DAN: a Segmentation-free Document Attention Network for Handwritten Document Recognition".
## Documentation
![Prediction visualization](images/visual.png)
For more details about this package, see the documentation available at ...
The model uses character-level attention to handle slanted lines:
![Prediction visualization on slanted lines](images/visual_slanted_lines.png)
The paper is available at https://arxiv.org/abs/2203.12273.
To discover my other work, see my [academic page](https://factodeeplearning.github.io/).
Click to see the demo:
[![Click to see demo](https://img.youtube.com/vi/HrrUsQfW66E/0.jpg)](https://www.youtube.com/watch?v=HrrUsQfW66E)
This work focuses on handwritten text and layout recognition using an end-to-end segmentation-free attention-based network.
We evaluate the DAN on two public datasets: RIMES and READ 2016 at single-page and double-page levels.
We obtained the following results:
| | CER (%) | WER (%) | LOER (%) | mAP_cer (%) |
| :---------------------: | ------- | :-----: | :------: | ----------- |
| RIMES (single page) | 4.54 | 11.85 | 3.82 | 93.74 |
| READ 2016 (single page) | 3.53 | 13.33 | 5.94 | 92.57 |
| READ 2016 (double page) | 3.69 | 14.20 | 4.60 | 93.92 |
Pretrained model weights are available [here](https://git.litislab.fr/dcoquenet/dan).
Table of contents:
1. [Getting Started](#Getting-Started)
2. [Datasets](#Datasets)
3. [Training And Evaluation](#Training-and-evaluation)
## Getting Started
We used Python 3.9.1, PyTorch 1.8.2 and CUDA 10.2 for the scripts.
Clone the repository:
```
git clone https://github.com/FactoDeepLearning/DAN.git
```
Install the dependencies:
```
pip install -r requirements.txt
```
### Remarks (for pre-training and training)
All hyperparameters are specified and editable in the training scripts (their meaning is explained in comments).\
Evaluation is performed just after training ends (training stops when the maximum elapsed time is reached or after the maximum number of epochs specified in the training script).\
The output files are split into two subfolders: "checkpoints" and "results". \
"checkpoints" contains the model weights for the last trained epoch and for the epoch giving the best validation CER. \
"results" contains the TensorBoard logs for the loss and metrics, as well as a text file listing the hyperparameters used and the evaluation results.
## `Predict` module
This repository also contains a package to run a pre-trained model on an image.
### Installation
## Installation
To use DAN in your own scripts, install it using pip:
@@ -68,7 +13,7 @@ To use DAN in your own scripts, install it using pip:
```shell
pip install -e .
```
### Usage
## Inference
To apply DAN to an image, one first needs to add a few imports and load the image. Note that the image should be in RGB.
@@ -93,11 +38,11 @@ To run the inference on a GPU, one can replace `cpu` by the name of the GPU. In
```python
text, confidence_scores = model.predict(image, confidences=True)
```
### Commands
## Training
This package provides three subcommands. To get more information about any subcommand, use the `--help` option.
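For example, assuming the `teklia-dan` package is installed, each subcommand documents its own arguments; the option lists themselves come from `--help` and are not reproduced here:

```shell
# Print the full argument list of each subcommand
teklia-dan dataset extract --help
teklia-dan dataset format --help
teklia-dan train --help
teklia-dan generate --help
```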
#### Data extraction from Arkindex
### Data extraction from Arkindex
Use the `teklia-dan dataset extract` command to extract a dataset from Arkindex. This will generate the images and the labels needed to train a DAN model.
The available arguments are
@@ -190,7 +135,7 @@ teklia-dan dataset extract \
```shell
  --output data
```
#### Dataset formatting
### Dataset formatting
Use the `teklia-dan dataset format` command to format a dataset. This will generate two important files to train a DAN model:
- `labels.json`
- `charset.pkl`
@@ -210,8 +155,17 @@ teklia-dan dataset format \
The created files will be stored at the root of your dataset.
#### Model training
### Model training
Use the `teklia-dan train` command with multiple arguments to train a DAN model.
#### Synthetic data generation
#### Remarks (for pre-training and training)
All hyperparameters are specified and editable in the training scripts (their meaning is explained in comments).\
Evaluation is performed just after training ends (training stops when the maximum elapsed time is reached or after the maximum number of epochs specified in the training script).\
The output files are split into two subfolders: "checkpoints" and "results". \
"checkpoints" contains the model weights for the last trained epoch and for the epoch giving the best validation CER. \
"results" contains the TensorBoard logs for the loss and metrics, as well as a text file listing the hyperparameters used and the evaluation results.
### Synthetic data generation
Use the `teklia-dan generate` command with multiple arguments to generate synthetic data.
black==22.10.0
doc8==1.0.0
mkdocs==1.4.2
mkdocs-material==8.5.11
mkdocstrings==0.19.0
mkdocstrings-python==0.8.2
recommonmark==0.7.1
docs/assets/favicon.png (2.54 KiB)
docs/assets/logo.png (5.55 KiB)
File moved
File moved
# Documentation development
## Setup
Add the `docs` extra when installing `teklia-dan`:
```sh
# In a clone of the Git repository
pip install .[docs]
```
Build and serve the documentation locally using `mkdocs serve -v`. The documentation should then be available at http://localhost:8000.
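For example, a typical local workflow (mirroring the CI job above, which runs `mkdocs build --strict --verbose`) could look like this:

```shell
# Live preview with auto-reload, served at http://localhost:8000
mkdocs serve -v

# One-off build that fails on any warning, as in CI
mkdocs build --strict --verbose
```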
## Writing documentation
This documentation is generated using [mkdocs](https://mkdocs.org/) and [mkdocstrings](https://mkdocstrings.github.io/).
## Linting
This documentation is linted using `doc8`, integrated into `pre-commit`. You can run it locally with `pre-commit run`. Use `pre-commit install` to have the pre-commit hooks run before each Git commit.
The linting rules that `doc8` applies can be found in [its documentation][1].
[1]: https://doc8.readthedocs.io/en/latest/readme.html#usage
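For example, a minimal local linting workflow could be:

```shell
# Install the Git hook so the checks run automatically before each commit
pre-commit install

# Run all hooks (including doc8) on the whole repository
pre-commit run -a
```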
# DAN: a Segmentation-free Document Attention Network for Handwritten Document Recognition
This repository allows you to use the DAN model, a ***Segmentation-free Document Attention Network for Handwritten Document Recognition***. Both training and inference are possible through the commands it exposes.
![Prediction visualization](assets/visual.png)
The model uses character-level attention to handle slanted lines:
![Prediction visualization on slanted lines](assets/visual_slanted_lines.png)
Click [here](original_paper.md) to learn more about the model and how it fares against SOTA models.
## Getting started
To use DAN in your own environment, install it using pip:
```shell
pip install -e .
```
To learn more about the newly installed `teklia-dan` command, make sure to run:
```shell
teklia-dan --help
```
## Linter
Code syntax is analyzed before the code is submitted.\
To run the linting tool suite, you may use pre-commit.
```shell
pip install pre-commit
pre-commit run -a
```
## Run tests
Tests are executed with tox using [pytest](https://pytest.org).
```shell
pip install tox
tox
```
To recreate the tox virtual environment (e.g. after a dependency update), you may run `tox -r`.
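For example, after a dependency update:

```shell
# Recreate the tox virtual environment, then run the test suite
tox -r
```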
# Original implementation
The paper is available at https://arxiv.org/abs/2203.12273.
<div class="video-wrapper">
<iframe width="560" height="315" src="https://www.youtube.com/embed/HrrUsQfW66E" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
</div>
This model focuses on handwritten text and layout recognition through the use of an end-to-end segmentation-free attention-based network.
DAN was evaluated on two public datasets: RIMES and READ 2016 at single-page and double-page levels.
The following results were published:
| | CER (%) | WER (%) | LOER (%) | mAP_cer (%) |
| :---------------------: | ------- | :-----: | :------: | ----------- |
| RIMES (single page) | 4.54 | 11.85 | 3.82 | 93.74 |
| READ 2016 (single page) | 3.53 | 13.33 | 5.94 | 92.57 |
| READ 2016 (double page) | 3.69 | 14.20 | 4.60 | 93.92 |
Pretrained model weights are available [here](https://git.litislab.fr/dcoquenet/dan).
# Arkindex
::: dan.datasets.extract.extract_from_arkindex
# Extraction
# Utils
::: dan.datasets.extract.utils
# Automatic Text Recognition
::: dan.datasets.format.atr
# Formatting
# Datasets
# Utils
::: dan.datasets.utils