diff --git a/content/tutorial/_index.md b/content/tutorial/_index.md index a44e35fd2e1df4b5f50562970ceb26ea4b75a14b..0bd136286e331c7e7d784a1f2e239c4790da6a5c 100644 --- a/content/tutorial/_index.md +++ b/content/tutorial/_index.md @@ -2,9 +2,7 @@ title = "Tutorial" sort_by = "weight" weight = 200 - -# This is only available in draft mode -draft = true +insert_anchor_links = "right" +++ This documentation is aimed at Digital Humanities specialists who want to transcribe handwritten or printed documents using our Arkindex platform. diff --git a/content/tutorial/arkindex_corpus.jpg b/content/tutorial/arkindex_corpus.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f7838c33496acde71918975953d5a0fc511018e1 Binary files /dev/null and b/content/tutorial/arkindex_corpus.jpg differ diff --git a/content/tutorial/arkindex_create_dataset.jpg b/content/tutorial/arkindex_create_dataset.jpg new file mode 100644 index 0000000000000000000000000000000000000000..09df79bea5ce71ead4ca00079ea9972c9cf73015 Binary files /dev/null and b/content/tutorial/arkindex_create_dataset.jpg differ diff --git a/content/tutorial/arkindex_create_folders.jpg b/content/tutorial/arkindex_create_folders.jpg new file mode 100644 index 0000000000000000000000000000000000000000..00e34176a5d4fff54d34a1554e4cbc1fc4157430 Binary files /dev/null and b/content/tutorial/arkindex_create_folders.jpg differ diff --git a/content/tutorial/arkindex_dataset_set.jpg b/content/tutorial/arkindex_dataset_set.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6dbd8e1a0171a7af115e33f87927f252c402e24d Binary files /dev/null and b/content/tutorial/arkindex_dataset_set.jpg differ diff --git a/content/tutorial/arkindex_dataset_viewer.jpg b/content/tutorial/arkindex_dataset_viewer.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d8674ea1fbea6195b5215868fecd6a91711e9235 Binary files /dev/null and b/content/tutorial/arkindex_dataset_viewer.jpg differ diff --git 
a/content/tutorial/arkindex_edit_project.jpg b/content/tutorial/arkindex_edit_project.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a78f798b23e65bedba50186a477400e614333a02 Binary files /dev/null and b/content/tutorial/arkindex_edit_project.jpg differ diff --git a/content/tutorial/arkindex_filter_sort.jpg b/content/tutorial/arkindex_filter_sort.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0dc3947527afd875ebeae76af8b57d44f9be9d2b Binary files /dev/null and b/content/tutorial/arkindex_filter_sort.jpg differ diff --git a/content/tutorial/arkindex_move.jpg b/content/tutorial/arkindex_move.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3be0aaa91f8ba2cab2a6a8e42d30e15668558217 Binary files /dev/null and b/content/tutorial/arkindex_move.jpg differ diff --git a/content/tutorial/arkindex_pagination_size.jpg b/content/tutorial/arkindex_pagination_size.jpg new file mode 100644 index 0000000000000000000000000000000000000000..30de689f471e9e7db815b5280013846cb0ac62a9 Binary files /dev/null and b/content/tutorial/arkindex_pagination_size.jpg differ diff --git a/content/tutorial/arkindex_project_id.jpg b/content/tutorial/arkindex_project_id.jpg new file mode 100644 index 0000000000000000000000000000000000000000..21687fc90ec1bf3067e9b15dee17add175018e1c Binary files /dev/null and b/content/tutorial/arkindex_project_id.jpg differ diff --git a/content/tutorial/arkindex_project_information.jpg b/content/tutorial/arkindex_project_information.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1842a4f7b3eca1a44576109a22d805d8e6895b89 Binary files /dev/null and b/content/tutorial/arkindex_project_information.jpg differ diff --git a/content/tutorial/arkindex_select_all_displayed_elements.jpg b/content/tutorial/arkindex_select_all_displayed_elements.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ecf5ec348fa33f016934eb0f565a82b1add53a6c Binary files /dev/null and 
b/content/tutorial/arkindex_select_all_displayed_elements.jpg differ diff --git a/content/tutorial/arkindex_select_project.jpg b/content/tutorial/arkindex_select_project.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fecf1b910ff4a8a2500b8c85d0252b4df9a6918b Binary files /dev/null and b/content/tutorial/arkindex_select_project.jpg differ diff --git a/content/tutorial/arkindex_selection.jpg b/content/tutorial/arkindex_selection.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d7ccb43476957b5d2491ca080f0356e22079d558 Binary files /dev/null and b/content/tutorial/arkindex_selection.jpg differ diff --git a/content/tutorial/corpus.jpg b/content/tutorial/corpus.jpg new file mode 100644 index 0000000000000000000000000000000000000000..11b9498938e343d4c095cf6bc7f1276a7dfc1112 Binary files /dev/null and b/content/tutorial/corpus.jpg differ diff --git a/content/tutorial/corpus.md b/content/tutorial/corpus.md index c03f6eed65e5f08778c6fedc76dc04ad412758e1..b0eaaafbe56670d77135e544dde9200333b44754 100644 --- a/content/tutorial/corpus.md +++ b/content/tutorial/corpus.md @@ -1,9 +1,161 @@ +++ -title = "Corpus identification" +title = "Corpus import" weight = 20 -draft = true +++ -- example https://europeana.transcribathon.eu/documents/story/item/?item=1258824 -- french, completed transcription, handwritten -- import inside arkindex for segmentation + transcription \ No newline at end of file +In this tutorial, you will learn how to import images and metadata in Arkindex. + +## Corpus description + +As an example, you will import the [**Pellet**](https://europeana.transcribathon.eu/documents/story/?story=121795) dataset from the Europeana 1914-1918 collection. + +The corpus contains 471 scanned documents related to Casimir Marius PELLET, a French soldier during World War I. + +The documents are written in French and include various content types, such as campaign diaries, photographs, and postcards. 
Each document has been transcribed by volunteers and includes descriptive metadata. +We have selected this corpus as it covers a large variety of documents, while still being relatively small to avoid complexity due to large annotation and training needs. + +{{ figure(image="tutorial/corpus.jpg", height=200, caption="Documents from the Pellet corpus") }} +Of course you may import your own data directly in Arkindex, using file uploads. Arkindex supports images, PDFs, METS, ALTO, ZIP archives compatible with Transkribus, and more. + +## Create a project in Arkindex + +{% info() %} +This section expects you to have an Arkindex account. Learn how to register [here](@/users/auth/index.md). +{% end %} + +Log in to [Arkindex](https://demo.arkindex.org/) by entering your email and password. + +On the front page, you will find an empty corpus entitled `My Project`. We will publish the data from Europeana in this corpus. Alternatively, you can create a new project by clicking on the `New Project` button at the top right of the page. Note that this corpus is personal and can only be accessed by you. + +To edit your project name and description: +* Click on `My Project` +{{ figure(image="tutorial/arkindex_select_project.jpg", height=300, caption="Select your project") }} +* Go to your project information page +{{ figure(image="tutorial/arkindex_project_information.jpg", height=200, caption="Go to your project information page") }} +* Edit your project's name and description and click on `Update` + * **Name**: `Europeana | Pellet` + * **Description**: `Corpus from [Europeana](https://europeana.transcribathon.eu/documents/story/?story=121795)` +{{ figure(image="tutorial/arkindex_edit_project.jpg", height=300, caption="Edit your project's name and description") }} + +## Import data to Arkindex + +Two steps are required to import the corpus in Arkindex: +1. Extract the data from Europeana (images, transcriptions and metadata) +2. 
Publish it to your Arkindex project + +{% info() %} +You will need Python 3.10 and a shell environment (we recommend Ubuntu or Mac OS X) +{% end %} + +We have released a Python package named [`teklia-scrapers`](https://pypi.org/project/teklia-scrapers/) to help you achieve these steps. To install it to your environment, run: + +```sh +pip install teklia-scrapers +``` + +### Data extraction + +To extract data from the Europeana website, you need to specify two arguments: +* `--story_id`: the Europeana identifier for the Pellet corpus (`"121795"`) +* `--output_dir`: the directory in which the corpus will be extracted (`"pellet_corpus"`) + +Running the following command will start the import: +```sh +scrapers eu-trans --story_id 121795 --output_dir pellet_corpus/ +``` + +{% warning() %} +The command should take 2 hours to run completely, depending on your network connection and Europeana's current availability. +{% end %} + +Once the extraction is done, you will find a JSON file named `121795.json` in the directory named `pellet_corpus/`. + +### Publication to Arkindex + +Then, you can use the `scrapers publish` command to publish the data to Arkindex. + +You will need to provide the following arguments: +* `--arkindex-api-url`: The Arkindex instance in which you wish to import the corpus. By default, you should use [https://demo.arkindex.org/](https://demo.arkindex.org/). +* `--arkindex-api-token`: Your API token. If you do not know your API token, refer to [this page](@/users/auth/index.md#personal-token). +* `--corpus-id`: The UUID of the Arkindex project created in the [previous step](#create-a-project-in-arkindex). This value can be copied from your Arkindex project details page, just below its name. +{{ figure(image="tutorial/arkindex_project_id.jpg", height=200, caption="Find your project's UUID on Arkindex") }} +* `--worker-run-id`: The worker run UUID that will be used to import the data. 
Refer to [this page](https://teklia.com/our-solutions/arkindex/releases/arkindex-release-152/) to create your own worker run. +* `--folder-type`: The type of the top level element (`"folder"`) +* `--page-type`: The type of the child level elements (`"page"`) +* `--report`: The path to the JSON report file (`"report.json"`) +* `folder`: The path to the local directory containing the `121795.json` JSON file, generated using the previous command (`"pellet_corpus/"`) + +```sh +scrapers publish --folder-type folder \ + --page-type page \ + --report report.json \ + --corpus-id aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee \ + --arkindex-api-url https://demo.arkindex.org/ \ + --arkindex-api-token aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee \ + --worker-run-id aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee \ + pellet_corpus/ +``` + +Once the import is finished, you should be able to navigate through the folder named `PELLET casimir marius` in Arkindex: +{{ figure(image="tutorial/arkindex_corpus.jpg", height=500, caption="The Pellet corpus in Arkindex") }} + +## Data partitioning + +To train Machine Learning models on this corpus, you need to split the corpus into three sets for training, validation and evaluation: +* 321 `page` elements (around 75% of the corpus) in the `train` set (used for model training) +* 50 `page` elements (around 12.5% of the corpus) in the `validation` set (used for model validation) +* 50 `page` elements (around 12.5% of the corpus) in the `test` set (used for model evaluation) + +### Dataset creation + +First, you need to create an [Arkindex `Dataset`](@/training/datasets/index.md). To do that, go to your corpus, then click on `Actions` > `Project information` > `Datasets` > `+`. Enter a description of this dataset, and keep the three sets named `train`, `validation` and `test`. 
+ +{{ figure(image="tutorial/arkindex_create_dataset.jpg", height=400, caption="Create the dataset") }} + +We also recommend that you create three `folders` inside the `PELLET casimir marius` folder, named `train`, `validation`, and `test`. To create a folder, click on `Actions`, then `Add folder` and enter the name of your folder. + +{{ figure(image="tutorial/arkindex_create_folders.jpg", height=300, caption="Create three folders") }} + + +### Add pages to a dataset set + +To add 50 random pages to the `test` set of your `Dataset`, follow these steps: + +1. Go to the folder named `PELLET casimir marius` +2. Select 50 random pages + * Filter elements by type `page` and sort them by `Random` order + {{ figure(image="tutorial/arkindex_filter_sort.jpg", height=300, caption="Filter pages and sort randomly") }} + * Click on `Display` > `Pagination size` and set it to 50 + {{ figure(image="tutorial/arkindex_pagination_size.jpg", height=300, caption="Update pagination size") }} + * Select 50 pages by clicking on `Actions` > `Select all displayed elements` + {{ figure(image="tutorial/arkindex_select_all_displayed_elements.jpg", height=300, caption="Select all displayed elements") }} +3. Add these pages to your `Dataset` + * Go to the `Selection` page + {{ figure(image="tutorial/arkindex_selection.jpg", height=50, caption="Selection") }} + * Click on `Actions` > `Add to a dataset`, then select the `test` set of your `Dataset` + {{ figure(image="tutorial/arkindex_dataset_set.jpg", height=150, caption="The Pellet dataset imported in Arkindex") }} +4. Move these elements to the `test` folder +{{ figure(image="tutorial/arkindex_move.jpg", height=300, caption="Move elements to the test folder") }} + +Repeat these steps for the `validation` set. + +Finally, select all the remaining `page` elements and add them to the `train` set and folder. 
+ +### Visualize your dataset + +Click on the `Dataset` name to visualize its content: +{{ figure(image="tutorial/arkindex_dataset_viewer.jpg", height=1200, caption="The Pellet dataset imported in Arkindex") }} + +## Next steps + +As you can see, transcriptions on this corpus are available at page-level. + +In order to train Machine Learning models, additional annotations will be required such as: +* [Page classification](@/tutorial/classification-ground-truth.md) +* [Text-line detection](@/tutorial/segmentation-ground-truth.md) +* [Text-line transcription](@/tutorial/transcription-ground-truth.md) + +Once the annotation campaigns are completed, you will be able to train Machine learning models: +* [Classification model](@/tutorial/classification-training.md) +* [Segmentation model](@/tutorial/segmentation-training.md) +* [Transcription model](@/tutorial/transcription-training.md)