From c77cdf57dd2d31c99f207ec14de380492fc5f3be Mon Sep 17 00:00:00 2001 From: Martin <maarand@teklia.com> Date: Mon, 18 Nov 2019 12:27:43 +0100 Subject: [PATCH] use corpus id --- README.md | 5 +++++ kaldi_data_generator.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/README.md b/README.md index 2656a43..e77afd2 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,11 @@ Simple example: python kaldi_data_generator.py --dataset_name my_balsac --out_dir /tmp/balsac/ --volumes 8f4005e9-1921-47b0-be7b-e27c7fd29486 d2f7c563-1622-4721-bd51-96fab97189f7 ``` +With corpus ids +```bash +python kaldi_data_generator.py --dataset_name cz --out_dir /tmp/home_cz/ --corpora 1ed45e94-9108-4029-a529-9abe37f55ba0 +``` + Polygon example: ```bash python kaldi_data_generator.py --dataset_name my_balsac2 --extraction_mode polygon --out_dir /tmp/balsac/ --pages 50e1c3c0-2fe9-4216-805e-1a2fd2e7e9f4 diff --git a/kaldi_data_generator.py b/kaldi_data_generator.py index ae3f911..f7e0bdd 100644 --- a/kaldi_data_generator.py +++ b/kaldi_data_generator.py @@ -96,6 +96,10 @@ class KaldiDataGenerator: print("ListTranscriptions failed", e.status_code, e.title, e.content, page_id) raise e print("Num of lines", count) + if count == 0: + print(f"Page {page_id} skipped, because it has no lines") + return + full_image_url = res['zone']['image']['s3_url'] img = self.get_image(full_image_url, page_id=page_id) @@ -141,6 +145,12 @@ class KaldiDataGenerator: page_ids = [page['id'] for page in api_client.paginate('ListElementChildren', id=volume_id)] self.run_pages(page_ids) + def run_corpora(self, corpus_ids: list): + for corpus_id in corpus_ids: + print("Cor", corpus_id) + vol_ids = [vol['id'] for vol in api_client.paginate('ListElements', corpus=corpus_id, type='volume')] + self.run_volumes(vol_ids) + class Split(Enum): Train: int = 0 @@ -219,6 +229,8 @@ def create_parser(): help='Use color images') parser.set_defaults(grayscale=True) + parser.add_argument('--corpora', nargs='*', + help='List of corpus ids to be used, separated by spaces') parser.add_argument('--volumes', nargs='*', help='List of volume ids to be used, separated by spaces') parser.add_argument('--pages', nargs='*', @@ -245,6 +257,8 @@ def main(): kaldi_data_generator.run_pages(args.pages) if args.volumes: kaldi_data_generator.run_volumes(args.volumes) + if args.corpora: + kaldi_data_generator.run_corpora(args.corpora) print() # create partitions from all the extracted data -- GitLab