Compare revisions — arkindex/workers/import/file
Changes are shown as if the source revision was being merged into the target revision.
Commits on Source (2)
Showing with 240 additions and 335 deletions
@@ -14,7 +14,7 @@ variables:
DEBIAN_FRONTEND: non-interactive
test:
-image: python:3.11
+image: python:3.12-slim
stage: test
cache:
@@ -32,8 +32,8 @@ test:
- apt-get update
- apt-get install -y --no-install-recommends poppler-utils
-# Install curl
-- apt-get update -q -y && apt-get install -q -y --no-install-recommends curl
+# Install curl and libmagic
+- apt-get update -q -y && apt-get install -q -y --no-install-recommends curl libmagic1
# Download OpenAPI schema from last backend build
- curl https://assets.teklia.com/arkindex/openapi.yml > schema.yml
@@ -45,7 +45,7 @@ test:
- tox -- --junitxml=test-report.xml --durations=50
lint:
-image: python:3.11
+image: python:3.12-slim
cache:
paths:
......
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
-rev: v0.3.1
+rev: v0.11.1
hooks:
# Run the linter.
- id: ruff
@@ -9,7 +9,7 @@ repos:
# Run the formatter.
- id: ruff-format
- repo: https://github.com/pre-commit/pre-commit-hooks
-rev: v4.5.0
+rev: v5.0.0
hooks:
- id: check-ast
- id: check-docstring-first
@@ -25,11 +25,11 @@ repos:
args: ['--django']
- id: check-json
- id: check-toml
- id: requirements-txt-fixer
- repo: https://github.com/codespell-project/codespell
-rev: v2.2.6
+rev: v2.4.1
hooks:
- id: codespell
args: ['--write-changes']
exclude: >
(?x)^(
tests/test_pdf_import.py|
@@ -42,3 +42,9 @@ repos:
rev: v0.10.0.1
hooks:
- id: shellcheck
+- repo: https://gitlab.teklia.com/tools/pre-commit-hooks
+rev: 0.1.0
+hooks:
+- id: long-test-files
+args: ['1000']
+files: '^tests\/(.*\/)?test_[^\/]*\.py$'
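A quick sanity check of the new hook's `files` pattern (a sketch assuming pre-commit's usual Python-regex matching against repository-relative paths):

```python
import re

# Unescaped version of the pattern from the hook configuration above
pattern = re.compile(r"^tests/(.*/)?test_[^/]*\.py$")

assert pattern.match("tests/test_worker.py")                 # top-level test module
assert pattern.match("tests/some/subdir/test_archives.py")   # nested test module
assert not pattern.match("tests/conftest.py")                # not a test_ file
assert not pattern.match("worker_file_import/test_foo.py")   # outside tests/
```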
-FROM python:3.11-slim
+FROM python:3.12-slim
WORKDIR /src
......
@@ -7,7 +7,7 @@ Worker to import files in various formats in Arkindex
For development and testing purposes, it may be useful to install the worker as an editable package with pip.
```shell
-pip3 install -e .
+pip install -e .
```
## Linter
......
@@ -8,7 +8,7 @@ license = { file = "LICENSE" }
version = "0.1.1"
description = "Worker to import files in various formats in Arkindex"
dependencies = [
-"arkindex-base-worker==0.4.0rc2",
+"arkindex-base-worker==0.4.0",
"natsort==8.3.1",
"pdfminer.six==20221105",
"pdf2image==1.16.0",
@@ -29,6 +29,7 @@ classifiers = [
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
+"Programming Language :: Python :: 3.12",
]
[project.scripts]
@@ -42,6 +43,7 @@ worker_file_import = ["assets/*"]
[tool.ruff]
exclude = [".git", "__pycache__"]
+target-version = "py312"
[tool.ruff.lint]
ignore = ["E501"]
@@ -76,5 +78,5 @@ select = [
"worker_file_import/**/*.py" = ["PT018"]
[tool.ruff.lint.isort]
-known-first-party = ["arkindex", "arkindex_worker"]
-known-third-party = ["pytest", "setuptools"]
+known-first-party = ["arkindex", "arkindex_worker", "teklia_toolbox"]
+known-third-party = ["ijson", "lxml", "natsort", "pdf2image", "pdfminer", "PIL", "pytest", "shapely", "tenacity", "zstandard"]
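Since `teklia_toolbox` now counts as first-party, the import reorderings later in this diff all follow the same pattern: `teklia_toolbox` imports move out of the third-party group and into the first-party group alongside `arkindex` and `arkindex_worker`. An illustrative sketch of the resulting grouping (module names chosen for illustration only):

```python
# third-party group
import requests
from tenacity import retry

# first-party group, per known-first-party above
from arkindex.exceptions import ErrorResponse
from arkindex_worker.models import Element
from teklia_toolbox.requests import should_verify_cert
```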
@@ -34,14 +34,15 @@ def _setup_environment(responses, monkeypatch) -> None:
monkeypatch.setattr(BaseWorker, "setup_api_client", mock_setup_api_client)
-@pytest.fixture()
+@pytest.fixture
def mock_worker(monkeypatch):
monkeypatch.setattr(ImportWorker, "configure", lambda *args, **kwargs: True)
worker = ImportWorker()
worker.args = Namespace(dev=False)
worker.corpus_types = None
-worker._corpus_id = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
+worker._corpus_id = "corpusid"
worker.worker_run_id = "bbbbbbbb-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
worker.folder_type = "folder"
......
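The test changes below all follow one pattern: `NamedTemporaryFile`, with its `flush()`/`seek(0)` dance, is replaced by pytest's built-in `tmp_path` fixture, which provides a per-test temporary directory as a `pathlib.Path` and cleans it up automatically. A minimal sketch of the pattern (hypothetical test, not from this repository):

```python
from zipfile import ZipFile

def test_archive_roundtrip(tmp_path):
    # tmp_path is a unique pathlib.Path directory created for this test
    archive_path = tmp_path / "archive.zip"
    with ZipFile(archive_path, "w") as archive:
        archive.writestr("lol.txt", "lol")
    # The file lives on disk for the whole test; no flush()/seek(0) needed
    with ZipFile(archive_path) as archive:
        assert archive.read("lol.txt") == b"lol"
```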
@@ -3,7 +3,6 @@ import logging
import tarfile
from io import BytesIO
from pathlib import Path
-from tempfile import NamedTemporaryFile
from unittest.mock import patch
from zipfile import ZipFile
@@ -13,7 +12,7 @@ SAMPLES = Path(__file__).absolute().parent / "samples"
@patch("worker_file_import.worker.TranskribusImporter")
-def test_run_zip(importer_mock, mock_worker, responses, caplog):
+def test_run_zip(importer_mock, mock_worker, responses, caplog, tmp_path):
caplog.set_level(logging.ERROR)
# Process info
@@ -75,32 +74,23 @@ def test_run_zip(importer_mock, mock_worker, responses, caplog):
)
# Create an archive that looks like a Transkribus export
-transkribus_zip = NamedTemporaryFile()
-with ZipFile(transkribus_zip, "a") as archive:
+transkribus_zip = tmp_path / "transkribus.zip"
+with ZipFile(transkribus_zip, "w") as archive:
archive.writestr("1234/document_title/mets.xml", "lol")
-transkribus_zip.flush()
-transkribus_zip.seek(0)
-zip_path = Path(transkribus_zip.name)
-responses.add(responses.GET, "http://s3/file1.zip", body=zip_path.open("rb"))
+responses.add(responses.GET, "http://s3/file1.zip", body=transkribus_zip.open("rb"))
# An archive that isn't a Transkribus export
-dummy_zip = NamedTemporaryFile()
-with ZipFile(dummy_zip, "a") as archive:
+dummy_zip = tmp_path / "dummy.zip"
+with ZipFile(dummy_zip, "w") as archive:
archive.writestr("lol.txt", "lol")
-dummy_zip.flush()
-dummy_zip.seek(0)
-zip_path = Path(dummy_zip.name)
-responses.add(responses.GET, "http://s3/file2.zip", body=zip_path.open("rb"))
+responses.add(responses.GET, "http://s3/file2.zip", body=dummy_zip.open("rb"))
# Windows MIME type test: we just use a dummy archive.
# We cannot reuse the dummy_zip defined above since it will be consumed by the mocked request
-windows_zip = NamedTemporaryFile()
-with ZipFile(windows_zip, "a") as archive:
+windows_zip = tmp_path / "windows.zip"
+with ZipFile(windows_zip, "w") as archive:
archive.writestr("lol.txt", "lol")
-windows_zip.flush()
-windows_zip.seek(0)
-zip_path = Path(windows_zip.name)
-responses.add(responses.GET, "http://s3/file3.zip", body=zip_path.open("rb"))
+responses.add(responses.GET, "http://s3/file3.zip", body=windows_zip.open("rb"))
# Not actually a zip file
responses.add(
@@ -199,7 +189,7 @@ def test_run_zip(importer_mock, mock_worker, responses, caplog):
]
-def test_run_tar(mock_worker, responses, caplog):
+def test_run_tar(mock_worker, responses, caplog, tmp_path):
caplog.set_level(logging.ERROR)
# Prepare all of the archives to test
@@ -214,35 +204,26 @@ def test_run_tar(mock_worker, responses, caplog):
]
# tarfile does not support an equivalent of ZipFile.writestr, so we need an actual file to add to archives.
-with NamedTemporaryFile() as text_file:
-text_file.write(b"lol")
-text_file.flush()
-for file_id, _, mode in cases:
-archive_file = NamedTemporaryFile()
-with tarfile.open(fileobj=archive_file, mode=mode) as archive:
-archive.add(text_file.name, arcname="lol.txt")
-archive_file.flush()
-archive_file.seek(0)
-tar_path = Path(archive_file.name)
-responses.add(
-responses.GET, f"http://s3/{file_id}", body=tar_path.open("rb")
-)
-# .tar.zst is handled separately, since tarfile doesn't support it
-with NamedTemporaryFile() as archive_file:
-with tarfile.open(fileobj=archive_file, mode="w:") as archive:
-archive.add(text_file.name, arcname="lol.txt")
-archive_file.flush()
-archive_file.seek(0)
-zst_archive = NamedTemporaryFile()
-ZstdCompressor().copy_stream(archive_file, zst_archive)
-zst_archive.seek(0)
-zst_path = Path(zst_archive.name)
-responses.add(responses.GET, "http://s3/zst", body=zst_path.open("rb"))
-cases.append(("zst", "application/zstd", None))
+text_file = tmp_path / "my_text.txt"
+text_file.touch()
+text_file.write_bytes(b"lol")
+for file_id, _, mode in cases:
+tar_path = tmp_path / f"archive.{file_id}"
+with tarfile.open(name=tar_path, mode=mode) as archive:
+archive.add(text_file, arcname="lol.txt")
+responses.add(responses.GET, f"http://s3/{file_id}", body=tar_path.open("rb"))
+# .tar.zst is handled separately, since tarfile doesn't support it
+tar_path = tmp_path / "uncompressed_archive.tar"
+with tarfile.open(name=tar_path, mode="w:") as archive:
+archive.add(text_file, arcname="lol.txt")
+zst_path = tmp_path / "compressed_archive.tar.zst"
+with tar_path.open("rb") as tar_archive, zst_path.open("wb") as zst_archive:
+ZstdCompressor().copy_stream(tar_archive, zst_archive)
+responses.add(responses.GET, "http://s3/zst", body=zst_path.open("rb"))
+cases.append(("zst", "application/zstd", None))
# Build RetrieveDataFile mocks
for file_id, content_type, _ in cases:
@@ -320,10 +301,10 @@ def test_run_tar(mock_worker, responses, caplog):
assert caplog.record_tuples == expected_logs
-def test_zipception(mock_worker, responses, caplog):
+def test_zipception(mock_worker, responses, caplog, tmp_path):
caplog.set_level(logging.ERROR)
-zip4 = NamedTemporaryFile()
+zip4 = tmp_path / "archive.zip"
with BytesIO() as zip1, BytesIO() as zip2, BytesIO() as zip3:
with ZipFile(zip1, "w") as archive:
archive.writestr("lol.txt", "lol")
@@ -336,10 +317,7 @@ def test_zipception(mock_worker, responses, caplog):
with ZipFile(zip4, "w") as archive:
archive.writestr("lol.zip.zip.zip", zip3.getvalue())
-zip4.flush()
-zip4.seek(0)
-zip_path = Path(zip4.name)
-responses.add(responses.GET, "http://s3/file1.zip", body=zip_path.open("rb"))
+responses.add(responses.GET, "http://s3/file1.zip", body=zip4.open("rb"))
# Process info
mock_worker.process_information = {
......
@@ -4,10 +4,10 @@ from pathlib import Path
from unittest.mock import patch
import pytest
-from apistar.exceptions import ErrorResponse
from PIL import Image
from requests.exceptions import ConnectionError
+from arkindex.exceptions import ErrorResponse
from worker_file_import.image import check_image, create_image, upload_image
SAMPLES = Path(__file__).absolute().parent / "samples"
......
from pathlib import Path
-from tempfile import NamedTemporaryFile
from unittest.mock import patch
from zipfile import ZipFile
import pytest
-from apistar.exceptions import ErrorResponse
+from arkindex.exceptions import ErrorResponse
from worker_file_import.transkribus import TranskribusElement, TranskribusFolder
SAMPLES = Path(__file__).absolute().parent / "samples"
@@ -98,7 +97,7 @@ def test_create_folder(mock_worker):
@patch("worker_file_import.transkribus.hash_image")
-def test_upload_image(hash_image_mock, mock_worker, responses):
+def test_upload_image(hash_image_mock, mock_worker, responses, tmp_path):
hash_image_mock.return_value = "01e9327d0e5ab7c516a0792c3662bc65"
mock_worker.api_client.add_response(
"CreateImage",
@@ -123,14 +122,10 @@ def test_upload_image(hash_image_mock, mock_worker, responses):
},
)
-with (
-NamedTemporaryFile() as archive_file,
-ZipFile(archive_file, mode="w") as archive,
-):
+archive_path = tmp_path / "archive.zip"
+with ZipFile(archive_path, mode="w") as archive:
archive.write(str(SAMPLES / "200x200.jpg"), arcname="0001_123456.jpg")
archive.write(str(SAMPLES / "transcript.xml"), arcname="0001_123456.xml")
-archive_file.flush()
-archive_file.seek(0)
test_element = TranskribusElement(
mock_worker,
@@ -155,7 +150,7 @@ def test_upload_image(hash_image_mock, mock_worker, responses):
@patch("worker_file_import.transkribus.hash_image")
-def test_upload_image_exists(hash_image_mock, mock_worker):
+def test_upload_image_exists(hash_image_mock, mock_worker, tmp_path):
hash_image_mock.return_value = "01e9327d0e5ab7c516a0792c3662bc65"
mock_worker.api_client.add_error_response(
@@ -185,14 +180,10 @@ def test_upload_image_exists(hash_image_mock, mock_worker):
},
)
-with (
-NamedTemporaryFile() as archive_file,
-ZipFile(archive_file, mode="w") as archive,
-):
+archive_path = tmp_path / "archive.zip"
+with ZipFile(archive_path, mode="w") as archive:
archive.write(str(SAMPLES / "200x200.jpg"), arcname="0001_123456.jpg")
archive.write(str(SAMPLES / "transcript.xml"), arcname="0001_123456.xml")
-archive_file.flush()
-archive_file.seek(0)
test_element = TranskribusElement(
mock_worker,
@@ -216,7 +207,7 @@ def test_upload_image_exists(hash_image_mock, mock_worker):
@patch("worker_file_import.transkribus.hash_image")
-def test_upload_image_failure(hash_image_mock, mock_worker):
+def test_upload_image_failure(hash_image_mock, mock_worker, tmp_path):
hash_image_mock.return_value = "01e9327d0e5ab7c516a0792c3662bc65"
mock_worker.api_client.add_error_response(
@@ -226,14 +217,10 @@ def test_upload_image_failure(hash_image_mock, mock_worker):
content={"oh no"},
)
-with (
-NamedTemporaryFile() as archive_file,
-ZipFile(archive_file, mode="w") as archive,
-):
+archive_path = tmp_path / "archive.zip"
+with ZipFile(archive_path, mode="w") as archive:
archive.write(str(SAMPLES / "200x200.jpg"), arcname="0001_123456.jpg")
archive.write(str(SAMPLES / "transcript.xml"), arcname="0001_123456.xml")
-archive_file.flush()
-archive_file.seek(0)
test_element = TranskribusElement(
mock_worker,
......@@ -250,7 +237,7 @@ def test_upload_image_failure(hash_image_mock, mock_worker):
test_element.upload_image()
-def test_upload_transcriptions(mock_worker):
+def test_upload_transcriptions(mock_worker, tmp_path):
mock_worker.api_client.add_response(
"CreateElement",
body={
@@ -476,14 +463,10 @@ def test_upload_transcriptions(mock_worker):
},
)
-with (
-NamedTemporaryFile() as archive_file,
-ZipFile(archive_file, mode="w") as archive,
-):
+archive_path = tmp_path / "archive.zip"
+with ZipFile(archive_path, mode="w") as archive:
archive.write(str(SAMPLES / "200x200.jpg"), arcname="0001_123456.jpg")
archive.write(str(SAMPLES / "transcript.xml"), arcname="0001_123456.xml")
-archive_file.flush()
-archive_file.seek(0)
test_element = TranskribusElement(
mock_worker,
@@ -509,7 +492,7 @@ def test_upload_transcriptions(mock_worker):
@patch("worker_file_import.transkribus.hash_image")
-def test_run(hash_image_mock, mock_worker, responses):
+def test_run(hash_image_mock, mock_worker, responses, tmp_path):
hash_image_mock.return_value = "01e9327d0e5ab7c516a0792c3662bc65"
mock_worker.api_client.add_response(
@@ -759,14 +742,10 @@ def test_run(hash_image_mock, mock_worker, responses):
},
)
-with (
-NamedTemporaryFile() as archive_file,
-ZipFile(archive_file, mode="w") as archive,
-):
+archive_path = tmp_path / "archive.zip"
+with ZipFile(archive_path, mode="w") as archive:
archive.write(str(SAMPLES / "200x200.jpg"), arcname="0001_123456.jpg")
archive.write(str(SAMPLES / "transcript.xml"), arcname="0001_123456.xml")
-archive_file.flush()
-archive_file.seek(0)
test_element = TranskribusElement(
mock_worker,
......
from pathlib import Path
-from tempfile import NamedTemporaryFile
from unittest.mock import patch
from zipfile import ZipFile
@@ -21,38 +20,7 @@ CORPUS_INFO = {
}
-# def setUpClass(cls):
-# _, cls.archive_path = tempfile.mkstemp()
-# cls.archive_path = Path(cls.archive_path)
-# with ZipFile(cls.archive_path, mode="w") as archive:
-# archive.write(
-# str(SAMPLES / "img1.jpg"), arcname="555/document/0001_123456.jpg"
-# )
-# archive.write(
-# str(SAMPLES / "transcript.xml"), arcname="555/document/0001_123456.xml"
-# )
-# _, cls.subdir_archive_path = tempfile.mkstemp()
-# cls.subdir_archive_path = Path(cls.subdir_archive_path)
-# with ZipFile(cls.subdir_archive_path, mode="w") as archive:
-# archive.write(
-# str(SAMPLES / "img1.jpg"),
-# arcname="555/document/some/subdir/abcdefg.jpg",
-# )
-# archive.write(
-# str(SAMPLES / "transcript.xml"),
-# arcname="555/document/foo/bar/abcdefg.xml",
-# )
-# @classmethod
-# def tearDownClass(cls):
-# cls.archive_path.unlink()
-# def setUp(self):
-# self.maxDiff = None
-def test_init_missing_types(mock_worker):
+def test_init_missing_types(mock_worker, tmp_path):
mock_worker.process_information = {
"id": "processid",
"corpus": "corpusid",
@@ -76,7 +44,8 @@ def test_init_missing_types(mock_worker):
"CreateElementType",
body={
"slug": "paragraph",
-"display_name": "Paragraph",
+"display_name": "paragraph",
+"folder": False,
"corpus": "corpusid",
},
response={"id": "par_type_id"},
@@ -85,28 +54,25 @@
"CreateElementType",
body={
"slug": "text_line",
-"display_name": "Text Line",
+"display_name": "text_line",
+"folder": False,
"corpus": "corpusid",
},
response={"id": "line_type_id"},
)
-with (
-NamedTemporaryFile() as archive_file,
-ZipFile(archive_file, mode="w") as archive,
-):
+archive_path = tmp_path / "archive.zip"
+with ZipFile(archive_path, mode="w") as archive:
archive.write(
str(SAMPLES / "200x200.jpg"), arcname="555/document/0001_123456.jpg"
)
archive.write(
str(SAMPLES / "transcript.xml"), arcname="555/document/0001_123456.xml"
)
-archive_file.flush()
-archive_file.seek(0)
-importer = TranskribusImporter(mock_worker, archive_file)
+importer = TranskribusImporter(mock_worker, archive_path)
-assert importer.archive_path == archive_file
+assert importer.archive_path == archive_path
assert importer.project_id == "corpusid"
assert importer.folder_type == "book"
assert importer.element_type == "page"
@@ -116,7 +82,7 @@ def test_init_missing_types(mock_worker):
@patch("worker_file_import.transkribus.hash_image")
-def test_run(hash_image_mock, mock_worker, responses):
+def test_run(hash_image_mock, mock_worker, responses, tmp_path):
hash_image_mock.return_value = "01e9327d0e5ab7c516a0792c3662bc65"
mock_worker.process_information = {
"id": "processid",
@@ -401,43 +367,41 @@ def test_run(hash_image_mock, mock_worker, responses):
},
)
-with NamedTemporaryFile() as archive_file:
-with ZipFile(archive_file, mode="w") as archive:
-archive.write(
-str(SAMPLES / "200x200.jpg"), arcname="555/document/0001_123456.jpg"
-)
-archive.write(
-str(SAMPLES / "transcript.xml"), arcname="555/document/0001_123456.xml"
-)
-archive_file.flush()
-archive_file.seek(0)
-importer = TranskribusImporter(mock_worker, archive_file)
-assert importer.run() == [
-{
-"id": "folderid",
-"type": "book",
-"name": "document",
-"corpus": {"id": "corpusid"},
-},
-{
-"id": "elementid",
-"parent": "folderid",
-"type": "page",
-"name": "42",
-"corpus": {"id": "corpusid"},
-"zone": {
-"image": {"id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"},
-"polygon": None,
-},
-"confidence": None,
-},
-]
+archive_path = tmp_path / "archive.zip"
+with ZipFile(archive_path, mode="w") as archive:
+archive.write(
+str(SAMPLES / "200x200.jpg"), arcname="555/document/0001_123456.jpg"
+)
+archive.write(
+str(SAMPLES / "transcript.xml"), arcname="555/document/0001_123456.xml"
+)
+importer = TranskribusImporter(mock_worker, archive_path)
+assert importer.run() == [
+{
+"id": "folderid",
+"type": "book",
+"name": "document",
+"corpus": {"id": "corpusid"},
+},
+{
+"id": "elementid",
+"parent": "folderid",
+"type": "page",
+"name": "42",
+"corpus": {"id": "corpusid"},
+"zone": {
+"image": {"id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"},
+"polygon": None,
+},
+"confidence": None,
+},
+]
@patch("worker_file_import.transkribus.hash_image")
-def test_run_subdir(hash_image_mock, mock_worker, responses):
+def test_run_subdir(hash_image_mock, mock_worker, responses, tmp_path):
hash_image_mock.return_value = "01e9327d0e5ab7c516a0792c3662bc65"
mock_worker.process_information = {
"id": "processid",
@@ -722,45 +686,43 @@ def test_run_subdir(hash_image_mock, mock_worker, responses):
},
)
-with NamedTemporaryFile() as archive_file:
-with ZipFile(archive_file, mode="w") as archive:
-archive.write(
-str(SAMPLES / "200x200.jpg"),
-arcname="555/document/some/subdir/0001_123456.jpg",
-)
-archive.write(
-str(SAMPLES / "transcript.xml"),
-arcname="555/document/some/subdir/0001_123456.xml",
-)
-archive_file.flush()
-archive_file.seek(0)
-importer = TranskribusImporter(mock_worker, archive_file)
-assert importer.run() == [
-{
-"id": "folderid",
-"type": "book",
-"name": "document",
-"corpus": {"id": "corpusid"},
-},
-{
-"id": "elementid",
-"parent": "folderid",
-"type": "page",
-"name": "42",
-"corpus": {"id": "corpusid"},
-"zone": {
-"image": {"id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"},
-"polygon": None,
-},
-"confidence": None,
-},
-]
+archive_path = tmp_path / "archive.zip"
+with ZipFile(archive_path, mode="w") as archive:
+archive.write(
+str(SAMPLES / "200x200.jpg"),
+arcname="555/document/some/subdir/0001_123456.jpg",
+)
+archive.write(
+str(SAMPLES / "transcript.xml"),
+arcname="555/document/some/subdir/0001_123456.xml",
+)
+importer = TranskribusImporter(mock_worker, archive_path)
+assert importer.run() == [
+{
+"id": "folderid",
+"type": "book",
+"name": "document",
+"corpus": {"id": "corpusid"},
+},
+{
+"id": "elementid",
+"parent": "folderid",
+"type": "page",
+"name": "42",
+"corpus": {"id": "corpusid"},
+"zone": {
+"image": {"id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"},
+"polygon": None,
+},
+"confidence": None,
+},
+]
@patch("worker_file_import.transkribus.hash_image")
-def test_run_parent(hash_image_mock, mock_worker, responses):
+def test_run_parent(hash_image_mock, mock_worker, responses, tmp_path):
hash_image_mock.return_value = "01e9327d0e5ab7c516a0792c3662bc65"
mock_worker.process_information = {
"id": "processid",
@@ -1055,52 +1017,50 @@ def test_run_parent(hash_image_mock, mock_worker, responses):
},
)
-with NamedTemporaryFile() as archive_file:
-with ZipFile(archive_file, mode="w") as archive:
-archive.write(
-str(SAMPLES / "200x200.jpg"),
-arcname="555/document/some/subdir/0001_123456.jpg",
-)
-archive.write(
-str(SAMPLES / "transcript.xml"),
-arcname="555/document/some/subdir/0001_123456.xml",
-)
-archive_file.flush()
-archive_file.seek(0)
-importer = TranskribusImporter(
-mock_worker,
-archive_file,
-parent_element=Element(
-{"id": "parentid", "type": "book", "corpus": {"id": "corpusid"}}
-),
-)
-assert importer.run() == [
-{
-"id": "folderid",
-"type": "book",
-"name": "document",
-"parent": "parentid",
-"corpus": {"id": "corpusid"},
-},
-{
-"id": "elementid",
-"parent": "folderid",
-"type": "page",
-"name": "42",
-"corpus": {"id": "corpusid"},
-"zone": {
-"image": {"id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"},
-"polygon": None,
-},
-"confidence": None,
-},
-]
+archive_path = tmp_path / "archive.zip"
+with ZipFile(archive_path, mode="w") as archive:
+archive.write(
+str(SAMPLES / "200x200.jpg"),
+arcname="555/document/some/subdir/0001_123456.jpg",
+)
+archive.write(
+str(SAMPLES / "transcript.xml"),
+arcname="555/document/some/subdir/0001_123456.xml",
+)
+importer = TranskribusImporter(
+mock_worker,
+archive_path,
+parent_element=Element(
+{"id": "parentid", "type": "book", "corpus": {"id": "corpusid"}}
+),
+)
+assert importer.run() == [
+{
+"id": "folderid",
+"type": "book",
+"name": "document",
+"parent": "parentid",
+"corpus": {"id": "corpusid"},
+},
+{
+"id": "elementid",
+"parent": "folderid",
+"type": "page",
+"name": "42",
+"corpus": {"id": "corpusid"},
+"zone": {
+"image": {"id": "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"},
+"polygon": None,
+},
+"confidence": None,
+},
+]
@patch("worker_file_import.transkribus.TranskribusFolder.run")
-def test_run_folder_error(run_mock, mock_worker):
+def test_run_folder_error(run_mock, mock_worker, tmp_path):
mock_worker.process_information = {
"id": "processid",
"corpus": "corpusid",
@@ -1114,24 +1074,22 @@ def test_run_folder_error(run_mock, mock_worker):
)
run_mock.side_effect = TypeError("Oh snap!")
-with NamedTemporaryFile() as archive_file:
-with ZipFile(archive_file, mode="w") as archive:
-archive.write(
-str(SAMPLES / "200x200.jpg"),
-arcname="555/document/some/subdir/0001_123456.jpg",
-)
-archive.write(
-str(SAMPLES / "transcript.xml"),
-arcname="555/document/some/subdir/0001_123456.xml",
-)
-archive_file.flush()
-archive_file.seek(0)
-importer = TranskribusImporter(
-mock_worker,
-archive_file,
-)
-# Nothing is imported
-assert importer.run() == []
-assert run_mock.call_count == 1
+archive_path = tmp_path / "archive.zip"
+with ZipFile(archive_path, mode="w") as archive:
+archive.write(
+str(SAMPLES / "200x200.jpg"),
+arcname="555/document/some/subdir/0001_123456.jpg",
+)
+archive.write(
+str(SAMPLES / "transcript.xml"),
+arcname="555/document/some/subdir/0001_123456.xml",
+)
+importer = TranskribusImporter(
+mock_worker,
+archive_path,
+)
+# Nothing is imported
+assert importer.run() == []
+assert run_mock.call_count == 1
@@ -5,11 +5,11 @@ from urllib.parse import urljoin, urlparse
import ijson
import requests
-from apistar.exceptions import ErrorResponse
-from teklia_toolbox.requests import should_verify_cert
+from arkindex.exceptions import ErrorResponse
from arkindex_worker.models import Element
from arkindex_worker.worker.metadata import MetaType
+from teklia_toolbox.requests import should_verify_cert
from worker_file_import import USER_AGENT
from worker_file_import.utils import retried_get
@@ -98,9 +98,9 @@ class IIIFParser:
:param folder_name: Optionally override the folder name.
"""
self.worker = worker
-assert isinstance(
-stream, BufferedIOBase
-), "Stream should be a file-like object in binary mode"
+assert isinstance(stream, BufferedIOBase), (
+"Stream should be a file-like object in binary mode"
+)
self.stream = stream
self.parent_element = parent_element
self.folder_name = folder_name
@@ -284,7 +284,7 @@ class ManifestParser(IIIFParser):
):
raise
logger.warning(
-f'Image check failed for {url}: {e.content["status"]}', exc_info=False
+f"Image check failed for {url}: {e.content['status']}", exc_info=False
)
return self.worker.api_client.request("RetrieveImage", id=e.content["id"])
......
@@ -3,11 +3,11 @@ from logging import Logger, getLogger
from pathlib import Path
import requests
-from apistar.exceptions import ErrorResponse
from PIL import Image, ImageOps
-from teklia_toolbox.requests import should_verify_cert
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
+from arkindex.exceptions import ErrorResponse
+from teklia_toolbox.requests import should_verify_cert
from worker_file_import.utils import hash_image
logger: Logger = getLogger(__name__)
......
import logging
from shapely.geometry import LinearRing
-from teklia_toolbox.pagexml import PageXmlPage
from arkindex_worker.models import Element
from arkindex_worker.worker.metadata import MetaType
+from teklia_toolbox.pagexml import PageXmlPage
logger = logging.getLogger(__name__)
......
@@ -10,9 +10,9 @@ from pdfminer.layout import LTTextContainer
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
-from teklia_toolbox.time import Timer
from arkindex_worker.models import Element
+from teklia_toolbox.time import Timer
from worker_file_import.pdf_language import get_text_orientation
logger = logging.getLogger(__name__)
@@ -96,9 +96,9 @@ def build_transcription(pdf_element, pdf_page, ark_page):
def extract_pdf_text(path, ark_pages, existing_pages=None):
# Load all pages and children
pdf_pages = list(extract_pages(path))
-assert len(pdf_pages) == len(
-ark_pages
-), f"Invalid number of pages: pdf has {len(pdf_pages)}, ark has {len(ark_pages)}"
+assert len(pdf_pages) == len(ark_pages), (
+f"Invalid number of pages: pdf has {len(pdf_pages)}, ark has {len(ark_pages)}"
+)
# Do not upload transcriptions for pages that already existed on Arkindex (retried imports)
if not existing_pages:
......
@@ -6,13 +6,13 @@ from pathlib import Path
from zipfile import BadZipFile, ZipFile
import requests
-from apistar.exceptions import ErrorResponse
from lxml import etree
from PIL import Image, ImageOps
-from teklia_toolbox.requests import should_verify_cert
+from arkindex.exceptions import ErrorResponse
from arkindex_worker.models import Element
from arkindex_worker.worker.metadata import MetaType
+from teklia_toolbox.requests import should_verify_cert
from worker_file_import.page_xml import PageXmlParser
from worker_file_import.utils import hash_image
@@ -191,16 +191,13 @@ class TranskribusFolder:
def get_or_create_folder(self):
if self.parent_element is None:
-search = self.worker.api_client.paginate(
-"ListElements",
-corpus=self.project_id,
+search = self.worker.list_elements(
name=self.title,
type=self.folder_type,
)
else:
-search = self.worker.api_client.paginate(
-"ListElementChildren",
-id=self.parent_element["id"],
+search = self.worker.list_element_children(
+element=Element(id=self.parent_element["id"]),
name=self.title,
type=self.folder_type,
recursive=True,
@@ -303,25 +300,9 @@ class TranskribusImporter:
self.line_type = "text_line"
# Check if paragraph and text_line element types exist on the target project; if not create them
self.project_id = self.worker.process_information["corpus"]
-project = self.worker.api_client.request("RetrieveCorpus", id=self.project_id)
-if not any(t["slug"] == "paragraph" for t in project["types"]):
-self.worker.api_client.request(
-"CreateElementType",
-body={
-"slug": "paragraph",
-"display_name": "Paragraph",
-"corpus": self.project_id,
-},
-)
-if not any(t["slug"] == "text_line" for t in project["types"]):
-self.worker.api_client.request(
-"CreateElementType",
-body={
-"slug": "text_line",
-"display_name": "Text Line",
-"corpus": self.project_id,
-},
-)
+self.worker.check_required_types(
+self.paragraph_type, self.line_type, create_missing=True
+)
def run(self):
elements = []
......
@@ -10,14 +10,15 @@ from zipfile import ZipFile
import magic
import requests
-from apistar.exceptions import ErrorResponse
-from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
-from zstandard import ZstdDecompressor
+from arkindex.exceptions import ErrorResponse
+from teklia_toolbox.requests import (
+HTTP_GET_RETRY_BACKOFF,
+download_file,
+should_verify_cert,
+)
+from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
+from zstandard import ZstdDecompressor
logger: Logger = getLogger(__name__)
@@ -100,7 +101,7 @@ def delete_files(api_client, files):
try:
api_client.request("DestroyDataFile", id=datafile["id"])
except ErrorResponse as e:
-logger.warning(f'Could not delete file {datafile["name"]}: {e}')
+logger.warning(f"Could not delete file {datafile['name']}: {e}")
@retry(
@@ -154,14 +155,13 @@ def get_archive_datafiles(datafile):
elif datafile["content_type"] in TAR_MIME_TYPES | ZSTANDARD_MIME_TYPES:
if datafile["content_type"] in ZSTANDARD_MIME_TYPES:
# TarFile does not support .tar.zst, so we decompress ourselves
-fileobj = TemporaryFile()
+fileobj = TemporaryFile()  # noqa: SIM115 closed later
with Path(datafile["local_path"]).open("rb") as compressed_stream:
ZstdDecompressor().copy_stream(compressed_stream, fileobj)
fileobj.seek(0)
else:
# Just open the file and let Tarfile deal with other compression methods
-with Path(datafile["local_path"]) as compressed_stream:
-fileobj = compressed_stream.open("rb")
+fileobj = Path(datafile["local_path"]).open("rb")  # noqa: SIM115 closed later
# We have to include `with fileobj` because TarFile() will not close the original file object by itself
with fileobj, tarfile.open(fileobj=fileobj) as archive:
......
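As the hunk above shows, `tarfile` cannot read zstandard-compressed archives directly, so the worker decompresses into a separate seekable file object before handing it to `tarfile.open`. A minimal round-trip sketch of the same idea (hypothetical file names, assuming the `zstandard` package):

```python
import tarfile
from io import BytesIO
from zstandard import ZstdCompressor, ZstdDecompressor

# Compress an existing tar archive into .tar.zst
with open("archive.tar", "rb") as src, open("archive.tar.zst", "wb") as dst:
    ZstdCompressor().copy_stream(src, dst)

# To read it back, decompress into a seekable buffer, then hand it to tarfile
buffer = BytesIO()
with open("archive.tar.zst", "rb") as compressed:
    ZstdDecompressor().copy_stream(compressed, buffer)
buffer.seek(0)
with tarfile.open(fileobj=buffer) as archive:
    print(archive.getnames())
```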
@@ -3,9 +3,9 @@ import sys
from datetime import datetime
from logging import Logger, getLogger
-from apistar.exceptions import ErrorResponse
from natsort import natsorted
+from arkindex.exceptions import ErrorResponse
from arkindex_worker.image import update_pillow_image_size_limit
from arkindex_worker.worker import (
BaseWorker,
@@ -74,7 +74,7 @@ class ImportWorker(BaseWorker, ElementMixin, MetaDataMixin, TranscriptionMixin):
parent_name = datafiles[0]["name"]
else:
parent_name = (
-f'Import images {datetime.now().isoformat(sep=" ", timespec="minutes")}'
+f"Import images {datetime.now().isoformat(sep=' ', timespec='minutes')}"
)
self.process_information["element"] = self.api_client.request(
@@ -179,12 +179,12 @@ class ImportWorker(BaseWorker, ElementMixin, MetaDataMixin, TranscriptionMixin):
When any file in the archive has not been successfully imported, an exception is raised,
but only after having tried to import as much of the archive as possible.
"""
-assert (
-self.archive_depth < MAX_ARCHIVE_DEPTH
-), "Maximum archive decompression depth reached"
+assert self.archive_depth < MAX_ARCHIVE_DEPTH, (
+"Maximum archive decompression depth reached"
+)
self.archive_depth += 1
-logger.info(f'Creating folder for archive {datafile["name"]}')
+logger.info(f"Creating folder for archive {datafile['name']}")
archive_folder = self.api_client.request(
"CreateElement",
body={
@@ -205,18 +205,18 @@ class ImportWorker(BaseWorker, ElementMixin, MetaDataMixin, TranscriptionMixin):
self.import_datafile(subdatafile, Element(archive_folder))
except Exception as e:
logger.error(
-f'Could not import file {subdatafile["name"]} from archive {datafile["name"]}: {e.__class__.__name__}: {str(e)}'
+f"Could not import file {subdatafile['name']} from archive {datafile['name']}: {e.__class__.__name__}: {str(e)}"
)
failed = True
self.archive_depth -= 1
-assert (
-not failed
-), f'Not all files from archive {datafile["name"]} have been imported.'
+assert not failed, (
+f"Not all files from archive {datafile['name']} have been imported."
+)
def import_transkribus_archive(self, datafile, parent_element) -> list:
-logger.info(f'Starting Transkribus import for {datafile["name"]}')
+logger.info(f"Starting Transkribus import for {datafile['name']}")
self.elements.extend(
TranskribusImporter(
@@ -304,7 +304,7 @@ class ImportWorker(BaseWorker, ElementMixin, MetaDataMixin, TranscriptionMixin):
self.import_datafile(df, parent_element)
except Exception as e:
logger.error(
-f'Could not import file {df["name"]}: {e.__class__.__name__}: {str(e)}'
+f"Could not import file {df['name']}: {e.__class__.__name__}: {str(e)}"
)
continue
......