"""Reverse the digit runs found in a transcription.

Dates and plain integers are reversed as a whole string; in any other word
that contains digits, only the individual runs of ASCII digits are reversed
and every other character stays where it is.
"""
import codecs  # noqa: F401  (no longer used; kept so the module namespace is unchanged)
import re

# Characters accepted between the numeric parts of a date. The "code" values
# are ordinary (non-raw) string literals, so each one already *is* the
# separator character and must not be unicode-escape-decoded again.
DATE_SEPARATORS = [{"code": "\u002F", "name": "SOLIDUS"}, {"code": "\u060D", "name": "ARABIC DATE SEPARATOR"}]

# One or more consecutive ASCII digits.
# NOTE: str.isdigit() (used by the predicates below) also accepts non-ASCII
# digit characters, whereas this pattern is ASCII-only.
_DIGIT_RUN = re.compile(r"[0-9]+")


def is_date_separator(char):
    """Return True if *char* is one of the characters listed in DATE_SEPARATORS.

    The comparison is done on the characters directly. Passing them through
    ``codecs.decode(..., "unicode-escape")`` re-interprets the UTF-8 bytes of a
    non-ASCII character as Latin-1, which turned U+060D into a two-character
    string that no single character could ever match.
    """
    return any(char == symbol["code"] for symbol in DATE_SEPARATORS)


def is_date(word):
    """Return True if *word* consists only of digits and date separators.

    Loose definition to allow potential prediction errors. An empty string and
    a plain integer both satisfy it.
    """
    return all(char.isdigit() or is_date_separator(char) for char in word)


def is_integer(word):
    """Return True if every character of *word* is a digit (True for "")."""
    return all(char.isdigit() for char in word)


def flip_single_word(word):
    """Reverse the numeric content of a single word.

    Two main cases:
    1. for dates and integers => flip the entire string
    2. for other cases (floats, numbers in parenthesis, mix of letters and
       numbers) => flip only numbers

    Args:
        word: the token to process.

    Returns:
        The word with its digits reversed as described above. In case 2 every
        non-digit character, whitespace included, keeps its position.
    """
    # TODO: should floats be flipped as a whole as well?
    if is_date(word) or is_integer(word):
        return word[::-1]

    # Reverse each digit run in place. Substituting directly (rather than
    # padding with spaces, stripping and re-joining) keeps any whitespace that
    # happens to sit next to a number, e.g. a trailing newline.
    return _DIGIT_RUN.sub(lambda match: match.group(0)[::-1], word)


def should_flip(word):
    """Return True if *word* contains at least one digit character."""
    return any(char.isdigit() for char in word)


def flip_numbers_in_transcription(text):
    """Apply flip_single_word to every space-separated word holding a digit.

    Args:
        text: the transcription to process.

    Returns:
        The text with the same single-space-separated layout, where each word
        containing a digit has been passed through flip_single_word.
    """
    # Split on the literal space (not on arbitrary whitespace) so that joining
    # with " " reproduces the original spacing exactly.
    return " ".join(flip_single_word(word) if should_flip(word) else word for word in text.split(" "))