# build_utf8_scripts.py

#!/usr/bin/env python3
import re
import json
import os
import collections

# Pattern applied to each data line of the scripts file.
# Support 2 formats:
# xxxx..yyyy ; name
# xxxx ; name
# Captured groups:
#   1. start code, in uppercase hexadecimal
#   2. end code of the range (None when the line holds a single code)
#   3. script name, limited to word characters (anything after it on the
#      line is left unmatched)
REGEX_SCRIPTS = re.compile(r'^([A-F0-9]+)(?:\.\.([A-F0-9]+))?\s+; ([\w_]+)')

# Pattern applied to each data line of the names file.
# Support 2 formats:
# xxxx..yyyy ; name with spaces and -*
# xxxx ; name with spaces and -*
# Captured groups:
#   1. start code, in uppercase hexadecimal
#   2. end code of the range (None when the line holds a single code)
#   3. character name: everything after "; " up to the end of the line
REGEX_NAMES = re.compile(r'^([A-F0-9]+)(?:\.\.([A-F0-9]+))?\s+; (.*)$')


def int2hex(value):
    """
    Format a non-negative integer as an uppercase hexadecimal string
    without prefix, zero-padded to at least 4 digits (65 -> '0041')

    Raises ValueError when value is negative
    """
    assert isinstance(value, int)

    if value < 0:
        # hex() renders negatives as '-0x..': slicing off two characters
        # used to yield a malformed string such as '00X1'
        raise ValueError(f"Cannot convert negative value {value} to hex")

    # Uppercase by convention, padded to 4 digits minimum
    return f'{value:04X}'


def linearize_range(start, end):
    """
    Expand an inclusive range of hexadecimal codes into the list
    of every code it contains

    Without an end, the range is reduced to its start code
    """
    if end is not None:
        # Both bounds are hexadecimal strings: turn them into integers
        first, last = (int(f'0x{bound}', 16) for bound in (start, end))

        # The end of the range is included
        return [int2hex(code) for code in range(first, last + 1)]

    # Support single character: returned as given
    return [start]


def parse_scripts(path):
    """
    Parse Scripts.txt listing all Unicode scripts with their character ranges
    and output a dict indexed by script with all linearized ranges

    Returns a defaultdict(list) mapping each script name to the list of
    hexadecimal codes of its characters, in the order of the file

    Raises AssertionError when the file is missing or when a data line
    does not match the expected format
    """
    assert os.path.exists(path), f"Missing path {path}"
    scripts = collections.defaultdict(list)

    # Unicode data files are published as UTF-8: decode them explicitly
    # instead of relying on the platform default encoding
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Skip empty lines and comments
            if not line.strip() or line.startswith('#'):
                continue

            output = REGEX_SCRIPTS.search(line)
            assert output is not None, f"Invalid parsing on: {line}"

            # Add all linearized characters to the script
            start, end, script_name = output.groups()
            scripts[script_name] += linearize_range(start, end)

    return scripts


def parse_names(path):
    """
    Parse DerivedName.txt listing all Unicode characters and their names

    Returns a dict mapping each hexadecimal code to its character name

    Raises AssertionError when the file is missing or when a data line
    does not match the expected format
    """
    assert os.path.exists(path), f"Missing path {path}"

    names = {}

    # Unicode data files are published as UTF-8: decode them explicitly
    # instead of relying on the platform default encoding
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Skip empty lines and comments
            if not line.strip() or line.startswith('#'):
                continue

            output = REGEX_NAMES.search(line)
            assert output is not None, f"Invalid parsing on: {line}"

            # Every character of a range shares the name found on the line
            # NOTE(review): range names may end with "-*"; the pattern is
            # stored as-is for each code - confirm whether "*" should be
            # replaced by the code itself
            start, end, character_name = output.groups()
            for char in linearize_range(start, end):
                names[char] = character_name

    return names


def build_asset(script, characters, names):
    """
    Write assets/scripts/<script>.json describing a single script

    The file holds the script name and, for each of its characters,
    the code and the name looked up in names (null when unknown)
    """
    target = f"assets/scripts/{script}.json"
    os.makedirs(os.path.dirname(target), exist_ok=True)

    # One entry per character, in the order received
    entries = []
    for code in characters:
        entries.append({"code": code, "name": names.get(code)})

    document = {"name": script, "characters": entries}
    with open(target, "w") as f:
        json.dump(document, f, indent=4)
    print(f"Built {target}")


if __name__ == '__main__':
    # Both input files must be present in the current directory, e.g.:
    #   wget https://www.unicode.org/Public/13.0.0/ucd/Scripts.txt
    #   wget https://www.unicode.org/Public/13.0.0/ucd/extracted/DerivedName.txt
    characters_by_script = parse_scripts("Scripts.txt")
    character_names = parse_names("DerivedName.txt")

    # Build one JSON asset per script
    for script_name, script_characters in characters_by_script.items():
        build_asset(script_name, script_characters, character_names)