Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python3
import re
import json
import os
import collections
# Matches one data line of Scripts.txt. Support 2 formats:
# xxxx..yyyy ; name
# xxxx ; name
# Captured groups: start code, end code (None for the single-code format)
# and script name (word characters only).
REGEX_SCRIPTS = re.compile(r'^([A-F0-9]+)(?:\.\.([A-F0-9]+))?\s+; ([\w_]+)')
# Matches one data line of DerivedName.txt. Support 2 formats:
# xxxx..yyyy ; name with spaces and -*
# xxxx ; name with spaces and -*
# Captured groups: start code, end code (None for the single-code format)
# and character name (everything after "; " up to the end of the line).
REGEX_NAMES = re.compile(r'^([A-F0-9]+)(?:\.\.([A-F0-9]+))?\s+; (.*)$')
def int2hex(value):
    """
    Format an integer as an uppercase hexadecimal string, without the
    0x header and left-padded with zeros to at least 4 digits
    """
    assert isinstance(value, int)

    # hex() gives a lowercase string starting with the 0x header:
    # drop the header first, then apply the uppercase convention
    # and pad the short values
    digits = hex(value)[2:]
    return digits.upper().zfill(4)
def linearize_range(start, end):
    """
    Expand a range of hexadecimal codes into the list of every code it
    holds, both bounds included.
    When end is None, start is a single character and is returned
    unchanged in a one-element list
    """
    # A lone character needs no expansion
    if end is None:
        return [start]

    first = int(f'0x{start}', 16)
    last = int(f'0x{end}', 16)

    # Walk the integer range and format every code back to hexadecimal
    codes = []
    for code_point in range(first, last + 1):
        codes.append(int2hex(code_point))
    return codes
def parse_scripts(path):
    """
    Parse Scripts.txt listing all Unicode scripts with their character ranges
    and output a dict indexed by script with all linearized ranges

    path: location of the Scripts.txt file, decoded as UTF-8
    Returns a defaultdict(list) mapping each script name to the list of
    hexadecimal codes of its characters, in file order
    Raises AssertionError when the file does not exist or when a data line
    does not match REGEX_SCRIPTS
    """
    assert os.path.exists(path), f"Missing path {path}"

    scripts = collections.defaultdict(list)

    # The Unicode Character Database files are UTF-8 encoded: decode them
    # explicitly instead of relying on the platform default encoding
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Skip empty lines and comments
            if not line.strip() or line.startswith('#'):
                continue

            output = REGEX_SCRIPTS.search(line)
            assert output is not None, f"Invalid parsing on: {line}"

            # Add all linearized characters to the script
            start, end, script_name = output.groups()
            scripts[script_name] += linearize_range(start, end)

    return scripts
def parse_names(path):
    """
    Parse DerivedName.txt listing all Unicode characters and their names

    path: location of the DerivedName.txt file, decoded as UTF-8
    Returns a dict mapping each hexadecimal code to its character name;
    every code of a range line gets the name written on that line
    Raises AssertionError when the file does not exist or when a data line
    does not match REGEX_NAMES
    """
    assert os.path.exists(path), f"Missing path {path}"

    names = {}

    # The Unicode Character Database files are UTF-8 encoded: decode them
    # explicitly instead of relying on the platform default encoding
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Skip empty lines and comments
            if not line.strip() or line.startswith('#'):
                continue

            output = REGEX_NAMES.search(line)
            assert output is not None, f"Invalid parsing on: {line}"

            # Give the name to every character of the linearized range
            start, end, character_name = output.groups()
            for char in linearize_range(start, end):
                names[char] = character_name

    return names
def build_asset(script, characters, names):
    """
    Write the JSON asset of a script to assets/scripts/<script>.json,
    creating the destination directory when needed

    The payload holds the script name and, for every code in characters,
    an entry made of the code and its name looked up in names
    (None, hence null in the JSON output, when the code has no name)
    """
    dest = f"assets/scripts/{script}.json"
    os.makedirs(os.path.dirname(dest), exist_ok=True)

    # One entry per character, in the order provided
    entries = []
    for code in characters:
        entries.append({"code": code, "name": names.get(code)})

    with open(dest, "w") as f:
        json.dump({"name": script, "characters": entries}, f, indent=4)

    print(f"Built {dest}")
if __name__ == '__main__':
    # Both input files must be present in the current directory:
    #   wget https://www.unicode.org/Public/13.0.0/ucd/Scripts.txt
    #   wget https://www.unicode.org/Public/13.0.0/ucd/extracted/DerivedName.txt
    script_table = parse_scripts("Scripts.txt")
    name_table = parse_names("DerivedName.txt")

    # Build one JSON asset per script
    for script_name, codes in script_table.items():
        build_asset(script_name, codes, name_table)