]>
Commit | Line | Data |
---|---|---|
f20569fa XL |
1 | #!/usr/bin/python |
2 | ||
3 | import collections | |
4 | import re | |
5 | ||
# Number of character literals emitted per row of a generated table.
column_size = 8

# Maps the two-letter general category code read from UnicodeData.txt to the
# two name parts that generate_tables() concatenates into a table name
# (e.g. 'Lu' -> 'LetterUppercase', later rendered as LETTER_UPPERCASE).
# The spellings below end up in generated identifiers, so do not "correct"
# them without updating the consumers of the generated file.
categories = {
    'Cc': ['Other', 'Control'],
    'Cf': ['Other', 'Format'],
    'Cn': ['Other', 'NotAssigned'],
    'Co': ['Other', 'PrivateUse'],
    # 'Cs' rows are dropped by generate_dict(), so no table is emitted for it.
    'Cs': ['Other', 'Surrogate'],
    # NOTE(review): 'Ls' is not a General_Category value in UAX #44 (the
    # cased-letter grouping is 'LC'); presumably this entry is never hit --
    # confirm before removing or renaming.
    'Ls': ['Letter', 'Cased'],
    'Ll': ['Letter', 'Lowercased'],
    'Lm': ['Letter', 'Modifier'],
    'Lo': ['Letter', 'Other'],
    'Lt': ['Letter', 'Titlecase'],
    'Lu': ['Letter', 'Uppercase'],
    'Mc': ['Mark', 'SpaceCombining'],
    'Me': ['Mark', 'Enclosing'],
    'Mn': ['Mark', 'Nonspacing'],
    'Nd': ['Number', 'DecimalDigit'],
    'Nl': ['Number', 'Letter'],
    'No': ['Number', 'Other'],
    'Pc': ['Punctuation', 'Connector'],
    'Pd': ['Punctuation', 'Dash'],
    'Pe': ['Punctuation', 'Close'],
    'Pf': ['Punctuation', 'FinalQuote'],
    'Pi': ['Punctuation', 'InitialQuote'],
    'Po': ['Punctuation', 'Other'],
    'Ps': ['Punctuation', 'Open'],
    'Sc': ['Symbol', 'Currency'],
    'Sk': ['Symbol', 'Modifier'],
    'Sm': ['Symbol', 'Math'],
    'So': ['Symbol', 'Other'],
    'Zl': ['Separator', 'Line'],
    'Zp': ['Separator', 'Paragraph'],
    'Zs': ['Separator', 'Space'],
}
41 | ||
def generate_rows(path='UnicodeData.txt'):
    """Yield ``(code_point, category)`` pairs from a UnicodeData-style file.

    Each non-blank line is split on ';'. The first field (the code point
    as written in the file) and the third field (the general category
    code) are yielded as a tuple of strings.

    Args:
        path: File to read. Defaults to 'UnicodeData.txt' in the current
            working directory, which is what the script has always read.

    Raises:
        IOError/OSError: If the file cannot be opened.
        IndexError: If a non-blank line has fewer than three fields.
    """
    # NOTE(review): UnicodeData.txt describes some large blocks with a pair
    # of "First"/"Last" rows; only those two rows are yielded here, the
    # range is not expanded -- confirm whether that is intended.
    with open(path, 'r') as ucd:
        for line in ucd:
            if not line.strip():
                # A stray blank line (e.g. at end of file) carries no data.
                continue
            fields = line.split(';')
            yield (fields[0], fields[2])
48 | ||
49 | ||
def generate_dict(rows_gen):
    """Group code points by general category code.

    Args:
        rows_gen: Iterable of ``(char, category)`` pairs.

    Returns:
        A ``collections.defaultdict(list)`` mapping each category code to
        the list of ``char`` values seen for it, in input order. Rows in
        category 'Cs' are omitted entirely.
    """
    grouped = collections.defaultdict(list)
    for char, category in rows_gen:
        # Surrogates ('Cs') are left out because Rust does not accept them
        # as unicode character literals.
        if category != 'Cs':
            grouped[category].append(char)
    return grouped
59 | ||
def generate_tables(d):
    """Re-key a category-code mapping by its concatenated category name.

    Args:
        d: Mapping from a two-letter category code to a list of code points.

    Returns:
        A ``collections.defaultdict(list)`` whose keys are the two name
        parts from ``categories`` joined together (e.g. 'LetterUppercase')
        and whose values are the same list objects as in ``d``.

    Raises:
        KeyError: If ``d`` contains a code that is not in ``categories``.
    """
    named = collections.defaultdict(list)
    for code, chars in d.items():
        named[''.join(categories[code])] = chars
    return named
66 | ||
def print_header():
    """Print the "autogenerated" banner comment followed by a blank line."""
    print("// This file is autogenerated by scripts/unicode.py.\n")
69 | ||
def main():
    """Run the generator: print the header, then every category table.

    Reads rows via generate_rows(), groups them with generate_dict(),
    renames the groups with generate_tables(), and writes the result to
    standard output with output_tables().
    """
    print_header()
    rows = generate_rows()
    by_code = generate_dict(rows)
    by_name = generate_tables(by_code)
    output_tables(by_name)
76 | ||
def output_tables(d):
    """Print one Rust ``pub static`` char-slice definition per table.

    Args:
        d: Mapping from a CamelCase table name to a list of code point
            strings. Tables are printed in sorted key order; each name is
            converted to upper snake case, and each code point is wrapped
            as a ``'\\u{...}'`` literal, ``column_size`` literals per row.
    """
    for key in sorted(d):
        name = camel_to_snake_case(key).upper()
        # Build a real list (not a lazy map object) so it can be sliced on
        # both Python 2 and Python 3.
        rust_unicode_escapes = [r"'\u{{{}}}'".format(x) for x in d[key]]
        table_lines = [
            ' ' + ', '.join(rust_unicode_escapes[start:start + column_size])
            for start in range(0, len(rust_unicode_escapes), column_size)
        ]
        table_string = ',\n'.join(table_lines)
        print("pub static {} : &'static [char] = &[\n{}];\n".format(name, table_string))
86 | ||
def camel_to_snake_case(name):
    """Convert a CamelCase identifier to lower snake_case.

    Args:
        name: The CamelCase string, e.g. 'LetterUppercase'.

    Returns:
        The lower-cased string with '_' inserted at word boundaries,
        e.g. 'letter_uppercase'. Runs of capitals stay together
        ('HTTPResponse' -> 'http_response').
    """
    # thanks to http://stackoverflow.com/a/1176023/1030074
    # Pass 1: split before a capital that starts a capitalised word.
    partly_split = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', name)
    # Pass 2: split between a lowercase letter or digit and a capital.
    return re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', partly_split).lower()
91 | ||
# Only generate the tables when executed as a script, not when imported.
if __name__ == "__main__":
    main()