]> git.proxmox.com Git - rustc.git/blame - vendor/unicode_categories/scripts/unicode.py
Update upstream source from tag 'upstream/1.52.1+dfsg1'
[rustc.git] / vendor / unicode_categories / scripts / unicode.py
CommitLineData
f20569fa
XL
1#!/usr/bin/python
2
3import collections
4import re
5
# Number of character literals emitted per line of a generated table.
column_size = 8

# Maps each two-letter general-category code to the [major, minor] name parts
# that are concatenated to form the generated table's name.
# NOTE(review): 'Ls' does not appear to be a standard General_Category code
# (the cased-letter grouping is usually written 'LC') -- confirm before use.
categories = {
    # Other
    'Cc': ['Other', 'Control'],
    'Cf': ['Other', 'Format'],
    'Cn': ['Other', 'NotAssigned'],
    'Co': ['Other', 'PrivateUse'],
    'Cs': ['Other', 'Surrogate'],
    # Letter
    'Ls': ['Letter', 'Cased'],
    'Ll': ['Letter', 'Lowercased'],
    'Lm': ['Letter', 'Modifier'],
    'Lo': ['Letter', 'Other'],
    'Lt': ['Letter', 'Titlecase'],
    'Lu': ['Letter', 'Uppercase'],
    # Mark
    'Mc': ['Mark', 'SpaceCombining'],
    'Me': ['Mark', 'Enclosing'],
    'Mn': ['Mark', 'Nonspacing'],
    # Number
    'Nd': ['Number', 'DecimalDigit'],
    'Nl': ['Number', 'Letter'],
    'No': ['Number', 'Other'],
    # Punctuation
    'Pc': ['Punctuation', 'Connector'],
    'Pd': ['Punctuation', 'Dash'],
    'Pe': ['Punctuation', 'Close'],
    'Pf': ['Punctuation', 'FinalQuote'],
    'Pi': ['Punctuation', 'InitialQuote'],
    'Po': ['Punctuation', 'Other'],
    'Ps': ['Punctuation', 'Open'],
    # Symbol
    'Sc': ['Symbol', 'Currency'],
    'Sk': ['Symbol', 'Modifier'],
    'Sm': ['Symbol', 'Math'],
    'So': ['Symbol', 'Other'],
    # Separator
    'Zl': ['Separator', 'Line'],
    'Zp': ['Separator', 'Paragraph'],
    'Zs': ['Separator', 'Space'],
}
41
def generate_rows(path='UnicodeData.txt'):
    """Yield ``(char, category)`` pairs read from a semicolon-separated file.

    ``char`` is the first field of each line and ``category`` is the third,
    both returned as the raw strings found in the file.

    Args:
        path: File to read. Defaults to ``UnicodeData.txt`` in the current
            working directory, which is what the script has always read.

    Yields:
        Tuples ``(char, category)``, one per non-blank line, in file order.

    Raises:
        IOError/OSError: if the file cannot be opened.
        IndexError: if a non-blank line has fewer than three fields.

    NOTE(review): UnicodeData.txt is understood to describe some large ranges
    with a pair of ``<..., First>`` / ``<..., Last>`` rows; this function
    yields only the rows present in the file and does not expand such
    ranges -- confirm that is the intended behaviour.
    """
    with open(path, 'r') as ucd:
        for line in ucd:
            if not line.strip():
                # A blank (e.g. trailing) line has no fields; skip it rather
                # than crash with an IndexError below.
                continue
            fields = line.split(';')
            yield (fields[0], fields[2])
48
49
def generate_dict(rows_gen):
    """Group characters by category.

    Consumes an iterable of ``(char, category)`` pairs and returns a
    ``defaultdict(list)`` mapping each category to its characters in the
    order they were seen. Rows whose category is ``'Cs'`` are dropped.
    """
    grouped = collections.defaultdict(list)
    for code_point, general_category in rows_gen:
        # for whatever reason, rust doesn't allow this class of characters
        # (surrogates) as unicode literals, so they are left out entirely.
        if general_category != 'Cs':
            grouped[general_category].append(code_point)
    return grouped
59
def generate_tables(d):
    """Re-key a category-code mapping by its full category name.

    Each key of ``d`` is looked up in ``categories`` and the resulting name
    parts are concatenated (e.g. ``'Lu'`` becomes ``'LetterUppercase'``).
    The value lists are carried over as-is, not copied. Returns a
    ``defaultdict(list)``; a key missing from ``categories`` raises
    ``KeyError``.
    """
    renamed = collections.defaultdict(list)
    renamed.update(
        (''.join(categories[code]), chars) for code, chars in d.items()
    )
    return renamed
66
def print_header():
    """Print the "autogenerated" banner comment followed by a blank line."""
    banner = "// This file is autogenerated by scripts/unicode.py.\n"
    print(banner)
69
def main():
    """Run the whole pipeline: banner, read rows, group, rename, print."""
    print_header()
    # Each stage feeds directly into the next; the row generator is lazy and
    # is consumed by generate_dict.
    output_tables(generate_tables(generate_dict(generate_rows())))
76
def output_tables(d):
    """Print one Rust ``pub static`` char-slice table per entry of ``d``.

    Args:
        d: Mapping from a CamelCase table name to a list of strings; each
            string is placed inside a ``'\\u{...}'`` Rust char literal.

    Tables are printed in sorted key order. The table identifier is the key
    converted to upper snake case, and the literals are laid out
    ``column_size`` per line.

    The literals are built as a real list and chunked with ``range`` so the
    function runs on Python 3 as well as Python 2 (slicing a ``map`` object
    and ``xrange`` both fail on Python 3).
    """
    for key in sorted(d):
        name = camel_to_snake_case(key).upper()
        rust_unicode_escapes = [r"'\u{{{}}}'".format(x) for x in d[key]]
        # NOTE(review): the indent literal below is a single space in this
        # copy of the script; whitespace may have been collapsed in transit
        # -- confirm against the upstream file.
        table_lines = [
            ' ' + ', '.join(rust_unicode_escapes[start:start + column_size])
            for start in range(0, len(rust_unicode_escapes), column_size)
        ]
        table_string = ',\n'.join(table_lines)
        print("pub static {} : &'static [char] = &[\n{}];\n".format(name, table_string))
86
def camel_to_snake_case(name):
    """Convert a CamelCase identifier to lower snake_case.

    For example ``'LetterCased'`` becomes ``'letter_cased'``.
    """
    # thanks to http://stackoverflow.com/a/1176023/1030074
    # First pass: underscore before a capital that starts a lowercase run.
    word_start = re.compile('(.)([A-Z][a-z]+)')
    # Second pass: underscore between a lowercase letter/digit and a capital.
    case_change = re.compile('([a-z0-9])([A-Z])')
    underscored = case_change.sub(r'\1_\2', word_start.sub(r'\1_\2', name))
    return underscored.lower()
91
# Generate the tables only when executed as a script, not when imported.
if __name__ == "__main__":
    main()