[rustc.git] / vendor / unicode_categories / scripts / unicode.py

#!/usr/bin/python

import collections
import re

# Number of character literals placed on each line of a generated table.
column_size = 8

# Two-letter category code -> [major class, subclass].  The two parts are
# concatenated to build the name of the table generated for that category.
categories = {
    # Other
    "Cc": ["Other", "Control"],
    "Cf": ["Other", "Format"],
    "Cn": ["Other", "NotAssigned"],
    "Co": ["Other", "PrivateUse"],
    "Cs": ["Other", "Surrogate"],
    # Letter
    # NOTE(review): "Ls" does not look like a standard General_Category code
    # (Cased_Letter is normally abbreviated "LC") -- confirm it is intended.
    "Ls": ["Letter", "Cased"],
    "Ll": ["Letter", "Lowercased"],
    "Lm": ["Letter", "Modifier"],
    "Lo": ["Letter", "Other"],
    "Lt": ["Letter", "Titlecase"],
    "Lu": ["Letter", "Uppercase"],
    # Mark
    "Mc": ["Mark", "SpaceCombining"],
    "Me": ["Mark", "Enclosing"],
    "Mn": ["Mark", "Nonspacing"],
    # Number
    "Nd": ["Number", "DecimalDigit"],
    "Nl": ["Number", "Letter"],
    "No": ["Number", "Other"],
    # Punctuation
    "Pc": ["Punctuation", "Connector"],
    "Pd": ["Punctuation", "Dash"],
    "Pe": ["Punctuation", "Close"],
    "Pf": ["Punctuation", "FinalQuote"],
    "Pi": ["Punctuation", "InitialQuote"],
    "Po": ["Punctuation", "Other"],
    "Ps": ["Punctuation", "Open"],
    # Symbol
    "Sc": ["Symbol", "Currency"],
    "Sk": ["Symbol", "Modifier"],
    "Sm": ["Symbol", "Math"],
    "So": ["Symbol", "Other"],
    # Separator
    "Zl": ["Separator", "Line"],
    "Zp": ["Separator", "Paragraph"],
    "Zs": ["Separator", "Space"],
}

def generate_rows():
    """Yield ``(char, category)`` pairs read from ``UnicodeData.txt``.

    The file is opened relative to the current working directory.  Each
    record is a ``;``-separated line: the first field is yielded as the
    character and the third field as its category, both exactly as they
    appear in the file (no conversion is applied).

    Blank lines are skipped so that stray empty lines do not abort the run
    with an ``IndexError``.
    """
    with open('UnicodeData.txt', 'r') as ucd:
        for line in ucd:
            if not line.strip():
                # Nothing to parse on an empty / whitespace-only line.
                continue
            fields = line.split(';')
            yield (fields[0], fields[2])


def generate_dict(rows_gen):
    """Group characters by category.

    ``rows_gen`` is an iterable of ``(char, category)`` pairs.  The result
    is a ``defaultdict(list)`` mapping each category to its characters in
    the order they were received.  Pairs whose category is ``'Cs'`` are
    dropped.
    """
    grouped = collections.defaultdict(list)
    for code_point, general_category in rows_gen:
        # Surrogates ('Cs') are left out: Rust does not accept them as
        # unicode character literals.
        if general_category != 'Cs':
            grouped[general_category].append(code_point)
    return grouped

def generate_tables(d):
    """Re-key ``d`` from category codes to concatenated category names.

    Every key of ``d`` is looked up in the module-level ``categories``
    table and the name parts found there are joined without a separator
    (e.g. ``['Letter', 'Uppercase']`` becomes ``'LetterUppercase'``).  The
    values are carried over unchanged.  A key missing from ``categories``
    raises ``KeyError``.

    Returns a ``defaultdict(list)`` keyed by the joined names.
    """
    renamed = collections.defaultdict(list)
    for code, chars in d.items():
        table_name = ''.join(categories[code])
        renamed[table_name] = chars
    return renamed

def print_header():
    """Print the comment line marking the output as autogenerated.

    The text itself ends with a newline, so the printed header is followed
    by one empty line.
    """
    banner = "// This file is autogenerated by scripts/unicode.py.\n"
    print(banner)

def main():
    """Print the header followed by every generated table.

    The rows produced by ``generate_rows`` are grouped with
    ``generate_dict``, re-keyed with ``generate_tables`` and finally
    printed by ``output_tables``.
    """
    print_header()
    grouped_by_code = generate_dict(generate_rows())
    output_tables(generate_tables(grouped_by_code))

def output_tables(d):
    """Print one Rust ``pub static`` character table per entry of ``d``.

    ``d`` maps a CamelCase table name to a list of code-point strings.
    Keys are processed in sorted order.  For each key the name is converted
    to UPPER_SNAKE_CASE, every code point is wrapped as a Rust ``'\\u{...}'``
    character literal, and the literals are laid out ``column_size`` per
    line, indented by four spaces.

    Works on both Python 2 and Python 3: the literals are built as a real
    list (a Python 3 ``map`` object cannot be sliced) and ``range`` is used
    in place of the Python-2-only ``xrange``.
    """
    for key in sorted(d):
        name = camel_to_snake_case(key).upper()
        # Doubled braces are literal braces for str.format; the single
        # ``{}`` receives the code point, giving e.g. '\u{0041}'.
        rust_unicode_escapes = [r"'\u{{{}}}'".format(x) for x in d[key]]
        table_lines = [
            '    ' + ', '.join(rust_unicode_escapes[start:start + column_size])
            for start in range(0, len(rust_unicode_escapes), column_size)
        ]
        table_string = ',\n'.join(table_lines)
        print("pub static {} : &'static [char] = &[\n{}];\n".format(name, table_string))

def camel_to_snake_case(name):
    """Convert a CamelCase identifier to lower-case snake_case.

    For example ``'LetterUppercase'`` becomes ``'letter_uppercase'``.
    """
    # Approach taken from http://stackoverflow.com/a/1176023/1030074
    # Pass 1: put an underscore before every capitalised word
    # (an upper-case letter followed by lower-case letters).
    word_split = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    # Pass 2: split the remaining lower-case/digit -> upper-case boundaries.
    fully_split = re.sub('([a-z0-9])([A-Z])', r'\1_\2', word_split)
    return fully_split.lower()

# Generate the tables only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Commit	Line	Data
f20569fa XL	1	#!/usr/bin/python
	2
	3	import collections
	4	import re
	5
	6	column_size = 8
	7
	8	categories = {
	9	'Cc': ['Other', 'Control'],
	10	'Cf': ['Other', 'Format'],
	11	'Cn': ['Other', 'NotAssigned'],
	12	'Co': ['Other', 'PrivateUse'],
	13	'Cs': ['Other', 'Surrogate'],
	14	'Ls': ['Letter', 'Cased'],
	15	'Ll': ['Letter', 'Lowercased'],
	16	'Lm': ['Letter', 'Modifier'],
	17	'Lo': ['Letter', 'Other'],
	18	'Lt': ['Letter', 'Titlecase'],
	19	'Lu': ['Letter', 'Uppercase'],
	20	'Mc': ['Mark', 'SpaceCombining'],
	21	'Me': ['Mark', 'Enclosing'],
	22	'Mn': ['Mark', 'Nonspacing'],
	23	'Nd': ['Number', 'DecimalDigit'],
	24	'Nl': ['Number', 'Letter'],
	25	'No': ['Number', 'Other'],
	26	'Pc': ['Punctuation', 'Connector'],
	27	'Pd': ['Punctuation', 'Dash'],
	28	'Pe': ['Punctuation', 'Close'],
	29	'Pf': ['Punctuation', 'FinalQuote'],
	30	'Pi': ['Punctuation', 'InitialQuote'],
	31	'Po': ['Punctuation', 'Other'],
	32	'Ps': ['Punctuation', 'Open'],
	33	'Sc': ['Symbol', 'Currency'],
	34	'Sk': ['Symbol', 'Modifier'],
	35	'Sm': ['Symbol', 'Math'],
	36	'So': ['Symbol', 'Other'],
	37	'Zl': ['Separator', 'Line'],
	38	'Zp': ['Separator', 'Paragraph'],
	39	'Zs': ['Separator', 'Space']
	40	}
	41
	42	def generate_rows():
	43	with open('UnicodeData.txt', 'r') as ucd:
	44	for line in ucd:
	45	split = line.split(';')
	46	char, category = split[0], split[2]
	47	yield (char, category)
	48
	49
	50	def generate_dict(rows_gen):
	51	d = collections.defaultdict(list)
	52	for char, category in rows_gen:
	53	if category == 'Cs':
	54	# for whatever reason, rust doesn't allow this class of characters
	55	# as unicode literals.
	56	continue
	57	d[category].append(char)
	58	return d
	59
	60	def generate_tables(d):
	61	new_dict = collections.defaultdict(list)
	62	for key in d.keys():
	63	name = ''.join(categories[key])
	64	new_dict[name] = d[key]
65	return new_dict
66
67	def print_header():
68	print("// This file is autogenerated by scripts/unicode.py.\n")
69
70	def main():
71	print_header()
72	row_generator = generate_rows()
73	dictionary = generate_dict(row_generator)
74	named_table = generate_tables(dictionary)
75	output_tables(named_table)
76
77	def output_tables(d):
78	for key in sorted(d.keys()):
79	name = camel_to_snake_case(key).upper()
80	rust_unicode_escapes = map(lambda x: r"'\u{{{}}}'".format(x), d[key])
81	table_lines = []
82	for chunk in [rust_unicode_escapes[x:x+column_size] for x in xrange(0, len(rust_unicode_escapes), column_size)]:
83	table_lines.append(' ' + ', '.join(chunk))
84	table_string = ',\n'.join(table_lines)
85	print("pub static {} : &'static [char] = &[\n{}];\n".format(name, table_string))
86
87	def camel_to_snake_case(name):
88	# thanks to http://stackoverflow.com/a/1176023/1030074
89	s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
90	return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
91
92	if __name__ == "__main__":
93	main()