]> git.proxmox.com Git - rustc.git/blame - vendor/bstr/scripts/generate-unicode-data
New upstream version 1.66.0+dfsg1
[rustc.git] / vendor / bstr / scripts / generate-unicode-data
CommitLineData
#!/bin/sh

# Abort as soon as any command fails, so a failed generator step does not go
# unnoticed.
set -e

# Directory containing this script. The generator functions below use it to
# locate the helper scripts under "$D/regex/".
D="$(dirname "$0")"

# Convenience function for checking that a command exists.
#
# Arguments: $1 - name of the command to look up
# Outputs:   an error message on stderr when the command is missing
# Exits the shell with status 1 when the command cannot be found.
requires() {
  # Variables are global in POSIX sh; use a name that cannot collide with the
  # `cmd` loop variable used by main.
  dep="$1"
  if ! command -v "$dep" > /dev/null 2>&1; then
    echo "DEPENDENCY MISSING: $dep must be installed" >&2
    exit 1
  fi
}

# Test if a list of arguments ($2...) contains a particular element ($1).
#
# Arguments: $1   - the element to search for
#            $2.. - the candidate values
# Returns:   0 if some candidate is string-equal to $1, 1 otherwise
#            (including when no candidates are given).
array_exists() {
  needle="$1"
  shift

  for el in "$@"; do
    if [ "$el" = "$needle" ]; then
      return 0
    fi
  done
  return 1
}

# Generate the forward and reverse grapheme break DFAs.
#
# The regex is whatever "$D/regex/grapheme.sh" prints on stdout; it is passed
# to `ucd-generate dfa` together with the src/unicode/fsm/ directory.
graphemes() {
  regex="$(sh "$D/regex/grapheme.sh")"

  echo "generating forward grapheme DFA"
  ucd-generate dfa \
    --name GRAPHEME_BREAK_FWD \
    --sparse --minimize --anchored --state-size 2 \
    src/unicode/fsm/ \
    "$regex"

  echo "generating reverse grapheme DFA"
  ucd-generate dfa \
    --name GRAPHEME_BREAK_REV \
    --reverse --longest \
    --sparse --minimize --anchored --state-size 2 \
    src/unicode/fsm/ \
    "$regex"
}

# Generate the forward word break DFA.
#
# The regex is whatever "$D/regex/word.sh" prints on stdout; it is passed to
# `ucd-generate dfa` together with the src/unicode/fsm/ directory.
words() {
  regex="$(sh "$D/regex/word.sh")"

  echo "generating forward word DFA (this can take a while)"
  ucd-generate dfa \
    --name WORD_BREAK_FWD \
    --sparse --minimize --anchored --state-size 4 \
    src/unicode/fsm/ \
    "$regex"
}

# Generate the forward sentence break DFA.
#
# The regex is whatever "$D/regex/sentence.sh" prints on stdout; it is passed
# to `ucd-generate dfa` together with the src/unicode/fsm/ directory.
sentences() {
  regex="$(sh "$D/regex/sentence.sh")"

  echo "generating forward sentence DFA (this can take a while)"
  ucd-generate dfa \
    --name SENTENCE_BREAK_FWD \
    --minimize \
    --sparse --anchored --state-size 4 \
    src/unicode/fsm/ \
    "$regex"
}

# Generate the reverse regional indicator DFA.
regional_indicator() {
  # For finding all occurrences of region indicators. This is used to handle
  # regional indicators as a special case for the reverse grapheme iterator
  # and the reverse word iterator.
  echo "generating regional indicator DFA"
  ucd-generate dfa \
    --name REGIONAL_INDICATOR_REV \
    --reverse \
    --classes --minimize --anchored --premultiply --state-size 1 \
    src/unicode/fsm/ \
    '\p{gcb=Regional_Indicator}'
}

# Generate the forward "simple word" DFA, built from the regex \w.
simple_word() {
  echo "generating forward simple word DFA"
  ucd-generate dfa \
    --name SIMPLE_WORD_FWD \
    --sparse --minimize --state-size 2 \
    src/unicode/fsm/ \
    '\w'
}

# Generate the anchored forward and reverse whitespace DFAs, both built from
# the regex \s+.
#
# Note that the two invocations intentionally differ in --state-size
# (1 for the forward DFA, 2 for the reverse DFA).
whitespace() {
  echo "generating forward whitespace DFA"
  ucd-generate dfa \
    --name WHITESPACE_ANCHORED_FWD \
    --anchored --classes --premultiply --minimize --state-size 1 \
    src/unicode/fsm/ \
    '\s+'

  echo "generating reverse whitespace DFA"
  ucd-generate dfa \
    --name WHITESPACE_ANCHORED_REV \
    --reverse \
    --anchored --classes --premultiply --minimize --state-size 2 \
    src/unicode/fsm/ \
    '\s+'
}

# Entry point: interprets the command line and runs the requested generators.
#
# Arguments:
#   -h, --help        print a usage line to stderr and exit
#   --list-commands   print every known command, one per line, and exit
#   all (or no args)  run every known command
#   <command> ...     run only the named commands, in the order given
#
# Returns: non-zero if any requested command was not recognized. Recognized
#          commands are still run, so one typo does not skip the rest.
main() {
  if array_exists "-h" "$@" || array_exists "--help" "$@"; then
    echo "Usage: $(basename "$0") [--list-commands] [<command>] ..." >&2
    exit
  fi

  commands="
    graphemes
    sentences
    words
    regional-indicator
    simple-word
    whitespace
  "
  if array_exists "--list-commands" "$@"; then
    for cmd in $commands; do
      echo "$cmd"
    done
    exit
  fi

  # ucd-generate is used to compile regexes into DFAs.
  requires ucd-generate

  mkdir -p src/unicode/fsm/

  cmds=$*
  if [ $# -eq 0 ] || array_exists "all" "$@"; then
    cmds=$commands
  fi

  rc=0
  # $cmds and $commands are deliberately left unquoted: both hold
  # whitespace-separated lists that must be split into individual words.
  for cmd in $cmds; do
    if array_exists "$cmd" $commands; then
      # Command names use hyphens; the functions implementing them use
      # underscores. The name was validated against $commands above, so it
      # can be invoked directly without eval.
      fun="$(echo "$cmd" | sed 's/-/_/g')"
      "$fun"
    else
      echo "unrecognized command: $cmd" >&2
      rc=1
    fi
  done
  return "$rc"
}

# Run the script, forwarding every command-line argument unchanged.
main "$@"