]>
Commit | Line | Data |
---|---|---|
dfeec247 XL |
#!/bin/sh

# Stop at the first command that exits with a non-zero status.
set -e
# Directory holding this script; the regex/*.sh helpers are looked up under it.
D=$(dirname "$0")
5 | ||
# Ensure that the command named by $1 can be found by the shell.
# If it cannot, a diagnostic is written to stderr and the script exits with
# status 1. Otherwise the function returns 0.
requires() {
  cmd="$1"
  command -v "$cmd" > /dev/null 2>&1 || {
    echo "DEPENDENCY MISSING: $cmd must be installed" >&2
    exit 1
  }
}
14 | ||
# Report whether $1 (the needle) is equal to any of the remaining arguments
# (the haystack). Returns 0 on the first exact string match, 1 if none match.
array_exists() {
  needle="$1"
  shift

  # "for el do" iterates over the remaining positional parameters.
  for el do
    [ "$el" != "$needle" ] || return 0
  done
  return 1
}
27 | ||
# Generate the forward and the reverse grapheme DFAs. Both are compiled by
# ucd-generate from the regex printed by regex/grapheme.sh (located via $D).
graphemes() {
  regex=$(sh "$D/regex/grapheme.sh")

  printf '%s\n' "generating forward grapheme DFA"
  ucd-generate dfa \
    --name GRAPHEME_BREAK_FWD \
    --sparse \
    --minimize \
    --anchored \
    --state-size 2 \
    src/unicode/fsm/ \
    "$regex"

  printf '%s\n' "generating reverse grapheme DFA"
  ucd-generate dfa \
    --name GRAPHEME_BREAK_REV \
    --reverse \
    --longest \
    --sparse \
    --minimize \
    --anchored \
    --state-size 2 \
    src/unicode/fsm/ \
    "$regex"
}
46 | ||
# Generate the forward word DFA. ucd-generate compiles it from the regex
# printed by regex/word.sh (located via $D).
words() {
  regex=$(sh "$D/regex/word.sh")

  printf '%s\n' "generating forward word DFA (this can take a while)"
  ucd-generate dfa \
    --name WORD_BREAK_FWD \
    --sparse \
    --minimize \
    --anchored \
    --state-size 4 \
    src/unicode/fsm/ \
    "$regex"
}
57 | ||
# Generate the forward sentence DFA. ucd-generate compiles it from the regex
# printed by regex/sentence.sh (located via $D).
sentences() {
  regex=$(sh "$D/regex/sentence.sh")

  printf '%s\n' "generating forward sentence DFA (this can take a while)"
  ucd-generate dfa \
    --name SENTENCE_BREAK_FWD \
    --minimize \
    --sparse \
    --anchored \
    --state-size 4 \
    src/unicode/fsm/ \
    "$regex"
}
69 | ||
# Generate the reverse DFA that finds all occurrences of regional indicators.
# It is used to handle regional indicators as a special case for the reverse
# grapheme iterator and the reverse word iterator.
regional_indicator() {
  printf '%s\n' "generating regional indicator DFA"
  ucd-generate dfa \
    --name REGIONAL_INDICATOR_REV \
    --reverse \
    --classes \
    --minimize \
    --anchored \
    --premultiply \
    --state-size 1 \
    src/unicode/fsm/ \
    "\p{gcb=Regional_Indicator}"
}
82 | ||
# Generate the forward "simple word" DFA, compiled by ucd-generate from the
# regex \w. Unlike the other generators here, --anchored is not passed.
simple_word() {
  printf '%s\n' "generating forward simple word DFA"
  ucd-generate dfa \
    --name SIMPLE_WORD_FWD \
    --sparse \
    --minimize \
    --state-size 2 \
    src/unicode/fsm/ \
    "\w"
}
91 | ||
# Generate the anchored forward and reverse whitespace DFAs, both compiled by
# ucd-generate from the regex \s+. Note the differing --state-size values:
# 1 for the forward DFA and 2 for the reverse DFA.
whitespace() {
  printf '%s\n' "generating forward whitespace DFA"
  ucd-generate dfa \
    --name WHITESPACE_ANCHORED_FWD \
    --anchored \
    --classes \
    --premultiply \
    --minimize \
    --state-size 1 \
    src/unicode/fsm/ \
    "\s+"

  printf '%s\n' "generating reverse whitespace DFA"
  ucd-generate dfa \
    --name WHITESPACE_ANCHORED_REV \
    --reverse \
    --anchored \
    --classes \
    --premultiply \
    --minimize \
    --state-size 2 \
    src/unicode/fsm/ \
    "\s+"
}
108 | ||
# Entry point: interpret the command line and run the requested generators.
#
# Arguments:
#   -h, --help        print a usage line to stderr and exit with status 0
#   --list-commands   print every known command, one per line, and exit
#   all               run every known command (also the default with no args)
#   <command>...      names from the list below, run in the order given
#
# Returns non-zero if any requested command is not recognized. Recognized
# commands are still run, so one typo does not hide the rest of the work but
# also no longer goes unnoticed by callers checking the exit status.
main() {
  if array_exists "-h" "$@" || array_exists "--help" "$@"; then
    echo "Usage: $(basename "$0") [--list-commands] [<command>] ..." >&2
    exit
  fi

  commands="
    graphemes
    sentences
    words
    regional-indicator
    simple-word
    whitespace
  "
  if array_exists "--list-commands" "$@"; then
    for cmd in $commands; do
      echo "$cmd"
    done
    exit
  fi

  # ucd-generate is used to compile regexes into DFAs.
  requires ucd-generate

  mkdir -p src/unicode/fsm/

  cmds=$*
  if [ $# -eq 0 ] || array_exists "all" "$@"; then
    cmds=$commands
  fi

  rc=0
  # $cmds and $commands are whitespace-separated lists; they are left
  # unquoted on purpose so that the shell splits them into words.
  for cmd in $cmds; do
    if array_exists "$cmd" $commands; then
      # Command names use '-', the implementing functions use '_'.
      fun="$(echo "$cmd" | sed 's/-/_/g')"
      # The name was validated against $commands above, so it can be invoked
      # directly; no eval is needed.
      "$fun"
    else
      echo "unrecognized command: $cmd" >&2
      rc=1
    fi
  done
  return "$rc"
}
148 | ||
# Run the script, forwarding the caller's arguments unchanged.
main "$@"