]>
Commit | Line | Data |
---|---|---|
cae15db7 DT |
1 | #include <string.h> |
2 | #include "util.h" | |
3 | #include "debug.h" | |
4 | ||
5 | #include "demangle-rust.h" | |
6 | ||
7 | /* | |
8 | * Mangled Rust symbols look like this: | |
9 | * | |
10 | * _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a | |
11 | * | |
12 | * The original symbol is: | |
13 | * | |
14 | * <std::sys::fd::FileDesc as core::ops::Drop>::drop | |
15 | * | |
16 | * The last component of the path is a 64-bit hash in lowercase hex, prefixed | |
17 | * with "h". Rust does not have a global namespace between crates, an illusion | |
18 | * which Rust maintains by using the hash to distinguish things that would | |
19 | * otherwise have the same symbol. | |
20 | * | |
21 | * Any path component not starting with an XID_Start character is prefixed with | |
22 | * "_". | |
23 | * | |
24 | * The following escape sequences are used: | |
25 | * | |
26 | * "," => $C$ | |
27 | * "@" => $SP$ | |
28 | * "*" => $BP$ | |
29 | * "&" => $RF$ | |
30 | * "<" => $LT$ | |
31 | * ">" => $GT$ | |
32 | * "(" => $LP$ | |
33 | * ")" => $RP$ | |
34 | * " " => $u20$ | |
35 | * "'" => $u27$ | |
36 | * "[" => $u5b$ | |
37 | * "]" => $u5d$ | |
38 | * "~" => $u7e$ | |
39 | * | |
40 | * A double ".." means "::" and a single "." means "-". | |
41 | * | |
42 | * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$ | |
43 | */ | |
44 | ||
/*
 * Layout of the hash suffix that terminates every mangled Rust symbol:
 * the literal "::h" followed by hash_len lowercase hex digits.
 *
 * hash_prefix is a const array so it cannot be re-pointed, and its length is
 * derived from the literal so the two can never drift apart.
 */
static const char hash_prefix[] = "::h";
static const size_t hash_prefix_len = sizeof(hash_prefix) - 1;
static const size_t hash_len = 16;

/* Helpers defined further down in this file. */
static bool is_prefixed_hash(const char *start);
static bool looks_like_rust(const char *sym, size_t len);
static bool unescape(const char **in, char **out, const char *seq, char value);
52 | ||
53 | /* | |
54 | * INPUT: | |
55 | * sym: symbol that has been through BFD-demangling | |
56 | * | |
57 | * This function looks for the following indicators: | |
58 | * | |
59 | * 1. The hash must consist of "h" followed by 16 lowercase hex digits. | |
60 | * | |
61 | * 2. As a sanity check, the hash must use between 5 and 15 of the 16 possible | |
62 | * hex digits. This is true of 99.9998% of hashes so once in your life you | |
63 | * may see a false negative. The point is to notice path components that | |
64 | * could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In | |
65 | * this case a false positive (non-Rust symbol has an important path | |
66 | * component removed because it looks like a Rust hash) is worse than a | |
67 | * false negative (the rare Rust symbol is not demangled) so this sets the | |
68 | * balance in favor of false negatives. | |
69 | * | |
70 | * 3. There must be no characters other than a-zA-Z0-9 and _.:$ | |
71 | * | |
72 | * 4. There must be no unrecognized $-sign sequences. | |
73 | * | |
74 | * 5. There must be no sequence of three or more dots in a row ("..."). | |
75 | */ | |
76 | bool | |
77 | rust_is_mangled(const char *sym) | |
78 | { | |
79 | size_t len, len_without_hash; | |
80 | ||
81 | if (!sym) | |
82 | return false; | |
83 | ||
84 | len = strlen(sym); | |
85 | if (len <= hash_prefix_len + hash_len) | |
86 | /* Not long enough to contain "::h" + hash + something else */ | |
87 | return false; | |
88 | ||
89 | len_without_hash = len - (hash_prefix_len + hash_len); | |
90 | if (!is_prefixed_hash(sym + len_without_hash)) | |
91 | return false; | |
92 | ||
93 | return looks_like_rust(sym, len_without_hash); | |
94 | } | |
95 | ||
96 | /* | |
97 | * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex | |
98 | * digits must comprise between 5 and 15 (inclusive) distinct digits. | |
99 | */ | |
100 | static bool is_prefixed_hash(const char *str) | |
101 | { | |
102 | const char *end; | |
103 | bool seen[16]; | |
104 | size_t i; | |
105 | int count; | |
106 | ||
107 | if (strncmp(str, hash_prefix, hash_prefix_len)) | |
108 | return false; | |
109 | str += hash_prefix_len; | |
110 | ||
111 | memset(seen, false, sizeof(seen)); | |
112 | for (end = str + hash_len; str < end; str++) | |
113 | if (*str >= '0' && *str <= '9') | |
114 | seen[*str - '0'] = true; | |
115 | else if (*str >= 'a' && *str <= 'f') | |
116 | seen[*str - 'a' + 10] = true; | |
117 | else | |
118 | return false; | |
119 | ||
120 | /* Count how many distinct digits seen */ | |
121 | count = 0; | |
122 | for (i = 0; i < 16; i++) | |
123 | if (seen[i]) | |
124 | count++; | |
125 | ||
126 | return count >= 5 && count <= 15; | |
127 | } | |
128 | ||
/*
 * Scan the first len bytes of str and report whether they contain only what
 * a mangled Rust path may contain: a-zA-Z0-9, '_', ':', '.', and the known
 * '$'-delimited escape sequences. Three consecutive dots are rejected.
 *
 * str must be NUL-terminated; sequence matching may look past str + len but
 * never past the terminator.
 */
static bool looks_like_rust(const char *str, size_t len)
{
	static const char *const escapes[] = {
		"$C$",
		"$SP$", "$BP$", "$RF$", "$LT$", "$GT$", "$LP$", "$RP$",
		"$u20$", "$u27$", "$u5b$", "$u5d$", "$u7e$",
	};
	const size_t nr_escapes = sizeof(escapes) / sizeof(escapes[0]);
	const char *const limit = str + len;
	const char *cur = str;

	while (cur < limit) {
		const char c = *cur;

		if (c == '$') {
			size_t consumed = 0;
			size_t i;

			/* Find the recognized escape sequence starting here */
			for (i = 0; i < nr_escapes; i++) {
				const size_t n = strlen(escapes[i]);

				if (strncmp(cur, escapes[i], n) == 0) {
					consumed = n;
					break;
				}
			}
			if (consumed == 0)
				return false;

			cur += consumed;
			continue;
		}

		if (c == '.') {
			/* Do not allow three or more consecutive dots */
			if (strncmp(cur, "...", 3) == 0)
				return false;
		} else if (!((c >= 'a' && c <= 'z') ||
			     (c >= 'A' && c <= 'Z') ||
			     (c >= '0' && c <= '9') ||
			     c == '_' || c == ':')) {
			return false;
		}

		cur++;
	}

	return true;
}
173 | ||
174 | /* | |
175 | * INPUT: | |
176 | * sym: symbol for which rust_is_mangled(sym) returns true | |
177 | * | |
178 | * The input is demangled in-place because the mangled name is always longer | |
179 | * than the demangled one. | |
180 | */ | |
181 | void | |
182 | rust_demangle_sym(char *sym) | |
183 | { | |
184 | const char *in; | |
185 | char *out; | |
186 | const char *end; | |
187 | ||
188 | if (!sym) | |
189 | return; | |
190 | ||
191 | in = sym; | |
192 | out = sym; | |
193 | end = sym + strlen(sym) - (hash_prefix_len + hash_len); | |
194 | ||
195 | while (in < end) | |
196 | switch (*in) { | |
197 | case '$': | |
198 | if (!(unescape(&in, &out, "$C$", ',') | |
199 | || unescape(&in, &out, "$SP$", '@') | |
200 | || unescape(&in, &out, "$BP$", '*') | |
201 | || unescape(&in, &out, "$RF$", '&') | |
202 | || unescape(&in, &out, "$LT$", '<') | |
203 | || unescape(&in, &out, "$GT$", '>') | |
204 | || unescape(&in, &out, "$LP$", '(') | |
205 | || unescape(&in, &out, "$RP$", ')') | |
206 | || unescape(&in, &out, "$u20$", ' ') | |
207 | || unescape(&in, &out, "$u27$", '\'') | |
208 | || unescape(&in, &out, "$u5b$", '[') | |
209 | || unescape(&in, &out, "$u5d$", ']') | |
210 | || unescape(&in, &out, "$u7e$", '~'))) { | |
211 | pr_err("demangle-rust: unexpected escape sequence"); | |
212 | goto done; | |
213 | } | |
214 | break; | |
215 | case '_': | |
216 | /* | |
217 | * If this is the start of a path component and the next | |
218 | * character is an escape sequence, ignore the | |
219 | * underscore. The mangler inserts an underscore to make | |
220 | * sure the path component begins with a XID_Start | |
221 | * character. | |
222 | */ | |
223 | if ((in == sym || in[-1] == ':') && in[1] == '$') | |
224 | in++; | |
225 | else | |
226 | *out++ = *in++; | |
227 | break; | |
228 | case '.': | |
229 | if (in[1] == '.') { | |
230 | /* ".." becomes "::" */ | |
231 | *out++ = ':'; | |
232 | *out++ = ':'; | |
233 | in += 2; | |
234 | } else { | |
235 | /* "." becomes "-" */ | |
236 | *out++ = '-'; | |
237 | in++; | |
238 | } | |
239 | break; | |
240 | case 'a' ... 'z': | |
241 | case 'A' ... 'Z': | |
242 | case '0' ... '9': | |
243 | case ':': | |
244 | *out++ = *in++; | |
245 | break; | |
246 | default: | |
247 | pr_err("demangle-rust: unexpected character '%c' in symbol\n", | |
248 | *in); | |
249 | goto done; | |
250 | } | |
251 | ||
252 | done: | |
253 | *out = '\0'; | |
254 | } | |
255 | ||
/*
 * Try to consume the escape sequence seq at the read cursor *in.
 *
 * On a match, value is stored at the write cursor *out, *in is advanced past
 * the sequence, *out is advanced by one and true is returned. Otherwise
 * neither cursor nor the buffer is touched and false is returned.
 */
static bool unescape(const char **in, char **out, const char *seq, char value)
{
	const size_t seq_len = strlen(seq);
	const char *src = *in;
	char *dst = *out;

	if (strncmp(src, seq, seq_len) == 0) {
		*dst = value;
		*in = src + seq_len;
		*out = dst + 1;
		return true;
	}

	return false;
}