OILS / ysh / regex_translate.py View on Github | oilshell.org

397 lines, 246 significant
1#!/usr/bin/env python2
2"""regex_translate.py."""
3from __future__ import print_function
4
5from _devbuild.gen.syntax_asdl import (
6 PosixClass,
7 PerlClass,
8 CharCode,
9 char_class_term,
10 char_class_term_e,
11 char_class_term_t,
12 re,
13 re_e,
14 re_repeat,
15 re_repeat_e,
16 EggexFlag,
17)
18from _devbuild.gen.id_kind_asdl import Id
19from _devbuild.gen.value_asdl import value
20from core.error import e_die, p_die
21from frontend import lexer
22from mycpp.mylib import log, tagswitch
23from osh import glob_ # for ExtendedRegexEscape
24
25from typing import List, Optional, TYPE_CHECKING, cast
26
27if TYPE_CHECKING:
28 from _devbuild.gen.syntax_asdl import re_t
29
30from libc import REG_ICASE, REG_NEWLINE
31
32_ = log
33
34PERL_CLASS = {
35 'd': '[:digit:]',
36 # Python's docs say it's [a-zA-Z0-9_] when NO LOCALE is set.
37 'w': '[:alpha:][:digit:]_',
38 # Python's doc says \s is [ \t\n\r\f\v] when NO LCOALE
39 's': '[:space:]',
40}
41
42# ERE's in POSIX:
43# https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html
44#
45# NOTE: They don't support \x00 or \u1234 as in Perl/Python!
46#
47# It's hard to grep for tabs with BRE or ERE syntax. You have to use:
48#
49# (1) a literal tab with Ctrl-V, or
50# (2) bash syntax: grep $'\t' foo.txt
51# (3) POSIX shell syntax: grep "$(echo -e '\t')" foo.txt.
52#
53# I ran into this in test/lint.sh !!!
54#
55# https://stackoverflow.com/questions/1825552/grep-a-tab-in-unix
56
57# Algorithm:
58# - Unicode escapes in BREs disallowed
59# - ASCII codes 1-255 allowed LITERALLY, but NUL \0 disallowed
60#
61# What about utf-8 encoded characters? Those work within literals, but are
62# problematic within character sets. There's no way to disallow those in
63# general though.
64
65CH_RBRACKET = 0x5d
66CH_BACKSLASH = 0x5c
67CH_CARET = 0x5e
68CH_HYPHEN = 0x2d
69
70FLAG_RBRACKET = 0b0001
71FLAG_BACKSLASH = 0b0010
72FLAG_CARET = 0b0100
73FLAG_HYPHEN = 0b1000
74
75
76def _CharCodeToEre(term, parts, special_char_flags):
77 # type: (CharCode, List[str], List[int]) -> None
78 """special_char_flags: list of single int that is mutated."""
79
80 char_int = term.i
81 if char_int >= 128 and term.u_braced:
82 # \u{ff} can't be represented in ERE because we don't know the encoding
83 # \xff can be represented
84 e_die("ERE can't express char code %d" % char_int, term.blame_tok)
85
86 # note: mycpp doesn't handle
87 # special_char_flags[0] |= FLAG_HYPHEN
88 mask = special_char_flags[0]
89
90 if char_int == CH_HYPHEN:
91 mask |= FLAG_HYPHEN
92 elif char_int == CH_CARET:
93 mask |= FLAG_CARET
94 elif char_int == CH_RBRACKET:
95 mask |= FLAG_RBRACKET
96 elif char_int == CH_BACKSLASH:
97 mask |= FLAG_BACKSLASH
98 else:
99 parts.append(chr(char_int))
100
101 special_char_flags[0] = mask
102
103
104def _CharClassTermToEre(term, parts, special_char_flags):
105 # type: (char_class_term_t, List[str], List[int]) -> None
106 """special_char_flags: list of single int that is mutated."""
107
108 UP_term = term
109 with tagswitch(term) as case:
110 if case(char_class_term_e.Range):
111 term = cast(char_class_term.Range, UP_term)
112
113 # Create our own flags
114 range_no_special = [0]
115
116 _CharCodeToEre(term.start, parts, range_no_special)
117 if range_no_special[0] != 0:
118 e_die(
119 "Can't use char %d as start of range in ERE syntax" %
120 term.start.i, term.start.blame_tok)
121
122 parts.append('-') # a-b
123
124 _CharCodeToEre(term.end, parts, range_no_special)
125 if range_no_special[0] != 0:
126 e_die(
127 "Can't use char %d as end of range in ERE syntax" %
128 term.end.i, term.end.blame_tok)
129
130 elif case(char_class_term_e.CharCode):
131 term = cast(CharCode, UP_term)
132
133 _CharCodeToEre(term, parts, special_char_flags)
134
135 elif case(char_class_term_e.PerlClass):
136 term = cast(PerlClass, UP_term)
137 n = term.name
138 chars = PERL_CLASS[term.name] # looks like '[:digit:]'
139 if term.negated:
140 e_die("Perl classes can't be negated in ERE", term.negated)
141 else:
142 pat = '%s' % chars
143 parts.append(pat)
144
145 elif case(char_class_term_e.PosixClass):
146 term = cast(PosixClass, UP_term)
147 n = term.name # looks like 'digit'
148 if term.negated:
149 e_die("POSIX classes can't be negated in ERE", term.negated)
150 else:
151 pat = '[:%s:]' % n
152 parts.append(pat)
153
154 else:
155 raise AssertionError(term)
156
157
158def _AsPosixEre(node, parts, capture_names):
159 # type: (re_t, List[str], List[Optional[str]]) -> None
160 """Translate an Oil regex to a POSIX ERE.
161
162 Appends to a list of parts that you have to join.
163 """
164 UP_node = node
165 tag = node.tag()
166
167 if tag == re_e.Primitive:
168 node = cast(re.Primitive, UP_node)
169 if node.id == Id.Re_Dot:
170 parts.append('.')
171 elif node.id == Id.Re_Start:
172 parts.append('^')
173 elif node.id == Id.Re_End:
174 parts.append('$')
175 else:
176 raise AssertionError(node.id)
177 return
178
179 if tag == re_e.LiteralChars:
180 node = cast(re.LiteralChars, UP_node)
181 # The bash [[ x =~ "." ]] construct also has to do this
182
183 # TODO: What about \0 and unicode escapes?
184 # Those won't be as LiteralChars I don't think?
185 # Unless you put them there through \0
186 # Maybe DISALLOW those.
187 # "Unprintable chars should be written as \0 or \x00 or \u0000"
188
189 parts.append(glob_.ExtendedRegexEscape(node.s))
190 return
191
192 if tag == re_e.Seq:
193 node = cast(re.Seq, UP_node)
194 for c in node.children:
195 _AsPosixEre(c, parts, capture_names)
196 return
197
198 if tag == re_e.Alt:
199 node = cast(re.Alt, UP_node)
200 for i, c in enumerate(node.children):
201 if i != 0:
202 parts.append('|')
203 _AsPosixEre(c, parts, capture_names)
204 return
205
206 if tag == re_e.Repeat:
207 node = cast(re.Repeat, UP_node)
208 # 'foo' or "foo" or $x or ${x} evaluated to too many chars
209 if node.child.tag() == re_e.LiteralChars:
210 child = cast(re.LiteralChars, node.child)
211 if len(child.s) > 1:
212 # Note: Other regex dialects have non-capturing groups since we don't
213 # need this.
214 e_die(
215 "POSIX EREs don't have groups without capture, so this node "
216 "needs () around it.", child.blame_tok)
217
218 _AsPosixEre(node.child, parts, capture_names)
219 op = node.op
220 op_tag = op.tag()
221 UP_op = op
222
223 if op_tag == re_repeat_e.Op:
224 op = cast(re_repeat.Op, UP_op)
225 op_id = op.op.id
226 if op_id == Id.Arith_Plus:
227 parts.append('+')
228 elif op_id == Id.Arith_Star:
229 parts.append('*')
230 elif op_id == Id.Arith_QMark:
231 parts.append('?')
232 else:
233 raise AssertionError(op_id)
234 return
235
236 if op_tag == re_repeat_e.Num:
237 op = cast(re_repeat.Num, UP_op)
238 parts.append('{%s}' % op.times.tval)
239 return
240
241 if op_tag == re_repeat_e.Range:
242 op = cast(re_repeat.Range, UP_op)
243 lower = op.lower.tval if op.lower else ''
244 upper = op.upper.tval if op.upper else ''
245 parts.append('{%s,%s}' % (lower, upper))
246 return
247
248 raise NotImplementedError(op_tag)
249
250 # Special case for familiarity: () is acceptable as a group in ERE
251 if tag == re_e.Group:
252 node = cast(re.Group, UP_node)
253
254 # placeholder so we know this group is numbered, but not named
255 capture_names.append(None)
256
257 parts.append('(')
258 _AsPosixEre(node.child, parts, capture_names)
259 parts.append(')')
260 return
261
262 if tag == re_e.Capture:
263 node = cast(re.Capture, UP_node)
264
265 # Collect in order of ( appearance
266 # TODO: get the name string, and type string
267
268 capture_str = lexer.TokenVal(node.name) if node.name else None
269 capture_names.append(capture_str)
270
271 parts.append('(')
272 _AsPosixEre(node.child, parts, capture_names)
273 parts.append(')')
274 return
275
276 if tag == re_e.PerlClass:
277 node = cast(PerlClass, UP_node)
278 n = node.name
279 chars = PERL_CLASS[node.name] # looks like [:digit:]
280 if node.negated:
281 pat = '[^%s]' % chars
282 else:
283 pat = '[%s]' % chars
284 parts.append(pat)
285 return
286
287 if tag == re_e.PosixClass:
288 node = cast(PosixClass, UP_node)
289 n = node.name # looks like 'digit'
290 if node.negated:
291 pat = '[^[:%s:]]' % n
292 else:
293 pat = '[[:%s:]]' % n
294 parts.append(pat)
295 return
296
297 if tag == re_e.CharClass:
298 node = cast(re.CharClass, UP_node)
299
300 # HYPHEN CARET RBRACKET BACKSLASH
301 special_char_flags = [0]
302 non_special_parts = [] # type: List[str]
303
304 for term in node.terms:
305 _CharClassTermToEre(term, non_special_parts, special_char_flags)
306
307 parts.append('[')
308 if node.negated:
309 parts.append('^')
310
311 # Help the user with some of terrible corner cases
312
313 # - move literal - to end [ab-] not [a-b]
314 # - move literal ^ to end [x^-] not [^x-]
315 # - move literal ] to beginning: []x] not [x]]
316 # - double up \\ because of Gawk extension [\\]
317
318 if special_char_flags[0] & FLAG_RBRACKET:
319 parts.append(']')
320
321 parts.extend(non_special_parts)
322
323 if special_char_flags[0] & FLAG_BACKSLASH:
324 parts.append('\\\\') # TWO backslashes
325
326 if special_char_flags[0] & FLAG_CARET:
327 parts.append('^')
328
329 if special_char_flags[0] & FLAG_HYPHEN:
330 parts.append('-')
331
332 parts.append(']')
333 return
334
335 raise NotImplementedError(tag)
336
337
338def AsPosixEre(eggex):
339 # type: (value.Eggex) -> str
340 """
341 Lazily fills in fields on the value.Eggex argument.
342 """
343 if eggex.as_ere is not None:
344 return eggex.as_ere
345
346 parts = [] # type: List[str]
347 _AsPosixEre(eggex.spliced, parts, eggex.capture_names)
348
349 # These are both indexed by group number, with None for the holes
350 # List[str?] vs. List[value?]
351 assert len(eggex.capture_names) == len(eggex.convert_funcs)
352
353 eggex.as_ere = ''.join(parts)
354
355 return eggex.as_ere
356
357
358def CanonicalFlags(flags):
359 # type: (List[EggexFlag]) -> str
360 """
361 Raises PARSE error on invalid flags.
362
363 In theory we could encode directly to integers like REG_ICASE, but a string
364 like like 'i' makes the error message slightly more legible.
365 """
366 letters = [] # type: List[str]
367 for flag in flags:
368 if flag.negated:
369 p_die("Flag can't be negated", flag.flag)
370 flag_name = lexer.TokenVal(flag.flag)
371 if flag_name in ('i', 'reg_icase'):
372 letters.append('i')
373 elif flag_name == 'reg_newline':
374 letters.append('n')
375 else:
376 p_die("Invalid regex flag %r" % flag_name, flag.flag)
377
378 # Normalize for comparison
379 letters.sort()
380 return ''.join(letters)
381
382
383def LibcFlags(canonical_flags):
384 # type: (Optional[str]) -> int
385 if canonical_flags is None:
386 return 0
387
388 libc_flags = 0
389 for ch in canonical_flags:
390 if ch == 'i':
391 libc_flags |= REG_ICASE
392 elif ch == 'n':
393 libc_flags |= REG_NEWLINE
394 else:
395 # regex_translate should prevent this
396 raise AssertionError()
397 return libc_flags