ysh/regex_translate.py

OILS / ysh / regex_translate.py View on Github | oilshell.org

397 lines, 246 significant

1	#!/usr/bin/env python2
2	"""regex_translate.py."""
3	from __future__ import print_function
4
5	from _devbuild.gen.syntax_asdl import (
6	PosixClass,
7	PerlClass,
8	CharCode,
9	char_class_term,
10	char_class_term_e,
11	char_class_term_t,
12	re,
13	re_e,
14	re_repeat,
15	re_repeat_e,
16	EggexFlag,
17	)
18	from _devbuild.gen.id_kind_asdl import Id
19	from _devbuild.gen.value_asdl import value
20	from core.error import e_die, p_die
21	from frontend import lexer
22	from mycpp.mylib import log, tagswitch
23	from osh import glob_ # for ExtendedRegexEscape
24
25	from typing import List, Optional, TYPE_CHECKING, cast
26
27	if TYPE_CHECKING:
28	from _devbuild.gen.syntax_asdl import re_t
29
30	from libc import REG_ICASE, REG_NEWLINE
31
32	_ = log
33
34	PERL_CLASS = {
35	'd': '[:digit:]',
36	# Python's docs say it's [a-zA-Z0-9_] when NO LOCALE is set.
37	'w': '[:alpha:][:digit:]_',
38	# Python's doc says \s is [ \t\n\r\f\v] when NO LCOALE
39	's': '[:space:]',
40	}
41
42	# ERE's in POSIX:
43	# https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html
44	#
45	# NOTE: They don't support \x00 or \u1234 as in Perl/Python!
46	#
47	# It's hard to grep for tabs with BRE or ERE syntax. You have to use:
48	#
49	# (1) a literal tab with Ctrl-V, or
50	# (2) bash syntax: grep $'\t' foo.txt
51	# (3) POSIX shell syntax: grep "$(echo -e '\t')" foo.txt.
52	#
53	# I ran into this in test/lint.sh !!!
54	#
55	# https://stackoverflow.com/questions/1825552/grep-a-tab-in-unix
56
57	# Algorithm:
58	# - Unicode escapes in BREs disallowed
59	# - ASCII codes 1-255 allowed LITERALLY, but NUL \0 disallowed
60	#
61	# What about utf-8 encoded characters? Those work within literals, but are
62	# problematic within character sets. There's no way to disallow those in
63	# general though.
64
65	CH_RBRACKET = 0x5d
66	CH_BACKSLASH = 0x5c
67	CH_CARET = 0x5e
68	CH_HYPHEN = 0x2d
69
70	FLAG_RBRACKET = 0b0001
71	FLAG_BACKSLASH = 0b0010
72	FLAG_CARET = 0b0100
73	FLAG_HYPHEN = 0b1000
74
75
76	def _CharCodeToEre(term, parts, special_char_flags):
77	# type: (CharCode, List[str], List[int]) -> None
78	"""special_char_flags: list of single int that is mutated."""
79
80	char_int = term.i
81	if char_int >= 128 and term.u_braced:
82	# \u{ff} can't be represented in ERE because we don't know the encoding
83	# \xff can be represented
84	e_die("ERE can't express char code %d" % char_int, term.blame_tok)
85
86	# note: mycpp doesn't handle
87	# special_char_flags[0] \|= FLAG_HYPHEN
88	mask = special_char_flags[0]
89
90	if char_int == CH_HYPHEN:
91	mask \|= FLAG_HYPHEN
92	elif char_int == CH_CARET:
93	mask \|= FLAG_CARET
94	elif char_int == CH_RBRACKET:
95	mask \|= FLAG_RBRACKET
96	elif char_int == CH_BACKSLASH:
97	mask \|= FLAG_BACKSLASH
98	else:
99	parts.append(chr(char_int))
100
101	special_char_flags[0] = mask
102
103
104	def _CharClassTermToEre(term, parts, special_char_flags):
105	# type: (char_class_term_t, List[str], List[int]) -> None
106	"""special_char_flags: list of single int that is mutated."""
107
108	UP_term = term
109	with tagswitch(term) as case:
110	if case(char_class_term_e.Range):
111	term = cast(char_class_term.Range, UP_term)
112
113	# Create our own flags
114	range_no_special = [0]
115
116	_CharCodeToEre(term.start, parts, range_no_special)
117	if range_no_special[0] != 0:
118	e_die(
119	"Can't use char %d as start of range in ERE syntax" %
120	term.start.i, term.start.blame_tok)
121
122	parts.append('-') # a-b
123
124	_CharCodeToEre(term.end, parts, range_no_special)
125	if range_no_special[0] != 0:
126	e_die(
127	"Can't use char %d as end of range in ERE syntax" %
128	term.end.i, term.end.blame_tok)
129
130	elif case(char_class_term_e.CharCode):
131	term = cast(CharCode, UP_term)
132
133	_CharCodeToEre(term, parts, special_char_flags)
134
135	elif case(char_class_term_e.PerlClass):
136	term = cast(PerlClass, UP_term)
137	n = term.name
138	chars = PERL_CLASS[term.name] # looks like '[:digit:]'
139	if term.negated:
140	e_die("Perl classes can't be negated in ERE", term.negated)
141	else:
142	pat = '%s' % chars
143	parts.append(pat)
144
145	elif case(char_class_term_e.PosixClass):
146	term = cast(PosixClass, UP_term)
147	n = term.name # looks like 'digit'
148	if term.negated:
149	e_die("POSIX classes can't be negated in ERE", term.negated)
150	else:
151	pat = '[:%s:]' % n
152	parts.append(pat)
153
154	else:
155	raise AssertionError(term)
156
157
158	def _AsPosixEre(node, parts, capture_names):
159	# type: (re_t, List[str], List[Optional[str]]) -> None
160	"""Translate an Oil regex to a POSIX ERE.
161
162	Appends to a list of parts that you have to join.
163	"""
164	UP_node = node
165	tag = node.tag()
166
167	if tag == re_e.Primitive:
168	node = cast(re.Primitive, UP_node)
169	if node.id == Id.Re_Dot:
170	parts.append('.')
171	elif node.id == Id.Re_Start:
172	parts.append('^')
173	elif node.id == Id.Re_End:
174	parts.append('$')
175	else:
176	raise AssertionError(node.id)
177	return
178
179	if tag == re_e.LiteralChars:
180	node = cast(re.LiteralChars, UP_node)
181	# The bash [[ x =~ "." ]] construct also has to do this
182
183	# TODO: What about \0 and unicode escapes?
184	# Those won't be as LiteralChars I don't think?
185	# Unless you put them there through \0
186	# Maybe DISALLOW those.
187	# "Unprintable chars should be written as \0 or \x00 or \u0000"
188
189	parts.append(glob_.ExtendedRegexEscape(node.s))
190	return
191
192	if tag == re_e.Seq:
193	node = cast(re.Seq, UP_node)
194	for c in node.children:
195	_AsPosixEre(c, parts, capture_names)
196	return
197
198	if tag == re_e.Alt:
199	node = cast(re.Alt, UP_node)
200	for i, c in enumerate(node.children):
201	if i != 0:
202	parts.append('\|')
203	_AsPosixEre(c, parts, capture_names)
204	return
205
206	if tag == re_e.Repeat:
207	node = cast(re.Repeat, UP_node)
208	# 'foo' or "foo" or $x or ${x} evaluated to too many chars
209	if node.child.tag() == re_e.LiteralChars:
210	child = cast(re.LiteralChars, node.child)
211	if len(child.s) > 1:
212	# Note: Other regex dialects have non-capturing groups since we don't
213	# need this.
214	e_die(
215	"POSIX EREs don't have groups without capture, so this node "
216	"needs () around it.", child.blame_tok)
217
218	_AsPosixEre(node.child, parts, capture_names)
219	op = node.op
220	op_tag = op.tag()
221	UP_op = op
222
223	if op_tag == re_repeat_e.Op:
224	op = cast(re_repeat.Op, UP_op)
225	op_id = op.op.id
226	if op_id == Id.Arith_Plus:
227	parts.append('+')
228	elif op_id == Id.Arith_Star:
229	parts.append('*')
230	elif op_id == Id.Arith_QMark:
231	parts.append('?')
232	else:
233	raise AssertionError(op_id)
234	return
235
236	if op_tag == re_repeat_e.Num:
237	op = cast(re_repeat.Num, UP_op)
238	parts.append('{%s}' % op.times.tval)
239	return
240
241	if op_tag == re_repeat_e.Range:
242	op = cast(re_repeat.Range, UP_op)
243	lower = op.lower.tval if op.lower else ''
244	upper = op.upper.tval if op.upper else ''
245	parts.append('{%s,%s}' % (lower, upper))
246	return
247
248	raise NotImplementedError(op_tag)
249
250	# Special case for familiarity: () is acceptable as a group in ERE
251	if tag == re_e.Group:
252	node = cast(re.Group, UP_node)
253
254	# placeholder so we know this group is numbered, but not named
255	capture_names.append(None)
256
257	parts.append('(')
258	_AsPosixEre(node.child, parts, capture_names)
259	parts.append(')')
260	return
261
262	if tag == re_e.Capture:
263	node = cast(re.Capture, UP_node)
264
265	# Collect in order of ( appearance
266	# TODO: get the name string, and type string
267
268	capture_str = lexer.TokenVal(node.name) if node.name else None
269	capture_names.append(capture_str)
270
271	parts.append('(')
272	_AsPosixEre(node.child, parts, capture_names)
273	parts.append(')')
274	return
275
276	if tag == re_e.PerlClass:
277	node = cast(PerlClass, UP_node)
278	n = node.name
279	chars = PERL_CLASS[node.name] # looks like [:digit:]
280	if node.negated:
281	pat = '[^%s]' % chars
282	else:
283	pat = '[%s]' % chars
284	parts.append(pat)
285	return
286
287	if tag == re_e.PosixClass:
288	node = cast(PosixClass, UP_node)
289	n = node.name # looks like 'digit'
290	if node.negated:
291	pat = '[^[:%s:]]' % n
292	else:
293	pat = '[[:%s:]]' % n
294	parts.append(pat)
295	return
296
297	if tag == re_e.CharClass:
298	node = cast(re.CharClass, UP_node)
299
300	# HYPHEN CARET RBRACKET BACKSLASH
301	special_char_flags = [0]
302	non_special_parts = [] # type: List[str]
303
304	for term in node.terms:
305	_CharClassTermToEre(term, non_special_parts, special_char_flags)
306
307	parts.append('[')
308	if node.negated:
309	parts.append('^')
310
311	# Help the user with some of terrible corner cases
312
313	# - move literal - to end [ab-] not [a-b]
314	# - move literal ^ to end [x^-] not [^x-]
315	# - move literal ] to beginning: []x] not [x]]
316	# - double up \\ because of Gawk extension [\\]
317
318	if special_char_flags[0] & FLAG_RBRACKET:
319	parts.append(']')
320
321	parts.extend(non_special_parts)
322
323	if special_char_flags[0] & FLAG_BACKSLASH:
324	parts.append('\\\\') # TWO backslashes
325
326	if special_char_flags[0] & FLAG_CARET:
327	parts.append('^')
328
329	if special_char_flags[0] & FLAG_HYPHEN:
330	parts.append('-')
331
332	parts.append(']')
333	return
334
335	raise NotImplementedError(tag)
336
337
338	def AsPosixEre(eggex):
339	# type: (value.Eggex) -> str
340	"""
341	Lazily fills in fields on the value.Eggex argument.
342	"""
343	if eggex.as_ere is not None:
344	return eggex.as_ere
345
346	parts = [] # type: List[str]
347	_AsPosixEre(eggex.spliced, parts, eggex.capture_names)
348
349	# These are both indexed by group number, with None for the holes
350	# List[str?] vs. List[value?]
351	assert len(eggex.capture_names) == len(eggex.convert_funcs)
352
353	eggex.as_ere = ''.join(parts)
354
355	return eggex.as_ere
356
357
358	def CanonicalFlags(flags):
359	# type: (List[EggexFlag]) -> str
360	"""
361	Raises PARSE error on invalid flags.
362
363	In theory we could encode directly to integers like REG_ICASE, but a string
364	like like 'i' makes the error message slightly more legible.
365	"""
366	letters = [] # type: List[str]
367	for flag in flags:
368	if flag.negated:
369	p_die("Flag can't be negated", flag.flag)
370	flag_name = lexer.TokenVal(flag.flag)
371	if flag_name in ('i', 'reg_icase'):
372	letters.append('i')
373	elif flag_name == 'reg_newline':
374	letters.append('n')
375	else:
376	p_die("Invalid regex flag %r" % flag_name, flag.flag)
377
378	# Normalize for comparison
379	letters.sort()
380	return ''.join(letters)
381
382
383	def LibcFlags(canonical_flags):
384	# type: (Optional[str]) -> int
385	if canonical_flags is None:
386	return 0
387
388	libc_flags = 0
389	for ch in canonical_flags:
390	if ch == 'i':
391	libc_flags \|= REG_ICASE
392	elif ch == 'n':
393	libc_flags \|= REG_NEWLINE
394	else:
395	# regex_translate should prevent this
396	raise AssertionError()
397	return libc_flags