forked from oils-for-unix/oils
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstring_ops.py
453 lines (371 loc) · 12.9 KB
/
string_ops.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
"""
string_ops.py - String library functions that can be exposed with a saner syntax.
Instead of:
local y=${x//a*/b}
var y = x -> sub('a*', 'b', :ALL)
var y = x -> sub( Glob/a*/, 'b', :ALL) # maybe a glob literal
"""
from _devbuild.gen.id_kind_asdl import Id
from core import pyutil
from core import ui
from core.pyerror import e_die, e_strict, log
from osh import glob_
import libc
from typing import List, Tuple, TYPE_CHECKING
if TYPE_CHECKING:
from _devbuild.gen.syntax_asdl import suffix_op__Unary, suffix_op__PatSub
_ = log
def Utf8Encode(code):
# type: (int) -> str
"""Return utf-8 encoded bytes from a unicode code point.
Based on https://stackoverflow.com/a/23502707
"""
num_cont_bytes = 0
if code <= 0x7F:
return chr(code & 0x7F) # ASCII
elif code <= 0x7FF:
num_cont_bytes = 1
elif code <= 0xFFFF:
num_cont_bytes = 2
elif code <= 0x10FFFF:
num_cont_bytes = 3
else:
return '\xEF\xBF\xBD' # unicode replacement character
bytes_ = [] # type: List[int]
for _ in xrange(num_cont_bytes):
bytes_.append(0x80 | (code & 0x3F))
code >>= 6
b = (0x1E << (6-num_cont_bytes)) | (code & (0x3F >> num_cont_bytes))
bytes_.append(b)
bytes_.reverse()
# mod 256 because Python ints don't wrap around!
tmp = [chr(b & 0xFF) for b in bytes_]
return ''.join(tmp)
# TODO: Add details of the invalid character/byte here?
INCOMPLETE_CHAR = 'Incomplete UTF-8 character'
INVALID_CONT = 'Invalid UTF-8 continuation byte'
INVALID_START = 'Invalid start of UTF-8 character'
def _CheckContinuationByte(byte):
# type: (str) -> None
if (ord(byte) >> 6) != 0b10:
e_strict(INVALID_CONT)
def _Utf8CharLen(starting_byte):
# type: (int) -> int
if (starting_byte >> 7) == 0b0:
return 1
elif (starting_byte >> 5) == 0b110:
return 2
elif (starting_byte >> 4) == 0b1110:
return 3
elif (starting_byte >> 3) == 0b11110:
return 4
else:
e_strict(INVALID_START)
def _NextUtf8Char(s, i):
# type: (str, int) -> int
"""
Given a string and a byte offset, returns the byte position after
the character at this position. Usually this is the position of the
next character, but for the last character in the string, it's the
position just past the end of the string.
Validates UTF-8.
"""
n = len(s)
assert i < n, i # should always be in range
byte_as_int = ord(s[i])
length = _Utf8CharLen(byte_as_int)
for j in xrange(i + 1, i + length):
if j >= n:
e_strict(INCOMPLETE_CHAR)
_CheckContinuationByte(s[j])
return i + length
def PreviousUtf8Char(s, i):
# type: (str, int) -> int
"""
Given a string and a byte offset, returns the position of the
character before that offset. To start (find the first byte of the
last character), pass len(s) for the initial value of i.
Validates UTF-8.
"""
# All bytes in a valid UTF-8 string have one of the following formats:
#
# 0xxxxxxx (1-byte char)
# 110xxxxx (start of 2-byte char)
# 1110xxxx (start of 3-byte char)
# 11110xxx (start of 4-byte char)
# 10xxxxxx (continuation byte)
#
# Any byte that starts with 10... MUST be a continuation byte,
# otherwise it must be the start of a character (or just invalid
# data).
#
# Walking backward, we stop at the first non-continuaton byte
# found. We try to interpret it as a valid UTF-8 character starting
# byte, and check that it indicates the correct length, based on how
# far we've moved from the original byte. Possible problems:
# * byte we stopped on does not have a valid value (e.g., 11111111)
# * start byte indicates more or fewer continuation bytes than we've seen
# * no start byte at beginning of array
#
# Note that because we are going backward, on malformed input, we
# won't error out in the same place as when parsing the string
# forwards as normal.
orig_i = i
while i > 0:
i -= 1
byte_as_int = ord(s[i])
if (byte_as_int >> 6) != 0b10:
offset = orig_i - i
if offset != _Utf8CharLen(byte_as_int):
# Leaving a generic error for now, but if we want to, it's not
# hard to calculate the position where things go wrong. Note
# that offset might be more than 4, for an invalid utf-8 string.
e_strict(INVALID_START)
return i
e_strict(INVALID_START)
def CountUtf8Chars(s):
# type: (str) -> int
"""Returns the number of utf-8 characters in the byte string 's'.
TODO: Raise exception rather than returning a string, so we can set the exit
code of the command to 1 ?
$ echo ${#bad}
Invalid utf-8 at index 3 of string 'bad': 'ab\xffd'
$ echo $?
1
"""
num_chars = 0
num_bytes = len(s)
i = 0
while i < num_bytes:
i = _NextUtf8Char(s, i)
num_chars += 1
return num_chars
def AdvanceUtf8Chars(s, num_chars, byte_offset):
# type: (str, int, int) -> int
"""
Advance a certain number of UTF-8 chars, beginning with the given byte
offset. Returns a byte offset.
If we got past the end of the string
"""
num_bytes = len(s)
i = byte_offset # current byte position
for _ in xrange(num_chars):
# Neither bash or zsh checks out of bounds for slicing. Either begin or
# length.
if i >= num_bytes:
return i
#raise RuntimeError('Out of bounds')
i = _NextUtf8Char(s, i)
return i
# Implementation without Python regex:
#
# (1) PatSub: I think we fill in GlobToExtendedRegex, then use regcomp and
# regexec. in a loop. fnmatch() does NOT given positions of matches.
#
# (2) Strip -- % %% # ## -
#
# a. Fast path for constant strings.
# b. Convert to POSIX extended regex, to see if it matches at ALL. If it
# doesn't match, short circuit out? We can't do this with fnmatch.
# c. If it does match, call fnmatch() iteratively over prefixes / suffixes.
#
# - # shortest prefix - [:1], [:2], [:3] until it matches
# - ## longest prefix - [:-1] [:-2], [:3]. Works because fnmatch does not
# match prefixes, it matches EXATLY.
# - % shortest suffix - [-1:] [-2:] [-3:] ...
# - %% longest suffix - [1:] [2:] [3:]
#
# See remove_pattern() in subst.c for bash, and trimsub() in eval.c for
# mksh. Dash doesn't implement it.
# TODO:
# - Unicode support: Convert both pattern, string, and replacement to unicode,
# then the result back at the end.
# - Add location info to errors. Maybe pass spid pair all the way down.
# - Compile time errors for [[:space:]] ?
def DoUnarySuffixOp(s, op, arg, extglob):
# type: (str, suffix_op__Unary, str, bool) -> str
"""Helper for ${x#prefix} and family."""
# Fast path for constant strings.
if not glob_.LooksLikeGlob(arg):
# It doesn't look like a glob, but we glob-escaped it (e.g. [ -> \[). So
# reverse it. NOTE: We also do this check in Globber.Expand(). It would
# be nice to somehow store the original string rather tahn
# escaping/unescaping.
arg = glob_.GlobUnescape(arg)
if op.op_id in (Id.VOp1_Pound, Id.VOp1_DPound): # const prefix
# explicit check for non-empty arg (len for mycpp)
if len(arg) and s.startswith(arg):
return s[len(arg):]
else:
return s
elif op.op_id in (Id.VOp1_Percent, Id.VOp1_DPercent): # const suffix
# need explicit check for non-empty arg (len for mycpp)
if len(arg) and s.endswith(arg):
return s[:-len(arg)]
else:
return s
# These operators take glob arguments, we don't implement that obscure case.
elif op.op_id == Id.VOp1_Comma: # Only lowercase the first letter
if arg != '':
# TODO: location info for op
e_die("%s can't have an argument", ui.PrettyId(op.op_id))
if len(s):
return s[0].lower() + s[1:]
else:
return s
elif op.op_id == Id.VOp1_DComma:
if arg != '':
e_die("%s can't have an argument", ui.PrettyId(op.op_id))
return s.lower()
elif op.op_id == Id.VOp1_Caret: # Only uppercase the first letter
if arg != '':
e_die("%s can't have an argument", ui.PrettyId(op.op_id))
if len(s):
return s[0].upper() + s[1:]
else:
return s
elif op.op_id == Id.VOp1_DCaret:
if arg != '':
e_die("%s can't have an argument", ui.PrettyId(op.op_id))
return s.upper()
else: # e.g. ^ ^^ , ,,
raise AssertionError(op.op_id)
# For patterns, do fnmatch() in a loop.
#
# TODO:
# - Another potential fast path:
# v=aabbccdd
# echo ${v#*b} # strip shortest prefix
#
# If the whole thing doesn't match '*b*', then no test can succeed. So we
# can fail early. Conversely echo ${v%%c*} and '*c*'.
#
# (Although honestly this whole construct is nuts and should be deprecated.)
n = len(s)
if op.op_id == Id.VOp1_Pound: # shortest prefix
# 'abcd': match '', 'a', 'ab', 'abc', ...
i = 0
while True:
assert i <= n
#log('Matching pattern %r with %r', arg, s[:i])
if libc.fnmatch(arg, s[:i], extglob):
return s[i:]
if i >= n:
break
i = _NextUtf8Char(s, i)
return s
elif op.op_id == Id.VOp1_DPound: # longest prefix
# 'abcd': match 'abc', 'ab', 'a'
i = n
while True:
assert i >= 0
#log('Matching pattern %r with %r', arg, s[:i])
if libc.fnmatch(arg, s[:i], extglob):
return s[i:]
if i == 0:
break
i = PreviousUtf8Char(s, i)
return s
elif op.op_id == Id.VOp1_Percent: # shortest suffix
# 'abcd': match 'abcd', 'abc', 'ab', 'a'
i = n
while True:
assert i >= 0
#log('Matching pattern %r with %r', arg, s[:i])
if libc.fnmatch(arg, s[i:], extglob):
return s[:i]
if i == 0:
break
i = PreviousUtf8Char(s, i)
return s
elif op.op_id == Id.VOp1_DPercent: # longest suffix
# 'abcd': match 'abc', 'bc', 'c', ...
i = 0
while True:
assert i <= n
#log('Matching pattern %r with %r', arg, s[:i])
if libc.fnmatch(arg, s[i:], extglob):
return s[:i]
if i >= n:
break
i = _NextUtf8Char(s, i)
return s
else:
raise NotImplementedError(ui.PrettyId(op.op_id))
def _AllMatchPositions(s, regex):
# type: (str, str) -> List[Tuple[int, int]]
"""Returns a list of all (start, end) match positions of the regex against s.
(If there are no matches, it returns the empty list.)
"""
matches = [] # type: List[Tuple[int, int]]
pos = 0
n = len(s)
while pos < n: # needed to prevent infinite loop in (.*) case
m = libc.regex_first_group_match(regex, s, pos)
if m is None:
break
matches.append(m)
start, end = m
pos = end # advance position
return matches
def _PatSubAll(s, regex, replace_str):
# type: (str, str, str) -> str
parts = [] # type: List[str]
prev_end = 0
for start, end in _AllMatchPositions(s, regex):
parts.append(s[prev_end:start])
parts.append(replace_str)
prev_end = end
parts.append(s[prev_end:])
return ''.join(parts)
class GlobReplacer(object):
def __init__(self, regex, replace_str, slash_spid):
# type: (str, str, int) -> None
# TODO: It would be nice to cache the compilation of the regex here,
# instead of just the string. That would require more sophisticated use of
# the Python/C API in libc.c, which we might want to avoid.
self.regex = regex
self.replace_str = replace_str
self.slash_spid = slash_spid
def __repr__(self):
# type: () -> str
return '<_GlobReplacer regex %r r %r>' % (self.regex, self.replace_str)
def Replace(self, s, op):
# type: (str, suffix_op__PatSub) -> str
regex = '(%s)' % self.regex # make it a group
if op.replace_mode == Id.Lit_Slash:
try:
return _PatSubAll(s, regex, self.replace_str) # loop over matches
except RuntimeError as e:
# libc.regex_first_group_match raises RuntimeError.
# note: MyPy doesn't know RuntimeError has e.message (and e.args)
msg = e.message # type: str
e_die('Error matching regex %r: %s', regex, msg,
span_id=self.slash_spid)
if op.replace_mode == Id.Lit_Pound:
regex = '^' + regex
elif op.replace_mode == Id.Lit_Percent:
regex = regex + '$'
m = libc.regex_first_group_match(regex, s, 0)
#log('regex = %r, s = %r, match = %r', regex, s, m)
if m is None:
return s
start, end = m
return s[:start] + self.replace_str + s[end:]
def ShellQuoteB(s):
# type: (str) -> str
"""Quote by adding backslashes.
Used for autocompletion, so it's friendlier for display on the command line.
We use the strategy above for other use cases.
"""
# There's no way to escape a newline! Bash prints ^J for some reason, but
# we're more explicit. This will happen if there's a newline on a file
# system or a completion plugin returns a newline.
# NOTE: tabs CAN be escaped with \.
s = s.replace('\r', '<INVALID CR>').replace('\n', '<INVALID NEWLINE>')
# ~ for home dir
# ! for history
# * [] ? for glob
# {} for brace expansion
# space because it separates words
return pyutil.BackslashEscape(s, ' `~!$&*()[]{}\\|;\'"<>?')