Skip to content

Commit d16c288

Browse files
committed
Improve re module. Favor string.py over string.rs
1 parent 9261d94 commit d16c288

File tree

5 files changed

+493
-64
lines changed

5 files changed

+493
-64
lines changed

Lib/string.py

Lines changed: 305 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,305 @@
1+
"""A collection of string constants.
2+
3+
Public module variables:
4+
5+
whitespace -- a string containing all ASCII whitespace
6+
ascii_lowercase -- a string containing all ASCII lowercase letters
7+
ascii_uppercase -- a string containing all ASCII uppercase letters
8+
ascii_letters -- a string containing all ASCII letters
9+
digits -- a string containing all ASCII decimal digits
10+
hexdigits -- a string containing all ASCII hexadecimal digits
11+
octdigits -- a string containing all ASCII octal digits
12+
punctuation -- a string containing all ASCII punctuation characters
13+
printable -- a string containing all ASCII characters considered printable
14+
15+
"""
16+
17+
__all__ = ["ascii_letters", "ascii_lowercase", "ascii_uppercase", "capwords",
18+
"digits", "hexdigits", "octdigits", "printable", "punctuation",
19+
"whitespace", "Formatter", "Template"]
20+
21+
import _string
22+
23+
# ctype-style character classes, each exposed as a plain ASCII string.
# The composite constants are assembled from the primitive ones so the
# pieces cannot drift apart.
whitespace = ' \t\n\r\v\f'
digits = '0123456789'
octdigits = '01234567'
ascii_lowercase = 'abcdefghijklmnopqrstuvwxyz'
ascii_uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
ascii_letters = ''.join((ascii_lowercase, ascii_uppercase))
hexdigits = ''.join((digits, 'abcdef', 'ABCDEF'))
printable = ''.join((digits, ascii_letters, punctuation, whitespace))
33+
34+
# Functions which aren't available as string methods.
35+
36+
# Capitalize the words in a string, e.g. " aBc dEf " -> "Abc Def".
37+
def capwords(s, sep=None):
    """capwords(s [,sep]) -> string

    Break *s* into words with ``s.split(sep)``, capitalize each word
    with ``capitalize()`` and join the results back together.

    When *sep* is omitted or None the words are joined with a single
    space, so runs of whitespace collapse and leading/trailing
    whitespace disappears.  Otherwise *sep* is used both to split and
    to join the words.

    """
    glue = sep or ' '
    words = [word.capitalize() for word in s.split(sep)]
    return glue.join(words)
49+
50+
51+
####################################################################
52+
import re as _re
53+
from collections import ChainMap as _ChainMap
54+
55+
class _TemplateMetaclass(type):
    """Metaclass that compiles the placeholder regex of a Template class.

    A class that defines its own ``pattern`` has that pattern compiled
    as-is.  Otherwise the skeleton below is filled in from the class's
    ``delimiter``, ``idpattern`` and ``braceidpattern`` attributes.  In both
    cases the result is compiled with ``cls.flags | re.VERBOSE`` and stored
    back on the class as ``pattern``.
    """

    # Verbose-mode regex skeleton.  Everything after a '#' on a line is a
    # regex comment, so the trailing '|' and the 'invalid' alternative are
    # currently disabled: ill-formed delimiter expressions are simply not
    # matched by the default pattern.
    pattern = r"""
    %(delim)s(?:
      (?P<escaped>%(delim)s) | # Escape sequence of two delimiters
      (?P<named>%(id)s) | # delimiter and a Python identifier
      \{(?P<braced>%(bid)s)\} #| # delimiter and a braced identifier
      # (?P<invalid>) # Other ill-formed delimiter exprs
    )
    """

    def __init__(cls, name, bases, dct):
        super(_TemplateMetaclass, cls).__init__(name, bases, dct)
        if 'pattern' not in dct:
            # No explicit pattern: build one from the class configuration.
            substitutions = {
                'delim': _re.escape(cls.delimiter),
                'id': cls.idpattern,
                'bid': cls.braceidpattern or cls.idpattern,
            }
            regex_source = _TemplateMetaclass.pattern % substitutions
        else:
            regex_source = cls.pattern
        compile_flags = cls.flags | _re.VERBOSE
        cls.pattern = _re.compile(regex_source, compile_flags)
76+
77+
78+
class Template(metaclass=_TemplateMetaclass):
    """A string class for supporting $-substitutions.

    The metaclass stores the compiled placeholder regex in ``pattern``.
    The regex must define the named groups ``escaped``, ``named`` and
    ``braced``; an ``invalid`` group is honoured when the pattern has one.
    """

    delimiter = '$'
    # r'[a-z]' matches to non-ASCII letters when used with IGNORECASE, but
    # without the ASCII flag.  We can't add re.ASCII to flags because of
    # backward compatibility.
    # See https://bugs.python.org/issue31672
    # NOTE(review): the pattern below is a plain capturing group and does
    # not carry the ``?a`` local flag discussed in that issue, so the caveat
    # above still applies here -- confirm the regex engine in use supports
    # ``(?a:...)`` before switching.
    idpattern = r'([_a-z][_a-z0-9]*)'
    braceidpattern = None
    flags = _re.IGNORECASE

    def __init__(self, template):
        """Store *template*, the text later scanned for placeholders."""
        self.template = template

    # Search for $$, $identifier, ${identifier}, and any bare $'s

    def _invalid(self, mo):
        """Raise ValueError describing where the ``invalid`` group matched.

        The line and column are derived from the start offset of the
        ``invalid`` group of *mo* within ``self.template``.
        """
        i = mo.start('invalid')
        lines = self.template[:i].splitlines(keepends=True)
        if not lines:
            colno = 1
            lineno = 1
        else:
            # Column = offset minus the length of all complete lines before it.
            colno = i - len(''.join(lines[:-1]))
            lineno = len(lines)
        raise ValueError('Invalid placeholder in string: line %d, col %d' %
                         (lineno, colno))

    @staticmethod
    def _mapping_from(args, kws):
        """Build the lookup mapping shared by both substitute methods.

        *args* holds at most one positional mapping; *kws* holds the keyword
        arguments.  Raises TypeError when more than one positional argument
        was supplied.
        """
        if len(args) > 1:
            raise TypeError('Too many positional arguments')
        if not args:
            return kws
        if kws:
            # Keyword arguments are searched before the positional mapping.
            return _ChainMap(kws, args[0])
        return args[0]

    @staticmethod
    def _optional_group(mo, name):
        """Return ``mo.group(name)``, or None if the pattern lacks that group.

        The default pattern may not define every optional group (for
        example ``invalid``); looking up an unknown group raises IndexError,
        which would otherwise mask the intended ValueError in ``convert``.
        """
        try:
            return mo.group(name)
        except IndexError:
            return None

    def substitute(*args, **kws):
        """Return the template with every placeholder replaced.

        Call as ``t.substitute(mapping, **kws)``.  Values are looked up in
        the keyword arguments first, then in the optional positional
        mapping, and converted with ``str()``.  A failed lookup propagates
        the mapping's own error (KeyError for a dict).

        Raises:
            TypeError: if the instance is missing or more than one
                positional mapping is given.
            ValueError: if the pattern matched an ``invalid`` group or none
                of the recognised named groups.
        """
        if not args:
            raise TypeError("descriptor 'substitute' of 'Template' object "
                            "needs an argument")
        self, *args = args  # allow the "self" keyword be passed
        mapping = Template._mapping_from(args, kws)

        # Helper function for .sub()
        def convert(mo):
            # Check the most common path first.
            named = mo.group('named') or mo.group('braced')
            if named is not None:
                return str(mapping[named])
            if mo.group('escaped') is not None:
                return self.delimiter
            if Template._optional_group(mo, 'invalid') is not None:
                self._invalid(mo)
            raise ValueError('Unrecognized named group in pattern',
                             self.pattern)
        return self.pattern.sub(convert, self.template)

    def safe_substitute(*args, **kws):
        """Like ``substitute`` but leave unresolved placeholders untouched.

        A placeholder whose lookup raises KeyError, or that matched an
        ``invalid`` group, is copied to the output unchanged.

        Raises:
            TypeError: if the instance is missing or more than one
                positional mapping is given.
            ValueError: if the pattern matched none of the recognised named
                groups.
        """
        if not args:
            raise TypeError("descriptor 'safe_substitute' of 'Template' object "
                            "needs an argument")
        self, *args = args  # allow the "self" keyword be passed
        mapping = Template._mapping_from(args, kws)

        # Helper function for .sub()
        def convert(mo):
            named = mo.group('named') or mo.group('braced')
            if named is not None:
                try:
                    return str(mapping[named])
                except KeyError:
                    return mo.group()
            if mo.group('escaped') is not None:
                return self.delimiter
            if Template._optional_group(mo, 'invalid') is not None:
                return mo.group()
            raise ValueError('Unrecognized named group in pattern',
                             self.pattern)
        return self.pattern.sub(convert, self.template)
162+
163+
164+
165+
########################################################################
166+
# the Formatter class
167+
# see PEP 3101 for details and purpose of this class
168+
169+
# The hard parts are reused from the native implementation. They're exposed
170+
# as functions of the "_string" module.
171+
172+
# The overall parser is implemented in _string.formatter_parser.
173+
# The field name parser is implemented in _string.formatter_field_name_split
174+
175+
class Formatter:
    """Extensible implementation of ``str.format``-style formatting.

    See PEP 3101 for background.  The format-string parser and the
    field-name splitter come from the ``_string`` module; every other step
    is a method that subclasses may override.
    """

    def format(*args, **kwargs):
        """Format a string: ``formatter.format(format_string, *args, **kwargs)``.

        The signature is ``*args`` only so that ``self`` and
        ``format_string`` stay usable as keyword names for replacement
        fields.

        Raises:
            TypeError: if the instance or the format string is missing.
        """
        if not args:
            raise TypeError("descriptor 'format' of 'Formatter' object "
                            "needs an argument")
        self, *args = args  # allow the "self" keyword be passed
        try:
            format_string, *args = args  # allow the "format_string" keyword be passed
        except ValueError:
            raise TypeError("format() missing 1 required positional "
                            "argument: 'format_string'") from None
        return self.vformat(format_string, args, kwargs)

    def vformat(self, format_string, args, kwargs):
        """Format *format_string* with the sequence *args* and mapping *kwargs*.

        Starts the nested-field recursion budget at 2 and calls
        ``check_unused_args`` with the set of argument keys that were used.
        """
        used_args = set()
        result, _ = self._vformat(format_string, args, kwargs, used_args, 2)
        self.check_unused_args(used_args, args, kwargs)
        return result

    def _vformat(self, format_string, args, kwargs, used_args, recursion_depth,
                 auto_arg_index=0):
        """Expand *format_string* and return ``(text, auto_arg_index)``.

        *auto_arg_index* is the next index for ``{}`` fields, or ``False``
        once an explicit numeric field has been seen.  It is threaded
        through the recursive calls that expand format specs.

        Raises:
            ValueError: if *recursion_depth* drops below zero, or if the
                format string mixes automatic and manual field numbering.
        """
        if recursion_depth < 0:
            raise ValueError('Max string recursion exceeded')
        result = []
        for literal_text, field_name, format_spec, conversion in \
                self.parse(format_string):

            # output the literal text
            if literal_text:
                result.append(literal_text)

            # if there's a field, output it
            if field_name is not None:
                # this is some markup, find the object and do
                #  the formatting

                # handle arg indexing when empty field_names are given.
                if field_name == '':
                    if auto_arg_index is False:
                        raise ValueError('cannot switch from manual field '
                                         'specification to automatic field '
                                         'numbering')
                    field_name = str(auto_arg_index)
                    auto_arg_index += 1
                elif field_name.isdigit():
                    if auto_arg_index:
                        # A non-zero counter means '{}' was already used, so
                        # this is the automatic -> manual direction.
                        raise ValueError('cannot switch from automatic field '
                                         'numbering to manual field '
                                         'specification')
                    # disable auto arg incrementing, if it gets
                    # used later on, then an exception will be raised
                    auto_arg_index = False

                # given the field_name, find the object it references
                #  and the argument it came from
                obj, arg_used = self.get_field(field_name, args, kwargs)
                used_args.add(arg_used)

                # do any conversion on the resulting object
                obj = self.convert_field(obj, conversion)

                # expand the format spec, if needed
                format_spec, auto_arg_index = self._vformat(
                    format_spec, args, kwargs,
                    used_args, recursion_depth-1,
                    auto_arg_index=auto_arg_index)

                # format the object and append to the result
                result.append(self.format_field(obj, format_spec))

        return ''.join(result), auto_arg_index

    def get_value(self, key, args, kwargs):
        """Return ``args[key]`` for an int *key*, otherwise ``kwargs[key]``."""
        if isinstance(key, int):
            return args[key]
        else:
            return kwargs[key]

    def check_unused_args(self, used_args, args, kwargs):
        """Hook called by ``vformat`` after formatting; does nothing here."""
        pass

    def format_field(self, value, format_spec):
        """Format *value* with the built-in ``format(value, format_spec)``."""
        return format(value, format_spec)

    def convert_field(self, value, conversion):
        """Apply the ``!s`` / ``!r`` / ``!a`` conversion to *value*.

        A *conversion* of None returns *value* unchanged; any other
        unrecognised specifier raises ValueError.
        """
        # do any conversion on the resulting object
        if conversion is None:
            return value
        elif conversion == 's':
            return str(value)
        elif conversion == 'r':
            return repr(value)
        elif conversion == 'a':
            return ascii(value)
        raise ValueError("Unknown conversion specifier {0!s}".format(conversion))

    # returns an iterable that contains tuples of the form:
    # (literal_text, field_name, format_spec, conversion)
    # literal_text can be zero length
    # field_name can be None, in which case there's no
    #  object to format and output
    # if field_name is not None, it is looked up, formatted
    #  with format_spec and conversion and then used
    def parse(self, format_string):
        """Split *format_string* using ``_string.formatter_parser``."""
        return _string.formatter_parser(format_string)

    # given a field_name, find the object it references.
    #  field_name:   the field being looked up, e.g. "0.name"
    #                 or "lookup[3]"
    #  used_args:    a set of which args have been used
    #  args, kwargs: as passed in to vformat
    def get_field(self, field_name, args, kwargs):
        """Resolve *field_name* and return ``(obj, first)``.

        ``first`` is the leading key of the field name (the argument the
        lookup started from); ``obj`` is the result of following the
        remaining attribute and item accesses.
        """
        first, rest = _string.formatter_field_name_split(field_name)

        obj = self.get_value(first, args, kwargs)

        # loop through the rest of the field_name, doing
        #  getattr or getitem as needed
        for is_attr, i in rest:
            if is_attr:
                obj = getattr(obj, i)
            else:
                obj = obj[i]

        return obj, first

tests/snippets/test_re.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,21 @@
1313
assert mo.end() == 5
1414

1515
assert re.escape('python.exe') == 'python\\.exe'
16+
17+
p = re.compile('ab')
18+
s = p.sub('x', 'abcabca')
19+
print(s)
20+
assert s == 'xcxca'
21+
22+
idpattern = r'([_a-z][_a-z0-9]*)'
23+
24+
mo = re.search(idpattern, '7382 _boe0+2')
25+
print(mo)
26+
# TODO:
27+
# assert mo.group(0) == '_boe0'
28+
29+
from string import Template
30+
s = Template('$who likes $what')
31+
# TODO:
32+
# r = s.substitute(who='tim', what='kung pow')
33+
# print(r)

vm/src/stdlib/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ pub fn get_module_inits() -> HashMap<String, StdlibInitFunc> {
5353
"platform".to_string() => Box::new(platform::make_module),
5454
"re".to_string() => Box::new(re::make_module),
5555
"random".to_string() => Box::new(random::make_module),
56-
"string".to_string() => Box::new(string::make_module),
56+
"_string".to_string() => Box::new(string::make_module),
5757
"struct".to_string() => Box::new(pystruct::make_module),
5858
"_thread".to_string() => Box::new(thread::make_module),
5959
"time".to_string() => Box::new(time_module::make_module),

0 commit comments

Comments
 (0)