Skip to content

Commit d74e126

Browse files
committed
Add parser for C printf-style format strings
This commit is part of a series of upcoming commits that will resolve RustPython#1007. Code is based on vm/src/format.rs.
1 parent 73edde6 commit d74e126

File tree

3 files changed

+358
-2
lines changed

3 files changed

+358
-2
lines changed

vm/src/cformat.rs

Lines changed: 353 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,353 @@
1+
use crate::format::{parse_number, parse_precision};
2+
/// Implementation of Printf-Style string formatting
3+
/// [https://docs.python.org/3/library/stdtypes.html#printf-style-string-formatting]
4+
use num_bigint::{BigInt, Sign};
5+
use num_traits::Signed;
6+
use std::cmp;
7+
use std::str::FromStr;
8+
9+
/// The ways in which parsing a printf-style format string can fail.
#[derive(Debug, PartialEq)]
pub enum CFormatErrorType {
    /// A mapping key was opened with `(` but no closing `)` follows.
    UnmatchedKeyParentheses,
    /// A conversion specifier was expected but the text does not start with `%`.
    MissingModuloSign,
    /// A `%` inside literal text is followed by something other than a second `%`.
    UnescapedModuloSignInLiteral,
    /// The conversion character (carried in the variant) is not recognised.
    UnsupportedFormatChar(char),
    /// The input ended before the literal escape or specifier was complete.
    IncompleteFormat,
    /// NOTE(review): not returned by any parsing function visible in this
    /// module; presumably reserved for features not yet supported.
    Unimplemented,
}
18+
19+
// A parsing failure: the error type paired with how much of the input the
// parsing function had consumed when it failed.
20+
type ParsingError = (CFormatErrorType, usize);
21+
22+
pub struct CFormatError {
23+
pub typ: CFormatErrorType,
24+
pub index: usize,
25+
}
26+
27+
/// Which string conversion a string-typed specifier requests.
#[derive(Debug, PartialEq)]
pub enum CFormatPreconversor {
    /// Selected by the `r` conversion character.
    Repr,
    /// Selected by the `s` conversion character.
    Str,
    /// Selected by the `a` conversion character.
    Ascii,
}
33+
34+
/// Letter case attached to a conversion, taken from the case of the
/// conversion character (e.g. `x` vs `X`).
#[derive(Debug, PartialEq)]
pub enum CFormatCase {
    /// The conversion character was lowercase.
    Lowercase,
    /// The conversion character was uppercase.
    Uppercase,
}
39+
40+
#[derive(Debug, PartialEq)]
41+
pub enum CNumberType {
42+
Decimal,
43+
Octal,
44+
Hex(CFormatCase),
45+
}
46+
47+
#[derive(Debug, PartialEq)]
48+
pub enum CFloatType {
49+
Exponent(CFormatCase),
50+
PointDecimal,
51+
General(CFormatCase),
52+
}
53+
54+
#[derive(Debug, PartialEq)]
55+
pub enum CFormatType {
56+
Number(CNumberType),
57+
Float(CFloatType),
58+
Character,
59+
String(CFormatPreconversor),
60+
}
61+
62+
bitflags! {
    /// Set of conversion flags that may follow the `%` (and optional mapping
    /// key) of a specifier. Several flags can be combined.
    pub struct CConversionFlags: u32 {
        // Set by the '#' flag character.
        const ALTERNATE_FORM = 0b0000_0001;
        // Set by the '0' flag character.
        const ZERO_PAD = 0b0000_0010;
        // Set by the '-' flag character.
        const LEFT_ADJUST = 0b0000_0100;
        // Set by the ' ' (space) flag character.
        const BLANK_SIGN = 0b0000_1000;
        // Set by the '+' flag character.
        const SIGN_CHAR = 0b0001_0000;
    }
}
71+
72+
#[derive(Debug, PartialEq)]
73+
pub struct CFormatSpec {
74+
pub mapping_key: Option<String>,
75+
pub flags: CConversionFlags,
76+
pub min_field_width: Option<usize>,
77+
pub precision: Option<usize>,
78+
pub format_type: CFormatType,
79+
pub format_char: char,
80+
}
81+
82+
#[derive(Debug, PartialEq)]
83+
pub enum CFormatPart {
84+
Literal(String),
85+
Spec(CFormatSpec),
86+
}
87+
88+
#[derive(Debug, PartialEq)]
89+
pub struct CFormatString {
90+
pub format_parts: Vec<(usize, CFormatPart)>,
91+
}
92+
93+
impl FromStr for CFormatString {
94+
type Err = CFormatError;
95+
96+
fn from_str(text: &str) -> Result<Self, Self::Err> {
97+
let mut cur_text: &str = text;
98+
let mut index = 0;
99+
let mut parts: Vec<(usize, CFormatPart)> = Vec::new();
100+
while !cur_text.is_empty() {
101+
cur_text = parse_literal(cur_text)
102+
.or_else(|_| parse_specifier(cur_text))
103+
.map(|(format_part, new_text, consumed)| {
104+
parts.push((index, format_part));
105+
index = index + consumed;
106+
new_text
107+
})
108+
.map_err(|(e, consumed)| {
109+
CFormatError {
110+
typ: e,
111+
index: index + consumed,
112+
}
113+
})?;
114+
}
115+
116+
Ok(CFormatString {
117+
format_parts: parts,
118+
})
119+
}
120+
}
121+
122+
fn parse_literal_single(text: &str) -> Result<(char, &str), CFormatErrorType> {
123+
let mut chars = text.chars();
124+
// TODO get rid of the unwrap
125+
let first_char = chars.next().unwrap();
126+
if first_char == '%' {
127+
// if we see a %, it has to be escaped
128+
match chars.next() {
129+
Some(next_char) => {
130+
if next_char != first_char {
131+
Err(CFormatErrorType::UnescapedModuloSignInLiteral)
132+
} else {
133+
Ok((first_char, chars.as_str()))
134+
}
135+
}
136+
None => Err(CFormatErrorType::IncompleteFormat),
137+
}
138+
} else {
139+
Ok((first_char, chars.as_str()))
140+
}
141+
}
142+
143+
fn parse_literal(text: &str) -> Result<(CFormatPart, &str, usize), ParsingError> {
144+
let mut cur_text = text;
145+
let mut result_string = String::new();
146+
let mut consumed = 0;
147+
while !cur_text.is_empty() {
148+
match parse_literal_single(cur_text) {
149+
Ok((next_char, remaining)) => {
150+
result_string.push(next_char);
151+
consumed = consumed + 1;
152+
cur_text = remaining;
153+
}
154+
Err(err) => {
155+
if !result_string.is_empty() {
156+
return Ok((
157+
CFormatPart::Literal(result_string.to_string()),
158+
cur_text,
159+
consumed,
160+
));
161+
} else {
162+
return Err((err, consumed));
163+
}
164+
}
165+
}
166+
}
167+
Ok((
168+
CFormatPart::Literal(result_string.to_string()),
169+
"",
170+
text.len(),
171+
))
172+
}
173+
174+
fn parse_spec_mapping_key(text: &str) -> Result<(Option<String>, &str), CFormatErrorType> {
175+
let mut chars = text.chars();
176+
177+
let next_char = chars.next();
178+
if next_char == Some('(') {
179+
// Get remaining characters after opening parentheses.
180+
let cur_text = chars.as_str();
181+
match cur_text.find(')') {
182+
Some(position) => {
183+
let (left, right) = cur_text.split_at(position);
184+
185+
Ok((Some(left.to_string()), &right[1..]))
186+
}
187+
None => Err(CFormatErrorType::UnmatchedKeyParentheses),
188+
}
189+
} else {
190+
Ok((None, text))
191+
}
192+
}
193+
194+
fn parse_flag_single(text: &str) -> (Option<CConversionFlags>, &str) {
195+
let mut chars = text.chars();
196+
match chars.next() {
197+
Some('#') => (Some(CConversionFlags::ALTERNATE_FORM), chars.as_str()),
198+
Some('0') => (Some(CConversionFlags::ZERO_PAD), chars.as_str()),
199+
Some('-') => (Some(CConversionFlags::LEFT_ADJUST), chars.as_str()),
200+
Some(' ') => (Some(CConversionFlags::BLANK_SIGN), chars.as_str()),
201+
Some('+') => (Some(CConversionFlags::SIGN_CHAR), chars.as_str()),
202+
_ => (None, text),
203+
}
204+
}
205+
206+
fn parse_flags(text: &str) -> (CConversionFlags, &str) {
207+
let mut flags = CConversionFlags::empty();
208+
let mut cur_text = text;
209+
while !cur_text.is_empty() {
210+
match parse_flag_single(cur_text) {
211+
(Some(flag), text) => {
212+
flags |= flag;
213+
cur_text = text;
214+
}
215+
216+
(None, text) => {
217+
return (flags, text);
218+
}
219+
}
220+
}
221+
222+
(flags, "")
223+
}
224+
225+
/// Skips a single length modifier (`h`, `l` or `L`) at the start of `text`.
///
/// At most one character is skipped; any other input is returned unchanged.
fn consume_length(text: &str) -> &str {
    let has_modifier = text.starts_with(|c: char| c == 'h' || c == 'l' || c == 'L');
    if has_modifier {
        // All three modifiers are single-byte ASCII characters.
        &text[1..]
    } else {
        text
    }
}
232+
233+
fn parse_format_type(text: &str) -> Result<(CFormatType, &str, char), CFormatErrorType> {
234+
use CFloatType::*;
235+
use CFormatCase::{Lowercase, Uppercase};
236+
use CNumberType::*;
237+
let mut chars = text.chars();
238+
let next_char = chars.next();
239+
match next_char {
240+
Some('d') | Some('i') | Some('u') => Ok((
241+
CFormatType::Number(Decimal),
242+
chars.as_str(),
243+
next_char.unwrap(),
244+
)),
245+
Some('o') => Ok((
246+
CFormatType::Number(Octal),
247+
chars.as_str(),
248+
next_char.unwrap(),
249+
)),
250+
Some('x') => Ok((
251+
CFormatType::Number(Hex(Lowercase)),
252+
chars.as_str(),
253+
next_char.unwrap(),
254+
)),
255+
Some('X') => Ok((
256+
CFormatType::Number(Hex(Uppercase)),
257+
chars.as_str(),
258+
next_char.unwrap(),
259+
)),
260+
Some('e') => Ok((
261+
CFormatType::Float(Exponent(Lowercase)),
262+
chars.as_str(),
263+
next_char.unwrap(),
264+
)),
265+
Some('E') => Ok((
266+
CFormatType::Float(Exponent(Uppercase)),
267+
chars.as_str(),
268+
next_char.unwrap(),
269+
)),
270+
Some('f') => Ok((
271+
CFormatType::Float(PointDecimal),
272+
chars.as_str(),
273+
next_char.unwrap(),
274+
)),
275+
Some('F') => Ok((
276+
CFormatType::Float(PointDecimal),
277+
chars.as_str(),
278+
next_char.unwrap(),
279+
)),
280+
Some('g') => Ok((
281+
CFormatType::Float(General(Lowercase)),
282+
text,
283+
next_char.unwrap(),
284+
)),
285+
Some('G') => Ok((
286+
CFormatType::Float(General(Uppercase)),
287+
text,
288+
next_char.unwrap(),
289+
)),
290+
Some('c') => Ok((CFormatType::Character, chars.as_str(), next_char.unwrap())),
291+
Some('r') => Ok((
292+
CFormatType::String(CFormatPreconversor::Repr),
293+
chars.as_str(),
294+
next_char.unwrap(),
295+
)),
296+
Some('s') => Ok((
297+
CFormatType::String(CFormatPreconversor::Str),
298+
chars.as_str(),
299+
next_char.unwrap(),
300+
)),
301+
Some('a') => Ok((
302+
CFormatType::String(CFormatPreconversor::Ascii),
303+
chars.as_str(),
304+
next_char.unwrap(),
305+
)),
306+
Some(c) => Err(CFormatErrorType::UnsupportedFormatChar(c)),
307+
None => Err(CFormatErrorType::IncompleteFormat), // should not happen because it is handled earlier in the parsing
308+
}
309+
}
310+
311+
fn parse_specifier(text: &str) -> Result<(CFormatPart, &str, usize), ParsingError> {
312+
let consumed = 0;
313+
let mut chars = text.chars();
314+
if chars.next() != Some('%') {
315+
return Err((CFormatErrorType::MissingModuloSign, consumed));
316+
}
317+
let consumed = consumed + 1;
318+
319+
let (mapping_key, after_mapping_key) =
320+
parse_spec_mapping_key(chars.as_str()).map_err(|err| (err, consumed))?;
321+
let consumed = text.find(after_mapping_key).unwrap();
322+
let (flags, after_flags) = parse_flags(after_mapping_key);
323+
let (width, after_width) = parse_number(after_flags);
324+
let (precision, after_precision) = parse_precision(after_width);
325+
// A length modifier (h, l, or L) may be present,
326+
// but is ignored as it is not necessary for Python – so e.g. %ld is identical to %d.
327+
let after_length = consume_length(after_precision);
328+
let (format_type, remaining_text, format_char) =
329+
parse_format_type(after_length).map_err(|err| (err, consumed))?;
330+
let consumed = text.find(remaining_text).unwrap();
331+
332+
// apply default precision for float types
333+
let precision = match precision {
334+
Some(precision) => Some(precision),
335+
None => match format_type {
336+
CFormatType::Float(_) => Some(6),
337+
_ => None,
338+
},
339+
};
340+
341+
Ok((
342+
CFormatPart::Spec(CFormatSpec {
343+
mapping_key: mapping_key,
344+
flags: flags,
345+
min_field_width: width,
346+
precision: precision,
347+
format_type: format_type,
348+
format_char: format_char,
349+
}),
350+
remaining_text,
351+
consumed,
352+
))
353+
}

vm/src/format.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ fn parse_fill_and_align(text: &str) -> (Option<char>, Option<FormatAlign>, &str)
151151
}
152152
}
153153

154-
fn parse_number(text: &str) -> (Option<usize>, &str) {
154+
pub fn parse_number(text: &str) -> (Option<usize>, &str) {
155155
let num_digits: usize = get_num_digits(text);
156156
if num_digits == 0 {
157157
return (None, text);
@@ -189,7 +189,7 @@ fn parse_zero(text: &str) -> &str {
189189
}
190190
}
191191

192-
fn parse_precision(text: &str) -> (Option<usize>, &str) {
192+
pub fn parse_precision(text: &str) -> (Option<usize>, &str) {
193193
let mut chars = text.chars();
194194
match chars.next() {
195195
Some('.') => {

vm/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
clippy::implicit_hasher
1313
)]
1414

15+
#[macro_use]
16+
extern crate bitflags;
1517
#[macro_use]
1618
extern crate lazy_static;
1719
extern crate lexical;
@@ -41,6 +43,7 @@ pub use rustpython_derive::py_compile_bytecode;
4143
pub mod macros;
4244

4345
mod builtins;
46+
pub mod cformat;
4447
mod dictdatatype;
4548
pub mod eval;
4649
mod exceptions;

0 commit comments

Comments
 (0)