Skip to content

Commit

Permalink
starlark: fix bugs in str.{title,capitalize} (google#3)
Browse files Browse the repository at this point in the history
* starlark: fix bugs in str.{title,capitalize}

Fixes google/skylark#140
  • Loading branch information
adonovan authored Nov 2, 2018
1 parent 9b05555 commit 4c43ff3
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 13 deletions.
16 changes: 11 additions & 5 deletions doc/spec.md
Original file line number Diff line number Diff line change
Expand Up @@ -3505,11 +3505,14 @@ See also: `string·elems`.
<a id='string·capitalize'></a>
### string·capitalize
`S.capitalize()` returns a copy of string S with all Unicode letters
that begin words changed to their title case.
`S.capitalize()` returns a copy of string S with its first code point
changed to its title case and all subsequent letters changed to their
lower case.
```python
"hello, world!".capitalize() # "Hello, World!"
"hello, world!".capitalize() # "Hello, world!"
"hElLo, wOrLd!".capitalize() # "Hello, world!"
"¿Por qué?".capitalize() # "¿por qué?"
```
<a id='string·codepoint_ords'></a>
Expand Down Expand Up @@ -3711,6 +3714,8 @@ letter, and all such letters that begin a word are in title case.
"Hello, World!".istitle() # True
"Catch-22".istitle() # True
"HAL-9000".istitle() # False
"ǅenan".istitle() # True ("ǅ", U+01C5, is a single title-case letter)
"Ǆenan".istitle() # False ("Ǆ", U+01C4, is a single Unicode letter, in upper case)
"123".istitle() # False
```
Expand Down Expand Up @@ -3960,12 +3965,13 @@ function reports whether any one of them is a prefix.
<a id='string·title'></a>
### string·title
`S.title()` returns a copy of the string S with letters converted to titlecase.
`S.title()` returns a copy of the string S with letters converted to title case.
Letters are converted to uppercase at the start of words, lowercase elsewhere.
Letters are converted to upper case at the start of words, lower case elsewhere.
```python
"hElLo, WoRlD!".title() # "Hello, World!"
"ǆenan".title() # "ǅenan" ("ǅ", U+01C5, is a single Unicode letter)
```
<a id='string·upper'></a>
Expand Down
49 changes: 43 additions & 6 deletions starlark/library.go
Original file line number Diff line number Diff line change
Expand Up @@ -1447,7 +1447,18 @@ func string_capitalize(fnname string, recv Value, args Tuple, kwargs []Tuple) (V
if err := UnpackPositionalArgs(fnname, args, kwargs, 0); err != nil {
return nil, err
}
return String(strings.Title(string(recv.(String)))), nil
s := string(recv.(String))
var res bytes.Buffer
res.Grow(len(s))
for i, r := range s {
if i == 0 {
r = unicode.ToTitle(r)
} else {
r = unicode.ToLower(r)
}
res.WriteRune(r)
}
return String(res.String()), nil
}

// string_iterable returns an unspecified iterable value whose iterator yields:
Expand Down Expand Up @@ -1539,16 +1550,22 @@ func string_islower(fnname string, recv_ Value, args Tuple, kwargs []Tuple) (Val
return Bool(isCasedString(recv) && recv == strings.ToLower(recv)), nil
}

// isCasedString reports whether its argument contains any cased characters.
// isCasedString reports whether its argument contains any cased code points.
func isCasedString(s string) bool {
for _, r := range s {
if 'a' <= r && r <= 'z' || 'A' <= r && r <= 'Z' || unicode.SimpleFold(r) != r {
if isCasedRune(r) {
return true
}
}
return false
}

// isCasedRune reports whether r is a cased code point: either an ASCII
// letter, or a rune that unicode.SimpleFold maps to some other rune.
func isCasedRune(r rune) bool {
	// ASCII letters are always cased; decide them without a table lookup.
	switch {
	case 'a' <= r && r <= 'z', 'A' <= r && r <= 'Z':
		return true
	}
	// It's unclear what the correct behavior is for a rune such as 'ﬃ'
	// (U+FB03), a lowercase letter with no upper or title case and no
	// SimpleFold; such runes are reported as uncased here.
	return unicode.SimpleFold(r) != r
}

// https://go.starlark.net/starlark/blob/master/doc/spec.md#string·isspace
func string_isspace(fnname string, recv_ Value, args Tuple, kwargs []Tuple) (Value, error) {
if err := UnpackPositionalArgs(fnname, args, kwargs, 0); err != nil {
Expand All @@ -1575,18 +1592,20 @@ func string_istitle(fnname string, recv_ Value, args Tuple, kwargs []Tuple) (Val
// lowercase characters only cased ones."
var cased, prevCased bool
for _, r := range recv {
if unicode.IsUpper(r) {
if 'A' <= r && r <= 'Z' || unicode.IsTitle(r) { // e.g. "Dž"
if prevCased {
return False, nil
}
cased = true
prevCased = true
cased = true
} else if unicode.IsLower(r) {
if !prevCased {
return False, nil
}
prevCased = true
cased = true
} else if unicode.IsUpper(r) {
return False, nil
} else {
prevCased = false
}
Expand Down Expand Up @@ -1934,7 +1953,25 @@ func string_title(fnname string, recv Value, args Tuple, kwargs []Tuple) (Value,
if err := UnpackPositionalArgs(fnname, args, kwargs, 0); err != nil {
return nil, err
}
return String(strings.Title(strings.ToLower(string(recv.(String))))), nil

s := string(recv.(String))

// Python semantics differ from x==strings.{To,}Title(x) in Go:
// "uppercase characters may only follow uncased characters and
// lowercase characters only cased ones."
var buf bytes.Buffer
buf.Grow(len(s))
var prevCased bool
for _, r := range s {
if prevCased {
r = unicode.ToLower(r)
} else {
r = unicode.ToTitle(r)
}
prevCased = isCasedRune(r)
buf.WriteRune(r)
}
return String(buf.String()), nil
}

// https://go.starlark.net/starlark/blob/master/doc/spec.md#string·upper
Expand Down
35 changes: 33 additions & 2 deletions starlark/testdata/string.star
Original file line number Diff line number Diff line change
Expand Up @@ -324,11 +324,15 @@ def test_predicates():
"abc": "alnum alpha lower",
"ABC": "alnum alpha upper",
"123": "alnum digit",
"DŽLJ": "alnum alpha upper",
"DžLj": "alnum alpha",
"Dž Lj": "title",
"džlj": "alnum alpha lower",
}
for str, want in table.items():
got = ' '.join([name for name in predicates if getattr(str, "is"+name)()])
if got != want:
assert.fail("%r matched [%s], want [%s]" % (str, want, got))
assert.fail("%r matched [%s], want [%s]" % (str, got, want))
test_predicates()

# Strings are not iterable.
Expand Down Expand Up @@ -360,4 +364,31 @@ assert.fails(lambda: any("abc"), "got string, want iterable") # any
assert.fails(lambda: reversed("abc"), "got string, want iterable") # reversed
assert.fails(lambda: zip("ab", "cd"), "not iterable: string") # zip

# TODO(adonovan): tests for: {,r}index join {capitalize,lower,title,upper}
# TODO(adonovan): tests for: {,r}index join

# str.capitalize
assert.eq("hElLo, WoRlD!".capitalize(), "Hello, world!")
assert.eq("por qué".capitalize(), "Por qué")
assert.eq("¿Por qué?".capitalize(), "¿por qué?")

# str.lower
assert.eq("hElLo, WoRlD!".lower(), "hello, world!")
assert.eq("por qué".lower(), "por qué")
assert.eq("¿Por qué?".lower(), "¿por qué?")
assert.eq("LJUBOVIĆ".lower(), "ljubović")
assert.true("dženan ljubović".islower())

# str.upper
assert.eq("hElLo, WoRlD!".upper(), "HELLO, WORLD!")
assert.eq("por qué".upper(), "POR QUÉ")
assert.eq("¿Por qué?".upper(), "¿POR QUÉ?")
assert.eq("ljubović".upper(), "LJUBOVIĆ")
assert.true("DŽENAN LJUBOVIĆ".isupper())

# str.title
assert.eq("hElLo, WoRlD!".title(), "Hello, World!")
assert.eq("por qué".title(), "Por Qué")
assert.eq("¿Por qué?".title(), "¿Por Qué?")
assert.eq("ljubović".title(), "Ljubović")
assert.true("Dženan Ljubović".istitle())
assert.true(not "DŽenan LJubović".istitle())

0 comments on commit 4c43ff3

Please sign in to comment.