Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[stdlib] Fix wrong Char posix whitespace designation #3983

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 36 additions & 11 deletions stdlib/src/collections/string/codepoint.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -386,30 +386,55 @@ struct Codepoint(CollectionElement, EqualityComparable, Intable, Stringable):
alias unicode_line_sep = Codepoint.from_u32(0x2028).value()
alias unicode_paragraph_sep = Codepoint.from_u32(0x2029).value()

return self.is_posix_space() or self in (
return self.is_ascii_space() or self in (
next_line,
unicode_line_sep,
unicode_paragraph_sep,
)

fn is_posix_space(self) -> Bool:
"""Returns True if this `Codepoint` is a **space** character according to the
[POSIX locale][1].
"""Returns True if this `Codepoint` is a **space** (aka. whitespace)
character according to the [POSIX locale](
https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html#tag_07_03_01
): `" \\t\\n\\v\\f\\r"`.

The POSIX locale is also known as the C locale.
Returns:
True iff the character is one of the whitespace characters listed
above.

Notes:
The POSIX locale is also known as the C locale.
"""

# ASCII char
var c = UInt8(Int(self))

# NOTE: a global LUT doesn't work at compile time so we can't use it here.
alias ` ` = UInt8(ord(" "))
alias `\t` = UInt8(ord("\t"))
alias `\n` = UInt8(ord("\n"))
alias `\r` = UInt8(ord("\r"))
alias `\f` = UInt8(ord("\f"))
alias `\v` = UInt8(ord("\v"))

[1]: https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html#tag_07_03_01
# This compiles to something very clever that's even faster than a LUT.
return self.is_ascii() and (
c == ` `
or c == `\t`
or c == `\n`
or c == `\r`
or c == `\f`
or c == `\v`
)

This only respects the default "C" locale, i.e. returns True only if the
character specified is one of " \\t\\n\\v\\f\\r". For semantics similar
to Python, use `String.isspace()`.
fn is_ascii_space(self) -> Bool:
"""Determines whether the given `Codepoint` is an ASCII whitespace
character: `" \\t\\n\\v\\f\\r\\x1c\\x1d\\x1e"`.

Returns:
True iff the character is one of the whitespace characters listed
above.
"""
if not self.is_ascii():
return False

# ASCII char
var c = UInt8(Int(self))
Expand All @@ -426,7 +451,7 @@ struct Codepoint(CollectionElement, EqualityComparable, Intable, Stringable):
alias `\x1e` = UInt8(ord("\x1e"))

# This compiles to something very clever that's even faster than a LUT.
return (
return self.is_ascii() and (
c == ` `
or c == `\t`
or c == `\n`
Expand Down
8 changes: 4 additions & 4 deletions stdlib/src/collections/string/string.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -285,13 +285,13 @@ fn atol(str_slice: StringSlice, base: Int = 10) raises -> Int:
elif ord_letter_min[1] <= ord_current <= ord_letter_max[1]:
result += ord_current - ord_letter_min[1] + 10
found_valid_chars_after_start = True
elif Codepoint(UInt8(ord_current)).is_posix_space():
elif Codepoint(UInt8(ord_current)).is_ascii_space():
has_space_after_number = True
start = pos + 1
break
else:
raise Error(_str_to_base_error(base, str_slice))
if pos + 1 < str_len and not Codepoint(buff[pos + 1]).is_posix_space():
if pos + 1 < str_len and not Codepoint(buff[pos + 1]).is_ascii_space():
var nextresult = result * real_base
if nextresult < result:
raise Error(
Expand All @@ -305,7 +305,7 @@ fn atol(str_slice: StringSlice, base: Int = 10) raises -> Int:

if has_space_after_number:
for pos in range(start, str_len):
if not Codepoint(buff[pos]).is_posix_space():
if not Codepoint(buff[pos]).is_ascii_space():
raise Error(_str_to_base_error(base, str_slice))
if is_negative:
result = -result
Expand All @@ -327,7 +327,7 @@ fn _trim_and_handle_sign(str_slice: StringSlice, str_len: Int) -> (Int, Bool):
"""
var buff = str_slice.unsafe_ptr()
var start: Int = 0
while start < str_len and Codepoint(buff[start]).is_posix_space():
while start < str_len and Codepoint(buff[start]).is_ascii_space():
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI this is much more inefficient than using the original _isspace() (what this method's implementation uses) directly since it is not transforming to UTF-32 on each iteration (it just needs to compare a byte). Branch prediction might lessen the effect at runtime but some penalty is still there in the instruction fetching pipeline. It's one of the motivations behind #3988

start += 1
var p: Bool = buff[start] == ord("+")
var n: Bool = buff[start] == ord("-")
Expand Down
4 changes: 2 additions & 2 deletions stdlib/src/collections/string/string_slice.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -1174,7 +1174,7 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]](
# break
# r_idx -= 1
while (
r_idx > 0 and Codepoint(self.as_bytes()[r_idx - 1]).is_posix_space()
r_idx > 0 and Codepoint(self.as_bytes()[r_idx - 1]).is_ascii_space()
):
r_idx -= 1
return Self(unsafe_from_utf8=self.as_bytes()[:r_idx])
Expand Down Expand Up @@ -1227,7 +1227,7 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]](
# l_idx += 1
while (
l_idx < self.byte_length()
and Codepoint(self.as_bytes()[l_idx]).is_posix_space()
and Codepoint(self.as_bytes()[l_idx]).is_ascii_space()
):
l_idx += 1
return Self(unsafe_from_utf8=self.as_bytes()[l_idx:])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,40 @@ def test_char_is_posix_space():

# Checking false cases
assert_false(Codepoint.ord("a").is_posix_space())
assert_false(Codepoint.ord("a").is_posix_space())
assert_false(Codepoint.ord("u").is_posix_space())
assert_false(Codepoint.ord("s").is_posix_space())
assert_false(Codepoint.ord("t").is_posix_space())
assert_false(Codepoint.ord("i").is_posix_space())
assert_false(Codepoint.ord("n").is_posix_space())
assert_false(Codepoint.ord("z").is_posix_space())
assert_false(Codepoint.ord(".").is_posix_space())
assert_false(Codepoint.ord("\x1c").is_posix_space())
assert_false(Codepoint.ord("\x1d").is_posix_space())
assert_false(Codepoint.ord("\x1e").is_posix_space())


def test_char_is_ascii_space():
# checking true cases
assert_true(Codepoint.ord(" ").is_ascii_space())
assert_true(Codepoint.ord("\n").is_ascii_space())
assert_true(Codepoint.ord("\n").is_ascii_space())
assert_true(Codepoint.ord("\t").is_ascii_space())
assert_true(Codepoint.ord("\r").is_ascii_space())
assert_true(Codepoint.ord("\v").is_ascii_space())
assert_true(Codepoint.ord("\f").is_ascii_space())
assert_true(Codepoint.ord("\x1c").is_ascii_space())
assert_true(Codepoint.ord("\x1d").is_ascii_space())
assert_true(Codepoint.ord("\x1e").is_ascii_space())

# Checking false cases
assert_false(Codepoint.ord("a").is_ascii_space())
assert_false(Codepoint.ord("u").is_ascii_space())
assert_false(Codepoint.ord("s").is_ascii_space())
assert_false(Codepoint.ord("t").is_ascii_space())
assert_false(Codepoint.ord("i").is_ascii_space())
assert_false(Codepoint.ord("n").is_ascii_space())
assert_false(Codepoint.ord("z").is_ascii_space())
assert_false(Codepoint.ord(".").is_ascii_space())


def test_char_is_lower():
Expand Down Expand Up @@ -238,6 +264,7 @@ def main():
test_char_formatting()
test_char_properties()
test_char_is_posix_space()
test_char_is_ascii_space()
test_char_is_lower()
test_char_is_upper()
test_char_is_digit()
Expand Down