diff --git a/mojo/stdlib/src/collections/string/codepoint.mojo b/mojo/stdlib/src/collections/string/codepoint.mojo
index 1974d40318..af1e257e5e 100644
--- a/mojo/stdlib/src/collections/string/codepoint.mojo
+++ b/mojo/stdlib/src/collections/string/codepoint.mojo
@@ -387,30 +387,55 @@ struct Codepoint(CollectionElement, EqualityComparable, Intable, Stringable):
         alias unicode_line_sep = Codepoint.from_u32(0x2028).value()
         alias unicode_paragraph_sep = Codepoint.from_u32(0x2029).value()
 
-        return self.is_posix_space() or self in (
+        return self.is_ascii_space() or self in (
             next_line,
             unicode_line_sep,
             unicode_paragraph_sep,
         )
 
     fn is_posix_space(self) -> Bool:
-        """Returns True if this `Codepoint` is a **space** character according to the
-        [POSIX locale][1].
+        """Returns True if this `Codepoint` is a **space** (aka. whitespace)
+        character according to the [POSIX locale](
+        https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html#tag_07_03_01
+        ): `" \\t\\n\\v\\f\\r"`.
 
-        The POSIX locale is also known as the C locale.
+        Returns:
+            True iff the character is one of the whitespace characters listed
+            above.
+
+        Notes:
+            The POSIX locale is also known as the C locale.
+        """
+
+        # ASCII char
+        var c = UInt8(Int(self))
+
+        # NOTE: a global LUT doesn't work at compile time so we can't use it here.
+        alias ` ` = UInt8(ord(" "))
+        alias `\t` = UInt8(ord("\t"))
+        alias `\n` = UInt8(ord("\n"))
+        alias `\r` = UInt8(ord("\r"))
+        alias `\f` = UInt8(ord("\f"))
+        alias `\v` = UInt8(ord("\v"))
 
-        [1]: https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html#tag_07_03_01
+        # This compiles to something very clever that's even faster than a LUT.
+        return self.is_ascii() and (
+            c == ` `
+            or c == `\t`
+            or c == `\n`
+            or c == `\r`
+            or c == `\f`
+            or c == `\v`
+        )
 
-        This only respects the default "C" locale, i.e. returns True only if the
-        character specified is one of " \\t\\n\\v\\f\\r". For semantics similar
-        to Python, use `String.isspace()`.
+    fn is_ascii_space(self) -> Bool:
+        """Determines whether the given `Codepoint` is an ASCII whitespace
+        character: `" \\t\\n\\v\\f\\r\\x1c\\x1d\\x1e"`.
 
         Returns:
             True iff the character is one of the whitespace characters listed
             above.
         """
-        if not self.is_ascii():
-            return False
 
         # ASCII char
         var c = UInt8(Int(self))
@@ -427,7 +452,7 @@ struct Codepoint(CollectionElement, EqualityComparable, Intable, Stringable):
         alias `\x1e` = UInt8(ord("\x1e"))
 
         # This compiles to something very clever that's even faster than a LUT.
-        return (
+        return self.is_ascii() and (
             c == ` `
             or c == `\t`
             or c == `\n`
diff --git a/mojo/stdlib/src/collections/string/string.mojo b/mojo/stdlib/src/collections/string/string.mojo
index 85e121f004..966ad098ab 100644
--- a/mojo/stdlib/src/collections/string/string.mojo
+++ b/mojo/stdlib/src/collections/string/string.mojo
@@ -284,13 +284,13 @@ fn atol(str_slice: StringSlice, base: Int = 10) raises -> Int:
         elif ord_letter_min[1] <= ord_current <= ord_letter_max[1]:
             result += ord_current - ord_letter_min[1] + 10
             found_valid_chars_after_start = True
-        elif Codepoint(UInt8(ord_current)).is_posix_space():
+        elif Codepoint(UInt8(ord_current)).is_ascii_space():
             has_space_after_number = True
             start = pos + 1
             break
         else:
             raise Error(_str_to_base_error(base, str_slice))
-        if pos + 1 < str_len and not Codepoint(buff[pos + 1]).is_posix_space():
+        if pos + 1 < str_len and not Codepoint(buff[pos + 1]).is_ascii_space():
             var nextresult = result * real_base
             if nextresult < result:
                 raise Error(
@@ -304,7 +304,7 @@ fn atol(str_slice: StringSlice, base: Int = 10) raises -> Int:
 
     if has_space_after_number:
         for pos in range(start, str_len):
-            if not Codepoint(buff[pos]).is_posix_space():
+            if not Codepoint(buff[pos]).is_ascii_space():
                 raise Error(_str_to_base_error(base, str_slice))
     if is_negative:
         result = -result
@@ -326,7 +326,7 @@ fn _trim_and_handle_sign(str_slice: StringSlice, str_len: Int) -> (Int, Bool):
     """
     var buff = str_slice.unsafe_ptr()
     var start: Int = 0
-    while start < str_len and Codepoint(buff[start]).is_posix_space():
+    while start < str_len and Codepoint(buff[start]).is_ascii_space():
         start += 1
     var p: Bool = buff[start] == ord("+")
     var n: Bool = buff[start] == ord("-")
diff --git a/mojo/stdlib/src/collections/string/string_slice.mojo b/mojo/stdlib/src/collections/string/string_slice.mojo
index 7ec78dc993..cbaf232445 100644
--- a/mojo/stdlib/src/collections/string/string_slice.mojo
+++ b/mojo/stdlib/src/collections/string/string_slice.mojo
@@ -1188,7 +1188,7 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]](
         #         break
         #     r_idx -= 1
         while (
-            r_idx > 0 and Codepoint(self.as_bytes()[r_idx - 1]).is_posix_space()
+            r_idx > 0 and Codepoint(self.as_bytes()[r_idx - 1]).is_ascii_space()
         ):
             r_idx -= 1
         return Self(unsafe_from_utf8=self.as_bytes()[:r_idx])
@@ -1241,7 +1241,7 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]](
         #     l_idx += 1
         while (
             l_idx < self.byte_length()
-            and Codepoint(self.as_bytes()[l_idx]).is_posix_space()
+            and Codepoint(self.as_bytes()[l_idx]).is_ascii_space()
         ):
             l_idx += 1
         return Self(unsafe_from_utf8=self.as_bytes()[l_idx:])
diff --git a/mojo/stdlib/test/collections/test_codepoint.mojo b/mojo/stdlib/test/collections/string/test_codepoint.mojo
similarity index 86%
rename from mojo/stdlib/test/collections/test_codepoint.mojo
rename to mojo/stdlib/test/collections/string/test_codepoint.mojo
index 2b366a2ca1..c3f3679527 100644
--- a/mojo/stdlib/test/collections/test_codepoint.mojo
+++ b/mojo/stdlib/test/collections/string/test_codepoint.mojo
@@ -87,7 +87,6 @@ def test_char_is_posix_space():
 
     # Checking false cases
     assert_false(Codepoint.ord("a").is_posix_space())
-    assert_false(Codepoint.ord("a").is_posix_space())
     assert_false(Codepoint.ord("u").is_posix_space())
     assert_false(Codepoint.ord("s").is_posix_space())
     assert_false(Codepoint.ord("t").is_posix_space())
@@ -95,6 +94,32 @@ def test_char_is_posix_space():
     assert_false(Codepoint.ord("n").is_posix_space())
     assert_false(Codepoint.ord("z").is_posix_space())
     assert_false(Codepoint.ord(".").is_posix_space())
+    assert_false(Codepoint.ord("\x1c").is_posix_space())
+    assert_false(Codepoint.ord("\x1d").is_posix_space())
+    assert_false(Codepoint.ord("\x1e").is_posix_space())
+
+
+def test_char_is_ascii_space():
+    # checking true cases
+    assert_true(Codepoint.ord(" ").is_ascii_space())
+    assert_true(Codepoint.ord("\n").is_ascii_space())
+    assert_true(Codepoint.ord("\t").is_ascii_space())
+    assert_true(Codepoint.ord("\r").is_ascii_space())
+    assert_true(Codepoint.ord("\v").is_ascii_space())
+    assert_true(Codepoint.ord("\f").is_ascii_space())
+    assert_true(Codepoint.ord("\x1c").is_ascii_space())
+    assert_true(Codepoint.ord("\x1d").is_ascii_space())
+    assert_true(Codepoint.ord("\x1e").is_ascii_space())
+
+    # Checking false cases
+    assert_false(Codepoint.ord("a").is_ascii_space())
+    assert_false(Codepoint.ord("u").is_ascii_space())
+    assert_false(Codepoint.ord("s").is_ascii_space())
+    assert_false(Codepoint.ord("t").is_ascii_space())
+    assert_false(Codepoint.ord("i").is_ascii_space())
+    assert_false(Codepoint.ord("n").is_ascii_space())
+    assert_false(Codepoint.ord("z").is_ascii_space())
+    assert_false(Codepoint.ord(".").is_ascii_space())
 
 
 def test_char_is_lower():
@@ -238,6 +263,7 @@ def main():
     test_char_formatting()
     test_char_properties()
     test_char_is_posix_space()
+    test_char_is_ascii_space()
     test_char_is_lower()
     test_char_is_upper()
     test_char_is_digit()