modular · martinvuyk · Feb 1, 2025 · Feb 21, 2025 · Feb 21, 2025 · Feb 21, 2025
diff --git a/stdlib/src/collections/string/codepoint.mojo b/stdlib/src/collections/string/codepoint.mojo
@@ -386,30 +386,55 @@ struct Codepoint(CollectionElement, EqualityComparable, Intable, Stringable):
         alias unicode_line_sep = Codepoint.from_u32(0x2028).value()
         alias unicode_paragraph_sep = Codepoint.from_u32(0x2029).value()
 
-        return self.is_posix_space() or self in (
+        return self.is_ascii_space() or self in (
             next_line,
             unicode_line_sep,
             unicode_paragraph_sep,
         )
 
     fn is_posix_space(self) -> Bool:
-        """Returns True if this `Codepoint` is a **space** character according to the
-        [POSIX locale][1].
+        """Returns True if this `Codepoint` is a **space** (aka. whitespace)
+        character according to the [POSIX locale](
+        https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html#tag_07_03_01
+        ): `" \\t\\n\\v\\f\\r"`.
 
-        The POSIX locale is also known as the C locale.
+        Returns:
+            True iff the character is one of the whitespace characters listed
+            above.
+
+        Notes:
+            The POSIX locale is also known as the C locale.
+        """
+
+        # ASCII char
+        var c = UInt8(Int(self))
+
+        # NOTE: a global LUT doesn't work at compile time so we can't use it here.
+        alias ` ` = UInt8(ord(" "))
+        alias `\t` = UInt8(ord("\t"))
+        alias `\n` = UInt8(ord("\n"))
+        alias `\r` = UInt8(ord("\r"))
+        alias `\f` = UInt8(ord("\f"))
+        alias `\v` = UInt8(ord("\v"))
 
-        [1]: https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html#tag_07_03_01
+        # This compiles to something very clever that's even faster than a LUT.
+        return self.is_ascii() and (
+            c == ` `
+            or c == `\t`
+            or c == `\n`
+            or c == `\r`
+            or c == `\f`
+            or c == `\v`
+        )
 
-        This only respects the default "C" locale, i.e. returns True only if the
-        character specified is one of " \\t\\n\\v\\f\\r". For semantics similar
-        to Python, use `String.isspace()`.
+    fn is_ascii_space(self) -> Bool:
+        """Determines whether the given `Codepoint` is an ASCII whitespace
+        character: `" \\t\\n\\v\\f\\r\\x1c\\x1d\\x1e"`.
 
         Returns:
             True iff the character is one of the whitespace characters listed
             above.
         """
-        if not self.is_ascii():
-            return False
 
         # ASCII char
         var c = UInt8(Int(self))
@@ -426,7 +451,7 @@ struct Codepoint(CollectionElement, EqualityComparable, Intable, Stringable):
         alias `\x1e` = UInt8(ord("\x1e"))
 
         # This compiles to something very clever that's even faster than a LUT.
-        return (
+        return self.is_ascii() and (
             c == ` `
             or c == `\t`
             or c == `\n`

diff --git a/stdlib/src/collections/string/string.mojo b/stdlib/src/collections/string/string.mojo
@@ -285,13 +285,13 @@ fn atol(str_slice: StringSlice, base: Int = 10) raises -> Int:
         elif ord_letter_min[1] <= ord_current <= ord_letter_max[1]:
             result += ord_current - ord_letter_min[1] + 10
             found_valid_chars_after_start = True
-        elif Codepoint(UInt8(ord_current)).is_posix_space():
+        elif Codepoint(UInt8(ord_current)).is_ascii_space():
             has_space_after_number = True
             start = pos + 1
             break
         else:
             raise Error(_str_to_base_error(base, str_slice))
-        if pos + 1 < str_len and not Codepoint(buff[pos + 1]).is_posix_space():
+        if pos + 1 < str_len and not Codepoint(buff[pos + 1]).is_ascii_space():
             var nextresult = result * real_base
             if nextresult < result:
                 raise Error(
@@ -305,7 +305,7 @@ fn atol(str_slice: StringSlice, base: Int = 10) raises -> Int:
 
     if has_space_after_number:
         for pos in range(start, str_len):
-            if not Codepoint(buff[pos]).is_posix_space():
+            if not Codepoint(buff[pos]).is_ascii_space():
                 raise Error(_str_to_base_error(base, str_slice))
     if is_negative:
         result = -result
@@ -327,7 +327,7 @@ fn _trim_and_handle_sign(str_slice: StringSlice, str_len: Int) -> (Int, Bool):
     """
     var buff = str_slice.unsafe_ptr()
     var start: Int = 0
-    while start < str_len and Codepoint(buff[start]).is_posix_space():
+    while start < str_len and Codepoint(buff[start]).is_ascii_space():
         start += 1
     var p: Bool = buff[start] == ord("+")
     var n: Bool = buff[start] == ord("-")

diff --git a/stdlib/src/collections/string/string_slice.mojo b/stdlib/src/collections/string/string_slice.mojo
@@ -1174,7 +1174,7 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]](
         #         break
         #     r_idx -= 1
         while (
-            r_idx > 0 and Codepoint(self.as_bytes()[r_idx - 1]).is_posix_space()
+            r_idx > 0 and Codepoint(self.as_bytes()[r_idx - 1]).is_ascii_space()
         ):
             r_idx -= 1
         return Self(unsafe_from_utf8=self.as_bytes()[:r_idx])
@@ -1227,7 +1227,7 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]](
         #     l_idx += 1
         while (
             l_idx < self.byte_length()
-            and Codepoint(self.as_bytes()[l_idx]).is_posix_space()
+            and Codepoint(self.as_bytes()[l_idx]).is_ascii_space()
         ):
             l_idx += 1
         return Self(unsafe_from_utf8=self.as_bytes()[l_idx:])

diff --git a/stdlib/test/collections/test_codepoint.mojo → ...st/collections/string/test_codepoint.mojo b/stdlib/test/collections/test_codepoint.mojo → ...st/collections/string/test_codepoint.mojo
@@ -87,14 +87,40 @@ def test_char_is_posix_space():
 
     # Checking false cases
     assert_false(Codepoint.ord("a").is_posix_space())
-    assert_false(Codepoint.ord("a").is_posix_space())
     assert_false(Codepoint.ord("u").is_posix_space())
     assert_false(Codepoint.ord("s").is_posix_space())
     assert_false(Codepoint.ord("t").is_posix_space())
     assert_false(Codepoint.ord("i").is_posix_space())
     assert_false(Codepoint.ord("n").is_posix_space())
     assert_false(Codepoint.ord("z").is_posix_space())
     assert_false(Codepoint.ord(".").is_posix_space())
+    assert_false(Codepoint.ord("\x1c").is_posix_space())
+    assert_false(Codepoint.ord("\x1d").is_posix_space())
+    assert_false(Codepoint.ord("\x1e").is_posix_space())
+
+
+def test_char_is_ascii_space():
+    # checking true cases
+    assert_true(Codepoint.ord(" ").is_ascii_space())
+    assert_true(Codepoint.ord("\n").is_ascii_space())
+    assert_true(Codepoint.ord("\n").is_ascii_space())
+    assert_true(Codepoint.ord("\t").is_ascii_space())
+    assert_true(Codepoint.ord("\r").is_ascii_space())
+    assert_true(Codepoint.ord("\v").is_ascii_space())
+    assert_true(Codepoint.ord("\f").is_ascii_space())
+    assert_true(Codepoint.ord("\x1c").is_ascii_space())
+    assert_true(Codepoint.ord("\x1d").is_ascii_space())
+    assert_true(Codepoint.ord("\x1e").is_ascii_space())
+
+    # Checking false cases
+    assert_false(Codepoint.ord("a").is_ascii_space())
+    assert_false(Codepoint.ord("u").is_ascii_space())
+    assert_false(Codepoint.ord("s").is_ascii_space())
+    assert_false(Codepoint.ord("t").is_ascii_space())
+    assert_false(Codepoint.ord("i").is_ascii_space())
+    assert_false(Codepoint.ord("n").is_ascii_space())
+    assert_false(Codepoint.ord("z").is_ascii_space())
+    assert_false(Codepoint.ord(".").is_ascii_space())
 
 
 def test_char_is_lower():
@@ -238,6 +264,7 @@ def main():
     test_char_formatting()
     test_char_properties()
     test_char_is_posix_space()
+    test_char_is_ascii_space()
     test_char_is_lower()
     test_char_is_upper()
     test_char_is_digit()