From c29f8b5fec7c0e354644eff7c83e649cf2a4578d Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Sat, 1 Feb 2025 15:29:53 -0300 Subject: [PATCH 1/5] fix wrong whitespace designation Signed-off-by: martinvuyk --- stdlib/src/builtin/char.mojo | 47 ++++++++++++++----- stdlib/src/collections/string/string.mojo | 8 ++-- .../src/collections/string/string_slice.mojo | 4 +- stdlib/test/builtin/test_char.mojo | 29 ++++++++++++ 4 files changed, 71 insertions(+), 17 deletions(-) diff --git a/stdlib/src/builtin/char.mojo b/stdlib/src/builtin/char.mojo index 20d4b1c665..714e325373 100644 --- a/stdlib/src/builtin/char.mojo +++ b/stdlib/src/builtin/char.mojo @@ -359,30 +359,55 @@ struct Char(CollectionElement, EqualityComparable, Intable, Stringable): alias unicode_line_sep = Char.from_u32(0x2028).value() alias unicode_paragraph_sep = Char.from_u32(0x2029).value() - return self.is_posix_space() or self in ( + return self.is_ascii_space() or self in ( next_line, unicode_line_sep, unicode_paragraph_sep, ) fn is_posix_space(self) -> Bool: - """Returns True if this `Char` is a **space** character according to the - [POSIX locale][1]. + """Returns True if this `Char` is a **space** (aka. whitespace) + character according to the [POSIX locale]( + https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html#tag_07_03_01 + ): `" \\t\\n\\v\\f\\r"`. - The POSIX locale is also known as the C locale. + Returns: + True iff the character is one of the whitespace characters listed + above. + + Notes: + The POSIX locale is also known as the C locale. + """ + + # ASCII char + var c = UInt8(Int(self)) + + # NOTE: a global LUT doesn't work at compile time so we can't use it here. + alias ` ` = UInt8(ord(" ")) + alias `\t` = UInt8(ord("\t")) + alias `\n` = UInt8(ord("\n")) + alias `\r` = UInt8(ord("\r")) + alias `\f` = UInt8(ord("\f")) + alias `\v` = UInt8(ord("\v")) - [1]: https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html#tag_07_03_01 + # This compiles to something very clever that's even faster than a LUT. + return self.is_ascii() and ( + c == ` ` + or c == `\t` + or c == `\n` + or c == `\r` + or c == `\f` + or c == `\v` + ) - This only respects the default "C" locale, i.e. returns True only if the - character specified is one of " \\t\\n\\v\\f\\r". For semantics similar - to Python, use `String.isspace()`. + fn is_ascii_space(self) -> Bool: + """Determines whether the given character is an ASCII whitespace + character: `" \\t\\n\\v\\f\\r\\x1c\\x1d\\x1e"`. Returns: True iff the character is one of the whitespace characters listed above. """ - if not self.is_ascii(): - return False # ASCII char var c = UInt8(Int(self)) @@ -399,7 +424,7 @@ struct Char(CollectionElement, EqualityComparable, Intable, Stringable): alias `\x1e` = UInt8(ord("\x1e")) # This compiles to something very clever that's even faster than a LUT. - return ( + return self.is_ascii() and ( c == ` ` or c == `\t` or c == `\n` diff --git a/stdlib/src/collections/string/string.mojo b/stdlib/src/collections/string/string.mojo index 1b0fbe9a4e..85b58e8def 100644 --- a/stdlib/src/collections/string/string.mojo +++ b/stdlib/src/collections/string/string.mojo @@ -290,13 +290,13 @@ fn atol(str_slice: StringSlice, base: Int = 10) raises -> Int: elif ord_letter_min[1] <= ord_current <= ord_letter_max[1]: result += ord_current - ord_letter_min[1] + 10 found_valid_chars_after_start = True - elif Char(UInt8(ord_current)).is_posix_space(): + elif Char(UInt8(ord_current)).is_ascii_space(): has_space_after_number = True start = pos + 1 break else: raise Error(_str_to_base_error(base, str_slice)) - if pos + 1 < str_len and not Char(buff[pos + 1]).is_posix_space(): + if pos + 1 < str_len and not Char(buff[pos + 1]).is_ascii_space(): var nextresult = result * real_base if nextresult < result: raise Error( @@ -310,7 +310,7 @@ fn atol(str_slice: StringSlice, base: Int = 10) raises -> Int: if has_space_after_number: for pos in range(start, str_len): - if not Char(buff[pos]).is_posix_space(): + if not Char(buff[pos]).is_ascii_space(): raise Error(_str_to_base_error(base, str_slice)) if is_negative: result = -result @@ -332,7 +332,7 @@ fn _trim_and_handle_sign(str_slice: StringSlice, str_len: Int) -> (Int, Bool): """ var buff = str_slice.unsafe_ptr() var start: Int = 0 - while start < str_len and Char(buff[start]).is_posix_space(): + while start < str_len and Char(buff[start]).is_ascii_space(): start += 1 var p: Bool = buff[start] == ord("+") var n: Bool = buff[start] == ord("-") diff --git a/stdlib/src/collections/string/string_slice.mojo b/stdlib/src/collections/string/string_slice.mojo index f2ae7cd174..128ad00b2f 100644 --- a/stdlib/src/collections/string/string_slice.mojo +++ b/stdlib/src/collections/string/string_slice.mojo @@ -967,7 +967,7 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]]( # if not s.isspace(): # break # r_idx -= 1 - while r_idx > 0 and Char(self.as_bytes()[r_idx - 1]).is_posix_space(): + while r_idx > 0 and Char(self.as_bytes()[r_idx - 1]).is_ascii_space(): r_idx -= 1 return Self(unsafe_from_utf8=self.as_bytes()[:r_idx]) @@ -1019,7 +1019,7 @@ struct StringSlice[mut: Bool, //, origin: Origin[mut]]( # l_idx += 1 while ( l_idx < self.byte_length() - and Char(self.as_bytes()[l_idx]).is_posix_space() + and Char(self.as_bytes()[l_idx]).is_ascii_space() ): l_idx += 1 return Self(unsafe_from_utf8=self.as_bytes()[l_idx:]) diff --git a/stdlib/test/builtin/test_char.mojo b/stdlib/test/builtin/test_char.mojo index 2af35ad639..0016b19629 100644 --- a/stdlib/test/builtin/test_char.mojo +++ b/stdlib/test/builtin/test_char.mojo @@ -93,6 +93,34 @@ def test_char_is_posix_space(): assert_false(Char.ord("n").is_posix_space()) assert_false(Char.ord("z").is_posix_space()) assert_false(Char.ord(".").is_posix_space()) + assert_false(Char.ord("\x1c").is_posix_space()) + assert_false(Char.ord("\x1d").is_posix_space()) + assert_false(Char.ord("\x1e").is_posix_space()) + + +def test_char_is_ascii_space(): + # checking true cases + assert_true(Char.ord(" ").is_ascii_space()) + assert_true(Char.ord("\n").is_ascii_space()) + assert_true(Char.ord("\n").is_ascii_space()) + assert_true(Char.ord("\t").is_ascii_space()) + assert_true(Char.ord("\r").is_ascii_space()) + assert_true(Char.ord("\v").is_ascii_space()) + assert_true(Char.ord("\f").is_ascii_space()) + assert_true(Char.ord("\x1c").is_ascii_space()) + assert_true(Char.ord("\x1d").is_ascii_space()) + assert_true(Char.ord("\x1e").is_ascii_space()) + + # Checking false cases + assert_false(Char.ord("a").is_ascii_space()) + assert_false(Char.ord("a").is_ascii_space()) + assert_false(Char.ord("u").is_ascii_space()) + assert_false(Char.ord("s").is_ascii_space()) + assert_false(Char.ord("t").is_ascii_space()) + assert_false(Char.ord("i").is_ascii_space()) + assert_false(Char.ord("n").is_ascii_space()) + assert_false(Char.ord("z").is_ascii_space()) + assert_false(Char.ord(".").is_ascii_space()) def test_char_is_lower(): @@ -234,6 +262,7 @@ def main(): test_char_formatting() test_char_properties() test_char_is_posix_space() + test_char_is_ascii_space() test_char_is_lower() test_char_is_upper() test_char_is_digit() From f5c9ae5a06f766a6226eda63a7bbc8632744098c Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Fri, 21 Feb 2025 13:54:48 -0300 Subject: [PATCH 2/5] fix after merge Signed-off-by: martinvuyk --- stdlib/src/collections/string/codepoint.mojo | 5 ----- .../collections/{ => string}/test_codepoint.mojo | 14 -------------- 2 files changed, 19 deletions(-) rename stdlib/test/collections/{ => string}/test_codepoint.mojo (93%) diff --git a/stdlib/src/collections/string/codepoint.mojo b/stdlib/src/collections/string/codepoint.mojo index 14f9a143c5..fdaab9b37d 100644 --- a/stdlib/src/collections/string/codepoint.mojo +++ b/stdlib/src/collections/string/codepoint.mojo @@ -393,15 +393,10 @@ struct Codepoint(CollectionElement, EqualityComparable, Intable, Stringable): ) fn is_posix_space(self) -> Bool: -<<<<<<< HEAD:stdlib/src/builtin/char.mojo """Returns True if this `Char` is a **space** (aka. whitespace) character according to the [POSIX locale]( https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html#tag_07_03_01 ): `" \\t\\n\\v\\f\\r"`. -======= - """Returns True if this `Codepoint` is a **space** character according to the - [POSIX locale][1]. ->>>>>>> upstream/main:stdlib/src/collections/string/codepoint.mojo Returns: True iff the character is one of the whitespace characters listed diff --git a/stdlib/test/collections/test_codepoint.mojo b/stdlib/test/collections/string/test_codepoint.mojo similarity index 93% rename from stdlib/test/collections/test_codepoint.mojo rename to stdlib/test/collections/string/test_codepoint.mojo index 93af63b32a..d6ce711684 100644 --- a/stdlib/test/collections/test_codepoint.mojo +++ b/stdlib/test/collections/string/test_codepoint.mojo @@ -86,8 +86,6 @@ def test_char_is_posix_space(): assert_true(Codepoint.ord("\f").is_posix_space()) # Checking false cases -<<<<<<< HEAD:stdlib/test/builtin/test_char.mojo - assert_false(Char.ord("a").is_posix_space()) assert_false(Char.ord("a").is_posix_space()) assert_false(Char.ord("u").is_posix_space()) assert_false(Char.ord("s").is_posix_space()) @@ -116,7 +114,6 @@ def test_char_is_ascii_space(): # Checking false cases assert_false(Char.ord("a").is_ascii_space()) - assert_false(Char.ord("a").is_ascii_space()) assert_false(Char.ord("u").is_ascii_space()) assert_false(Char.ord("s").is_ascii_space()) assert_false(Char.ord("t").is_ascii_space()) @@ -124,17 +121,6 @@ def test_char_is_ascii_space(): assert_false(Char.ord("n").is_ascii_space()) assert_false(Char.ord("z").is_ascii_space()) assert_false(Char.ord(".").is_ascii_space()) -======= - assert_false(Codepoint.ord("a").is_posix_space()) - assert_false(Codepoint.ord("a").is_posix_space()) - assert_false(Codepoint.ord("u").is_posix_space()) - assert_false(Codepoint.ord("s").is_posix_space()) - assert_false(Codepoint.ord("t").is_posix_space()) - assert_false(Codepoint.ord("i").is_posix_space()) - assert_false(Codepoint.ord("n").is_posix_space()) - assert_false(Codepoint.ord("z").is_posix_space()) - assert_false(Codepoint.ord(".").is_posix_space()) ->>>>>>> upstream/main:stdlib/test/collections/test_codepoint.mojo def test_char_is_lower(): From 3690ad49dad9855931348bf9a7e5cdf8bfd92d73 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Fri, 21 Feb 2025 13:55:49 -0300 Subject: [PATCH 3/5] fix after merge Signed-off-by: martinvuyk --- .../collections/string/test_codepoint.mojo | 58 +++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/stdlib/test/collections/string/test_codepoint.mojo b/stdlib/test/collections/string/test_codepoint.mojo index d6ce711684..c3f3679527 100644 --- a/stdlib/test/collections/string/test_codepoint.mojo +++ b/stdlib/test/collections/string/test_codepoint.mojo @@ -86,41 +86,41 @@ def test_char_is_posix_space(): assert_true(Codepoint.ord("\f").is_posix_space()) # Checking false cases - assert_false(Char.ord("a").is_posix_space()) - assert_false(Char.ord("u").is_posix_space()) - assert_false(Char.ord("s").is_posix_space()) - assert_false(Char.ord("t").is_posix_space()) - assert_false(Char.ord("i").is_posix_space()) - assert_false(Char.ord("n").is_posix_space()) - assert_false(Char.ord("z").is_posix_space()) - assert_false(Char.ord(".").is_posix_space()) - assert_false(Char.ord("\x1c").is_posix_space()) - assert_false(Char.ord("\x1d").is_posix_space()) - assert_false(Char.ord("\x1e").is_posix_space()) + assert_false(Codepoint.ord("a").is_posix_space()) + assert_false(Codepoint.ord("u").is_posix_space()) + assert_false(Codepoint.ord("s").is_posix_space()) + assert_false(Codepoint.ord("t").is_posix_space()) + assert_false(Codepoint.ord("i").is_posix_space()) + assert_false(Codepoint.ord("n").is_posix_space()) + assert_false(Codepoint.ord("z").is_posix_space()) + assert_false(Codepoint.ord(".").is_posix_space()) + assert_false(Codepoint.ord("\x1c").is_posix_space()) + assert_false(Codepoint.ord("\x1d").is_posix_space()) + assert_false(Codepoint.ord("\x1e").is_posix_space()) def test_char_is_ascii_space(): # checking true cases - assert_true(Char.ord(" ").is_ascii_space()) - assert_true(Char.ord("\n").is_ascii_space()) - assert_true(Char.ord("\n").is_ascii_space()) - assert_true(Char.ord("\t").is_ascii_space()) - assert_true(Char.ord("\r").is_ascii_space()) - assert_true(Char.ord("\v").is_ascii_space()) - assert_true(Char.ord("\f").is_ascii_space()) - assert_true(Char.ord("\x1c").is_ascii_space()) - assert_true(Char.ord("\x1d").is_ascii_space()) - assert_true(Char.ord("\x1e").is_ascii_space()) + assert_true(Codepoint.ord(" ").is_ascii_space()) + assert_true(Codepoint.ord("\n").is_ascii_space()) + assert_true(Codepoint.ord("\n").is_ascii_space()) + assert_true(Codepoint.ord("\t").is_ascii_space()) + assert_true(Codepoint.ord("\r").is_ascii_space()) + assert_true(Codepoint.ord("\v").is_ascii_space()) + assert_true(Codepoint.ord("\f").is_ascii_space()) + assert_true(Codepoint.ord("\x1c").is_ascii_space()) + assert_true(Codepoint.ord("\x1d").is_ascii_space()) + assert_true(Codepoint.ord("\x1e").is_ascii_space()) # Checking false cases - assert_false(Char.ord("a").is_ascii_space()) - assert_false(Char.ord("u").is_ascii_space()) - assert_false(Char.ord("s").is_ascii_space()) - assert_false(Char.ord("t").is_ascii_space()) - assert_false(Char.ord("i").is_ascii_space()) - assert_false(Char.ord("n").is_ascii_space()) - assert_false(Char.ord("z").is_ascii_space()) - assert_false(Char.ord(".").is_ascii_space()) + assert_false(Codepoint.ord("a").is_ascii_space()) + assert_false(Codepoint.ord("u").is_ascii_space()) + assert_false(Codepoint.ord("s").is_ascii_space()) + assert_false(Codepoint.ord("t").is_ascii_space()) + assert_false(Codepoint.ord("i").is_ascii_space()) + assert_false(Codepoint.ord("n").is_ascii_space()) + assert_false(Codepoint.ord("z").is_ascii_space()) + assert_false(Codepoint.ord(".").is_ascii_space()) def test_char_is_lower(): From 6405bea61f60a58f9852d5aeb4d58440c5b8a471 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Fri, 21 Feb 2025 14:03:11 -0300 Subject: [PATCH 4/5] fix after merge Signed-off-by: martinvuyk --- stdlib/src/collections/string/codepoint.mojo | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stdlib/src/collections/string/codepoint.mojo b/stdlib/src/collections/string/codepoint.mojo index fdaab9b37d..5e5cc5f834 100644 --- a/stdlib/src/collections/string/codepoint.mojo +++ b/stdlib/src/collections/string/codepoint.mojo @@ -393,7 +393,7 @@ struct Codepoint(CollectionElement, EqualityComparable, Intable, Stringable): ) fn is_posix_space(self) -> Bool: - """Returns True if this `Char` is a **space** (aka. whitespace) + """Returns True if this `Codepoint` is a **space** (aka. whitespace) character according to the [POSIX locale]( https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap07.html#tag_07_03_01 ): `" \\t\\n\\v\\f\\r"`. @@ -428,7 +428,7 @@ struct Codepoint(CollectionElement, EqualityComparable, Intable, Stringable): ) fn is_ascii_space(self) -> Bool: - """Determines whether the given character is an ASCII whitespace + """Determines whether the given `Codepoint` is an ASCII whitespace character: `" \\t\\n\\v\\f\\r\\x1c\\x1d\\x1e"`. Returns: From 79c8fe52997642aa020b8830da6f07eef83d8085 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Fri, 28 Feb 2025 07:50:34 -0300 Subject: [PATCH 5/5] fix after merge Signed-off-by: martinvuyk --- .../test/collections/test_codepoint.mojo | 274 ------------------ 1 file changed, 274 deletions(-) delete mode 100644 mojo/stdlib/test/collections/test_codepoint.mojo diff --git a/mojo/stdlib/test/collections/test_codepoint.mojo b/mojo/stdlib/test/collections/test_codepoint.mojo deleted file mode 100644 index c3f3679527..0000000000 --- a/mojo/stdlib/test/collections/test_codepoint.mojo +++ /dev/null @@ -1,274 +0,0 @@ -# ===----------------------------------------------------------------------=== # -# Copyright (c) 2025, Modular Inc. All rights reserved. -# -# Licensed under the Apache License v2.0 with LLVM Exceptions: -# https://llvm.org/LICENSE.txt -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ===----------------------------------------------------------------------=== # -# RUN: %mojo %s - -from testing import assert_equal, assert_false, assert_not_equal, assert_true - - -def test_char_validity(): - # Check that basic unchecked constructor behaves as expected. - var c1 = Codepoint(unsafe_unchecked_codepoint=32) - assert_equal(c1._scalar_value, 32) - - assert_true(Codepoint.from_u32(0)) - - # For a visual intuition of what constitues a valid scalar value: - # https://connorgray.com/ephemera/project-log#2025-01-09 - - # Last valid code point in the smaller scalar value range. - assert_true(Codepoint.from_u32(0xD7FF)) - - # First surrogate code point, not valid. - assert_false(Codepoint.from_u32(0xD7FF + 1)) - - # Last surrogate code point, not valid - assert_false(Codepoint.from_u32(0xDFFF)) - - # First valid code point in the larger scalar value range. - assert_true(Codepoint.from_u32(0xE000)) - - # Beyond Unicode's committed range of code points. - assert_false(Codepoint.from_u32(0x10FFFF + 1)) - - -def test_char_from_u8(): - var c1 = Codepoint(UInt8(0)) - assert_true(c1.is_ascii()) - - # All non-negative 8-bit integers are codepoints, but not all are ASCII. - var c2 = Codepoint(UInt8(255)) - assert_false(c2.is_ascii()) - - -def test_char_comparison(): - assert_equal(Codepoint(0), Codepoint(0)) - assert_not_equal(Codepoint(0), Codepoint(1)) - - -def test_char_formatting(): - assert_equal(String(Codepoint(0)), "\0") - assert_equal(String(Codepoint(32)), " ") - assert_equal(String(Codepoint(97)), "a") - assert_equal(String(Codepoint.from_u32(0x00BE).value()), "¾") - assert_equal(String(Codepoint.from_u32(0x1F642).value()), "🙂") - - -def test_char_properties(): - assert_true(Codepoint.from_u32(0).value().is_ascii()) - # Last ASCII codepoint. - assert_true( - Codepoint.from_u32(0b0111_1111).value().is_ascii() - ) # ASCII 127 0x7F - - # First non-ASCII codepoint. - assert_false(Codepoint.from_u32(0b1000_0000).value().is_ascii()) - assert_false(Codepoint.from_u32(0b1111_1111).value().is_ascii()) - - -def test_char_is_posix_space(): - # checking true cases - assert_true(Codepoint.ord(" ").is_posix_space()) - assert_true(Codepoint.ord("\n").is_posix_space()) - assert_true(Codepoint.ord("\n").is_posix_space()) - assert_true(Codepoint.ord("\t").is_posix_space()) - assert_true(Codepoint.ord("\r").is_posix_space()) - assert_true(Codepoint.ord("\v").is_posix_space()) - assert_true(Codepoint.ord("\f").is_posix_space()) - - # Checking false cases - assert_false(Codepoint.ord("a").is_posix_space()) - assert_false(Codepoint.ord("u").is_posix_space()) - assert_false(Codepoint.ord("s").is_posix_space()) - assert_false(Codepoint.ord("t").is_posix_space()) - assert_false(Codepoint.ord("i").is_posix_space()) - assert_false(Codepoint.ord("n").is_posix_space()) - assert_false(Codepoint.ord("z").is_posix_space()) - assert_false(Codepoint.ord(".").is_posix_space()) - assert_false(Codepoint.ord("\x1c").is_posix_space()) - assert_false(Codepoint.ord("\x1d").is_posix_space()) - assert_false(Codepoint.ord("\x1e").is_posix_space()) - - -def test_char_is_ascii_space(): - # checking true cases - assert_true(Codepoint.ord(" ").is_ascii_space()) - assert_true(Codepoint.ord("\n").is_ascii_space()) - assert_true(Codepoint.ord("\n").is_ascii_space()) - assert_true(Codepoint.ord("\t").is_ascii_space()) - assert_true(Codepoint.ord("\r").is_ascii_space()) - assert_true(Codepoint.ord("\v").is_ascii_space()) - assert_true(Codepoint.ord("\f").is_ascii_space()) - assert_true(Codepoint.ord("\x1c").is_ascii_space()) - assert_true(Codepoint.ord("\x1d").is_ascii_space()) - assert_true(Codepoint.ord("\x1e").is_ascii_space()) - - # Checking false cases - assert_false(Codepoint.ord("a").is_ascii_space()) - assert_false(Codepoint.ord("u").is_ascii_space()) - assert_false(Codepoint.ord("s").is_ascii_space()) - assert_false(Codepoint.ord("t").is_ascii_space()) - assert_false(Codepoint.ord("i").is_ascii_space()) - assert_false(Codepoint.ord("n").is_ascii_space()) - assert_false(Codepoint.ord("z").is_ascii_space()) - assert_false(Codepoint.ord(".").is_ascii_space()) - - -def test_char_is_lower(): - assert_true(Codepoint.ord("a").is_ascii_lower()) - assert_true(Codepoint.ord("b").is_ascii_lower()) - assert_true(Codepoint.ord("y").is_ascii_lower()) - assert_true(Codepoint.ord("z").is_ascii_lower()) - - assert_false(Codepoint.from_u32(ord("a") - 1).value().is_ascii_lower()) - assert_false(Codepoint.from_u32(ord("z") + 1).value().is_ascii_lower()) - - assert_false(Codepoint.ord("!").is_ascii_lower()) - assert_false(Codepoint.ord("0").is_ascii_lower()) - - -def test_char_is_upper(): - assert_true(Codepoint.ord("A").is_ascii_upper()) - assert_true(Codepoint.ord("B").is_ascii_upper()) - assert_true(Codepoint.ord("Y").is_ascii_upper()) - assert_true(Codepoint.ord("Z").is_ascii_upper()) - - assert_false(Codepoint.from_u32(ord("A") - 1).value().is_ascii_upper()) - assert_false(Codepoint.from_u32(ord("Z") + 1).value().is_ascii_upper()) - - assert_false(Codepoint.ord("!").is_ascii_upper()) - assert_false(Codepoint.ord("0").is_ascii_upper()) - - -def test_char_is_digit(): - assert_true(Codepoint.ord("1").is_ascii_digit()) - assert_false(Codepoint.ord("g").is_ascii_digit()) - - # Devanagari Digit 6 — non-ASCII digits are not "ascii digit". - assert_false(Codepoint.ord("६").is_ascii_digit()) - - -def test_char_is_printable(): - assert_true(Codepoint.ord("a").is_ascii_printable()) - assert_false(Codepoint.ord("\n").is_ascii_printable()) - assert_false(Codepoint.ord("\t").is_ascii_printable()) - - # Non-ASCII characters are not considered "ascii printable". - assert_false(Codepoint.ord("स").is_ascii_printable()) - - -alias SIGNIFICANT_CODEPOINTS = List[Tuple[Int, List[Byte]]]( - # -------------------------- - # 1-byte (ASCII) codepoints - # -------------------------- - # Smallest 1-byte codepoint value - (0, List[Byte](0)), - (1, List[Byte](1)), - (32, List[Byte](32)), # First non-control character - (0b0111_1111, List[Byte](127)), # 127 - # ------------------ - # 2-byte codepoints -- 0b110x_xxxx 0b10xx_xxxx (11 x's) - # ------------------ - # Smallest 2-byte codepoint - (128, List[Byte](0b1100_0010, 0b1000_0000)), - # Largest 2-byte codepoint -- 2^11 - 1 == 2047 - (2**11 - 1, List[Byte](0b1101_1111, 0b1011_1111)), - # ------------------ - # 3-byte codepoints -- 0b1110_xxxx 0b10xx_xxxx 0b10xx_xxxx (16 x's) - # ------------------ - # Smallest 3-byte codepoint -- 2^11 == 2048 - (2**11, List[Byte](0b1110_0000, 0b1010_0000, 0b1000_0000)), - # Largest 3-byte codepoint -- 2^16 - 1 == 65535 == 0xFFFF - (2**16 - 1, List[Byte](0b1110_1111, 0b1011_1111, 0b1011_1111)), - # ------------------ - # 4-byte codepoints 0b1111_0xxx 0b10xx_xxxx 0b10xx_xxxx 0b10xx_xxxx (21 x's) - # ------------------ - # Smallest 4-byte codepoint - (2**16, List[Byte](0b1111_0000, 0b1001_0000, 0b1000_0000, 0b1000_0000)), - # Largest 4-byte codepoint -- Maximum Unicode codepoint - ( - 0x10FFFF, - List[Byte](0b1111_0100, 0b1000_1111, 0b1011_1111, 0b1011_1111), - ), -) - - -fn assert_utf8_bytes(codepoint: UInt32, owned expected: List[Byte]) raises: - var char_opt = Codepoint.from_u32(codepoint) - var char = char_opt.value() - - # Allocate a length-4 buffer to write to. - var buffer = List[Byte](0, 0, 0, 0) - var written = char.unsafe_write_utf8(buffer.unsafe_ptr()) - - # Check that the number of bytes written was as expected. - assert_equal( - written, - len(expected), - "wrong byte count written encoding codepoint: {}".format(codepoint), - ) - - # Normalize `expected` to length 4 so we can compare the written byte - # values with `buffer`. - for _ in range(4 - len(expected)): - expected.append(0) - - assert_equal( - buffer, - expected, - "wrong byte values written encoding codepoint: {}".format(codepoint), - ) - - -def test_char_utf8_encoding(): - for entry in SIGNIFICANT_CODEPOINTS: - var codepoint = entry[][0] - var expected_utf8 = entry[][1] - - assert_utf8_bytes(codepoint, expected_utf8) - - -def test_char_utf8_byte_length(): - for entry in SIGNIFICANT_CODEPOINTS: - var codepoint = entry[][0] - var expected_utf8 = entry[][1] - - var computed_len = Codepoint.from_u32( - codepoint - ).value().utf8_byte_length() - - assert_equal(computed_len, len(expected_utf8)) - - -def test_char_comptime(): - alias c1 = Codepoint.from_u32(32).value() - - # Test that `utf8_byte_length()` works at compile time. - alias c1_bytes = c1.utf8_byte_length() - assert_equal(c1_bytes, 1) - - -def main(): - test_char_validity() - test_char_from_u8() - test_char_comparison() - test_char_formatting() - test_char_properties() - test_char_is_posix_space() - test_char_is_ascii_space() - test_char_is_lower() - test_char_is_upper() - test_char_is_digit() - test_char_is_printable() - test_char_utf8_encoding() - test_char_utf8_byte_length() - test_char_comptime()