From 56c4e915f5381d243dbda2af225c6338c05c1f70 Mon Sep 17 00:00:00 2001
From: David Zero <zero-one@zer0-one.net>
Date: Mon, 11 Sep 2023 20:44:40 -0400
Subject: [PATCH] deps: Remove ICU data dep

Builds ICU data into a static library as part of the build instead of
fetching a binary data release archive, and links the library into the
URL lib.
---
 .bazelrc                  |   4 +-
 .github/workflows/ci.yaml |   2 +-
 WORKSPACE                 |  16 ++--
 third_party/icu.BUILD     | 161 +++++++++++++++++++++++++++++++++++++-
 url/BUILD                 |  19 ++++-
 url/url.cpp               |  35 ---------
 6 files changed, 183 insertions(+), 54 deletions(-)

diff --git a/.bazelrc b/.bazelrc
index 47d3bb3a7..9e0359290 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -2,7 +2,6 @@
 # =========================================================
 
 build --enable_platform_specific_config
-build --test_env=HASTUR_ICU_DATA=external/icu-data/
 coverage --combined_report=lcov
 test --test_output=errors
 test --test_summary=terse
@@ -32,11 +31,14 @@ build:linux --cxxopt='-fno-rtti'
 build:linux --copt='-gdwarf-4'
 
 build:windows --enable_runfiles
+build:windows --action_env=LOCALAPPDATA # Quirk for running vswhere; remove when ICU is no longer needed.
+build:windows --action_env=ProgramData # Quirk for running vswhere; remove when ICU is no longer needed.
 build:windows --cxxopt='/std:c++latest'
 build:windows --cxxopt='/GR-' # Disable rtti.
 build:windows --copt='/permissive-' # Conform to the standard.
 build:windows --copt='/Zc:__cplusplus' # Report the real supported C++ version, not just C++98.
 build:windows --copt='-utf-8' # Use UTF-8 as the source and execution character sets.
+build:windows --host_copt='-utf-8' # Same UTF-8 setting as the line above, applied to tools compiled for the host/exec configuration.
 
 # Special build options
 # =========================================================
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 9b9cbe963..9b43bb092 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -173,7 +173,7 @@ jobs:
       # Include all targets except for
       # * py_test targets: not fully statically linked
       # * targets that depend on sfml: it pulls in host dependencies.
-      - run: ./bazelisk test -- $(bazel query '... except (kind("py_test", ...) union rdeps(..., @sfml//:window))')
+      - run: ./bazelisk test -- $(bazel query '... except (kind("py_test", ...) union rdeps(..., @sfml//:window) union rdeps(..., @icu//:common))')
       - name: Run tui
         run: |
           echo "<html><body><h1>Example</h1><p>This is an example page.</p></body></html>" >example.html
diff --git a/WORKSPACE b/WORKSPACE
index a3ac484d5..77f70301e 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -127,18 +127,12 @@ http_archive(
     patch_cmds = [
         "rm source/common/BUILD.bazel",
         "rm source/stubdata/BUILD.bazel",
+        "rm source/tools/toolutil/BUILD.bazel",
+        "rm source/i18n/BUILD.bazel",
     ],
-    sha256 = "818a80712ed3caacd9b652305e01afc7fa167e6f2e94996da44b90c2ab604ce1",
-    strip_prefix = "icu",
-    url = "https://github.com/unicode-org/icu/releases/download/release-73-2/icu4c-73_2-src.tgz",
-)
-
-# https://github.com/unicode-org/icu
-http_archive(
-    name = "icu-data",  # Unicode-DFS-2016
-    build_file_content = """exports_files(["icudt73l.dat"])""",
-    sha256 = "2657bd18c23b930ddf63f466192832cc083256515e07b5a5e7d79c5c1db058a1",
-    url = "https://github.com/unicode-org/icu/releases/download/release-73-2/icu4c-73_2-data-bin-l.zip",
+    sha256 = "4b6c4a79b0648d228d505601e58780a59e9ad4eaad54be75cc637bd635aa46d6",
+    strip_prefix = "icu-release-73-2/icu4c",
+    url = "https://github.com/unicode-org/icu/archive/refs/tags/release-73-2.zip",
 )
 
 # https://github.com/ocornut/imgui
diff --git a/third_party/icu.BUILD b/third_party/icu.BUILD
index bbec5c462..894f72b41 100644
--- a/third_party/icu.BUILD
+++ b/third_party/icu.BUILD
@@ -1,4 +1,4 @@
-load("@rules_cc//cc:defs.bzl", "cc_library")
+load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library")
 
 cc_library(
     name = "common",
@@ -9,6 +9,7 @@ cc_library(
         "source/stubdata/*.cpp",
     ]),
     hdrs = glob([
+        "source/common/*.h",
         "source/common/unicode/*.h",
     ]),
     copts = select({
@@ -33,10 +34,8 @@ cc_library(
     }),
     defines = [
         "U_STATIC_IMPLEMENTATION",
-        "U_COMMON_IMPLEMENTATION",
         "U_CHARSET_IS_UTF8=1",
         "U_HIDE_OBSOLETE_UTF_OLD_H=1",
-        "UCONFIG_NO_CONVERSION=1",
     ],
     linkopts = select({
         "@platforms//os:windows": [
@@ -45,6 +44,162 @@ cc_library(
         "//conditions:default": ["-ldl"],
     }),
     linkstatic = True,
+    local_defines = [
+        "U_COMMON_IMPLEMENTATION",
+    ],
     strip_include_prefix = "source/common/",
     visibility = ["//visibility:public"],
 )
+
+cc_library(
+    name = "toolutil",
+    srcs = glob(["source/tools/toolutil/*.cpp"]),
+    hdrs = glob(["source/tools/toolutil/*.h"]),
+    copts = select({
+        "@platforms//os:windows": [
+            "/GR",
+        ],
+        "//conditions:default": [
+            "-frtti",
+        ],
+    }),
+    linkstatic = True,
+    local_defines = ["U_TOOLUTIL_IMPLEMENTATION"] + select({
+        "@platforms//os:windows": [],
+        "//conditions:default": [
+            "U_ELF",
+        ],
+    }),
+    strip_include_prefix = "source/tools/toolutil",
+    visibility = ["//visibility:private"],
+    deps = [
+        ":common",
+        ":i18n",
+    ],
+)
+
+cc_library(
+    name = "i18n",
+    srcs = glob(["source/i18n/*.cpp"]),
+    hdrs = glob([
+        "source/i18n/*.h",
+        "source/i18n/unicode/*.h",
+    ]),
+    copts = select({
+        "@platforms//os:windows": [
+            "/GR",
+        ],
+        "//conditions:default": [
+            "-frtti",
+        ],
+    }),
+    linkstatic = True,
+    local_defines = [
+        "U_I18N_IMPLEMENTATION",
+    ],
+    strip_include_prefix = "source/i18n",
+    visibility = ["//visibility:private"],
+    deps = [":common"],
+)
+
+cc_binary(
+    name = "gensprep",
+    srcs = glob(["source/tools/gensprep/*.c"]) + ["source/tools/gensprep/gensprep.h"],
+    visibility = ["//visibility:private"],
+    deps = [
+        ":common",
+        ":i18n",
+        ":toolutil",
+    ],
+)
+
+SPREP_DATA = glob(["source/data/sprep/*.txt"])
+# Basename of each SPREP_DATA entry with "txt" replaced by "spp"; the same expression names the outs of the run_sprep_* genrules.
+SPREP_DATA_COMPILED = [s.replace("txt", "spp").rpartition("/")[2] for s in SPREP_DATA]
+
+filegroup(
+    name = "normalizations",
+    srcs = ["source/data/unidata/NormalizationCorrections.txt"],
+)
+
+[genrule(
+    name = "run_sprep_" + input.replace(".txt", "").rpartition("/")[2],  # One rule per SPREP_DATA file, named after the file's basename.
+    srcs = [input],
+    outs = [input.replace("txt", "spp").rpartition("/")[2]],  # Must produce the same names as SPREP_DATA_COMPILED.
+    cmd = "./$(location gensprep) --destdir $(RULEDIR) --bundle-name " + input.replace(".txt", "").rpartition("/")[2] + " --norm-correction external/icu/source/data/unidata/ --unicode 15.0.0 $<",  # NOTE(review): the --norm-correction directory is hard-coded and assumes this repository is laid out under external/icu.
+    tools = [
+        ":gensprep",
+        ":normalizations",  # NormalizationCorrections.txt; lives in the directory passed to --norm-correction.
+    ],
+    visibility = ["//visibility:private"],
+) for input in SPREP_DATA]
+
+genrule(
+    name = "create_pkgdata_lst",  # Writes pkgdata.lst: every name in SPREP_DATA_COMPILED followed by uts46.nrm, one per line.
+    srcs = SPREP_DATA_COMPILED,
+    outs = ["pkgdata.lst"],
+    cmd = "printf '%s\\n' " + " ".join(SPREP_DATA_COMPILED) + " uts46.nrm > $@",  # printf instead of the non-portable `echo -e`.
+)
+
+genrule(
+    name = "copy_uts46_nrm",  # Copies source/data/in/uts46.nrm into this package's output directory.
+    srcs = ["source/data/in/uts46.nrm"],
+    outs = ["uts46.nrm"],
+    cmd = "cp $< $@",
+)
+
+cc_binary(
+    name = "icupkg",
+    srcs = ["source/tools/icupkg/icupkg.cpp"],
+    visibility = ["//visibility:private"],
+    deps = [
+        ":common",
+        ":i18n",
+        ":toolutil",
+    ],
+)
+
+cc_binary(
+    name = "pkgdata",
+    srcs = [
+        "source/tools/pkgdata/pkgdata.cpp",
+        "source/tools/pkgdata/pkgtypes.c",
+        "source/tools/pkgdata/pkgtypes.h",
+    ],
+    visibility = ["//visibility:private"],
+    deps = [
+        ":common",
+        ":i18n",
+        ":toolutil",
+    ],
+)
+
+genrule(
+    name = "run_pkgdata",  # Runs pkgdata in static mode on the list file to produce libicudt73l.a.
+    srcs = [
+        "pkgdata.lst",  # Must stay first: cmd hands srcs[0] to pkgdata as the list file.
+        "uts46.nrm",
+    ] + SPREP_DATA_COMPILED,
+    outs = ["libicudt73l.a"],
+    cmd = r"""srcs=($(SRCS)); export PATH=$$PATH:$(location icupkg); $(location pkgdata) --entrypoint icudt73 --sourcedir $(RULEDIR) --destdir $(RULEDIR) --name icudt73l --mode static $${srcs[0]}""",  # NOTE(review): $(location icupkg) is the binary's own path, not a directory; confirm this PATH entry has the intended effect.
+    tools = [
+        ":icupkg",
+        ":pkgdata",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+genrule(
+    name = "run_pkgdata_windows",  # Windows variant of run_pkgdata; produces sicudt73l.lib.
+    srcs = [
+        "pkgdata.lst",  # Must stay first: cmd hands srcs[0] to pkgdata as the list file.
+        "uts46.nrm",
+    ] + SPREP_DATA_COMPILED,
+    outs = ["sicudt73l.lib"],
+    cmd = r"""srcs=($(SRCS)); export PATH=$$PATH:$(location icupkg):"/$$('C:\Program Files (x86)\Microsoft Visual Studio\Installer\vswhere.exe' -latest -prerelease -find '**\lib.exe' | grep x64 | grep -v llvm | head -n1 | awk -F '\' 'BEGIN{OFS=FS} {$$NF=""; print}' | tr -d ':' | tr '\134' '/')"; $(location pkgdata) --entrypoint icudt73 --sourcedir $(RULEDIR) --destdir $(RULEDIR) --name icudt73l --mode static $${srcs[0]}""",  # Appends to PATH the directory of the first x64, non-llvm lib.exe reported by vswhere, rewritten as a /c/...-style path; presumably pkgdata shells out to lib.exe (confirm).
+    tools = [
+        ":icupkg",
+        ":pkgdata",
+    ],
+    visibility = ["//visibility:public"],
+)
diff --git a/url/BUILD b/url/BUILD
index df4e278e9..80123cf04 100644
--- a/url/BUILD
+++ b/url/BUILD
@@ -1,7 +1,17 @@
-load("@rules_cc//cc:defs.bzl", "cc_library", "cc_test")
+load("@rules_cc//cc:defs.bzl", "cc_import", "cc_library", "cc_test")
 load("@rules_fuzzing//fuzzing:cc_defs.bzl", "cc_fuzz_test")
 load("//bzl:copts.bzl", "HASTUR_COPTS", "HASTUR_FUZZ_PLATFORMS")
 
+cc_import(
+    name = "icudata",  # Wraps the static ICU data library generated in the @icu repository so :url can link it.
+    static_library = select({
+        "@platforms//os:windows": "@icu//:sicudt73l.lib",
+        "//conditions:default": "@icu//:libicudt73l.a",
+    }),
+    visibility = ["//visibility:private"],
+    alwayslink = True,  # Link every object in the archive, whether or not it is referenced.
+)
+
 cc_library(
     name = "rtti_hack",
     srcs = ["rtti_hack.cpp"],
@@ -15,18 +25,20 @@ cc_library(
 
 cc_library(
     name = "url",
-    srcs = ["url.cpp"],
+    srcs = [
+        "url.cpp",
+    ],
     hdrs = ["url.h"],
     copts = HASTUR_COPTS,
-    data = ["@icu-data//:icudt73l.dat"],
     visibility = ["//visibility:public"],
     deps = [
+        ":icudata",
         ":rtti_hack",
         "//util:base_parser",
         "//util:string",
         "//util:unicode",
         "//util:uuid",
         "@icu//:common",
         "@spdlog",
     ],
 )
diff --git a/url/url.cpp b/url/url.cpp
index 3b988f9be..9eee0bd71 100644
--- a/url/url.cpp
+++ b/url/url.cpp
@@ -118,39 +118,6 @@ struct PercentEncodeSet {
     static constexpr bool component(char c) { return userinfo(c) || (c >= '$' && c <= '&') || c == '+' || c == ','; }
 };
 
-void icu_init() {
-    static std::atomic<bool> called_once = false;
-
-    if (called_once.exchange(true)) {
-        return;
-    }
-
-    // NOLINTNEXTLINE(concurrency-mt-unsafe): This is going away soon.
-    char *data = std::getenv("HASTUR_ICU_DATA");
-
-    if (data != nullptr) {
-        std::filesystem::path env_path{data};
-
-        if (std::filesystem::is_directory(env_path)) {
-            u_setDataDirectory(env_path.string().c_str());
-        }
-    } else {
-        // Use current working directory as a last resort.
-        // TODO(zero-one): Look at engine config for paths.
-        u_setDataDirectory(std::filesystem::current_path().string().c_str());
-    }
-
-    UErrorCode err = U_ZERO_ERROR;
-
-    std::uint32_t opts =
-            UIDNA_NONTRANSITIONAL_TO_ASCII | UIDNA_CHECK_BIDI | UIDNA_CHECK_CONTEXTJ | UIDNA_USE_STD3_RULES;
-
-    [[maybe_unused]] auto *uts = icu::IDNA::createUTS46Instance(opts, err);
-
-    assert(!U_FAILURE(err));
-
-    delete uts;
-}
 } // namespace
 
 void icu_cleanup() {
@@ -1189,8 +1156,6 @@ void UrlParser::state_fragment() {
 
 // https://url.spec.whatwg.org/#concept-domain-to-ascii
 std::optional<std::string> UrlParser::domain_to_ascii(std::string_view domain, bool be_strict) const {
-    icu_init();
-
     std::string ascii_domain;
     icu::StringByteSink<std::string> tmp{&ascii_domain};