diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index ccd420f17..ce564e2b0 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -3,12 +3,12 @@ Related issue = # +

Documentation | 简体中文文档 | Contributor Guidelines

[![AppVeyor Status](https://img.shields.io/appveyor/build/yuanming-hu/taichi?logo=AppVeyor&label=AppVeyor)](https://ci.appveyor.com/project/yuanming-hu/taichi/branch/master) @@ -13,9 +13,9 @@ **Taichi** (太极) is a programming language designed for *high-performance computer graphics*. It is deeply embedded in **Python**, and its **just-in-time compiler** offloads compute-intensive tasks to multi-core CPUs and massively parallel GPUs. - + -Advanced features of Taichi include [spatially sparse computing](https://taichi.readthedocs.io/en/latest/sparse.html) and [differentiable programming](https://taichi.readthedocs.io/en/latest/differentiable_programming.html) [[examples]](https://github.com/yuanming-hu/difftaichi). +Advanced features of Taichi include [spatially sparse computing](https://docs.taichi.graphics/docs/lang/articles/advanced/sparse) and [differentiable programming](https://docs.taichi.graphics/docs/lang/articles/advanced/differentiable_programming) [[examples]](https://github.com/yuanming-hu/difftaichi). **Please check out our SIGGRAPH 2020 course on Taichi basics:** [YouTube](https://youtu.be/Y0-76n3aZFA), [Bilibili](https://www.bilibili.com/video/BV1kA411n7jk/), [slides (pdf)](https://yuanming.taichi.graphics/publication/2020-taichi-tutorial/taichi-tutorial.pdf). @@ -23,10 +23,10 @@ Advanced features of Taichi include [spatially sparse computing](https://taichi. ## Examples ([More...](misc/examples.md)) - - - - + + + + ## Installation [![Downloads](https://pepy.tech/badge/taichi)](https://pepy.tech/project/taichi) @@ -67,3 +67,7 @@ If you use Taichi in your research, please cite our papers: - [Taichi Conference](https://github.com/taichi-dev/taichicon): Taichi developer conferences. - [GAMES 201 Lectures](https://github.com/taichi-dev/games201): (Chinese) A hands-on course on building advanced physics engines, based on Taichi. - [More...](misc/links.md) + +## Security + +Please disclose security issues responsibly by contacting contact@taichi.graphics. 
diff --git a/cmake/PythonNumpyPybind11.cmake b/cmake/PythonNumpyPybind11.cmake index bd351900a..5957afc79 100644 --- a/cmake/PythonNumpyPybind11.cmake +++ b/cmake/PythonNumpyPybind11.cmake @@ -14,7 +14,13 @@ endif () if (WIN32) execute_process(COMMAND where ${PYTHON_EXECUTABLE} - OUTPUT_VARIABLE PYTHON_EXECUTABLE_PATH) + OUTPUT_VARIABLE PYTHON_EXECUTABLE_PATHS) + if (${PYTHON_EXECUTABLE_PATHS}) + string(FIND ${PYTHON_EXECUTABLE_PATHS} "\n" _LINE_BREAK_LOC) + string(SUBSTRING ${PYTHON_EXECUTABLE_PATHS} 0 ${_LINE_BREAK_LOC} PYTHON_EXECUTABLE_PATH) + else () + set(PYTHON_EXECUTABLE_PATH ${PYTHON_EXECUTABLE}) + endif () else () execute_process(COMMAND which ${PYTHON_EXECUTABLE} OUTPUT_VARIABLE PYTHON_EXECUTABLE_PATH) diff --git a/cmake/TaichiCXXFlags.cmake b/cmake/TaichiCXXFlags.cmake index e6f629163..030d58c6a 100644 --- a/cmake/TaichiCXXFlags.cmake +++ b/cmake/TaichiCXXFlags.cmake @@ -44,11 +44,14 @@ if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64" OR "${CMAKE_SYSTEM_PROCESSOR}" message("Setting -march=nehalem for x86_64 processors") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=nehalem -DTI_ARCH_x64") endif() + set(ARCH "x64") elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64" OR "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm64") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_ARCH_ARM") + set(ARCH "arm64") else() message(FATAL_ERROR "Unknown processor type ${CMAKE_SYSTEM_PROCESSOR}") endif() +set(HOST_ARCH ${ARCH} CACHE INTERNAL "Host arch") if (USE_STDCPP) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++") diff --git a/cmake/TaichiTests.cmake b/cmake/TaichiTests.cmake index 4404a1cda..d8f594474 100644 --- a/cmake/TaichiTests.cmake +++ b/cmake/TaichiTests.cmake @@ -23,6 +23,15 @@ include_directories( ) add_executable(${TESTS_NAME} ${TAICHI_TESTS_SOURCE}) +if (WIN32) + # Output the executable to bin/ instead of build/Debug/... 
+ set(TESTS_OUTPUT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/bin") + set_target_properties(${TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${TESTS_OUTPUT_DIR}) + set_target_properties(${TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG ${TESTS_OUTPUT_DIR}) + set_target_properties(${TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE ${TESTS_OUTPUT_DIR}) + set_target_properties(${TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_MINSIZEREL ${TESTS_OUTPUT_DIR}) + set_target_properties(${TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELWITHDEBINFO ${TESTS_OUTPUT_DIR}) +endif() target_link_libraries(${TESTS_NAME} taichi_isolated_core) target_link_libraries(${TESTS_NAME} gtest_main) diff --git a/docs/lang/api/reference/_category_.json b/docs/lang/api/reference/_category_.json new file mode 100644 index 000000000..ec4123c51 --- /dev/null +++ b/docs/lang/api/reference/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "API Reference", + "position": 99 +} diff --git a/docs/lang/articles/advanced/_category_.json b/docs/lang/articles/advanced/_category_.json new file mode 100644 index 000000000..84fc16f93 --- /dev/null +++ b/docs/lang/articles/advanced/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Advanced Programming", + "position": 3 +} diff --git a/docs/lang/articles/basic/_category_.json b/docs/lang/articles/basic/_category_.json new file mode 100644 index 000000000..60b7ef558 --- /dev/null +++ b/docs/lang/articles/basic/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Taichi Language Basic Concepts", + "position": 2 +} diff --git a/docs/lang/articles/contribution/_category_.json b/docs/lang/articles/contribution/_category_.json new file mode 100644 index 000000000..74136a441 --- /dev/null +++ b/docs/lang/articles/contribution/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Contribution Guide", + "position": 5 +} diff --git a/docs/lang/articles/contribution/contributor_guide.md b/docs/lang/articles/contribution/contributor_guide.md index 7c1adb64c..b1d8d62ff 100644 --- 
a/docs/lang/articles/contribution/contributor_guide.md +++ b/docs/lang/articles/contribution/contributor_guide.md @@ -166,7 +166,7 @@ This design is terrible. `yapf v0.29.0` locally before you use `ti format`. - If you don't have these formatting tools locally, feel free to - leverage GitHub actions: simply comment `\format` in a PR + leverage GitHub actions: simply comment `/format` in a PR (e.g., [#2481](https://github.com/taichi-dev/taichi/pull/2481#issuecomment-872226701)) and then [Taichi Gardener](https://github.com/taichi-gardener) will automatically format the code for you. diff --git a/docs/lang/articles/contribution/life_of_kernel_lowres.jpg b/docs/lang/articles/contribution/life_of_kernel_lowres.jpg new file mode 100644 index 000000000..0f7dbc332 Binary files /dev/null and b/docs/lang/articles/contribution/life_of_kernel_lowres.jpg differ diff --git a/docs/lang/articles/contribution/writing_cpp_tests.md b/docs/lang/articles/contribution/writing_cpp_tests.md index d41a42037..14792216d 100644 --- a/docs/lang/articles/contribution/writing_cpp_tests.md +++ b/docs/lang/articles/contribution/writing_cpp_tests.md @@ -21,6 +21,11 @@ make ./taichi_cpp_tests ``` +:::note +On Windows, `taichi_cpp_tests.exe` will be placed inside the `%TAICHI_REPO_DIR%\bin` directory, +so you can directly run it after adding the directory to `PATH` in [Setting up Taichi for development](dev_install.md#setting-up-taichi-for-development-3). 
+::: + :::note Consider polishing the CPP test infrastructure: diff --git a/docs/lang/articles/misc/_category_.json b/docs/lang/articles/misc/_category_.json new file mode 100644 index 000000000..da3cf0599 --- /dev/null +++ b/docs/lang/articles/misc/_category_.json @@ -0,0 +1,4 @@ +{ + "label": "Miscellaneous Topics", + "position": 4 +} diff --git a/examples/rendering/cornell_box.py b/examples/rendering/cornell_box.py index 94e916733..9560b09ee 100644 --- a/examples/rendering/cornell_box.py +++ b/examples/rendering/cornell_box.py @@ -1,6 +1,7 @@ import time import numpy as np +from numpy.lib.function_base import average import taichi as ti @@ -8,6 +9,7 @@ res = (800, 800) color_buffer = ti.Vector.field(3, dtype=ti.f32, shape=res) count_var = ti.field(ti.i32, shape=(1, )) +tonemapped_buffer = ti.Vector.field(3, dtype=ti.f32, shape=res) max_ray_depth = 10 eps = 1e-4 @@ -482,18 +484,34 @@ def render(): count_var[0] = (count_var[0] + 1) % (stratify_res * stratify_res) -gui = ti.GUI('Cornell Box', res) +@ti.kernel +def tonemap(accumulated: ti.f32) -> ti.f32: + sum = 0.0 + sum_sq = 0.0 + for i, j in color_buffer: + luma = color_buffer[i, j][0] * 0.2126 + color_buffer[ + i, j][1] * 0.7152 + color_buffer[i, j][2] * 0.0722 + sum += luma + sum_sq += ti.pow(luma / accumulated, 2.0) + mean = sum / (res[0] * res[1]) + var = sum_sq / (res[0] * res[1]) - ti.pow(mean / accumulated, 2.0) + for i, j in tonemapped_buffer: + tonemapped_buffer[i, j] = ti.sqrt(color_buffer[i, j] / mean * 0.6) + return var + + +gui = ti.GUI('Cornell Box', res, fast_gui=True) +gui.fps_limit = 300 last_t = time.time() i = 0 while gui.running: render() interval = 10 - if i % interval == 0 and i > 0: - img = color_buffer.to_numpy() * (1 / (i + 1)) - img = np.sqrt(img / img.mean() * 0.24) + if i % interval == 0: + var = tonemap(i) print("{:.2f} samples/s ({} iters, var={})".format( - interval / (time.time() - last_t), i, np.var(img))) + interval / (time.time() - last_t), i, var)) last_t = time.time() - 
gui.set_image(img) + gui.set_image(tonemapped_buffer) gui.show() i += 1 diff --git a/examples/simulation/mpm128.py b/examples/simulation/mpm128.py index a32bf420e..8dffdd7ff 100644 --- a/examples/simulation/mpm128.py +++ b/examples/simulation/mpm128.py @@ -148,7 +148,9 @@ def reset(): attractor_strength[None] = -1 for s in range(int(2e-3 // dt)): substep() - colors = np.array([0x068587, 0xED553B, 0xEEEEF0], dtype=np.uint32) - gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()]) + gui.circles(x.to_numpy(), + radius=1.5, + palette=[0x068587, 0xED553B, 0xEEEEF0], + palette_indices=material) gui.show( ) # Change to gui.show(f'{frame:06d}.png') to write images to disk diff --git a/examples/simulation/mpm99.py b/examples/simulation/mpm99.py index 9a9d8fe59..cf71c3877 100644 --- a/examples/simulation/mpm99.py +++ b/examples/simulation/mpm99.py @@ -122,7 +122,9 @@ def initialize(): while not gui.get_event(ti.GUI.ESCAPE, ti.GUI.EXIT): for s in range(int(2e-3 // dt)): substep() - colors = np.array([0x068587, 0xED553B, 0xEEEEF0], dtype=np.uint32) - gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()]) + gui.circles(x.to_numpy(), + radius=1.5, + palette=[0x068587, 0xED553B, 0xEEEEF0], + palette_indices=material) gui.show( ) # Change to gui.show(f'{frame:06d}.png') to write images to disk diff --git a/misc/ci_setup.py b/misc/ci_setup.py index 24868c76e..1e27fa94e 100644 --- a/misc/ci_setup.py +++ b/misc/ci_setup.py @@ -257,12 +257,11 @@ def run(self): execute_command('echo $PYTHONPATH') elif get_os_name() != 'win': # compile .. - os.makedirs('build', exist_ok=True) arg = environ.get('CI_SETUP_CMAKE_ARGS', '') + os.makedirs('build', exist_ok=True) execute_command( - f'cd build && cmake .. 
-DPYTHON_EXECUTABLE={sys.executable} {arg}' + f'TAICHI_CMAKE_ARGS="{arg}" {sys.executable} setup.py install --user' ) - execute_command('cd build && make -j 10') return if test_installation(): print(' Successfully Installed Taichi at {}.'.format( diff --git a/python/.gitignore b/python/.gitignore index 7d9f12eb9..12389e8d4 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -1,5 +1,7 @@ lib taichi.egg-info taichi/include -setup.py +taichi/examples +taichi/assets +taichi/tests release diff --git a/python/MANIFEST.in b/python/MANIFEST.in deleted file mode 100644 index c8c81c5e8..000000000 --- a/python/MANIFEST.in +++ /dev/null @@ -1,12 +0,0 @@ -include MANIFEST.in -include *.txt -include *.py -include *.cfg -include taichi/*.md -include taichi/assets/* -include taichi/lib/*.so -include taichi/lib/*.pyd -include taichi/lib/*.bc - -global-exclude *.pyc *.pyo -global-exclude ffmpeg diff --git a/python/build.py b/python/build.py index ffcd10a40..34a33a58c 100644 --- a/python/build.py +++ b/python/build.py @@ -5,8 +5,6 @@ import shutil import sys -import taichi as ti - def get_os_name(): name = platform.platform() @@ -26,95 +24,33 @@ def get_python_executable(): def build(project_name): - """Build and package the wheel file in `python/dist`""" + """Build and package the wheel file in root `dist` dir""" if platform.system() == 'Linux': if re.search("^clang\+\+-*\d*", str(os.environ.get('CXX'))) is None: raise RuntimeError( 'Only the wheel with clang will be released to PyPI') - version = ti.core.get_version_string() - with open('../setup.py') as fin: - with open('setup.py', 'w') as fout: - print("project_name = '{}'".format(project_name), file=fout) - print("version = '{}'".format(version), file=fout) - for l in fin: - print(l, file=fout, end='') - - print("*** project_name = '{}'".format(project_name)) - - try: - os.remove('taichi/CHANGELOG.md') - except FileNotFoundError: - pass - shutil.rmtree('taichi/lib', ignore_errors=True) - shutil.rmtree('taichi/tests', 
ignore_errors=True) - shutil.rmtree('taichi/examples', ignore_errors=True) - shutil.rmtree('taichi/assets', ignore_errors=True) - os.makedirs('taichi/lib', exist_ok=True) - shutil.rmtree('build', ignore_errors=True) - shutil.rmtree('dist', ignore_errors=True) - shutil.rmtree('taichi/include', ignore_errors=True) - # shutil.copytree('../include/', 'taichi/include') - build_dir = '../build' - - if get_os_name() == 'linux': - shutil.copy('../build/libtaichi_core.so', 'taichi/lib/taichi_core.so') - elif get_os_name() == 'osx': - shutil.copy('../build/libtaichi_core.dylib', - 'taichi/lib/taichi_core.so') - else: - shutil.copy('../runtimes/RelWithDebInfo/taichi_core.dll', - 'taichi/lib/taichi_core.pyd') - - os.system(f'cd .. && {get_python_executable()} -m taichi changelog --save') - - try: - with open('../CHANGELOG.md') as f: - print(f.read()) - except FileNotFoundError: - print('CHANGELOG.md not found') - pass - - try: - shutil.copy('../CHANGELOG.md', './taichi/CHANGELOG.md') - except FileNotFoundError: - pass - shutil.copytree('../tests/python', './taichi/tests') - shutil.copytree('../examples', './taichi/examples') - shutil.copytree('../external/assets', './taichi/assets') - - if get_os_name() != 'osx': - libdevice_path = ti.core.libdevice_path() - print("copying libdevice:", libdevice_path) - assert os.path.exists(libdevice_path) - shutil.copy(libdevice_path, 'taichi/lib/slim_libdevice.10.bc') - - ti.core.compile_runtimes() - runtime_dir = ti.core.get_runtime_dir() - for f in os.listdir(runtime_dir): - if f.startswith('runtime_') and f.endswith('.bc'): - print(f"Fetching runtime file {f}") - shutil.copy(os.path.join(runtime_dir, f), 'taichi/lib') print("Using python executable", get_python_executable()) os.system( '{} -m pip install --user --upgrade twine setuptools wheel'.format( get_python_executable())) + os.system(f'{get_python_executable()} -m taichi changelog --save') + if get_os_name() == 'linux': - os.system('{} setup.py bdist_wheel -p 
manylinux1_x86_64'.format( - get_python_executable())) + os.system( + f'cd ..; PROJECT_NAME={project_name} {get_python_executable()} setup.py bdist_wheel -p manylinux1_x86_64' + ) else: - os.system('{} setup.py bdist_wheel'.format(get_python_executable())) + os.system( + f'cd ..; PROJECT_NAME={project_name} {get_python_executable()} setup.py bdist_wheel' + ) - shutil.rmtree('taichi/lib') - shutil.rmtree('taichi/tests') - shutil.rmtree('taichi/examples') - shutil.rmtree('taichi/assets') try: os.remove('taichi/CHANGELOG.md') except FileNotFoundError: pass - shutil.rmtree('./build') + shutil.rmtree('../build') def parse_args(): @@ -149,6 +85,8 @@ def main(): env_pypi_pwd = os.environ.get('PYPI_PWD', '') + shutil.rmtree('../dist', ignore_errors=True) + if mode == 'try_upload': if env_pypi_pwd == '': print("Missing environment variable PYPI_PWD") @@ -174,16 +112,18 @@ def main(): get_python_executable(), pypi_repo, pypi_user)) elif mode == 'test': print('Uninstalling old taichi packages...') - os.system(f'{get_python_executable()} -m pip uninstall taichi-nightly') - os.system(f'{get_python_executable()} -m pip uninstall taichi') - dists = os.listdir('dist') + os.system( + f'{get_python_executable()} -m pip uninstall -y taichi-nightly') + os.system(f'{get_python_executable()} -m pip uninstall -y taichi') + dists = os.listdir('../dist') assert len(dists) == 1 dist = dists[0] print('Installing ', dist) os.environ['PYTHONPATH'] = '' os.makedirs('test_env', exist_ok=True) - os.system('cd test_env && {} -m pip install ../dist/{} --user'.format( - get_python_executable(), dist)) + os.system( + 'cd test_env && {} -m pip install ../../dist/{} --user'.format( + get_python_executable(), dist)) print('Entering test environment...') if get_os_name() == 'win': os.system( diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py index 97678d9c0..dcfd0edec 100644 --- a/python/taichi/lang/__init__.py +++ b/python/taichi/lang/__init__.py @@ -404,7 +404,8 @@ def 
visit(node): from taichi.lang.meta import clear_gradients clear_gradients(places) - visit(ti.root) + for root_fb in FieldsBuilder.finalized_roots(): + visit(root_fb) def benchmark(func, repeat=300, args=()): diff --git a/python/taichi/lang/expr.py b/python/taichi/lang/expr.py index b999bc202..62386afe2 100644 --- a/python/taichi/lang/expr.py +++ b/python/taichi/lang/expr.py @@ -141,6 +141,10 @@ def snode(self): def __hash__(self): return self.ptr.get_raw_address() + @property + def name(self): + return self.snode.name + @property def shape(self): if self.ptr.is_external_var(): diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py index e3d010be0..819b0e5aa 100644 --- a/python/taichi/lang/impl.py +++ b/python/taichi/lang/impl.py @@ -11,6 +11,7 @@ from taichi.lang.util import (cook_dtype, is_taichi_class, python_scope, taichi_scope) from taichi.misc.util import deprecated, get_traceback, warning +from taichi.snode.fields_builder import FieldsBuilder import taichi as ti @@ -127,6 +128,14 @@ def subscript(value, *indices): raise TypeError( 'Subscription (e.g., "a[i, j]") only works on fields or external arrays.' ) + if not value.ptr.is_external_var() and value.ptr.snode() is None: + if not value.ptr.is_primal(): + raise RuntimeError( + f"Gradient {value.ptr.get_expr_name()} has not been placed, check whether `needs_grad=True`" + ) + else: + raise RuntimeError( + f"{value.ptr.get_expr_name()} has not been placed.") field_dim = int(value.ptr.get_attribute("dim")) else: # When reading bit structure we only support the 0-D case for now. 
@@ -237,16 +246,20 @@ def create_program(self): if self.prog is None: self.prog = _ti_core.Program() + def materialize_root_fb(self, first): + if (not root.finalized and not root.empty) or first: + root.finalize() + + if root.finalized: + global _root_fb + _root_fb = FieldsBuilder() + def materialize(self): + self.materialize_root_fb(not self.materialized) + if self.materialized: return - print('[Taichi] materializing...') - self.create_program() - - if not root.finalized: - root.finalize() - self.materialized = True not_placed = [] for var in self.global_vars: @@ -424,7 +437,7 @@ def var(dt, shape=None, offset=None, needs_grad=False): @python_scope -def field(dtype, shape=None, offset=None, needs_grad=False): +def field(dtype, shape=None, name="", offset=None, needs_grad=False): _taichi_skip_traceback = 1 dtype = cook_dtype(dtype) @@ -449,6 +462,7 @@ def field(dtype, shape=None, offset=None, needs_grad=False): x = Expr(_ti_core.make_id_expr("")) x.declaration_tb = get_traceback(stacklevel=2) x.ptr = _ti_core.global_new(x.ptr, dtype) + x.ptr.set_name(name) x.ptr.set_is_primal(True) pytaichi.global_vars.append(x) @@ -456,6 +470,7 @@ def field(dtype, shape=None, offset=None, needs_grad=False): # adjoint x_grad = Expr(_ti_core.make_id_expr("")) x_grad.ptr = _ti_core.global_new(x_grad.ptr, dtype) + x_grad.ptr.set_name(name + ".grad") x_grad.ptr.set_is_primal(False) x.set_grad(x_grad) @@ -502,7 +517,13 @@ def vars2entries(vars): if hasattr(var, '__ti_repr__'): res = var.__ti_repr__() elif isinstance(var, (list, tuple)): - res = list_ti_repr(var) + res = var + # If the first element is '__ti_format__', this list is the result of ti_format. 
+ if len(var) > 0 and isinstance( + var[0], str) and var[0] == '__ti_format__': + res = var[1:] + else: + res = list_ti_repr(var) else: yield var continue @@ -537,6 +558,35 @@ def fused_string(entries): _ti_core.create_print(contentries) +@taichi_scope +def ti_format(*args): + content = args[0] + mixed = args[1:] + new_mixed = [] + args = [] + for x in mixed: + if isinstance(x, ti.Expr): + new_mixed.append('{}') + args.append(x) + else: + new_mixed.append(x) + + try: + content = content.format(*new_mixed) + except ValueError: + print('Number formatting is not supported with Taichi fields') + exit(1) + res = content.split('{}') + assert len(res) == len( + args + ) + 1, 'Number of args is different from number of positions provided in string' + + for i in range(len(args)): + res.insert(i * 2 + 1, args[i]) + res.insert(0, '__ti_format__') + return res + + @taichi_scope def ti_assert(cond, msg, extra_args): # Mostly a wrapper to help us convert from Expr (defined in Python) to diff --git a/python/taichi/lang/kernel_impl.py b/python/taichi/lang/kernel_impl.py index eb2a07c38..2f648e4ec 100644 --- a/python/taichi/lang/kernel_impl.py +++ b/python/taichi/lang/kernel_impl.py @@ -351,8 +351,7 @@ def materialize(self, key=None, args=None, arg_features=None): _taichi_skip_traceback = 1 if key is None: key = (self.func, 0) - if not self.runtime.materialized: - self.runtime.materialize() + self.runtime.materialize() if key in self.compiled_functions: return grad_suffix = "" diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py index 46187c54f..42362f1b8 100644 --- a/python/taichi/lang/matrix.py +++ b/python/taichi/lang/matrix.py @@ -585,6 +585,10 @@ def shape(self): def dim(self): return len(self.shape) + @property + def name(self): + return self.loop_range().name + @property def dtype(self): return self.loop_range().dtype @@ -807,6 +811,7 @@ def field(cls, m, dtype, shape=None, + name="", offset=None, needs_grad=False, layout=None): # TODO(archibate): 
deprecate layout @@ -825,17 +830,17 @@ def field(cls, dtype ) == n, f'Please set correct dtype list for Vector. The shape of dtype list should be ({n}, ) instead of {np.shape(dtype)}' for i in range(n): - self.entries.append(impl.field(dtype[i])) + self.entries.append(impl.field(dtype[i], name=name)) else: assert len(np.shape(dtype)) == 2 and len(dtype) == n and len( dtype[0] ) == m, f'Please set correct dtype list for Matrix. The shape of dtype list should be ({n}, {m}) instead of {np.shape(dtype)}' for i in range(n): for j in range(m): - self.entries.append(impl.field(dtype[i][j])) + self.entries.append(impl.field(dtype[i][j], name=name)) else: for _ in range(n * m): - self.entries.append(impl.field(dtype)) + self.entries.append(impl.field(dtype, name=name)) self.grad = self.make_grad() if layout is not None: diff --git a/python/taichi/lang/meta.py b/python/taichi/lang/meta.py index 0a20bb365..73250fd41 100644 --- a/python/taichi/lang/meta.py +++ b/python/taichi/lang/meta.py @@ -1,26 +1,28 @@ from taichi.core import settings from taichi.lang import impl from taichi.lang.expr import Expr +from taichi.lang.kernel_arguments import ext_arr, template +from taichi.lang.kernel_impl import kernel import taichi as ti # A set of helper (meta)functions -@ti.kernel -def fill_tensor(tensor: ti.template(), val: ti.template()): +@kernel +def fill_tensor(tensor: template(), val: template()): for I in ti.grouped(tensor): tensor[I] = val -@ti.kernel -def tensor_to_ext_arr(tensor: ti.template(), arr: ti.ext_arr()): +@kernel +def tensor_to_ext_arr(tensor: template(), arr: ext_arr()): for I in ti.grouped(tensor): arr[I] = tensor[I] -@ti.kernel -def vector_to_fast_image(img: ti.template(), out: ti.ext_arr()): +@kernel +def vector_to_fast_image(img: template(), out: ext_arr()): # FIXME: Why is ``for i, j in img:`` slower than: for i, j in ti.ndrange(*img.shape): r, g, b = 0, 0, 0 @@ -45,8 +47,8 @@ def vector_to_fast_image(img: ti.template(), out: ti.ext_arr()): out[idx] = (b << 16) + 
(g << 8) + r + alpha -@ti.kernel -def tensor_to_image(tensor: ti.template(), arr: ti.ext_arr()): +@kernel +def tensor_to_image(tensor: template(), arr: ext_arr()): for I in ti.grouped(tensor): t = ti.cast(tensor[I], ti.f32) arr[I, 0] = t @@ -54,8 +56,8 @@ def tensor_to_image(tensor: ti.template(), arr: ti.ext_arr()): arr[I, 2] = t -@ti.kernel -def vector_to_image(mat: ti.template(), arr: ti.ext_arr()): +@kernel +def vector_to_image(mat: template(), arr: ext_arr()): for I in ti.grouped(mat): for p in ti.static(range(mat.n)): arr[I, p] = ti.cast(mat[I][p], ti.f32) @@ -63,21 +65,20 @@ def vector_to_image(mat: ti.template(), arr: ti.ext_arr()): arr[I, 2] = 0 -@ti.kernel -def tensor_to_tensor(tensor: ti.template(), other: ti.template()): +@kernel +def tensor_to_tensor(tensor: template(), other: template()): for I in ti.grouped(tensor): tensor[I] = other[I] -@ti.kernel -def ext_arr_to_tensor(arr: ti.ext_arr(), tensor: ti.template()): +@kernel +def ext_arr_to_tensor(arr: ext_arr(), tensor: template()): for I in ti.grouped(tensor): tensor[I] = arr[I] -@ti.kernel -def matrix_to_ext_arr(mat: ti.template(), arr: ti.ext_arr(), - as_vector: ti.template()): +@kernel +def matrix_to_ext_arr(mat: template(), arr: ext_arr(), as_vector: template()): for I in ti.grouped(mat): for p in ti.static(range(mat.n)): for q in ti.static(range(mat.m)): @@ -87,9 +88,8 @@ def matrix_to_ext_arr(mat: ti.template(), arr: ti.ext_arr(), arr[I, p, q] = mat[I][p, q] -@ti.kernel -def ext_arr_to_matrix(arr: ti.ext_arr(), mat: ti.template(), - as_vector: ti.template()): +@kernel +def ext_arr_to_matrix(arr: ext_arr(), mat: template(), as_vector: template()): for I in ti.grouped(mat): for p in ti.static(range(mat.n)): for q in ti.static(range(mat.m)): @@ -99,36 +99,36 @@ def ext_arr_to_matrix(arr: ti.ext_arr(), mat: ti.template(), mat[I][p, q] = arr[I, p, q] -@ti.kernel -def clear_gradients(vars: ti.template()): +@kernel +def clear_gradients(vars: template()): for I in ti.grouped(Expr(vars[0])): for s in 
ti.static(vars): Expr(s)[I] = 0 -@ti.kernel -def clear_loss(l: ti.template()): +@kernel +def clear_loss(l: template()): # Using SNode writers would result in a forced sync, therefore we wrap these # writes into a kernel. l[None] = 0 l.grad[None] = 1 -@ti.kernel -def fill_matrix(mat: ti.template(), vals: ti.template()): +@kernel +def fill_matrix(mat: template(), vals: template()): for I in ti.grouped(mat): for p in ti.static(range(mat.n)): for q in ti.static(range(mat.m)): mat[I][p, q] = vals[p][q] -@ti.kernel -def snode_deactivate(b: ti.template()): +@kernel +def snode_deactivate(b: template()): for I in ti.grouped(b): ti.deactivate(b, I) -@ti.kernel -def snode_deactivate_dynamic(b: ti.template()): +@kernel +def snode_deactivate_dynamic(b: template()): for I in ti.grouped(b.parent()): ti.deactivate(b, I) diff --git a/python/taichi/lang/ops.py b/python/taichi/lang/ops.py index 6ea9fa84a..4ceac5579 100644 --- a/python/taichi/lang/ops.py +++ b/python/taichi/lang/ops.py @@ -526,7 +526,6 @@ def external_func_call(func, args=[], outputs=[]): def asm(source, inputs=[], outputs=[]): - _ti_core.insert_external_func_call(0, source, make_expr_group(inputs), make_expr_group(outputs)) @@ -567,11 +566,11 @@ def rescale_index(a, b, I): """ assert isinstance(a, Expr) and a.is_global(), \ - f"first arguement must be a field" + f"first arguement must be a field" assert isinstance(b, Expr) and b.is_global(), \ - f"second arguement must be a field" + f"second arguement must be a field" assert isinstance(I, matrix.Matrix) and not I.is_global(), \ - f"third arguement must be a grouped index" + f"third arguement must be a grouped index" Ib = I.copy() for n in range(min(I.n, min(len(a.shape), len(b.shape)))): if a.shape[n] > b.shape[n]: diff --git a/python/taichi/lang/snode.py b/python/taichi/lang/snode.py index 7086d3f88..e782cbde2 100644 --- a/python/taichi/lang/snode.py +++ b/python/taichi/lang/snode.py @@ -140,6 +140,10 @@ def get_shape(self, i): def loop_range(self): return 
Expr(_ti_core.global_var_expr_from_snode(self.ptr)) + @property + def name(self): + return self.ptr.name() + @deprecated('x.snode()', 'x.snode') def __call__(self): # TODO: remove this after v0.7.0 return self diff --git a/python/taichi/lang/transformer.py b/python/taichi/lang/transformer.py index a3a52b9ef..b256b8e92 100644 --- a/python/taichi/lang/transformer.py +++ b/python/taichi/lang/transformer.py @@ -4,6 +4,7 @@ from taichi.lang import impl from taichi.lang.ast_resolver import ASTResolver from taichi.lang.exception import TaichiSyntaxError +from taichi.lang.kernel_arguments import ext_arr, template from taichi.lang.util import to_taichi_type import taichi as ti @@ -625,6 +626,11 @@ def visit_Call(self, node): if not ASTResolver.resolve_to(node.func, ti.static, globals()): # Do not apply the generic visitor if the function called is ti.static self.generic_visit(node) + if isinstance(node.func, ast.Attribute): + attr_name = node.func.attr + if attr_name == 'format': + node.args.insert(0, node.func.value) + node.func = self.parse_expr('ti.ti_format') if isinstance(node.func, ast.Name): func_name = node.func.id if func_name == 'print': @@ -679,9 +685,9 @@ def transform_as_kernel(): for i, arg in enumerate(args.args): # Directly pass in template arguments, # such as class instances ("self"), fields, SNodes, etc. - if isinstance(self.func.argument_annotations[i], ti.template): + if isinstance(self.func.argument_annotations[i], template): continue - if isinstance(self.func.argument_annotations[i], ti.ext_arr): + if isinstance(self.func.argument_annotations[i], ext_arr): arg_init = self.parse_stmt( 'x = ti.lang.kernel_arguments.decl_ext_arr_arg(0, 0)') arg_init.targets[0].id = arg.arg @@ -725,8 +731,7 @@ def transform_as_kernel(): for i, arg in enumerate(args.args): # Directly pass in template arguments, # such as class instances ("self"), fields, SNodes, etc. 
- if isinstance(self.func.argument_annotations[i], - ti.template): + if isinstance(self.func.argument_annotations[i], template): continue # Create a copy for non-template arguments, # so that they are passed by value. diff --git a/python/taichi/misc/gui.py b/python/taichi/misc/gui.py index 073f8c280..deb69e9b1 100644 --- a/python/taichi/misc/gui.py +++ b/python/taichi/misc/gui.py @@ -210,7 +210,12 @@ def set_image(self, img): def circle(self, pos, color=0xFFFFFF, radius=1): self.canvas.circle_single(pos[0], pos[1], color, radius) - def circles(self, pos, color=0xFFFFFF, radius=1): + def circles(self, + pos, + radius=1, + color=0xFFFFFF, + palette=None, + palette_indices=None): n = pos.shape[0] if len(pos.shape) == 3: assert pos.shape[2] == 1 @@ -235,6 +240,40 @@ def circles(self, pos, color=0xFFFFFF, radius=1): raise ValueError( 'Color must be an ndarray or int (e.g., 0x956333)') + if palette is not None: + assert palette_indices is not None, 'palette must be used together with palette_indices' + + from taichi.lang.expr import Expr + + if isinstance(palette_indices, Expr): + ind_int = palette_indices.to_numpy().astype(np.uint32) + elif isinstance(palette_indices, list) or isinstance( + palette_indices, np.ndarray): + ind_int = np.array(palette_indices).astype(np.uint32) + else: + try: + ind_int = np.array(palette_indices) + except: + raise TypeError( + 'palette_indices must be a type that can be converted to numpy.ndarray' + ) + + assert issubclass( + ind_int.dtype.type, + np.integer), 'palette_indices must be an integer array' + assert ind_int.shape == ( + n, + ), 'palette_indices must be in 1-d shape with shape (num_particles, )' + assert min( + ind_int + ) >= 0, 'the min of palette_indices must not be less than zero' + assert max(ind_int) < len( + palette + ), 'the max of palette_indices must not exceed the length of palette' + color_array = np.array(palette, dtype=np.uint32)[ind_int] + color_array = np.ascontiguousarray(color_array) + color_array = 
color_array.ctypes.data + if isinstance(radius, np.ndarray): assert radius.shape == (n, ) radius = np.ascontiguousarray(radius.astype(np.float32)) diff --git a/python/taichi/snode/fields_builder.py b/python/taichi/snode/fields_builder.py index 41d688ae8..f87209d1c 100644 --- a/python/taichi/snode/fields_builder.py +++ b/python/taichi/snode/fields_builder.py @@ -5,6 +5,7 @@ from taichi.core.util import ti_core as _ti_core from taichi.lang import impl, snode from taichi.lang.exception import InvalidOperationError +from taichi.misc.util import warning _snode_registry = _ti_core.SNodeRegistry() @@ -37,6 +38,16 @@ def __init__(self): self._ptr = _snode_registry.create_root() self._root = snode.SNode(self._ptr) self._finalized = False + self._empty = True + + @classmethod + def finalized_roots(cls): + roots_ptr = [] + size = impl.get_runtime().prog.get_snode_tree_size() + for i in range(size): + res = impl.get_runtime().prog.get_snode_root(i) + roots_ptr.append(snode.SNode(res)) + return roots_ptr @property def ptr(self): @@ -46,16 +57,26 @@ def ptr(self): def root(self): return self._root + @property + def empty(self): + return self._empty + + @property + def finalized(self): + return self._finalized + def dense(self, indices: Union[Sequence[_Axis], _Axis], dimensions: Union[Sequence[int], int]): """Same as :func:`taichi.SNode.dense`""" self._check_not_finalized() + self._empty = False return self._root.dense(indices, dimensions) def pointer(self, indices: Union[Sequence[_Axis], _Axis], dimensions: Union[Sequence[int], int]): """Same as :func:`taichi.SNode.pointer`""" self._check_not_finalized() + self._empty = False return self._root.pointer(indices, dimensions) def hash(self, indices, dimensions): @@ -67,23 +88,27 @@ def dynamic(self, chunk_size: Optional[int] = None): """Same as :func:`taichi.SNode.dynamic`""" self._check_not_finalized() + self._empty = False return self._root.dynamic(index, dimension, chunk_size) def bitmasked(self, indices: Union[Sequence[_Axis], 
_Axis], dimensions: Union[Sequence[int], int]): """Same as :func:`taichi.SNode.bitmasked`""" self._check_not_finalized() + self._empty = False return self._root.bitmasked(indices, dimensions) def bit_struct(self, num_bits: int): """Same as :func:`taichi.SNode.bit_struct`""" self._check_not_finalized() + self._empty = False return self._root.bit_struct(num_bits) def bit_array(self, indices: Union[Sequence[_Axis], _Axis], dimensions: Union[Sequence[int], int], num_bits: int): """Same as :func:`taichi.SNode.bit_array`""" self._check_not_finalized() + self._empty = False return self._root.bit_array(indices, dimensions, num_bits) def place(self, @@ -92,17 +117,21 @@ def place(self, shared_exponent: bool = False): """Same as :func:`taichi.SNode.place`""" self._check_not_finalized() + self._empty = False self._root.place(*args, offset=offset, shared_exponent=shared_exponent) def lazy_grad(self): """Same as :func:`taichi.SNode.lazy_grad`""" # TODO: This complicates the implementation. Figure out why we need this self._check_not_finalized() + self._empty = False self._root.lazy_grad() def finalize(self): """Constructs the SNodeTree and finalizes this builder.""" self._check_not_finalized() + if self._empty: + warning("Finalizing an empty FieldsBuilder!") _ti_core.finalize_snode_tree(_snode_registry, self._ptr, impl.get_runtime().prog) self._finalized = True @@ -110,7 +139,3 @@ def finalize(self): def _check_not_finalized(self): if self._finalized: raise InvalidOperationError('FieldsBuilder finalized') - - @property - def finalized(self): - return self._finalized diff --git a/python/taichi/torch_io.py b/python/taichi/torch_io.py index 30794729a..8a800a151 100644 --- a/python/taichi/torch_io.py +++ b/python/taichi/torch_io.py @@ -1,14 +1,15 @@ -import taichi as ti +from taichi.lang.kernel_arguments import ext_arr, template +from taichi.lang.kernel_impl import kernel -@ti.kernel -def from_torch_template(expr: ti.template(), torch_tensor: ti.ext_arr()): +@kernel +def 
from_torch_template(expr: template(), torch_tensor: ext_arr()): for i in expr: expr[i] = torch_tensor[i] -@ti.kernel -def to_torch_template(expr: ti.template(), torch_tensor: ti.ext_arr()): +@kernel +def to_torch_template(expr: template(), torch_tensor: ext_arr()): for i in expr: torch_tensor[i] = expr[i] diff --git a/python/setup.cfg b/setup.cfg similarity index 100% rename from python/setup.cfg rename to setup.cfg diff --git a/setup.py b/setup.py index cad2a6d6d..06f5a41f7 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,15 @@ import glob +import multiprocessing +import os +import platform +import shutil +import subprocess +import sys -import setuptools +from setuptools import Extension, find_packages, setup +from setuptools.command.build_ext import build_ext +from setuptools.command.build_py import build_py +from setuptools.command.egg_info import egg_info classifiers = [ 'Development Status :: 2 - Pre-Alpha', @@ -16,35 +25,187 @@ 'Programming Language :: Python :: 3.9', ] +project_name = os.getenv('PROJECT_NAME', 'taichi') +TI_VERSION_MAJOR = 0 +TI_VERSION_MINOR = 7 +TI_VERSION_PATCH = 27 +version = f'{TI_VERSION_MAJOR}.{TI_VERSION_MINOR}.{TI_VERSION_PATCH}' + data_files = glob.glob('python/lib/*') print(data_files) -packages = setuptools.find_packages() + ['taichi.examples'] +packages = find_packages('python') print(packages) -setuptools.setup(name=project_name, - packages=packages, - version=version, - description='The Taichi Programming Language', - author='Taichi developers', - author_email='yuanmhu@gmail.com', - url='https://github.com/taichi-dev/taichi', - install_requires=[ - 'numpy', - 'pybind11>=2.5.0', - 'sourceinspect>=0.0.4', - 'colorama', - 'astor', - ], - data_files=[('lib', data_files)], - keywords=['graphics', 'simulation'], - license='MIT', - include_package_data=True, - entry_points={ - 'console_scripts': [ - 'ti=taichi.main:main', - ], - }, - classifiers=classifiers, - has_ext_modules=lambda: True) - -# Note: this is a template setup.py used 
by python/build.py +# Our python package root dir is python/ +package_dir = 'python' + + +def get_python_executable(): + return sys.executable.replace('\\', '/') + + +def get_os_name(): + name = platform.platform() + # in python 3.8, platform.platform() uses mac_ver() on macOS + # it will return 'macOS-XXXX' instead of 'Darwin-XXXX' + if name.lower().startswith('darwin') or name.lower().startswith('macos'): + return 'osx' + elif name.lower().startswith('windows'): + return 'win' + elif name.lower().startswith('linux'): + return 'linux' + assert False, "Unknown platform name %s" % name + + +def remove_tmp(taichi_dir): + shutil.rmtree(os.path.join(taichi_dir, 'assets'), ignore_errors=True) + shutil.rmtree(os.path.join(taichi_dir, 'examples'), ignore_errors=True) + shutil.rmtree(os.path.join(taichi_dir, 'tests'), ignore_errors=True) + + +class CMakeExtension(Extension): + def __init__(self, name): + Extension.__init__(self, name, sources=[]) + + +class EggInfo(egg_info): + def run(self): + taichi_dir = os.path.join(package_dir, 'taichi') + remove_tmp(taichi_dir) + shutil.rmtree('build', ignore_errors=True) + + shutil.copytree('tests/python', os.path.join(taichi_dir, 'tests')) + shutil.copytree('examples', os.path.join(taichi_dir, 'examples')) + shutil.copytree('external/assets', os.path.join(taichi_dir, 'assets')) + + egg_info.run(self) + + +# python setup.py build runs the following commands in order: +# python setup.py build_py +# python setup.py build_ext +class BuildPy(build_py): + def run(self): + build_py.run(self) + taichi_dir = os.path.join(package_dir, 'taichi') + remove_tmp(taichi_dir) + + +class CMakeBuild(build_ext): + def parse_cmake_args_from_env(self): + # Source: TAICHI_CMAKE_ARGS=... python setup.py ... 
+ cmake_args = os.getenv('TAICHI_CMAKE_ARGS', '') + return cmake_args.strip().split() + + def run(self): + try: + out = subprocess.check_output(['cmake', '--version']) + except OSError: + raise RuntimeError( + "CMake must be installed to build the following extensions: " + + ", ".join(e.name for e in self.extensions)) + + # CMakeLists.txt is in the same directory as this setup.py file + cmake_list_dir = os.path.abspath(os.path.dirname(__file__)) + self.build_temp = os.path.join(cmake_list_dir, 'build') + + build_directory = os.path.abspath(self.build_temp) + + cmake_args = self.parse_cmake_args_from_env() + + cmake_args += [ + f'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={build_directory}', + f'-DPYTHON_EXECUTABLE={get_python_executable()}', + f'-DTI_VERSION_MAJOR={TI_VERSION_MAJOR}', + f'-DTI_VERSION_MINOR={TI_VERSION_MINOR}', + f'-DTI_VERSION_PATCH={TI_VERSION_PATCH}', + ] + + cfg = 'Debug' if self.debug else 'Release' + build_args = ['--config', cfg] + + cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg] + + # Assuming Makefiles + build_args += ['--', f'-j{multiprocessing.cpu_count()}'] + + self.build_args = build_args + + env = os.environ.copy() + os.makedirs(self.build_temp, exist_ok=True) + + print('-' * 10, 'Running CMake prepare', '-' * 40) + subprocess.check_call(['cmake', cmake_list_dir] + cmake_args, + cwd=self.build_temp, + env=env) + + print('-' * 10, 'Building extensions', '-' * 40) + cmake_cmd = ['cmake', '--build', '.'] + self.build_args + subprocess.check_call(cmake_cmd, cwd=self.build_temp) + + self.prepare_package() + + def prepare_package(self): + # We need to make sure these additional files are ready for + # - develop mode: must exist in local python/taichi/lib/ folder + # - install mode: must exist in self.build_lib/taichi/lib + taichi_lib_dir = 'taichi/lib' + for target in ( + os.path.join(package_dir, taichi_lib_dir), + os.path.join(self.build_lib, taichi_lib_dir), + ): + shutil.rmtree(target, ignore_errors=True) + os.makedirs(target) + if get_os_name() == 
'linux': + shutil.copy(os.path.join(self.build_temp, 'libtaichi_core.so'), + os.path.join(target, 'taichi_core.so')) + elif get_os_name() == 'osx': + shutil.copy( + os.path.join(self.build_temp, 'libtaichi_core.dylib'), + os.path.join(target, 'taichi_core.so')) + else: + shutil.copy('../runtimes/RelWithDebInfo/taichi_core.dll', + os.path.join(target, 'taichi_core.pyd')) + + if get_os_name() != 'osx': + libdevice_path = 'external/cuda_libdevice/slim_libdevice.10.bc' + print("copying libdevice:", libdevice_path) + assert os.path.exists(libdevice_path) + shutil.copy(libdevice_path, + os.path.join(target, 'slim_libdevice.10.bc')) + + llvm_runtime_dir = 'taichi/runtime/llvm' + for f in os.listdir(llvm_runtime_dir): + if f.startswith('runtime_') and f.endswith('.bc'): + print(f"Fetching runtime file {f} to {target} folder") + shutil.copy(os.path.join(llvm_runtime_dir, f), target) + + +setup(name=project_name, + packages=packages, + package_dir={"": package_dir}, + version=version, + description='The Taichi Programming Language', + author='Taichi developers', + author_email='yuanmhu@gmail.com', + url='https://github.com/taichi-dev/taichi', + install_requires=[ + 'numpy', + 'pybind11>=2.5.0', + 'sourceinspect>=0.0.4', + 'colorama', + 'astor', + ], + data_files=[('lib', data_files)], + keywords=['graphics', 'simulation'], + license='MIT', + include_package_data=True, + entry_points={ + 'console_scripts': [ + 'ti=taichi.main:main', + ], + }, + classifiers=classifiers, + ext_modules=[CMakeExtension('taichi_core')], + cmdclass=dict(egg_info=EggInfo, build_py=BuildPy, build_ext=CMakeBuild), + has_ext_modules=lambda: True) diff --git a/taichi/backends/metal/shaders/runtime_structs.metal.h b/taichi/backends/metal/shaders/runtime_structs.metal.h index 171e8463f..9b3347809 100644 --- a/taichi/backends/metal/shaders/runtime_structs.metal.h +++ b/taichi/backends/metal/shaders/runtime_structs.metal.h @@ -117,7 +117,7 @@ STR( int32_t start = 0; int32_t num_bits = 0; int32_t acc_offset 
= 0; - int32_t num_elements = 0; + int32_t num_elements_from_root = 0; }; Extractor extractors[kTaichiMaxNumIndices]; diff --git a/taichi/backends/vulkan/codegen_vulkan.cpp b/taichi/backends/vulkan/codegen_vulkan.cpp new file mode 100644 index 000000000..8a6b9f4bf --- /dev/null +++ b/taichi/backends/vulkan/codegen_vulkan.cpp @@ -0,0 +1,795 @@ +#include "taichi/backends/vulkan/codegen_vulkan.h" + +#include +#include + +#include "taichi/program/program.h" +#include "taichi/program/kernel.h" +#include "taichi/ir/statements.h" +#include "taichi/ir/ir.h" +#include "taichi/util/line_appender.h" +#include "taichi/backends/vulkan/kernel_utils.h" +#include "taichi/backends/vulkan/runtime.h" +#include "taichi/backends/opengl/opengl_data_types.h" +#include "taichi/ir/transforms.h" + +namespace taichi { +namespace lang { +namespace vulkan { +namespace { + +constexpr char kRootBufferName[] = "root_buffer"; +constexpr char kGlobalTmpsBufferName[] = "global_tmps_buffer"; +constexpr char kContextBufferName[] = "context_buffer"; + +constexpr char kGlobalInvocationIDName[] = "int(gl_GlobalInvocationID.x)"; +constexpr char kLinearLoopIndexName[] = "linear_loop_idx_"; + +constexpr int kMaxNumThreadsGridStrideLoop = 65536; + +#define TI_INSIDE_VULKAN_CODEGEN +#include "taichi/backends/vulkan/shaders/atomics.glsl.h" +#undef TI_INSIDE_VULKAN_CODEGEN + +using opengl::opengl_data_type_name; +using BuffersEnum = TaskAttributes::Buffers; +using BufferBind = TaskAttributes::BufferBind; + +std::string buffer_instance_name(BuffersEnum b) { + // https://www.khronos.org/opengl/wiki/Interface_Block_(GLSL)#Syntax + switch (b) { + case BuffersEnum::Root: + return kRootBufferName; + case BuffersEnum::GlobalTmps: + return kGlobalTmpsBufferName; + case BuffersEnum::Context: + return kContextBufferName; + default: + TI_NOT_IMPLEMENTED; + break; + } + return {}; +} + +std::string store_as_int_bits(const std::string &in, DataType dt) { + if (dt->is_primitive(PrimitiveTypeID::f32)) { + return 
fmt::format("floatBitsToInt({})", in); + } + return in; +} + +std::string load_from_int_bits(const std::string &in, DataType dt) { + if (dt->is_primitive(PrimitiveTypeID::f32)) { + return fmt::format("intBitsToFloat({})", in); + } + return in; +} + +std::string vk_data_address_shifter(const Stmt *s, DataType) { + // Hardcoded ">> 2" because we only support 32-bit for now. + return fmt::format("({} >> 2)", s->raw_name()); +} + +class TaskCodegen : public IRVisitor { + private: + enum class Section { + Headers, + Kernels, + }; + + static constexpr Section kAllSections[] = { + Section::Headers, + Section::Kernels, + }; + + public: + struct Params { + OffloadedStmt *task_ir; + const CompiledSNodeStructs *compiled_structs; + const KernelContextAttributes *ctx_attribs; + std::string ti_kernel_name; + int task_id_in_kernel; + }; + + explicit TaskCodegen(const Params ¶ms) + : task_ir_(params.task_ir), + compiled_structs_(params.compiled_structs), + ctx_attribs_(params.ctx_attribs), + task_name_(fmt::format("{}_t{:02d}", + params.ti_kernel_name, + params.task_id_in_kernel)) { + allow_undefined_visitor = true; + invoke_default_visitor = true; + } + + struct Result { + std::string source_code; + TaskAttributes task_attribs; + }; + + Result run() { + code_section_ = Section::Kernels; + if (task_ir_->task_type == OffloadedTaskType::serial) { + generate_serial_kernel(task_ir_); + } else if (task_ir_->task_type == OffloadedTaskType::range_for) { + // struct_for is automatically lowered to ranged_for for dense snodes + generate_range_for_kernel(task_ir_); + } else { + TI_ERROR("Unsupported offload type={} on Vulkan arch", + task_ir_->task_name()); + } + // Headers need global information, so it has to be delayed after visiting + // the task IR. 
+ emit_headers(); + + Result res; + for (const auto s : kAllSections) { + res.source_code += section_appenders_.find(s)->second.lines(); + res.source_code += '\n'; + } + res.task_attribs = std::move(task_attribs_); + return res; + } + + void visit(OffloadedStmt *) override { + TI_ERROR("This codegen is supposed to deal with one offloaded task"); + } + + void visit(Block *stmt) override { + push_indent(); + for (auto &s : stmt->statements) { + s->accept(this); + } + pop_indent(); + } + + void visit(ConstStmt *const_stmt) override { + TI_ASSERT(const_stmt->width() == 1); + emit("const {} {} = {};", opengl_data_type_name(const_stmt->element_type()), + const_stmt->raw_name(), const_stmt->val[0].stringify()); + } + + void visit(AllocaStmt *alloca) override { + emit("{} {} = 0;", opengl_data_type_name(alloca->element_type()), + alloca->raw_name()); + } + + void visit(LocalLoadStmt *stmt) override { + // TODO: optimize for partially vectorized load... + bool linear_index = true; + for (int i = 0; i < (int)stmt->src.size(); i++) { + if (stmt->src[i].offset != i) { + linear_index = false; + } + } + if (stmt->same_source() && linear_index && + stmt->width() == stmt->src[0].var->width()) { + auto ptr = stmt->src[0].var; + emit("const {} {} = {};", opengl_data_type_name(stmt->element_type()), + stmt->raw_name(), ptr->raw_name()); + } else { + TI_NOT_IMPLEMENTED; + } + } + + void visit(LocalStoreStmt *stmt) override { + emit("{} = {};", stmt->dest->raw_name(), stmt->val->raw_name()); + } + + void visit(GetRootStmt *stmt) override { + // Should we assert |root_stmt_| is assigned only once? + root_stmt_ = stmt; + emit("const int {} = 0;", stmt->raw_name()); + } + + void visit(GetChStmt *stmt) override { + // TODO: GetChStmt -> GetComponentStmt ? 
+ const auto &snode_descs = compiled_structs_->snode_descriptors; + auto *out_snode = stmt->output_snode; + TI_ASSERT(snode_descs.at(stmt->input_snode->id).get_child(stmt->chid) == + out_snode); + + emit("// SNode: {} -> {}", stmt->input_snode->node_type_name, + out_snode->node_type_name); + emit("const int {} = {} + {};", stmt->raw_name(), + stmt->input_ptr->raw_name(), + snode_descs.at(out_snode->id).mem_offset_in_parent_cell); + if (out_snode->is_place()) { + TI_ASSERT(ptr_to_buffers_.count(stmt) == 0); + ptr_to_buffers_[stmt] = BuffersEnum::Root; + } + } + + void visit(SNodeLookupStmt *stmt) override { + // TODO: SNodeLookupStmt -> GetSNodeCellStmt ? + std::string parent; + if (stmt->input_snode) { + parent = stmt->input_snode->raw_name(); + } else { + TI_ASSERT(root_stmt_ != nullptr); + parent = root_stmt_->raw_name(); + } + const auto *sn = stmt->snode; + + if (stmt->activate && !(sn->type == SNodeType::dense)) { + // Sparse SNode not supported yet. + TI_NOT_IMPLEMENTED; + } + const auto &snode_descs = compiled_structs_->snode_descriptors; + emit("// Get the cell of SNode {}", sn->node_type_name); + emit("const int {} = {} + ({} * {});", stmt->raw_name(), parent, + stmt->input_index->raw_name(), snode_descs.at(sn->id).cell_stride); + } + + void visit(LinearizeStmt *stmt) override { + std::string val = "0"; + for (int i = 0; i < (int)stmt->inputs.size(); i++) { + val = fmt::format("({} * {} + {})", val, stmt->strides[i], + stmt->inputs[i]->raw_name()); + } + emit("const int {} = {};", stmt->raw_name(), val); + } + + void visit(BitExtractStmt *stmt) override { + emit("const int {} = (({} >> {}) & ((1 << {}) - 1));", stmt->raw_name(), + stmt->input->raw_name(), stmt->bit_begin, + stmt->bit_end - stmt->bit_begin); + } + + void visit(LoopIndexStmt *stmt) override { + const auto stmt_name = stmt->raw_name(); + if (stmt->loop->is()) { + const auto type = stmt->loop->as()->task_type; + if (type == OffloadedTaskType::range_for) { + TI_ASSERT(stmt->index == 0); + 
emit("const int {} = {};", stmt_name, kLinearLoopIndexName); + } else { + TI_NOT_IMPLEMENTED; + } + } else if (stmt->loop->is()) { + TI_ASSERT(stmt->index == 0); + emit("const int {} = {};", stmt_name, stmt->loop->raw_name()); + } else { + TI_NOT_IMPLEMENTED; + } + } + + void visit(GlobalStoreStmt *stmt) override { + TI_ASSERT(stmt->width() == 1); + const auto dt = stmt->val->element_type(); + emit("{} = {};", at_buffer(stmt->dest, dt), + store_as_int_bits(stmt->val->raw_name(), dt)); + } + + void visit(GlobalLoadStmt *stmt) override { + TI_ASSERT(stmt->width() == 1); + auto dt = stmt->element_type(); + const auto loaded_int = at_buffer(stmt->src, dt); + emit("const {} {} = {};", opengl_data_type_name(dt), stmt->raw_name(), + load_from_int_bits(loaded_int, dt)); + } + + void visit(ArgLoadStmt *stmt) override { + const auto arg_id = stmt->arg_id; + const auto &arg_attribs = ctx_attribs_->args()[arg_id]; + const auto offset_in_mem = arg_attribs.offset_in_mem; + if (stmt->is_ptr) { + emit("// Pointer arg: id={} offset_in_mem={}", arg_id, offset_in_mem); + // Do not shift! We are indexing the buffers at byte granularity. 
+ emit("const int {} = {};", stmt->raw_name(), offset_in_mem); + } else { + const auto dt = arg_attribs.dt; + const auto val_str = fmt::format("{}[{}]", kContextBufferName, + (offset_in_mem / sizeof(int32_t))); + emit("// Scalar arg: id={} offset_in_mem={}", arg_id, offset_in_mem); + emit("const {} {} = {};", opengl_data_type_name(dt), stmt->raw_name(), + load_from_int_bits(val_str, dt)); + } + } + + void visit(ReturnStmt *stmt) override { + // TODO: use stmt->ret_id instead of 0 as index + const auto &ret_attribs = ctx_attribs_->rets()[0]; + const int index_in_buffer = ret_attribs.offset_in_mem / sizeof(int32_t); + emit("// Return value: offset_in_mem={}", ret_attribs.offset_in_mem); + emit("{}[{}] = {};", kContextBufferName, index_in_buffer, + store_as_int_bits(stmt->value->raw_name(), ret_attribs.dt)); + } + + void visit(GlobalTemporaryStmt *stmt) override { + TI_ASSERT(stmt->width() == 1); + const auto dt = opengl_data_type_name(stmt->element_type().ptr_removed()); + emit("const int {} = {}", stmt->raw_name(), stmt->offset); + ptr_to_buffers_[stmt] = BuffersEnum::GlobalTmps; + } + + void visit(ExternalPtrStmt *stmt) override { + // Used mostly for transferring data between host (e.g. numpy array) and + // Vulkan. 
+ TI_ASSERT(stmt->width() == 1); + const auto linear_offset_name = + fmt::format("{}_linear_mem_offset_", stmt->raw_name()); + emit("int {} = 0;", linear_offset_name); + emit("{{"); + { + ScopedIndent s(current_appender()); + const auto *argload = stmt->base_ptrs[0]->as(); + const int arg_id = argload->arg_id; + const int num_indices = stmt->indices.size(); + std::vector size_var_names; + const auto extra_args_mem_offset = ctx_attribs_->extra_args_mem_offset(); + const auto extra_args_index_base = + (extra_args_mem_offset / sizeof(int32_t)); + emit("// External ptr, extra args: mem_offset={} index_base={}", + extra_args_mem_offset, extra_args_index_base); + for (int i = 0; i < num_indices; i++) { + std::string var_name = fmt::format("{}_size{}_", stmt->raw_name(), i); + const auto extra_arg_linear_index_offset = + (arg_id * taichi_max_num_indices) + i; + const auto extra_arg_linear_index = + extra_args_index_base + extra_arg_linear_index_offset; + emit("// Extra arg: arg_id={} i={} linear_index=({} + {})={}", arg_id, + i, extra_args_index_base, extra_arg_linear_index_offset, + extra_arg_linear_index); + emit("const int {} = {}[{}];", var_name, kContextBufferName, + extra_arg_linear_index); + size_var_names.push_back(std::move(var_name)); + } + for (int i = 0; i < num_indices; i++) { + emit("{} *= {};", linear_offset_name, size_var_names[i]); + emit("{} += {};", linear_offset_name, stmt->indices[i]->raw_name()); + } + emit("// Convert index to bytes"); + emit("{} = ({} << 2);", linear_offset_name, linear_offset_name); + } + emit("}}"); + emit("const int {} = ({} + {});", stmt->raw_name(), + stmt->base_ptrs[0]->raw_name(), linear_offset_name); + ptr_to_buffers_[stmt] = BuffersEnum::Context; + } + + void visit(UnaryOpStmt *stmt) override { + const auto dt_name = opengl_data_type_name(stmt->element_type()); + const auto var_decl = fmt::format("const {} {}", dt_name, stmt->raw_name()); + const auto operand_name = stmt->operand->raw_name(); + + if (stmt->op_type == 
UnaryOpType::logic_not) { + emit("{} = {}({} == 0);", var_decl, dt_name, operand_name); + } else if (stmt->op_type == UnaryOpType::neg) { + emit("{} = -{}({});", var_decl, dt_name, operand_name); + } else if (stmt->op_type == UnaryOpType::rsqrt) { + emit("{} = {}(inversesqrt({}));", var_decl, dt_name, operand_name); + } else if (stmt->op_type == UnaryOpType::sgn) { + emit("{} = {}(sign({}));", var_decl, dt_name, operand_name); + } else if (stmt->op_type == UnaryOpType::bit_not) { + emit("{} = ~{}({});", var_decl, dt_name, operand_name); + } else if (stmt->op_type == UnaryOpType::cast_value) { + emit("{} = {}({});", var_decl, dt_name, operand_name); + } else if (stmt->op_type == UnaryOpType::cast_bits) { + constexpr int kFloatingPoint = 0; + constexpr int kSignedInteger = 1; + constexpr int kUnsignedInteger = 2; + + const auto dst_type = stmt->cast_type; + const auto src_type = stmt->operand->element_type(); + auto dst_type_id = kFloatingPoint; + if (is_integral(dst_type)) { + dst_type_id = is_unsigned(dst_type) ? kUnsignedInteger : kSignedInteger; + } + auto src_type_id = kFloatingPoint; + if (is_integral(src_type)) { + src_type_id = is_unsigned(src_type) ? 
kUnsignedInteger : kSignedInteger; + } + + TI_ASSERT_INFO( + data_type_size(dst_type) == data_type_size(src_type), + "bit_cast is only supported between data type with same size"); + + if (dst_type_id != kFloatingPoint && src_type_id != kFloatingPoint) { + emit("{} = {}({});", var_decl, dt_name, operand_name); + } else if (dst_type_id == kFloatingPoint && + src_type_id == kSignedInteger) { + emit("{} = intBitsToFloat({});", var_decl, operand_name); + } else if (dst_type_id == kSignedInteger && + src_type_id == kFloatingPoint) { + emit("{} = floatBitsToInt({});", var_decl, operand_name); + } else if (dst_type_id == kFloatingPoint && + src_type_id == kUnsignedInteger) { + emit("{} = uintBitsToFloat({});", var_decl, operand_name); + } else if (dst_type_id == kUnsignedInteger && + src_type_id == kFloatingPoint) { + emit("{} = floatBitsToUint({});", var_decl, operand_name); + } else { + TI_ERROR("[glsl] unsupported bit cast from {} to {}", + data_type_name(src_type), data_type_name(dst_type)); + } + } else { + emit("{} = {}({});", var_decl, unary_op_type_name(stmt->op_type), + operand_name); + } + } + + void visit(BinaryOpStmt *bin) override { + const auto dt_name = opengl_data_type_name(bin->element_type()); + const auto lhs_name = bin->lhs->raw_name(); + const auto rhs_name = bin->rhs->raw_name(); + const auto bin_name = bin->raw_name(); + const auto op_type = bin->op_type; + const auto var_decl = fmt::format("const {} {}", dt_name, bin_name); + if (op_type == BinaryOpType::floordiv) { + if (is_integral(bin->lhs->element_type()) && + is_integral(bin->rhs->element_type())) { + emit( + "{} = {}(sign({}) * {} >= 0 ? 
abs({}) / abs({}) : " + "sign({}) * " + "(abs({}) + abs({}) - 1) / {});", + var_decl, dt_name, lhs_name, rhs_name, lhs_name, rhs_name, lhs_name, + lhs_name, rhs_name, rhs_name); + } else { + emit("{} = floor({} / {});", var_decl, lhs_name, rhs_name); + } + return; + } + if (bin->op_type == BinaryOpType::mod) { + // NOTE: the GLSL built-in function `mod()` is a pythonic mod: x - y * + // floor(x / y) + emit("{} = {} - {} * int({} / {});", var_decl, lhs_name, rhs_name, + lhs_name, rhs_name); + return; + } + + const auto binop = binary_op_type_symbol(bin->op_type); + if (opengl::is_opengl_binary_op_infix(op_type)) { + if (is_comparison(op_type)) { + // TODO(#577): Taichi uses -1 as true due to LLVM i1. + emit(" {} = -{}({} {} {});", var_decl, dt_name, lhs_name, binop, + rhs_name); + } else { + emit("{} = {}({} {} {});", var_decl, dt_name, lhs_name, binop, + rhs_name); + } + } else { + // This is a function call + emit("{} = {}({}, {});", var_decl, binop, lhs_name, rhs_name); + } + } + + void visit(TernaryOpStmt *tri) override { + TI_ASSERT(tri->op_type == TernaryOpType::select); + emit("const {} {} = ({}) ? ({}) : ({});", + opengl_data_type_name(tri->element_type()), tri->raw_name(), + tri->op1->raw_name(), tri->op2->raw_name(), tri->op3->raw_name()); + } + + void visit(AtomicOpStmt *stmt) override { + TI_ASSERT(stmt->width() == 1); + if (stmt->op_type != AtomicOpType::add) { + TI_NOT_IMPLEMENTED; + } + const auto dt = stmt->dest->element_type().ptr_removed(); + std::string func = "atomicAdd"; // GLSL builtin + std::string mem = at_buffer(stmt->dest, dt); + if (dt->is_primitive(PrimitiveTypeID::f32)) { + // Buffer has to be specified in the fatomicAdd helpers. 
+ const std::string buffer_name = + buffer_instance_name(ptr_to_buffers_.at(stmt->dest)); + func = fmt::format("fatomicAdd_{}", buffer_name); + mem = vk_data_address_shifter(stmt->dest, dt); + } else if (!is_integral(dt)) { + TI_ERROR("Vulkan only supports 32-bit atomic data types"); + } + // const dt stmt = atomicAdd(mem, val); + emit("const {} {} = {}({}, {});", opengl_data_type_name(dt), + stmt->raw_name(), func, mem, stmt->val->raw_name()); + } + + void visit(IfStmt *if_stmt) override { + emit("if ({} != 0) {{", if_stmt->cond->raw_name()); + if (if_stmt->true_statements) { + if_stmt->true_statements->accept(this); + } + emit("}} else {{"); + if (if_stmt->false_statements) { + if_stmt->false_statements->accept(this); + } + emit("}}"); + } + + void visit(RangeForStmt *for_stmt) override { + TI_ASSERT(for_stmt->width() == 1); + auto loop_var_name = for_stmt->raw_name(); + if (!for_stmt->reversed) { + emit("for (int {}_ = {}; {}_ < {}; {}_ = {}_ + {}) {{", loop_var_name, + for_stmt->begin->raw_name(), loop_var_name, + for_stmt->end->raw_name(), loop_var_name, loop_var_name, 1); + emit(" int {} = {}_;", loop_var_name, loop_var_name); + } else { + // reversed for loop + emit("for (int {}_ = {} - 1; {}_ >= {}; {}_ = {}_ - {}) {{", + loop_var_name, for_stmt->end->raw_name(), loop_var_name, + for_stmt->begin->raw_name(), loop_var_name, loop_var_name, 1); + emit(" int {} = {}_;", loop_var_name, loop_var_name); + } + for_stmt->body->accept(this); + emit("}}"); + } + + void visit(WhileStmt *stmt) override { + emit("while (true) {{"); + stmt->body->accept(this); + emit("}}"); + } + + void visit(WhileControlStmt *stmt) override { + emit("if ({} == 0) break;", stmt->cond->raw_name()); + } + + void visit(ContinueStmt *stmt) override { + if (stmt->as_return()) { + emit("return;"); + } else { + emit("continue;"); + } + } + + private: + void emit_headers() { + SectionGuard sg(this, Section::Headers); + + emit("#version 450"); + emit("layout(local_size_x={}, local_size_y=1, 
local_size_z=1) in;", + task_attribs_.advisory_num_threads_per_group); + emit(""); + for (const auto &bb : task_attribs_.buffer_binds) { + // e.g. + // layout(std430, binding=0) buffer Root { int root_buffer[]; }; + emit("layout(std430, binding={}) buffer {} {{ int {}[]; }};", bb.binding, + TaskAttributes::buffers_name(bb.type), + buffer_instance_name(bb.type)); + } + emit(""); + emit("// Helpers"); + current_appender().append_raw(kVulkanAtomicsSourceCode); + } + + void generate_serial_kernel(OffloadedStmt *stmt) { + task_attribs_.name = task_name_; + task_attribs_.task_type = OffloadedTaskType::serial; + task_attribs_.buffer_binds = get_common_buffer_binds(); + task_attribs_.advisory_total_num_threads = 1; + task_attribs_.advisory_num_threads_per_group = 1; + + const auto func_name = single_work_func_name(); + // The computation for a single work is wrapped inside a function, so that + // we can do grid-strided loop. + emit_single_work_func_def(func_name, stmt->body.get()); + // The actual compute kernel entry point. + emit("void main() {{"); + { + ScopedIndent s(current_appender()); + emit("// serial"); + emit("if ({} > 0) return;", kGlobalInvocationIDName); + + emit_call_single_work_func(func_name, /*loop_index_expr=*/"0"); + } + // Close kernel + emit("}}\n"); + } + + void generate_range_for_kernel(OffloadedStmt *stmt) { + task_attribs_.name = task_name_; + task_attribs_.task_type = OffloadedTaskType::range_for; + task_attribs_.buffer_binds = get_common_buffer_binds(); + + task_attribs_.range_for_attribs = TaskAttributes::RangeForAttributes(); + auto &range_for_attribs = task_attribs_.range_for_attribs.value(); + range_for_attribs.const_begin = stmt->const_begin; + range_for_attribs.const_end = stmt->const_end; + range_for_attribs.begin = + (stmt->const_begin ? stmt->begin_value : stmt->begin_offset); + range_for_attribs.end = + (stmt->const_end ? 
stmt->end_value : stmt->end_offset); + + const auto func_name = single_work_func_name(); + emit_single_work_func_def(func_name, stmt->body.get()); + + emit("void main() {{"); + push_indent(); + const std::string total_elems_name("total_elems"); + std::string begin_expr; + if (range_for_attribs.const_range()) { + const int num_elems = range_for_attribs.end - range_for_attribs.begin; + begin_expr = std::to_string(stmt->begin_value); + emit("// range_for, range known at compile time"); + emit("const int {} = {};", total_elems_name, num_elems); + task_attribs_.advisory_total_num_threads = num_elems; + } else { + TI_NOT_IMPLEMENTED; + } + // begin_ = thread_id + begin_expr + emit("const int begin_ = {} + {};", kGlobalInvocationIDName, begin_expr); + // end_ = total_elems + begin_expr + emit("const int end_ = {} + {};", total_elems_name, begin_expr); + // For now, |total_invocs_name| is equal to |total_elems|. Once we support + // dynamic range, they will be different. + const std::string total_invocs_name = "total_invocs"; + // https://www.khronos.org/opengl/wiki/Compute_Shader#Inputs + emit("const int {} = int(gl_NumWorkGroups.x * gl_WorkGroupSize.x);", + total_invocs_name); + // grid-strided loop + emit("for (int ii = begin_; ii < end_; ii += {}) {{", total_invocs_name); + { + ScopedIndent s2(current_appender()); + emit_call_single_work_func(func_name, /*loop_index_expr=*/"ii"); + } + emit("}}"); // closes for loop + + pop_indent(); + // Close kernel + emit("}}\n"); + // TODO: runtime needs to verify if block_dim is feasible + task_attribs_.advisory_num_threads_per_group = stmt->block_dim; + } + + void emit_single_work_func_def(const std::string &func_name, + + Block *func_ir) { + emit("void {}(", func_name); + emit(" const int {}) {{", kLinearLoopIndexName); + // We do not need additional indentation, because |func_ir| itself is a + // block, which will be indented automatically. 
+ func_ir->accept(this); + emit("}}\n"); // closes this function + } + + void emit_call_single_work_func(const std::string &func_name, + + const std::string &loop_index_expr) { + emit("{}({});", func_name, loop_index_expr); + } + + std::string at_buffer(const Stmt *ptr, DataType dt) const { + const std::string buffer_name = + buffer_instance_name(ptr_to_buffers_.at(ptr)); + return fmt::format("{}[{}]", buffer_name, vk_data_address_shifter(ptr, dt)); + } + + std::string single_work_func_name() const { + return task_name_ + "_func"; + } + + std::vector get_common_buffer_binds() const { + std::vector result; + int binding = 0; + result.push_back({BuffersEnum::Root, binding++}); + result.push_back({BuffersEnum::GlobalTmps, binding++}); + if (!ctx_attribs_->empty()) { + result.push_back({BuffersEnum::Context, binding++}); + } + return result; + } + + class SectionGuard { + public: + SectionGuard(TaskCodegen *tcg, Section new_sec) + : tcg_(tcg), saved_(tcg->code_section_) { + tcg_->code_section_ = new_sec; + } + + ~SectionGuard() { + tcg_->code_section_ = saved_; + } + + private: + TaskCodegen *const tcg_; + const Section saved_; + }; + + friend class SectionGuard; + + template + void emit(std::string f, Args &&... 
args) { + // TI_INFO(f, args...); + current_appender().append(std::move(f), std::forward(args)...); + } + + void push_indent() { + current_appender().push_indent(); + } + + void pop_indent() { + current_appender().pop_indent(); + } + + LineAppender ¤t_appender() { + return section_appenders_[code_section_]; + } + + OffloadedStmt *const task_ir_; // not owned + const CompiledSNodeStructs *const compiled_structs_; // not owned + const KernelContextAttributes *const ctx_attribs_; // not owned + const std::string task_name_; + + TaskAttributes task_attribs_; + GetRootStmt *root_stmt_{nullptr}; + std::unordered_map ptr_to_buffers_; + Section code_section_{Section::Kernels}; + std::unordered_map section_appenders_; +}; + +class KernelCodegen { + public: + struct Params { + std::string ti_kernel_name; + Kernel *kernel; + const CompiledSNodeStructs *compiled_structs; + }; + + explicit KernelCodegen(const Params ¶ms) + : params_(params), ctx_attribs_(*params.kernel) { + } + + using Result = VkRuntime::RegisterParams; + + Result run() { + Result res; + auto &kernel_attribs = res.kernel_attribs; + auto *root = params_.kernel->ir->as(); + auto &tasks = root->statements; + for (int i = 0; i < tasks.size(); ++i) { + TaskCodegen::Params tp; + tp.task_ir = tasks[i]->as(); + tp.task_id_in_kernel = i; + tp.compiled_structs = params_.compiled_structs; + tp.ctx_attribs = &ctx_attribs_; + tp.ti_kernel_name = params_.ti_kernel_name; + + TaskCodegen cgen(tp); + auto task_res = cgen.run(); + kernel_attribs.tasks_attribs.push_back(std::move(task_res.task_attribs)); + res.task_glsl_source_codes.push_back(std::move(task_res.source_code)); + } + kernel_attribs.ctx_attribs = std::move(ctx_attribs_); + kernel_attribs.name = params_.ti_kernel_name; + kernel_attribs.is_jit_evaluator = params_.kernel->is_evaluator; + return res; + } + + private: + Params params_; + KernelContextAttributes ctx_attribs_; +}; + +} // namespace + +void lower(Kernel *kernel) { + auto &config = kernel->program->config; 
+ config.demote_dense_struct_fors = true; + irpass::compile_to_executable(kernel->ir.get(), config, kernel, + /*vectorize=*/false, kernel->grad, + /*ad_use_stack=*/false, config.print_ir, + /*lower_global_access=*/true, + /*make_thread_local=*/false); +} + +FunctionType compile_to_executable(Kernel *kernel, + const CompiledSNodeStructs *compiled_structs, + VkRuntime *runtime) { + const auto id = Program::get_kernel_id(); + const auto taichi_kernel_name(fmt::format("{}_k{:04d}_vk", kernel->name, id)); + TI_INFO("VK codegen for Taichi kernel={}", taichi_kernel_name); + KernelCodegen::Params params; + params.ti_kernel_name = taichi_kernel_name; + params.kernel = kernel; + params.compiled_structs = compiled_structs; + KernelCodegen codegen(params); + auto res = codegen.run(); + auto handle = runtime->register_taichi_kernel(std::move(res)); + return [runtime, handle, taichi_kernel_name](Context &ctx) { + runtime->launch_kernel(handle, &ctx); + }; +} + +} // namespace vulkan +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/vulkan/codegen_vulkan.h b/taichi/backends/vulkan/codegen_vulkan.h new file mode 100644 index 000000000..64210d95e --- /dev/null +++ b/taichi/backends/vulkan/codegen_vulkan.h @@ -0,0 +1,25 @@ +#pragma once + +#include "taichi/lang_util.h" + +#include "taichi/backends/vulkan/snode_struct_compiler.h" + +namespace taichi { +namespace lang { + +class Kernel; + +namespace vulkan { + +class VkRuntime; + +void lower(Kernel *kernel); + +// These ASTs must have already been lowered at the CHI level. 
+FunctionType compile_to_executable(Kernel *kernel, + const CompiledSNodeStructs *compiled_structs, + VkRuntime *runtime); + +} // namespace vulkan +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/vulkan/shaders/atomics.glsl.h b/taichi/backends/vulkan/shaders/atomics.glsl.h new file mode 100644 index 000000000..1c183b6f4 --- /dev/null +++ b/taichi/backends/vulkan/shaders/atomics.glsl.h @@ -0,0 +1,44 @@ +// clang-format on +#include "taichi/backends/vulkan/shaders/prologue.h" + +#ifndef TI_INSIDE_VULKAN_CODEGEN +static_assert(false, "do not include"); +#endif // TI_INSIDE_VULKAN_CODEGEN + +#define VULKAN_BEGIN_CODE_DEF constexpr auto kVulkanAtomicsSourceCode = +#define VULKAN_END_CODE_DEF ; + +// clang-format off +VULKAN_BEGIN_CODE_DEF +STR( +// TODO: don't duplicate, pass in pointer +float fatomicAdd_root_buffer(int addr, float data) { + int old_val = 0; + int new_val = 0; + int cas_val = 0; + int ok = 0; + while (ok == 0) { + old_val = root_buffer[addr]; + new_val = floatBitsToInt(intBitsToFloat(old_val) + data); + cas_val = atomicCompSwap(root_buffer[addr], old_val, new_val); + ok = int(cas_val == old_val); + } + return intBitsToFloat(old_val); +} + +float fatomicAdd_global_tmps_buffer(int addr, float data) { + int old_val = 0; + int new_val = 0; + int cas_val = 0; + int ok = 0; + while (ok == 0) { + old_val = global_tmps_buffer[addr]; + new_val = floatBitsToInt(intBitsToFloat(old_val) + data); + cas_val = atomicCompSwap(global_tmps_buffer[addr], old_val, new_val); + ok = int(cas_val == old_val); + } + return intBitsToFloat(old_val); +} +) +VULKAN_END_CODE_DEF +// clang-format on diff --git a/taichi/backends/vulkan/shaders/epilogue.h b/taichi/backends/vulkan/shaders/epilogue.h new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/taichi/backends/vulkan/shaders/epilogue.h @@ -0,0 +1 @@ + diff --git a/taichi/backends/vulkan/shaders/prologue.h b/taichi/backends/vulkan/shaders/prologue.h new file mode 100644 index 
000000000..bbacdf2ef --- /dev/null +++ b/taichi/backends/vulkan/shaders/prologue.h @@ -0,0 +1,28 @@ +#ifdef TI_INSIDE_VULKAN_CODEGEN + +#include "taichi/util/macros.h" + +#else + +#define STR(...) __VA_ARGS__ + +#define inout + +// GLSL builtin stubs +int floatBitsToInt(float f) { + return *reinterpret_cast<int *>(&f); +} + +float intBitsToFloat(int i) { + return *reinterpret_cast<float *>(&i); +} + +int atomicCompSwap(int &mem, int compare, int data) { + const int old = mem; + if (mem == compare) { + mem = data; + } + return old; +} + +#endif // TI_INSIDE_VULKAN_CODEGEN diff --git a/taichi/backends/vulkan/snode_struct_compiler.cpp b/taichi/backends/vulkan/snode_struct_compiler.cpp new file mode 100644 index 000000000..e586cd582 --- /dev/null +++ b/taichi/backends/vulkan/snode_struct_compiler.cpp @@ -0,0 +1,90 @@ +#include "taichi/backends/vulkan/snode_struct_compiler.h" + +#include "taichi/backends/vulkan/data_type_utils.h" + +namespace taichi { +namespace lang { +namespace vulkan { +namespace { + +class StructCompiler { + public: + CompiledSNodeStructs run(const SNode &root) { + TI_ASSERT(root.type == SNodeType::root); + + CompiledSNodeStructs result; + result.root_size = compute_snode_size(&root); + result.snode_descriptors = std::move(snode_descriptors_); + TI_INFO("Vulkan RootBuffer size={}", result.root_size); + return result; + } + + private: + std::size_t compute_snode_size(const SNode *sn) { + const bool is_place = sn->is_place(); + + SNodeDescriptor sn_desc; + sn_desc.snode = sn; + if (is_place) { + sn_desc.cell_stride = vk_data_type_size(sn->dt); + sn_desc.container_stride = sn_desc.cell_stride; + } else { + std::size_t cell_stride = 0; + for (const auto &ch : sn->ch) { + const auto child_offset = cell_stride; + const auto *ch_snode = ch.get(); + cell_stride += compute_snode_size(ch_snode); + snode_descriptors_.find(ch_snode->id) + ->second.mem_offset_in_parent_cell = child_offset; + } + sn_desc.cell_stride = cell_stride; + sn_desc.container_stride = + cell_stride * 
sn_desc.cells_per_container_pot(); + } + + sn_desc.total_num_cells_from_root = 1; + for (const auto &e : sn->extractors) { + // Note that the extractors are set in two places: + // 1. When a new SNode is first defined + // 2. StructCompiler::infer_snode_properties() + // The second step is the finalized result. + sn_desc.total_num_cells_from_root *= e.num_elements; + } + + TI_INFO("SNodeDescriptor"); + TI_INFO("* snode={}", sn_desc.snode->id); + TI_INFO("* type={} (is_place={})", sn_desc.snode->node_type_name, is_place); + TI_INFO("* cell_stride={}", sn_desc.cell_stride); + TI_INFO("* cells_per_container_pot={}", sn_desc.cells_per_container_pot()); + TI_INFO("* container_stride={}", sn_desc.container_stride); + TI_INFO("* total_num_cells_from_root={}", + sn_desc.total_num_cells_from_root); + TI_INFO(""); + + TI_ASSERT(snode_descriptors_.find(sn->id) == snode_descriptors_.end()); + snode_descriptors_[sn->id] = sn_desc; + return sn_desc.container_stride; + } + + SNodeDescriptorsMap snode_descriptors_; +}; + +} // namespace + +int SNodeDescriptor::cells_per_container_pot() const { + // For root, |snode->n| is 0. + const auto ty = snode->type; + if (ty == SNodeType::root || ty == SNodeType::place) { + return 1; + } + return snode->n; +} + +CompiledSNodeStructs compile_snode_structs(const SNode &root) { + StructCompiler compiler; + return compiler.run(root); +} + +} // namespace vulkan +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/vulkan/snode_struct_compiler.h b/taichi/backends/vulkan/snode_struct_compiler.h new file mode 100644 index 000000000..426c96ee0 --- /dev/null +++ b/taichi/backends/vulkan/snode_struct_compiler.h @@ -0,0 +1,53 @@ +// Codegen for the hierarchical data structure +#pragma once + +#include + +#include "taichi/ir/snode.h" + +namespace taichi { +namespace lang { +namespace vulkan { + +struct SNodeDescriptor { + const SNode *snode = nullptr; + // Stride (bytes) of a single cell. 
+ int cell_stride = 0; + + // Number of cells per container, padded to Power of Two (pot). + int cells_per_container_pot() const; + + // Bytes of a single container. + int container_stride = 0; + + // Total number of CELLS of this SNode, NOT padded to PoT. + // For example, for a layout of + // ti.root + // .dense(ti.ij, (3, 2)) // S1 + // .dense(ti.ij, (5, 3)) // S2 + // |total_num_cells_from_root| for S2 is 3x2x5x3 = 90. That is, S2 has a total + // of 90 cells. Note that the number of S2 (container) itself is 3x2=6! + int total_num_cells_from_root = 0; + // An SNode can have multiple number of components, where each component + // starts at a fixed offset in its parent cell's memory. + int mem_offset_in_parent_cell = 0; + + SNode *get_child(int ch_i) const { + return snode->ch[ch_i].get(); + } +}; + +using SNodeDescriptorsMap = std::unordered_map; + +struct CompiledSNodeStructs { + // Root buffer size in bytes. + size_t root_size; + // Map from SNode ID to its descriptor. + SNodeDescriptorsMap snode_descriptors; +}; + +CompiledSNodeStructs compile_snode_structs(const SNode &root); + +} // namespace vulkan +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/vulkan/vulkan_simple_memory_pool.cpp b/taichi/backends/vulkan/vulkan_simple_memory_pool.cpp new file mode 100644 index 000000000..72ee792e5 --- /dev/null +++ b/taichi/backends/vulkan/vulkan_simple_memory_pool.cpp @@ -0,0 +1,143 @@ +#include "taichi/backends/vulkan/vulkan_simple_memory_pool.h" + +#include "taichi/math/arithmetic.h" +#include "taichi/backends/vulkan/vulkan_common.h" +#include "taichi/common/logging.h" + +namespace taichi { +namespace lang { +namespace vulkan { + +namespace { + +static constexpr VkDeviceSize kAlignment = 256; + +VkDeviceSize roundup_aligned(VkDeviceSize size) { + return iroundup(size, kAlignment); +} + +} // namespace + +VkBufferWithMemory::VkBufferWithMemory(VkDevice device, + VkBuffer buffer, + VkDeviceMemory mem, + VkDeviceSize size, + VkDeviceSize offset) 
+ : device_(device), + buffer_(buffer), + backing_memory_(mem), + size_(size), + offset_in_mem_(offset) { + TI_ASSERT(buffer_ != VK_NULL_HANDLE); + TI_ASSERT(size_ > 0); + TI_ASSERT(backing_memory_ != VK_NULL_HANDLE); +} + +VkBufferWithMemory::~VkBufferWithMemory() { + if (buffer_ != VK_NULL_HANDLE) { + vkDestroyBuffer(device_, buffer_, kNoVkAllocCallbacks); + } +} + +LinearVkMemoryPool::LinearVkMemoryPool(const Params ¶ms, + VkDeviceMemory mem, + uint32_t mti) + : device_(params.device), + memory_(mem), + memory_type_index_(mti), + compute_queue_family_index_(params.compute_queue_family_index), + buffer_creation_template_(params.buffer_creation_template), + pool_size_(params.pool_size), + next_(0) { + buffer_creation_template_.size = 0; + buffer_creation_template_.queueFamilyIndexCount = 1; + buffer_creation_template_.pQueueFamilyIndices = &compute_queue_family_index_; +} + +LinearVkMemoryPool::~LinearVkMemoryPool() { + if (memory_ != VK_NULL_HANDLE) { + vkFreeMemory(device_, memory_, kNoVkAllocCallbacks); + } +} + +// static +std::unique_ptr LinearVkMemoryPool::try_make( + Params params) { + params.pool_size = roundup_aligned(params.pool_size); + + VkMemoryAllocateInfo alloc_info{}; + alloc_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + alloc_info.allocationSize = params.pool_size; + const auto mem_type_index = find_memory_type(params); + if (!mem_type_index.has_value()) { + return nullptr; + } + alloc_info.memoryTypeIndex = mem_type_index.value(); + VkDeviceMemory mem; + if (vkAllocateMemory(params.device, &alloc_info, kNoVkAllocCallbacks, &mem) != + VK_SUCCESS) { + return nullptr; + } + return std::make_unique(params, mem, + alloc_info.memoryTypeIndex); +} + +std::unique_ptr LinearVkMemoryPool::alloc_and_bind( + VkDeviceSize buf_size) { + buf_size = roundup_aligned(buf_size); + if (pool_size_ <= (next_ + buf_size)) { + TI_WARN("Vulkan memory pool exhausted, max size={}", pool_size_); + return nullptr; + } + + VkBuffer buffer; + 
buffer_creation_template_.size = buf_size; + BAIL_ON_VK_BAD_RESULT(vkCreateBuffer(device_, &buffer_creation_template_, + kNoVkAllocCallbacks, &buffer), + "failed to create buffer"); + buffer_creation_template_.size = 0; // reset + const auto offset_in_mem = next_; + next_ += buf_size; + BAIL_ON_VK_BAD_RESULT( + vkBindBufferMemory(device_, buffer, memory_, offset_in_mem), + "failed to bind buffer to memory"); + + VkMemoryRequirements mem_requirements; + vkGetBufferMemoryRequirements(device_, buffer, &mem_requirements); + TI_ASSERT(mem_requirements.memoryTypeBits & (1 << memory_type_index_)); + TI_ASSERT_INFO((buf_size % mem_requirements.alignment) == 0, + "buf_size={} required alignment={}", buf_size, + mem_requirements.alignment); + return std::make_unique(device_, buffer, memory_, + buf_size, offset_in_mem); +} + +// static +std::optional LinearVkMemoryPool::find_memory_type( + const Params ¶ms) { + VkPhysicalDeviceMemoryProperties mem_properties; + vkGetPhysicalDeviceMemoryProperties(params.physical_device, &mem_properties); + auto satisfies = [&](int i) -> bool { + const auto &mem_type = mem_properties.memoryTypes[i]; + if ((mem_type.propertyFlags & params.required_properties) != + params.required_properties) { + return false; + } + if (mem_properties.memoryHeaps[mem_type.heapIndex].size <= + params.pool_size) { + return false; + } + return true; + }; + + for (int i = 0; i < mem_properties.memoryTypeCount; ++i) { + if (satisfies(i)) { + return i; + } + } + return std::nullopt; +} + +} // namespace vulkan +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/vulkan/vulkan_simple_memory_pool.h b/taichi/backends/vulkan/vulkan_simple_memory_pool.h new file mode 100644 index 000000000..9e1023f0a --- /dev/null +++ b/taichi/backends/vulkan/vulkan_simple_memory_pool.h @@ -0,0 +1,111 @@ +#pragma once + +#include +#include + +#include +#include + +namespace taichi { +namespace lang { +namespace vulkan { + +class VkBufferWithMemory { + public: + 
VkBufferWithMemory(VkDevice device, + VkBuffer buffer, + VkDeviceMemory mem, + VkDeviceSize size, + VkDeviceSize offset); + + // Just use std::unique_ptr to save all the trouble from crafting move ctors + // on our own + VkBufferWithMemory(const VkBufferWithMemory &) = delete; + VkBufferWithMemory &operator=(const VkBufferWithMemory &) = delete; + VkBufferWithMemory(VkBufferWithMemory &&) = delete; + VkBufferWithMemory &operator=(VkBufferWithMemory &&) = delete; + + ~VkBufferWithMemory(); + + VkBuffer buffer() const { + return buffer_; + } + + VkDeviceSize size() const { + return size_; + } + + VkDeviceSize offset_in_mem() const { + return offset_in_mem_; + } + + class Mapped { + public: + explicit Mapped(VkBufferWithMemory *buf) : buf_(buf), data_(nullptr) { + vkMapMemory(buf_->device_, buf_->backing_memory_, buf_->offset_in_mem(), + buf_->size(), /*flags=*/0, &data_); + } + + ~Mapped() { + vkUnmapMemory(buf_->device_, buf_->backing_memory_); + } + + void *data() const { + return data_; + } + + private: + VkBufferWithMemory *const buf_; // not owned + void *data_; + }; + + Mapped map_mem() { + return Mapped(this); + } + + private: + friend class Mapped; + + VkDevice device_{VK_NULL_HANDLE}; + VkBuffer buffer_{VK_NULL_HANDLE}; + VkDeviceMemory backing_memory_{VK_NULL_HANDLE}; + VkDeviceSize size_{0}; + VkDeviceSize offset_in_mem_{0}; +}; + +// TODO: Use +// https://gpuopen-librariesandsdks.github.io/VulkanMemoryAllocator/html/custom_memory_pools.html +class LinearVkMemoryPool { + public: + struct Params { + VkPhysicalDevice physical_device{VK_NULL_HANDLE}; + VkDevice device{VK_NULL_HANDLE}; + VkMemoryPropertyFlags required_properties; + VkDeviceSize pool_size{0}; + uint32_t compute_queue_family_index{0}; + VkBufferCreateInfo buffer_creation_template{}; + }; + + LinearVkMemoryPool(const Params ¶ms, VkDeviceMemory mem, uint32_t mti); + + ~LinearVkMemoryPool(); + + static std::unique_ptr try_make(Params params); + + std::unique_ptr alloc_and_bind(VkDeviceSize 
buf_size); + + private: + static std::optional find_memory_type(const Params ¶ms); + + VkDevice device_{VK_NULL_HANDLE}; // not owned + VkDeviceMemory memory_{VK_NULL_HANDLE}; + uint32_t memory_type_index_{0}; + uint32_t compute_queue_family_index_{0}; + VkBufferCreateInfo buffer_creation_template_{}; + VkDeviceSize pool_size_{0}; + VkDeviceSize next_{0}; +}; + +} // namespace vulkan +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/vulkan/vulkan_utils.cpp b/taichi/backends/vulkan/vulkan_utils.cpp new file mode 100644 index 000000000..d0fc7d60f --- /dev/null +++ b/taichi/backends/vulkan/vulkan_utils.cpp @@ -0,0 +1,54 @@ +#include "taichi/backends/vulkan/vulkan_utils.h" + +#include + +namespace taichi { +namespace lang { +namespace vulkan { + +std::vector GetInstanceExtensionProperties() { + constexpr char *kNoLayerName = nullptr; + uint32_t count = 0; + vkEnumerateInstanceExtensionProperties(kNoLayerName, &count, nullptr); + std::vector extensions(count); + vkEnumerateInstanceExtensionProperties(kNoLayerName, &count, + extensions.data()); + return extensions; +} + +std::vector GetDeviceExtensionProperties( + VkPhysicalDevice physicalDevice) { + constexpr char *kNoLayerName = nullptr; + uint32_t count = 0; + vkEnumerateDeviceExtensionProperties(physicalDevice, kNoLayerName, &count, + nullptr); + std::vector extensions(count); + vkEnumerateDeviceExtensionProperties(physicalDevice, kNoLayerName, &count, + extensions.data()); + return extensions; +} + +GlslToSpirvCompiler::GlslToSpirvCompiler(const ErrorHandler &err_handler) + : err_handler_(err_handler) { + opts_.SetTargetEnvironment(shaderc_target_env_vulkan, + VulkanEnvSettings::kShadercEnvVersion()); + opts_.SetOptimizationLevel(shaderc_optimization_level_performance); +} + +std::optional GlslToSpirvCompiler::compile( + const std::string &glsl_src, + const std::string &shader_name) { + auto spv_result = + compiler_.CompileGlslToSpv(glsl_src, shaderc_glsl_default_compute_shader, + 
/*input_file_name=*/shader_name.c_str(), + /*entry_point_name=*/"main", opts_); + if (spv_result.GetCompilationStatus() != shaderc_compilation_status_success) { + err_handler_(glsl_src, shader_name, spv_result.GetErrorMessage()); + return std::nullopt; + } + return SpirvBinary(spv_result.begin(), spv_result.end()); +} + +} // namespace vulkan +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/vulkan/vulkan_utils.h b/taichi/backends/vulkan/vulkan_utils.h new file mode 100644 index 000000000..6f01e9f1f --- /dev/null +++ b/taichi/backends/vulkan/vulkan_utils.h @@ -0,0 +1,52 @@ +#pragma once + +#include + +#include +#include +#include +#include +#include + +namespace taichi { +namespace lang { + +namespace vulkan { + +std::vector GetInstanceExtensionProperties(); + +std::vector GetDeviceExtensionProperties( + VkPhysicalDevice physicalDevice); + +class VulkanEnvSettings { + public: + static constexpr uint32_t kApiVersion() { + return VK_API_VERSION_1_0; + } + + static constexpr shaderc_env_version kShadercEnvVersion() { + return shaderc_env_version_vulkan_1_0; + } +}; + +class GlslToSpirvCompiler { + public: + using SpirvBinary = std::vector; + using ErrorHandler = std::function; + + explicit GlslToSpirvCompiler(const ErrorHandler &err_handler); + + std::optional compile(const std::string &glsl_src, + const std::string &shader_name); + + private: + shaderc::CompileOptions opts_; + shaderc::Compiler compiler_; + ErrorHandler err_handler_{nullptr}; +}; + +} // namespace vulkan +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/wasm/codegen_wasm.cpp b/taichi/backends/wasm/codegen_wasm.cpp index 1649f11e8..cf5d68224 100644 --- a/taichi/backends/wasm/codegen_wasm.cpp +++ b/taichi/backends/wasm/codegen_wasm.cpp @@ -13,6 +13,12 @@ namespace taichi { namespace lang { +namespace { +constexpr std::array kPreloadedFuncNames = { + "wasm_materialize", "wasm_set_kernel_parameter_i32", + "wasm_set_kernel_parameter_f32"}; +} + class 
CodeGenLLVMWASM : public CodeGenLLVM { public: using IRVisitor::visit; @@ -173,14 +179,16 @@ class CodeGenLLVMWASM : public CodeGenLLVM { ir->accept(this); finalize_taichi_kernel_function(); - auto wasm_materialize_name = "wasm_materialize"; - // compile_module_to_executable // only keep the current func TaichiLLVMContext::eliminate_unused_functions( - module.get(), [&](std::string func_name) { - return offloaded_task_name == func_name || - wasm_materialize_name == func_name; + module.get(), [offloaded_task_name](const std::string &func_name) { + for (auto &name : kPreloadedFuncNames) { + if (std::string(name) == func_name) { + return true; + } + } + return func_name == offloaded_task_name; }); tlctx->add_module(std::move(module)); auto kernel_symbol = tlctx->lookup_function_pointer(offloaded_task_name); @@ -199,9 +207,6 @@ FunctionType CodeGenWASM::codegen() { std::unique_ptr CodeGenWASM::modulegen( std::unique_ptr &&module) { - /* - TODO: move wasm_materialize to dump process in AOT. - */ bool init_flag = module == nullptr; std::vector name_list; @@ -211,8 +216,11 @@ std::unique_ptr CodeGenWASM::modulegen( gen->emit_to_module(); gen->finalize_taichi_kernel_function(); + // TODO: move the following functions to dump process in AOT. 
if (init_flag) { - name_list.emplace_back("wasm_materialize"); + for (auto &name : kPreloadedFuncNames) { + name_list.emplace_back(name); + } } gen->tlctx->jit->global_optimize_module(gen->module.get()); diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index ec2b99933..a02f3bdc2 100644 --- a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -375,6 +375,7 @@ class GlobalVariableExpression : public Expression { public: Identifier ident; DataType dt; + std::string name; SNode *snode; bool has_ambient; TypedConstant ambient_value; diff --git a/taichi/ir/type_utils.cpp b/taichi/ir/type_utils.cpp index dffe9ff99..7ba723994 100644 --- a/taichi/ir/type_utils.cpp +++ b/taichi/ir/type_utils.cpp @@ -25,17 +25,11 @@ std::string data_type_format(DataType dt) { } else if (dt->is_primitive(PrimitiveTypeID::u32)) { return "%u"; } else if (dt->is_primitive(PrimitiveTypeID::i64)) { -#if defined(TI_PLATFORM_UNIX) + // Use %lld on Windows. + // Discussion: https://github.com/taichi-dev/taichi/issues/2522 return "%lld"; -#else - return "%I64d"; -#endif } else if (dt->is_primitive(PrimitiveTypeID::u64)) { -#if defined(TI_PLATFORM_UNIX) return "%llu"; -#else - return "%I64u"; -#endif } else if (dt->is_primitive(PrimitiveTypeID::f32)) { return "%f"; } else if (dt->is_primitive(PrimitiveTypeID::f64)) { diff --git a/taichi/llvm/llvm_context.cpp b/taichi/llvm/llvm_context.cpp index 19c46efc9..9597a2523 100644 --- a/taichi/llvm/llvm_context.cpp +++ b/taichi/llvm/llvm_context.cpp @@ -230,13 +230,6 @@ void compile_runtime_bitcode(Arch arch) { } } -void compile_runtimes() { - compile_runtime_bitcode(host_arch()); -#if defined(TI_WITH_CUDA) - compile_runtime_bitcode(Arch::cuda); -#endif -} - std::string libdevice_path() { std::string folder; if (is_release()) { diff --git a/taichi/program/kernel.cpp b/taichi/program/kernel.cpp index a5e3ab019..cf884cbc4 100644 --- a/taichi/program/kernel.cpp +++ b/taichi/program/kernel.cpp @@ -276,25 +276,25 @@ Context 
&Kernel::LaunchContextBuilder::get_context() { float64 Kernel::get_ret_float(int i) { auto dt = rets[i].dt->get_compute_type(); if (dt->is_primitive(PrimitiveTypeID::f32)) { - return (float64)get_current_program().fetch_result(i); + return (float64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::f64)) { - return (float64)get_current_program().fetch_result(i); + return (float64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::i32)) { - return (float64)get_current_program().fetch_result(i); + return (float64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::i64)) { - return (float64)get_current_program().fetch_result(i); + return (float64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::i8)) { - return (float64)get_current_program().fetch_result(i); + return (float64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::i16)) { - return (float64)get_current_program().fetch_result(i); + return (float64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::u8)) { - return (float64)get_current_program().fetch_result(i); + return (float64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::u16)) { - return (float64)get_current_program().fetch_result(i); + return (float64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::u32)) { - return (float64)get_current_program().fetch_result(i); + return (float64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::u64)) { - return (float64)get_current_program().fetch_result(i); + return (float64)program->fetch_result(i); } else { TI_NOT_IMPLEMENTED } @@ -303,25 +303,25 @@ float64 Kernel::get_ret_float(int i) { int64 Kernel::get_ret_int(int i) { auto dt = rets[i].dt->get_compute_type(); if (dt->is_primitive(PrimitiveTypeID::i32)) { - return (int64)get_current_program().fetch_result(i); + return (int64)program->fetch_result(i); } else if 
(dt->is_primitive(PrimitiveTypeID::i64)) { - return (int64)get_current_program().fetch_result(i); + return (int64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::i8)) { - return (int64)get_current_program().fetch_result(i); + return (int64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::i16)) { - return (int64)get_current_program().fetch_result(i); + return (int64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::u8)) { - return (int64)get_current_program().fetch_result(i); + return (int64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::u16)) { - return (int64)get_current_program().fetch_result(i); + return (int64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::u32)) { - return (int64)get_current_program().fetch_result(i); + return (int64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::u64)) { - return (int64)get_current_program().fetch_result(i); + return (int64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::f32)) { - return (int64)get_current_program().fetch_result(i); + return (int64)program->fetch_result(i); } else if (dt->is_primitive(PrimitiveTypeID::f64)) { - return (int64)get_current_program().fetch_result(i); + return (int64)program->fetch_result(i); } else { TI_NOT_IMPLEMENTED } diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index 456c74bb3..b8c88894e 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -586,6 +586,10 @@ void Program::async_flush() { async_engine->flush(); } +int Program::get_snode_tree_size() { + return snode_trees_.size(); +} + std::string capitalize_first(std::string s) { s[0] = std::toupper(s[0]); return s; diff --git a/taichi/program/program.h b/taichi/program/program.h index c9c392a48..1eb7ae9ae 100644 --- a/taichi/program/program.h +++ b/taichi/program/program.h @@ -169,6 +169,8 @@ class Program { */ void materialize_runtime(); + 
int get_snode_tree_size(); + void visualize_layout(const std::string &fn); struct KernelProxy { diff --git a/taichi/program/snode_expr_utils.cpp b/taichi/program/snode_expr_utils.cpp index b83e7e7c0..02e7ee3c3 100644 --- a/taichi/program/snode_expr_utils.cpp +++ b/taichi/program/snode_expr_utils.cpp @@ -67,7 +67,11 @@ void place_child(Expr *expr_arg, } auto &child = parent->insert_children(SNodeType::place); glb_var_expr->set_snode(&child); - child.name = glb_var_expr->ident.raw_name(); + if (glb_var_expr->name == "") { + child.name = glb_var_expr->ident.raw_name(); + } else { + child.name = glb_var_expr->name; + } if (glb_var_expr->has_ambient) { glb_var_expr->snode->has_ambient = true; glb_var_expr->snode->ambient_val = glb_var_expr->ambient_value; diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 918cb24e6..ed469c619 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -55,7 +55,6 @@ void expr_assign(const Expr &lhs_, const Expr &rhs, std::string tb) { std::vector> scope_stack; -void compile_runtimes(); std::string libdevice_path(); std::string get_runtime_dir(); @@ -231,7 +230,10 @@ void export_lang(py::module &m) { .def("synchronize", &Program::synchronize) .def("async_flush", &Program::async_flush) .def("materialize_runtime", &Program::materialize_runtime) - .def("make_aot_module_builder", &Program::make_aot_module_builder); + .def("make_aot_module_builder", &Program::make_aot_module_builder) + .def("get_snode_tree_size", &Program::get_snode_tree_size) + .def("get_snode_root", &Program::get_snode_root, + py::return_value_policy::reference); py::class_(m, "AotModuleBuilder") .def("add", &AotModuleBuilder::add) @@ -276,6 +278,7 @@ void export_lang(py::module &m) { get_current_program().get_snode_to_glb_var_exprs()); }) .def("data_type", [](SNode *snode) { return snode->dt; }) + .def("name", [](SNode *snode) { return snode->name; }) .def("get_num_ch", [](SNode *snode) -> int { return 
(int)snode->ch.size(); }) .def( @@ -358,13 +361,25 @@ void export_lang(py::module &m) { [](Expr *expr) { return expr->is(); }) .def("is_external_var", [](Expr *expr) { return expr->is(); }) + .def("is_primal", + [](Expr *expr) { + return expr->cast()->is_primal; + }) .def("set_tb", &Expr::set_tb) + .def("set_name", + [&](Expr *expr, std::string na) { + expr->cast()->name = na; + }) .def("set_is_primal", [&](Expr *expr, bool v) { expr->cast()->is_primal = v; }) .def("set_grad", &Expr::set_grad) .def("set_attribute", &Expr::set_attribute) + .def("get_expr_name", + [](Expr *expr) { + return expr->cast()->name; + }) .def("get_attribute", &Expr::get_attribute) .def("get_raw_address", [](Expr *expr) { return (uint64)expr; }) .def("get_underlying_ptr_address", [](Expr *e) { @@ -744,7 +759,6 @@ void export_lang(py::module &m) { m.def("test_throw", [] { throw IRModified(); }); m.def("needs_grad", needs_grad); - m.def("compile_runtimes", compile_runtimes); m.def("libdevice_path", libdevice_path); m.def("host_arch", host_arch); diff --git a/taichi/runtime/llvm/runtime.cpp b/taichi/runtime/llvm/runtime.cpp index 43cb5a1ef..2888e7f19 100644 --- a/taichi/runtime/llvm/runtime.cpp +++ b/taichi/runtime/llvm/runtime.cpp @@ -1083,19 +1083,19 @@ i32 op_xor_i32(i32 a, i32 b) { return a ^ b; } -#define DEFINE_REDUCTION(op, dtype) \ - dtype warp_reduce_##op##_##dtype(dtype val) { \ - for (int offset = 16; offset > 0; offset /= 2) \ - val = op_##op##_##dtype( \ - val, cuda_shfl_down_sync_i32(0xFFFFFFFF, val, offset, 31)); \ - return val; \ - } \ - dtype reduce_##op##_##dtype(dtype *result, dtype val) { \ - dtype warp_result = warp_reduce_##op##_##dtype(val); \ - if ((thread_idx() & (warp_size() - 1)) == 0) { \ - atomic_##op##_##dtype(result, warp_result); \ - } \ - return val; \ +#define DEFINE_REDUCTION(op, dtype) \ + dtype warp_reduce_##op##_##dtype(dtype val) { \ + for (int offset = 16; offset > 0; offset /= 2) \ + val = op_##op##_##dtype( \ + val, 
cuda_shfl_down_sync_##dtype(0xFFFFFFFF, val, offset, 31)); \ + return val; \ + } \ + dtype reduce_##op##_##dtype(dtype *result, dtype val) { \ + dtype warp_result = warp_reduce_##op##_##dtype(val); \ + if ((thread_idx() & (warp_size() - 1)) == 0) { \ + atomic_##op##_##dtype(result, warp_result); \ + } \ + return val; \ } DEFINE_REDUCTION(add, i32); @@ -1751,6 +1751,14 @@ i32 wasm_materialize(Context *context) { (Ptr)((size_t)context->runtime->rand_states + sizeof(RandState)); return (i32)(size_t)context->runtime->roots[0]; } + +void wasm_set_kernel_parameter_i32(Context *context, int index, i32 value) { + *(i32 *)(&context->args[index]) = value; +} + +void wasm_set_kernel_parameter_f32(Context *context, int index, f32 value) { + *(f32 *)(&context->args[index]) = value; +} } #endif diff --git a/tests/conftest.py b/tests/python/conftest.py similarity index 100% rename from tests/conftest.py rename to tests/python/conftest.py diff --git a/tests/python/test_field.py b/tests/python/test_field.py index 363e71f5d..53dd5af13 100644 --- a/tests/python/test_field.py +++ b/tests/python/test_field.py @@ -92,3 +92,18 @@ def test_default_ip(dtype): x = ti.Vector.field(2, int, ()) assert x.dtype == ti.get_runtime().default_ip + + +@ti.test() +def test_field_name(): + a = ti.field(dtype=ti.f32, shape=(2, 3), name='a') + b = ti.Vector.field(3, dtype=ti.f32, shape=(2, 3), name='b') + c = ti.Matrix.field(3, 3, dtype=ti.f32, shape=(5, 4), name='c') + assert a.name == 'a' + assert b.name == 'b' + assert c.name == 'c' + assert b.snode.name == 'b' + d = [] + for i in range(10): + d.append(ti.field(dtype=ti.f32, shape=(2, 3), name=f'd{i}')) + assert d[i].name == f'd{i}' diff --git a/tests/python/test_fields_builder.py b/tests/python/test_fields_builder.py index f315b4692..a4edd92a7 100644 --- a/tests/python/test_fields_builder.py +++ b/tests/python/test_fields_builder.py @@ -13,18 +13,31 @@ def test_fields_with_shape(): def func(): for i in range(n): x[i] = i - for i in range(n): - assert 
x[i] == i + func() + + for i in range(n): + assert x[i] == i + + y = ti.field(ti.f32, [n]) + + @ti.kernel + def func2(): for i in range(n): - x[i] = i * 2 + y[i] = i * 2 for i in range(n): - assert x[i] == i * 2 + x[i] = i * 3 + + func2() + + for i in range(n): + assert x[i] == i * 3 + assert y[i] == i * 2 func() - with pytest.raises(InvalidOperationError, match='FieldsBuilder finalized'): - y = ti.field(ti.f32, [n]) + for i in range(n): + assert x[i] == i @ti.test(arch=[ti.cpu, ti.cuda]) @@ -72,7 +85,7 @@ def func2(): assert x[i] == i * 3 -@ti.test(arch=[ti.cpu, ti.cuda]) +@ti.test(arch=[ti.cpu, ti.cuda], use_unified_memory=True) def test_fields_builder_pointer(): n = 5 diff --git a/tests/python/test_loop_unique.py b/tests/python/test_loop_unique.py index c4a33d15d..7ee0cce48 100644 --- a/tests/python/test_loop_unique.py +++ b/tests/python/test_loop_unique.py @@ -1,7 +1,7 @@ import taichi as ti -@ti.test(require=ti.extension.sparse) +@ti.test(require=ti.extension.sparse, use_unified_memory=True) def test_loop_unique_simple_1d(): x, y = ti.field(ti.i32), ti.field(ti.i32) @@ -26,7 +26,7 @@ def inc_y(): assert y[i] == expected_result.get(i, 0) -@ti.test(require=ti.extension.sparse) +@ti.test(require=ti.extension.sparse, use_unified_memory=True) def test_loop_unique_binary_op_1d(): x, y = ti.field(ti.i32), ti.field(ti.i32) @@ -51,7 +51,7 @@ def inc_y(): assert y[i] == expected_result.get(i, 0) -@ti.test(require=ti.extension.sparse) +@ti.test(require=ti.extension.sparse, use_unified_memory=True) def test_loop_unique_nested_1d(): x, y = ti.field(ti.i32), ti.field(ti.i32) @@ -77,7 +77,7 @@ def inc_y(): assert y[i] == expected_result.get(i, 0) -@ti.test(require=ti.extension.sparse) +@ti.test(require=ti.extension.sparse, use_unified_memory=True) def test_loop_unique_2d(): x, y, z = ti.field(ti.i32), ti.field(ti.i32), ti.field(ti.i32) diff --git a/tests/python/test_mpm_particle_list.py b/tests/python/test_mpm_particle_list.py index 61e3febb0..de94bc4ed 100644 --- 
a/tests/python/test_mpm_particle_list.py +++ b/tests/python/test_mpm_particle_list.py @@ -33,7 +33,7 @@ def __init__(self, res): def build_pid(self): ti.block_dim(256) for p in self.x: - base = ti.floor(self.x[p] * self.inv_dx - 0.5).cast(int) + base = ti.floor(self.x[p] * self.inv_dx - 0.5).cast(int) + 1 ti.append(self.pid.parent(), base, p) def step(self): diff --git a/tests/python/test_no_grad.py b/tests/python/test_no_grad.py index ef6445130..4aba4558a 100644 --- a/tests/python/test_no_grad.py +++ b/tests/python/test_no_grad.py @@ -1,3 +1,6 @@ +import numpy as np +import pytest + import taichi as ti @@ -19,3 +22,23 @@ def func(): with ti.Tape(loss): func() + + +@ti.all_archs +def test_raise_no_gradient(): + y = ti.field(shape=(), name='y', dtype=ti.f64, needs_grad=True) + x = ti.field(shape=(), name='x', dtype=ti.f32) + z = np.array([1.0]) + + @ti.kernel + def func(x: ti.template()): + y[None] = x.grad[None] * x.grad[None] + z[0] = x.grad[None] + + x[None] = 5. + with pytest.raises(RuntimeError) as e: + func(x) + + assert e.type is RuntimeError + assert e.value.args[ + 0] == f"Gradient x.grad has not been placed, check whether `needs_grad=True`" diff --git a/tests/python/test_reduction.py b/tests/python/test_reduction.py index f390cbe6e..0b859665b 100644 --- a/tests/python/test_reduction.py +++ b/tests/python/test_reduction.py @@ -39,10 +39,18 @@ def _test_reduction_single(dtype, criterion, op): a = ti.field(dtype, shape=N) tot = ti.field(dtype, shape=()) - @ti.kernel - def fill(): - for i in a: - a[i] = i + if dtype in [ti.f32, ti.f64]: + + @ti.kernel + def fill(): + for i in a: + a[i] = i + 0.5 + else: + + @ti.kernel + def fill(): + for i in a: + a[i] = i ti_op = ti_ops[op] @@ -62,7 +70,8 @@ def reduce_tmp() -> dtype: reduce() tot2 = reduce_tmp() - ground_truth = np_ops[op](a.to_numpy()) + np_arr = np.append(a.to_numpy(), [0]) + ground_truth = np_ops[op](np_arr) assert criterion(tot[None], ground_truth) assert criterion(tot2, ground_truth) diff --git 
a/tests/python/test_sparse_basics.py b/tests/python/test_sparse_basics.py index 2cece4351..c6b27d73b 100644 --- a/tests/python/test_sparse_basics.py +++ b/tests/python/test_sparse_basics.py @@ -1,3 +1,5 @@ +import pytest + import taichi as ti @@ -75,3 +77,19 @@ def func(): assert s[None] == 5 * n print(x[257 + n * n * 7]) assert s[None] == 5 * n + + +@pytest.mark.skip(reason='https://github.com/taichi-dev/taichi/issues/2520') +@ti.test(require=ti.extension.sparse, use_unified_memory=False) +def test_pointer_direct_place(): + x, y = ti.field(ti.i32), ti.field(ti.i32) + + N = 1 + ti.root.pointer(ti.i, N).place(x) + ti.root.pointer(ti.i, N).place(y) + + @ti.kernel + def foo(): + pass + + foo() diff --git a/tests/python/test_tensor_reflection.py b/tests/python/test_tensor_reflection.py index 5bac5cdbb..cdae5af42 100644 --- a/tests/python/test_tensor_reflection.py +++ b/tests/python/test_tensor_reflection.py @@ -59,7 +59,7 @@ def test_unordered(): assert val.snode in blk3.get_children() assert blk3 in blk2.get_children() assert blk2 in blk1.get_children() - assert blk1 in ti.root.get_children() + assert blk1 in ti.FieldsBuilder.finalized_roots()[0].get_children() expected_str = f'ti.root => dense {[n]} => dense {[n, m]}' \ f' => dense {[n, m, p]} => place {[n, m, p]}'