From fae990ea3d4517e77b0c372fddfaaf2f70aff04b Mon Sep 17 00:00:00 2001 From: Paulo Valente <16843419+polvalente@users.noreply.github.com> Date: Thu, 24 Oct 2024 23:34:34 -0300 Subject: [PATCH 1/8] fix: make exla build resilient to stale upgrades --- exla/Makefile | 2 +- exla/README.md | 16 ++++++++++++++++ exla/lib/exla/nif.ex | 15 ++++++++++++++- exla/mix.exs | 38 +++++++++++++++++++++++++++++++------- 4 files changed, 62 insertions(+), 9 deletions(-) diff --git a/exla/Makefile b/exla/Makefile index a371447d10..ada1bb7001 100644 --- a/exla/Makefile +++ b/exla/Makefile @@ -9,7 +9,7 @@ XLA_INCLUDE_PATH = $(XLA_EXTENSION_DIR)/include # Cache configuration EXLA_CACHE_SO = cache/libexla.so -EXLA_CACHE_OBJ_DIR = cache/objs +EXLA_CACHE_OBJ_DIR = cache/$(EXLA_VERSION)/objs # Private configuration EXLA_DIR = c_src/exla diff --git a/exla/README.md b/exla/README.md index 3091555796..938865bb47 100644 --- a/exla/README.md +++ b/exla/README.md @@ -48,6 +48,22 @@ EXLA relies on the [XLA](https://github.com/elixir-nx/xla) package to provide th For cross-compilation, you need to [set your `XLA_TARGET_PLATFORM` variable](https://github.com/elixir-nx/xla#xla_target_platform) to the correct target platform value (i.e. `aarch64-linux-gnu` for the Raspberry Pi 4). +## Troubleshooting + +EXLA uses NIFs (C-interface code called from Elixir) for part of its functionality. +If for any reason these fail to compile or load, troubleshooting can be tricky. + +We recommend following the steps below: + + 1. If the error appeared after upgrading EXLA, ensure that you have the proper dependency versions given by [XLA](https://github.com/elixir-nx/xla). Afterwards, compile with `mix compile --clean-libexla-cache --force` to clean up all cached files. + - Besides the XLA dependency versions, ensuring `gcc` (or your compiler of choice), `libc` and `make` are compatible is also important. + - Remember to save the compilation logs from this step for further debugging. 
+ - It is a good idea to save the `cache/libexla.so` file so that the team can inspect its contents if needed. + 2. If the error persists, look for the `** (RuntimeError) Failed to load NIF library.` exception on application start-up. + This exception should provide more information on what's the issue when loading the NIF. Share these logs in an issue on GitHub + so that the Nx team can investigate further. + + ## Contributing ### Building locally diff --git a/exla/lib/exla/nif.ex b/exla/lib/exla/nif.ex index be0567cc0a..7fb09c7979 100644 --- a/exla/lib/exla/nif.ex +++ b/exla/lib/exla/nif.ex @@ -4,7 +4,20 @@ defmodule EXLA.NIF do def __on_load__ do path = :filename.join(:code.priv_dir(:exla), ~c"libexla") - :erlang.load_nif(path, 0) + + case :erlang.load_nif(path, 0) do + :ok -> + :ok + + {:error, {reason, text}} -> + raise """ + Failed to load NIF library. + Follow the steps in the :exla README Troubleshooting section for more information. + + #{reason} + #{text} + """ + end end def mlir_new_thread_pool(_concurrency), do: :erlang.nif_error(:undef) diff --git a/exla/mix.exs b/exla/mix.exs index 184a48cb94..06ed647124 100644 --- a/exla/mix.exs +++ b/exla/mix.exs @@ -35,7 +35,8 @@ defmodule EXLA.MixProject do %{ "MIX_BUILD_EMBEDDED" => "#{Mix.Project.config()[:build_embedded]}", - "CWD_RELATIVE_TO_PRIV_PATH" => cwd_relative_to_priv + "CWD_RELATIVE_TO_PRIV_PATH" => cwd_relative_to_priv, + "EXLA_VERSION" => "#{@version}" } end, make_args: make_args @@ -133,7 +134,23 @@ defmodule EXLA.MixProject do {:ok, []} end - defp cached_make(_) do + defp cached_make(args) do + {parsed, _args, _invalid} = + OptionParser.parse(args, strict: [clean_libexla_cache: :boolean, force: :boolean]) + + clean_libexla_cache? = parsed[:clean_libexla_cache] == true + force? = parsed[:force] == true + + if force? do + Mix.shell().info("Removing cached .o files in cache/#{@version}/objs") + File.rm_rf!("cache/#{@version}/objs") + end + + if clean_libexla_cache? or force? 
do + Mix.shell().info("Removing cached libexla.so files in cache/libexla.so") + File.rm_rf!("cache/libexla.so") + end + contents = for path <- Path.wildcard("c_src/**/*"), {:ok, contents} <- [File.read(path)], @@ -150,14 +167,21 @@ defmodule EXLA.MixProject do cached_so = Path.join([xla_cache_dir(), "exla", cache_key, "libexla.so"]) cached? = File.exists?(cached_so) - if cached? do - Mix.shell().info("Using libexla.so from #{cached_so}") - File.cp!(cached_so, "cache/libexla.so") + cond do + cached? and clean_libexla_cache? -> + Mix.shell().info("Removing libexla.so cache at #{cached_so}") + + cached? -> + Mix.shell().info("Using libexla.so from #{cached_so}") + File.cp!(cached_so, "cache/libexla.so") + + true -> + :ok end - result = Mix.Tasks.Compile.ElixirMake.run([]) + result = Mix.Tasks.Compile.ElixirMake.run(args) - if not cached? and match?({:ok, _}, result) do + if (not cached? or clean_libexla_cache?) and match?({:ok, _}, result) do Mix.shell().info("Caching libexla.so at #{cached_so}") File.mkdir_p!(Path.dirname(cached_so)) File.cp!("cache/libexla.so", cached_so) From 02f5f592adee3b5dad19b1fe83c3525590ee770e Mon Sep 17 00:00:00 2001 From: Paulo Valente <16843419+polvalente@users.noreply.github.com> Date: Thu, 24 Oct 2024 23:36:23 -0300 Subject: [PATCH 2/8] docs: markdown syntax --- exla/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/exla/README.md b/exla/README.md index 938865bb47..406e58db04 100644 --- a/exla/README.md +++ b/exla/README.md @@ -56,9 +56,9 @@ If for any reason these fail to compile or load, troubleshooting can be tricky. We recommend following the steps below: 1. If the error appeared after upgrading EXLA, ensure that you have the proper dependency versions given by [XLA](https://github.com/elixir-nx/xla). Afterwards, compile with `mix compile --clean-libexla-cache --force` to clean up all cached files. 
- - Besides the XLA dependency versions, ensuring `gcc` (or your compiler of choice), `libc` and `make` are compatible is also important. - - Remember to save the compilation logs from this step for further debugging. - - It is a good idea to save the `cache/libexla.so` file so that the team can inspect its contents if needed. + * Besides the XLA dependency versions, ensuring `gcc` (or your compiler of choice), `libc` and `make` are compatible is also important. + * Remember to save the compilation logs from this step for further debugging. + * It is a good idea to save the `cache/libexla.so` file so that the team can inspect its contents if needed. 2. If the error persists, look for the `** (RuntimeError) Failed to load NIF library.` exception on application start-up. This exception should provide more information on what's the issue when loading the NIF. Share these logs in an issue on GitHub so that the Nx team can investigate further. From d7fa1f9a317a2ef8e76f50953e315316aa40621f Mon Sep 17 00:00:00 2001 From: Paulo Valente <16843419+polvalente@users.noreply.github.com> Date: Mon, 28 Oct 2024 02:45:17 -0300 Subject: [PATCH 3/8] changes due to code review --- exla/Makefile | 2 +- exla/README.md | 2 +- exla/mix.exs | 15 +++++++-------- exla/test/exla/device_memory_sharing_test.exs | 9 ++++----- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/exla/Makefile b/exla/Makefile index ada1bb7001..695c7f9409 100644 --- a/exla/Makefile +++ b/exla/Makefile @@ -8,7 +8,7 @@ XLA_EXTENSION_LIB = $(XLA_EXTENSION_DIR)/lib XLA_INCLUDE_PATH = $(XLA_EXTENSION_DIR)/include # Cache configuration -EXLA_CACHE_SO = cache/libexla.so +EXLA_CACHE_SO = cache/$(EXLA_VERSION)/libexla.so EXLA_CACHE_OBJ_DIR = cache/$(EXLA_VERSION)/objs # Private configuration diff --git a/exla/README.md b/exla/README.md index 406e58db04..fe46bdf56f 100644 --- a/exla/README.md +++ b/exla/README.md @@ -58,7 +58,7 @@ We recommend following the steps below: 1. 
If the error appeared after upgrading EXLA, ensure that you have the proper dependency versions given by [XLA](https://github.com/elixir-nx/xla). Afterwards, compile with `mix compile --clean-libexla-cache --force` to clean up all cached files. * Besides the XLA dependency versions, ensuring `gcc` (or your compiler of choice), `libc` and `make` are compatible is also important. * Remember to save the compilation logs from this step for further debugging. - * It is a good idea to save the `cache/libexla.so` file so that the team can inspect its contents if needed. + * It is a good idea to save the `cache//libexla.so` file so that the team can inspect its contents if needed. 2. If the error persists, look for the `** (RuntimeError) Failed to load NIF library.` exception on application start-up. This exception should provide more information on what's the issue when loading the NIF. Share these logs in an issue on GitHub so that the Nx team can investigate further. diff --git a/exla/mix.exs b/exla/mix.exs index 06ed647124..936f540db4 100644 --- a/exla/mix.exs +++ b/exla/mix.exs @@ -135,11 +135,10 @@ defmodule EXLA.MixProject do end defp cached_make(args) do - {parsed, _args, _invalid} = - OptionParser.parse(args, strict: [clean_libexla_cache: :boolean, force: :boolean]) + clean_libexla_cache? = System.get_env("EXLA_CLEAN_LIBEXLA_CACHE") in ["1", "true"] + force? = System.get_env("EXLA_FORCE") in ["1", "true"] - clean_libexla_cache? = parsed[:clean_libexla_cache] == true - force? = parsed[:force] == true + File.mkdir_p!("cache/#{@version}") if force? do Mix.shell().info("Removing cached .o files in cache/#{@version}/objs") @@ -147,8 +146,8 @@ defmodule EXLA.MixProject do end if clean_libexla_cache? or force? 
do - Mix.shell().info("Removing cached libexla.so files in cache/libexla.so") - File.rm_rf!("cache/libexla.so") + Mix.shell().info("Removing cached libexla.so file in cache/#{@version}/libexla.so") + File.rm_rf!("cache/#{@version}/libexla.so") end contents = @@ -173,7 +172,7 @@ defmodule EXLA.MixProject do cached? -> Mix.shell().info("Using libexla.so from #{cached_so}") - File.cp!(cached_so, "cache/libexla.so") + File.cp!(cached_so, "cache/#{@version}/libexla.so") true -> :ok @@ -184,7 +183,7 @@ defmodule EXLA.MixProject do if (not cached? or clean_libexla_cache?) and match?({:ok, _}, result) do Mix.shell().info("Caching libexla.so at #{cached_so}") File.mkdir_p!(Path.dirname(cached_so)) - File.cp!("cache/libexla.so", cached_so) + File.cp!("cache/#{@version}/libexla.so", cached_so) end result diff --git a/exla/test/exla/device_memory_sharing_test.exs b/exla/test/exla/device_memory_sharing_test.exs index e986ea1ff8..09e54a42eb 100644 --- a/exla/test/exla/device_memory_sharing_test.exs +++ b/exla/test/exla/device_memory_sharing_test.exs @@ -27,14 +27,13 @@ defmodule EXLA.DeviceMemorySharingTest do end @tag :cuda_required - test "ipc handles don't crash the runtime when :local mode is selected" do - assert {:error, ~c"Invalid pointer size for selected mode."} == + test "invalid ipc handles don't crash the runtime" do + assert {:error, ~c"Unable to get pointer for IPC handle."} == Nx.from_pointer( {EXLA.Backend, client: :cuda}, - Enum.to_list(0..63), + %Nx.Pointer{handle: "#{System.unique_integer()}", kind: :ipc, data_size: 4}, {:f, 32}, - {1}, - mode: :local + {1} ) end end From 0c302ebf361a36246486a5bada92ac557c80efc6 Mon Sep 17 00:00:00 2001 From: Paulo Valente <16843419+polvalente@users.noreply.github.com> Date: Mon, 28 Oct 2024 02:52:15 -0300 Subject: [PATCH 4/8] docs: update readme --- exla/README.md | 6 +++++- exla/mix.exs | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/exla/README.md b/exla/README.md index fe46bdf56f..704d10abb1 100644 --- 
a/exla/README.md +++ b/exla/README.md @@ -55,7 +55,11 @@ If for any reason these fail to compile or load, troubleshooting can be tricky. We recommend following the steps below: - 1. If the error appeared after upgrading EXLA, ensure that you have the proper dependency versions given by [XLA](https://github.com/elixir-nx/xla). Afterwards, compile with `mix compile --clean-libexla-cache --force` to clean up all cached files. + 1. If the error appeared after upgrading EXLA, ensure that you have the proper dependency versions given by [XLA](https://github.com/elixir-nx/xla). Afterwards, compile with `mix compile` after setting the following environment variables to `1` or `true` to clean up all cached files: + * `EXLA_CLEAN_LIBEXLA_CACHE`: Removes the libexla.so caches (both local and global ones). + * `EXLA_FORCE`: Removes the intermediate `.o` compilation artifacts retained from previous builds. + + Additional notes on compilation: * Besides the XLA dependency versions, ensuring `gcc` (or your compiler of choice), `libc` and `make` are compatible is also important. * Remember to save the compilation logs from this step for further debugging. * It is a good idea to save the `cache//libexla.so` file so that the team can inspect its contents if needed. diff --git a/exla/mix.exs b/exla/mix.exs index 936f540db4..7a9d676b84 100644 --- a/exla/mix.exs +++ b/exla/mix.exs @@ -169,6 +169,7 @@ defmodule EXLA.MixProject do cond do cached? and clean_libexla_cache? -> Mix.shell().info("Removing libexla.so cache at #{cached_so}") + File.rm!(cached_so) cached? 
-> Mix.shell().info("Using libexla.so from #{cached_so}") From 3da43e9b53c82b96f2a5346a3743982cfbe72bed Mon Sep 17 00:00:00 2001 From: Paulo Valente <16843419+polvalente@users.noreply.github.com> Date: Mon, 28 Oct 2024 02:53:38 -0300 Subject: [PATCH 5/8] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: José Valim Co-authored-by: Jonatan Kłosko --- exla/Makefile | 2 +- exla/lib/exla/nif.ex | 2 +- exla/mix.exs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/exla/Makefile b/exla/Makefile index ada1bb7001..695c7f9409 100644 --- a/exla/Makefile +++ b/exla/Makefile @@ -8,7 +8,7 @@ XLA_EXTENSION_LIB = $(XLA_EXTENSION_DIR)/lib XLA_INCLUDE_PATH = $(XLA_EXTENSION_DIR)/include # Cache configuration -EXLA_CACHE_SO = cache/libexla.so +EXLA_CACHE_SO = cache/$(EXLA_VERSION)/libexla.so EXLA_CACHE_OBJ_DIR = cache/$(EXLA_VERSION)/objs # Private configuration diff --git a/exla/lib/exla/nif.ex b/exla/lib/exla/nif.ex index 7fb09c7979..023a0bcbd2 100644 --- a/exla/lib/exla/nif.ex +++ b/exla/lib/exla/nif.ex @@ -14,7 +14,7 @@ defmodule EXLA.NIF do Failed to load NIF library. Follow the steps in the :exla README Troubleshooting section for more information. - #{reason} + #{inspect(reason)} #{text} """ end diff --git a/exla/mix.exs b/exla/mix.exs index 06ed647124..9692e4748c 100644 --- a/exla/mix.exs +++ b/exla/mix.exs @@ -147,7 +147,7 @@ defmodule EXLA.MixProject do end if clean_libexla_cache? or force? 
do - Mix.shell().info("Removing cached libexla.so files in cache/libexla.so") + Mix.shell().info("Removing cached libexla.so file in cache/libexla.so") File.rm_rf!("cache/libexla.so") end From 2866799c4f656c144394275b1a82ce1002c5028d Mon Sep 17 00:00:00 2001 From: Paulo Valente <16843419+polvalente@users.noreply.github.com> Date: Mon, 28 Oct 2024 05:07:27 -0300 Subject: [PATCH 6/8] refactor: single env var --- exla/README.md | 6 +++--- exla/mix.exs | 55 +++++++++++++++++++++++++++++++++----------------- 2 files changed, 39 insertions(+), 22 deletions(-) diff --git a/exla/README.md b/exla/README.md index 704d10abb1..2ffe144ff7 100644 --- a/exla/README.md +++ b/exla/README.md @@ -55,9 +55,9 @@ If for any reason these fail to compile or load, troubleshooting can be tricky. We recommend following the steps below: - 1. If the error appeared after upgrading EXLA, ensure that you have the proper dependency versions given by [XLA](https://github.com/elixir-nx/xla). Afterwards, compile with `mix compile` after setting the following environment variables to `1` or `true` to clean up all cached files: - * `EXLA_CLEAN_LIBEXLA_CACHE`: Removes the libexla.so caches (both local and global ones). - * `EXLA_FORCE`: Removes the intermediate `.o` compilation artifacts retained from previous builds. + 1. If the error appeared after upgrading EXLA, ensure that you have the proper dependency versions given by [XLA](https://github.com/elixir-nx/xla). Afterwards, compile with `mix compile` after setting `EXLA_FORCE_REBUILD` to clean up cached files: + * `EXLA_FORCE_REBUILD=partial`: Removes only the libexla.so caches (both local and global ones). + * `EXLA_FORCE_REBUILD=true`: Removes the libexla.so caches but also removes the intermediate `.o` compilation artifacts retained from previous builds. Additional notes on compilation: * Besides the XLA dependency versions, ensuring `gcc` (or your compiler of choice), `libc` and `make` are compatible is also important. 
diff --git a/exla/mix.exs b/exla/mix.exs index 7a9d676b84..52f382d47a 100644 --- a/exla/mix.exs +++ b/exla/mix.exs @@ -135,21 +135,37 @@ defmodule EXLA.MixProject do end defp cached_make(args) do - clean_libexla_cache? = System.get_env("EXLA_CLEAN_LIBEXLA_CACHE") in ["1", "true"] - force? = System.get_env("EXLA_FORCE") in ["1", "true"] + force_rebuild_mode = + case System.get_env("EXLA_FORCE_REBUILD") do + "" -> + false + + "0" -> + false + + "partial" -> + :partial + + "true" -> + true + + "1" -> + true + + value -> + Mix.raise( + "invalid value for EXLA_FORCE_REBUILD: #{value}. Expected one of: partial, true" + ) + end File.mkdir_p!("cache/#{@version}") - if force? do + # remove only in full mode + if force_rebuild_mode == true do Mix.shell().info("Removing cached .o files in cache/#{@version}/objs") File.rm_rf!("cache/#{@version}/objs") end - if clean_libexla_cache? or force? do - Mix.shell().info("Removing cached libexla.so file in cache/#{@version}/libexla.so") - File.rm_rf!("cache/#{@version}/libexla.so") - end - contents = for path <- Path.wildcard("c_src/**/*"), {:ok, contents} <- [File.read(path)], @@ -164,24 +180,25 @@ defmodule EXLA.MixProject do "elixir-#{System.version()}-erts-#{:erlang.system_info(:version)}-xla-#{Application.spec(:xla, :vsn)}-exla-#{@version}-#{md5}" cached_so = Path.join([xla_cache_dir(), "exla", cache_key, "libexla.so"]) - cached? = File.exists?(cached_so) + cached? = File.exists?(cached_so) and force_rebuild_mode == false - cond do - cached? and clean_libexla_cache? -> - Mix.shell().info("Removing libexla.so cache at #{cached_so}") - File.rm!(cached_so) + # remove in both partial and full modes + if force_rebuild_mode do + Mix.shell().info("Removing cached libexla.so file in cache/#{@version}/libexla.so") + File.rm_rf!("cache/#{@version}/libexla.so") - cached? 
-> - Mix.shell().info("Using libexla.so from #{cached_so}") - File.cp!(cached_so, "cache/#{@version}/libexla.so") + Mix.shell().info("Removing libexla.so cache at #{cached_so}") + File.rm!(cached_so) + end - true -> - :ok + if cached? do + Mix.shell().info("Using libexla.so from #{cached_so}") + File.cp!(cached_so, "cache/#{@version}/libexla.so") end result = Mix.Tasks.Compile.ElixirMake.run(args) - if (not cached? or clean_libexla_cache?) and match?({:ok, _}, result) do + if not cached? and match?({:ok, _}, result) do Mix.shell().info("Caching libexla.so at #{cached_so}") File.mkdir_p!(Path.dirname(cached_so)) File.cp!("cache/#{@version}/libexla.so", cached_so) From ad725e4e81d98f61bac8c61055dcd4db54906453 Mon Sep 17 00:00:00 2001 From: Paulo Valente <16843419+polvalente@users.noreply.github.com> Date: Mon, 28 Oct 2024 05:10:51 -0300 Subject: [PATCH 7/8] fix: missing default value --- exla/mix.exs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/exla/mix.exs b/exla/mix.exs index 52f382d47a..32fa11c467 100644 --- a/exla/mix.exs +++ b/exla/mix.exs @@ -136,7 +136,7 @@ defmodule EXLA.MixProject do defp cached_make(args) do force_rebuild_mode = - case System.get_env("EXLA_FORCE_REBUILD") do + case System.get_env("EXLA_FORCE_REBUILD", "") do "" -> false @@ -154,7 +154,7 @@ defmodule EXLA.MixProject do value -> Mix.raise( - "invalid value for EXLA_FORCE_REBUILD: #{value}. Expected one of: partial, true" + "invalid value for EXLA_FORCE_REBUILD: '#{value}'. 
Expected one of: partial, true" ) end From 7483d63e6e5cb91079d55124c90dd23dfef68532 Mon Sep 17 00:00:00 2001 From: Paulo Valente <16843419+polvalente@users.noreply.github.com> Date: Mon, 28 Oct 2024 17:38:21 -0300 Subject: [PATCH 8/8] refactor: explicit modes --- exla/mix.exs | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/exla/mix.exs b/exla/mix.exs index 32fa11c467..17ef1915d4 100644 --- a/exla/mix.exs +++ b/exla/mix.exs @@ -138,19 +138,19 @@ defmodule EXLA.MixProject do force_rebuild_mode = case System.get_env("EXLA_FORCE_REBUILD", "") do "" -> - false + :none "0" -> - false + :none "partial" -> :partial "true" -> - true + :full "1" -> - true + :full value -> Mix.raise( @@ -161,7 +161,7 @@ defmodule EXLA.MixProject do File.mkdir_p!("cache/#{@version}") # remove only in full mode - if force_rebuild_mode == true do + if force_rebuild_mode in [:partial, :full] do Mix.shell().info("Removing cached .o files in cache/#{@version}/objs") File.rm_rf!("cache/#{@version}/objs") end @@ -180,10 +180,9 @@ defmodule EXLA.MixProject do "elixir-#{System.version()}-erts-#{:erlang.system_info(:version)}-xla-#{Application.spec(:xla, :vsn)}-exla-#{@version}-#{md5}" cached_so = Path.join([xla_cache_dir(), "exla", cache_key, "libexla.so"]) - cached? = File.exists?(cached_so) and force_rebuild_mode == false + cached? = File.exists?(cached_so) and force_rebuild_mode == :none - # remove in both partial and full modes - if force_rebuild_mode do + if force_rebuild_mode in [:partial, :full] do Mix.shell().info("Removing cached libexla.so file in cache/#{@version}/libexla.so") File.rm_rf!("cache/#{@version}/libexla.so")