Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions exla/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ XLA_EXTENSION_LIB = $(XLA_EXTENSION_DIR)/lib
XLA_INCLUDE_PATH = $(XLA_EXTENSION_DIR)/include

# Cache configuration
EXLA_CACHE_SO = cache/libexla.so
EXLA_CACHE_OBJ_DIR = cache/objs
EXLA_CACHE_SO = cache/$(EXLA_VERSION)/libexla.so
EXLA_CACHE_OBJ_DIR = cache/$(EXLA_VERSION)/objs

# Private configuration
EXLA_DIR = c_src/exla
Expand Down
20 changes: 20 additions & 0 deletions exla/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,26 @@ EXLA relies on the [XLA](https://github.com/elixir-nx/xla) package to provide th

For cross-compilation, you need to [set your `XLA_TARGET_PLATFORM` variable](https://github.com/elixir-nx/xla#xla_target_platform) to the correct target platform value (i.e. `aarch64-linux-gnu` for the Raspberry Pi 4).

## Troubleshooting

EXLA uses NIFs (C-interface code called from Elixir) for part of its functionality.
If for any reason these fail to compile or load, troubleshooting can be tricky.

We recommend following the steps below:

1. If the error appeared after upgrading EXLA, ensure that you have the proper dependency versions given by [XLA](https://github.com/elixir-nx/xla). Afterwards, compile with `mix compile` after setting `EXLA_FORCE_REBUILD` to clean up cached files:
* `EXLA_FORCE_REBUILD=partial`: Removes only the libexla.so caches (both local and global ones).
* `EXLA_FORCE_REBUILD=true`: Removes the libexla.so caches but also removes the intermediate `.o` compilation artifacts retained from previous builds.

Additional notes on compilation:
* Besides the XLA dependency versions, ensuring `gcc` (or your compiler of choice), `libc` and `make` are compatible is also important.
* Remember to save the compilation logs from this step for further debugging.
* It is a good idea to save the `cache/<version>/libexla.so` file so that the team can inspect its contents if needed.
2. If the error persists, look for the `** (RuntimeError) Failed to load NIF library.` exception on application start-up.
This exception should provide more information about what went wrong when loading the NIF. Share these logs in an issue on GitHub
so that the Nx team can investigate further.


## Contributing

### Building locally
Expand Down
15 changes: 14 additions & 1 deletion exla/lib/exla/nif.ex
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,20 @@ defmodule EXLA.NIF do

# Loads the `libexla` shared library found in the `:exla` priv directory as the
# NIF implementation for this module.
#
# Returns `:ok` when the library loads. When loading fails, raises a
# `RuntimeError` that includes the reason and text reported by
# `:erlang.load_nif/2` and points to the README Troubleshooting section.
def __on_load__ do
  path = :filename.join(:code.priv_dir(:exla), ~c"libexla")

  # The library is loaded with a single call whose result is inspected, so a
  # failure is never silently discarded.
  case :erlang.load_nif(path, 0) do
    :ok ->
      :ok

    {:error, {reason, text}} ->
      raise """
      Failed to load NIF library.
      Follow the steps in the :exla README Troubleshooting section for more information.

      #{inspect(reason)}
      #{text}
      """
  end
end

def mlir_new_thread_pool(_concurrency), do: :erlang.nif_error(:undef)
Expand Down
52 changes: 46 additions & 6 deletions exla/mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ defmodule EXLA.MixProject do

%{
"MIX_BUILD_EMBEDDED" => "#{Mix.Project.config()[:build_embedded]}",
"CWD_RELATIVE_TO_PRIV_PATH" => cwd_relative_to_priv
"CWD_RELATIVE_TO_PRIV_PATH" => cwd_relative_to_priv,
"EXLA_VERSION" => "#{@version}"
}
end,
make_args: make_args
Expand Down Expand Up @@ -133,7 +134,38 @@ defmodule EXLA.MixProject do
{:ok, []}
end

defp cached_make(_) do
defp cached_make(args) do
force_rebuild_mode =
case System.get_env("EXLA_FORCE_REBUILD", "") do
"" ->
:none

"0" ->
:none

"partial" ->
:partial

"true" ->
:full

"1" ->
:full

value ->
Mix.raise(
"invalid value for EXLA_FORCE_REBUILD: '#{value}'. Expected one of: partial, true"
)
end

File.mkdir_p!("cache/#{@version}")

# remove only in full mode
if force_rebuild_mode in [:partial, :full] do
Mix.shell().info("Removing cached .o files in cache/#{@version}/objs")
File.rm_rf!("cache/#{@version}/objs")
end

contents =
for path <- Path.wildcard("c_src/**/*"),
{:ok, contents} <- [File.read(path)],
Expand All @@ -148,19 +180,27 @@ defmodule EXLA.MixProject do
"elixir-#{System.version()}-erts-#{:erlang.system_info(:version)}-xla-#{Application.spec(:xla, :vsn)}-exla-#{@version}-#{md5}"

cached_so = Path.join([xla_cache_dir(), "exla", cache_key, "libexla.so"])
cached? = File.exists?(cached_so)
cached? = File.exists?(cached_so) and force_rebuild_mode == :none

if force_rebuild_mode in [:partial, :full] do
Mix.shell().info("Removing cached libexla.so file in cache/#{@version}/libexla.so")
File.rm_rf!("cache/#{@version}/libexla.so")

Mix.shell().info("Removing libexla.so cache at #{cached_so}")
File.rm!(cached_so)
end

if cached? do
Mix.shell().info("Using libexla.so from #{cached_so}")
File.cp!(cached_so, "cache/libexla.so")
File.cp!(cached_so, "cache/#{@version}/libexla.so")
end

result = Mix.Tasks.Compile.ElixirMake.run([])
result = Mix.Tasks.Compile.ElixirMake.run(args)

if not cached? and match?({:ok, _}, result) do
Mix.shell().info("Caching libexla.so at #{cached_so}")
File.mkdir_p!(Path.dirname(cached_so))
File.cp!("cache/libexla.so", cached_so)
File.cp!("cache/#{@version}/libexla.so", cached_so)
end

result
Expand Down
9 changes: 4 additions & 5 deletions exla/test/exla/device_memory_sharing_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,13 @@ defmodule EXLA.DeviceMemorySharingTest do
end

@tag :cuda_required
test "ipc handles don't crash the runtime when :local mode is selected" do
assert {:error, ~c"Invalid pointer size for selected mode."} ==
test "invalid ipc handles don't crash the runtime" do
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test was breaking locally (I guess I hadn't run it on my desktop since the update 😛)

assert {:error, ~c"Unable to get pointer for IPC handle."} ==
Nx.from_pointer(
{EXLA.Backend, client: :cuda},
Enum.to_list(0..63),
%Nx.Pointer{handle: "#{System.unique_integer()}", kind: :ipc, data_size: 4},
{:f, 32},
{1},
mode: :local
{1}
)
end
end
Loading