Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions exla/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ XLA_EXTENSION_LIB = $(XLA_EXTENSION_DIR)/lib
XLA_INCLUDE_PATH = $(XLA_EXTENSION_DIR)/include

# Cache configuration
EXLA_CACHE_SO = cache/libexla.so
EXLA_CACHE_OBJ_DIR = cache/objs
EXLA_CACHE_SO = cache/$(EXLA_VERSION)/libexla.so
EXLA_CACHE_OBJ_DIR = cache/$(EXLA_VERSION)/objs

# Private configuration
EXLA_DIR = c_src/exla
Expand Down
20 changes: 20 additions & 0 deletions exla/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,26 @@ EXLA relies on the [XLA](https://github.com/elixir-nx/xla) package to provide th

For cross-compilation, you need to [set your `XLA_TARGET_PLATFORM` variable](https://github.com/elixir-nx/xla#xla_target_platform) to the correct target platform value (i.e. `aarch64-linux-gnu` for the Raspberry Pi 4).

## Troubleshooting

EXLA uses NIFs (C-interface code called from Elixir) for part of its functionality.
If for any reason these fail to compile or load, troubleshooting can be tricky.

We recommend following the steps below:

1. If the error appeared after upgrading EXLA, ensure that you have the proper dependency versions given by [XLA](https://github.com/elixir-nx/xla). Afterwards, compile with `mix compile` after setting `EXLA_FORCE_REBUILD` to clean up cached files:
* `EXLA_FORCE_REBUILD=partial`: Removes only the libexla.so caches (both local and global ones).
* `EXLA_FORCE_REBUILD=true`: Removes the libexla.so caches but also removes the intermediate `.o` compilation artifacts retained from previous builds.

Additional notes on compilation:
* Besides the XLA dependency versions, ensuring `gcc` (or your compiler of choice), `libc` and `make` are compatible is also important.
* Remember to save the compilation logs from this step for further debugging.
* It is a good idea to save the `cache/<version>/libexla.so` file so that the team can inspect its contents if needed.
2. If the error persists, look for the `** (RuntimeError) Failed to load NIF library.` exception on application start-up.
This exception should provide more information on what the issue is when loading the NIF. Share these logs in an issue on GitHub
so that the Nx team can investigate further.


## Contributing

### Building locally
Expand Down
15 changes: 14 additions & 1 deletion exla/lib/exla/nif.ex
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,20 @@ defmodule EXLA.NIF do

def __on_load__ do
path = :filename.join(:code.priv_dir(:exla), ~c"libexla")
:erlang.load_nif(path, 0)

case :erlang.load_nif(path, 0) do
:ok ->
:ok

{:error, {reason, text}} ->
raise """
Failed to load NIF library.
Follow the steps in the :exla README Troubleshooting section for more information.

#{inspect(reason)}
#{text}
"""
end
end

def mlir_new_thread_pool(_concurrency), do: :erlang.nif_error(:undef)
Expand Down
53 changes: 47 additions & 6 deletions exla/mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ defmodule EXLA.MixProject do

%{
"MIX_BUILD_EMBEDDED" => "#{Mix.Project.config()[:build_embedded]}",
"CWD_RELATIVE_TO_PRIV_PATH" => cwd_relative_to_priv
"CWD_RELATIVE_TO_PRIV_PATH" => cwd_relative_to_priv,
"EXLA_VERSION" => "#{@version}"
}
end,
make_args: make_args
Expand Down Expand Up @@ -133,7 +134,38 @@ defmodule EXLA.MixProject do
{:ok, []}
end

defp cached_make(_) do
defp cached_make(args) do
force_rebuild_mode =
case System.get_env("EXLA_FORCE_REBUILD") do
"" ->
false

"0" ->
false

"partial" ->
:partial

"true" ->
true

"1" ->
true
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
false
"0" ->
false
"partial" ->
:partial
"true" ->
true
"1" ->
true
:none
"0" ->
:none
"partial" ->
:partial
"true" ->
:full
"1" ->
:full

This should force all the code below to be explicit :)


value ->
Mix.raise(
"invalid value for EXLA_FORCE_REBUILD: #{value}. Expected one of: partial, true"
)
end

File.mkdir_p!("cache/#{@version}")

# remove only in full mode
if force_rebuild_mode == true do
Mix.shell().info("Removing cached .o files in cache/#{@version}/objs")
File.rm_rf!("cache/#{@version}/objs")
end

contents =
for path <- Path.wildcard("c_src/**/*"),
{:ok, contents} <- [File.read(path)],
Expand All @@ -148,19 +180,28 @@ defmodule EXLA.MixProject do
"elixir-#{System.version()}-erts-#{:erlang.system_info(:version)}-xla-#{Application.spec(:xla, :vsn)}-exla-#{@version}-#{md5}"

cached_so = Path.join([xla_cache_dir(), "exla", cache_key, "libexla.so"])
cached? = File.exists?(cached_so)
cached? = File.exists?(cached_so) and force_rebuild_mode == false

# remove in both partial and full modes
if force_rebuild_mode do
Mix.shell().info("Removing cached libexla.so file in cache/#{@version}/libexla.so")
File.rm_rf!("cache/#{@version}/libexla.so")

Mix.shell().info("Removing libexla.so cache at #{cached_so}")
File.rm!(cached_so)
end

if cached? do
Mix.shell().info("Using libexla.so from #{cached_so}")
File.cp!(cached_so, "cache/libexla.so")
File.cp!(cached_so, "cache/#{@version}/libexla.so")
end

result = Mix.Tasks.Compile.ElixirMake.run([])
result = Mix.Tasks.Compile.ElixirMake.run(args)

if not cached? and match?({:ok, _}, result) do
Mix.shell().info("Caching libexla.so at #{cached_so}")
File.mkdir_p!(Path.dirname(cached_so))
File.cp!("cache/libexla.so", cached_so)
File.cp!("cache/#{@version}/libexla.so", cached_so)
end

result
Expand Down
9 changes: 4 additions & 5 deletions exla/test/exla/device_memory_sharing_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,13 @@ defmodule EXLA.DeviceMemorySharingTest do
end

@tag :cuda_required
test "ipc handles don't crash the runtime when :local mode is selected" do
assert {:error, ~c"Invalid pointer size for selected mode."} ==
test "invalid ipc handles don't crash the runtime" do
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test was breaking locally (I guess I hadn't run it on my desktop since the update 😛)

assert {:error, ~c"Unable to get pointer for IPC handle."} ==
Nx.from_pointer(
{EXLA.Backend, client: :cuda},
Enum.to_list(0..63),
%Nx.Pointer{handle: "#{System.unique_integer()}", kind: :ipc, data_size: 4},
{:f, 32},
{1},
mode: :local
{1}
)
end
end
Loading