
Commit 26f9820

Google-ML-Automation authored and jax authors committed
[JAX] Automatically share PGO data for GPU latency-hiding scheduler.
Overall, the idea is to collect profile data for each module a configurable number of times and then recompile the module with the aggregated profile data.

1. We need to track how many times each module has been profiled and collect the profiling results. For this I added a ProfileSessionRunner class to profile.py. The class tracks how many times an instance of it has been asked to profile a session and can aggregate the profile results.
2. We need to associate a profiling session with the module at the interpreter level. To do this I added a dictionary to pjit.py that maps a Jaxpr to its profile session runner.
3. The profile session runner is passed down to pxla.py and invoked there.
4. We need to handle the fast path at the interpreter level correctly, so that JAX does not use the HLO directly while PGLE data still needs to be collected, but also does not recompile the module solely for PGLE. See the changes in pjit.py and in lru_cache.h.
5. Once the FDO profile is collected, we need to share it between hosts to keep compilation deterministic.

PiperOrigin-RevId: 638197166
1 parent 741d1d3 commit 26f9820
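As background for item 1 above, here is a minimal, hypothetical sketch of the run-counting and aggregation idea. The class and method names below are invented for illustration; the actual ProfileSessionRunner added in profile.py and the FDO payload handling are more involved.

# Hypothetical sketch of per-module profiling-run bookkeeping; this is not
# the actual ProfileSessionRunner added in profile.py.
from dataclasses import dataclass, field


@dataclass
class ProfileRunCounter:
  required_runs: int                     # e.g. the jax_pgle_profiling_runs value
  collected: list[bytes] = field(default_factory=list)

  def record(self, fdo_payload: bytes) -> None:
    """Store the profile collected from one execution of the module."""
    self.collected.append(fdo_payload)

  def ready_for_recompile(self) -> bool:
    """True once the module has been profiled the configured number of times."""
    return len(self.collected) >= self.required_runs

  def aggregated_profile(self) -> bytes:
    """Naive aggregation (concatenation); the real aggregation differs."""
    return b"".join(self.collected)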

File tree

9 files changed: 557 additions, 60 deletions


jax/_src/compilation_cache.py

Lines changed: 12 additions & 0 deletions
@@ -157,6 +157,18 @@ def decompress_executable(executable):
   else:
     return zlib.decompress(executable)
 
+
+def is_executable_in_cache(cache_key: str) -> bool:
+  """Checks if the executable is in the cache."""
+  cache = _get_cache()
+  if cache is None:
+    return False
+
+  # TODO(patrios): add check cache key method to cache interface.
+  executable_and_time = cache.get(cache_key)
+  return executable_and_time is not None
+
+
 def get_executable_and_time(
     cache_key: str, compile_options, backend
 ) -> tuple[xla_client.LoadedExecutable | None, int | None]:
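A small, hedged usage sketch of the new helper; the key below is made up, and in practice keys come from compilation_cache.get_cache_key and the persistent cache must be configured (otherwise the helper simply returns False).

from jax._src import compilation_cache

# Hypothetical, precomputed cache key; real keys are hashes derived from the
# module, devices, and compile options.
cache_key = "example-cache-key"

if compilation_cache.is_executable_in_cache(cache_key):
  print("Executable already in the persistent cache; no recompilation needed.")
else:
  print("Cache miss: the module would be compiled and written to the cache.")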

jax/_src/compiler.py

Lines changed: 114 additions & 6 deletions
@@ -21,7 +21,7 @@
 import os
 import tempfile
 import time
-from typing import Any
+from typing import Any, Optional
 import warnings
 
 from jax._src import compilation_cache
@@ -243,6 +243,7 @@ def compile_or_get_cached(
     devices: np.ndarray,
     compile_options: xc.CompileOptions,
     host_callbacks: Sequence[Any],
+    pgle_profiler: profiler.PGLEProfiler | None = None,
 ) -> xc.LoadedExecutable:
   sym_name = computation.operation.attributes['sym_name']
   module_name = ir.StringAttr(sym_name).value
@@ -278,14 +279,55 @@ def compile_or_get_cached(
     return backend_compile(backend, computation, compile_options,
                            host_callbacks)
 
+  is_multi_process = (
+      len({device.process_index for device in devices.flatten()}) > 1)
+  min_device_process_id = (
+      min(devices.flatten(), key=lambda device: device.id).process_index)
+
+  # When PGLE is enabled there are three possible situations:
+  # 1. The PGLE-profiled module (the one recompiled with an FDO profile) is in
+  # the persistent cache. In this case the module should be returned from the
+  # cache and PGLE should be disabled for this module. The module is stored in
+  # the persistent cache under "pgle_profiled_module_key", which is computed
+  # by replacing the FDO profile with a flag that marks the module as PGLE
+  # profiled.
+  # 2. The PGLE-profiled module is not in the persistent cache and the module
+  # is being built with an FDO profile. In this case we need to share the FDO
+  # profile with the other processes and store the result under
+  # "pgle_profiled_module_key", so that in case 1 we will later be able to
+  # find the module.
+  # 3. The PGLE-profiled module is not in the persistent cache and the module
+  # is being compiled in order to be PGLEd (the FDO profile is empty). In this
+  # case we simply return the non-PGLE-profiled module from the persistent
+  # cache.
+  if (config.enable_pgle.value
+      and config.pgle_profiling_runs.value > 0):
+    fdo_profile = compile_options.executable_build_options.fdo_profile
+    compile_options.executable_build_options.fdo_profile = b"pgle profiled"
+
+    pgle_profiled_module_key = compilation_cache.get_cache_key(
+        computation, devices, compile_options, backend)
+    compile_options.executable_build_options.fdo_profile = fdo_profile
+
+    if _is_executable_in_cache(pgle_profiled_module_key):
+      # Load the PGLE-profiled module from the persistent cache.
+      cache_key = pgle_profiled_module_key
+      if pgle_profiler is not None:
+        pgle_profiler.disable()
+    elif fdo_profile is not None and len(fdo_profile) > 0:
+      # Store the module under the PGLE-profiled module cache key.
+      cache_key = pgle_profiled_module_key
+      if is_multi_process and distributed.global_state.client is not None:
+        compile_options.executable_build_options.fdo_profile = _share_fdo_profiles(
+            computation, devices, compile_options, backend,
+            distributed.global_state.client,
+            min_device_process_id
+        )
+
   cache_retrieval_start = time.monotonic()
   retrieved_executable, retrieved_compile_time = _cache_read(
       module_name, cache_key, compile_options, backend)
   cache_retrieval_time = time.monotonic() - cache_retrieval_start
 
-
-  is_multi_process = (
-      len({device.process_index for device in devices.flatten()}) > 1)
   if retrieved_executable is not None:
     assert retrieved_compile_time is not None
     logger.debug("Persistent compilation cache hit for '%s'", module_name)
@@ -315,7 +357,7 @@ def compile_or_get_cached(
         distributed.global_state.client,
         module_name,
         cache_key,
-        min(devices.flatten(), key=lambda device: device.id).process_index
+        min_device_process_id
     )
   elif (
       config.share_autotune_config_between_hosts.value
@@ -330,7 +372,7 @@ def compile_or_get_cached(
         distributed.global_state.client,
         module_name,
         cache_key,
-        min(devices.flatten(), key=lambda device: device.id).process_index
+        min_device_process_id
     )
   else:
     return _compile_and_write_cache(
@@ -342,6 +384,58 @@ def compile_or_get_cached(
         cache_key,
     )
 
+# The process that has the lowest device ID should share the FDO profile with
+# the other processes before compilation.
+def _share_fdo_profiles(
+    computation: ir.Module,
+    devices: np.ndarray,
+    compile_options: xc.CompileOptions,
+    backend: xc.Client,
+    global_client: lib.xla_extension.DistributedRuntimeClient,
+    min_process_id
+) -> Optional[bytes]:
+  sym_name = computation.operation.attributes['sym_name']
+  module_name = ir.StringAttr(sym_name).value
+  fdo_profile = compile_options.executable_build_options.fdo_profile
+  if fdo_profile is None or len(fdo_profile) == 0:
+    return fdo_profile
+
+  compile_options.executable_build_options.fdo_profile = b""
+  profile_key = (
+      compilation_cache.get_cache_key(
+          computation, devices, compile_options, backend
+      )
+      + "_fdo_sync"
+  )
+  if profile_key in _share_fdo_profiles.modules_profiles:
+    return _share_fdo_profiles.modules_profiles[profile_key]
+
+  share_timeout = config.share_binary_between_hosts_timeout_ms.value
+  if distributed.global_state.process_id == min_process_id:
+    logger.debug(
+        "Sharing FDO profile: %s. For module %s. Process %d.",
+        fdo_profile,
+        module_name,
+        min_process_id,
+    )
+    global_client.key_value_set_bytes(profile_key, fdo_profile)
+  else:
+    logger.debug(
+        "Waiting for FDO profile: %s. For module %s. Should be set by process %d.",
+        fdo_profile,
+        module_name,
+        min_process_id,
+    )
+    fdo_profile = global_client.blocking_key_value_get_bytes(
+        profile_key, share_timeout
+    )
+
+  _share_fdo_profiles.modules_profiles[profile_key] = fdo_profile
+  return fdo_profile
+
+
+_share_fdo_profiles.modules_profiles = {}
+
 
 # The process with the first_process_id should compile the module and write an
 # autotune config to the K-V storage.
@@ -520,6 +614,20 @@ def _compile_and_write_cache(
   )
   return executable
 
+def _is_executable_in_cache(cache_key) -> bool:
+  """Checks if an executable is present in the cache for the given key."""
+  try:
+    return compilation_cache.is_executable_in_cache(cache_key)
+  except Exception as ex:
+    if config.raise_persistent_cache_errors.value:
+      raise
+    warnings.warn(
+        f"Error reading persistent compilation cache entry for "
+        f"'{cache_key}': {type(ex).__name__}: {ex}")
+    return False
+
+
 def _cache_read(
     module_name: str, cache_key: str, compile_options: xc.CompileOptions,
     backend: xc.Client
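The sharing logic in _share_fdo_profiles boils down to a leader/follower exchange over the distributed key-value store: the process with the lowest device ID publishes its FDO profile, and every other process blocks until that value appears, so all hosts compile with identical profile data. Below is a minimal sketch of that pattern; `kv` stands in for distributed.global_state.client and the helper name is invented for illustration.

# Leader/follower key-value exchange, sketched with an invented helper name.
def share_blob(kv, key: str, process_id: int, leader_id: int,
               blob: bytes, timeout_ms: int) -> bytes:
  if process_id == leader_id:
    # The leader publishes its locally collected profile for everyone else.
    kv.key_value_set_bytes(key, blob)
    return blob
  # Followers block until the leader has published, then adopt its copy so
  # that every host compiles with the same FDO data (deterministic builds).
  return kv.blocking_key_value_get_bytes(key, timeout_ms)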

jax/_src/config.py

Lines changed: 43 additions & 1 deletion
@@ -217,7 +217,9 @@ def trace_context():
           debug_key_reuse.value,
           jax_xla_profile_version.value,
           # Technically this affects jaxpr->stablehlo lowering, not tracing.
-          hlo_source_file_canonicalization_regex.value)
+          hlo_source_file_canonicalization_regex.value,
+          pgle_profiling_runs.value,
+          enable_pgle.value)
 
 config = Config()
 
@@ -815,6 +817,8 @@ class _GlobalExtraJitContext(NamedTuple):
   threefry_gpu_kernel_lowering: bool = False
   softmax_custom_jvp: bool = False
   xla_profile_version: int = 0
+  pgle_profiling_runs: int = 0
+  enable_pgle: bool = False
 
 
 def _update_global_jit_state(**kw):
@@ -850,6 +854,8 @@ class _ThreadLocalExtraJitContext(NamedTuple):
   threefry_gpu_kernel_lowering: bool | None = None
   softmax_custom_jvp: bool | None = None
   xla_profile_version: int | None = None
+  pgle_profiling_runs: int | None = None
+  enable_pgle: bool | None = None
 
 
 class _ThreadLocalStateCache(threading.local):
@@ -1221,6 +1227,42 @@ def _update_jax_memories_thread_local(val):
     help='Timeout for the compiled module share.',
 )
 
+enable_pgle = define_bool_state(
+    name='jax_enable_pgle',
+    default=False,
+    help=(
+        'If set to True and jax_pgle_profiling_runs is greater than 0, '
+        'modules will be recompiled after running the specified number of '
+        'times, with the collected data provided to the profile-guided '
+        'latency estimator.'
+    ),
+    update_global_hook=lambda val: _update_global_jit_state(enable_pgle=val),
+    update_thread_local_hook=lambda val: update_thread_local_jit_state(
+        enable_pgle=val),
+)
+
+pgle_profiling_runs = define_int_state(
+    name='jax_pgle_profiling_runs',
+    default=3,
+    help=(
+        'Number of times a module should be profiled before recompilation '
+        'when PGLE is used.'
+    ),
+    update_global_hook=lambda val: _update_global_jit_state(
+        pgle_profiling_runs=val
+    ),
+    update_thread_local_hook=lambda val: update_thread_local_jit_state(
+        pgle_profiling_runs=val
+    ),
+)
+
+pgle_aggregation_percentile = define_int_state(
+    name='jax_pgle_aggregation_percentile',
+    default=90,
+    help='Percentile used to aggregate performance data between devices when '
+    'PGLE is used.',
+)
+
 enable_compilation_cache = define_bool_state(
     name='jax_enable_compilation_cache',
     default=True,
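A hedged example of enabling the new options from user code. jax.config.update with the flag names defined above is the standard way to set them; whether PGLE actually takes effect end to end depends on the rest of the machinery in this commit (profiling runs, cache behavior, and backend support).

import jax

# Recompile each module with PGLE data once it has been profiled 3 times.
jax.config.update("jax_enable_pgle", True)
jax.config.update("jax_pgle_profiling_runs", 3)

# Percentile used when aggregating per-device measurements (default is 90).
jax.config.update("jax_pgle_aggregation_percentile", 90)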
