Skip to content

Commit a716ac0

Browse files
committed
Two header caching changes
(1) Fix incorrectly assumed fresh cache when (generated) files are missing. See #134 (2) Fall back on stale caches if we fail to get any headers and have some in cache.
1 parent 9d438af commit a716ac0

File tree

1 file changed

+54
-26
lines changed

1 file changed

+54
-26
lines changed

refresh.template.py

Lines changed: 54 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,9 @@ def _parse_headers_from_makefile_deps(d_file_content: str, source_path_for_sanit
131131
A dependency file can be generated with the `-M`/`--dependencies` flag or its friends.
132132
See https://clang.llvm.org/docs/ClangCommandLineReference.html#dependency-file-generation for more details.
133133
"""
134+
if not d_file_content: # There was an error generating.
135+
return set()
136+
134137
# When reading file content as text with universal newlines mode enabled (the default), Python converts OS-specific line endings to '\n' (see https://docs.python.org/3/library/functions.html#open-newline-parameter for the thrilling details).
135138
# This function takes an arbitrary string, so we also ensure no `\r` characters have snuck through, because that's almost certainly an upstream error.
136139
assert '\r' not in d_file_content, "Something went wrong in makefile parsing to get headers. Dependency file content should not contain literal '\r' characters. Output:\n" + repr(d_file_content)
@@ -155,24 +158,37 @@ def _parse_headers_from_makefile_deps(d_file_content: str, source_path_for_sanit
155158

156159

157160
@functools.lru_cache(maxsize=None)
158-
def _get_cached_adjusted_modified_time(path: str):
159-
"""Get (and cache!) the modified time of a file, slightly adjusted for easy comparison.
160-
161-
This is primarily intended to check whether header include caches are fresh.
162-
163-
If the file doesn't exist or is inaccessible (either because it was deleted or wasn't generated), return 0.
164-
For bazel's internal sources, which have timestamps 10 years in the future, also return 0.
161+
def _get_cached_modified_time(path: str):
162+
"""Returns 0 if the file doesn't exist.
165163
166164
Without the cache, most of our runtime in the cached case is `stat`'ing the same headers repeatedly.
167165
"""
168166
try:
169-
mtime = os.path.getmtime(path)
167+
return os.path.getmtime(path)
170168
except OSError: # The file doesn't exist or is inaccessible.
171169
# For our purposes, this means we don't have a newer version, so we'll return a very old time that'll always qualify the cache as fresh in a comparison. There are two cases here:
172170
# (1) Somehow it wasn't generated in the build that created the depfile. We therefore won't get any fresher by building, so we'll treat that as good enough; or
173171
# (2) It has been deleted since we last cached, in which case we'd rather use the cached version if it's otherwise fresh.
174172
return 0
175173

174+
175+
def _get_cached_file_exists(path: str):
176+
return _get_cached_modified_time(path) != 0
177+
178+
179+
def _get_cached_adjusted_modified_time(path: str):
180+
"""Get the modified time of a file, slightly adjusted for easy comparison.
181+
182+
This is primarily intended to check whether header include caches are fresh.
183+
184+
If the file doesn't exist or is inaccessible (either because it was deleted or wasn't generated), return 0.
185+
For bazel's internal sources, which have timestamps 10 years in the future, also return 0.
186+
187+
Without the cache, most of our runtime in the cached case is `stat`'ing the same headers repeatedly.
188+
Cache shared with _get_cached_modified_time
189+
"""
190+
mtime = _get_cached_modified_time(path)
191+
176192
# Bazel internal sources have timestamps 10 years in the future as part of a mechanism to detect and prevent modification, so we'll similarly ignore those, since they shouldn't be changing.
177193
if mtime > BAZEL_INTERNAL_SOURCE_CUTOFF:
178194
return 0
@@ -209,7 +225,7 @@ def _get_headers_gcc(compile_args: typing.List[str], source_path: str, action_ke
209225
# Check freshness of dep file by making sure none of the files in it have been modified since its creation.
210226
if (_get_cached_adjusted_modified_time(source_path) <= dep_file_last_modified
211227
and all(_get_cached_adjusted_modified_time(header_path) <= dep_file_last_modified for header_path in headers)):
212-
return headers # Fresh cache! exit early.
228+
return headers, True # Fresh cache! exit early. Still put in the Hedron outer cache bc we're willing to hit stale if we're unable to get new headers.
213229
break
214230

215231
# Strip out existing dependency file generation that could interfere with ours.
@@ -228,14 +244,15 @@ def _get_headers_gcc(compile_args: typing.List[str], source_path: str, action_ke
228244
header_cmd = (arg for arg in header_cmd
229245
if not arg.startswith('-fsanitize'))
230246

231-
header_cmd = list(header_cmd)
232-
# Dump system and user headers to stdout...in makefile format, tolerating missing (generated) files--whose paths may be wrong!
247+
# Dump system and user headers to stdout...in makefile format.
233248
# Relies on our having made the workspace directory simulate a complete version of the execroot with //external symlink
234-
deps_args = ['--dependencies', '--print-missing-file-dependencies']
249+
header_cmd = list(header_cmd)
250+
is_nvcc = _is_nvcc(header_cmd[0])
235251
# https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#nvcc-command-options
236-
if _is_nvcc(header_cmd[0]):
237-
deps_args = ['--generate-dependencies']
238-
header_cmd += deps_args
252+
if is_nvcc:
253+
header_cmd += ['--generate-dependencies']
254+
else:
255+
header_cmd += ['--dependencies', '--print-missing-file-dependencies'] # Allows us to continue on past missing (generated) files--whose paths may be wrong (listed as written in the include)!
239256

240257
header_search_process = _subprocess_run_spilling_over_to_param_file_if_needed( # Note: gcc/clang can be run from Windows, too.
241258
header_cmd,
@@ -251,11 +268,18 @@ def _get_headers_gcc(compile_args: typing.List[str], source_path: str, action_ke
251268
_print_header_finding_warning_once()
252269
print(header_search_process.stderr, file=sys.stderr, end='') # Stderr captured and dumped atomically to avoid interlaced output.
253270

254-
if not header_search_process.stdout: # Worst case, we couldn't get the headers,
255-
return set()
256-
# But often, we can get the headers, despite the error.
271+
headers = _parse_headers_from_makefile_deps(header_search_process.stdout)
257272

258-
return _parse_headers_from_makefile_deps(header_search_process.stdout)
273+
# Can't cache when headers are missing.
274+
# Why? We'd wrongly get a subset of the headers and we might wrongly think the cache is still fresh because we wouldn't know that the formerly missing header had been generated.
275+
if is_nvcc:
276+
should_cache = bool(header_search_process.stdout) # Without --print-missing-file-dependencies, nothing is written if a file isn't found. (Something is still written if no headers are included.) This avoids hardcoding the error message. Note that some errors, like that for #bad_directive are okay to ignore! We'll know the cache isn't fresh when the user changes the file.
277+
else: # Handle '--print-missing-file-dependencies'
278+
num_headers_output = len(headers)
279+
headers = {header for header in headers if _get_cached_file_exists(header)} # We can't filter on headers starting with bazel-out because missing headers don't have complete paths; they just listed as #included
280+
should_cache = len(headers) == num_headers_output
281+
282+
return headers, should_cache
259283

260284

261285
@functools.lru_cache(maxsize=None)
@@ -435,7 +459,8 @@ def _get_headers_msvc(compile_args: typing.List[str], source_path: str):
435459
_print_header_finding_warning_once()
436460
print('\n'.join(error_lines), file=sys.stderr)
437461

438-
return headers
462+
should_cache = all('fatal error C1083:' not in error_line for error_line in error_lines) # Error code for file not found (when trying to include). Without this, we'd wrongly get a subset of the headers and we might wrongly think the cache is still fresh because we wouldn't know that the formerly missing header had been generated.
463+
return headers, should_cache
439464

440465

441466
def _is_relative_to(sub: pathlib.PurePath, parent: pathlib.PurePath):
@@ -510,13 +535,14 @@ def _get_headers(compile_action, source_path: str):
510535
Continuing gracefully...""")
511536

512537
# Check for a fresh cache of headers
538+
cached_headers = None
513539
if output_file:
514540
cache_file_path = output_file + ".hedron.compile-commands.headers" # Embed our cache in bazel's
515541
if os.path.isfile(cache_file_path):
516542
cache_last_modified = os.path.getmtime(cache_file_path) # Do before opening just as a basic hedge against concurrent write, even though we won't handle the concurrent delete case perfectly.
517543
try:
518544
with open(cache_file_path) as cache_file:
519-
action_key, headers = json.load(cache_file)
545+
action_key, cached_headers = json.load(cache_file)
520546
except json.JSONDecodeError:
521547
# Corrupted cache, which can happen if, for example, the user kills the program, since writes aren't atomic.
522548
# But if it is the result of a bug, we want to print it before it's overwritten, so it can be reported
@@ -534,11 +560,11 @@ def _get_headers(compile_action, source_path: str):
534560
# And we also need to check that there aren't newer versions of the files
535561
if (action_key == compile_action.actionKey
536562
and _get_cached_adjusted_modified_time(source_path) <= cache_last_modified
537-
and all(_get_cached_adjusted_modified_time(header_path) <= cache_last_modified for header_path in headers)):
538-
return set(headers)
563+
and all(_get_cached_adjusted_modified_time(header_path) <= cache_last_modified for header_path in cached_headers)):
564+
return set(cached_headers)
539565

540566
if compile_action.arguments[0].endswith('cl.exe'): # cl.exe and also clang-cl.exe
541-
headers = _get_headers_msvc(compile_action.arguments, source_path)
567+
headers, should_cache = _get_headers_msvc(compile_action.arguments, source_path)
542568
else:
543569
# Emscripten is tricky. There isn't an easy way to make it emcc run without lots of environment variables.
544570
# So...rather than doing our usual script unwrapping, we just swap in clang/gcc and use that to get headers, knowing that they'll accept the same argument format.
@@ -551,13 +577,15 @@ def _get_headers(compile_action, source_path: str):
551577
if not alternate_compiler: return set() # Skip getting headers.
552578
args = args.copy()
553579
args[0] = alternate_compiler
554-
headers = _get_headers_gcc(args, source_path, compile_action.actionKey)
580+
headers, should_cache = _get_headers_gcc(args, source_path, compile_action.actionKey)
555581

556582
# Cache for future use
557-
if output_file:
583+
if output_file and should_cache:
558584
os.makedirs(os.path.dirname(cache_file_path), exist_ok=True)
559585
with open(cache_file_path, 'w') as cache_file:
560586
json.dump((compile_action.actionKey, list(headers)), cache_file)
587+
elif not headers and cached_headers: # If we failed to get headers, we'll fall back on a stale cache.
588+
headers = set(cached_headers)
561589

562590
if {exclude_headers} == "external":
563591
headers = {header for header in headers if _file_is_in_main_workspace_and_not_external(header)}

0 commit comments

Comments
 (0)