Skip to content

Commit e71f314

Browse files
authored
Merge pull request #106 from VectorInstitute/clean_logs
Added log clean up functionality.
2 parents 645223b + df4edb8 commit e71f314

File tree

8 files changed

+465
-1
lines changed

8 files changed

+465
-1
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ export VEC_INF_CONFIG=/h/<username>/my-model-config.yaml
107107
* `metrics`: Streams performance metrics to the console.
108108
* `shutdown`: Shutdown a model by providing its Slurm job ID.
109109
* `list`: List all available model names, or view the default/cached configuration of a specific model, `--json-mode` supported.
110+
* `cleanup`: Remove old log directories. You can filter by `--model-family`, `--model-name`, `--job-id`, and/or `--before-job-id`. Use `--dry-run` to preview what would be deleted.
110111

111112
For more details on the usage of these commands, refer to the [User Guide](https://vectorinstitute.github.io/vector-inference/user_guide/)
112113

tests/vec_inf/cli/test_cli.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -531,3 +531,77 @@ def test_metrics_command_request_failed(
531531
in result.output
532532
)
533533
assert "Connection refused" in result.output
534+
535+
536+
def test_cli_cleanup_logs_dry_run(runner, tmp_path):
    """Test CLI cleanup command in dry-run mode.

    A dry run must list the matching log directory in its output and
    leave that directory untouched on disk.
    """
    model_dir = tmp_path / "fam_a" / "model_a.123"
    model_dir.mkdir(parents=True)

    result = runner.invoke(
        cli,
        [
            "cleanup",
            "--log-dir",
            str(tmp_path),
            "--model-family",
            "fam_a",
            "--model-name",
            "model_a",
            "--dry-run",
        ],
    )

    assert result.exit_code == 0
    assert "would be deleted" in result.output
    assert "model_a.123" in result.output
    # The defining property of --dry-run: nothing is actually removed.
    assert model_dir.exists()
558+
559+
560+
def test_cli_cleanup_logs_delete(tmp_path):
    """Check that the cleanup command really removes a matching directory."""
    target = tmp_path / "fam_a" / "model_a.1"
    target.mkdir(parents=True)

    cli_args = ["cleanup", "--log-dir", str(tmp_path)]
    cli_args += ["--model-family", "fam_a"]
    cli_args += ["--model-name", "model_a"]
    cli_args += ["--job-id", "1"]

    outcome = CliRunner().invoke(cli, cli_args)

    assert outcome.exit_code == 0
    assert "Deleted 1 log directory" in outcome.output
    assert not target.exists()
585+
586+
587+
def test_cli_cleanup_logs_no_match(tmp_path):
    """Check the cleanup command output when the filters match nothing."""
    untouched = tmp_path / "fam_a" / "model_a.1"
    untouched.mkdir(parents=True)

    # Filter on a family that does not exist under the log root.
    cli_args = ["cleanup", "--log-dir", str(tmp_path), "--model-family", "fam_b"]
    outcome = CliRunner().invoke(cli, cli_args)

    assert outcome.exit_code == 0
    assert "No matching log directories were deleted." in outcome.output
    assert untouched.exists()

tests/vec_inf/client/test_api.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,77 @@ def test_wait_until_ready():
131131
assert mock_status.call_count == 2
132132

133133

134+
def test_cleanup_logs_no_match(tmp_path):
    """cleanup_logs returns an empty list when no family matches."""
    family_dir = tmp_path / "fam_a"
    run_dir = family_dir / "model_a.999"
    run_dir.mkdir(parents=True)

    removed = VecInfClient().cleanup_logs(
        log_dir=tmp_path,
        model_family="fam_b",
        dry_run=False,
    )

    assert removed == []
    # Nothing under the unrelated family may have been touched.
    assert family_dir.exists()
    assert run_dir.exists()
150+
151+
152+
def test_cleanup_logs_deletes_matching_dirs(tmp_path):
    """cleanup_logs removes only the directories that satisfy every filter."""
    family_dir = tmp_path / "fam_a"
    family_dir.mkdir()

    old_run = family_dir / "model_a.10"
    new_run = family_dir / "model_a.20"
    other_model = family_dir / "model_b.30"
    for directory in (old_run, new_run, other_model):
        directory.mkdir()

    filters = {
        "model_family": "fam_a",
        "model_name": "model_a",
        "before_job_id": 15,
    }
    removed = VecInfClient().cleanup_logs(log_dir=tmp_path, dry_run=False, **filters)

    assert removed == [old_run]
    assert not old_run.exists()
    assert new_run.exists()
    assert other_model.exists()
178+
179+
180+
def test_cleanup_logs_matching_dirs_dry_run(tmp_path):
    """cleanup_logs finds matching directories in dry-run mode without deleting."""
    family_dir = tmp_path / "fam_a"
    family_dir.mkdir()

    old_run = family_dir / "model_a.10"
    new_run = family_dir / "model_a.20"
    for directory in (old_run, new_run):
        directory.mkdir()

    filters = {
        "model_family": "fam_a",
        "model_name": "model_a",
        "before_job_id": 15,
    }
    reported = VecInfClient().cleanup_logs(log_dir=tmp_path, dry_run=True, **filters)

    assert reported == [old_run]
    # Dry run: both directories must still be present.
    assert old_run.exists()
    assert new_run.exists()
203+
204+
134205
def test_shutdown_model_success():
135206
"""Test model shutdown success."""
136207
client = VecInfClient()

tests/vec_inf/client/test_utils.py

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from vec_inf.client._utils import (
1010
MODEL_READY_SIGNATURE,
11+
find_matching_dirs,
1112
get_base_url,
1213
is_server_running,
1314
load_config,
@@ -208,3 +209,135 @@ def test_load_config_invalid_user_model(tmp_path):
208209
assert "validation error" in str(excinfo.value).lower()
209210
assert "model_type" in str(excinfo.value)
210211
assert "num_gpus" in str(excinfo.value)
212+
213+
214+
def test_find_matching_dirs_only_model_family(tmp_path):
    """With only model_family given, the family directory is the single match."""
    layout = (
        ("fam_a", "model_a.1"),
        ("fam_a", "model_b.2"),
        ("fam_b", "model_c.3"),
    )
    for family, run in layout:
        (tmp_path / family / run).mkdir(parents=True)

    found = find_matching_dirs(log_dir=tmp_path, model_family="fam_a")

    assert len(found) == 1
    assert found[0].name == "fam_a"
228+
229+
230+
def test_find_matching_dirs_only_model_name(tmp_path):
    """With only model_name given, just that model's directories are matched."""
    layout = (
        ("fam_a", "target.1"),
        ("fam_a", "other.2"),
        ("fam_b", "different.3"),
    )
    for family, run in layout:
        (tmp_path / family / run).mkdir(parents=True)

    found = find_matching_dirs(log_dir=tmp_path, model_name="target")
    found_names = [entry.name for entry in found]

    assert "target.1" in found_names
    for unexpected in ("other.2", "different.3"):
        assert unexpected not in found_names
247+
248+
249+
def test_find_matching_dirs_only_job_id(tmp_path):
    """With only job_id given, just the directory with that exact ID matches."""
    family_root = tmp_path / "fam"
    family_root.mkdir()
    for run in ("model_a.10", "model_b.20", "model_c.30"):
        (family_root / run).mkdir()

    found = find_matching_dirs(log_dir=tmp_path, job_id=10)
    found_names = [entry.name for entry in found]

    assert "model_a.10" in found_names
    for unexpected in ("model_b.20", "model_c.30"):
        assert unexpected not in found_names
263+
264+
265+
def test_find_matching_dirs_only_before_job_id(tmp_path):
    """With only before_job_id given, directories with a smaller job ID match."""
    first_family = tmp_path / "fam_a"
    first_family.mkdir()
    for run in ("model_a.1", "model_a.5", "model_a.100"):
        (first_family / run).mkdir()

    second_family = tmp_path / "fam_b"
    second_family.mkdir()
    (second_family / "model_b.30").mkdir()

    found = find_matching_dirs(log_dir=tmp_path, before_job_id=50)
    found_names = [entry.name for entry in found]

    # The filter applies across families, so fam_b's run is included too.
    for expected in ("model_a.1", "model_a.5", "model_b.30"):
        assert expected in found_names
    assert "model_a.100" not in found_names
284+
285+
286+
def test_find_matching_dirs_family_and_before_job_id(tmp_path):
    """Family plus before_job_id keeps only that family's older job directories."""
    target_family = tmp_path / "targetfam"
    target_family.mkdir()
    for run in ("model_a.10", "model_a.20", "model_a.99", "model_a.150"):
        (target_family / run).mkdir()

    unrelated_family = tmp_path / "otherfam"
    unrelated_family.mkdir()
    for run in ("model_b.5", "model_b.10", "model_b.100"):
        (unrelated_family / run).mkdir()

    found = find_matching_dirs(
        log_dir=tmp_path,
        model_family="targetfam",
        before_job_id=100,
    )
    found_names = [entry.name for entry in found]

    for expected in ("model_a.10", "model_a.20", "model_a.99"):
        assert expected in found_names
    assert "model_a.150" not in found_names
    assert not any("otherfam" in str(entry) for entry in found)
314+
315+
316+
def test_find_matching_dirs_with_family_model_name_and_before_job_id(tmp_path):
    """Family, model name and before_job_id filters are applied together."""
    target_family = tmp_path / "targetfam"
    target_family.mkdir()
    for run in ("model_a.1", "model_a.50", "model_a.150", "model_b.40"):
        (target_family / run).mkdir()

    unrelated_family = tmp_path / "otherfam"
    unrelated_family.mkdir()
    (unrelated_family / "model_c.20").mkdir()

    found = find_matching_dirs(
        log_dir=tmp_path,
        model_family="targetfam",
        model_name="model_a",
        before_job_id=100,
    )
    found_names = [entry.name for entry in found]

    for expected in ("model_a.1", "model_a.50"):
        assert expected in found_names
    for unexpected in ("model_a.150", "model_b.40"):
        assert unexpected not in found_names
    assert not any("model_b" in name for name in found_names)
    assert not any("otherfam" in str(entry) for entry in found)

vec_inf/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,5 +5,6 @@
55
* `metrics`: Streams performance metrics to the console.
66
* `shutdown`: Shutdown a model by providing its Slurm job ID.
77
* `list`: List all available model names, or view the default/cached configuration of a specific model, `--json-mode` supported.
8+
* `cleanup`: Remove old log directories. You can filter by `--model-family`, `--model-name`, `--job-id`, and/or `--before-job-id`. Use `--dry-run` to preview what would be deleted.
89

910
Use `--help` to see all available options

vec_inf/cli/_cli.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,5 +361,69 @@ def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
361361
raise click.ClickException(f"Metrics check failed: {str(e)}") from e
362362

363363

364+
@cli.command("cleanup")
@click.option("--log-dir", type=str, help="Path to SLURM log directory")
@click.option("--model-family", type=str, help="Filter by model family")
@click.option("--model-name", type=str, help="Filter by model name")
@click.option(
    "--job-id", type=int, help="Only remove logs with this exact SLURM job ID"
)
@click.option(
    "--before-job-id",
    type=int,
    help="Remove logs with job ID less than this value",
)
@click.option("--dry-run", is_flag=True, help="List matching logs without deleting")
def cleanup_logs_cli(
    log_dir: Optional[str],
    model_family: Optional[str],
    model_name: Optional[str],
    job_id: Optional[int],
    before_job_id: Optional[int],
    dry_run: bool,
) -> None:
    """Clean up log files based on optional filters.

    Parameters
    ----------
    log_dir : str or Path, optional
        Root directory containing log files. Defaults to ~/.vec-inf-logs.
    model_family : str, optional
        Only delete logs for this model family.
    model_name : str, optional
        Only delete logs for this model name.
    job_id : int, optional
        If provided, only match directories with this exact SLURM job ID.
    before_job_id : int, optional
        If provided, only delete logs with job ID less than this value.
    dry_run : bool
        If True, return matching files without deleting them.
    """
    # NOTE: the docstring above is shown by Click as the command's --help
    # text, so it is kept verbatim.
    try:
        matched = VecInfClient().cleanup_logs(
            log_dir=log_dir,
            model_family=model_family,
            model_name=model_name,
            job_id=job_id,
            before_job_id=before_job_id,
            dry_run=dry_run,
        )

        if dry_run:
            # Preview mode: report what would go away, or that nothing matched.
            if matched:
                click.echo(f"Dry run: {len(matched)} directories would be deleted:")
                for f in matched:
                    click.echo(f" - {f}")
            else:
                click.echo("Dry run: no matching log directories found.")
        elif matched:
            click.echo(f"Deleted {len(matched)} log directory(ies).")
        else:
            click.echo("No matching log directories were deleted.")
    except Exception as e:
        # Surface any failure as a Click error so the CLI exits non-zero
        # with a readable message instead of a traceback.
        raise click.ClickException(f"Cleanup failed: {str(e)}") from e
426+
427+
364428
if __name__ == "__main__":
365429
cli()

0 commit comments

Comments
 (0)