Skip to content

Commit 1e66bf7

Browse files
authored
[ML][Pipelines] support ignore files in additional includes (Azure#28515)
* fix: support ignore file in additional includes * fix: hello.py/hello.py => hello.py * doc: docstring for util func * fix: roll back unexpected snapshot id change * feat: update ignore logic in additional includes * fix: avoid get ignore file for file copying
1 parent e4edec7 commit 1e66bf7

File tree

8 files changed

+245
-86
lines changed

8 files changed

+245
-86
lines changed

sdk/ml/azure-ai-ml/azure/ai/ml/_internal/entities/_additional_includes.py

Lines changed: 67 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@
77
import tempfile
88
import zipfile
99
from pathlib import Path
10-
from typing import Optional, Union
10+
from typing import Union
1111

1212
import yaml
1313

14-
from azure.ai.ml._utils._asset_utils import IgnoreFile, get_local_paths
14+
from azure.ai.ml._utils._asset_utils import IgnoreFile, get_local_paths, get_ignore_file
1515
from azure.ai.ml.entities._util import _general_copy
1616
from azure.ai.ml.entities._validation import MutableValidationResult, _ValidationResultBuilder
1717

@@ -29,25 +29,22 @@ def __init__(
2929
self,
3030
code_path: Union[None, str],
3131
yaml_path: str,
32-
ignore_file: Optional[InternalComponentIgnoreFile] = None,
3332
):
3433
self.__yaml_path = yaml_path
3534
self.__code_path = code_path
36-
self._ignore_file = ignore_file
3735

3836
self._tmp_code_path = None
3937
self.__includes = None
40-
self._is_artifact_includes = False
41-
self._artifact_validate_result = _ValidationResultBuilder.success()
38+
# artifact validation is done on loading now, so need a private variable to store the result
39+
self.__artifact_validate_result = None
4240

4341
@property
4442
def _includes(self):
4543
if not self._additional_includes_file_path.is_file():
4644
return []
4745
if self.__includes is None:
48-
self._is_artifact_includes = self._is_yaml_format_additional_includes()
4946
if self._is_artifact_includes:
50-
self.__includes = self._load_yaml_format_additional_includes()
47+
self.__includes = self._load_artifact_additional_includes()
5148
else:
5249
with open(self._additional_includes_file_path, "r") as f:
5350
lines = f.readlines()
@@ -84,24 +81,27 @@ def _additional_includes_file_path(self) -> Path:
8481
def code(self) -> Path:
8582
return self._tmp_code_path if self._tmp_code_path else self._code_path
8683

87-
def _copy(self, src: Path, dst: Path, ignore_file=None) -> None:
84+
@staticmethod
85+
def _copy(src: Path, dst: Path, *, ignore_file=None) -> None:
8886
if src.is_file():
87+
if not dst.parent.is_dir():
88+
dst.parent.mkdir(parents=True)
8989
_general_copy(src, dst)
9090
else:
9191
# use os.walk to replace shutil.copytree, which may raise
9292
# FileExistsError for same folder, the expected behavior
9393
# is merging ignore will be also applied during this process
94+
# TODO: inner ignore file is not supported with current implementation
95+
# TODO: empty folder will be ignored with current implementation
9496
local_paths, _ = get_local_paths(
9597
source_path=str(src),
96-
ignore_file=(ignore_file or self._ignore_file)
98+
ignore_file=ignore_file or IgnoreFile(),
9799
)
98-
for path in local_paths:
99-
dst_root = Path(dst) / Path(path).relative_to(src)
100-
dst_root_mkdir_flag = dst_root.is_dir()
101-
# if there is nothing to copy under current dst_root, no need to create this folder
102-
if dst_root_mkdir_flag is False:
103-
dst_root.mkdir(parents=True)
104-
_general_copy(path, dst_root / Path(path).name)
100+
# local_paths contains and only contains all file paths, so no need to apply ignore-file
101+
for src_path in local_paths:
102+
src_path = Path(src_path)
103+
dst_path = Path(dst) / src_path.relative_to(src)
104+
_AdditionalIncludes._copy(src_path, dst_path)
105105

106106
@staticmethod
107107
def _is_folder_to_compress(path: Path) -> bool:
@@ -163,7 +163,7 @@ def _validate(self) -> MutableValidationResult:
163163
validation_result.append_error(message=error_msg)
164164
return validation_result
165165

166-
def resolve(self) -> None:
166+
def resolve(self, ignore_file: IgnoreFile) -> None:
167167
"""Resolve code and potential additional includes.
168168
If no additional includes is specified, just return and use
169169
original real code path; otherwise, create a tmp folder and copy
@@ -176,34 +176,59 @@ def resolve(self) -> None:
176176
if Path(self._code_path).is_file():
177177
self._copy(Path(self._code_path), tmp_folder_path / Path(self._code_path).name)
178178
else:
179-
for path in os.listdir(self._code_path):
180-
src_path = (Path(self._code_path) / str(path)).resolve()
181-
if src_path.suffix == ADDITIONAL_INCLUDES_SUFFIX:
182-
continue
183-
dst_path = tmp_folder_path / str(path)
184-
self._copy(src_path, dst_path)
179+
self._copy(self._code_path, tmp_folder_path, ignore_file=ignore_file)
185180
# additional includes
186181
base_path = self._additional_includes_file_path.parent
182+
# additional includes from artifact will be downloaded to a temp local path on calling
183+
# self._includes, so no need to add specific logic for artifact
184+
185+
# TODO: skip ignored files defined in code when copying additional includes
186+
# copy additional includes disregarding ignore files as current ignore file implementation
187+
# is based on absolute path, which is not suitable for additional includes
187188
for additional_include in self._includes:
188189
src_path = Path(additional_include)
189190
if not src_path.is_absolute():
190191
src_path = (base_path / additional_include).resolve()
192+
dst_path = (tmp_folder_path / src_path.name).resolve()
193+
191194
if self._is_folder_to_compress(src_path):
192-
self._resolve_folder_to_compress(additional_include, Path(tmp_folder_path))
195+
self._resolve_folder_to_compress(
196+
additional_include,
197+
Path(tmp_folder_path),
198+
# TODO: seems it won't work as current ignore file implementation is based on absolute path
199+
ignore_file=ignore_file,
200+
)
201+
elif src_path.is_dir():
202+
# support ignore file in additional includes
203+
self._copy(src_path, dst_path, ignore_file=get_ignore_file(src_path))
193204
else:
194-
dst_path = (tmp_folder_path / src_path.name).resolve()
195-
self._copy(src_path, dst_path, ignore_file=IgnoreFile() if self._is_artifact_includes else None)
205+
# do not apply ignore file for files
206+
self._copy(src_path, dst_path)
207+
208+
# Remove ignored files copied from additional includes
209+
rebased_ignore_file = InternalComponentIgnoreFile(
210+
directory_path=tmp_folder_path,
211+
additional_include_file_name=self._additional_includes_file_path.name,
212+
)
213+
for base, dirs, files in os.walk(tmp_folder_path):
214+
for name in files + dirs:
215+
path = os.path.join(base, name)
216+
if rebased_ignore_file.is_file_excluded(path):
217+
if os.path.isdir(path):
218+
shutil.rmtree(path)
219+
if os.path.isfile(path):
220+
os.remove(path)
196221
self._tmp_code_path = tmp_folder_path # point code path to tmp folder
197222
return
198223

199-
def _resolve_folder_to_compress(self, include: str, dst_path: Path) -> None:
224+
def _resolve_folder_to_compress(self, include: str, dst_path: Path, ignore_file: IgnoreFile) -> None:
200225
"""resolve the zip additional include, need to compress corresponding folder."""
201226
zip_additional_include = (self._additional_includes_file_path.parent / include).resolve()
202227
folder_to_zip = zip_additional_include.parent / zip_additional_include.stem
203228
zip_file = dst_path / zip_additional_include.name
204229
with zipfile.ZipFile(zip_file, "w") as zf:
205230
zf.write(folder_to_zip, os.path.relpath(folder_to_zip, folder_to_zip.parent)) # write root in zip
206-
local_paths, _ = get_local_paths(source_path=str(folder_to_zip), ignore_file=self._ignore_file)
231+
local_paths, _ = get_local_paths(source_path=str(folder_to_zip), ignore_file=ignore_file)
207232
for path in local_paths:
208233
zf.write(path, os.path.relpath(path, folder_to_zip.parent))
209234

@@ -216,7 +241,8 @@ def cleanup(self) -> None:
216241
shutil.rmtree(self._tmp_code_path)
217242
self._tmp_code_path = None # point code path back to real path
218243

219-
def _is_yaml_format_additional_includes(self):
244+
@property
245+
def _is_artifact_includes(self):
220246
try:
221247
with open(self._additional_includes_file_path) as f:
222248
additional_includes_configs = yaml.safe_load(f)
@@ -227,7 +253,16 @@ def _is_yaml_format_additional_includes(self):
227253
except Exception: # pylint: disable=broad-except
228254
return False
229255

230-
def _load_yaml_format_additional_includes(self):
256+
@property
257+
def _artifact_validate_result(self):
258+
if not self._is_artifact_includes:
259+
return _ValidationResultBuilder.success()
260+
if self.__artifact_validate_result is None:
261+
# artifact validation is done on loading now, so trigger it here
262+
self._load_artifact_additional_includes()
263+
return self.__artifact_validate_result
264+
265+
def _load_artifact_additional_includes(self):
231266
"""
232267
Load the additional includes by yaml format.
233268
Addition includes is a list of include files, such as local paths and Azure Devops Artifacts.
@@ -247,7 +282,7 @@ def _load_yaml_format_additional_includes(self):
247282
:rtype additional_includes: List[str]
248283
"""
249284
additional_includes, conflict_files = [], {}
250-
self._artifact_validate_result = _ValidationResultBuilder.success()
285+
self.__artifact_validate_result = _ValidationResultBuilder.success()
251286

252287
def merge_local_path_to_additional_includes(local_path, config_info):
253288
additional_includes.append(local_path)

sdk/ml/azure-ai-ml/azure/ai/ml/_internal/entities/code.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,41 @@
55
from pathlib import Path
66
from typing import List, Optional, Union
77

8+
from ..._utils._asset_utils import IgnoreFile
89
from ...entities._assets import Code
910
from ...entities._component.code import ComponentIgnoreFile
1011

1112

1213
class InternalComponentIgnoreFile(ComponentIgnoreFile):
13-
_INTERNAL_COMPONENT_CODE_IGNORES = ["*.additional_includes"]
14-
15-
def __init__(self, directory_path: Union[str, Path]):
14+
def __init__(self, directory_path: Union[str, Path], additional_include_file_name: Optional[str]):
1615
super(InternalComponentIgnoreFile, self).__init__(directory_path=directory_path)
16+
# only the additional include file in root directory is ignored
17+
# additional include files in subdirectories are not processed so keep them
18+
self._additional_include_file_name = additional_include_file_name
19+
self._other_ignores = []
1720

1821
def _get_ignore_list(self) -> List[str]:
1922
"""Override to add custom ignores for internal component."""
20-
return super(InternalComponentIgnoreFile, self)._get_ignore_list() + self._INTERNAL_COMPONENT_CODE_IGNORES
23+
if self._additional_include_file_name is None:
24+
return super(InternalComponentIgnoreFile, self)._get_ignore_list()
25+
return super(InternalComponentIgnoreFile, self)._get_ignore_list() + [self._additional_include_file_name]
26+
27+
def merge(self, other: IgnoreFile):
28+
"""Merge other ignore file with this one and create a new IgnoreFile for it.
29+
"""
30+
ignore_file = InternalComponentIgnoreFile(self._base_path, self._additional_include_file_name)
31+
ignore_file._other_ignores.append(other) # pylint: disable=protected-access
32+
return ignore_file
33+
34+
def is_file_excluded(self, file_path: Union[str, Path]) -> bool:
35+
"""Override to check if file is excluded in other ignore files."""
36+
# TODO: current design of ignore file can't distinguish between files and directories of the same name
37+
if super(InternalComponentIgnoreFile, self).is_file_excluded(file_path):
38+
return True
39+
for other in self._other_ignores:
40+
if other.is_file_excluded(file_path):
41+
return True
42+
return False
2143

2244

2345
class InternalCode(Code):

sdk/ml/azure-ai-ml/azure/ai/ml/_internal/entities/component.py

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,11 @@
2020

2121
from ... import Input, Output
2222
from ..._utils._arm_id_utils import parse_name_label
23+
from ..._utils._asset_utils import IgnoreFile
2324
from ...entities._assets import Code
2425
from ...entities._job.distribution import DistributionConfiguration
2526
from .._schema.component import InternalComponentSchema
26-
from ._additional_includes import _AdditionalIncludes
27+
from ._additional_includes import _AdditionalIncludes, ADDITIONAL_INCLUDES_SUFFIX
2728
from ._input_outputs import InternalInput, InternalOutput
2829
from ._merkle_tree import create_merkletree
2930
from .code import InternalCode, InternalComponentIgnoreFile
@@ -124,6 +125,7 @@ def __init__(
124125
self.environment = InternalEnvironment(**environment) if isinstance(environment, dict) else environment
125126
self.environment_variables = environment_variables
126127
self.__additional_includes = None
128+
self.__ignore_file = None
127129
# TODO: remove these to keep it a general component class
128130
self.command = command
129131
self.scope = scope
@@ -152,9 +154,6 @@ def _additional_includes(self):
152154
self.__additional_includes = _AdditionalIncludes(
153155
code_path=self.code,
154156
yaml_path=self._source_path,
155-
ignore_file=InternalComponentIgnoreFile(
156-
self.code if self.code is not None else Path(self._source_path).parent,
157-
), # use YAML's parent as code when self.code is None
158157
)
159158
return self.__additional_includes
160159

@@ -208,15 +207,15 @@ def _to_rest_object(self) -> ComponentVersionData:
208207
def _get_snapshot_id(
209208
cls,
210209
code_path: Union[str, PathLike],
211-
ignore_file: Optional[InternalComponentIgnoreFile],
210+
ignore_file: IgnoreFile,
212211
) -> str:
213212
"""Get the snapshot id of a component with specific working directory in ml-components.
214213
Use this as the name of code asset to reuse steps in a pipeline job from ml-components runs.
215214
216215
:param code_path: The path of the working directory.
217216
:type code_path: str
218217
:param ignore_file: The ignore file of the snapshot.
219-
:type ignore_file: InternalComponentIgnoreFile
218+
:type ignore_file: IgnoreFile
220219
:return: The snapshot id of a component in ml-components with code_path as its working directory.
221220
"""
222221
curr_root = create_merkletree(code_path, ignore_file.is_file_excluded)
@@ -242,7 +241,26 @@ def _resolve_local_code(self) -> Optional[Code]:
242241
yield code
243242
return
244243

245-
self._additional_includes.resolve()
244+
def get_additional_include_file_name():
245+
if self._source_path is not None:
246+
return Path(self._source_path).with_suffix(ADDITIONAL_INCLUDES_SUFFIX).name
247+
return None
248+
249+
def get_ignore_file() -> InternalComponentIgnoreFile:
250+
if self.code is None and self._source_path is None:
251+
# no code and no yaml, ignore file is not needed; return ignore file on cwd to avoid error
252+
return InternalComponentIgnoreFile(
253+
self.base_path,
254+
additional_include_file_name=get_additional_include_file_name(),
255+
)
256+
257+
return InternalComponentIgnoreFile(
258+
self.code if self.code is not None else Path(self._source_path).parent,
259+
additional_include_file_name=get_additional_include_file_name(),
260+
)
261+
ignore_file = get_ignore_file()
262+
263+
self._additional_includes.resolve(ignore_file=ignore_file)
246264

247265
# file dependency in code will be read during internal environment resolution
248266
# for example, docker file of the environment may be in additional includes
@@ -252,18 +270,26 @@ def _resolve_local_code(self) -> Optional[Code]:
252270
self.environment.resolve(self._additional_includes.code)
253271
# use absolute path in case temp folder & work dir are in different drive
254272
tmp_code_dir = self._additional_includes.code.absolute()
255-
ignore_file = InternalComponentIgnoreFile(tmp_code_dir)
273+
rebased_ignore_file = InternalComponentIgnoreFile(
274+
tmp_code_dir,
275+
additional_include_file_name=get_additional_include_file_name(),
276+
)
256277
# Use the snapshot id in ml-components as code name to enable anonymous
257278
# component reuse from ml-component runs.
258279
# calculate snapshot id here instead of inside InternalCode to ensure that
259280
# snapshot id is calculated based on the resolved code path
260281
yield InternalCode(
261-
name=self._get_snapshot_id(tmp_code_dir, ignore_file),
282+
name=self._get_snapshot_id(
283+
# use absolute path in case temp folder & work dir are in different drive
284+
self._additional_includes.code.absolute(),
285+
# this ignore-file should be rebased to the resolved code path
286+
rebased_ignore_file,
287+
),
262288
version="1",
263289
base_path=self._base_path,
264290
path=tmp_code_dir,
265291
is_anonymous=True,
266-
ignore_file=ignore_file,
292+
ignore_file=rebased_ignore_file,
267293
)
268294

269295
self._additional_includes.cleanup()

sdk/ml/azure-ai-ml/azure/ai/ml/entities/_component/code.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,10 @@ class ComponentIgnoreFile(IgnoreFile):
1313
_COMPONENT_CODE_IGNORES = ["__pycache__"]
1414

1515
def __init__(self, directory_path: Union[str, Path]):
16+
self._base_path = Path(directory_path)
1617
# note: the parameter changes to directory path in this class, rather than file path
1718
file_path = get_ignore_file(directory_path).path
1819
super(ComponentIgnoreFile, self).__init__(file_path=file_path)
19-
self._base_path = Path(directory_path)
2020

2121
def exists(self) -> bool:
2222
"""Override to always return True as we do have default ignores."""

sdk/ml/azure-ai-ml/tests/component/unittests/test_command_component_entity.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import pytest
1111

1212
from conftest import normalized_arm_id_in_object
13-
from test_utilities.utils import verify_entity_load_and_dump
13+
from test_utilities.utils import verify_entity_load_and_dump, build_temp_folder
1414

1515
from azure.ai.ml import Input, MpiDistribution, Output, TensorFlowDistribution, command, load_component
1616
from azure.ai.ml._utils.utils import load_yaml
@@ -503,12 +503,13 @@ def test_invalid_component_outputs(self) -> None:
503503
def test_component_code_asset_ignoring_pycache(self) -> None:
504504
component_yaml = "./tests/test_configs/components/helloworld_component.yml"
505505
component = load_component(component_yaml)
506-
with tempfile.TemporaryDirectory() as temp_dir:
507-
# create some files/folders expected to ignore
508-
pycache = Path(temp_dir) / "__pycache__"
509-
pycache.mkdir()
510-
expected_exclude = pycache / "a.pyc"
511-
expected_exclude.touch()
506+
with build_temp_folder(
507+
source_base_dir="./tests/test_configs/components",
508+
relative_files_to_copy=["helloworld_component.yml"],
509+
extra_files_to_create={
510+
"__pycache__/a.pyc": None
511+
}
512+
) as temp_dir:
512513
# resolve and test for ignore_file's is_file_excluded
513514
component.code = temp_dir
514515
with component._resolve_local_code() as code:
@@ -518,7 +519,7 @@ def test_component_code_asset_ignoring_pycache(self) -> None:
518519
path = os.path.join(root, name)
519520
if code._ignore_file.is_file_excluded(path):
520521
excluded.append(path)
521-
assert excluded == [str(expected_exclude.absolute())]
522+
assert excluded == [str((Path(temp_dir) / "__pycache__/a.pyc").absolute())]
522523

523524
def test_normalized_arm_id_in_component_dict(self):
524525
component_dict = {

0 commit comments

Comments
 (0)