Commit d2c7c9c
[ML][Pipelines] feat: add autotoken related run settings to internal Scope (Azure#29643)
* script: record tests with recording mismatch
* feat: support autotoken related run settings for internal scope
* doc: update CHANGELOG.md
* Revert "doc: update CHANGELOG.md" (this reverts commit 139b259)
1 parent e888d6e commit d2c7c9c

10 files changed (+1061 additions, -761 deletions)

.vscode/cspell.json

Lines changed: 4 additions & 3 deletions
@@ -417,7 +417,7 @@
         "Phong"
       ]
     },
-    {
+    {
       "filename": "tools/azure-sdk-tools/ci_tools/deps.html.j2",
       "words": [
         "isfork",
@@ -738,7 +738,7 @@
       "filename": "sdk/synapse/azure-synapse-artifacts/azure/synapse/artifacts/operations/*.py",
       "words": [
         "Syms",
-        "Updation",
+        "Updation"
       ]
     },
     {
@@ -929,7 +929,8 @@
         "SEPS",
         "wargs",
         "pycache",
-        "ruamel"
+        "ruamel",
+        "reprcrash"
       ]
     },
     {

sdk/ml/azure-ai-ml/azure/ai/ml/_internal/_schema/node.py

Lines changed: 3 additions & 0 deletions
@@ -53,6 +53,9 @@ class ScopeSchema(InternalBaseNodeSchema):
     scope_param = fields.Str()
     custom_job_name_suffix = fields.Str()
     priority = fields.Int()
+    auto_token = fields.Int()
+    tokens = fields.Int()
+    vcp = fields.Float()


 class HDInsightSchema(InternalBaseNodeSchema):
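The schema change above is what lets these values flow through dict/YAML validation. As a rough standalone illustration (not the SDK's actual loading path; _ScopeRunSettingsSketch is a made-up stand-in for ScopeSchema, which inherits many more fields from InternalBaseNodeSchema):

    from marshmallow import Schema, fields

    # Made-up stand-in reduced to the run-settings fields this commit touches.
    class _ScopeRunSettingsSketch(Schema):
        priority = fields.Int()
        auto_token = fields.Int()
        tokens = fields.Int()
        vcp = fields.Float()

    # Integer/float run settings round-trip through validation.
    loaded = _ScopeRunSettingsSketch().load({"priority": 800, "auto_token": 150, "tokens": 2, "vcp": 0.2})
    print(loaded)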

sdk/ml/azure-ai-ml/azure/ai/ml/_internal/entities/scope.py

Lines changed: 31 additions & 1 deletion
@@ -24,6 +24,9 @@ def __init__(self, **kwargs):
         self._scope_param = kwargs.pop("scope_param", None)
         self._custom_job_name_suffix = kwargs.pop("custom_job_name_suffix", None)
         self._priority = kwargs.pop("priority", None)
+        self._auto_token = kwargs.pop("auto_token", None)
+        self._tokens = kwargs.pop("tokens", None)
+        self._vcp = kwargs.pop("vcp", None)
         self._init = False

     @property
@@ -65,9 +68,36 @@ def priority(self) -> int:
     def priority(self, value: int):
         self._priority = value

+    @property
+    def auto_token(self) -> int:
+        """A predictor for estimating the peak resource usage of scope job."""
+        return self._auto_token
+
+    @auto_token.setter
+    def auto_token(self, value: int):
+        self._auto_token = value
+
+    @property
+    def tokens(self) -> int:
+        """Standard token allocation in integer."""
+        return self._tokens
+
+    @tokens.setter
+    def tokens(self, value: int):
+        self._tokens = value
+
+    @property
+    def vcp(self) -> float:
+        """Standard VC percent allocation; should be a float between 0 and 1."""
+        return self._vcp
+
+    @vcp.setter
+    def vcp(self, value: float):
+        self._vcp = value
+
     @classmethod
     def _picked_fields_from_dict_to_rest_object(cls) -> List[str]:
-        return ["custom_job_name_suffix", "scope_param", "adla_account_name", "priority"]
+        return ["custom_job_name_suffix", "scope_param", "adla_account_name", "priority", "auto_token", "tokens", "vcp"]

     @classmethod
     def _create_schema_for_validation(cls, context) -> Union[PathAwareSchema, Schema]:
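The properties above expose the new settings on internal Scope pipeline nodes. A minimal usage sketch, assuming a Scope component spec is available locally; the spec path, input name, and account value are illustrative assumptions, not part of this commit:

    from azure.ai.ml import load_component
    from azure.ai.ml.dsl import pipeline

    scope_func = load_component("./scope_component_spec.yaml")  # hypothetical spec path

    @pipeline()
    def pipeline_with_scope_node(input_data):
        node = scope_func(TextData=input_data)  # input name is illustrative
        node.adla_account_name = "my_adla_account"  # placeholder account
        node.priority = 800
        # run settings introduced by this commit
        node.auto_token = 150  # predictor for estimating peak resource usage
        node.tokens = 2        # standard token allocation
        node.vcp = 0.2         # standard VC percent allocation, between 0 and 1

The same values appear in the test configuration added to tests/internal/_utils.py at the end of this commit.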

sdk/ml/azure-ai-ml/dev_requirements.txt

Lines changed: 3 additions & 1 deletion
@@ -18,4 +18,6 @@ numpy;platform.python_implementation!="PyPy"
 scikit-image;platform.python_implementation!="PyPy"
 mldesigner
 azure-mgmt-resourcegraph<9.0.0,>=2.0.0
-azure-mgmt-resource<23.0.0,>=3.0.0
+azure-mgmt-resource<23.0.0,>=3.0.0
+pytest-reportlog
+python-dotenv
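Both new dev requirements support the test-runner script changed later in this commit: pytest-reportlog provides the --report-log pytest option that writes one JSON record per test event, and python-dotenv is used to locate and temporarily rewrite the repo's .env file. A small sketch of the dotenv side (output values are illustrative):

    import dotenv

    # Find the nearest .env file by walking up the directory tree; raises if none exists
    # (the same call the test-runner script uses).
    env_file = dotenv.find_dotenv(raise_error_if_not_found=True)
    print(dotenv.dotenv_values(env_file))  # e.g. {'AZURE_TEST_RUN_LIVE': 'true', ...}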
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+tmp
Lines changed: 182 additions & 26 deletions
@@ -1,46 +1,202 @@
 # ---------------------------------------------------------
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # ---------------------------------------------------------
-
+import argparse
+import contextlib
+import json
+import os
+import re
 import subprocess
 import sys
+from collections import defaultdict
 from pathlib import Path

+import dotenv

-def run_tests(input_file):
-    """Run tests listed in a file. Lines starting with # or ; are ignored.

-    :param input_file: Path to a file containing a list of tests to run.
-    :type input_file: str
-    """
-    tests_to_run = []
+def normalize_test_name(test_name):
+    if "[" in test_name:
+        test_name = test_name.split("[")[0]
+    return test_name.strip()
+
+
+def extract_test_location(location):
+    test_path, line_no, test_func = location
+    test_class_name, test_func_name = test_func.split(".", 1)
+    test_class = test_path.split(os.path.sep, 3)[-1] + "::" + test_class_name
+    m = re.match(r"(\w+)\[(\w+)]", test_func_name)
+    if m:
+        test_func_name, test_param = m.groups()
+    else:
+        test_param = None
+    return test_class, test_func_name, test_param
+
+
+def load_tests_from_file(input_file):
+    tests_to_run = set()
     with open(input_file, "r") as f:

         for line in f:
             if len(line) < 1 or line[0] in ["#", ";"]:
                 continue
-            if "[" in line:
-                line = line.split("[")[0]
-            line = line.strip()
-            if line not in tests_to_run:
-                tests_to_run.append(line)
+            line = normalize_test_name(line)
+            tests_to_run.add(line)
+    return tests_to_run
+
+
+@contextlib.contextmanager
+def update_dot_env_file(env_override):
+    """Update env file with env_override, and restore it after the context is exited.
+    Support bool variable only for now.
+    """
+    env_file = dotenv.find_dotenv(raise_error_if_not_found=True)
+    print(f"Updating env file: {env_file}")
+    origin_env_content = None
+    try:
+        with open(env_file, "r") as f:
+            origin_env_content = f.read()
+        env_vars = [line.strip() for line in origin_env_content.splitlines() if line.strip()]
+        for key, value in env_override.items():
+            if isinstance(value, bool):
+                target_line = f"{key}='true'"
+                for i, line in enumerate(env_vars):
+                    if line == target_line and not value:
+                        env_vars[i] = f"#{target_line}"
+                    elif re.match(rf"# *{target_line}", line) and value:
+                        env_vars[i] = f"{target_line}"
+        with open(env_file, "w") as f:
+            f.write("\n".join(env_vars))
+        yield
+    finally:
+        if origin_env_content is not None:
+            with open(env_file, "w") as f:
+                f.write(origin_env_content)
+
+
+def run_simple(tests_to_run, working_dir, extra_params, is_live_and_recording):
+    print(f"Running {len(tests_to_run)} tests under {working_dir}: ")
     for test_name in tests_to_run:
         print(test_name)

-    for test_name in tests_to_run:
-        print(f"Running test: {test_name}")
-        subprocess.call(
-            [
-                sys.executable,
-                "-m",
-                "pytest",
-                "--disable-warnings",
-                "--disable-pytest-warnings",
-                test_name,
-            ],
-            cwd=Path(__file__).parent.parent,
-        )
+    with update_dot_env_file(
+        {"AZURE_TEST_RUN_LIVE": is_live_and_recording, "AZURE_SKIP_LIVE_RECORDING": not is_live_and_recording},
+    ):
+        for test_name in tests_to_run:
+            print(
+                f"pytest {test_name} {' '.join(extra_params)} in {'live' if is_live_and_recording else 'playback'} mode..."
+            )
+            subprocess.run(
+                [
+                    sys.executable,
+                    "-m",
+                    "pytest",
+                    test_name,
+                ]
+                + extra_params,
+                cwd=working_dir,
+            )
+
+
+def run_tests(tests_to_run, extras, *, skip_first_run=False, record_mismatch=False, is_live_and_recording=False):
+    working_dir = Path(__file__).parent.parent
+    if record_mismatch:
+        log_file_path = working_dir / "scripts" / "tmp" / "pytest_log.json"
+        log_file_path.parent.mkdir(parents=True, exist_ok=True)
+
+        if not skip_first_run:
+            run_simple(
+                tests_to_run,
+                working_dir,
+                extra_params=[
+                    "--disable-warnings",
+                    "--disable-pytest-warnings",
+                    "--report-log",
+                    log_file_path.as_posix(),
+                ]
+                + extras,
+                is_live_and_recording=False,
+            )
+
+        tests_failed_with_recording_mismatch = defaultdict(dict)
+        with open(log_file_path, "r") as f:
+            for line in f:
+                node = json.loads(line)
+                if "outcome" not in node:
+                    continue
+                if node["outcome"] != "failed":
+                    continue
+                test_class, test_name, test_param = extract_test_location(node["location"])
+
+                msg = node["longrepr"]["reprcrash"]["message"]
+                if "ResourceNotFoundError" in msg:
+                    if test_param is None:
+                        tests_failed_with_recording_mismatch[test_class][test_name] = None
+                    elif test_name not in tests_failed_with_recording_mismatch[test_class]:
+                        tests_failed_with_recording_mismatch[test_class][test_name] = [test_param]
+                    else:
+                        tests_failed_with_recording_mismatch[test_class][test_name].append(test_param)

+        if tests_failed_with_recording_mismatch:
+            # re-run the tests with recording mismatch in live mode
+            for test_class, test_info in tests_failed_with_recording_mismatch.items():
+                keys = []
+                for test_name, test_params in test_info.items():
+                    if test_params is not None:
+                        keys.append(f"{test_name}[{'-'.join(test_params)}]")
+                    else:
+                        keys.append(test_name)
+                run_simple(
+                    [test_class],
+                    working_dir,
+                    ["-k", " or ".join(keys), "--tb=line"],
+                    is_live_and_recording=True,
+                )
+
+            # re-run the original tests to check if they are still failing
+            run_simple(tests_to_run, working_dir, extras, is_live_and_recording=False)
+    else:
+        run_simple(tests_to_run, working_dir, extras, is_live_and_recording=is_live_and_recording)


 if __name__ == "__main__":
-    run_tests(sys.argv[1])
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--file",
+        type=str,
+        help="File containing tests to run, each line is a test name",
+    )
+    parser.add_argument(
+        "--name",
+        type=str,
+        help="Name of the test to run. Format is aligned with pytest, e.g. 'tests/pipeline_job/'.",
+    )
+    parser.add_argument(
+        "--record-mismatch",
+        "-r",
+        action="store_true",
+        help="If specified, pytest log will be outputted to tmp/pytest_log.json, "
+        "then tests failed with recording not found error will be rerun in live & recording mode."
+        "Note that .env file will be updated during the process, so please revert the change manually "
+        "if the script run is stopped early.",
+    )
+    parser.add_argument(
+        "--skip-first-run",
+        "-s",
+        action="store_true",
+        help="If specified, will skip the first run in record-mismatch mode.",
+    )
+
+    _args, _extras = parser.parse_known_args()
+
+    if _args.file:
+        _tests = load_tests_from_file(_args.file)
+    elif _args.name:
+        _tests = [_args.name]
+    else:
+        raise ValueError("Must specify either --file or --name")
+    run_tests(
+        _tests,
+        _extras,
+        skip_first_run=_args.skip_first_run,
+        record_mismatch=_args.record_mismatch,
+    )
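For context on what the record-mismatch pass consumes: each line of the --report-log file is a JSON object, and failed tests carry a location triple plus a longrepr.reprcrash.message that the script scans for ResourceNotFoundError (the signature of a missing recording). A minimal sketch with an illustrative record, not taken from a real run:

    import json

    # Illustrative pytest-reportlog entry; the field shape mirrors what the script reads,
    # the values are made up.
    sample_line = json.dumps({
        "$report_type": "TestReport",
        "outcome": "failed",
        "location": ["tests/internal/e2etests/test_example.py", 42, "TestExample.test_submit[basic]"],
        "longrepr": {"reprcrash": {"message": "azure.core.exceptions.ResourceNotFoundError: recording not found"}},
    })

    node = json.loads(sample_line)
    if node.get("outcome") == "failed" and "ResourceNotFoundError" in node["longrepr"]["reprcrash"]["message"]:
        test_path, _line_no, test_func = node["location"]
        print(f"would re-run {test_path}::{test_func.split('.', 1)[0]} in live mode")

Invocation goes through the argparse options above (--file or --name, plus optional --record-mismatch and --skip-first-run); any unrecognized arguments are forwarded to pytest via parse_known_args.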

sdk/ml/azure-ai-ml/tests/internal/_utils.py

Lines changed: 3 additions & 0 deletions
@@ -91,6 +91,9 @@
         "scope_param": "-tokens 50",  # runsettings.scope.scope_param
         "custom_job_name_suffix": "component_sdk_test",  # runsettings.scope.custom_job_name_suffix
         "priority": 800,  # runsettings.scope.priority
+        "auto_token": 150,  # runsettings.scope.auto_token
+        "tokens": 2,  # runsettings.scope.token
+        "vcp": 0.2,  # runsettings.scope.vcp
     },
     {
         "default_compute": "cpu-cluster",
