package-url · keshav-space · Aug 19, 2025 · Aug 7, 2025 · Aug 7, 2025 · Aug 7, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -10,6 +10,8 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v4
+        with:
+          submodules: recursive
 
       - name: Setup Python environment
         uses: actions/setup-python@v5
@@ -23,7 +25,7 @@ jobs:
       - name: Validate
         run: |
           isort --check-only src/ tests/
-          black --check --line-length 100 .
+          black --check --line-length 100 src/ tests/
           mypy
 
   build-and-test:
@@ -44,6 +46,8 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v4
+        with:
+          submodules: recursive
 
       - name: Setup Python environment
         uses: actions/setup-python@v5
@@ -56,7 +60,7 @@ jobs:
           pip install -e .[build]
 
       - name: Test
-        run: py.test -vvs
+        run: py.test -vvs --ignore=spec/
 
       - name: Build
         run: python setup.py build sdist bdist_wheel
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "spec"]
+	path = spec
+	url = https://github.com/package-url/purl-spec.git
diff --git a/Makefile b/Makefile
@@ -26,7 +26,7 @@
 PYTHON_EXE?=python3
 ACTIVATE?=. bin/activate;
 VIRTUALENV_PYZ=thirdparty/virtualenv.pyz
-BLACK_ARGS=--exclude=".cache|lib|bin|var" --line-length 100
+BLACK_ARGS=src/ --exclude="\.cache|lib|bin|var" --line-length 100
 
 virtualenv:
 	@echo "-> Bootstrap the virtualenv with PYTHON_EXE=${PYTHON_EXE}"
@@ -52,7 +52,7 @@ isort:
 
 black:
 	@echo "-> Apply black code formatter"
-	@${ACTIVATE} black ${BLACK_ARGS} .
+	@${ACTIVATE} black ${BLACK_ARGS}
 
 mypy:
 	@echo "-> Type check the Python code."

diff --git a/spec b/spec
diff --git a/src/packageurl/__init__.py b/src/packageurl/__init__.py
@@ -29,6 +29,7 @@
 from collections.abc import Mapping
 from typing import TYPE_CHECKING
 from typing import Any
+from typing import Optional
 from typing import Union
 from typing import overload
 from urllib.parse import quote as _percent_quote
@@ -116,37 +117,66 @@ def normalize_namespace(
 
     namespace_str = namespace if isinstance(namespace, str) else namespace.decode("utf-8")
     namespace_str = namespace_str.strip().strip("/")
-    if ptype in ("bitbucket", "github", "pypi", "gitlab"):
+    if ptype in ("bitbucket", "github", "pypi", "gitlab", "composer"):
         namespace_str = namespace_str.lower()
     segments = [seg for seg in namespace_str.split("/") if seg.strip()]
     segments_quoted = map(get_quoter(encode), segments)
     return "/".join(segments_quoted) or None
 
 
+def normalize_mlflow_name(
+    name_str: str,
+    qualifiers: Union[str, bytes, dict[str, str], None],
+) -> Optional[str]:
+    """MLflow purl names are case-sensitive for Azure ML, it is case sensitive and must be kept as-is in the package URL
+    For Databricks, it is case insensitive and must be lowercased in the package URL"""
+    if isinstance(qualifiers, dict):
+        repo_url = qualifiers.get("repository_url")
+        if repo_url and "azureml" in repo_url.lower():
+            return name_str
+        if repo_url and "databricks" in repo_url.lower():
+            return name_str.lower()
+    if isinstance(qualifiers, str):
+        if "azureml" in qualifiers.lower():
+            return name_str
+        if "databricks" in qualifiers.lower():
+            return name_str.lower()
+    return name_str
+
+
 def normalize_name(
-    name: AnyStr | None, ptype: str | None, encode: bool | None = True
-) -> str | None:
+    name: AnyStr | None,
+    qualifiers: Union[Union[str, bytes], dict[str, str], None],
+    ptype: str | None,
+    encode: bool | None = True,
+) -> Optional[str]:
     if not name:
         return None
 
     name_str = name if isinstance(name, str) else name.decode("utf-8")
     quoter = get_quoter(encode)
     name_str = quoter(name_str)
     name_str = name_str.strip().strip("/")
-    if ptype in ("bitbucket", "github", "pypi", "gitlab"):
+    if ptype and ptype in ("mlflow"):
+        return normalize_mlflow_name(name_str, qualifiers)
+    if ptype in ("bitbucket", "github", "pypi", "gitlab", "composer"):
         name_str = name_str.lower()
     if ptype == "pypi":
         name_str = name_str.replace("_", "-")
     return name_str or None
 
 
-def normalize_version(version: AnyStr | None, encode: bool | None = True) -> str | None:
+def normalize_version(
+    version: AnyStr | None, ptype: Optional[Union[str, bytes]], encode: bool | None = True
+) -> str | None:
     if not version:
         return None
 
     version_str = version if isinstance(version, str) else version.decode("utf-8")
     quoter = get_quoter(encode)
     version_str = quoter(version_str.strip())
+    if ptype and isinstance(ptype, str) and ptype in ("huggingface"):
+        return version_str.lower()
     return version_str or None
 
 
@@ -304,8 +334,8 @@ def normalize(
     """
     type_norm = normalize_type(type, encode)
     namespace_norm = normalize_namespace(namespace, type_norm, encode)
-    name_norm = normalize_name(name, type_norm, encode)
-    version_norm = normalize_version(version, encode)
+    name_norm = normalize_name(name, qualifiers, type_norm, encode)
+    version_norm = normalize_version(version, type, encode)
     qualifiers_norm = normalize_qualifiers(qualifiers, encode)
     subpath_norm = normalize_subpath(subpath, encode)
     return type_norm, namespace_norm, name_norm, version_norm, qualifiers_norm, subpath_norm
@@ -464,8 +494,19 @@ def from_string(cls, purl: str) -> Self:
         if not type_ or not sep:
             raise ValueError(f"purl is missing the required type component: {purl!r}.")
 
+        valid_chars = string.ascii_letters + string.digits + ".-_"
+        if not all(c in valid_chars for c in type_):
+            raise ValueError(
+                f"purl type must be composed only of ASCII letters and numbers, period, dash and underscore: {type_!r}."
+            )
+
+        if type_[0] in string.digits:
+            raise ValueError(f"purl type cannot start with a number: {type_!r}.")
+
         type_ = type_.lower()
 
+        original_remainder = remainder
+
         scheme, authority, path, qualifiers_str, subpath = _urlsplit(
             url=remainder, scheme="", allow_fragments=True
         )
@@ -480,7 +521,9 @@ def from_string(cls, purl: str) -> Self:
             path = authority + ":" + path
 
         if scheme:
-            path = scheme + ":" + path
+            # This is a way to preserve the casing of the original scheme
+            original_scheme = original_remainder.split(":", 1)[0]
+            path = original_scheme + ":" + path
 
         path = path.lstrip("/")
 

diff --git a/tests/test_purl_spec.py b/tests/test_purl_spec.py
@@ -0,0 +1,153 @@
+# Copyright (c) the purl authors
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Visit https://github.com/package-url/packageurl-python for support and
+# download.
+
+import json
+import os
+
+import pytest
+
+from packageurl import PackageURL
+
+current_dir = os.path.dirname(__file__)
+root_dir = os.path.abspath(os.path.join(current_dir, ".."))
+spec_file_path = os.path.join(root_dir, "spec", "tests", "spec", "specification-test.json")
+
+with open(spec_file_path, "r", encoding="utf-8") as f:
+    test_cases = json.load(f)
+
+tests = test_cases["tests"]
+
+parse_tests = [t for t in tests if t["test_type"] == "parse"]
+build_tests = [t for t in tests if t["test_type"] == "build"]
+
+
+def load_spec_files(spec_dir):
+    """
+    Load all JSON files from the given directory into a dictionary.
+    Key = filename, Value = parsed JSON content
+    """
+    spec_data = {}
+    for filename in os.listdir(spec_dir):
+        if filename.endswith("-test.json"):
+            filepath = os.path.join(spec_dir, filename)
+            with open(filepath, "r", encoding="utf-8") as f:
+                try:
+                    data = json.load(f)
+                    spec_data[filename] = data["tests"]
+                except json.JSONDecodeError as e:
+                    print(f"Error parsing {filename}: {e}")
+    return spec_data
+
+
+SPEC_DIR = os.path.join(os.path.dirname(__file__), "..", "spec", "tests", "types")
+spec_dict = load_spec_files(SPEC_DIR)
+
+flattened_cases = []
+for filename, cases in spec_dict.items():
+    for case in cases:
+        flattened_cases.append((filename, case["description"], case))
+
+
+@pytest.mark.parametrize(
+    "description, input_str, expected_output, expected_failure",
+    [
+        (t["description"], t["input"], t["expected_output"], t["expected_failure"])
+        for t in parse_tests
+    ],
+)
+def test_parse(description, input_str, expected_output, expected_failure):
+    if expected_failure:
+        with pytest.raises(Exception):
+            PackageURL.from_string(input_str)
+    else:
+        result = PackageURL.from_string(input_str)
+        assert result.to_string() == expected_output
+
+
+@pytest.mark.parametrize(
+    "description, input_dict, expected_output, expected_failure",
+    [
+        (t["description"], t["input"], t["expected_output"], t["expected_failure"])
+        for t in build_tests
+    ],
+)
+def test_build(description, input_dict, expected_output, expected_failure):
+    kwargs = {
+        "type": input_dict.get("type"),
+        "namespace": input_dict.get("namespace"),
+        "name": input_dict.get("name"),
+        "version": input_dict.get("version"),
+        "qualifiers": input_dict.get("qualifiers"),
+        "subpath": input_dict.get("subpath"),
+    }
+
+    if expected_failure:
+        with pytest.raises(Exception):
+            PackageURL(**kwargs).to_string()
+    else:
+        purl = PackageURL(**kwargs)
+        assert purl.to_string() == expected_output
+
+
+@pytest.mark.parametrize("filename,description,test_case", flattened_cases)
+def test_package_type_case(filename, description, test_case):
+    test_type = test_case["test_type"]
+    expected_failure = test_case.get("expected_failure", False)
+
+    if expected_failure:
+        with pytest.raises(Exception):
+            run_test_case(test_case, test_type, description)
+    else:
+        run_test_case(test_case, test_type, description)
+
+
+def run_test_case(case, test_type, desc):
+    if test_type == "parse":
+        purl = PackageURL.from_string(case["input"])
+        expected = case["expected_output"]
+        assert purl.type == expected["type"]
+        assert purl.namespace == expected["namespace"]
+        assert purl.name == expected["name"]
+        assert purl.version == expected["version"]
+        if expected["qualifiers"]:
+            assert purl.qualifiers == expected["qualifiers"]
+        else:
+            assert not purl.qualifiers
+        assert purl.subpath == expected["subpath"]
+
+    elif test_type == "roundtrip":
+        purl = PackageURL.from_string(case["input"])
+        assert purl.to_string() == case["expected_output"]
+
+    elif test_type == "build":
+        input_data = case["input"]
+        purl = PackageURL(
+            type=input_data["type"],
+            namespace=input_data["namespace"],
+            name=input_data["name"],
+            version=input_data["version"],
+            qualifiers=input_data.get("qualifiers"),
+            subpath=input_data.get("subpath"),
+        )
+        assert purl.to_string() == case["expected_output"]