
Commit 0a8fa2a

Commit message: updating docs
1 parent f5214ca

File tree

4 files changed: +445 / -93 lines


dbldatagen/spec/column_spec.py

Lines changed: 54 additions & 2 deletions
@@ -22,7 +22,48 @@
     "bigint",
     "tinyint",
 ]
+"""Type alias representing supported basic Spark SQL data types for column definitions.
+
+Includes both standard SQL types (e.g. string, int, double) and Spark-specific type names
+(e.g. bigint, tinyint). These types are used in the ColumnDefinition to specify the data type
+for generated columns.
+"""
+
+
 class ColumnDefinition(BaseModel):
+    """Defines the specification for a single column in a synthetic data table.
+
+    This class encapsulates all the information needed to generate data for a single column,
+    including its name, type, constraints, and generation options. It supports both primary key
+    columns and derived columns that can reference other columns.
+
+    :param name: Name of the column to be generated
+    :param type: Spark SQL data type for the column (e.g., "string", "int", "timestamp").
+        If None, type may be inferred from options or baseColumn
+    :param primary: If True, this column will be treated as a primary key column with unique values.
+        Primary columns cannot have min/max options and cannot be nullable
+    :param options: Dictionary of additional options controlling column generation behavior.
+        Common options include: min, max, step, values, template, distribution, etc.
+        See dbldatagen documentation for full list of available options
+    :param nullable: If True, the column may contain NULL values. Primary columns cannot be nullable
+    :param omit: If True, this column will be generated internally but excluded from the final output.
+        Useful for intermediate columns used in calculations
+    :param baseColumn: Name of another column to use as the basis for generating this column's values.
+        Default is "id" which refers to the internal row identifier
+    :param baseColumnType: Method for deriving values from the baseColumn. Common values:
+        "auto" (infer behavior), "hash" (hash the base column values),
+        "values" (use base column values directly)
+
+    .. note::
+        Primary columns have special constraints:
+        - Must have a type defined
+        - Cannot have min/max options
+        - Cannot be nullable
+
+    .. note::
+        Columns can be chained via baseColumn references, but circular dependencies
+        will be caught during validation
+    """
     name: str
     type: DbldatagenBasicType | None = None
     primary: bool = False

@@ -34,8 +75,19 @@ class ColumnDefinition(BaseModel):
 
     @root_validator()
     def check_model_constraints(cls, values: dict[str, Any]) -> dict[str, Any]:
-        """
-        Validates constraints across the entire model after individual fields are processed.
+        """Validates constraints across the entire ColumnDefinition model.
+
+        This validator runs after all individual field validators and checks for cross-field
+        constraints that depend on multiple fields being set. It ensures that primary key
+        columns meet all necessary requirements and that conflicting options are not specified.
+
+        :param values: Dictionary of all field values for this ColumnDefinition instance
+        :returns: The validated values dictionary, unmodified if all validations pass
+        :raises ValueError: If primary column has min/max options, or if primary column is nullable,
+            or if primary column doesn't have a type defined
+
+        .. note::
+            This is a Pydantic root validator that runs automatically during model instantiation
         """
         is_primary = values.get("primary")
         options = values.get("options") or {}  # Handle None case
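
For orientation, here is a minimal usage sketch of the ColumnDefinition model documented above. The import path (dbldatagen.spec.column_spec), the example column names, and the option keys are assumptions inferred from the file path and docstring in this diff, not part of the commit itself.

# Hypothetical usage of the ColumnDefinition model documented above.
# The import path and option keys are assumptions, not taken from this commit.
from dbldatagen.spec.column_spec import ColumnDefinition

# Primary key column: must declare a type, cannot be nullable, no min/max options.
customer_id = ColumnDefinition(name="customer_id", type="bigint", primary=True)

# Derived column: hashes the primary key values into a string code.
customer_code = ColumnDefinition(
    name="customer_code",
    type="string",
    baseColumn="customer_id",
    baseColumnType="hash",
)

# Intermediate column: generated internally but excluded from the final output.
raw_weight = ColumnDefinition(name="raw_weight", type="double", omit=True)

# Violating a primary-key constraint is rejected by check_model_constraints.
# Under the Pydantic V1 API, ValidationError subclasses ValueError, so this catch works.
try:
    ColumnDefinition(name="bad_pk", type="int", primary=True, options={"min": 1, "max": 100})
except ValueError as err:
    print(f"validation failed: {err}")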

dbldatagen/spec/compat.py

Lines changed: 48 additions & 21 deletions
@@ -1,30 +1,57 @@
-# This module acts as a compatibility layer for Pydantic V1 and V2.
+"""Pydantic compatibility layer for supporting both Pydantic V1 and V2.
+
+This module provides a unified interface for Pydantic functionality that works across both
+Pydantic V1.x and V2.x versions. It ensures that the dbldatagen spec API works in multiple
+environments without requiring specific Pydantic version installations.
+
+The module exports a consistent Pydantic V1-compatible API regardless of which version is installed:
+
+- **BaseModel**: Base class for all Pydantic models
+- **Field**: Field definition with metadata and validation
+- **constr**: Constrained string type for validation
+- **root_validator**: Decorator for model-level validation
+- **validator**: Decorator for field-level validation
+
+Usage in other modules:
+    Always import from this compat module, not directly from pydantic::
+
+        # Correct
+        from .compat import BaseModel, validator
+
+        # Incorrect - don't do this
+        from pydantic import BaseModel, validator
+
+Environment Support:
+    - **Pydantic V2.x environments**: Imports from pydantic.v1 compatibility layer
+    - **Pydantic V1.x environments**: Imports directly from pydantic package
+    - **Databricks runtimes**: Works with pre-installed Pydantic versions without conflicts
+
+.. note::
+    This approach is inspired by FastAPI's compatibility layer:
+    https://github.com/fastapi/fastapi/blob/master/fastapi/_compat.py
+
+Benefits:
+    - **No Installation Required**: Works with whatever Pydantic version is available
+    - **Single Codebase**: One set of code works across both Pydantic versions
+    - **Environment Agnostic**: Application code doesn't need to know which version is installed
+    - **Future-Ready**: Easy migration path to Pydantic V2 API when ready
+    - **Databricks Compatible**: Avoids conflicts with pre-installed libraries
+
+Future Migration:
+    When ready to migrate to native Pydantic V2 API:
+    1. Update application code to use V2 patterns
+    2. Modify this compat.py to import from native V2 locations
+    3. Test in both environments
+    4. Deploy incrementally
+"""
 
 try:
     # This will succeed on environments with Pydantic V2.x
+    # Pydantic V2 provides a v1 compatibility layer for backwards compatibility
     from pydantic.v1 import BaseModel, Field, constr, root_validator, validator
 except ImportError:
     # This will be executed on environments with only Pydantic V1.x
+    # Import directly from pydantic since v1 subpackage doesn't exist
     from pydantic import BaseModel, Field, constr, root_validator, validator  # type: ignore[assignment,no-redef]
 
 __all__ = ["BaseModel", "Field", "constr", "root_validator", "validator"]
-
-# In your application code, do this:
-# from .compat import BaseModel
-# NOT this:
-# from pydantic import BaseModel
-
-# FastAPI Notes
-# https://github.com/fastapi/fastapi/blob/master/fastapi/_compat.py
-
-
-"""
-## Why This Approach
-No Installation Required: It directly addresses your core requirement.
-You don't need to %pip install anything, which avoids conflicts with the pre-installed libraries on Databricks.
-Single Codebase: You maintain one set of code that is guaranteed to work with the Pydantic V1 API, which is available in both runtimes.
-
-Environment Agnostic: Your application code in models.py has no idea which version of Pydantic is actually installed. The compat.py module handles that complexity completely.
-
-Future-Ready: When you eventually decide to migrate fully to the Pydantic V2 API (to take advantage of its speed and features),
-you only need to change your application code and your compat.py import statements, making the transition much clearer.
-"""
