
Commit 0a8fa2a

Commit message: updating docs
1 parent f5214ca

File tree

4 files changed: +445 / -93 lines


dbldatagen/spec/column_spec.py

Lines changed: 54 additions & 2 deletions
@@ -22,7 +22,48 @@
     "bigint",
     "tinyint",
 ]
+"""Type alias representing supported basic Spark SQL data types for column definitions.
+
+Includes both standard SQL types (e.g. string, int, double) and Spark-specific type names
+(e.g. bigint, tinyint). These types are used in the ColumnDefinition to specify the data type
+for generated columns.
+"""
+
+
 class ColumnDefinition(BaseModel):
+    """Defines the specification for a single column in a synthetic data table.
+
+    This class encapsulates all the information needed to generate data for a single column,
+    including its name, type, constraints, and generation options. It supports both primary key
+    columns and derived columns that can reference other columns.
+
+    :param name: Name of the column to be generated
+    :param type: Spark SQL data type for the column (e.g., "string", "int", "timestamp").
+        If None, type may be inferred from options or baseColumn
+    :param primary: If True, this column will be treated as a primary key column with unique values.
+        Primary columns cannot have min/max options and cannot be nullable
+    :param options: Dictionary of additional options controlling column generation behavior.
+        Common options include: min, max, step, values, template, distribution, etc.
+        See dbldatagen documentation for full list of available options
+    :param nullable: If True, the column may contain NULL values. Primary columns cannot be nullable
+    :param omit: If True, this column will be generated internally but excluded from the final output.
+        Useful for intermediate columns used in calculations
+    :param baseColumn: Name of another column to use as the basis for generating this column's values.
+        Default is "id" which refers to the internal row identifier
+    :param baseColumnType: Method for deriving values from the baseColumn. Common values:
+        "auto" (infer behavior), "hash" (hash the base column values),
+        "values" (use base column values directly)
+
+    .. note::
+        Primary columns have special constraints:
+        - Must have a type defined
+        - Cannot have min/max options
+        - Cannot be nullable
+
+    .. note::
+        Columns can be chained via baseColumn references, but circular dependencies
+        will be caught during validation
+    """
     name: str
     type: DbldatagenBasicType | None = None
     primary: bool = False

@@ -34,8 +75,19 @@ class ColumnDefinition(BaseModel):
 
     @root_validator()
     def check_model_constraints(cls, values: dict[str, Any]) -> dict[str, Any]:
-        """
-        Validates constraints across the entire model after individual fields are processed.
+        """Validates constraints across the entire ColumnDefinition model.
+
+        This validator runs after all individual field validators and checks for cross-field
+        constraints that depend on multiple fields being set. It ensures that primary key
+        columns meet all necessary requirements and that conflicting options are not specified.
+
+        :param values: Dictionary of all field values for this ColumnDefinition instance
+        :returns: The validated values dictionary, unmodified if all validations pass
+        :raises ValueError: If primary column has min/max options, or if primary column is nullable,
+            or if primary column doesn't have a type defined
+
+        .. note::
+            This is a Pydantic root validator that runs automatically during model instantiation
         """
         is_primary = values.get("primary")
         options = values.get("options") or {}  # Handle None case
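
For orientation, here is a minimal usage sketch of the ColumnDefinition model documented above. The import path (dbldatagen.spec.column_spec), the example column names, and the option keys are assumptions inferred from the file path and docstring in this diff, not part of the commit itself.

# Hypothetical usage of the ColumnDefinition model documented above.
# The import path and option keys are assumptions, not taken from this commit.
from dbldatagen.spec.column_spec import ColumnDefinition

# Primary key column: must declare a type, cannot be nullable, no min/max options.
customer_id = ColumnDefinition(name="customer_id", type="bigint", primary=True)

# Derived column: hashes the primary key values into a string code.
customer_code = ColumnDefinition(
    name="customer_code",
    type="string",
    baseColumn="customer_id",
    baseColumnType="hash",
)

# Intermediate column: generated internally but excluded from the final output.
raw_weight = ColumnDefinition(name="raw_weight", type="double", omit=True)

# Violating a primary-key constraint is rejected by check_model_constraints.
# Under the Pydantic V1 API, ValidationError subclasses ValueError, so this catch works.
try:
    ColumnDefinition(name="bad_pk", type="int", primary=True, options={"min": 1, "max": 100})
except ValueError as err:
    print(f"validation failed: {err}")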

dbldatagen/spec/compat.py

Lines changed: 48 additions & 21 deletions
@@ -1,30 +1,57 @@
-# This module acts as a compatibility layer for Pydantic V1 and V2.
+"""Pydantic compatibility layer for supporting both Pydantic V1 and V2.
+
+This module provides a unified interface for Pydantic functionality that works across both
+Pydantic V1.x and V2.x versions. It ensures that the dbldatagen spec API works in multiple
+environments without requiring specific Pydantic version installations.
+
+The module exports a consistent Pydantic V1-compatible API regardless of which version is installed:
+
+- **BaseModel**: Base class for all Pydantic models
+- **Field**: Field definition with metadata and validation
+- **constr**: Constrained string type for validation
+- **root_validator**: Decorator for model-level validation
+- **validator**: Decorator for field-level validation
+
+Usage in other modules:
+    Always import from this compat module, not directly from pydantic::
+
+        # Correct
+        from .compat import BaseModel, validator
+
+        # Incorrect - don't do this
+        from pydantic import BaseModel, validator
+
+Environment Support:
+    - **Pydantic V2.x environments**: Imports from pydantic.v1 compatibility layer
+    - **Pydantic V1.x environments**: Imports directly from pydantic package
+    - **Databricks runtimes**: Works with pre-installed Pydantic versions without conflicts
+
+.. note::
+    This approach is inspired by FastAPI's compatibility layer:
+    https://github.com/fastapi/fastapi/blob/master/fastapi/_compat.py
+
+Benefits:
+    - **No Installation Required**: Works with whatever Pydantic version is available
+    - **Single Codebase**: One set of code works across both Pydantic versions
+    - **Environment Agnostic**: Application code doesn't need to know which version is installed
+    - **Future-Ready**: Easy migration path to Pydantic V2 API when ready
+    - **Databricks Compatible**: Avoids conflicts with pre-installed libraries
+
+Future Migration:
+    When ready to migrate to native Pydantic V2 API:
+    1. Update application code to use V2 patterns
+    2. Modify this compat.py to import from native V2 locations
+    3. Test in both environments
+    4. Deploy incrementally
+"""
 
 try:
     # This will succeed on environments with Pydantic V2.x
+    # Pydantic V2 provides a v1 compatibility layer for backwards compatibility
     from pydantic.v1 import BaseModel, Field, constr, root_validator, validator
 except ImportError:
     # This will be executed on environments with only Pydantic V1.x
+    # Import directly from pydantic since v1 subpackage doesn't exist
     from pydantic import BaseModel, Field, constr, root_validator, validator  # type: ignore[assignment,no-redef]
 
 __all__ = ["BaseModel", "Field", "constr", "root_validator", "validator"]
-
-# In your application code, do this:
-# from .compat import BaseModel
-# NOT this:
-# from pydantic import BaseModel
-
-# FastAPI Notes
-# https://github.com/fastapi/fastapi/blob/master/fastapi/_compat.py
-
-
-"""
-## Why This Approach
-No Installation Required: It directly addresses your core requirement.
-You don't need to %pip install anything, which avoids conflicts with the pre-installed libraries on Databricks.
-Single Codebase: You maintain one set of code that is guaranteed to work with the Pydantic V1 API, which is available in both runtimes.
-
-Environment Agnostic: Your application code in models.py has no idea which version of Pydantic is actually installed. The compat.py module handles that complexity completely.
-
-Future-Ready: When you eventually decide to migrate fully to the Pydantic V2 API (to take advantage of its speed and features),
-you only need to change your application code and your compat.py import statements, making the transition much clearer.
-"""
