Commit 5a287c1

Add methods for persisting generated data (#352)
* added use of ABC to mark TextGenerator as abstract
* Lint text generators module
* Add persistence methods
* Add tests and docs; Update PR template
* Update hatch installation for push action
* Refactor
* Update method names and signatures

---------

Co-authored-by: ronanstokes-db <ronan.stokes@databricks.com>
Co-authored-by: Ronan Stokes <42389040+ronanstokes-db@users.noreply.github.com>
1 parent 3aca7f3 commit 5a287c1

File tree: 13 files changed (+404, -79 lines)

.github/workflows/push.yml

Lines changed: 2 additions & 1 deletion
@@ -61,7 +61,8 @@ jobs:
           python-version: '3.10'

       - name: Install Hatch
-        run: pip install hatch
+        # click 8.3+ introduced bug for hatch
+        run: pip install "hatch==1.13.0" "click<8.3"

       - name: Run unit tests
         run: make dev test

PULL_REQUEST_TEMPLATE.md

Lines changed: 11 additions & 30 deletions
@@ -1,34 +1,15 @@
-## Proposed changes
+## Changes
+<!-- Summarize your changes. Add code examples or screenshots when necessary. -->

-Describe the big picture of your changes here to communicate to the maintainers.
-If it fixes a bug or resolves a feature request, please provide a link to that issue.
+### Linked issues
+<!-- DOC: Link issue with a keyword: close, closes, closed, fix, fixes, fixed, resolve, resolves, resolved. See https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword -->

-## Types of changes
+Resolves #..

-What types of changes does your code introduce to dbldatagen?
-_Put an `x` in the boxes that apply_
+### Requirements
+<!-- How are your changes documented and tested? Please see the checklist below. -->

-- [ ] Bug fix (non-breaking change which fixes an issue)
-- [ ] New feature (non-breaking change which adds functionality)
-- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected)
-- [ ] Change to tutorials, tests or examples
-- [ ] Non code change (readme, images or other non-code assets)
-- [ ] Documentation Update (if none of the other choices apply)
-
-## Checklist
-
-_Put an `x` in the boxes that apply. You can also fill these out after creating the PR.
-If you're unsure about any of them, don't hesitate to ask. We're here to help!
-This is simply a reminder of what we are going to look for before merging your code._
-
-- [ ] Lint and unit tests pass locally with my changes
-- [ ] I have added tests that prove my fix is effective or that my feature works
-- [ ] I have added necessary documentation (if appropriate)
-- [ ] Any dependent changes have been merged and published in downstream modules
-- [ ] Submission does not reduce code coverage numbers
-- [ ] Submission does not increase alerts or messages from prospector / lint
-
-## Further comments
-
-If this is a relatively large or complex change, kick off the discussion by explaining why you chose the solution you
-did and what alternatives you considered, etc...
+- [ ] manually tested
+- [ ] updated documentation
+- [ ] updated demos
+- [ ] updated tests

dbldatagen/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -47,11 +47,12 @@
 from .text_generator_plugins import PyfuncText, PyfuncTextFactory, FakerTextFactory, fakerText
 from .html_utils import HtmlUtils
 from .datasets_object import Datasets
+from .config import OutputDataset

 __all__ = ["data_generator", "data_analyzer", "schema_parser", "daterange", "nrange",
            "column_generation_spec", "utils", "function_builder",
            "spark_singleton", "text_generators", "datarange", "datagen_constants",
-           "text_generator_plugins", "html_utils", "datasets_object", "constraints"
+           "text_generator_plugins", "html_utils", "datasets_object", "constraints", "config"
           ]

dbldatagen/config.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+This module implements configuration classes for writing generated data.
+"""
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True, slots=True)
+class OutputDataset:
+    """
+    This class implements an output sink configuration used to write generated data. An output location must be
+    provided. The output mode, format, and options can be provided.
+
+    :param location: Output location for writing data. This could be an absolute path, a relative path to a Databricks
+        Volume, or a full table location using Unity Catalog's 3-level namespace.
+    :param output_mode: Output mode for writing data (default is ``"append"``).
+    :param format: Output data format (default is ``"delta"``).
+    :param options: Optional dictionary of options for writing data (e.g. ``{"mergeSchema": "true"}``)
+    """
+    location: str
+    output_mode: str = "append"
+    format: str = "delta"
+    options: dict[str, str] | None = None
+    trigger: dict[str, str] | None = None
+
+    def __post_init__(self) -> None:
+        if not self.trigger:
+            return
+
+        # Only processingTime is currently supported
+        if "processingTime" not in self.trigger:
+            valid_trigger_format = '{"processingTime": "10 SECONDS"}'
+            raise ValueError(f"Attribute 'trigger' must be a dictionary of the form '{valid_trigger_format}'")
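
For context, here is a minimal usage sketch of the `OutputDataset` dataclass added above. It uses only the fields introduced in this commit (`location`, `output_mode`, `format`, `options`, `trigger`); the table names and checkpoint path are hypothetical. Because the dataclass is frozen, instances are immutable once constructed.

```python
from dbldatagen import OutputDataset

# Batch sink using the defaults: "append" output mode, "delta" format.
# The Unity Catalog table name is a hypothetical example.
batch_sink = OutputDataset(location="main.demo.synthetic_users")

# Streaming sink: per __post_init__, the trigger must contain a "processingTime" entry.
stream_sink = OutputDataset(
    location="main.demo.synthetic_events",
    options={"checkpointLocation": "/Volumes/main/demo/checkpoints/synthetic_events"},
    trigger={"processingTime": "10 SECONDS"},
)

# Any other trigger shape is rejected:
# OutputDataset(location="main.demo.bad", trigger={"once": "true"})  # raises ValueError
```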

dbldatagen/data_generator.py

Lines changed: 41 additions & 12 deletions
@@ -15,11 +15,13 @@
 from typing import Any

 from pyspark.sql import DataFrame, SparkSession
+from pyspark.sql.streaming.query import StreamingQuery
 from pyspark.sql.types import DataType, IntegerType, LongType, StringType, StructField, StructType

 from dbldatagen import datagen_constants
 from dbldatagen._version import _get_spark_version
 from dbldatagen.column_generation_spec import ColumnGenerationSpec
+from dbldatagen.config import OutputDataset
 from dbldatagen.constraints import Constraint, SqlExpr
 from dbldatagen.datarange import DataRange
 from dbldatagen.distributions import DataDistribution
@@ -28,7 +30,14 @@
 from dbldatagen.serialization import SerializableToDict
 from dbldatagen.spark_singleton import SparkSingleton
 from dbldatagen.text_generators import TextGenerator
-from dbldatagen.utils import DataGenError, deprecated, ensure, split_list_matching_condition, topologicalSort
+from dbldatagen.utils import (
+    DataGenError,
+    deprecated,
+    ensure,
+    split_list_matching_condition,
+    topologicalSort,
+    write_data_to_output,
+)
@@ -1204,9 +1213,9 @@ def _generateColumnDefinition(
     ) -> ColumnGenerationSpec:
         """ generate field definition and column spec

-        .. note:: Any time that a new column definition is added,
-                  we'll mark that the build plan needs to be regenerated.
-                  For our purposes, the build plan determines the order of column generation etc.
+        .. note::
+            Any time that a new column definition is added, we'll mark that the build plan needs to be regenerated.
+            For our purposes, the build plan determines the order of column generation etc.

         :returns: Newly added column_spec
         """
@@ -1381,7 +1390,6 @@ def _adjustBuildOrderForSqlDependencies(self, buildOrder: list[list[str]], colum
         :param buildOrder: list of lists of ids - each sublist represents phase of build
         :param columnSpecsByName: dictionary to map column names to column specs
         :returns: Spark SQL dataframe of generated test data
-
         """
         new_build_order = []
@@ -1476,8 +1484,8 @@ def withConstraint(self, constraint: Constraint) -> "DataGenerator":
         :returns: A modified version of the current DataGenerator with the constraint applied

         .. note::
-          Constraints are applied at the end of the data generation. Depending on the type of the constraint, the
-          constraint may also affect other aspects of the data generation.
+            Constraints are applied at the end of the data generation. Depending on the type of the constraint, the
+            constraint may also affect other aspects of the data generation.
         """
         assert constraint is not None, "Constraint cannot be empty"
         assert isinstance(constraint, Constraint), \
@@ -1494,8 +1502,8 @@ def withConstraints(self, constraints: list[Constraint]) -> "DataGenerator":
         :returns: A modified version of the current `DataGenerator` with the constraints applied

         .. note::
-          Constraints are applied at the end of the data generation. Depending on the type of the constraint, the
-          constraint may also affect other aspects of the data generation.
+            Constraints are applied at the end of the data generation. Depending on the type of the constraint, the
+            constraint may also affect other aspects of the data generation.
         """
         assert constraints is not None, "Constraints list cannot be empty"
@@ -1515,9 +1523,9 @@ def withSqlConstraint(self, sqlExpression: str) -> "DataGenerator":
         :returns: A modified version of the current `DataGenerator` with the SQL expression constraint applied

         .. note::
-          Note in the current implementation, this may be equivalent to adding where clauses to the generated dataframe
-          but in future releases, this may be optimized to affect the underlying data generation so that constraints
-          are satisfied more efficiently.
+            Note in the current implementation, this may be equivalent to adding where clauses to the generated dataframe
+            but in future releases, this may be optimized to affect the underlying data generation so that constraints
+            are satisfied more efficiently.
         """
         self.withConstraint(SqlExpr(sqlExpression))
         return self
@@ -1909,6 +1917,27 @@ def scriptMerge(

         return result

+    def saveAsDataset(
+        self,
+        dataset: OutputDataset,
+        with_streaming: bool | None = None,
+        generator_options: dict[str, Any] | None = None
+    ) -> StreamingQuery | None:
+        """
+        Builds a `DataFrame` from the `DataGenerator` and writes the data to an output dataset (e.g. a table or files).
+
+        :param dataset: Output dataset for writing generated data
+        :param with_streaming: Whether to generate data using streaming. If None, auto-detects based on trigger
+        :param generator_options: Options for building the generator (e.g. `{"rowsPerSecond": 100}`)
+        :returns: A Spark `StreamingQuery` if data is written in streaming, otherwise `None`
+        """
+        # Auto-detect streaming mode if not explicitly specified
+        if with_streaming is None:
+            with_streaming = dataset.trigger is not None and len(dataset.trigger) > 0
+
+        df = self.build(withStreaming=with_streaming, options=generator_options)
+        return write_data_to_output(df, output_dataset=dataset)
+
     @staticmethod
     def loadFromJson(options: str) -> "DataGenerator":
         """

dbldatagen/datasets/basic_stock_ticker.py

Lines changed: 1 addition & 2 deletions
@@ -15,7 +15,7 @@
 class BasicStockTickerProvider(DatasetProvider.NoAssociatedDatasetsMixin, DatasetProvider):
     """
     Basic Stock Ticker Dataset
-    ========================
+    ==========================

     This is a basic stock ticker dataset with time-series `symbol`, `open`, `close`, `high`, `low`,
     `adj_close`, and `volume` values.
@@ -31,7 +31,6 @@ class BasicStockTickerProvider(DatasetProvider.NoAssociatedDatasetsMixin, Datase

     Note that this dataset does not use any features that would prevent it from being used as a source for a
     streaming dataframe, and so the flag `supportsStreaming` is set to True.
-
     """
     DEFAULT_NUM_SYMBOLS = 100
     DEFAULT_START_DATE = "2024-10-01"

dbldatagen/datasets_object.py

Lines changed: 26 additions & 24 deletions
@@ -20,8 +20,9 @@
 import re

 from dbldatagen.datasets.dataset_provider import DatasetProvider
-from .spark_singleton import SparkSingleton
-from .utils import strip_margins
+from dbldatagen.spark_singleton import SparkSingleton
+from dbldatagen.utils import strip_margins
+from dbldatagen.data_generator import DataGenerator


 class Datasets:
@@ -211,12 +212,6 @@ def get(self, table=None, rows=-1, partitions=-1, **kwargs):
         If the dataset supports multiple tables, the table may be specified in the `table` parameter.
         If none is specified, the primary table is used.

-        :param table: name of table to retrieve
-        :param rows: number of rows to generate. if -1, provider should compute defaults.
-        :param partitions: number of partitions to use.If -1, the number of partitions is computed automatically
-            table size and partitioning.If applied to a dataset with only a single table, this is ignored.
-        :param kwargs: additional keyword arguments to pass to the provider
-
         If `rows` or `partitions` are not specified, default values are supplied by the provider.

@@ -225,41 +220,44 @@ def get(self, table=None, rows=-1, partitions=-1, **kwargs):
         Additionally, for multi-table datasets, the table name must be one of the tables supported by the provider.
         Default number of rows for multi-table datasets may differ - for example a 'customers' table may have a
         100,000 rows while a 'sales' table may have 1,000,000 rows.
+
+        :param table: name of table to retrieve
+        :param rows: number of rows to generate. if -1, provider should compute defaults.
+        :param partitions: number of partitions to use.If -1, the number of partitions is computed automatically
+            table size and partitioning.If applied to a dataset with only a single table, this is ignored.
+        :param kwargs: additional keyword arguments to pass to the provider
+        :returns: table generator
         """

         return self._get(providerName=self._name, tableName=table, rows=rows, partitions=partitions,
                          **kwargs)

     def _getSupportingTable(self, *, providerName, tableName, rows=-1, partitions=-1, **kwargs):
-        providerInstance, providerDefinition = \
-            self._getProviderInstanceAndMetadata(providerName, supportsStreaming=self._streamingRequired)
+        providerInstance, providerDefinition = self._getProviderInstanceAndMetadata(
+            providerName, supportsStreaming=self._streamingRequired
+        )

         assert tableName is not None and len(tableName.strip()) > 0, "Data set name must be provided"

         if tableName not in providerDefinition.associatedDatasets:
             raise ValueError(f"Dataset `{tableName}` not a recognized dataset option")

-        dfSupportingTable = providerInstance.getAssociatedDataset(self._sparkSession, tableName=tableName, rows=rows,
-                                                                  partitions=partitions,
-                                                                  **kwargs)
+        dfSupportingTable = providerInstance.getAssociatedDataset(
+            self._sparkSession, tableName=tableName, rows=rows, partitions=partitions, **kwargs
+        )
         return dfSupportingTable

-    def getAssociatedDataset(self, *, table, rows=-1, partitions=-1, **kwargs):
-        """Get a table generator from the dataset provider
+    def getAssociatedDataset(self, *, table, rows=-1, partitions=-1, **kwargs) -> DataGenerator:
+        """
+        Gets a table generator from the dataset provider.

-        These are DataGenerator instances that can be used to generate the data.
+        Associated datasets are DataGenerator instances that can be used to generate the data.
         The dataset providers also optionally can provide supporting tables which are computed tables based on
         parameters. These are retrieved using the `getAssociatedDataset` method

         If the dataset supports multiple tables, the table may be specified in the `table` parameter.
         If none is specified, the primary table is used.

-        :param table: name of table to retrieve
-        :param rows: number of rows to generate. if -1, provider should compute defaults.
-        :param partitions: number of partitions to use.If -1, the number of partitions is computed automatically
-            table size and partitioning.If applied to a dataset with only a single table, this is ignored.
-        :param kwargs: additional keyword arguments to pass to the provider
-
         If `rows` or `partitions` are not specified, default values are supplied by the provider.

         For multi-table datasets, the table name must be specified. For single table datasets, the table name may
@@ -269,9 +267,13 @@ def getAssociatedDataset(self, *, table, rows=-1, partitions=-1, **kwargs):
         Default number of rows for multi-table datasets may differ - for example a 'customers' table may have a
         100,000 rows while a 'sales' table may have 1,000,000 rows.

-        .. note ::
+        :param table: Name of table to retrieve
+        :param rows: Number of rows to generate. if -1, provider should compute defaults
+        :param partitions: number of partitions to use. If -1, the number of partitions is computed automatically table
+            size and partitioning. If applied to a dataset with only a single table, this is ignored.

-            This method may also be invoked via the aliased names - `getSupportingDataset` and `getCombinedDataset`
+        .. note ::
+            This method may also be invoked via the aliased names - `getSupportingDataset` and `getCombinedDataset`
         """
         return self._getSupportingTable(providerName=self._name, tableName=table, rows=rows, partitions=partitions,
                                         **kwargs)
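
Finally, a brief sketch of the `Datasets` entry points whose docstrings are reorganized above. The provider name "basic/user" is an assumption used for illustration (any registered provider works), and the associated-table call is shown commented out because table names are provider-specific.

```python
import dbldatagen as dg
from dbldatagen import OutputDataset

# get() returns the "table generator" (a DataGenerator); build() produces the DataFrame.
ds = dg.Datasets(spark, "basic/user")   # provider name assumed for illustration
user_spec = ds.get(rows=10_000)
df_users = user_spec.build()

# Because get() yields a DataGenerator, the spec can also be persisted directly
# with the saveAsDataset() method added in this commit (table name hypothetical):
# user_spec.saveAsDataset(OutputDataset(location="main.demo.basic_users"))

# getAssociatedDataset() returns a generator for a provider-defined supporting table;
# per the docstring above, it can also be reached via getSupportingDataset / getCombinedDataset.
# assoc_spec = ds.getAssociatedDataset(table="<associated_table_name>", rows=1_000)
```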
