From c340e01e6e766e45b1b65cdd648ca236871dc299 Mon Sep 17 00:00:00 2001
From: ronanstokes-db
Date: Sat, 4 Feb 2023 21:11:42 -0800
Subject: [PATCH 1/4] new documentation on generating JSON data

---
 docs/source/generating_json_data.rst | 236 +++++++++++++++++++++++++++
 docs/source/index.rst                |   1 +
 2 files changed, 237 insertions(+)
 create mode 100644 docs/source/generating_json_data.rst

diff --git a/docs/source/generating_json_data.rst b/docs/source/generating_json_data.rst
new file mode 100644
index 00000000..15fe5445
--- /dev/null
+++ b/docs/source/generating_json_data.rst
@@ -0,0 +1,236 @@
.. Test Data Generator documentation master file, created by
   sphinx-quickstart on Sun Jun 21 10:54:30 2020.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Generating JSON and structured column data
==========================================

This section explores generating JSON and structured column data. By structured columns,
we mean columns that are some combination of `struct`, `array` and `map` of other types.

Generating JSON data
--------------------
There are several method for generating JSON data:

- Generating a dataframe and saving it in JSON format will write out the full data set as JSON
- Generate JSON valued fields using SQL functions such as `named_struct` and `to_json`

Writing dataframe as JSON data
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The following example illustrates the basic technique for generating JSON data from a dataframe.

.. code-block:: python

    from pyspark.sql.types import LongType, IntegerType, StringType

    import dbldatagen as dg


    country_codes = ['CN', 'US', 'FR', 'CA', 'IN', 'JM', 'IE', 'PK', 'GB', 'IL', 'AU', 'SG',
                     'ES', 'GE', 'MX', 'ET', 'SA', 'LB', 'NL']
    country_weights = [1300, 365, 67, 38, 1300, 3, 7, 212, 67, 9, 25, 6, 47, 83, 126, 109, 58, 8,
                       17]

    manufacturers = ['Delta corp', 'Xyzzy Inc.', 'Lakehouse Ltd', 'Acme Corp', 'Embanks Devices']

    lines = ['delta', 'xyzzy', 'lakehouse', 'gadget', 'droid']

    device_population = 100000

    testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=1000000,
                                     partitions=8,
                                     randomSeedMethod='hash_fieldname')
                    .withIdOutput()
                    # we'll use hash of the base field to generate the ids to
                    # avoid a simple incrementing sequence
                    .withColumn("internal_device_id", LongType(), minValue=0x1000000000000,
                                uniqueValues=device_population, omit=True, baseColumnType="hash")

                    # note for format strings, we must use "%lx" not "%x" as the
                    # underlying value is a long
                    .withColumn("device_id", StringType(), format="0x%013x",
                                baseColumn="internal_device_id")

                    # the device / user attributes will be the same for the same device id
                    # so let's use the internal device id as the base column for these attributes
                    .withColumn("country", StringType(), values=country_codes,
                                weights=country_weights,
                                baseColumn="internal_device_id")
                    .withColumn("manufacturer", StringType(), values=manufacturers,
                                baseColumn="internal_device_id")

                    # use omit = True if you don't want a column to appear in the final output
                    # but just want to use it as part of generation of another column
                    .withColumn("line", StringType(), values=lines, baseColumn="manufacturer",
                                baseColumnType="hash")
                    .withColumn("model_ser", IntegerType(), minValue=1, maxValue=11,
                                baseColumn="device_id",
                                baseColumnType="hash", omit=True)

                    .withColumn("event_type", StringType(),
                                values=["activation", "deactivation", "plan change",
                                        "telecoms activity", "internet activity", "device error"],
                                random=True)
                    .withColumn("event_ts", "timestamp", begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00",
                                interval="1 minute", random=True)

                    )

    dfTestData = testDataSpec.build()

    dfTestData.write.format("json").mode("overwrite").save("/tmp/jsonData1")

In the most basic form, you can simply save the dataframe to storage in JSON format.

Use of nested structures in data generation specifications
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When we save a dataframe containing complex column types such as `map`, `struct` and `array`, these will be
converted to equivalent constructs in JSON.

So how do we go about creating these?

We can use a struct valued column to hold the nested structure data and write the results out as JSON.

Struct, array and map valued columns can be created by adding a column of the appropriate type and using the `expr`
attribute to assemble the complex column.

Note that in the current release, the `expr` attribute will override other column data generation rules.

.. code-block:: python

    from pyspark.sql.types import LongType, FloatType, IntegerType, StringType, DoubleType, BooleanType, ShortType, \
        TimestampType, DateType, DecimalType, ByteType, BinaryType, ArrayType, MapType, StructType, StructField

    import dbldatagen as dg


    country_codes = ['CN', 'US', 'FR', 'CA', 'IN', 'JM', 'IE', 'PK', 'GB', 'IL', 'AU', 'SG',
                     'ES', 'GE', 'MX', 'ET', 'SA', 'LB', 'NL']
    country_weights = [1300, 365, 67, 38, 1300, 3, 7, 212, 67, 9, 25, 6, 47, 83, 126, 109, 58, 8,
                       17]

    manufacturers = ['Delta corp', 'Xyzzy Inc.', 'Lakehouse Ltd', 'Acme Corp', 'Embanks Devices']

    lines = ['delta', 'xyzzy', 'lakehouse', 'gadget', 'droid']

    device_population = 100000

    testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=1000000,
                                     partitions=8,
                                     randomSeedMethod='hash_fieldname')
                    .withIdOutput()
                    # we'll use hash of the base field to generate the ids to
                    # avoid a simple incrementing sequence
                    .withColumn("internal_device_id", LongType(), minValue=0x1000000000000,
                                uniqueValues=device_population, omit=True, baseColumnType="hash")

                    # note for format strings, we must use "%lx" not "%x" as the
                    # underlying value is a long
                    .withColumn("device_id", StringType(), format="0x%013x",
                                baseColumn="internal_device_id")

                    # the device / user attributes will be the same for the same device id
                    # so let's use the internal device id as the base column for these attributes
                    .withColumn("country", StringType(), values=country_codes,
                                weights=country_weights,
                                baseColumn="internal_device_id")

                    .withColumn("manufacturer", StringType(), values=manufacturers,
                                baseColumn="internal_device_id", omit=True)
                    .withColumn("line", StringType(), values=lines, baseColumn="manufacturer",
                                baseColumnType="hash", omit=True)
                    .withColumn("manufacturer_info",
                                StructType([StructField('line', StringType()), StructField('manufacturer', StringType())]),
                                expr="named_struct('line', line, 'manufacturer', manufacturer)",
                                baseColumn=['manufacturer', 'line'])


                    .withColumn("model_ser", IntegerType(), minValue=1, maxValue=11,
                                baseColumn="device_id",
                                baseColumnType="hash", omit=True)

                    .withColumn("event_type", StringType(),
                                values=["activation", "deactivation", "plan change",
                                        "telecoms activity", "internet activity", "device error"],
                                random=True, omit=True)
                    .withColumn("event_ts", "timestamp", begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00",
                                interval="1 minute", random=True, omit=True)

                    .withColumn("event_info",
                                StructType([StructField('event_type', StringType()), StructField('event_ts', TimestampType())]),
                                expr="named_struct('event_type', event_type, 'event_ts', event_ts)",
                                baseColumn=['event_type', 'event_ts'])
                    )

    dfTestData = testDataSpec.build()
    dfTestData.write.format("json").mode("overwrite").save("/tmp/jsonData2")

Generating JSON valued fields
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

JSON valued fields can be generated as fields of `string` type and assembled using a combination of Spark SQL
functions such as `named_struct` and `to_json`.

.. code-block:: python

    from pyspark.sql.types import LongType, FloatType, IntegerType, StringType, DoubleType, BooleanType, ShortType, \
        TimestampType, DateType, DecimalType, ByteType, BinaryType, ArrayType, MapType, StructType, StructField

    import dbldatagen as dg


    country_codes = ['CN', 'US', 'FR', 'CA', 'IN', 'JM', 'IE', 'PK', 'GB', 'IL', 'AU', 'SG',
                     'ES', 'GE', 'MX', 'ET', 'SA', 'LB', 'NL']
    country_weights = [1300, 365, 67, 38, 1300, 3, 7, 212, 67, 9, 25, 6, 47, 83, 126, 109, 58, 8,
                       17]

    manufacturers = ['Delta corp', 'Xyzzy Inc.', 'Lakehouse Ltd', 'Acme Corp', 'Embanks Devices']

    lines = ['delta', 'xyzzy', 'lakehouse', 'gadget', 'droid']

    device_population = 100000

    testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=1000000,
                                     partitions=8,
                                     randomSeedMethod='hash_fieldname')
                    .withIdOutput()
                    # we'll use hash of the base field to generate the ids to
                    # avoid a simple incrementing sequence
                    .withColumn("internal_device_id", LongType(), minValue=0x1000000000000,
                                uniqueValues=device_population, omit=True, baseColumnType="hash")

                    # note for format strings, we must use "%lx" not "%x" as the
                    # underlying value is a long
                    .withColumn("device_id", StringType(), format="0x%013x",
                                baseColumn="internal_device_id")

                    # the device / user attributes will be the same for the same device id
                    # so let's use the internal device id as the base column for these attributes
                    .withColumn("country", StringType(), values=country_codes,
                                weights=country_weights,
                                baseColumn="internal_device_id")

                    .withColumn("manufacturer", StringType(), values=manufacturers,
                                baseColumn="internal_device_id", omit=True)
                    .withColumn("line", StringType(), values=lines, baseColumn="manufacturer",
                                baseColumnType="hash", omit=True)
                    .withColumn("manufacturer_info", "string",
                                expr="to_json(named_struct('line', line, 'manufacturer', manufacturer))",
                                baseColumn=['manufacturer', 'line'])


                    .withColumn("model_ser", IntegerType(), minValue=1, maxValue=11,
                                baseColumn="device_id",
                                baseColumnType="hash", omit=True)

                    .withColumn("event_type", StringType(),
                                values=["activation", "deactivation", "plan change",
                                        "telecoms activity", "internet activity", "device error"],
                                random=True, omit=True)
                    .withColumn("event_ts", "timestamp", begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00",
                                interval="1 minute", random=True, omit=True)

                    .withColumn("event_info", "string",
                                expr="to_json(named_struct('event_type', event_type, 'event_ts', event_ts))",
                                baseColumn=['event_type', 'event_ts'])
                    )

    dfTestData = testDataSpec.build()

    #dfTestData.write.format("json").mode("overwrite").save("/tmp/jsonData2")
    display(dfTestData)
\ No newline at end of file
diff --git a/docs/source/index.rst b/docs/source/index.rst
index f26f23f8..4eecc0ac 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -32,6 +32,7 @@ As it is installable via `%pip install`, it can also be incorporated in environm
    Options for column specification
   Generating repeatable data
   Using streaming data
+   Generating JSON and structured column data
   Generating Change Data Capture (CDC) data
   Using multiple tables
   Extending text generation

From bd80537559e0696ac0929d276ca5c9fabe45fe3b Mon Sep 17 00:00:00 2001
From: ronanstokes-db
Date: Sun, 5 Feb 2023 17:45:12 -0800
Subject: [PATCH 2/4] doc changes

---
 docs/source/options_and_features.rst | 63 +++++++++++++++---
 docs/source/troubleshooting.rst      | 98 +++++++++++++++++++++++++---
 2 files changed, 145 insertions(+), 16 deletions(-)

diff --git a/docs/source/options_and_features.rst b/docs/source/options_and_features.rst
index d104336d..49ffefc1 100644
--- a/docs/source/options_and_features.rst
+++ b/docs/source/options_and_features.rst
@@ -12,27 +12,60 @@ Options for column specification

The following table lists some of the common options that can be applied with the ``withColumn`` and
``withColumnSpec`` methods.

.. table:: Column creation options

================ ==============================
Parameter        Usage
================ ==============================
minValue         Minimum value for range of generated value. As alternative use ``dataRange``.

maxValue         Maximum value for range of generated value. As alternative use ``dataRange``.

step             Step to use for range of generated value.

                 As an alternative, you may use the `dataRange` parameter.

random           If `True`, will generate random values for column value. Defaults to `False`.

randomSeedMethod Determines how the seed will be used.

                 If set to the value 'fixed', will use a fixed random seed.

                 If set to 'hash_fieldname', it will use a hash of the field name as the random seed
                 for a specific column.

baseColumn       Either the string name of the base column, or a list of columns to use to control data generation.

values           List of discrete values for the column.

                 Discrete values can be numeric, dates, timestamps, strings, etc.

weights          List of discrete weights for the column. Controls the spread of values.

percentNulls     Percentage of nulls to generate for column.

                 Fraction representing percentage between 0.0 and 1.0.

uniqueValues     Number of distinct unique values for the column. Use as alternative to data range.

begin            Beginning of range for date and timestamp fields.

end              End of range for date and timestamp fields.

interval         Interval of range for date and timestamp fields.

dataRange        An instance of an `NRange` or `DateRange` object.

                 This can be used in place of ``minValue``, etc.

template         Template controlling text generation.

omit             If True, omit column from final output.

                 Use when column is only needed to compute other columns.

expr             SQL expression to control data generation.

================ ==============================

@@ -44,12 +77,26 @@ expr SQL expression to control data generation

For more information, see :data:`~dbldatagen.daterange.DateRange` or :data:`~dbldatagen.daterange.NRange`.

Using custom SQL to control data generation
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The `expr` attribute can be used to specify an arbitrary Spark SQL expression to control how the data is
generated for a column. If the body of the SQL references other columns, you will need to ensure that
those columns are created first.

By default, the columns are created in the order specified.

However, you can control the order of column creation using the `baseColumn` attribute.

More Details
^^^^^^^^^^^^

The full set of options for column specification which may be used with the ``withColumn``, ``withColumnSpec``
and ``withColumnSpecs`` methods can be found at:

* :data:`~dbldatagen.column_spec_options.ColumnSpecOptions`


Generating views automatically
------------------------------

diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst
index 66cf4497..4f18eff6 100644
--- a/docs/source/troubleshooting.rst
+++ b/docs/source/troubleshooting.rst
@@ -6,6 +6,11 @@ Troubleshooting
===============

Tools and aids to troubleshooting
---------------------------------

Use of the data generator `explain` method
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
To aid in debugging data generation issues, you may use the `explain` method of the data generator class
to produce a synopsis of how the data will be generated.

@@ -31,18 +36,95 @@ For example:

    import dbldatagen as dg
    import pyspark.sql.functions as F

    data_rows = 10 * 1000 * 1000

    uniqueCustomers = 10 * 1000000

    dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=4, verbose=True)
                .withColumn("customer_id", "long", uniqueValues=uniqueCustomers)
                .withColumn("city", "string", template=r"\w")
                .withColumn("name", "string", template=r"\w \w|\w \w \w")
                .withColumn("email", "string", template=r"\w@\w.com|\w@\w.org|\w.\w@\w.com")
                )
    df = dataspec.build()

    display(df)

See:

* :data:`~dbldatagen.data_generator.DataGenerator`

Operational message logging
^^^^^^^^^^^^^^^^^^^^^^^^^^^
By default, the data generation process produces error and warning messages via the
Python `logging` module.

In addition, the operation of the data generation process will record messages related to how
the data is being or was generated to an internal explain log during the execution of the `build` method.

In essence, the `explain` method displays the contents of the explain log from the last `build` invocation.
If `build` has not yet been run, it will display the explain logging messages from the build planning process.

Build planning performs pre-build tasks such as computing the order in which columns need to be generated.
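
As a minimal sketch of how this is typically used (assuming the ``dataspec`` object defined in the example above),
the explain log can be reviewed after building the data:

.. code-block:: python

    # sketch: build the dataset, then print a synopsis of how each column was generated
    df = dataspec.build()

    dataspec.explain()
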
Examining log outputs
^^^^^^^^^^^^^^^^^^^^^
Logging outputs will be displayed automatically when using the data generator in a Databricks notebook environment.

Common issues and resolution
----------------------------

Attempting to add a column named `id`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
By default, the data generator reserves the column named `id` to act as the seed column for other columns in the
data generation spec. However, you may need to use the name `id` for a specific column definition in the generated
data that behaves differently from the default seed column.

In this case, you may customize the name of the seed column to an alternative name via the `seedColumnName` parameter
when constructing the `DataGenerator` instance.

The following code shows its use:

.. code-block:: python

    import dbldatagen as dg
    import pyspark.sql.functions as F

    data_rows = 10 * 1000 * 1000

    uniqueCustomers = 10 * 1000000

    dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=4, seedColumnName='_id')
                .withColumn("id", "long", uniqueValues=uniqueCustomers)
                .withColumn("city", "string", template=r"\w")
                .withColumn("name", "string", template=r"\w \w|\w \w \w")
                .withColumn("email", "string", template=r"\w@\w.com|\w@\w.org|\w.\w@\w.com")
                )
    df = dataspec.build()

    display(df)

Attempting to compute column before dependent columns are computed
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
By default, the value for a column is computed based on some transformation of the seed column (named `id` by default).
You can use other columns as the seed column for a given column via the `baseColumn` attribute - which takes either
the name of a column as a string or a Python list of column names, if the column is dependent on multiple columns.

Use of the `expr` attribute (which allows for the use of arbitrary SQL expressions) can also create dependencies on
other columns.

If a column depends on other columns through referencing them in the body of the expression specified in the `expr`
attribute, it is necessary to ensure that the columns on which the expression depends are computed first.


.. sidebar:: Column build ordering

   By default, columns will be built in
   the order they are specified unless there are
   forward references.


Use the `baseColumn` attribute to ensure that dependent columns are computed first. The `baseColumn` attribute
may specify either a string that names the column on which the current column depends or a list of column names
specified as a list of strings.

- * :data:`~dbldatagen.data_generator.DataGenerator`
\ No newline at end of file

From eb35a4439ee60d73af44d0d8f081af1fb735cdb8 Mon Sep 17 00:00:00 2001
From: ronanstokes-db
Date: Mon, 6 Feb 2023 01:42:12 -0800
Subject: [PATCH 3/4] updates to docs

---
 docs/source/troubleshooting.rst | 105 +++++++++++++++++++++++++++++---
 1 file changed, 97 insertions(+), 8 deletions(-)

diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst
index 4f18eff6..386d35ab 100644
--- a/docs/source/troubleshooting.rst
+++ b/docs/source/troubleshooting.rst
@@ -55,6 +55,13 @@ See:
Operational message logging
^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. sidebar:: Logging

   Warning, error and info messages are available via standard logging capabilities.


By default, the data generation process produces error and warning messages via the
Python `logging` module.
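
As a rough sketch (this uses only the standard Python `logging` module, not a library-specific API), you can raise
the logging level in a notebook or script so that informational messages from the data generation process are also
visible:

.. code-block:: python

    import logging

    # sketch: configure the root logger so that info-level messages are shown
    # in addition to warnings and errors
    logging.basicConfig(level=logging.INFO)
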
@@ -64,7 +71,13 @@ the data is being or was generated to an internal explain log during the executi So essentially the `explain` method displays the contents of the explain log from the last `build` invocation. If `build` has not yet been run, it will display the explain logging messages from the build planning process. -Building planning performs pre-build tasks such as computing the order in which columns need to be generated. +Regular logging messages are generated using the standard logger. + +You can display additional logging messages by specifying the `verbose` option during creation of the `DataGenerator` +instance. + +.. note:: Building planning performs pre-build tasks such as computing the order in which columns need to be generated. + Build planning messages are available via the `explain` method Examining log outputs ^^^^^^^^^^^^^^^^^^^^^ @@ -75,6 +88,14 @@ Common issues and resolution Attempting to add a column named `id` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + +.. sidebar:: Customizing seed column + + Use the `seedColumnName` attribute when creating `DataGenerator` instance to + customize the seed column name. + + By default, the data generator reserves the column named `id` to act as the seed column for other columns in the data generation spec. However you may need to use the name `id` may be used for a specific column definition in the generated data which differs from the default seed column in operation. @@ -109,22 +130,90 @@ By default, the value for a column is computed based on some transformation of t You can use other columns as the seed column for a given column via the `baseColumn` attribute - which takes either the name of column as a string or a Python list of column names, if the column is dependent on multiple columns. + +.. sidebar:: Column build ordering + + Column build order is optimized for best performance during data generation. + To ensure columns are computed in correct order, use the `baseColumn` attribute. + + Use of the `expr` attribute (which allows for the use of arbitrary SQL expressions) can also create dependencies on other columns. If a column depends on other columns through referencing them in the body of the expression specified in the `expr` attribute, it is necessary to ensure that the columns on which the expression depends are computed first. +Use the `baseColumn` attribute to ensure that dependent columns are computed first. The `baseColumn` attribute +may specify either a string that names the column on which the current column depends or a list of column names +specified as a list of strings. +For example, the following code has dependencies in some of the `expr` SQL expressions on earlier columns. +In these cases, we use the `baseColumn` attribute to ensure the correct column build order. -.. sidebar:: Column build ordering +.. 
code-block:: python - By default, columns will be built in - the order they are specified unless there are - forward references + import dbldatagen as dg + + + country_codes = ['CN', 'US', 'FR', 'CA', 'IN', 'JM', 'IE', 'PK', 'GB', 'IL', 'AU', 'SG', + 'ES', 'GE', 'MX', 'ET', 'SA', 'LB', 'NL'] + country_weights = [1300, 365, 67, 38, 1300, 3, 7, 212, 67, 9, 25, 6, 47, 83, 126, 109, 58, 8, + 17] + + device_population = 100000 + + manufacturers = ['Delta corp', 'Xyzzy Inc.', 'Lakehouse Ltd', 'Acme Corp', 'Embanks Devices'] + + lines = ['delta', 'xyzzy', 'lakehouse', 'gadget', 'droid'] + + testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=1000000, + partitions=8, + randomSeedMethod='hash_fieldname') + # we'll use hash of the base field to generate the ids to + # avoid a simple incrementing sequence + .withColumn("internal_device_id", "long", minValue=0x1000000000000, + uniqueValues=device_population, omit=True, baseColumnType="hash") + + # note for format strings, we must use "%lx" not "%x" as the + # underlying value is a long + .withColumn("device_id", "string", format="0x%013x", + baseColumn="internal_device_id") + + # the device / user attributes will be the same for the same device id + # so lets use the internal device id as the base column for these attribute + .withColumn("country", "string", values=country_codes, + weights=country_weights, + baseColumn="internal_device_id") + + .withColumn("manufacturer", "string", values=manufacturers, + baseColumn="internal_device_id", omit=True) + + .withColumn("line", StringType(), values=lines, baseColumn="manufacturer", + baseColumnType="hash", omit=True) + + # note use of baseColumn to control column build ordering + .withColumn("manufacturer_info", "string", + expr="to_json(named_struct('line', line, 'manufacturer', manufacturer))", + baseColumn=["line", "manufacturer"] + ) + + .withColumn("event_type", "string", + values=["activation", "deactivation", "plan change", + "telecoms activity", "internet activity", "device error"], + random=True, omit=True) + + .withColumn("event_ts", "timestamp", begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00", + interval="1 minute", random=True, omit=True) + + # note use of baseColumn to control column build ordering + .withColumn("event_info", "string", + expr="to_json(named_struct('event_type', event_type, 'event_ts', event_ts))", + baseColumn=["event_type", "event_ts"]) + ) + + dfTestData = testDataSpec.build() + + display(dfTestData) -Use the `baseColumn` attribute to ensure that dependent columns are computed first. The `baseColumn` attribute -may specify either a string that names the column on which the current column depends or a list of column names -specified as a list of strings. 
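
As an optional follow-up check (a sketch using standard Spark SQL functions, not part of the example above), the
JSON-valued string columns can be parsed back into structs to confirm that they round-trip as expected:

.. code-block:: python

    from pyspark.sql import functions as F

    # sketch: parse the JSON string columns produced above back into structs for inspection
    dfParsed = (dfTestData
                .withColumn("manufacturer_struct",
                            F.from_json("manufacturer_info", "line string, manufacturer string"))
                .withColumn("event_struct",
                            F.from_json("event_info", "event_type string, event_ts timestamp")))

    dfParsed.select("manufacturer_struct", "event_struct").show(3, truncate=False)
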
From ea63d718eab68acd136d6c50da7507262576485c Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Wed, 8 Feb 2023 14:02:16 -0800 Subject: [PATCH 4/4] fixed typo in docs --- docs/source/generating_json_data.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/generating_json_data.rst b/docs/source/generating_json_data.rst index 15fe5445..a49cd0bb 100644 --- a/docs/source/generating_json_data.rst +++ b/docs/source/generating_json_data.rst @@ -11,7 +11,7 @@ we mean columns that are some combination of `struct`, `array` and `map` of othe Generating JSON data -------------------- -There are several method for generating JSON data: +There are several methods for generating JSON data: - Generate a dataframe and save it as JSON will generate full data set as JSON - Generate JSON valued fields using SQL functions such as `named_struct` and `to_json`