From c340e01e6e766e45b1b65cdd648ca236871dc299 Mon Sep 17 00:00:00 2001
From: ronanstokes-db
Date: Sat, 4 Feb 2023 21:11:42 -0800
Subject: [PATCH 1/4] new documentation on generating JSON data

---
 docs/source/generating_json_data.rst | 236 +++++++++++++++++++++++++++
 docs/source/index.rst                |   1 +
 2 files changed, 237 insertions(+)
 create mode 100644 docs/source/generating_json_data.rst

diff --git a/docs/source/generating_json_data.rst b/docs/source/generating_json_data.rst
new file mode 100644
index 00000000..15fe5445
--- /dev/null
+++ b/docs/source/generating_json_data.rst
@@ -0,0 +1,236 @@
.. Test Data Generator documentation master file, created by
   sphinx-quickstart on Sun Jun 21 10:54:30 2020.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Generating JSON and structured column data
==========================================

This section explores generating JSON and structured column data. By structured columns,
we mean columns that are some combination of `struct`, `array` and `map` of other types.

Generating JSON data
--------------------
There are several method for generating JSON data:

- Generating a dataframe and saving it in JSON format will write out the full data set as JSON
- Generate JSON valued fields using SQL functions such as `named_struct` and `to_json`

Writing dataframe as JSON data
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The following example illustrates the basic technique for generating JSON data from a dataframe.

.. code-block:: python

    from pyspark.sql.types import LongType, IntegerType, StringType

    import dbldatagen as dg


    country_codes = ['CN', 'US', 'FR', 'CA', 'IN', 'JM', 'IE', 'PK', 'GB', 'IL', 'AU', 'SG',
                     'ES', 'GE', 'MX', 'ET', 'SA', 'LB', 'NL']
    country_weights = [1300, 365, 67, 38, 1300, 3, 7, 212, 67, 9, 25, 6, 47, 83, 126, 109, 58, 8,
                       17]

    manufacturers = ['Delta corp', 'Xyzzy Inc.', 'Lakehouse Ltd', 'Acme Corp', 'Embanks Devices']

    lines = ['delta', 'xyzzy', 'lakehouse', 'gadget', 'droid']

    device_population = 100000

    testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=1000000,
                                     partitions=8,
                                     randomSeedMethod='hash_fieldname')
                    .withIdOutput()
                    # we'll use hash of the base field to generate the ids to
                    # avoid a simple incrementing sequence
                    .withColumn("internal_device_id", LongType(), minValue=0x1000000000000,
                                uniqueValues=device_population, omit=True, baseColumnType="hash")

                    # note for format strings, we must use "%lx" not "%x" as the
                    # underlying value is a long
                    .withColumn("device_id", StringType(), format="0x%013x",
                                baseColumn="internal_device_id")

                    # the device / user attributes will be the same for the same device id
                    # so let's use the internal device id as the base column for these attributes
                    .withColumn("country", StringType(), values=country_codes,
                                weights=country_weights,
                                baseColumn="internal_device_id")
                    .withColumn("manufacturer", StringType(), values=manufacturers,
                                baseColumn="internal_device_id")

                    # use omit = True if you don't want a column to appear in the final output
                    # but just want to use it as part of generation of another column
                    .withColumn("line", StringType(), values=lines, baseColumn="manufacturer",
                                baseColumnType="hash")
                    .withColumn("model_ser", IntegerType(), minValue=1, maxValue=11,
                                baseColumn="device_id",
                                baseColumnType="hash", omit=True)

                    .withColumn("event_type", StringType(),
                                values=["activation", "deactivation", "plan change",
                                        "telecoms activity", "internet activity", "device error"],
                                random=True)
                    .withColumn("event_ts", "timestamp", begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00",
                                interval="1 minute", random=True)

                    )

    dfTestData = testDataSpec.build()

    dfTestData.write.format("json").mode("overwrite").save("/tmp/jsonData1")

In the most basic form, you can simply save the dataframe to storage in JSON format.

Use of nested structures in data generation specifications
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

When we save a dataframe containing complex column types such as `map`, `struct` and `array`, these will be
converted to equivalent constructs in JSON.

So how do we go about creating these?

We can use a struct valued column to hold the nested structure data and write the results out as JSON.

Struct, array and map valued columns can be created by adding a column of the appropriate type and using the `expr`
attribute to assemble the complex column.

Note that in the current release, the `expr` attribute will override other column data generation rules.

.. code-block:: python

    from pyspark.sql.types import LongType, FloatType, IntegerType, StringType, DoubleType, BooleanType, ShortType, \
        TimestampType, DateType, DecimalType, ByteType, BinaryType, ArrayType, MapType, StructType, StructField

    import dbldatagen as dg


    country_codes = ['CN', 'US', 'FR', 'CA', 'IN', 'JM', 'IE', 'PK', 'GB', 'IL', 'AU', 'SG',
                     'ES', 'GE', 'MX', 'ET', 'SA', 'LB', 'NL']
    country_weights = [1300, 365, 67, 38, 1300, 3, 7, 212, 67, 9, 25, 6, 47, 83, 126, 109, 58, 8,
                       17]

    manufacturers = ['Delta corp', 'Xyzzy Inc.', 'Lakehouse Ltd', 'Acme Corp', 'Embanks Devices']

    lines = ['delta', 'xyzzy', 'lakehouse', 'gadget', 'droid']

    device_population = 100000

    testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=1000000,
                                     partitions=8,
                                     randomSeedMethod='hash_fieldname')
                    .withIdOutput()
                    # we'll use hash of the base field to generate the ids to
                    # avoid a simple incrementing sequence
                    .withColumn("internal_device_id", LongType(), minValue=0x1000000000000,
                                uniqueValues=device_population, omit=True, baseColumnType="hash")

                    # note for format strings, we must use "%lx" not "%x" as the
                    # underlying value is a long
                    .withColumn("device_id", StringType(), format="0x%013x",
                                baseColumn="internal_device_id")

                    # the device / user attributes will be the same for the same device id
                    # so let's use the internal device id as the base column for these attributes
                    .withColumn("country", StringType(), values=country_codes,
                                weights=country_weights,
                                baseColumn="internal_device_id")

                    .withColumn("manufacturer", StringType(), values=manufacturers,
                                baseColumn="internal_device_id", omit=True)
                    .withColumn("line", StringType(), values=lines, baseColumn="manufacturer",
                                baseColumnType="hash", omit=True)
                    .withColumn("manufacturer_info",
                                StructType([StructField('line', StringType()), StructField('manufacturer', StringType())]),
                                expr="named_struct('line', line, 'manufacturer', manufacturer)",
                                baseColumn=['manufacturer', 'line'])


                    .withColumn("model_ser", IntegerType(), minValue=1, maxValue=11,
                                baseColumn="device_id",
                                baseColumnType="hash", omit=True)

                    .withColumn("event_type", StringType(),
                                values=["activation", "deactivation", "plan change",
                                        "telecoms activity", "internet activity", "device error"],
                                random=True, omit=True)
                    .withColumn("event_ts", "timestamp", begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00",
                                interval="1 minute", random=True, omit=True)

                    .withColumn("event_info",
                                StructType([StructField('event_type', StringType()), StructField('event_ts', TimestampType())]),
                                expr="named_struct('event_type', event_type, 'event_ts', event_ts)",
                                baseColumn=['event_type', 'event_ts'])
                    )

    dfTestData = testDataSpec.build()
    dfTestData.write.format("json").mode("overwrite").save("/tmp/jsonData2")

Generating JSON valued fields
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

JSON valued fields can be generated as fields of `string` type and assembled using a combination of Spark SQL
functions such as `named_struct` and `to_json`.

.. code-block:: python

    from pyspark.sql.types import LongType, FloatType, IntegerType, StringType, DoubleType, BooleanType, ShortType, \
        TimestampType, DateType, DecimalType, ByteType, BinaryType, ArrayType, MapType, StructType, StructField

    import dbldatagen as dg


    country_codes = ['CN', 'US', 'FR', 'CA', 'IN', 'JM', 'IE', 'PK', 'GB', 'IL', 'AU', 'SG',
                     'ES', 'GE', 'MX', 'ET', 'SA', 'LB', 'NL']
    country_weights = [1300, 365, 67, 38, 1300, 3, 7, 212, 67, 9, 25, 6, 47, 83, 126, 109, 58, 8,
                       17]

    manufacturers = ['Delta corp', 'Xyzzy Inc.', 'Lakehouse Ltd', 'Acme Corp', 'Embanks Devices']

    lines = ['delta', 'xyzzy', 'lakehouse', 'gadget', 'droid']

    device_population = 100000

    testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=1000000,
                                     partitions=8,
                                     randomSeedMethod='hash_fieldname')
                    .withIdOutput()
                    # we'll use hash of the base field to generate the ids to
                    # avoid a simple incrementing sequence
                    .withColumn("internal_device_id", LongType(), minValue=0x1000000000000,
                                uniqueValues=device_population, omit=True, baseColumnType="hash")

                    # note for format strings, we must use "%lx" not "%x" as the
                    # underlying value is a long
                    .withColumn("device_id", StringType(), format="0x%013x",
                                baseColumn="internal_device_id")

                    # the device / user attributes will be the same for the same device id
                    # so let's use the internal device id as the base column for these attributes
                    .withColumn("country", StringType(), values=country_codes,
                                weights=country_weights,
                                baseColumn="internal_device_id")

                    .withColumn("manufacturer", StringType(), values=manufacturers,
                                baseColumn="internal_device_id", omit=True)
                    .withColumn("line", StringType(), values=lines, baseColumn="manufacturer",
                                baseColumnType="hash", omit=True)
                    .withColumn("manufacturer_info", "string",
                                expr="to_json(named_struct('line', line, 'manufacturer', manufacturer))",
                                baseColumn=['manufacturer', 'line'])


                    .withColumn("model_ser", IntegerType(), minValue=1, maxValue=11,
                                baseColumn="device_id",
                                baseColumnType="hash", omit=True)

                    .withColumn("event_type", StringType(),
                                values=["activation", "deactivation", "plan change",
                                        "telecoms activity", "internet activity", "device error"],
                                random=True, omit=True)
                    .withColumn("event_ts", "timestamp", begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00",
                                interval="1 minute", random=True, omit=True)

                    .withColumn("event_info", "string",
                                expr="to_json(named_struct('event_type', event_type, 'event_ts', event_ts))",
                                baseColumn=['event_type', 'event_ts'])
                    )

    dfTestData = testDataSpec.build()

    #dfTestData.write.format("json").mode("overwrite").save("/tmp/jsonData2")
    display(dfTestData)
\ No newline at end of file
diff --git a/docs/source/index.rst b/docs/source/index.rst
index f26f23f8..4eecc0ac 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -32,6 +32,7 @@ As it is installable via `%pip install`, it can also be incorporated in environm
    Options for column specification
   Generating repeatable data
   Using streaming data
+   Generating JSON and structured column data
   Generating Change Data Capture (CDC) data
   Using multiple tables
   Extending text generation

From bd80537559e0696ac0929d276ca5c9fabe45fe3b Mon Sep 17 00:00:00 2001
From: ronanstokes-db
Date: Sun, 5 Feb 2023 17:45:12 -0800
Subject: [PATCH 2/4] doc changes

---
 docs/source/options_and_features.rst | 63 +++++++++++++++---
 docs/source/troubleshooting.rst      | 98 +++++++++++++++++++++++++---
 2 files changed, 145 insertions(+), 16 deletions(-)

diff --git a/docs/source/options_and_features.rst b/docs/source/options_and_features.rst
index d104336d..49ffefc1 100644
--- a/docs/source/options_and_features.rst
+++ b/docs/source/options_and_features.rst
@@ -12,27 +12,60 @@ Options for column specification

The following table lists some of the common options that can be applied with the ``withColumn`` and
``withColumnSpec`` methods.

.. table:: Column creation options

================ ==============================
Parameter        Usage
================ ==============================
minValue         Minimum value for range of generated value. As alternative use ``dataRange``.

maxValue         Maximum value for range of generated value. As alternative use ``dataRange``.

step             Step to use for range of generated value.

                 As an alternative, you may use the `dataRange` parameter.

random           If `True`, will generate random values for column value. Defaults to `False`.

randomSeedMethod Determines how the seed will be used.

                 If set to the value 'fixed', will use a fixed random seed.

                 If set to 'hash_fieldname', it will use a hash of the field name as the random seed
                 for a specific column.

baseColumn       Either the string name of the base column, or a list of columns to use to control data generation.

values           List of discrete values for the column.

                 Discrete values can be numeric, dates, timestamps, strings, etc.

weights          List of discrete weights for the column. Controls the spread of values.

percentNulls     Percentage of nulls to generate for column.

                 Fraction representing percentage between 0.0 and 1.0.

uniqueValues     Number of distinct unique values for the column. Use as alternative to data range.

begin            Beginning of range for date and timestamp fields.

end              End of range for date and timestamp fields.

interval         Interval of range for date and timestamp fields.

dataRange        An instance of an `NRange` or `DateRange` object.

                 This can be used in place of ``minValue``, etc.

template         Template controlling text generation.

omit             If True, omit column from final output.

                 Use when column is only needed to compute other columns.

expr             SQL expression to control data generation.

================ ==============================

@@ -44,12 +77,26 @@ expr SQL expression to control data generation

For more information, see :data:`~dbldatagen.daterange.DateRange` or :data:`~dbldatagen.daterange.NRange`.

Using custom SQL to control data generation
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The `expr` attribute can be used to specify an arbitrary Spark SQL expression to control how the data is
generated for a column. If the body of the SQL references other columns, you will need to ensure that
those columns are created first.

By default, the columns are created in the order specified.

However, you can control the order of column creation using the `baseColumn` attribute.

More Details
^^^^^^^^^^^^

The full set of options for column specification which may be used with the ``withColumn``, ``withColumnSpec``
and ``withColumnSpecs`` methods can be found at:

* :data:`~dbldatagen.column_spec_options.ColumnSpecOptions`


Generating views automatically
------------------------------

diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst
index 66cf4497..4f18eff6 100644
--- a/docs/source/troubleshooting.rst
+++ b/docs/source/troubleshooting.rst
@@ -6,6 +6,11 @@ Troubleshooting
===============

Tools and aids to troubleshooting
---------------------------------

Use of the data generator `explain` method
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
To aid in debugging data generation issues, you may use the `explain` method of the data generator class
to produce a synopsis of how the data will be generated.

@@ -31,18 +36,95 @@ For example:

    import dbldatagen as dg
    import pyspark.sql.functions as F

    data_rows = 10 * 1000 * 1000

    uniqueCustomers = 10 * 1000000

    dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=4, verbose=True)
                .withColumn("customer_id", "long", uniqueValues=uniqueCustomers)
                .withColumn("city", "string", template=r"\w")
                .withColumn("name", "string", template=r"\w \w|\w \w \w")
                .withColumn("email", "string", template=r"\w@\w.com|\w@\w.org|\w.\w@\w.com")
                )
    df = dataspec.build()

    display(df)

See:

* :data:`~dbldatagen.data_generator.DataGenerator`

Operational message logging
^^^^^^^^^^^^^^^^^^^^^^^^^^^
By default, the data generation process produces error and warning messages via the
Python `logging` module.

In addition, the operation of the data generation process will record messages related to how
the data is being or was generated to an internal explain log during the execution of the `build` method.

In essence, the `explain` method displays the contents of the explain log from the last `build` invocation.
If `build` has not yet been run, it will display the explain logging messages from the build planning process.

Build planning performs pre-build tasks such as computing the order in which columns need to be generated.
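
As a minimal sketch of how this is typically used (assuming the ``dataspec`` object defined in the example above),
the explain log can be reviewed after building the data:

.. code-block:: python

    # sketch: build the dataset, then print a synopsis of how each column was generated
    df = dataspec.build()

    dataspec.explain()
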
Examining log outputs
^^^^^^^^^^^^^^^^^^^^^
Logging outputs will be displayed automatically when using the data generator in a Databricks notebook environment.

Common issues and resolution
----------------------------

Attempting to add a column named `id`
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
By default, the data generator reserves the column named `id` to act as the seed column for other columns in the
data generation spec. However, you may need to use the name `id` for a specific column definition in the generated
data that behaves differently from the default seed column.

In this case, you may customize the name of the seed column to an alternative name via the `seedColumnName` parameter
when constructing the `DataGenerator` instance.

The following code shows its use:

.. code-block:: python

    import dbldatagen as dg
    import pyspark.sql.functions as F

    data_rows = 10 * 1000 * 1000

    uniqueCustomers = 10 * 1000000

    dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=4, seedColumnName='_id')
                .withColumn("id", "long", uniqueValues=uniqueCustomers)
                .withColumn("city", "string", template=r"\w")
                .withColumn("name", "string", template=r"\w \w|\w \w \w")
                .withColumn("email", "string", template=r"\w@\w.com|\w@\w.org|\w.\w@\w.com")
                )
    df = dataspec.build()

    display(df)

Attempting to compute column before dependent columns are computed
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
By default, the value for a column is computed based on some transformation of the seed column (named `id` by default).
You can use other columns as the seed column for a given column via the `baseColumn` attribute - which takes either
the name of a column as a string or a Python list of column names, if the column is dependent on multiple columns.

Use of the `expr` attribute (which allows for the use of arbitrary SQL expressions) can also create dependencies on
other columns.

If a column depends on other columns through referencing them in the body of the expression specified in the `expr`
attribute, it is necessary to ensure that the columns on which the expression depends are computed first.


.. sidebar:: Column build ordering

   By default, columns will be built in
   the order they are specified unless there are
   forward references.


Use the `baseColumn` attribute to ensure that dependent columns are computed first. The `baseColumn` attribute
may specify either a string that names the column on which the current column depends or a list of column names
specified as a list of strings.

- * :data:`~dbldatagen.data_generator.DataGenerator`
\ No newline at end of file

From eb35a4439ee60d73af44d0d8f081af1fb735cdb8 Mon Sep 17 00:00:00 2001
From: ronanstokes-db
Date: Mon, 6 Feb 2023 01:42:12 -0800
Subject: [PATCH 3/4] updates to docs

---
 docs/source/troubleshooting.rst | 105 +++++++++++++++++++++++++++++---
 1 file changed, 97 insertions(+), 8 deletions(-)

diff --git a/docs/source/troubleshooting.rst b/docs/source/troubleshooting.rst
index 4f18eff6..386d35ab 100644
--- a/docs/source/troubleshooting.rst
+++ b/docs/source/troubleshooting.rst
@@ -55,6 +55,13 @@ See:
Operational message logging
^^^^^^^^^^^^^^^^^^^^^^^^^^^

.. sidebar:: Logging

   Warning, error and info messages are available via standard logging capabilities.


By default, the data generation process produces error and warning messages via the
Python `logging` module.
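
As a rough sketch (this uses only the standard Python `logging` module, not a library-specific API), you can raise
the logging level in a notebook or script so that informational messages from the data generation process are also
visible:

.. code-block:: python

    import logging

    # sketch: configure the root logger so that info-level messages are shown
    # in addition to warnings and errors
    logging.basicConfig(level=logging.INFO)
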
@@ -64,7 +71,13 @@ the data is being or was generated to an internal explain log during the executi So essentially the `explain` method displays the contents of the explain log from the last `build` invocation. If `build` has not yet been run, it will display the explain logging messages from the build planning process. -Building planning performs pre-build tasks such as computing the order in which columns need to be generated. +Regular logging messages are generated using the standard logger. + +You can display additional logging messages by specifying the `verbose` option during creation of the `DataGenerator` +instance. + +.. note:: Building planning performs pre-build tasks such as computing the order in which columns need to be generated. + Build planning messages are available via the `explain` method Examining log outputs ^^^^^^^^^^^^^^^^^^^^^ @@ -75,6 +88,14 @@ Common issues and resolution Attempting to add a column named `id` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + +.. sidebar:: Customizing seed column + + Use the `seedColumnName` attribute when creating `DataGenerator` instance to + customize the seed column name. + + By default, the data generator reserves the column named `id` to act as the seed column for other columns in the data generation spec. However you may need to use the name `id` may be used for a specific column definition in the generated data which differs from the default seed column in operation. @@ -109,22 +130,90 @@ By default, the value for a column is computed based on some transformation of t You can use other columns as the seed column for a given column via the `baseColumn` attribute - which takes either the name of column as a string or a Python list of column names, if the column is dependent on multiple columns. + +.. sidebar:: Column build ordering + + Column build order is optimized for best performance during data generation. + To ensure columns are computed in correct order, use the `baseColumn` attribute. + + Use of the `expr` attribute (which allows for the use of arbitrary SQL expressions) can also create dependencies on other columns. If a column depends on other columns through referencing them in the body of the expression specified in the `expr` attribute, it is necessary to ensure that the columns on which the expression depends are computed first. +Use the `baseColumn` attribute to ensure that dependent columns are computed first. The `baseColumn` attribute +may specify either a string that names the column on which the current column depends or a list of column names +specified as a list of strings. +For example, the following code has dependencies in some of the `expr` SQL expressions on earlier columns. +In these cases, we use the `baseColumn` attribute to ensure the correct column build order. -.. sidebar:: Column build ordering +.. 
code-block:: python - By default, columns will be built in - the order they are specified unless there are - forward references + import dbldatagen as dg + + + country_codes = ['CN', 'US', 'FR', 'CA', 'IN', 'JM', 'IE', 'PK', 'GB', 'IL', 'AU', 'SG', + 'ES', 'GE', 'MX', 'ET', 'SA', 'LB', 'NL'] + country_weights = [1300, 365, 67, 38, 1300, 3, 7, 212, 67, 9, 25, 6, 47, 83, 126, 109, 58, 8, + 17] + + device_population = 100000 + + manufacturers = ['Delta corp', 'Xyzzy Inc.', 'Lakehouse Ltd', 'Acme Corp', 'Embanks Devices'] + + lines = ['delta', 'xyzzy', 'lakehouse', 'gadget', 'droid'] + + testDataSpec = (dg.DataGenerator(spark, name="device_data_set", rows=1000000, + partitions=8, + randomSeedMethod='hash_fieldname') + # we'll use hash of the base field to generate the ids to + # avoid a simple incrementing sequence + .withColumn("internal_device_id", "long", minValue=0x1000000000000, + uniqueValues=device_population, omit=True, baseColumnType="hash") + + # note for format strings, we must use "%lx" not "%x" as the + # underlying value is a long + .withColumn("device_id", "string", format="0x%013x", + baseColumn="internal_device_id") + + # the device / user attributes will be the same for the same device id + # so lets use the internal device id as the base column for these attribute + .withColumn("country", "string", values=country_codes, + weights=country_weights, + baseColumn="internal_device_id") + + .withColumn("manufacturer", "string", values=manufacturers, + baseColumn="internal_device_id", omit=True) + + .withColumn("line", StringType(), values=lines, baseColumn="manufacturer", + baseColumnType="hash", omit=True) + + # note use of baseColumn to control column build ordering + .withColumn("manufacturer_info", "string", + expr="to_json(named_struct('line', line, 'manufacturer', manufacturer))", + baseColumn=["line", "manufacturer"] + ) + + .withColumn("event_type", "string", + values=["activation", "deactivation", "plan change", + "telecoms activity", "internet activity", "device error"], + random=True, omit=True) + + .withColumn("event_ts", "timestamp", begin="2020-01-01 01:00:00", end="2020-12-31 23:59:00", + interval="1 minute", random=True, omit=True) + + # note use of baseColumn to control column build ordering + .withColumn("event_info", "string", + expr="to_json(named_struct('event_type', event_type, 'event_ts', event_ts))", + baseColumn=["event_type", "event_ts"]) + ) + + dfTestData = testDataSpec.build() + + display(dfTestData) -Use the `baseColumn` attribute to ensure that dependent columns are computed first. The `baseColumn` attribute -may specify either a string that names the column on which the current column depends or a list of column names -specified as a list of strings. 
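
As an optional follow-up check (a sketch using standard Spark SQL functions, not part of the example above), the
JSON-valued string columns can be parsed back into structs to confirm that they round-trip as expected:

.. code-block:: python

    from pyspark.sql import functions as F

    # sketch: parse the JSON string columns produced above back into structs for inspection
    dfParsed = (dfTestData
                .withColumn("manufacturer_struct",
                            F.from_json("manufacturer_info", "line string, manufacturer string"))
                .withColumn("event_struct",
                            F.from_json("event_info", "event_type string, event_ts timestamp")))

    dfParsed.select("manufacturer_struct", "event_struct").show(3, truncate=False)
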
From ea63d718eab68acd136d6c50da7507262576485c Mon Sep 17 00:00:00 2001 From: ronanstokes-db Date: Wed, 8 Feb 2023 14:02:16 -0800 Subject: [PATCH 4/4] fixed typo in docs --- docs/source/generating_json_data.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/generating_json_data.rst b/docs/source/generating_json_data.rst index 15fe5445..a49cd0bb 100644 --- a/docs/source/generating_json_data.rst +++ b/docs/source/generating_json_data.rst @@ -11,7 +11,7 @@ we mean columns that are some combination of `struct`, `array` and `map` of othe Generating JSON data -------------------- -There are several method for generating JSON data: +There are several methods for generating JSON data: - Generate a dataframe and save it as JSON will generate full data set as JSON - Generate JSON valued fields using SQL functions such as `named_struct` and `to_json`