databrickslabs
diff --git a/‎dbldatagen/text_generators.py‎
Lines changed: 344 additions & 67 deletions b/‎dbldatagen/text_generators.py‎
Lines changed: 344 additions & 67 deletions
diff --git a/‎docs/source/repeatable_data_generation.rst‎
Lines changed: 13 additions & 2 deletions b/‎docs/source/repeatable_data_generation.rst‎
Lines changed: 13 additions & 2 deletions
diff --git a/‎tests/test_data_generation_plugins.py‎
Lines changed: 18 additions & 37 deletions b/‎tests/test_data_generation_plugins.py‎
Lines changed: 18 additions & 37 deletions
diff --git a/‎tests/test_distributions.py‎
Lines changed: 3 additions & 0 deletions b/‎tests/test_distributions.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎tests/test_pandas_integration.py‎
Lines changed: 1 addition & 15 deletions b/‎tests/test_pandas_integration.py‎
Lines changed: 1 addition & 15 deletions
@@ -36,7 +36,7 @@ In addition, two columns in two different tables produced with the same generati
 the same values allowing for creation of multiple tables with referential integrity for use in joins.
 
 .. note::
-   The key exception to repeatability is where the data set contains the timestamp or date of when the
+   The exception to repeatability is where the data set contains the timestamp or date of when the
    data is written. In these cases, runs from a later date will have different values.
 
    This is why we stress generating date or timestamp ranges with a specific ``begin``, ``end`` and ``interval``
@@ -59,6 +59,11 @@ All columns will use the same random seed unless the random seed method is speci
 the seed is overridden at the column level. In the case of the use of the 'hash_fieldname' generation method,
 it will use a hash value of the field name so that each column has a different seed.
 
+.. note::
+   The text generators for templates and ILText always generate random data irrespective of the base column.
+   This means that they will produce repeatable data from run to run if a random seed is used, but they will not
+   produce the same values for the same value of the base column.
+
 True random Data
 ^^^^^^^^^^^^^^^^
 To generate true random values, the random seed of -1 must be specified, either at the data spec level or at the
@@ -68,13 +73,19 @@ In this case,
 there is no guarantees of data repeatability - but you can constrain the data generated to specific ranges to use as
 foreign keys for data in other tables.
 
-If columns are not marked random - they will produce a repeatable set of data. For most columns, as the columns
+If columns are not marked random, they will produce a repeatable set of data (with the exception of ILText, template
+generation and third-party library integration). For most columns, as the columns
 are produced by a deterministic transformation on the corresponding base columns, the data will always be repeatable.
 
 For columns generated using an inherently random process such as those produced with the template generation, ILText
 and text data generator plugins, the random process will be seeded with a constant value unless the corresponding
 column specification is marked as ``random``.
 
+.. note::
+   Again, this means that data will be repeatable from run to run, but not for a specific
+   value of the base column. For some third-party libraries such as ``Faker``, there is no integration of the random
+   seeding capabilities at present, so data will not be repeatable from run to run.
+
 If a random seed is provided, either as an argument to the DataGenerator instance specification,
 or as option on the column specification, the random seed will be applied to fields when random data generation is used.
 
 
@@ -1,4 +1,4 @@
-import unittest
+import pytest
 
 import pandas as pd
 import numpy as np
@@ -9,30 +9,30 @@
 spark = dg.SparkSingleton.getLocalInstance("basic tests")
 
 
-class TestTextGenerationPlugins(unittest.TestCase):
+class TestTextGenerationPlugins:
     row_count = 15000
     column_count = 10
 
-    def test_plugins(self):
+    @pytest.mark.parametrize("dataRows", [1000, 10000, 100000])
+    def test_plugins(self, dataRows):
         partitions_requested = 4
-        data_rows = 100 * 1000
 
         def initPluginContext(context):
             context.prefix = "testing"
 
         text_generator = (lambda context, v: context.prefix + str(v))
 
-        pluginDataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
+        pluginDataspec = (dg.DataGenerator(spark, rows=dataRows, partitions=partitions_requested)
                           .withColumn("text", text=PyfuncText(text_generator, init=initPluginContext))
                           )
         dfPlugin = pluginDataspec.build()
 
-        self.assertTrue(dfPlugin.count() == data_rows)
+        assert dfPlugin.count() == dataRows
 
         dfCheck = dfPlugin.where("text like 'testing%'")
         new_count = dfCheck.count()
 
-        self.assertTrue(new_count == data_rows)
+        assert new_count == dataRows
 
     def test_plugin_clone(self):
         partitions_requested = 4
@@ -51,7 +51,7 @@ def initPluginContext(context):
         dfCheck = dfPlugin.where("text like 'testing%'")
         new_count = dfCheck.count()
 
-        self.assertTrue(new_count == data_rows)
+        assert new_count == data_rows
 
         # now check the clone
 
@@ -61,7 +61,7 @@ def initPluginContext(context):
         dfCheck2 = dfPlugin2.where("text like 'testing%'")
         new_count2 = dfCheck2.count()
 
-        self.assertTrue(new_count2 == data_rows)
+        assert new_count2 == data_rows
 
     def test_plugins_extended_syntax(self):
         """ test property syntax"""
@@ -85,12 +85,12 @@ def initPluginContext(context):
                           )
         dfPlugin = pluginDataspec.build()
 
-        self.assertTrue(dfPlugin.count() == data_rows)
+        assert dfPlugin.count() == data_rows
 
         dfCheck = dfPlugin.where("text like 'testing1'")
         new_count = dfCheck.count()
 
-        self.assertTrue(new_count == data_rows)
+        assert new_count == data_rows
 
     def test_plugins_extended_syntax2(self):
         """ test arg passing"""
@@ -115,12 +115,12 @@ def initPluginContext(context):
                           )
         dfPlugin = pluginDataspec.build()
 
-        self.assertTrue(dfPlugin.count() == data_rows)
+        assert dfPlugin.count() == data_rows
 
         dfCheck = dfPlugin.where("text like 'testing1'")
         new_count = dfCheck.count()
 
-        self.assertTrue(new_count == data_rows)
+        assert new_count == data_rows
 
     def test_plugins_extended_syntax3(self):
         partitions_requested = 4
@@ -143,12 +143,12 @@ def initPluginContext(context):
                           )
         dfPlugin = pluginDataspec.build()
 
-        self.assertTrue(dfPlugin.count() == data_rows)
+        assert dfPlugin.count() == data_rows
 
         dfCheck = dfPlugin.where("text like 'testing1again'")
         new_count = dfCheck.count()
 
-        self.assertTrue(new_count == data_rows)
+        assert new_count == data_rows
 
     def test_plugins_extended_syntax4(self):
         """ Test syntax extensions """
@@ -175,7 +175,7 @@ def initPluginContext(context):
         output = list(textGen.pandasGenerateText(inputSeries))
 
         for x in output:
-            self.assertEqual(x, "testing1again")
+            assert x == "testing1again"
 
     def test_plugins_faker_integration(self):
         """ test faker integration with mock objects"""
@@ -203,7 +203,7 @@ def test_plugins_faker_integration(self):
         dfFaker2 = fakerDataspec2.build()
         output = dfFaker2.select("name").collect()
         for x in output:
-            self.assertTrue(x["name"].startswith("<MagicMock"))
+            assert x["name"].startswith("<MagicMock")
 
     def test_plugins_faker_integration2(self):
         """ test faker integration with mock objects"""
@@ -231,24 +231,5 @@ def test_plugins_faker_integration2(self):
         dfFaker2 = fakerDataspec2.build()
         output = dfFaker2.select("name").collect()
         for x in output:
-            self.assertTrue(x["name"].startswith("<MagicMock"))
-
-
-# run the tests
-# if __name__ == '__main__':
-#  print("Trying to run tests")
-#  unittest.main(argv=['first-arg-is-ignored'],verbosity=2,exit=False)
-
-# def runTests(suites):
-#    suite = unittest.TestSuite()
-#    result = unittest.TestResult()
-#    for testSuite in suites:
-#        suite.addTest(unittest.makeSuite(testSuite))
-#    runner = unittest.TextTestRunner()
-#    print(runner.run(suite))
-
-
-# runTests([TestBasicOperation])
+            assert x["name"].startswith("<MagicMock")
 
-if __name__ == '__main__':
-    unittest.main()
@@ -34,6 +34,9 @@ def setUpClass(cls):
         )
         cls.testdata_generator.build().cache().createOrReplaceTempView("testdata")
 
+        # change to test build process
+        print("inside setupClass")
+
     @classmethod
     def unique_timestamp_seconds(cls):
         return (datetime.datetime.utcnow() - datetime.datetime.fromtimestamp(0)).total_seconds()
 
@@ -98,18 +98,4 @@ def test_numpy2(self):
 
         self.assertGreater(np.sum(data), 0)
 
-# run the tests
-# if __name__ == '__main__':
-#  print("Trying to run tests")
-#  unittest.main(argv=['first-arg-is-ignored'],verbosity=2,exit=False)
-
-# def runTests(suites):
-#    suite = unittest.TestSuite()
-#    result = unittest.TestResult()
-#    for testSuite in suites:
-#        suite.addTest(unittest.makeSuite(testSuite))
-#    runner = unittest.TextTestRunner()
-#    print(runner.run(suite))
-
-
-# runTests([TestBasicOperation])
+#
Original file line number	Diff line number	Diff line change
`@@ -34,6 +34,9 @@ def setUpClass(cls):`
`34`	`34`	`)`
`35`	`35`	`cls.testdata_generator.build().cache().createOrReplaceTempView("testdata")`
`36`	`36`
	`37`	`+ # change to test build process`
	`38`	`+ print("inside setupClass")`
	`39`	`+`
`37`	`40`	`@classmethod`
`38`	`41`	`def unique_timestamp_seconds(cls):`
`39`	`42`	`return (datetime.datetime.utcnow() - datetime.datetime.fromtimestamp(0)).total_seconds()`