
Commit 39d22c1

Added the TPC-DS connector package code as a real-world example and added the relevant description to the README of the minimal connector
1 parent b76ad05 commit 39d22c1

32 files changed (+2,100 -0 lines)

GlueCustomConnectors/development/Spark/glue-3.0/README.md

Lines changed: 2 additions & 0 deletions
@@ -3,6 +3,8 @@
## Introduction
This document shows how to develop a connector that supports Glue 3.0 and Spark 3.1.1 for reading and writing data. We'll use a simple example to give an overview of the DataSourceV2 interface implementations in Spark 3. The full code example is [MinimalSpark3Connector](./MinimalSpark3Connector.scala) in the same folder.

+In addition to the minimal connector, please refer to the [TPC-DS connector for Glue 3.0](./tpcds-custom-connector-for-glue3.0) in the same folder, which is the actual Glue custom connector package published on AWS Marketplace.
+
## Setup Environment
Build a local Scala environment with the local Glue ETL Maven library: [Developing Locally with Scala](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-libraries.html). You may also refer to [GlueSparkRuntime](https://github.com/aws-samples/aws-glue-samples/blob/master/GlueCustomConnectors/development/GlueSparkRuntime/README.md) for more details on customizing the local environment for advanced testing.
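If you prefer sbt for the local environment, a minimal build.sbt along the following lines should work. This sketch is not part of the commit: the Scala version, the Glue artifact coordinates, and the artifact repository URL are assumptions; confirm them against the "Developing Locally with Scala" guide linked above.

// build.sbt -- minimal local-development sketch (NOT from this commit).
// The Scala version, repository URL, and artifact coordinates below are assumptions;
// verify them against the "Developing Locally with Scala" guide before use.
name := "glue3-connector-dev"
scalaVersion := "2.12.15" // assumed: the Scala line used by Spark 3.1.x

// assumed AWS Glue ETL artifact repository
resolvers += "aws-glue-etl-artifacts" at "https://aws-glue-etl-artifacts.s3.amazonaws.com/release/"

// assumed coordinates of the Glue 3.0 ETL library
libraryDependencies += "com.amazonaws" % "AWSGlueETL" % "3.0.0" % Provided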
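To make the Introduction above more concrete, here is a compact sketch of the Spark 3.1 DataSourceV2 read-side interfaces that a connector implements (TableProvider, Table, ScanBuilder, Scan/Batch, PartitionReaderFactory, PartitionReader). It is an independent illustration, not the MinimalSpark3Connector from this repository and not the TPC-DS connector added in this commit; all class names (SimpleRangeSource and friends) are made up.

// A minimal, self-contained sketch of the Spark 3.1 DataSourceV2 read path.
// Illustration only: not the MinimalSpark3Connector or the TPC-DS connector from this commit.
import java.util

import scala.collection.JavaConverters._

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability, TableProvider}
import org.apache.spark.sql.connector.expressions.Transform
import org.apache.spark.sql.connector.read.{Batch, InputPartition, PartitionReader, PartitionReaderFactory, Scan, ScanBuilder}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.apache.spark.sql.util.CaseInsensitiveStringMap

// Entry point: Spark instantiates this class when the format name resolves to it.
class SimpleRangeSource extends TableProvider {
  override def inferSchema(options: CaseInsensitiveStringMap): StructType = SimpleRangeTable.schema
  override def getTable(schema: StructType, partitioning: Array[Transform],
                        properties: util.Map[String, String]): Table = new SimpleRangeTable
}

object SimpleRangeTable {
  val schema: StructType = StructType(Seq(StructField("value", IntegerType)))
}

// A batch-readable table with a fixed single-column schema.
class SimpleRangeTable extends Table with SupportsRead {
  override def name(): String = "simple_range"
  override def schema(): StructType = SimpleRangeTable.schema
  override def capabilities(): util.Set[TableCapability] = Set(TableCapability.BATCH_READ).asJava
  override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = new ScanBuilder {
    override def build(): Scan = new SimpleRangeScan
  }
}

// The Scan doubles as the Batch; it plans two partitions of five rows each.
class SimpleRangeScan extends Scan with Batch {
  override def readSchema(): StructType = SimpleRangeTable.schema
  override def toBatch: Batch = this
  override def planInputPartitions(): Array[InputPartition] =
    Array(RangePartition(0, 5), RangePartition(5, 10))
  override def createReaderFactory(): PartitionReaderFactory = new RangeReaderFactory
}

case class RangePartition(start: Int, end: Int) extends InputPartition

// Runs on the executors: each reader emits the integers of its own partition.
class RangeReaderFactory extends PartitionReaderFactory {
  override def createReader(partition: InputPartition): PartitionReader[InternalRow] = {
    val p = partition.asInstanceOf[RangePartition]
    new PartitionReader[InternalRow] {
      private var current = p.start - 1
      override def next(): Boolean = { current += 1; current < p.end }
      override def get(): InternalRow = InternalRow(current)
      override def close(): Unit = ()
    }
  }
}

Reading such a source from plain Spark would look like spark.read.format("SimpleRangeSource").load() (use the fully qualified class name if the class lives in a package); in a Glue job the connector class is registered with the connector/connection instead, which the validation scripts later in this commit reference through connectionName.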

GlueCustomConnectors/development/Spark/glue-3.0/tpcds-custom-connector-for-glue3.0/README.md

Lines changed: 245 additions & 0 deletions
Large diffs are not rendered by default.
383 KB binary file (preview not rendered)
@@ -0,0 +1,90 @@
# Copyright 2016-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

from pyspark.context import SparkContext
from awsglue.context import GlueContext


glue_context = GlueContext(SparkContext(), minPartitions=1, targetPartitions=20)
logger = glue_context.get_logger()
connection_name = "GlueTPCDSConnection"
connection_type = "marketplace.spark"  # use `custom.spark` when running job validation tests with a self-built custom connector

'''
1. Partition count - generated single chunk data and row count is more than numPartitions
'''
connection_options_1 = {
    "table": "customer",
    "scale": "1",
    "numPartitions": "5",
    "connectionName": connection_name
}

# read data from data source
dyf_1 = glue_context.create_dynamic_frame_from_options(
    connection_type=connection_type,
    connection_options=connection_options_1)

# validate number of partitions and row count
expected_partitions = 5
result_partitions = dyf_1.getNumPartitions()
assert result_partitions == expected_partitions
logger.info(f'Expected partition count: {expected_partitions}, result partition count: {result_partitions}')

expected_count = 100000
result_count = dyf_1.count()
assert result_count == expected_count
logger.info(f'Expected record count: {expected_count}, result record count: {result_count}')

'''
2. Partition count - generated single chunk data and row count is less than numPartitions
'''
connection_options_2 = {
    "table": "call_center",
    "scale": "1",
    "numPartitions": "100",
    "connectionName": connection_name
}

# read data from data source
dyf_2 = glue_context.create_dynamic_frame_from_options(
    connection_type=connection_type,
    connection_options=connection_options_2)

# validate number of partitions and row count
expected_partitions = 1
result_partitions = dyf_2.getNumPartitions()
assert result_partitions == expected_partitions
logger.info(f'Expected partition count: {expected_partitions}, result partition count: {result_partitions}')

expected_count = 6
result_count = dyf_2.count()
assert result_count == expected_count
logger.info(f'Expected record count: {expected_count}, result record count: {result_count}')


'''
3. Partition count - generated multiple chunk data - in parallel
'''
connection_options_3 = {
    "table": "customer",
    "scale": "100",
    "numPartitions": "100",
    "connectionName": connection_name
}

# read data from data source
dyf_3 = glue_context.create_dynamic_frame_from_options(
    connection_type=connection_type,
    connection_options=connection_options_3)

# validate number of partitions and row count
expected_partitions = 100
result_partitions = dyf_3.getNumPartitions()
assert result_partitions == expected_partitions
logger.info(f'Expected partition count: {expected_partitions}, result partition count: {result_partitions}')

expected_count = 2000000
result_count = dyf_3.count()
assert result_count == expected_count
logger.info(f'Expected record count: {expected_count}, result record count: {result_count}')
@@ -0,0 +1,64 @@
# Copyright 2016-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.gluetypes import Field, IntegerType, LongType, StructType, StringType, DecimalType, DateType


glue_context = GlueContext(SparkContext())
logger = glue_context.get_logger()

connection_options = {
    "table": "call_center",
    "scale": "1",
    "numPartitions": "1",
    "connectionName": "GlueTPCDSConnection"
}

# read data from data source
datasource0 = glue_context.create_dynamic_frame_from_options(
    connection_type="marketplace.spark",  # use `custom.spark` when running job validation tests with a self-built custom connector
    connection_options=connection_options)

# validate data schema
expected_schema = StructType([
    Field("cc_call_center_sk", LongType()),
    Field("cc_call_center_id", StringType()),
    Field("cc_rec_start_date", DateType()),
    Field("cc_rec_end_date", DateType()),
    Field("cc_closed_date_sk", IntegerType()),
    Field("cc_open_date_sk", IntegerType()),
    Field("cc_name", StringType()),
    Field("cc_class", StringType()),
    Field("cc_employees", IntegerType()),
    Field("cc_sq_ft", IntegerType()),
    Field("cc_hours", StringType()),
    Field("cc_manager", StringType()),
    Field("cc_mkt_id", IntegerType()),
    Field("cc_mkt_class", StringType()),
    Field("cc_mkt_desc", StringType()),
    Field("cc_market_manager", StringType()),
    Field("cc_division", IntegerType()),
    Field("cc_division_name", StringType()),
    Field("cc_company", IntegerType()),
    Field("cc_company_name", StringType()),
    Field("cc_street_number", StringType()),
    Field("cc_street_name", StringType()),
    Field("cc_street_type", StringType()),
    Field("cc_suite_number", StringType()),
    Field("cc_city", StringType()),
    Field("cc_county", StringType()),
    Field("cc_state", StringType()),
    Field("cc_zip", StringType()),
    Field("cc_country", StringType()),
    Field("cc_gmt_offset", DecimalType(precision=5, scale=2)),
    Field("cc_tax_percentage", DecimalType(precision=5, scale=2))
])

result_schema = datasource0.schema()
assert result_schema == expected_schema
logger.info(f'Expected schema: {expected_schema.jsonValue()}')
logger.info(f'Result schema: {result_schema.jsonValue()}')
logger.info("Result schema in tree structure: ")
datasource0.printSchema()
@@ -0,0 +1,27 @@
# Copyright 2016-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

from pyspark.context import SparkContext
from awsglue.context import GlueContext


glue_context = GlueContext(SparkContext())
logger = glue_context.get_logger()

connection_options = {
    "table": "customer",
    "scale": "1",
    "numPartitions": "1",
    "connectionName": "GlueTPCDSConnection"
}

# read data from data source
datasource0 = glue_context.create_dynamic_frame_from_options(
    connection_type="marketplace.spark",
    connection_options=connection_options)

# validate data reading and row count
expected_count = 100000
result_count = datasource0.count()
assert result_count == expected_count
logger.info(f'Expected record count: {expected_count}, result record count: {result_count}')
@@ -0,0 +1,79 @@
/*
 * Copyright 2016-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: MIT-0
 */

import com.amazonaws.services.glue.GlueContext
import com.amazonaws.services.glue.errors.CallSite
import com.amazonaws.services.glue.schema.{Schema, TypeCode}
import com.amazonaws.services.glue.schema.builders.SchemaBuilder
import com.amazonaws.services.glue.schema.types.DecimalType
import com.amazonaws.services.glue.util.GlueArgParser
import com.amazonaws.services.glue.util.Job
import com.amazonaws.services.glue.util.JsonOptions
import org.apache.spark.SparkContext

import scala.collection.JavaConverters._

object GlueJobValidationDataPartitioningTest {
  def main(sysArgs: Array[String]) {
    val spark: SparkContext = new SparkContext()
    val glueContext: GlueContext = new GlueContext(spark, 1, 20) // override minPartitions and targetPartitions so the GlueContext default config does not repartition the data

    val connectionName = "GlueTPCDSConnection"
    val connectionType = "marketplace.spark"

    // 1. Partition count - generated single chunk data and row count is more than numPartitions
    val options_1 = Map(
      "table" -> "customer",
      "scale" -> "1",
      "numPartitions" -> "5",
      "connectionName" -> connectionName
    )

    val dyf_1 = glueContext.getSource(
      connectionType = connectionType,
      connectionOptions = JsonOptions(options_1),
      transformationContext = "dyf"
    ).getDynamicFrame()

    assert(dyf_1.getNumPartitions == 5)
    assert(dyf_1.count == 100000)


    // 2. Partition count - generated single chunk data and row count is less than numPartitions
    val options_2 = Map(
      "table" -> "call_center",
      "scale" -> "1",
      "numPartitions" -> "100",
      "connectionName" -> connectionName
    )

    val dyf_2 = glueContext.getSource(
      connectionType = connectionType,
      connectionOptions = JsonOptions(options_2),
      transformationContext = "dyf"
    ).getDynamicFrame()

    assert(dyf_2.getNumPartitions == 1)
    assert(dyf_2.count == 6)


    // 3. Partition count - generated multiple chunk data - in parallel
    val options_3 = Map(
      "table" -> "customer",
      "scale" -> "100",
      "numPartitions" -> "100",
      "connectionName" -> connectionName
    )

    val dyf_3 = glueContext.getSource(
      connectionType = connectionType,
      connectionOptions = JsonOptions(options_3),
      transformationContext = "dyf"
    ).getDynamicFrame()

    assert(dyf_3.getNumPartitions == 100)
    assert(dyf_3.count == 2000000)
  }
}
@@ -0,0 +1,75 @@
/*
 * Copyright 2016-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: MIT-0
 */

import com.amazonaws.services.glue.GlueContext
import com.amazonaws.services.glue.errors.CallSite
import com.amazonaws.services.glue.schema.{Schema, TypeCode}
import com.amazonaws.services.glue.schema.builders.SchemaBuilder
import com.amazonaws.services.glue.schema.types.DecimalType
import com.amazonaws.services.glue.util.GlueArgParser
import com.amazonaws.services.glue.util.Job
import com.amazonaws.services.glue.util.JsonOptions
import org.apache.spark.SparkContext

import scala.collection.JavaConverters._

object GlueJobValidationDataSchemaTest {
  def main(sysArgs: Array[String]) {
    val spark: SparkContext = new SparkContext()
    val glueContext: GlueContext = new GlueContext(spark)

    val optionsMap = Map(
      "table" -> "call_center",
      "scale" -> "1",
      "numPartitions" -> "1",
      "connectionName" -> "GlueTPCDSConnection"
    )

    // create DataSource and read data
    val customSource = glueContext.getSource(
      connectionType = "marketplace.spark",
      connectionOptions = JsonOptions(optionsMap),
      transformationContext = "customSource")
    val dyf = customSource.getDynamicFrame()

    // verify schema of the 'call_center' table
    val expectedSchema = new Schema(new SchemaBuilder()
      .beginStruct()
      .atomicField("cc_call_center_sk", TypeCode.LONG)
      .atomicField("cc_call_center_id", TypeCode.STRING)
      .atomicField("cc_rec_start_date", TypeCode.DATE)
      .atomicField("cc_rec_end_date", TypeCode.DATE)
      .atomicField("cc_closed_date_sk", TypeCode.INT)
      .atomicField("cc_open_date_sk", TypeCode.INT)
      .atomicField("cc_name", TypeCode.STRING)
      .atomicField("cc_class", TypeCode.STRING)
      .atomicField("cc_employees", TypeCode.INT)
      .atomicField("cc_sq_ft", TypeCode.INT)
      .atomicField("cc_hours", TypeCode.STRING)
      .atomicField("cc_manager", TypeCode.STRING)
      .atomicField("cc_mkt_id", TypeCode.INT)
      .atomicField("cc_mkt_class", TypeCode.STRING)
      .atomicField("cc_mkt_desc", TypeCode.STRING)
      .atomicField("cc_market_manager", TypeCode.STRING)
      .atomicField("cc_division", TypeCode.INT)
      .atomicField("cc_division_name", TypeCode.STRING)
      .atomicField("cc_company", TypeCode.INT)
      .atomicField("cc_company_name", TypeCode.STRING)
      .atomicField("cc_street_number", TypeCode.STRING)
      .atomicField("cc_street_name", TypeCode.STRING)
      .atomicField("cc_street_type", TypeCode.STRING)
      .atomicField("cc_suite_number", TypeCode.STRING)
      .atomicField("cc_city", TypeCode.STRING)
      .atomicField("cc_county", TypeCode.STRING)
      .atomicField("cc_state", TypeCode.STRING)
      .atomicField("cc_zip", TypeCode.STRING)
      .atomicField("cc_country", TypeCode.STRING)
      .atomicField("cc_gmt_offset", new DecimalType(5, 2))
      .atomicField("cc_tax_percentage", new DecimalType(5, 2))
      .endStruct().build())

    assert(dyf.schema == expectedSchema)
  }
}
@@ -0,0 +1,37 @@
/*
 * Copyright 2016-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: MIT-0
 */

import com.amazonaws.services.glue.GlueContext
import com.amazonaws.services.glue.errors.CallSite
import com.amazonaws.services.glue.util.GlueArgParser
import com.amazonaws.services.glue.util.Job
import com.amazonaws.services.glue.util.JsonOptions
import org.apache.spark.SparkContext
import scala.collection.JavaConverters._

object GlueJobValidationDataSourceTest {
  def main(sysArgs: Array[String]) {
    val spark: SparkContext = new SparkContext()
    val glueContext: GlueContext = new GlueContext(spark)

    val optionsMap = Map(
      "table" -> "customer",
      "scale" -> "1",
      "numPartitions" -> "1",
      "connectionName" -> "GlueTPCDSConnection"
    )

    // create DataSource
    val customSource = glueContext.getSource(
      connectionType = "marketplace.spark",
      connectionOptions = JsonOptions(optionsMap),
      transformationContext = "customSource")
    val dyf = customSource.getDynamicFrame()

    // verify data
    val expectedRowCount = 100000
    assert(dyf.count == expectedRowCount)
  }
}
