Commit 1c47518

Cosmos-Spark: Ported the NYC-Taxi-Data end-to-end sample from pyspark to scala (Azure#26969)
1 parent 0d749b0 commit 1c47518

File tree

3 files changed: +399 -0 lines changed

sdk/cosmos/azure-cosmos-spark_3_2-12/Samples/Python/NYC-Taxi-Data/01_Batch.ipynb

Lines changed: 1 addition & 0 deletions
@@ -438,6 +438,7 @@
     "\n",
     "print(\"Starting count validation via query: \", datetime.datetime.utcnow().strftime(\"%Y-%m-%d %H:%M:%S.%f\"))\n",
     "count_query_schema=StructType(fields=[StructField(\"Count\", LongType(), True)])\n",
+    "readCfg[\"spark.cosmos.read.customQuery\"] = \"SELECT COUNT(0) AS Count FROM c\"\n",
     "query_df = spark.read.format(\"cosmos.oltp\").schema(count_query_schema).options(**readCfg).load()\n",
     "count_query = query_df.select(F.sum(\"Count\").alias(\"TotalCount\")).first()[\"TotalCount\"]\n",
     "print(\"Number of records retrieved via query: \", count_query) \n",
Lines changed: 354 additions & 0 deletions
@@ -0,0 +1,354 @@
// Databricks notebook source
// MAGIC %md
// MAGIC **Secrets**
// MAGIC
// MAGIC The secrets below, like the Cosmos account key, are retrieved from a secret scope. If you haven't defined a secret scope for the Cosmos account you want to use when going through this sample, you can find instructions on how to create one here:
// MAGIC - Here you can [Create a new secret scope](./#secrets/createScope) for the current Databricks workspace
// MAGIC - See how you can create an [Azure Key Vault backed secret scope](https://docs.microsoft.com/azure/databricks/security/secrets/secret-scopes#--create-an-azure-key-vault-backed-secret-scope)
// MAGIC - See how you can create a [Databricks backed secret scope](https://docs.microsoft.com/azure/databricks/security/secrets/secret-scopes#create-a-databricks-backed-secret-scope)
// MAGIC - And here you can find information on how to [add secrets to your Spark configuration](https://docs.microsoft.com/azure/databricks/security/secrets/secrets#read-a-secret)
// MAGIC
// MAGIC If you don't want to use secrets at all you can of course also just assign the values in clear text below, but for obvious reasons we recommend using secrets.

// COMMAND ----------

val cosmosEndpoint = spark.conf.get("spark.cosmos.accountEndpoint")
val cosmosMasterKey = spark.conf.get("spark.cosmos.accountKey")
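If the endpoint and key are not already wired into the Spark configuration, a minimal sketch of reading them straight from a secret scope could look like the following; the scope name "cosmos-scope" and the key names are placeholders for illustration and are not part of the committed notebook:

// Hypothetical alternative - read the account settings directly from a Databricks secret scope.
// "cosmos-scope", "cosmos-endpoint" and "cosmos-key" are placeholder names; replace them with
// the scope and keys you created for your workspace.
val cosmosEndpoint = dbutils.secrets.get(scope = "cosmos-scope", key = "cosmos-endpoint")
val cosmosMasterKey = dbutils.secrets.get(scope = "cosmos-scope", key = "cosmos-key")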
// COMMAND ----------

// MAGIC %md
// MAGIC **Preparation - creating the Cosmos DB container to ingest the data into**
// MAGIC
// MAGIC Configure the Catalog API to be used

// COMMAND ----------

spark.conf.set("spark.sql.catalog.cosmosCatalog", "com.azure.cosmos.spark.CosmosCatalog")
spark.conf.set("spark.sql.catalog.cosmosCatalog.spark.cosmos.accountEndpoint", cosmosEndpoint)
spark.conf.set("spark.sql.catalog.cosmosCatalog.spark.cosmos.accountKey", cosmosMasterKey)
spark.conf.set("spark.sql.catalog.cosmosCatalog.spark.cosmos.views.repositoryPath", "/viewDefinitions")

// COMMAND ----------

// MAGIC %md
// MAGIC And execute the command to create the new container with a throughput of up to 100,000 RU (autoscale, so 10,000 - 100,000 RU based on scale) and only system properties (like /id) being indexed. We also create a change feed sink container with the same settings, as well as a container that will be used to store metadata for the global throughput control.

// COMMAND ----------

// MAGIC %sql
// MAGIC CREATE DATABASE IF NOT EXISTS cosmosCatalog.SampleDatabase;
// MAGIC
// MAGIC CREATE TABLE IF NOT EXISTS cosmosCatalog.SampleDatabase.GreenTaxiRecords
// MAGIC USING cosmos.oltp
// MAGIC TBLPROPERTIES(partitionKeyPath = '/id', autoScaleMaxThroughput = '100000', indexingPolicy = 'OnlySystemProperties');
// MAGIC
// MAGIC CREATE TABLE IF NOT EXISTS cosmosCatalog.SampleDatabase.GreenTaxiRecordsCFSink
// MAGIC USING cosmos.oltp
// MAGIC TBLPROPERTIES(partitionKeyPath = '/id', autoScaleMaxThroughput = '100000', indexingPolicy = 'OnlySystemProperties');
// MAGIC
// MAGIC /* NOTE: It is important to enable TTL (can be off/-1 by default) on the throughput control container */
// MAGIC CREATE TABLE IF NOT EXISTS cosmosCatalog.SampleDatabase.ThroughputControl
// MAGIC USING cosmos.oltp
// MAGIC OPTIONS(spark.cosmos.database = 'SampleDatabase')
// MAGIC TBLPROPERTIES(partitionKeyPath = '/groupId', autoScaleMaxThroughput = '4000', indexingPolicy = 'AllProperties', defaultTtlInSeconds = '-1');
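The same DDL can also be issued from Scala via spark.sql instead of the %sql magic. A minimal sketch (not part of the committed notebook, shown here only for the first container) assuming the cosmosCatalog configuration from the earlier cell is already in place:

// Sketch: issuing the container DDL through spark.sql instead of the %sql magic.
spark.sql("CREATE DATABASE IF NOT EXISTS cosmosCatalog.SampleDatabase")
spark.sql(
  """CREATE TABLE IF NOT EXISTS cosmosCatalog.SampleDatabase.GreenTaxiRecords
    |USING cosmos.oltp
    |TBLPROPERTIES(partitionKeyPath = '/id', autoScaleMaxThroughput = '100000', indexingPolicy = 'OnlySystemProperties')""".stripMargin)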
// COMMAND ----------

// MAGIC %md
// MAGIC **Preparation - loading data source "[NYC Taxi & Limousine Commission - green taxi trip records](https://azure.microsoft.com/services/open-datasets/catalog/nyc-taxi-limousine-commission-green-taxi-trip-records/)"**
// MAGIC
// MAGIC The green taxi trip records include fields capturing pick-up and drop-off dates/times, pick-up and drop-off locations, trip distances, itemized fares, rate types, payment types, and driver-reported passenger counts. This data set has over 80 million records (>8 GB) of data and is available via a publicly accessible Azure Blob Storage account located in the East-US Azure region.

// COMMAND ----------

import java.time.Instant
import java.time.ZoneId
import java.time.ZoneOffset
import java.time.format.DateTimeFormatter
import java.util.UUID
import org.apache.spark.sql.functions.udf

val formatter = DateTimeFormatter.ISO_LOCAL_DATE_TIME.withZone(ZoneId.from(ZoneOffset.UTC))

println(s"Starting preparation: ${formatter.format(Instant.now)}")

// Azure storage access info
val blob_account_name = "azureopendatastorage"
val blob_container_name = "nyctlc"
val blob_relative_path = "green"
val blob_sas_token = ""

// Allow Spark to read from the blob container remotely
val wasbs_path = s"wasbs://${blob_container_name}@${blob_account_name}.blob.core.windows.net/${blob_relative_path}"
spark.conf.set(
  s"fs.azure.sas.${blob_container_name}.${blob_account_name}.blob.core.windows.net",
  blob_sas_token)
println(s"Remote blob path: ${wasbs_path}")

// Spark reads the parquet files - note that no data is loaded yet at this point
// NOTE - if you want to experiment with larger dataset sizes, consider switching to Option B
// (comment out the code for Option A and uncomment the code for Option B below) or increase
// the value passed into the limit function restricting the dataset size below

// ------------------------------------------------------------------------------------
// Option A - with limited dataset size
// ------------------------------------------------------------------------------------
val df_rawInputWithoutLimit = spark.read.parquet(wasbs_path)
val partitionCount = df_rawInputWithoutLimit.rdd.getNumPartitions
val df_rawInput = df_rawInputWithoutLimit.limit(1000 * 1000).repartition(partitionCount)
df_rawInput.persist()

// ------------------------------------------------------------------------------------
// Option B - entire dataset
// ------------------------------------------------------------------------------------
// val df_rawInput = spark.read.parquet(wasbs_path)

// Adding an id column with unique values
val uuidUdf = udf[String](() => UUID.randomUUID().toString)
val df_input_withId = df_rawInput.withColumn("id", uuidUdf())

println("Register the DataFrame as a SQL temporary view: source")
df_input_withId.createOrReplaceTempView("source")
println(s"Finished preparation: ${formatter.format(Instant.now)}")
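As a quick sanity check before ingesting (not part of the committed notebook), you can peek at the inferred parquet schema and the effective row count of the prepared DataFrame:

// Sketch: quick sanity check of the prepared input before ingestion.
df_input_withId.printSchema()
println(s"Prepared input rows: ${df_input_withId.count()}")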
// COMMAND ----------

// MAGIC %md
// MAGIC **Sample - ingesting the NYC Green Taxi data into Cosmos DB**
// MAGIC
// MAGIC By setting the target throughput threshold to 0.95 (95%) we reduce throttling but still allow the ingestion to consume most of the provisioned throughput. For scenarios where ingestion should only take a smaller subset of the available throughput, this threshold can be reduced accordingly.

// COMMAND ----------

println(s"Starting ingestion: ${formatter.format(Instant.now)}")

val writeCfg = Map(
  "spark.cosmos.accountEndpoint" -> cosmosEndpoint,
  "spark.cosmos.accountKey" -> cosmosMasterKey,
  "spark.cosmos.database" -> "SampleDatabase",
  "spark.cosmos.container" -> "GreenTaxiRecords",
  "spark.cosmos.write.strategy" -> "ItemOverwrite",
  "spark.cosmos.write.bulk.enabled" -> "true",
  "spark.cosmos.throughputControl.enabled" -> "true",
  "spark.cosmos.throughputControl.name" -> "NYCGreenTaxiDataIngestion",
  "spark.cosmos.throughputControl.targetThroughputThreshold" -> "0.95",
  "spark.cosmos.throughputControl.globalControl.database" -> "SampleDatabase",
  "spark.cosmos.throughputControl.globalControl.container" -> "ThroughputControl"
)

val df_NYCGreenTaxi_Input = spark.sql("SELECT * FROM source")

df_NYCGreenTaxi_Input
  .write
  .format("cosmos.oltp")
  .mode("Append")
  .options(writeCfg)
  .save()

println(s"Finished ingestion: ${formatter.format(Instant.now)}")
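If you prefer to cap ingestion at an absolute RU budget rather than a percentage of the provisioned throughput, a hedged sketch (assuming the connector's spark.cosmos.throughputControl.targetThroughput option; not part of the committed notebook) could swap the threshold setting as follows:

// Sketch: cap ingestion at an absolute RU budget instead of a percentage of the provisioned
// throughput. Assumes the connector's targetThroughput option; everything else is taken over
// unchanged from the writeCfg map above.
val writeCfgAbsoluteTarget =
  (writeCfg - "spark.cosmos.throughputControl.targetThroughputThreshold") +
    ("spark.cosmos.throughputControl.targetThroughput" -> "10000")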
// COMMAND ----------

// MAGIC %md
// MAGIC **Getting the reference record count**

// COMMAND ----------

val count_source = spark.sql("SELECT * FROM source").count()
println(s"Number of records in source: ${count_source}")

// COMMAND ----------

// MAGIC %md
// MAGIC **Sample - validating the record count via query**

// COMMAND ----------

import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._

println(s"Starting validation via query: ${formatter.format(Instant.now)}")
val readCfg = Map(
  "spark.cosmos.accountEndpoint" -> cosmosEndpoint,
  "spark.cosmos.accountKey" -> cosmosMasterKey,
  "spark.cosmos.database" -> "SampleDatabase",
  "spark.cosmos.container" -> "GreenTaxiRecords",
  "spark.cosmos.read.partitioning.strategy" -> "Restrictive", // IMPORTANT - any other partitioning strategy would result in the index not being used for the count - so latency and RU charge would spike
  "spark.cosmos.read.inferSchema.enabled" -> "false",
  "spark.cosmos.read.customQuery" -> "SELECT COUNT(0) AS Count FROM c"
)

val count_query_schema = StructType(Array(StructField("Count", LongType, true)))
val query_df = spark.read.format("cosmos.oltp").schema(count_query_schema).options(readCfg).load()
val count_query = query_df.agg(sum("Count").as("TotalCount")).first.getLong(0)
println(s"Number of records retrieved via query: ${count_query}")
println(s"Finished validation via query: ${formatter.format(Instant.now)}")

assert(count_source == count_query)
// COMMAND ----------

// MAGIC %md
// MAGIC **Sample - validating the record count via change feed**

// COMMAND ----------

println(s"Starting validation via change feed: ${formatter.format(Instant.now)}")
val changeFeedCfg = Map(
  "spark.cosmos.accountEndpoint" -> cosmosEndpoint,
  "spark.cosmos.accountKey" -> cosmosMasterKey,
  "spark.cosmos.database" -> "SampleDatabase",
  "spark.cosmos.container" -> "GreenTaxiRecords",
  "spark.cosmos.read.partitioning.strategy" -> "Default",
  "spark.cosmos.read.inferSchema.enabled" -> "false",
  "spark.cosmos.changeFeed.startFrom" -> "Beginning",
  "spark.cosmos.changeFeed.mode" -> "Incremental"
)
val changeFeed_df = spark.read.format("cosmos.oltp.changeFeed").options(changeFeedCfg).load()
val count_changeFeed = changeFeed_df.count()
println(s"Number of records retrieved via change feed: ${count_changeFeed}")
println(s"Finished validation via change feed: ${formatter.format(Instant.now)}")

assert(count_source == count_changeFeed)
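The GreenTaxiRecordsCFSink container created earlier is not used by the batch read above. As a hedged sketch (not part of the committed notebook, and only one possible way to use that container), the same change feed source could also be consumed as a structured stream and copied into that sink; the checkpoint path below is a placeholder:

// Sketch: consume the change feed as a structured stream and copy the documents into the
// GreenTaxiRecordsCFSink container created earlier. The checkpoint path is a placeholder.
val cfSinkWriteCfg = writeCfg + ("spark.cosmos.container" -> "GreenTaxiRecordsCFSink")
val changeFeedStream_df = spark.readStream.format("cosmos.oltp.changeFeed").options(changeFeedCfg).load()
changeFeedStream_df
  .writeStream
  .format("cosmos.oltp")
  .options(cfSinkWriteCfg)
  .option("checkpointLocation", "/tmp/nyc-taxi-changefeed-checkpoint")
  .outputMode("append")
  .start()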
// COMMAND ----------

// MAGIC %md
// MAGIC **Sample - bulk deleting documents and validating the document count afterwards**

// COMMAND ----------

import scala.math._

println(s"Starting to identify the documents to be deleted: ${formatter.format(Instant.now)}")
val readCfg = Map(
  "spark.cosmos.accountEndpoint" -> cosmosEndpoint,
  "spark.cosmos.accountKey" -> cosmosMasterKey,
  "spark.cosmos.database" -> "SampleDatabase",
  "spark.cosmos.container" -> "GreenTaxiRecords",
  "spark.cosmos.read.partitioning.strategy" -> "Default",
  "spark.cosmos.read.inferSchema.enabled" -> "false"
)

val toBeDeleted_df = spark.read.format("cosmos.oltp").options(readCfg).load().limit(100000)
println(s"Number of records to be deleted: ${toBeDeleted_df.count}")

println(s"Starting to bulk delete documents: ${formatter.format(Instant.now)}")
val deleteCfg = writeCfg + ("spark.cosmos.write.strategy" -> "ItemDelete")
toBeDeleted_df
  .write
  .format("cosmos.oltp")
  .mode("Append")
  .options(deleteCfg)
  .save()
println(s"Finished deleting documents: ${formatter.format(Instant.now)}")

println(s"Starting count validation via query: ${formatter.format(Instant.now)}")
val countCfg = readCfg + ("spark.cosmos.read.customQuery" -> "SELECT COUNT(0) AS Count FROM c")
val count_query_schema = StructType(Array(StructField("Count", LongType, true)))
val query_df = spark.read.format("cosmos.oltp").schema(count_query_schema).options(countCfg).load()
val count_query = query_df.agg(sum("Count").as("TotalCount")).first.getLong(0)
println(s"Number of records retrieved via query: ${count_query}")
println(s"Finished count validation via query: ${formatter.format(Instant.now)}")

assert(math.max(0, count_source - 100000) == count_query)
// COMMAND ----------

// MAGIC %md
// MAGIC **Sample - showing the existing Containers**

// COMMAND ----------

// MAGIC %sql
// MAGIC SHOW TABLES FROM cosmosCatalog.SampleDatabase

// COMMAND ----------

val df_Tables = spark.sql("SHOW TABLES FROM cosmosCatalog.SampleDatabase")
assert(df_Tables.count() == 3)

// COMMAND ----------

// MAGIC %md
// MAGIC **Sample - querying a Cosmos Container via Spark Catalog**

// COMMAND ----------

// MAGIC %sql
// MAGIC SELECT * FROM cosmosCatalog.SampleDatabase.GreenTaxiRecords LIMIT 10

// COMMAND ----------

// MAGIC %md
// MAGIC **Sample - querying a Cosmos Container with custom settings via Spark Catalog**

// COMMAND ----------

// MAGIC %md
// MAGIC Creating the view with custom settings (in this case adding a projection, disabling schema inference and switching to the aggressive partitioning strategy)

// COMMAND ----------

// MAGIC %sql
// MAGIC CREATE TABLE cosmosCatalog.SampleDatabase.GreenTaxiRecordsView
// MAGIC (id STRING, _ts TIMESTAMP, vendorID INT, totalAmount DOUBLE)
// MAGIC USING cosmos.oltp
// MAGIC TBLPROPERTIES(isCosmosView = 'True')
// MAGIC OPTIONS (
// MAGIC   spark.cosmos.database = 'SampleDatabase',
// MAGIC   spark.cosmos.container = 'GreenTaxiRecords',
// MAGIC   spark.cosmos.read.inferSchema.enabled = 'False',
// MAGIC   spark.cosmos.read.inferSchema.includeSystemProperties = 'True',
// MAGIC   spark.cosmos.read.partitioning.strategy = 'Aggressive');
// MAGIC
// MAGIC SELECT * FROM cosmosCatalog.SampleDatabase.GreenTaxiRecordsView LIMIT 10
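Because the view declares typed columns, it can be queried like any other table. A small hedged example (not part of the committed notebook) aggregating over the projected columns from Scala:

// Sketch: use the typed columns projected by the view for a simple aggregation from Scala.
val df_byVendor = spark.sql(
  """SELECT vendorID, COUNT(*) AS trips, AVG(totalAmount) AS avgTotalAmount
    |FROM cosmosCatalog.SampleDatabase.GreenTaxiRecordsView
    |GROUP BY vendorID""".stripMargin)
df_byVendor.show()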
// COMMAND ----------

// MAGIC %md
// MAGIC Creating another view with custom settings (in this case enabling schema inference and switching to the restrictive partitioning strategy)

// COMMAND ----------

// MAGIC %sql
// MAGIC CREATE TABLE cosmosCatalog.SampleDatabase.GreenTaxiRecordsAnotherView
// MAGIC USING cosmos.oltp
// MAGIC TBLPROPERTIES(isCosmosView = 'True')
// MAGIC OPTIONS (
// MAGIC   spark.cosmos.database = 'SampleDatabase',
// MAGIC   spark.cosmos.container = 'GreenTaxiRecords',
// MAGIC   spark.cosmos.read.inferSchema.enabled = 'True',
// MAGIC   spark.cosmos.read.inferSchema.includeSystemProperties = 'False',
// MAGIC   spark.cosmos.read.partitioning.strategy = 'Restrictive');
// MAGIC
// MAGIC SELECT * FROM cosmosCatalog.SampleDatabase.GreenTaxiRecordsAnotherView LIMIT 10

// COMMAND ----------

// MAGIC %md
// MAGIC Show all tables in the Cosmos catalog to verify that both the "real" containers and the views show up

// COMMAND ----------

// MAGIC %sql
// MAGIC SHOW TABLES FROM cosmosCatalog.SampleDatabase

// COMMAND ----------

val df_Tables = spark.sql("SHOW TABLES FROM cosmosCatalog.SampleDatabase")
assert(df_Tables.count() == 5)

// COMMAND ----------

// MAGIC %md
// MAGIC **Clean up the views again**

// COMMAND ----------

// MAGIC %sql
// MAGIC DROP TABLE IF EXISTS cosmosCatalog.SampleDatabase.GreenTaxiRecordsView;
// MAGIC DROP TABLE IF EXISTS cosmosCatalog.SampleDatabase.GreenTaxiRecordsAnotherView;
// MAGIC SHOW TABLES FROM cosmosCatalog.SampleDatabase

// COMMAND ----------

val df_Tables = spark.sql("SHOW TABLES FROM cosmosCatalog.SampleDatabase")
assert(df_Tables.count() == 3)
