Skip to content

Commit 1905567

Browse files
authored
Cosmos Spark: Support CustomQuery to be used for inference (Azure#22079)
* Updating name * Using custom query if available * Tests * Tests * Adding doc
1 parent ef0f607 commit 1905567

File tree

5 files changed

+62
-14
lines changed

5 files changed

+62
-14
lines changed

sdk/cosmos/azure-cosmos-spark_3-1_2-12/docs/configuration-reference.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ Configuration Reference:
3535
### Query Config
3636
| Config Property Name | Default | Description |
3737
| :--- | :---- | :--- |
38-
| `spark.cosmos.read.customQuery` | None | When provided the custom query will be processed against the Cosmos endpoint instead of dynamically generating the query via predicate push down. Usually it is recommended to rely on Spark's predicate push down because that will allow to generate the most efficient set of filters based on the query plan. But there are a couple of of predicates like aggregates (count, group by, avg, sum etc.) that cannot be pushed down yet (at least in Spark 3.1) - so the custom query is a fallback to allow them to be pushed into the query sent to Cosmos. |
38+
| `spark.cosmos.read.customQuery` | None | When provided the custom query will be processed against the Cosmos endpoint instead of dynamically generating the query via predicate push down. Usually it is recommended to rely on Spark's predicate push down because that will allow to generate the most efficient set of filters based on the query plan. But there are a couple of predicates like aggregates (count, group by, avg, sum etc.) that cannot be pushed down yet (at least in Spark 3.1) - so the custom query is a fallback to allow them to be pushed into the query sent to Cosmos. If specified, with schema inference enabled, the custom query will also be used to infer the schema. |
3939

4040
#### Schema Inference Config
4141

sdk/cosmos/azure-cosmos-spark_3-1_2-12/src/main/scala/com/azure/cosmos/spark/CosmosConfig.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -549,7 +549,7 @@ private object CosmosSchemaInferenceConfig {
549549
parseFromStringFunction = query => query,
550550
helpMessage = "When schema inference is enabled, used as custom query to infer it")
551551

552-
def parseCosmosReadConfig(cfg: Map[String, String]): CosmosSchemaInferenceConfig = {
552+
def parseCosmosInferenceConfig(cfg: Map[String, String]): CosmosSchemaInferenceConfig = {
553553
val samplingSize = CosmosConfigEntry.parse(cfg, inferSchemaSamplingSize)
554554
val enabled = CosmosConfigEntry.parse(cfg, inferSchemaEnabled)
555555
val query = CosmosConfigEntry.parse(cfg, inferSchemaQuery)

sdk/cosmos/azure-cosmos-spark_3-1_2-12/src/main/scala/com/azure/cosmos/spark/CosmosTableSchemaInferrer.scala

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -75,29 +75,34 @@ private object CosmosTableSchemaInferrer
7575
private[spark] def inferSchema(client: CosmosAsyncClient,
7676
userConfig: Map[String, String],
7777
defaultSchema: StructType): StructType = {
78-
val cosmosReadConfig = CosmosSchemaInferenceConfig.parseCosmosReadConfig(userConfig)
79-
if (cosmosReadConfig.inferSchemaEnabled) {
78+
val cosmosInferenceConfig = CosmosSchemaInferenceConfig.parseCosmosInferenceConfig(userConfig)
79+
val cosmosReadConfig = CosmosReadConfig.parseCosmosReadConfig(userConfig)
80+
if (cosmosInferenceConfig.inferSchemaEnabled) {
8081
val cosmosContainerConfig = CosmosContainerConfig.parseCosmosContainerConfig(userConfig)
8182
val sourceContainer = ThroughputControlHelper.getContainer(userConfig, cosmosContainerConfig, client)
8283
val queryOptions = new CosmosQueryRequestOptions()
83-
queryOptions.setMaxBufferedItemCount(cosmosReadConfig.inferSchemaSamplingSize)
84-
val queryText = cosmosReadConfig.inferSchemaQuery match {
85-
case None => s"select TOP ${cosmosReadConfig.inferSchemaSamplingSize} * from c"
86-
case _ => cosmosReadConfig.inferSchemaQuery.get
84+
queryOptions.setMaxBufferedItemCount(cosmosInferenceConfig.inferSchemaSamplingSize)
85+
val queryText = cosmosInferenceConfig.inferSchemaQuery match {
86+
case None =>
87+
cosmosReadConfig.customQuery match {
88+
case None => s"select TOP ${cosmosInferenceConfig.inferSchemaSamplingSize} * from c"
89+
case _ => cosmosReadConfig.customQuery.get.queryText
90+
}
91+
case _ => cosmosInferenceConfig.inferSchemaQuery.get
8792
}
8893

8994
val pagedFluxResponse =
9095
sourceContainer.queryItems(queryText, queryOptions, classOf[ObjectNode])
9196

9297
val feedResponseList = pagedFluxResponse
93-
.take(cosmosReadConfig.inferSchemaSamplingSize)
98+
.take(cosmosInferenceConfig.inferSchemaSamplingSize)
9499
.collectList
95100
.block
96101

97102
inferSchema(feedResponseList.asScala,
98-
cosmosReadConfig.inferSchemaQuery.isDefined || cosmosReadConfig.includeSystemProperties,
99-
cosmosReadConfig.inferSchemaQuery.isDefined || cosmosReadConfig.includeTimestamp,
100-
cosmosReadConfig.allowNullForInferredProperties)
103+
cosmosInferenceConfig.inferSchemaQuery.isDefined || cosmosInferenceConfig.includeSystemProperties,
104+
cosmosInferenceConfig.inferSchemaQuery.isDefined || cosmosInferenceConfig.includeTimestamp,
105+
cosmosInferenceConfig.allowNullForInferredProperties)
101106
} else {
102107
defaultSchema
103108
}

sdk/cosmos/azure-cosmos-spark_3-1_2-12/src/test/scala/com/azure/cosmos/spark/CosmosConfigSpec.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ class CosmosConfigSpec extends UnitSpec {
222222
"spark.cosmos.read.inferSchema.query" -> customQuery
223223
)
224224

225-
val config = CosmosSchemaInferenceConfig.parseCosmosReadConfig(userConfig)
225+
val config = CosmosSchemaInferenceConfig.parseCosmosInferenceConfig(userConfig)
226226
config.inferSchemaSamplingSize shouldEqual 50
227227
config.inferSchemaEnabled shouldBe false
228228
config.includeSystemProperties shouldBe true
@@ -233,7 +233,7 @@ class CosmosConfigSpec extends UnitSpec {
233233
it should "provide default schema inference config" in {
234234
val userConfig = Map[String, String]()
235235

236-
val config = CosmosSchemaInferenceConfig.parseCosmosReadConfig(userConfig)
236+
val config = CosmosSchemaInferenceConfig.parseCosmosInferenceConfig(userConfig)
237237

238238
config.inferSchemaSamplingSize shouldEqual 1000
239239
config.inferSchemaEnabled shouldBe true

sdk/cosmos/azure-cosmos-spark_3-1_2-12/src/test/scala/com/azure/cosmos/spark/SparkE2EQueryITest.scala

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,49 @@ class SparkE2EQueryITest
609609
item.getAs[String]("id") shouldEqual id
610610
}
611611

612+
"spark query" should "use Custom Query also for inference" in {
613+
val cosmosEndpoint = TestConfigurations.HOST
614+
val cosmosMasterKey = TestConfigurations.MASTER_KEY
615+
616+
val container = cosmosClient.getDatabase(cosmosDatabase).getContainer(cosmosContainer)
617+
for (state <- Array(true, false)) {
618+
val objectNode = Utils.getSimpleObjectMapper.createObjectNode()
619+
objectNode.put("name", "Shrodigner's duck")
620+
objectNode.put("type", "duck")
621+
objectNode.put("age", 20)
622+
objectNode.put("isAlive", state)
623+
objectNode.put("id", UUID.randomUUID().toString)
624+
container.createItem(objectNode).block()
625+
}
626+
627+
val cfgWithInference = Map("spark.cosmos.accountEndpoint" -> cosmosEndpoint,
628+
"spark.cosmos.accountKey" -> cosmosMasterKey,
629+
"spark.cosmos.database" -> cosmosDatabase,
630+
"spark.cosmos.container" -> cosmosContainer,
631+
"spark.cosmos.read.inferSchema.enabled" -> "true",
632+
"spark.cosmos.read.customQuery" ->
633+
"SELECT c.type, c.age, c.isAlive FROM c where c.type = 'duck' and c.isAlive = true",
634+
"spark.cosmos.read.partitioning.strategy" -> "Restrictive"
635+
)
636+
637+
// Not passing schema, letting inference work
638+
val dfWithInference = spark.read.format("cosmos.oltp").options(cfgWithInference).load()
639+
val rowsArrayWithInference = dfWithInference.collect()
640+
rowsArrayWithInference should have size 1
641+
642+
val rowWithInference = rowsArrayWithInference(0)
643+
rowWithInference.getAs[String]("type") shouldEqual "duck"
644+
rowWithInference.getAs[Integer]("age") shouldEqual 20
645+
rowWithInference.getAs[Boolean]("isAlive") shouldEqual true
646+
647+
val fieldNames = rowWithInference.schema.fields.map(field => field.name)
648+
fieldNames.contains(CosmosTableSchemaInferrer.SelfAttributeName) shouldBe false
649+
fieldNames.contains(CosmosTableSchemaInferrer.TimestampAttributeName) shouldBe false
650+
fieldNames.contains(CosmosTableSchemaInferrer.ResourceIdAttributeName) shouldBe false
651+
fieldNames.contains(CosmosTableSchemaInferrer.ETagAttributeName) shouldBe false
652+
fieldNames.contains(CosmosTableSchemaInferrer.AttachmentsAttributeName) shouldBe false
653+
}
654+
612655
//scalastyle:on magic.number
613656
//scalastyle:on multiple.string.literals
614657
}

0 commit comments

Comments (0)