# Spark Sink in Scala

Description
-----------
Executes user-provided Spark code in Scala that operates on the input RDD or DataFrame with full
access to all Spark features.

Use Case
--------
This plugin can be used when you want complete control over the Spark computation.
For example, you may want to join the input RDD with another Dataset and select a subset
of the join result using Spark SQL before writing the results out to files in Parquet format.

Properties
----------
**scalaCode** Spark code in Scala defining how the input RDD or DataFrame should be written out.
The code must implement a function called ``sink``, whose signature should be one of:

    def sink(df: DataFrame) : Unit

    def sink(df: DataFrame, context: SparkExecutionPluginContext) : Unit

The input ``DataFrame`` has the same schema as the input schema to this stage.
The ``SparkExecutionPluginContext`` provides access to CDAP entities such as Datasets,
as well as to the underlying ``SparkContext`` in use.
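
For example, a ``sink`` implementing the use case described above might look like the following
sketch. The ``lookup`` Parquet directory, the ``id`` and ``name`` columns, and the ``output`` path
are hypothetical and would need to be adapted to your own data:

    def sink(df: DataFrame, context: SparkExecutionPluginContext) : Unit = {
      // Load a reference DataFrame; the 'lookup' Parquet directory is a hypothetical example
      val lookup = df.sqlContext.read.parquet("lookup")

      // Join the input with the reference data on the hypothetical 'id' column,
      // keep a subset of the columns, and write the result out as Parquet files
      df.join(lookup, "id")
        .select("id", "name")
        .write
        .parquet("output")
    }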

Operating on the lower-level ``RDD`` is also possible by using one of the following forms of the ``sink`` method:

    def sink(rdd: RDD[StructuredRecord]) : Unit

    def sink(rdd: RDD[StructuredRecord], context: SparkExecutionPluginContext) : Unit

For example:

    def sink(rdd: RDD[StructuredRecord], context: SparkExecutionPluginContext) : Unit = {
      // The stage's output schema is available from the context (unused in this simple example)
      val outputSchema = context.getOutputSchema
      rdd
        .flatMap(_.get[String]("body").split("\\s+"))  // split the 'body' field into words
        .map(s => (s, 1))                              // pair each word with a count of 1
        .reduceByKey(_ + _)                            // sum the counts for each word
        .saveAsTextFile("output")                      // write the results as text files
    }

This will perform a word count on the input field ``body``, then write out the results as a text file.

The following imports are included automatically and are ready for the user code to use:

    import co.cask.cdap.api.data.format._
    import co.cask.cdap.api.data.schema._
    import co.cask.cdap.etl.api.batch._
    import org.apache.spark._
    import org.apache.spark.api.java._
    import org.apache.spark.rdd._
    import org.apache.spark.sql._
    import org.apache.spark.SparkContext._
    import scala.collection.JavaConversions._


**deployCompile** Specifies whether the code is validated at pipeline creation time. Setting this to ``false``
will skip the validation.