 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.function.Function;
 import org.apache.spark.rdd.RDD;
-import org.apache.spark.sql.DataFrame;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.DataType;
 import org.apache.spark.sql.types.StructType;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

+import java.io.File;
 import java.io.IOException;
 import java.io.PrintWriter;
 import java.io.StringWriter;
 import java.lang.reflect.Method;
 import java.lang.reflect.ParameterizedType;
 import java.lang.reflect.Type;
+import java.nio.file.Files;
 import javax.annotation.Nullable;

 /**
 @Description("Executes user-provided Spark code written in Scala that performs RDD to RDD transformation")
 public class ScalaSparkCompute extends SparkCompute<StructuredRecord, StructuredRecord> {

+  private static final Logger LOG = LoggerFactory.getLogger(ScalaSparkCompute.class);
+
   private static final String CLASS_NAME_PREFIX = "co.cask.hydrator.plugin.spark.dynamic.generated.UserSparkCompute$";
+  private static final Class<?> DATAFRAME_TYPE = getDataFrameType();
   private static final Class<?>[][] ACCEPTABLE_PARAMETER_TYPES = new Class<?>[][] {
     { RDD.class, SparkExecutionPluginContext.class },
     { RDD.class },
-    { DataFrame.class, SparkExecutionPluginContext.class },
-    { DataFrame.class }
+    { DATAFRAME_TYPE, SparkExecutionPluginContext.class },
+    { DATAFRAME_TYPE }
   };

   private final ThreadLocal<SQLContext> sqlContextThreadLocal = new InheritableThreadLocal<>();
@@ -90,10 +97,16 @@ public void configurePipeline(PipelineConfigurer pipelineConfigurer) throws Ille
       throw new IllegalArgumentException("Unable to parse output schema " + config.getSchema(), e);
     }

-    if (!config.containsMacro("scalaCode") && Boolean.TRUE.equals(config.getDeployCompile())) {
+    if (!config.containsMacro("scalaCode") && !config.containsMacro("dependencies")
+      && Boolean.TRUE.equals(config.getDeployCompile())) {
       SparkInterpreter interpreter = SparkCompilers.createInterpreter();
       if (interpreter != null) {
+        File dir = null;
         try {
+          if (config.getDependencies() != null) {
+            dir = Files.createTempDirectory("sparkprogram").toFile();
+            SparkCompilers.addDependencies(dir, interpreter, config.getDependencies());
+          }
           // We don't need the actual stage name as this only happen in deployment time for compilation check.
           String className = generateClassName("dummy");
           interpreter.compile(generateSourceClass(className));
@@ -102,12 +115,16 @@ public void configurePipeline(PipelineConfigurer pipelineConfigurer) throws Ille
           Method method = getTransformMethod(interpreter.getClassLoader(), className);

           // If the method takes DataFrame, make sure it has input schema
-          if (method.getParameterTypes()[0].equals(DataFrame.class) && stageConfigurer.getInputSchema() == null) {
+          if (method.getParameterTypes()[0].equals(DATAFRAME_TYPE) && stageConfigurer.getInputSchema() == null) {
             throw new IllegalArgumentException("Missing input schema for transformation using DataFrame");
           }

         } catch (CompilationFailureException e) {
           throw new IllegalArgumentException(e.getMessage(), e);
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        } finally {
+          SparkCompilers.deleteDir(dir);
         }
       }
     }
@@ -117,9 +134,17 @@ public void configurePipeline(PipelineConfigurer pipelineConfigurer) throws Ille
   public void initialize(SparkExecutionPluginContext context) throws Exception {
     String className = generateClassName(context.getStageName());
     interpreter = context.createSparkInterpreter();
+    File dir = config.getDependencies() == null ? null : Files.createTempDirectory("sparkprogram").toFile();
+    try {
+      if (config.getDependencies() != null) {
+        SparkCompilers.addDependencies(dir, interpreter, config.getDependencies());
+      }
       interpreter.compile(generateSourceClass(className));
       method = getTransformMethod(interpreter.getClassLoader(), className);
-    isDataFrame = method.getParameterTypes()[0].equals(DataFrame.class);
+    } finally {
+      SparkCompilers.deleteDir(dir);
+    }
+    isDataFrame = method.getParameterTypes()[0].equals(DATAFRAME_TYPE);
     takeContext = method.getParameterTypes().length == 2;

     // Input schema shouldn't be null
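Both the deploy-time check above and initialize follow the same lifecycle for the new dependencies option: create a scratch directory only when dependencies are configured, hand the jars to the interpreter before compiling, and always remove the directory in a finally block. Below is a minimal, self-contained sketch of that pattern; resolveDependencies and deleteRecursively are hypothetical stand-ins for SparkCompilers.addDependencies and SparkCompilers.deleteDir, and the compile step is elided.

    import java.io.File;
    import java.io.IOException;
    import java.nio.file.Files;

    public final class DependencyScratchDirSketch {

      public static void compileWithDependencies(String scalaCode, String dependencies) throws IOException {
        // Scratch directory is only needed when extra dependency jars were configured.
        File dir = dependencies == null ? null : Files.createTempDirectory("sparkprogram").toFile();
        try {
          if (dir != null) {
            resolveDependencies(dir, dependencies);   // hypothetical stand-in for SparkCompilers.addDependencies
          }
          // ... compile 'scalaCode' with the interpreter here ...
        } finally {
          deleteRecursively(dir);                     // hypothetical stand-in for SparkCompilers.deleteDir
        }
      }

      // Hypothetical: fetch each ','-separated URI into 'dir' so the compiler classpath can pick the jars up.
      private static void resolveDependencies(File dir, String dependencies) throws IOException {
      }

      // Hypothetical: remove the scratch directory recursively; tolerates null, as the finally blocks above require.
      private static void deleteRecursively(File dir) {
        if (dir == null) {
          return;
        }
        File[] children = dir.listFiles();
        if (children != null) {
          for (File child : children) {
            deleteRecursively(child);
          }
        }
        dir.delete();
      }
    }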
@@ -154,18 +179,18 @@ public JavaRDD<StructuredRecord> transform(SparkExecutionPluginContext context,
     StructType rowType = DataFrames.toDataType(inputSchema);
     JavaRDD<Row> rowRDD = javaRDD.map(new RecordToRow(rowType));

-    DataFrame dataFrame = sqlContext.createDataFrame(rowRDD, rowType);
-    DataFrame result = (DataFrame) (takeContext ?
-      method.invoke(null, dataFrame, context) : method.invoke(null, dataFrame));
+    Object dataFrame = sqlContext.createDataFrame(rowRDD, rowType);
+    Object result = takeContext ? method.invoke(null, dataFrame, context) : method.invoke(null, dataFrame);

     // Convert the DataFrame back to RDD<StructureRecord>
     Schema outputSchema = context.getOutputSchema();
     if (outputSchema == null) {
       // If there is no output schema configured, derive it from the DataFrame
       // Otherwise, assume the DataFrame has the correct schema already
-      outputSchema = DataFrames.toSchema(result.schema());
+      outputSchema = DataFrames.toSchema((DataType) invokeDataFrameMethod(result, "schema"));
     }
-    return result.toJavaRDD().map(new RowToRecord(outputSchema));
+    //noinspection unchecked
+    return ((JavaRDD<Row>) invokeDataFrameMethod(result, "toJavaRDD")).map(new RowToRecord(outputSchema));
   }

   private String generateSourceClass(String className) {
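The transform hunk above is the heart of the Spark 1/Spark 2 compatibility change: the frame returned by the user code is held as a plain Object, and its schema()/toJavaRDD() methods are looked up by name at runtime, so the compiled plugin never references org.apache.spark.sql.DataFrame directly. A minimal sketch of that reflective pattern follows; it uses an ordinary String as the stand-in target so it runs without Spark on the classpath, and invokeNoArg mirrors the invokeDataFrameMethod helper added later in this diff.

    public final class ReflectiveCallSketch {

      // Mirrors invokeDataFrameMethod: resolve a no-argument public method by name on the
      // runtime class of the target and invoke it.
      @SuppressWarnings("unchecked")
      private static <T> T invokeNoArg(Object target, String methodName) throws Exception {
        return (T) target.getClass().getMethod(methodName).invoke(target);
      }

      public static void main(String[] args) throws Exception {
        Object target = "a DataFrame or Dataset would go here";   // stand-in; no Spark types needed at compile time
        Integer length = invokeNoArg(target, "length");           // analogous to invokeDataFrameMethod(result, "schema")
        System.out.println("length() resolved reflectively: " + length);
      }
    }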
@@ -251,7 +276,7 @@ private Method getTransformMethod(ClassLoader classLoader, String className) {
     Type[] parameterTypes = method.getGenericParameterTypes();

     // The first parameter should be of type RDD[StructuredRecord] if it takes RDD
-    if (!parameterTypes[0].equals(DataFrame.class)) {
+    if (!parameterTypes[0].equals(DATAFRAME_TYPE)) {
       validateRDDType(parameterTypes[0],
                       "The first parameter of the 'transform' method should have type as 'RDD[StructuredRecord]'");
     }
@@ -264,8 +289,8 @@ private Method getTransformMethod(ClassLoader classLoader, String className) {

     // The return type of the method must be RDD[StructuredRecord] if it takes RDD
     // Or it must be DataFrame if it takes DataFrame
-    if (parameterTypes[0].equals(DataFrame.class)) {
-      if (!method.getReturnType().equals(DataFrame.class)) {
+    if (parameterTypes[0].equals(DATAFRAME_TYPE)) {
+      if (!method.getReturnType().equals(DATAFRAME_TYPE)) {
         throw new IllegalArgumentException("The return type of the 'transform' method should be 'DataFrame'");
       }
     } else {
@@ -323,6 +348,16 @@ public static final class Config extends PluginConfig {
     @Macro
     private final String scalaCode;

+    @Description(
+      "Extra dependencies for the Spark program. " +
+      "It is a ',' separated list of URI for the location of dependency jars. " +
+      "A path can be ended with an asterisk '*' as a wildcard, in which all files with extension '.jar' under the " +
+      "parent path will be included."
+    )
+    @Macro
+    @Nullable
+    private final String dependencies;
+
     @Description("The schema of output objects. If no schema is given, it is assumed that the output schema is " +
       "the same as the input schema.")
     @Nullable
@@ -334,9 +369,11 @@ public static final class Config extends PluginConfig {
     @Nullable
     private final Boolean deployCompile;

-    public Config(String scalaCode, @Nullable String schema, @Nullable Boolean deployCompile) {
+    public Config(String scalaCode, @Nullable String schema, @Nullable String dependencies,
+                  @Nullable Boolean deployCompile) {
       this.scalaCode = scalaCode;
       this.schema = schema;
+      this.dependencies = dependencies;
       this.deployCompile = deployCompile;
     }

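For reference, the new constructor above takes the dependencies string in the format documented by the @Description added earlier in this diff: a ','-separated list of jar URIs, where a path ending in '*' pulls in every '.jar' under the parent directory. A hedged construction sketch, as a unit test might write it; only the constructor signature comes from the diff, while the package in the import, the paths, and the Scala snippet are illustrative assumptions.

    // Package name is an assumption inferred from the CLASS_NAME_PREFIX constant in this file;
    // adjust it to wherever ScalaSparkCompute actually lives.
    import co.cask.hydrator.plugin.spark.dynamic.ScalaSparkCompute;

    public final class ConfigSketch {
      public static ScalaSparkCompute.Config exampleConfig() {
        return new ScalaSparkCompute.Config(
          "def transform(df: DataFrame): DataFrame = df",          // scalaCode: user transform (illustrative)
          null,                                                    // schema: null means reuse the input schema
          "file:/opt/plugins/udfs.jar,file:/opt/plugins/libs/*",   // dependencies: ','-separated URIs, '*' = all jars in dir
          true                                                     // deployCompile: compile-check at deployment time
        );
      }
    }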
@@ -349,6 +386,11 @@ public String getSchema() {
       return schema;
     }

+    @Nullable
+    public String getDependencies() {
+      return dependencies;
+    }
+
     @Nullable
     public Boolean getDeployCompile() {
       return deployCompile;
@@ -388,4 +430,26 @@ public StructuredRecord call(Row row) throws Exception {
       return DataFrames.fromRow(row, schema);
     }
   }
+
+  @Nullable
+  private static Class<?> getDataFrameType() {
+    // For Spark1, it has the DataFrame class
+    // For Spark2, there is no more DataFrame class, and it becomes Dataset<Row>
+    try {
+      return ScalaSparkCompute.class.getClassLoader().loadClass("org.apache.spark.sql.DataFrame");
+    } catch (ClassNotFoundException e) {
+      try {
+        return ScalaSparkCompute.class.getClassLoader().loadClass("org.apache.spark.sql.Dataset");
+      } catch (ClassNotFoundException e1) {
+        LOG.warn("Failed to determine the type of Spark DataFrame. " +
+                   "DataFrame is not supported in the ScalaSparkCompute plugin.");
+        return null;
+      }
+    }
+  }
+
+  private static <T> T invokeDataFrameMethod(Object dataFrame, String methodName) throws Exception {
+    //noinspection unchecked
+    return (T) dataFrame.getClass().getMethod(methodName).invoke(dataFrame);
+  }
 }