From 5c3a3b897afbbb36e7f40c8bbbde4686511f5a6d Mon Sep 17 00:00:00 2001
From: lucasbm88
Date: Mon, 6 Nov 2017 18:34:48 -0200
Subject: [PATCH 01/19] Adding base and marvin-spark docker files

---
 docker/marvin-base-docker/Dockerfile     |  27 +++++
 docker/marvin-base-docker/README.md      |  20 ++++
 docker/marvin-base-docker/build.sh       |   9 ++
 docker/marvin-spark-docker/Dockerfile    |  28 +++++
 docker/marvin-spark-docker/README.md     |  20 ++++
 docker/marvin-spark-docker/build.sh      |   9 ++
 docker/marvin-spark-docker/core-site.xml | 145 +++++++++++++++++++++++
 docker/marvin-spark-docker/hdfs-site.xml | 101 ++++++++++++++++
 8 files changed, 359 insertions(+)
 create mode 100644 docker/marvin-base-docker/Dockerfile
 create mode 100644 docker/marvin-base-docker/README.md
 create mode 100755 docker/marvin-base-docker/build.sh
 create mode 100644 docker/marvin-spark-docker/Dockerfile
 create mode 100644 docker/marvin-spark-docker/README.md
 create mode 100755 docker/marvin-spark-docker/build.sh
 create mode 100644 docker/marvin-spark-docker/core-site.xml
 create mode 100644 docker/marvin-spark-docker/hdfs-site.xml

diff --git a/docker/marvin-base-docker/Dockerfile b/docker/marvin-base-docker/Dockerfile
new file mode 100644
index 0000000..294d9c9
--- /dev/null
+++ b/docker/marvin-base-docker/Dockerfile
@@ -0,0 +1,27 @@
+FROM debian:jessie
+
+RUN echo "deb http://http.debian.net/debian jessie-backports main" >> /etc/apt/sources.list \
+    && apt-get update \
+    && apt-get install -y software-properties-common curl wget \
+    && apt-get install -yt jessie-backports openjdk-8-jdk \
+    && apt-get install -y git \
+    && apt-get -qy install python2.7-dev python-pip ipython libsasl2-dev gcc \
+    && apt-get -qy install libffi-dev \
+    && apt-get -qy install libssl-dev \
+    && apt-get -qy install libxml2-dev libxslt1-dev \
+    && apt-get -qy install libpng12-dev libfreetype6-dev \
+    && apt-get -qy install python-tk
+
+#Engines will run using the user marvin
+RUN useradd --create-home -s /bin/bash -G sudo marvin
+
+##Install virtualenv
+ENV WORKON_HOME /home/marvin/.virtualenvs
+RUN pip install -q virtualenvwrapper \
+    && echo 'source /usr/local/bin/virtualenvwrapper.sh' >> /home/marvin/.profile
+
+#Take ownership of needed folders
+RUN chown -R marvin:marvin /opt
+
+USER marvin
+WORKDIR /home/marvin
diff --git a/docker/marvin-base-docker/README.md b/docker/marvin-base-docker/README.md
new file mode 100644
index 0000000..7d0d401
--- /dev/null
+++ b/docker/marvin-base-docker/README.md
@@ -0,0 +1,20 @@
+### Marvin Prediction-IO Docker Image
+
+This Docker project builds a **Prediction-IO** image that supports tasks such as:
+
+- Building and testing engines
+- Creating Docker containers to run `pio train` and `pio deploy`
+
+
+## How to use
+
+To build the image, run `sh build.sh <tag>`, specifying the tag for your image.
+
+## Dependencies
+
+To build this project, the following dependencies must be met:
+
+- Elasticsearch download - The build process will try to download Elasticsearch **1.7.5** from the internet.
+- Spark download - The build process will try to download Spark **1.6.3** from the internet.
+- Prediction-IO download - The build process will try to download Prediction-IO **0.10.0** (B2W fork) from the internal network.
+- SBT cache files - To optimize build time, the build process will try to download custom ivy and sbt tars from the internal network. These files are not required for the build; if they are missing, you can remove them from the Dockerfile.
\ No newline at end of file
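The base Dockerfile above leaves virtualenvwrapper sourced in the marvin user's login shell. A minimal sketch of creating an engine environment inside a container built from this image; the image tag and environment name are illustrative, not taken from the patch:

    docker run -it --rm marvinai/marvin-base /bin/bash
    # inside the container, running as user marvin:
    source /usr/local/bin/virtualenvwrapper.sh
    mkvirtualenv my-engine-env    # created under WORKON_HOME (/home/marvin/.virtualenvs)
    workon my-engine-env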
diff --git a/docker/marvin-base-docker/build.sh b/docker/marvin-base-docker/build.sh
new file mode 100755
index 0000000..d98887f
--- /dev/null
+++ b/docker/marvin-base-docker/build.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+if [ -z "$1" ]
+  then
+    echo "You must specify the version of the image being built"
+    exit 1
+fi
+docker build -t registry.b2w.io/b2wdigital/predictionio-b2w:"$1" .
+
+
diff --git a/docker/marvin-spark-docker/Dockerfile b/docker/marvin-spark-docker/Dockerfile
new file mode 100644
index 0000000..263b487
--- /dev/null
+++ b/docker/marvin-spark-docker/Dockerfile
@@ -0,0 +1,28 @@
+FROM marvinai/marvin-base
+
+USER root
+
+#ADD http://archive.apache.org/dist/spark/spark-2.1.1/spark-2.1.1-bin-hadoop2.6.tgz /opt/
+
+#Unpack tgzs
+#RUN tar -zxvf /opt/spark-2.1.1-bin-hadoop2.6.tgz --directory=/opt/ \
+#    && mv /opt/spark-2.1.1-bin-hadoop2.6 /opt/spark
+
+ENV SPARK_HOME /opt/spark
+ENV HADOOP_CONF_DIR /opt/spark/conf
+
+#Add configuration files
+ADD hdfs-site.xml /opt/spark/conf
+ADD core-site.xml /opt/spark/conf
+
+USER marvin
+
+RUN git clone https://github.com/marvin-ai/marvin-python-toolbox.git \
+    && cd marvin-python-toolbox \
+    && bash -c "source /usr/local/bin/virtualenvwrapper.sh && mkvirtualenv python-toolbox-env \
+    && setvirtualenvproject && make marvin"
+
+ENTRYPOINT bash -c 'source /usr/local/bin/virtualenvwrapper.sh && workon python-toolbox-env && marvin --help' \
+    && "${@}"
+
+CMD /bin/bash
diff --git a/docker/marvin-spark-docker/README.md b/docker/marvin-spark-docker/README.md
new file mode 100644
index 0000000..7d0d401
--- /dev/null
+++ b/docker/marvin-spark-docker/README.md
@@ -0,0 +1,20 @@
+### Marvin Prediction-IO Docker Image
+
+This Docker project builds a **Prediction-IO** image that supports tasks such as:
+
+- Building and testing engines
+- Creating Docker containers to run `pio train` and `pio deploy`
+
+
+## How to use
+
+To build the image, run `sh build.sh <tag>`, specifying the tag for your image.
+
+## Dependencies
+
+To build this project, the following dependencies must be met:
+
+- Elasticsearch download - The build process will try to download Elasticsearch **1.7.5** from the internet.
+- Spark download - The build process will try to download Spark **1.6.3** from the internet.
+- Prediction-IO download - The build process will try to download Prediction-IO **0.10.0** (B2W fork) from the internal network.
+- SBT cache files - To optimize build time, the build process will try to download custom ivy and sbt tars from the internal network. These files are not required for the build; if they are missing, you can remove them from the Dockerfile.
\ No newline at end of file
diff --git a/docker/marvin-spark-docker/build.sh b/docker/marvin-spark-docker/build.sh
new file mode 100755
index 0000000..d98887f
--- /dev/null
+++ b/docker/marvin-spark-docker/build.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+if [ -z "$1" ]
+  then
+    echo "You must specify the version of the image being built"
+    exit 1
+fi
+docker build -t registry.b2w.io/b2wdigital/predictionio-b2w:"$1" .
+
+
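Note that the marvin-spark Dockerfile above starts FROM marvinai/marvin-base, while both build.sh scripts tag their images as registry.b2w.io/b2wdigital/predictionio-b2w, so the base image also needs to exist locally under the name the spark build expects. A hedged sketch of the build order, with illustrative tags:

    # build the base image first, under the name the spark Dockerfile pulls
    docker build -t marvinai/marvin-base docker/marvin-base-docker/
    # then build the spark image on top of it
    docker build -t marvinai/marvin-spark docker/marvin-spark-docker/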
diff --git a/docker/marvin-spark-docker/core-site.xml b/docker/marvin-spark-docker/core-site.xml
new file mode 100644
index 0000000..074bd08
--- /dev/null
+++ b/docker/marvin-spark-docker/core-site.xml
@@ -0,0 +1,145 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--Autogenerated by Cloudera Manager-->
+<configuration>
+  <property>
+    <name>fs.defaultFS</name>
+    <value>hdfs://nameservice1</value>
+  </property>
+  <property>
+    <name>fs.trash.interval</name>
+    <value>1</value>
+  </property>
+  <property>
+    <name>io.compression.codecs</name>
+    <value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.DeflateCodec,org.apache.hadoop.io.compress.SnappyCodec,org.apache.hadoop.io.compress.Lz4Codec</value>
+  </property>
+  <property>
+    <name>hadoop.security.authentication</name>
+    <value>simple</value>
+  </property>
+  <property>
+    <name>hadoop.security.authorization</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>hadoop.rpc.protection</name>
+    <value>authentication</value>
+  </property>
+  <property>
+    <name>hadoop.security.auth_to_local</name>
+    <value>DEFAULT</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.oozie.hosts</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.oozie.groups</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.mapred.hosts</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.mapred.groups</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.flume.hosts</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.flume.groups</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.HTTP.hosts</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.HTTP.groups</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.hive.hosts</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.hive.groups</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.hue.hosts</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.hue.groups</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.httpfs.hosts</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.httpfs.groups</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.hdfs.groups</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.hdfs.hosts</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.yarn.hosts</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.yarn.groups</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.security.group.mapping</name>
+    <value>org.apache.hadoop.security.ShellBasedUnixGroupsMapping</value>
+  </property>
+  <property>
+    <name>hadoop.security.instrumentation.requires.admin</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>net.topology.script.file.name</name>
+    <value>/etc/hadoop/conf.cloudera.yarn/topology.py</value>
+  </property>
+  <property>
+    <name>io.file.buffer.size</name>
+    <value>65536</value>
+  </property>
+  <property>
+    <name>hadoop.ssl.enabled</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>hadoop.ssl.require.client.cert</name>
+    <value>false</value>
+    <final>true</final>
+  </property>
+  <property>
+    <name>hadoop.ssl.keystores.factory.class</name>
+    <value>org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory</value>
+    <final>true</final>
+  </property>
+  <property>
+    <name>hadoop.ssl.server.conf</name>
+    <value>ssl-server.xml</value>
+    <final>true</final>
+  </property>
+  <property>
+    <name>hadoop.ssl.client.conf</name>
+    <value>ssl-client.xml</value>
+    <final>true</final>
+  </property>
+</configuration>
diff --git a/docker/marvin-spark-docker/hdfs-site.xml b/docker/marvin-spark-docker/hdfs-site.xml
new file mode 100644
index 0000000..a0ef472
--- /dev/null
+++ b/docker/marvin-spark-docker/hdfs-site.xml
@@ -0,0 +1,101 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--Autogenerated by Cloudera Manager-->
+<configuration>
+  <property>
+    <name>dfs.nameservices</name>
+    <value>nameservice1</value>
+  </property>
+  <property>
+    <name>dfs.client.failover.proxy.provider.nameservice1</name>
+    <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
+  </property>
+  <property>
+    <name>dfs.ha.automatic-failover.enabled.nameservice1</name>
+    <value>true</value>
+  </property>
+  <property>
+    <name>ha.zookeeper.quorum</name>
+    <value>lvdn001-priv.b2w:2181,lvnn-priv.b2w:2181,lvsb-priv.b2w:2181</value>
+  </property>
+  <property>
+    <name>dfs.ha.namenodes.nameservice1</name>
+    <value>namenode118,namenode167</value>
+  </property>
+  <property>
+    <name>dfs.namenode.rpc-address.nameservice1.namenode118</name>
+    <value>lvnn-priv.b2w:8020</value>
+  </property>
+  <property>
+    <name>dfs.namenode.servicerpc-address.nameservice1.namenode118</name>
+    <value>lvnn-priv.b2w:8022</value>
+  </property>
+  <property>
+    <name>dfs.namenode.http-address.nameservice1.namenode118</name>
+    <value>lvnn-priv.b2w:50070</value>
+  </property>
+  <property>
+    <name>dfs.namenode.https-address.nameservice1.namenode118</name>
+    <value>lvnn-priv.b2w:50470</value>
+  </property>
+  <property>
+    <name>dfs.namenode.rpc-address.nameservice1.namenode167</name>
+    <value>lvsb-priv.b2w:8020</value>
+  </property>
+  <property>
+    <name>dfs.namenode.servicerpc-address.nameservice1.namenode167</name>
+    <value>lvsb-priv.b2w:8022</value>
+  </property>
+  <property>
+    <name>dfs.namenode.http-address.nameservice1.namenode167</name>
+    <value>lvsb-priv.b2w:50070</value>
+  </property>
+  <property>
+    <name>dfs.namenode.https-address.nameservice1.namenode167</name>
+    <value>lvsb-priv.b2w:50470</value>
+  </property>
+  <property>
+    <name>dfs.replication</name>
+    <value>3</value>
+  </property>
+  <property>
+    <name>dfs.blocksize</name>
+    <value>134217728</value>
+  </property>
+  <property>
+    <name>dfs.client.use.datanode.hostname</name>
+    <value>true</value>
+  </property>
+  <property>
+    <name>fs.permissions.umask-mode</name>
+    <value>022</value>
+  </property>
+  <property>
+    <name>dfs.namenode.acls.enabled</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>dfs.client.use.legacy.blockreader</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>dfs.client.read.shortcircuit</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>dfs.domain.socket.path</name>
+    <value>/var/run/hdfs-sockets/dn</value>
+  </property>
+  <property>
+    <name>dfs.client.read.shortcircuit.skip.checksum</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>dfs.client.domain.socket.data.traffic</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>dfs.datanode.hdfs-blocks-metadata.enabled</name>
+    <value>true</value>
+  </property>
+</configuration>

From 8511b9a8cf725f4694618c7c3b03c75f2997d4f5 Mon Sep 17 00:00:00 2001
From: lucasbm88
Date: Wed, 8 Nov 2017 17:06:16 -0200
Subject: [PATCH 02/19] Moving docker to templates

---
 docker/marvin-spark-docker/core-site.xml      | 145 ------------------
 docker/marvin-spark-docker/hdfs-site.xml      | 101 ------------
 marvin_python_toolbox/management/engine.py    |   2 +
 .../docker}/marvin-base-docker/Dockerfile     |   0
 .../docker}/marvin-base-docker/README.md      |   0
 .../docker}/marvin-base-docker/build.sh       |   0
 .../docker}/marvin-spark-docker/Dockerfile    |   8 +-
 .../docker}/marvin-spark-docker/README.md     |   0
 .../docker}/marvin-spark-docker/build.sh      |   0
 9 files changed, 7 insertions(+), 249 deletions(-)
 delete mode 100644 docker/marvin-spark-docker/core-site.xml
 delete mode 100644 docker/marvin-spark-docker/hdfs-site.xml
 rename {docker => marvin_python_toolbox/management/templates/python-engine/docker}/marvin-base-docker/Dockerfile (100%)
 rename {docker => marvin_python_toolbox/management/templates/python-engine/docker}/marvin-base-docker/README.md (100%)
 rename {docker => marvin_python_toolbox/management/templates/python-engine/docker}/marvin-base-docker/build.sh (100%)
 rename {docker => marvin_python_toolbox/management/templates/python-engine/docker}/marvin-spark-docker/Dockerfile (73%)
 rename {docker => marvin_python_toolbox/management/templates/python-engine/docker}/marvin-spark-docker/README.md (100%)
 rename {docker => marvin_python_toolbox/management/templates/python-engine/docker}/marvin-spark-docker/build.sh (100%)

diff --git a/docker/marvin-spark-docker/core-site.xml b/docker/marvin-spark-docker/core-site.xml
deleted file mode 100644
index 074bd08..0000000
--- a/docker/marvin-spark-docker/core-site.xml
+++ /dev/null
@@ -1,145 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<!--Autogenerated by Cloudera Manager-->
-<configuration>
-  <property>
-    <name>fs.defaultFS</name>
-    <value>hdfs://nameservice1</value>
-  </property>
-  <property>
-    <name>fs.trash.interval</name>
-    <value>1</value>
-  </property>
-  <property>
-    <name>io.compression.codecs</name>
-    <value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.DeflateCodec,org.apache.hadoop.io.compress.SnappyCodec,org.apache.hadoop.io.compress.Lz4Codec</value>
-  </property>
-  <property>
-    <name>hadoop.security.authentication</name>
-    <value>simple</value>
-  </property>
-  <property>
-    <name>hadoop.security.authorization</name>
-    <value>false</value>
-  </property>
-  <property>
-    <name>hadoop.rpc.protection</name>
-    <value>authentication</value>
-  </property>
-  <property>
-    <name>hadoop.security.auth_to_local</name>
-    <value>DEFAULT</value>
-  </property>
-  <property>
-    <name>hadoop.proxyuser.oozie.hosts</name>
-    <value>*</value>
-  </property>
-  <property>
-    <name>hadoop.proxyuser.oozie.groups</name>
-    <value>*</value>
-  </property>
-  <property>
-    <name>hadoop.proxyuser.mapred.hosts</name>
-    <value>*</value>
-  </property>
-  <property>
-    <name>hadoop.proxyuser.mapred.groups</name>
-    <value>*</value>
-  </property>
-  <property>
-    <name>hadoop.proxyuser.flume.hosts</name>
-    <value>*</value>
-  </property>
-  <property>
-    <name>hadoop.proxyuser.flume.groups</name>
-    <value>*</value>
-  </property>
-  <property>
-    <name>hadoop.proxyuser.HTTP.hosts</name>
-    <value>*</value>
-  </property>
-  <property>
-    <name>hadoop.proxyuser.HTTP.groups</name>
-    <value>*</value>
-  </property>
-  <property>
-    <name>hadoop.proxyuser.hive.hosts</name>
-    <value>*</value>
-  </property>
-  <property>
-    <name>hadoop.proxyuser.hive.groups</name>
-    <value>*</value>
-  </property>
-  <property>
-    <name>hadoop.proxyuser.hue.hosts</name>
-    <value>*</value>
-  </property>
-  <property>
-    <name>hadoop.proxyuser.hue.groups</name>
-    <value>*</value>
-  </property>
-  <property>
-    <name>hadoop.proxyuser.httpfs.hosts</name>
-    <value>*</value>
-  </property>
-  <property>
-    <name>hadoop.proxyuser.httpfs.groups</name>
-    <value>*</value>
-  </property>
-  <property>
-    <name>hadoop.proxyuser.hdfs.groups</name>
-    <value>*</value>
-  </property>
-  <property>
-    <name>hadoop.proxyuser.hdfs.hosts</name>
-    <value>*</value>
-  </property>
-  <property>
-    <name>hadoop.proxyuser.yarn.hosts</name>
-    <value>*</value>
-  </property>
-  <property>
-    <name>hadoop.proxyuser.yarn.groups</name>
-    <value>*</value>
-  </property>
-  <property>
-    <name>hadoop.security.group.mapping</name>
-    <value>org.apache.hadoop.security.ShellBasedUnixGroupsMapping</value>
-  </property>
-  <property>
-    <name>hadoop.security.instrumentation.requires.admin</name>
-    <value>false</value>
-  </property>
-  <property>
-    <name>net.topology.script.file.name</name>
-    <value>/etc/hadoop/conf.cloudera.yarn/topology.py</value>
-  </property>
-  <property>
-    <name>io.file.buffer.size</name>
-    <value>65536</value>
-  </property>
-  <property>
-    <name>hadoop.ssl.enabled</name>
-    <value>false</value>
-  </property>
-  <property>
-    <name>hadoop.ssl.require.client.cert</name>
-    <value>false</value>
-    <final>true</final>
-  </property>
-  <property>
-    <name>hadoop.ssl.keystores.factory.class</name>
-    <value>org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory</value>
-    <final>true</final>
-  </property>
-  <property>
-    <name>hadoop.ssl.server.conf</name>
-    <value>ssl-server.xml</value>
-    <final>true</final>
-  </property>
-  <property>
-    <name>hadoop.ssl.client.conf</name>
-    <value>ssl-client.xml</value>
-    <final>true</final>
-  </property>
-</configuration>
diff --git a/docker/marvin-spark-docker/hdfs-site.xml b/docker/marvin-spark-docker/hdfs-site.xml
deleted file mode 100644
index a0ef472..0000000
--- a/docker/marvin-spark-docker/hdfs-site.xml
+++ /dev/null
@@ -1,101 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-
-<!--Autogenerated by Cloudera Manager-->
-<configuration>
-  <property>
-    <name>dfs.nameservices</name>
-    <value>nameservice1</value>
-  </property>
-  <property>
-    <name>dfs.client.failover.proxy.provider.nameservice1</name>
-    <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
-  </property>
-  <property>
-    <name>dfs.ha.automatic-failover.enabled.nameservice1</name>
-    <value>true</value>
-  </property>
-  <property>
-    <name>ha.zookeeper.quorum</name>
-    <value>lvdn001-priv.b2w:2181,lvnn-priv.b2w:2181,lvsb-priv.b2w:2181</value>
-  </property>
-  <property>
-    <name>dfs.ha.namenodes.nameservice1</name>
-    <value>namenode118,namenode167</value>
-  </property>
-  <property>
-    <name>dfs.namenode.rpc-address.nameservice1.namenode118</name>
-    <value>lvnn-priv.b2w:8020</value>
-  </property>
-  <property>
-    <name>dfs.namenode.servicerpc-address.nameservice1.namenode118</name>
-    <value>lvnn-priv.b2w:8022</value>
-  </property>
-  <property>
-    <name>dfs.namenode.http-address.nameservice1.namenode118</name>
-    <value>lvnn-priv.b2w:50070</value>
-  </property>
-  <property>
-    <name>dfs.namenode.https-address.nameservice1.namenode118</name>
-    <value>lvnn-priv.b2w:50470</value>
-  </property>
-  <property>
-    <name>dfs.namenode.rpc-address.nameservice1.namenode167</name>
-    <value>lvsb-priv.b2w:8020</value>
-  </property>
-  <property>
-    <name>dfs.namenode.servicerpc-address.nameservice1.namenode167</name>
-    <value>lvsb-priv.b2w:8022</value>
-  </property>
-  <property>
-    <name>dfs.namenode.http-address.nameservice1.namenode167</name>
-    <value>lvsb-priv.b2w:50070</value>
-  </property>
-  <property>
-    <name>dfs.namenode.https-address.nameservice1.namenode167</name>
-    <value>lvsb-priv.b2w:50470</value>
-  </property>
-  <property>
-    <name>dfs.replication</name>
-    <value>3</value>
-  </property>
-  <property>
-    <name>dfs.blocksize</name>
-    <value>134217728</value>
-  </property>
-  <property>
-    <name>dfs.client.use.datanode.hostname</name>
-    <value>true</value>
-  </property>
-  <property>
-    <name>fs.permissions.umask-mode</name>
-    <value>022</value>
-  </property>
-  <property>
-    <name>dfs.namenode.acls.enabled</name>
-    <value>false</value>
-  </property>
-  <property>
-    <name>dfs.client.use.legacy.blockreader</name>
-    <value>false</value>
-  </property>
-  <property>
-    <name>dfs.client.read.shortcircuit</name>
-    <value>false</value>
-  </property>
-  <property>
-    <name>dfs.domain.socket.path</name>
-    <value>/var/run/hdfs-sockets/dn</value>
-  </property>
-  <property>
-    <name>dfs.client.read.shortcircuit.skip.checksum</name>
-    <value>false</value>
-  </property>
-  <property>
-    <name>dfs.client.domain.socket.data.traffic</name>
-    <value>false</value>
-  </property>
-  <property>
-    <name>dfs.datanode.hdfs-blocks-metadata.enabled</name>
-    <value>true</value>
-  </property>
-</configuration>
diff --git a/marvin_python_toolbox/management/engine.py b/marvin_python_toolbox/management/engine.py
index 06d1c06..db168dd 100644
--- a/marvin_python_toolbox/management/engine.py
+++ b/marvin_python_toolbox/management/engine.py
@@ -541,6 +541,7 @@ def _call_git_init(dest):
     default='all',
     type=click.Choice(['all', 'acquisitor', 'tpreparator', 'trainer', 'evaluator', 'ppreparator', 'predictor']),
     help='Marvin engine action name')
+@click.option('--model-protocol', '-mp', help='Model protocol to be loaded. Useful for loading a previously trained model.', type=click.Path(exists=True))
 @click.option('--initial-dataset', '-id', help='Initial dataset file path', type=click.Path(exists=True))
 @click.option('--dataset', '-d', help='Dataset file path', type=click.Path(exists=True))
 @click.option('--model', '-m', help='Engine model file path', type=click.Path(exists=True))
@@ -578,6 +579,7 @@ def engine_httpserver(ctx, action, params_file, initial_dataset, dataset,
         '-DmarvinConfig.engineHome={}'.format(ctx.obj['config']['inidir']),
         '-DmarvinConfig.ipAddress={}'.format(http_host),
         '-DmarvinConfig.port={}'.format(http_port),
+        '-DmarvinConfig.modelProtocol={}'.format(model_protocol),
         '-jar', executor_path])
diff --git a/docker/marvin-base-docker/Dockerfile b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-base-docker/Dockerfile
similarity index 100%
rename from docker/marvin-base-docker/Dockerfile
rename to marvin_python_toolbox/management/templates/python-engine/docker/marvin-base-docker/Dockerfile
diff --git a/docker/marvin-base-docker/README.md b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-base-docker/README.md
similarity index 100%
rename from docker/marvin-base-docker/README.md
rename to marvin_python_toolbox/management/templates/python-engine/docker/marvin-base-docker/README.md
diff --git a/docker/marvin-base-docker/build.sh b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-base-docker/build.sh
similarity index 100%
rename from docker/marvin-base-docker/build.sh
rename to marvin_python_toolbox/management/templates/python-engine/docker/marvin-base-docker/build.sh
diff --git a/docker/marvin-spark-docker/Dockerfile b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/Dockerfile
similarity index 73%
rename from docker/marvin-spark-docker/Dockerfile
rename to marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/Dockerfile
index 263b487..35f70de 100644
--- a/docker/marvin-spark-docker/Dockerfile
+++ b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/Dockerfile
@@ -2,11 +2,11 @@ FROM marvinai/marvin-base
 
 USER root
 
-#ADD http://archive.apache.org/dist/spark/spark-2.1.1/spark-2.1.1-bin-hadoop2.6.tgz /opt/
+ADD http://archive.apache.org/dist/spark/spark-2.1.1/spark-2.1.1-bin-hadoop2.6.tgz /opt/
 
 #Unpack tgzs
-#RUN tar -zxvf /opt/spark-2.1.1-bin-hadoop2.6.tgz --directory=/opt/ \
-#    && mv /opt/spark-2.1.1-bin-hadoop2.6 /opt/spark
+RUN tar -zxvf /opt/spark-2.1.1-bin-hadoop2.6.tgz --directory=/opt/ \
+    && mv /opt/spark-2.1.1-bin-hadoop2.6 /opt/spark
 
 ENV SPARK_HOME /opt/spark
 ENV HADOOP_CONF_DIR /opt/spark/conf
@@ -22,6 +22,8 @@ RUN git clone https://github.com/marvin-ai/marvin-python-toolbox.git \
     && bash -c "source /usr/local/bin/virtualenvwrapper.sh && mkvirtualenv python-toolbox-env \
     && setvirtualenvproject && make marvin"
 
+ADD ../*
+
 ENTRYPOINT bash -c 'source /usr/local/bin/virtualenvwrapper.sh && workon python-toolbox-env && marvin --help' \
     && "${@}"
 
 CMD /bin/bash
diff --git a/docker/marvin-spark-docker/README.md b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/README.md
similarity index 100%
rename from docker/marvin-spark-docker/README.md
rename to marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/README.md
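The --model-protocol flag added above is forwarded to the executor as -DmarvinConfig.modelProtocol. A hypothetical invocation; the model path is made up for illustration and the remaining httpserver options are omitted:

    # start the httpserver loading a previously trained model
    marvin engine-httpserver -mp /opt/marvin/data/my-engine.model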
diff --git a/docker/marvin-spark-docker/build.sh b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/build.sh
similarity index 100%
rename from docker/marvin-spark-docker/build.sh
rename to marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/build.sh

From 4f614e4a6925c021f7174ce5eb6816f56f2702e2 Mon Sep 17 00:00:00 2001
From: lucasbm88
Date: Mon, 27 Nov 2017 18:21:00 -0200
Subject: [PATCH 03/19] - Modifying Dockerfile to include spark config files.
 Testing new entrypoint to fix virtualenv problem.

---
 .../docker/marvin-spark-docker/Dockerfile     | 39 ++++++++++++------
 .../marvin-spark-docker/virtualenv_entrypoint.sh |  3 ++
 2 files changed, 27 insertions(+), 15 deletions(-)
 create mode 100644 marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/virtualenv_entrypoint.sh

diff --git a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/Dockerfile b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/Dockerfile
index 35f70de..0003346 100644
--- a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/Dockerfile
+++ b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/Dockerfile
@@ -2,29 +2,38 @@ FROM marvinai/marvin-base
 
 USER root
 
-ADD http://archive.apache.org/dist/spark/spark-2.1.1/spark-2.1.1-bin-hadoop2.6.tgz /opt/
+#ADD http://archive.apache.org/dist/spark/spark-2.1.1/spark-2.1.1-bin-hadoop2.6.tgz /opt/
 
-#Unpack tgzs
-RUN tar -zxvf /opt/spark-2.1.1-bin-hadoop2.6.tgz --directory=/opt/ \
-    && mv /opt/spark-2.1.1-bin-hadoop2.6 /opt/spark
+COPY spark-2.1.1-bin-hadoop2.6.tgz /opt/
 
-ENV SPARK_HOME /opt/spark
-ENV HADOOP_CONF_DIR /opt/spark/conf
+#Unpack tgzs
+RUN tar -zxvf /opt/spark-2.1.1-bin-hadoop2.6.tgz --directory=/opt \
+    && mv /opt/spark-2.1.1-bin-hadoop2.6 /opt/spark
 
 #Add configuration files
 ADD hdfs-site.xml /opt/spark/conf
 ADD core-site.xml /opt/spark/conf
+ADD hive-site.xml /opt/spark/conf
+ADD yarn-site.xml /opt/spark/conf
 
-USER marvin
+ADD engine.tar /opt/engine
 
-RUN git clone https://github.com/marvin-ai/marvin-python-toolbox.git \
-    && cd marvin-python-toolbox \
-    && bash -c "source /usr/local/bin/virtualenvwrapper.sh && mkvirtualenv python-toolbox-env \
-    && setvirtualenvproject && make marvin"
+COPY virtualenv_entrypoint.sh /
 
-ADD ../*
+RUN chown marvin:marvin -R /opt/engine \
+    && apt-get install -y pkg-config \
+    && pip install -U setuptools \
+    && chown marvin:marvin /virtualenv_entrypoint.sh
+
+USER marvin
+
+ENV SPARK_HOME /opt/spark
+ENV HADOOP_CONF_DIR /opt/spark/conf
+ENV MARVIN_HOME /opt/engine
 
-ENTRYPOINT bash -c 'source /usr/local/bin/virtualenvwrapper.sh && workon python-toolbox-env && marvin --help' \
-    && "${@}"
+RUN cd /opt/engine \
+    && bash -c 'source /usr/local/bin/virtualenvwrapper.sh && mkvirtualenv engine-env && setvirtualenvproject'
 
-CMD /bin/bash
+ENTRYPOINT "/virtualenv_entrypoint.sh"
+CMD ["workon engine-env"]
+#workon engine-env && marvin engine-httpserver
\ No newline at end of file
diff --git a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/virtualenv_entrypoint.sh b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/virtualenv_entrypoint.sh
new file mode 100644
index 0000000..6691a6d
--- /dev/null
+++ b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/virtualenv_entrypoint.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+source /usr/local/bin/virtualenvwrapper.sh
+exec "$@"

From 9f00d9fb7d53a003e75426387176aff123e7247d Mon Sep 17 00:00:00 2001
From: lucasbm88
Date: Mon, 27 Nov 2017 18:21:17 -0200
Subject: [PATCH 04/19] - Updating pip version to use the latest.

---
 .../python-engine/docker/marvin-base-docker/Dockerfile | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-base-docker/Dockerfile b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-base-docker/Dockerfile
index 294d9c9..a2a14fa 100644
--- a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-base-docker/Dockerfile
+++ b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-base-docker/Dockerfile
@@ -15,13 +15,17 @@ RUN echo "deb http://http.debian.net/debian jessie-backports main" >> /etc/apt/s
 #Engines will run using the user marvin
 RUN useradd --create-home -s /bin/bash -G sudo marvin
 
-##Install virtualenv
+##Install virtualenv & update pip
 ENV WORKON_HOME /home/marvin/.virtualenvs
 RUN pip install -q virtualenvwrapper \
-    && echo 'source /usr/local/bin/virtualenvwrapper.sh' >> /home/marvin/.profile
+    && echo 'source /usr/local/bin/virtualenvwrapper.sh' >> /home/marvin/.profile \
+    && mkdir -p /opt/marvin/data \
+    && pip install --upgrade pip
 
 #Take ownership of needed folders
 RUN chown -R marvin:marvin /opt
 
+ENV MARVIN_DATA_PATH /opt/marvin/data
+
 USER marvin
 WORKDIR /home/marvin

From 1d0ca2c97f10e099df416f72fa65996ee4230b57 Mon Sep 17 00:00:00 2001
From: lucasbm88
Date: Wed, 29 Nov 2017 11:17:00 -0200
Subject: [PATCH 05/19] Adjusting entrypoint to run virtualenv and the engine
 httpserver.

---
 .../docker/marvin-spark-docker/Dockerfile        | 22 ++++++----------------
 .../marvin-spark-docker/virtualenv_entrypoint.sh |  5 ++---
 2 files changed, 8 insertions(+), 19 deletions(-)

diff --git a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/Dockerfile b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/Dockerfile
index 0003346..c3dd844 100644
--- a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/Dockerfile
+++ b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/Dockerfile
@@ -2,28 +2,20 @@ FROM marvinai/marvin-base
 
 USER root
 
-#ADD http://archive.apache.org/dist/spark/spark-2.1.1/spark-2.1.1-bin-hadoop2.6.tgz /opt/
-
-COPY spark-2.1.1-bin-hadoop2.6.tgz /opt/
+ADD http://archive.apache.org/dist/spark/spark-2.1.1/spark-2.1.1-bin-hadoop2.6.tgz /opt/
 
 #Unpack tgzs
 RUN tar -zxvf /opt/spark-2.1.1-bin-hadoop2.6.tgz --directory=/opt \
     && mv /opt/spark-2.1.1-bin-hadoop2.6 /opt/spark
 
 #Add configuration files
-ADD hdfs-site.xml /opt/spark/conf
-ADD core-site.xml /opt/spark/conf
-ADD hive-site.xml /opt/spark/conf
-ADD yarn-site.xml /opt/spark/conf
+ADD spark-conf/* /opt/spark/conf/
 
 ADD engine.tar /opt/engine
 
-COPY virtualenv_entrypoint.sh /
+COPY virtualenv_entrypoint.sh /opt/engine
 
-RUN chown marvin:marvin -R /opt/engine \
-    && apt-get install -y pkg-config \
-    && pip install -U setuptools \
-    && chown marvin:marvin /virtualenv_entrypoint.sh
+RUN chown marvin:marvin -R /opt/engine
 
 USER marvin
 
 ENV SPARK_HOME /opt/spark
 ENV HADOOP_CONF_DIR /opt/spark/conf
 ENV MARVIN_HOME /opt/engine
 
 RUN cd /opt/engine \
-    && bash -c 'source /usr/local/bin/virtualenvwrapper.sh && mkvirtualenv engine-env && setvirtualenvproject'
+    && bash -c 'source /usr/local/bin/virtualenvwrapper.sh && mkvirtualenv engine-env && setvirtualenvproject && make marvin'
 
-ENTRYPOINT "/virtualenv_entrypoint.sh"
-CMD ["workon engine-env"]
-#workon engine-env && marvin engine-httpserver
\ No newline at end of file
+ENTRYPOINT "/opt/engine/virtualenv_entrypoint.sh"
\ No newline at end of file
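Patch 04 above creates /opt/marvin/data and exports it as MARVIN_DATA_PATH, which makes a natural volume mount for persisting engine artifacts. A sketch under stated assumptions: the image tag is illustrative, and the 8000 port is an assumption, since this series never pins the httpserver port:

    docker run -v /srv/marvin-data:/opt/marvin/data -p 8000:8000 my-engine:0.0.1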
diff --git a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/virtualenv_entrypoint.sh b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/virtualenv_entrypoint.sh
index 6691a6d..ddf0c4e 100644
--- a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/virtualenv_entrypoint.sh
+++ b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/virtualenv_entrypoint.sh
@@ -1,3 +1,2 @@
-#!/bin/sh
-source /usr/local/bin/virtualenvwrapper.sh
-exec "$@"
+#!/bin/bash
+/bin/bash -c "source /usr/local/bin/virtualenvwrapper.sh && workon engine-env && marvin engine-httpserver"
\ No newline at end of file

From e0aa816b3fda798bd688cf4e0327f57e61050538 Mon Sep 17 00:00:00 2001
From: lucasbm88
Date: Wed, 29 Nov 2017 12:24:59 -0200
Subject: [PATCH 06/19] Removing tar from command, since docker ADD already
 unpack

---
 .../python-engine/docker/marvin-spark-docker/Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/Dockerfile b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/Dockerfile
index c3dd844..ea0934f 100644
--- a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/Dockerfile
+++ b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/Dockerfile
@@ -5,7 +5,7 @@ USER root
 ADD http://archive.apache.org/dist/spark/spark-2.1.1/spark-2.1.1-bin-hadoop2.6.tgz /opt/
 
 #Unpack tgzs
-RUN tar -zxvf /opt/spark-2.1.1-bin-hadoop2.6.tgz --directory=/opt \
+RUN ls -l /opt \
     && mv /opt/spark-2.1.1-bin-hadoop2.6 /opt/spark
 
 #Add configuration files
@@ -26,4 +26,4 @@ ENV MARVIN_HOME /opt/engine
 RUN cd /opt/engine \
     && bash -c 'source /usr/local/bin/virtualenvwrapper.sh && mkvirtualenv engine-env && setvirtualenvproject && make marvin'
 
-ENTRYPOINT "/opt/engine/virtualenv_entrypoint.sh"
\ No newline at end of file
+ENTRYPOINT "/opt/engine/virtualenv_entrypoint.sh"
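With the entrypoint above, a container built from the engine image boots straight into marvin engine-httpserver inside the engine-env virtualenv. An illustrative run; the image tag, container name, and port are assumptions:

    docker run -d --name my-engine -p 8000:8000 my-engine-spark:0.0.1
    docker logs -f my-engine    # should show the httpserver starting inside engine-env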
From 681a4c9740928675cc2ec0fd70311f6d4534a6c9 Mon Sep 17 00:00:00 2001
From: lucasbm88
Date: Wed, 29 Nov 2017 15:18:31 -0200
Subject: [PATCH 07/19] Adding commands to build docker in marvin CLI.

---
 marvin_python_toolbox/management/engine.py | 32 +++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/marvin_python_toolbox/management/engine.py b/marvin_python_toolbox/management/engine.py
index db168dd..b1de0a6 100644
--- a/marvin_python_toolbox/management/engine.py
+++ b/marvin_python_toolbox/management/engine.py
@@ -318,6 +318,37 @@ def engine_server(ctx, action, params_file, metadata_file, initial_dataset, data
         server.stop(0)
 
 
+@cli.command('engine-dockerbuild', help='Builds a docker image containing the engine. Requires docker service running in the host machine.')
+@click.option(
+    '--type',
+    '-t',
+    type=click.Choice(['spark', 'base']),
+    default='spark',
+    help='What image type to build. Example: marvin with spark.',
+)
+@click.option('--tag', '-t', default='marvin-engine', help='Image tag to be used.')
+@click.option('--version', '-v', default=VERSION, help="Image version to be used.")
+def build_docker(type, tag, version):
+    logger.info("Will generate a package with the engine in order to build the docker image.")
+    commandTar = ['tar', '-cf', 'engine.tar', '*']
+    run_command(commandTar, "Failed to generate tar file.")
+
+    logger.info("Will move the package to the docker folder.")
+    commandMv = ['mv', 'engine.tar', 'docker/marvin-spark-docker/']
+    run_command(commandMv, "Failed to move the package to docker folder.")
+
+    logger.info("Building docker image.")
+    command = ['docker', 'build', '-t {0}:{1}'.format(tag, version), 'docker/marvin-spark-docker/']
+    run_command(command, "Failed to build docker image.")
+
+
+def run_command(command, error_message="A failure occurred."):
+    try:
+        subprocess.Popen(command, env=os.environ).wait()
+    except:
+        logger.exception(error_message)
+        sys.exit(1)
+
+
 TEMPLATE_BASES = {
     'python-engine': os.path.join(os.path.dirname(__file__), 'templates', 'python-engine')
 }
@@ -334,7 +365,6 @@ def engine_server(ctx, action, params_file, metadata_file, initial_dataset, data
 
 _orig_type = type
 
-
 @cli.command('engine-generateenv', help='Generate a new marvin engine environment and install default requirements.')
 @click.argument('engine-path', type=click.Path(exists=True))
 def generate_env(engine_path):

From b91c0e812ff3cda45d9ad914ecc13220544ce144 Mon Sep 17 00:00:00 2001
From: lucasbm88
Date: Wed, 29 Nov 2017 15:34:36 -0200
Subject: [PATCH 08/19] Fix tar command.

---
 marvin_python_toolbox/management/engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/marvin_python_toolbox/management/engine.py b/marvin_python_toolbox/management/engine.py
index b1de0a6..d61aa39 100644
--- a/marvin_python_toolbox/management/engine.py
+++ b/marvin_python_toolbox/management/engine.py
@@ -330,7 +330,7 @@ def engine_server(ctx, action, params_file, metadata_file, initial_dataset, data
 @click.option('--version', '-v', default=VERSION, help="Image version to be used.")
 def build_docker(type, tag, version):
     logger.info("Will generate a package with the engine in order to build the docker image.")
-    commandTar = ['tar', '-cf', 'engine.tar', '*']
+    commandTar = ['tar', '-cf', 'engine.tar', '.']
     run_command(commandTar, "Failed to generate tar file.")
 
     logger.info("Will move the package to the docker folder.")
     commandMv = ['mv', 'engine.tar', 'docker/marvin-spark-docker/']

From 318f9310ba7b6ba757e22ab6d313acf5fbfeda20 Mon Sep 17 00:00:00 2001
From: lucasbm88
Date: Wed, 29 Nov 2017 15:56:14 -0200
Subject: [PATCH 09/19] Adjusting docker command to specify the tag

---
 marvin_python_toolbox/management/engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/marvin_python_toolbox/management/engine.py b/marvin_python_toolbox/management/engine.py
index d61aa39..0a11937 100644
--- a/marvin_python_toolbox/management/engine.py
+++ b/marvin_python_toolbox/management/engine.py
@@ -338,7 +338,7 @@ def build_docker(type, tag, version):
     run_command(commandMv, "Failed to move the package to docker folder.")
 
     logger.info("Building docker image.")
-    command = ['docker', 'build', '-t {0}:{1}'.format(tag, version), 'docker/marvin-spark-docker/']
+    command = ['docker', 'build', '-t', '{0}:{1}'.format(tag, version), 'docker/marvin-spark-docker/']
     run_command(command, "Failed to build docker image.")
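At this point the command can be exercised from an engine's root directory; note that --type and --tag briefly share the short flag -t here, which patch 13 below resolves by moving --tag to -tg. A sketch with illustrative values:

    cd /path/to/my-engine
    marvin engine-dockerbuild --tag my-engine --version 0.0.1
    # runs: docker build -t my-engine:0.0.1 docker/marvin-spark-docker/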
From 92128f3c438e3c060d21860b59ad11dca1785de6 Mon Sep 17 00:00:00 2001
From: lucasbm88
Date: Wed, 29 Nov 2017 16:08:18 -0200
Subject: [PATCH 10/19] Adding dynamic config for folders.

---
 marvin_python_toolbox/management/engine.py | 22 ++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/marvin_python_toolbox/management/engine.py b/marvin_python_toolbox/management/engine.py
index 0a11937..6605d38 100644
--- a/marvin_python_toolbox/management/engine.py
+++ b/marvin_python_toolbox/management/engine.py
@@ -328,17 +328,27 @@ def engine_server(ctx, action, params_file, metadata_file, initial_dataset, data
 )
 @click.option('--tag', '-t', default='marvin-engine', help='Image tag to be used.')
 @click.option('--version', '-v', default=VERSION, help="Image version to be used.")
-def build_docker(type, tag, version):
+def build_docker(buildtype, tag, version):
+    buildTypes = {
+        "spark": {
+            "folder": "marvin-spark-docker"
+        },
+        "base": {
+            "folder": "marvin-base-docker"
+        }
+    }
     logger.info("Will generate a package with the engine in order to build the docker image.")
-    commandTar = ['tar', '-cf', 'engine.tar', '.']
-    run_command(commandTar, "Failed to generate tar file.")
+    command_tar = ['tar', '-cf', 'engine.tar', '.']
+    run_command(command_tar, "Failed to generate tar file.")
 
+    docker_folder = buildTypes[buildtype]["folder"]
     logger.info("Will move the package to the docker folder.")
-    commandMv = ['mv', 'engine.tar', 'docker/marvin-spark-docker/']
-    run_command(commandMv, "Failed to move the package to docker folder.")
+    command_mv = ['mv', 'engine.tar', 'docker/{0}/'.format(docker_folder)]
+    run_command(command_mv, "Failed to move the package to docker folder.")
 
     logger.info("Building docker image.")
-    command = ['docker', 'build', '-t', '{0}:{1}'.format(tag, version), 'docker/marvin-spark-docker/']
+    tag = "{0}-{1}".format(tag, buildtype)
+    command = ['docker', 'build', '-t', '{0}:{1}'.format(tag, version), 'docker/{0}/'.format(docker_folder)]
     run_command(command, "Failed to build docker image.")

From c3be8bb4828800a17416fefddae5f59d0618e7cf Mon Sep 17 00:00:00 2001
From: "lucas.bonatto"
Date: Wed, 29 Nov 2017 16:15:30 -0200
Subject: [PATCH 11/19] adjusting build type param name

---
 marvin_python_toolbox/management/engine.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/marvin_python_toolbox/management/engine.py b/marvin_python_toolbox/management/engine.py
index 6605d38..cd764d0 100644
--- a/marvin_python_toolbox/management/engine.py
+++ b/marvin_python_toolbox/management/engine.py
@@ -328,7 +328,7 @@ def engine_server(ctx, action, params_file, metadata_file, initial_dataset, data
 )
 @click.option('--tag', '-t', default='marvin-engine', help='Image tag to be used.')
 @click.option('--version', '-v', default=VERSION, help="Image version to be used.")
-def build_docker(buildtype, tag, version):
+def build_docker(type, tag, version):
     buildTypes = {
         "spark": {
             "folder": "marvin-spark-docker"
@@ -341,13 +341,13 @@ def build_docker(type, tag, version):
     command_tar = ['tar', '-cf', 'engine.tar', '.']
     run_command(command_tar, "Failed to generate tar file.")
 
-    docker_folder = buildTypes[buildtype]["folder"]
+    docker_folder = buildTypes[type]["folder"]
     logger.info("Will move the package to the docker folder.")
     command_mv = ['mv', 'engine.tar', 'docker/{0}/'.format(docker_folder)]
     run_command(command_mv, "Failed to move the package to docker folder.")
 
     logger.info("Building docker image.")
-    tag = "{0}-{1}".format(tag, buildtype)
+    tag = "{0}-{1}".format(tag, type)
     command = ['docker', 'build', '-t', '{0}:{1}'.format(tag, version), 'docker/{0}/'.format(docker_folder)]
     run_command(command, "Failed to build docker image.")
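With the buildTypes mapping in place, the same command can target the base image; values are again illustrative:

    marvin engine-dockerbuild --type base --tag my-engine --version 0.0.1
    # runs: docker build -t my-engine-base:0.0.1 docker/marvin-base-docker/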
run_command(command, "Failed to build docker image.") From 7032070736f6575a9d192343dacae63bb86118a8 Mon Sep 17 00:00:00 2001 From: lucasbm88 Date: Wed, 29 Nov 2017 16:30:21 -0200 Subject: [PATCH 12/19] Adjusting tag names --- marvin_python_toolbox/management/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/marvin_python_toolbox/management/engine.py b/marvin_python_toolbox/management/engine.py index cd764d0..ade6243 100644 --- a/marvin_python_toolbox/management/engine.py +++ b/marvin_python_toolbox/management/engine.py @@ -326,7 +326,7 @@ def engine_server(ctx, action, params_file, metadata_file, initial_dataset, data default='spark', help='What image type to build. Example: marvin with spark.', ) -@click.option('--tag', '-t', default='marvin-engine', help='Image tag to be used.') +@click.option('--tag', '-t', default='marvinai/marvin', help='Image tag to be used.') @click.option('--version', '-v', default=VERSION, help="Image version to be used.") def build_docker(type, tag, version): buildTypes = { From 28a30caad2854eea080bf66e2adf447c36dc43de Mon Sep 17 00:00:00 2001 From: lucasbm88 Date: Wed, 29 Nov 2017 16:33:45 -0200 Subject: [PATCH 13/19] Adjusting tag parameter name --- marvin_python_toolbox/management/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/marvin_python_toolbox/management/engine.py b/marvin_python_toolbox/management/engine.py index ade6243..1073f01 100644 --- a/marvin_python_toolbox/management/engine.py +++ b/marvin_python_toolbox/management/engine.py @@ -326,7 +326,7 @@ def engine_server(ctx, action, params_file, metadata_file, initial_dataset, data default='spark', help='What image type to build. Example: marvin with spark.', ) -@click.option('--tag', '-t', default='marvinai/marvin', help='Image tag to be used.') +@click.option('--tag', '-tg', default='marvinai/marvin', help='Image tag to be used.') @click.option('--version', '-v', default=VERSION, help="Image version to be used.") def build_docker(type, tag, version): buildTypes = { From 4d526f435c7765f34dfd6234362a0e001a0d4fd4 Mon Sep 17 00:00:00 2001 From: lucasbm88 Date: Wed, 29 Nov 2017 16:46:55 -0200 Subject: [PATCH 14/19] Adding final success status message. --- marvin_python_toolbox/management/engine.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/marvin_python_toolbox/management/engine.py b/marvin_python_toolbox/management/engine.py index 1073f01..bede625 100644 --- a/marvin_python_toolbox/management/engine.py +++ b/marvin_python_toolbox/management/engine.py @@ -351,6 +351,8 @@ def build_docker(type, tag, version): command = ['docker', 'build', '-t', '{0}:{1}'.format(tag, version), 'docker/{0}/'.format(docker_folder)] run_command(command, "Failed to build docker image.") + logger.info("Successfully built docker image with tag {0}. To start the engine-httpserver with docker run .".format(tag)) + def run_command(command, error_message="A failure occurred."): try: From 2fc42303d03805f7e42a5bf31ff8d91d42dda567 Mon Sep 17 00:00:00 2001 From: lucasbm88 Date: Wed, 29 Nov 2017 16:48:36 -0200 Subject: [PATCH 15/19] Changing log info to print. Finishing docker image build impl. 
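After patches 12-14 the interface settles: the tag defaults to marvinai/marvin and its short flag becomes -tg. A sketch of the finished command; the version value is illustrative:

    marvin engine-dockerbuild --type spark -tg marvinai/marvin --version 0.0.2
    # builds marvinai/marvin-spark:0.0.2 and reports the final tag on success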
From 2fc42303d03805f7e42a5bf31ff8d91d42dda567 Mon Sep 17 00:00:00 2001
From: lucasbm88
Date: Wed, 29 Nov 2017 16:48:36 -0200
Subject: [PATCH 15/19] Changing log info to print. Finishing docker image
 build impl.

Fixes #35
---
 marvin_python_toolbox/management/engine.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/marvin_python_toolbox/management/engine.py b/marvin_python_toolbox/management/engine.py
index bede625..539ddc8 100644
--- a/marvin_python_toolbox/management/engine.py
+++ b/marvin_python_toolbox/management/engine.py
@@ -337,21 +337,21 @@ def build_docker(type, tag, version):
             "folder": "marvin-base-docker"
         }
     }
-    logger.info("Will generate a package with the engine in order to build the docker image.")
+    print("Will generate a package with the engine in order to build the docker image.")
     command_tar = ['tar', '-cf', 'engine.tar', '.']
     run_command(command_tar, "Failed to generate tar file.")
 
     docker_folder = buildTypes[type]["folder"]
-    logger.info("Will move the package to the docker folder.")
+    print("Will move the package to the docker folder.")
     command_mv = ['mv', 'engine.tar', 'docker/{0}/'.format(docker_folder)]
     run_command(command_mv, "Failed to move the package to docker folder.")
 
-    logger.info("Building docker image.")
+    print("Building docker image.")
     tag = "{0}-{1}".format(tag, type)
     command = ['docker', 'build', '-t', '{0}:{1}'.format(tag, version), 'docker/{0}/'.format(docker_folder)]
     run_command(command, "Failed to build docker image.")
 
-    logger.info("Successfully built docker image with tag {0}. To start the engine-httpserver with docker run .".format(tag))
+    print("Successfully built docker image with tag {0}. To start the engine-httpserver with docker run .".format(tag))

From 5be24c774994f8de6df08ecd4b26006eec626801 Mon Sep 17 00:00:00 2001
From: lucasbm88
Date: Wed, 29 Nov 2017 17:15:57 -0200
Subject: [PATCH 16/19] Adding correct spark files.

---
 .../spark-conf/core-site.xml                  |   0
 .../spark-conf/fairscheduler.xml              |  31 ++++
 .../spark-conf/hdfs-site.xml                  |   0
 .../spark-conf/hive-site.xml                  |   0
 .../spark-conf/log4j.properties               |  40 +++++
 .../spark-conf/metrics.properties             | 170 ++++++++++++++++++
 .../marvin-spark-docker/spark-conf/slaves     |  19 ++
 .../spark-conf/spark-defaults.conf            |  27 +++
 .../spark-conf/spark-env.sh                   |  66 +++++++
 .../spark-conf/yarn-site.xml                  |   0
 10 files changed, 353 insertions(+)
 create mode 100644 marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/core-site.xml
 create mode 100644 marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/fairscheduler.xml
 create mode 100644 marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/hdfs-site.xml
 create mode 100644 marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/hive-site.xml
 create mode 100644 marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/log4j.properties
 create mode 100644 marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/metrics.properties
 create mode 100644 marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/slaves
 create mode 100644 marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/spark-defaults.conf
 create mode 100755 marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/spark-env.sh
 create mode 100644 marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/yarn-site.xml
diff --git
a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/core-site.xml b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/core-site.xml new file mode 100644 index 0000000..e69de29 diff --git a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/fairscheduler.xml b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/fairscheduler.xml new file mode 100644 index 0000000..385b2e7 --- /dev/null +++ b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/fairscheduler.xml @@ -0,0 +1,31 @@ + + + + + + + FAIR + 1 + 2 + + + FIFO + 2 + 3 + + diff --git a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/hdfs-site.xml b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/hdfs-site.xml new file mode 100644 index 0000000..e69de29 diff --git a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/hive-site.xml b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/hive-site.xml new file mode 100644 index 0000000..e69de29 diff --git a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/log4j.properties b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/log4j.properties new file mode 100644 index 0000000..ec1aa18 --- /dev/null +++ b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/log4j.properties @@ -0,0 +1,40 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the console +log4j.rootCategory=INFO, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n + +# Set the default spark-shell log level to WARN. When running the spark-shell, the +# log level for this class is used to overwrite the root logger's log level, so that +# the user can have different defaults for the shell and regular Spark apps. 
+log4j.logger.org.apache.spark.repl.Main=WARN + +# Settings to quiet third party logs that are too verbose +log4j.logger.org.spark_project.jetty=WARN +log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR +log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO +log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO +log4j.logger.org.apache.parquet=ERROR +log4j.logger.parquet=ERROR + +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL +log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR diff --git a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/metrics.properties b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/metrics.properties new file mode 100644 index 0000000..aeb76c9 --- /dev/null +++ b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/metrics.properties @@ -0,0 +1,170 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# syntax: [instance].sink|source.[name].[options]=[value] + +# This file configures Spark's internal metrics system. The metrics system is +# divided into instances which correspond to internal components. +# Each instance can be configured to report its metrics to one or more sinks. +# Accepted values for [instance] are "master", "worker", "executor", "driver", +# and "applications". A wildcard "*" can be used as an instance name, in +# which case all instances will inherit the supplied property. +# +# Within an instance, a "source" specifies a particular set of grouped metrics. +# there are two kinds of sources: +# 1. Spark internal sources, like MasterSource, WorkerSource, etc, which will +# collect a Spark component's internal state. Each instance is paired with a +# Spark source that is added automatically. +# 2. Common sources, like JvmSource, which will collect low level state. +# These can be added through configuration options and are then loaded +# using reflection. +# +# A "sink" specifies where metrics are delivered to. Each instance can be +# assigned one or more sinks. +# +# The sink|source field specifies whether the property relates to a sink or +# source. +# +# The [name] field specifies the name of source or sink. +# +# The [options] field is the specific property of this source or sink. The +# source or sink is responsible for parsing this property. +# +# Notes: +# 1. To add a new sink, set the "class" option to a fully qualified class +# name (see examples below). +# 2. Some sinks involve a polling period. The minimum allowed polling period +# is 1 second. +# 3. 
Wildcard properties can be overridden by more specific properties. +# For example, master.sink.console.period takes precedence over +# *.sink.console.period. +# 4. A metrics specific configuration +# "spark.metrics.conf=${SPARK_HOME}/conf/metrics.properties" should be +# added to Java properties using -Dspark.metrics.conf=xxx if you want to +# customize metrics system. You can also put the file in ${SPARK_HOME}/conf +# and it will be loaded automatically. +# 5. The MetricsServlet sink is added by default as a sink in the master, +# worker and driver, and you can send HTTP requests to the "/metrics/json" +# endpoint to get a snapshot of all the registered metrics in JSON format. +# For master, requests to the "/metrics/master/json" and +# "/metrics/applications/json" endpoints can be sent separately to get +# metrics snapshots of the master instance and applications. This +# MetricsServlet does not have to be configured. + +## List of available common sources and their properties. + +# org.apache.spark.metrics.source.JvmSource +# Note: Currently, JvmSource is the only available common source. +# It can be added to an instance by setting the "class" option to its +# fully qualified class name (see examples below). + +## List of available sinks and their properties. + +# org.apache.spark.metrics.sink.ConsoleSink +# Name: Default: Description: +# period 10 Poll period +# unit seconds Unit of the poll period + +# org.apache.spark.metrics.sink.CSVSink +# Name: Default: Description: +# period 10 Poll period +# unit seconds Unit of the poll period +# directory /tmp Where to store CSV files + +# org.apache.spark.metrics.sink.GangliaSink +# Name: Default: Description: +# host NONE Hostname or multicast group of the Ganglia server, +# must be set +# port NONE Port of the Ganglia server(s), must be set +# period 10 Poll period +# unit seconds Unit of the poll period +# ttl 1 TTL of messages sent by Ganglia +# dmax 0 Lifetime in seconds of metrics (0 never expired) +# mode multicast Ganglia network mode ('unicast' or 'multicast') + +# org.apache.spark.metrics.sink.JmxSink + +# org.apache.spark.metrics.sink.MetricsServlet +# Name: Default: Description: +# path VARIES* Path prefix from the web server root +# sample false Whether to show entire set of samples for histograms +# ('false' or 'true') +# +# * Default path is /metrics/json for all instances except the master. 
The +# master has two paths: +# /metrics/applications/json # App information +# /metrics/master/json # Master information + +# org.apache.spark.metrics.sink.GraphiteSink +# Name: Default: Description: +# host NONE Hostname of the Graphite server, must be set +# port NONE Port of the Graphite server, must be set +# period 10 Poll period +# unit seconds Unit of the poll period +# prefix EMPTY STRING Prefix to prepend to every metric's name +# protocol tcp Protocol ("tcp" or "udp") to use + +## Examples +# Enable JmxSink for all instances by class name +#*.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink + +# Enable ConsoleSink for all instances by class name +#*.sink.console.class=org.apache.spark.metrics.sink.ConsoleSink + +# Polling period for the ConsoleSink +#*.sink.console.period=10 +# Unit of the polling period for the ConsoleSink +#*.sink.console.unit=seconds + +# Polling period for the ConsoleSink specific for the master instance +#master.sink.console.period=15 +# Unit of the polling period for the ConsoleSink specific for the master +# instance +#master.sink.console.unit=seconds + +# Enable CsvSink for all instances by class name +#*.sink.csv.class=org.apache.spark.metrics.sink.CsvSink + +# Polling period for the CsvSink +#*.sink.csv.period=1 +# Unit of the polling period for the CsvSink +#*.sink.csv.unit=minutes + +# Polling directory for CsvSink +#*.sink.csv.directory=/tmp/ + +# Polling period for the CsvSink specific for the worker instance +#worker.sink.csv.period=10 +# Unit of the polling period for the CsvSink specific for the worker instance +#worker.sink.csv.unit=minutes + +# Enable Slf4jSink for all instances by class name +#*.sink.slf4j.class=org.apache.spark.metrics.sink.Slf4jSink + +# Polling period for the Slf4JSink +#*.sink.slf4j.period=1 +# Unit of the polling period for the Slf4jSink +#*.sink.slf4j.unit=minutes + +# Enable JvmSource for instance master, worker, driver and executor +#master.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +#worker.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +#driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +#executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource diff --git a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/slaves b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/slaves new file mode 100644 index 0000000..be42a63 --- /dev/null +++ b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/slaves @@ -0,0 +1,19 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# A Spark Worker will be started on each of the machines listed below. 
+localhost \ No newline at end of file diff --git a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/spark-defaults.conf b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/spark-defaults.conf new file mode 100644 index 0000000..19cba6e --- /dev/null +++ b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/spark-defaults.conf @@ -0,0 +1,27 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running spark-submit. +# This is useful for setting default environmental settings. + +# Example: +# spark.master spark://master:7077 +# spark.eventLog.enabled true +# spark.eventLog.dir hdfs://namenode:8021/directory +# spark.serializer org.apache.spark.serializer.KryoSerializer +# spark.driver.memory 5g +# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" diff --git a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/spark-env.sh b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/spark-env.sh new file mode 100755 index 0000000..5c1e876 --- /dev/null +++ b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/spark-env.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This file is sourced when running various Spark programs. +# Copy it as spark-env.sh and edit that to configure Spark for your site. 
+ +# Options read when launching programs locally with +# ./bin/run-example or ./bin/spark-submit +# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files +# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node +# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program +# - SPARK_CLASSPATH, default classpath entries to append + +# Options read by executors and drivers running inside the cluster +# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node +# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program +# - SPARK_CLASSPATH, default classpath entries to append +# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data +# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos + +# Options read in YARN client mode +# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files +# - SPARK_EXECUTOR_INSTANCES, Number of executors to start (Default: 2) +# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1). +# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G) +# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G) + +# Options for the daemons used in the standalone deploy mode +# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname +# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master +# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y") +# - SPARK_WORKER_CORES, to set the number of cores to use on this machine +# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g) +# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker +# - SPARK_WORKER_INSTANCES, to set the number of worker processes per node +# - SPARK_WORKER_DIR, to set the working directory of worker processes +# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y") +# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g). +# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y") +# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y") +# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y") +# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers + +# Generic options for the daemons used in the standalone deploy mode +# - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf) +# - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs) +# - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp) +# - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER) +# - SPARK_NICENESS The scheduling priority for daemons. (Default: 0) +# - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file. diff --git a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/yarn-site.xml b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/spark-conf/yarn-site.xml new file mode 100644 index 0000000..e69de29 From 9f9afbc21b2c8638e785c9ce24fd21189e32c7f6 Mon Sep 17 00:00:00 2001 From: lucasbm88 Date: Wed, 29 Nov 2017 17:16:38 -0200 Subject: [PATCH 17/19] Removing unneeded files. 
--- .../docker/marvin-base-docker/README.md | 20 ------------------- .../docker/marvin-base-docker/build.sh | 9 --------- 2 files changed, 29 deletions(-) delete mode 100644 marvin_python_toolbox/management/templates/python-engine/docker/marvin-base-docker/README.md delete mode 100755 marvin_python_toolbox/management/templates/python-engine/docker/marvin-base-docker/build.sh diff --git a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-base-docker/README.md b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-base-docker/README.md deleted file mode 100644 index 7d0d401..0000000 --- a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-base-docker/README.md +++ /dev/null @@ -1,20 +0,0 @@ -### Marvin Prediction-IO Docker Image - -This docker project builds a **Prediction-IO** image to support things like - -- Build and test engines -- Create docker containers to run pio train and pio deploy - - -## How to use - -To build the image just run `sh build.sh ` specifying the tag for your image. - -## Dependencies - -To build this project the following dependencies must be met - -- Elasticsearch download - The build process will try to download elastcisearch **1.7.5** from the internet. -- Spark download - The build process will try to download spark **1.6.3** from the internet. -- Prediction-IO download - The build process will try to download Prediction-IO **0.10.0** (B2W Fork) from the internal network. -- SBT cache files - In order to optmize the build time the build process will try to download ivy and sbt custom tars from the internal network. These files are not required to build, if they're missing you can remove from Dockerfile. \ No newline at end of file diff --git a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-base-docker/build.sh b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-base-docker/build.sh deleted file mode 100755 index d98887f..0000000 --- a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-base-docker/build.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -if [ -z "$1" ] - then - echo "You must specify the version of the image being built" - exit 1 -fi -docker build -t registry.b2w.io/b2wdigital/predictionio-b2w:"$1" . - - From 53d2002765718660575665f8423b339c5cf05a58 Mon Sep 17 00:00:00 2001 From: lucasbm88 Date: Thu, 30 Nov 2017 09:15:44 -0200 Subject: [PATCH 18/19] Minor change to the version parameter to fix the failing Jenkins build. --- marvin_python_toolbox/management/engine.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/marvin_python_toolbox/management/engine.py b/marvin_python_toolbox/management/engine.py index bf606ca..4478815 100644 --- a/marvin_python_toolbox/management/engine.py +++ b/marvin_python_toolbox/management/engine.py @@ -329,7 +329,7 @@ def engine_server(ctx, action, params_file, metadata_file, initial_dataset, data help='What image type to build. 
Example: marvin with spark.', ) @click.option('--tag', '-tg', default='marvinai/marvin', help='Image tag to be used.') -@click.option('--version', '-v', default=VERSION, help="Image version to be used.") +@click.option('--version', '-v', help="Image version to be used.") def build_docker(type, tag, version): buildTypes = { "spark": { @@ -339,6 +339,8 @@ def build_docker(type, tag, version): "folder": "marvin-base-docker" } } + if version is None: + version = VERSION print("Will generate a package with the engine in order to build the docker image.") command_tar = ['tar', '-cf', 'engine.tar', '.'] run_command(command_tar, "Failed to generate tar file.") From 658b23ced70201d0448ef8b4f249c39a05e3d1f0 Mon Sep 17 00:00:00 2001 From: lucasbm88 Date: Tue, 5 Dec 2017 16:57:33 -0200 Subject: [PATCH 19/19] Removing unneeded README file. --- .../docker/marvin-spark-docker/README.md | 20 ------------------- 1 file changed, 20 deletions(-) delete mode 100644 marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/README.md diff --git a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/README.md b/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/README.md deleted file mode 100644 index 7d0d401..0000000 --- a/marvin_python_toolbox/management/templates/python-engine/docker/marvin-spark-docker/README.md +++ /dev/null @@ -1,20 +0,0 @@ -### Marvin Prediction-IO Docker Image - -This docker project builds a **Prediction-IO** image to support things like - -- Build and test engines -- Create docker containers to run pio train and pio deploy - - -## How to use - -To build the image just run `sh build.sh ` specifying the tag for your image. - -## Dependencies - -To build this project the following dependencies must be met - -- Elasticsearch download - The build process will try to download elastcisearch **1.7.5** from the internet. -- Spark download - The build process will try to download spark **1.6.3** from the internet. -- Prediction-IO download - The build process will try to download Prediction-IO **0.10.0** (B2W Fork) from the internal network. -- SBT cache files - In order to optmize the build time the build process will try to download ivy and sbt custom tars from the internal network. These files are not required to build, if they're missing you can remove from Dockerfile. \ No newline at end of file
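Note on the --version change in PATCH 18 above: with the default moved out of
the click decorator and into the function body, VERSION is read when the
command is invoked rather than captured when the decorator is evaluated at
import time. A minimal, self-contained sketch of the same pattern (the VERSION
value and the echo output here are illustrative stand-ins, not the toolbox's
actual wiring):

    import click

    VERSION = "0.0.1"  # stand-in for the real package version constant

    @click.command()
    @click.option('--version', '-v', default=None, help="Image version to be used.")
    def build_docker(version):
        # Resolve the default inside the body so the constant is read at
        # invocation time instead of at decorator-evaluation time.
        if version is None:
            version = VERSION
        click.echo("Building image version {}".format(version))

    if __name__ == '__main__':
        build_docker()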