diff --git a/README.md b/README.md
index 4f4a413..f51188a 100644
--- a/README.md
+++ b/README.md
@@ -8,31 +8,31 @@
 ## 1. Basic Software Environment
 
-###1. Software Versions
+###1.1 Software Versions
 
 - Operating system: CentOS 6
 - Java environment: OpenJDK 8
 - Hadoop: 2.7.2
-- Spark: 1.6.2/2.1.0
-- Hive: 1.1.1/2.1.1
+- Spark: 2.1.0
+- Hive: 2.1.1
 - HBase: 1.2.2
 - Zookeeper: 3.4.8
 - Images and containers are managed with docker-compose, which also handles cluster orchestration
 - All software binaries are downloaded over the network. The self-compiled Hadoop and Protobuf binaries are hosted on GitHub; the binaries of all other software come from the official Apache mirrors.
 
-###2. Image Dependencies
+###1.2 Image Dependencies
 
 ![Image dependency diagram](https://github.com/ruoyu-chen/hadoop-docker/raw/master/images/arch.jpeg "Image dependencies")
 
 In the figure above, the gray image (centos:6) is the official Docker Hub base image. Every other image (twinsen/hadoop:2.7.2, etc.) is built on top of the image beneath it. These dependencies between images determine the order in which the images are built.
 
 ## 2. Usage
 
-###1. Install Docker
+###2.1 Install Docker
 
 For installation instructions, please look up the steps appropriate to your platform. After installation, run docker info from the command line to verify it; output like the screenshot below indicates that the installation succeeded.
 
 ![Docker installation test result](https://github.com/ruoyu-chen/hadoop-docker/raw/master/images/docker_info.png "Docker installation test")
 
-###2. Build the Images
+###2.2 Build the Images
 
 First, download the project archive ( https://github.com/ruoyu-chen/hadoop-docker/archive/1.1.zip ) and extract it to any directory.
 
 Next, from the project root directory (the one containing docker-compose-build-all.yml), run the following commands in sequence to build the images:
@@ -44,30 +44,28 @@
 
 `docker pull centos:6`
 
-- Build the base OS and OpenJDK environment, containing CentOS 6 and OpenJDK 8
+- Pull the base OS and OpenJDK environment, containing CentOS 6 and OpenJDK 8
 
-`docker-compose -f docker-compose-build-all.yml build os-jvm`
+`docker pull twinsen/os-jvm:centos6-openjdk8`
 
-- Build the Hadoop environment, containing Hadoop 2.7.2
+- Pull the Hadoop environment, containing Hadoop 2.7.2
 
-`docker-compose -f docker-compose-build-all.yml build hadoop`
+`docker pull twinsen/hadoop:2.7.2`
 
-- Build the Hive environment, containing Hive 2.1.1
+- Pull the Hive environment, containing Hive 2.1.1
 
-`docker-compose -f docker-compose-build-all.yml build hive-2.1.1`
+`docker pull twinsen/hive:2.1.1`
 
-- Build the Spark environment, containing Spark 2.1.0
+- Pull the Spark environment, containing Spark 2.1.0
 
-`docker-compose -f docker-compose-build-all.yml build spark-2.1.0`
+`docker pull twinsen/spark:2.1.0`
 
-###3. Starting and Stopping the Cluster
+###2.3 Environment Preparation
 
 Once the images from the previous step are available, you can run the docker images command to list the images currently present in your Docker environment, as shown below:
 
 ![Local Docker image list](https://github.com/ruoyu-chen/hadoop-docker/raw/master/images/docker_images.png "Local Docker image list")
 
 For convenience, a docker-compose.yml file is provided in the project root; it is preconfigured with a Spark cluster consisting of 3 slave nodes and 1 master node.
 
-The following briefly describes how to start and stop the Spark cluster (all of the steps below are run from the command line, in the project root directory)
-
-- Initialization
+Before using the cluster, the initialization steps below must be completed

 #[Create the containers]
@@ -76,8 +74,21 @@ docker-compose up -d
 docker-compose exec spark-master hdfs namenode -format
 #[Initialize the Hive database. Run only once, before the first start of the cluster]
 docker-compose exec spark-master schematool -dbType mysql -initSchema
+#[Package the Spark jar files into an archive named spark-libs.jar stored in the /code directory]
+docker-compose exec spark-master jar cv0f /code/spark-libs.jar -C /root/spark/jars/ .
+#[Start HDFS]
+docker-compose exec spark-master start-dfs.sh
+#[Create the /user/spark/share/lib/ directory in HDFS]
+docker-compose exec spark-master hadoop fs -mkdir -p /user/spark/share/lib/
+#[Upload /code/spark-libs.jar to the /user/spark/share/lib/ directory in HDFS]
+docker-compose exec spark-master hadoop fs -put /code/spark-libs.jar /user/spark/share/lib/
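+#[(Optional) Verify the upload: the directory listing should now contain spark-libs.jar]
+docker-compose exec spark-master hadoop fs -ls /user/spark/share/lib/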
+#[Stop HDFS]
+docker-compose exec spark-master stop-dfs.sh
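+#[The uploaded archive is the target of spark.yarn.archive in spark-defaults.conf, so Spark-on-YARN applications do not need to re-upload the Spark jars on every submission]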
 
+###2.4 Starting and Stopping the Cluster
+
+The following briefly describes how to start and stop the Spark cluster (all of the steps below are run from the command line, in the project root directory)
 
 - Start the cluster processes by running the following in sequence:

@@ -102,7 +113,7 @@ docker-compose exec spark-master stop-dfs.sh
 docker-compose down
 
-###4. Using the Cluster During Development and Testing
+###2.5 Using the Cluster During Development and Testing
 
 The cluster currently uses an allocation of 1 master node and 3 slave nodes. It can be scaled out by adjusting the docker-compose configuration file and the configuration files of the corresponding software, but automatic scaling is not supported yet.
diff --git a/docker-compose-build-all.yml b/docker-compose-build-all.yml
index 22eba5e..2a67519 100755
--- a/docker-compose-build-all.yml
+++ b/docker-compose-build-all.yml
@@ -1,17 +1,21 @@
 version: '2'
 services:
   os-jvm:
-    build: ./os-jvm-docker/centos6-openjdk8
+    build: ./services/os-jvm-docker/centos6-openjdk8
     image: twinsen/os-jvm:centos6-openjdk8
 
+  os7-jvm:
+    build: ./services/os-jvm-docker/centos7-openjdk8
+    image: twinsen/os-jvm:centos7-openjdk8
+
   hadoop:
-    build: ./hadoop-docker/2.7.2
+    build: ./services/hadoop-docker/2.7.2
     image: twinsen/hadoop:2.7.2
 
   hive-2.1.1:
-    build: ./hive-docker/2.1.1
+    build: ./services/hive-docker/2.1.1
     image: twinsen/hive:2.1.1
 
   spark-2.1.0:
-    build: ./spark-docker/2.1.0
+    build: ./services/spark-docker/2.1.0
     image: twinsen/spark:2.1.0
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index e8335fc..234dca2 100755
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -7,6 +7,7 @@ services:
       - "./volume/hadoop/work/slave1:/works"
       - "./volume/hadoop/logs/slave1:/root/hadoop/logs/"
       - "./volume/spark/logs/slave1:/root/spark/logs/"
+      - "./volume/hadoop/tmp/slave1:/tmp"
       - "./volume/ro_data:/ro_data:ro"
     hostname: hadoop-slave1
     networks:
@@ -22,6 +23,7 @@ services:
       - "./volume/hadoop/work/slave2:/works"
       - "./volume/hadoop/logs/slave2:/root/hadoop/logs/"
       - "./volume/spark/logs/slave2:/root/spark/logs/"
+      - "./volume/hadoop/tmp/slave2:/tmp"
       - "./volume/ro_data:/ro_data:ro"
     hostname: hadoop-slave2
     networks:
@@ -37,6 +39,7 @@ services:
       - "./volume/hadoop/work/slave3:/works"
       - "./volume/hadoop/logs/slave3:/root/hadoop/logs/"
       - "./volume/spark/logs/slave3:/root/spark/logs/"
+      - "./volume/hadoop/tmp/slave3:/tmp"
       - "./volume/ro_data:/ro_data:ro"
     hostname: hadoop-slave3
     networks:
@@ -68,6 +71,7 @@ services:
       - "./volume/hadoop/work/master:/works"
       - "./volume/hadoop/logs/master:/root/hadoop/logs/"
       - "./volume/spark/logs/master:/root/spark/logs/"
+      - "./volume/hadoop/tmp/master:/tmp"
      - "./volume/code:/code"
       - "./volume/ro_data:/ro_data:ro"
     container_name: spark-master
diff --git a/services/hadoop-docker/2.7.2/config/hadoop/core-site.xml b/services/hadoop-docker/2.7.2/config/hadoop/core-site.xml
index a499909..35bed8e 100644
--- a/services/hadoop-docker/2.7.2/config/hadoop/core-site.xml
+++ b/services/hadoop-docker/2.7.2/config/hadoop/core-site.xml
@@ -17,6 +17,10 @@
+    <property>
+        <name>hadoop.tmp.dir</name>
+        <value>file:/works/hadoop_tmp/hadoop_${user.name}</value>
+    </property>
     <property>
         <name>io,native.lib.available</name>
         <value>true</value>
diff --git a/services/hadoop-docker/2.7.2/docker-compose.yml b/services/hadoop-docker/2.7.2/docker-compose.yml
index e33c68a..032fa9e 100755
--- a/services/hadoop-docker/2.7.2/docker-compose.yml
+++ b/services/hadoop-docker/2.7.2/docker-compose.yml
@@ -6,6 +6,7 @@ services:
     volumes:
       - "./volume/hadoop/work/slave1:/works"
       - "./volume/hadoop/logs/slave1:/root/hadoop/logs/"
+      - "./volume/hadoop/tmp/slave1:/tmp"
     hostname: hadoop-slave1
     networks:
       - hadoop
@@ -17,6 +18,7 @@ services:
     volumes:
       - "./volume/hadoop/work/slave2:/works"
       - "./volume/hadoop/logs/slave2:/root/hadoop/logs/"
+      - "./volume/hadoop/tmp/slave2:/tmp"
     hostname: hadoop-slave2
     networks:
       - hadoop
@@ -28,6 +30,7 @@ services:
     volumes:
       - "./volume/hadoop/work/slave3:/works"
       - "./volume/hadoop/logs/slave3:/root/hadoop/logs/"
+      - "./volume/hadoop/tmp/slave3:/tmp"
     hostname: hadoop-slave3
     networks:
      - hadoop
@@ -42,6 +45,7 @@ services:
     volumes:
       - "./volume/hadoop/work/master:/works"
       - "./volume/hadoop/logs/master:/root/hadoop/logs/"
+      - "./volume/hadoop/tmp/master:/tmp"
       - "./volume/code:/code"
     hostname: hadoop-master
     links:
diff --git a/services/os-jvm-docker/centos7-openjdk8/Dockerfile b/services/os-jvm-docker/centos7-openjdk8/Dockerfile
new file mode 100644
index 0000000..25d6329
--- /dev/null
+++ b/services/os-jvm-docker/centos7-openjdk8/Dockerfile
@@ -0,0 +1,17 @@
+FROM centos:7
+
+MAINTAINER twinsen
+
+USER root
+
+ENV JAVA_HOME=/usr/lib/jvm/java-openjdk
+
+ENV PATH=$PATH:$JAVA_HOME/bin:.
+
+# Install OpenJDK
+RUN yum update -y && \
+    yum install -y java-1.8.0-openjdk-devel && \
+    yum clean all && \
+    cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/services/spark-docker/2.1.0/config/spark/spark-defaults.conf b/services/spark-docker/2.1.0/config/spark/spark-defaults.conf
index 4215fde..5a4fb5c 100644
--- a/services/spark-docker/2.1.0/config/spark/spark-defaults.conf
+++ b/services/spark-docker/2.1.0/config/spark/spark-defaults.conf
@@ -11,4 +11,5 @@
 spark.executor.extraClassPath /root/hive/lib/mysql-connector-java-5.1.40-bin.jar:/root/hive/lib/guava-14.0.1.jar
-spark.driver.extraClassPath /root/hive/lib/mysql-connector-java-5.1.40-bin.jar:/root/hive/lib/guava-14.0.1.jar
\ No newline at end of file
+spark.driver.extraClassPath /root/hive/lib/mysql-connector-java-5.1.40-bin.jar:/root/hive/lib/guava-14.0.1.jar
+spark.yarn.archive hdfs://hadoop-master:54310/user/spark/share/lib/spark-libs.jar
\ No newline at end of file