diff --git a/README.md b/README.md
index 4f4a413..f51188a 100644
--- a/README.md
+++ b/README.md
@@ -8,31 +8,31 @@ ## 1. Overview of the basic software environment

-### 1. Software versions
+### 1.1 Software versions

- Operating system: CentOS 6
- Java environment: OpenJDK 8
- Hadoop: 2.7.2
-- Spark: 1.6.2/2.1.0
-- Hive: 1.1.1/2.1.1
+- Spark: 2.1.0
+- Hive: 2.1.1
- HBase: 1.2.2
- Zookeeper: 3.4.8
- Images and containers are managed with docker-compose, which also handles cluster orchestration
- All software binaries are downloaded over the network. The self-compiled Hadoop and Protobuf binaries are hosted on Github; the binaries for all other software come from the official Apache mirrors.

-### 2. Image dependencies
+### 1.2 Image dependencies



In the figure above, the grey image (centos:6) is the official docker hub base image. All other images (twinsen/hadoop:2.7.2, etc.) are built on top of the image below them. These dependencies between the images determine the order in which they must be built.

## 2. How to use

-### 1. Install docker
+### 2.1 Install docker

Search online for installation instructions for your platform. Once installation is complete, run docker info at the command line to test it; output like the figure below indicates a successful installation.



-### 2. Build the images
+### 2.2 Build the images

First, download the project archive ( https://github.com/ruoyu-chen/hadoop-docker/archive/1.1.zip ) and extract it to any directory.

Next, from the project root directory (the one containing docker-compose-build-all.yml), run the following commands in order at the system command line to build the images:

@@ -44,30 +44,28 @@ `docker pull centos:6`

-- Build the base operating system and OpenJDK environment, containing CentOS 6 and OpenJDK 8
+- Pull the base operating system and OpenJDK environment, containing CentOS 6 and OpenJDK 8

-`docker-compose -f docker-compose-build-all.yml build os-jvm`
+`docker pull twinsen/os-jvm:centos6-openjdk8`

-- Build the Hadoop environment, containing Hadoop 2.7.2
+- Pull the Hadoop environment, containing Hadoop 2.7.2

-`docker-compose -f docker-compose-build-all.yml build hadoop`
+`docker pull twinsen/hadoop:2.7.2`

-- Build the Hive environment, containing Hive 2.1.1
+- Pull the Hive environment, containing Hive 2.1.1

-`docker-compose -f docker-compose-build-all.yml build hive-2.1.1`
+`docker pull twinsen/hive:2.1.1`

-- Build the Spark environment, containing Spark 2.1.0
+- Pull the Spark environment, containing Spark 2.1.0

-`docker-compose -f docker-compose-build-all.yml build spark-2.1.0`
+`docker pull twinsen/spark:2.1.0`

-### 3. Starting and stopping the cluster
+### 2.3 Preparing the environment

After completing the image build step above, you can run the docker images command at the system command line to list the images currently in the local docker environment, as shown below:



For convenience, a docker-compose.yml file is provided in the project root; it is pre-configured with a Spark cluster made up of 3 slave nodes and 1 master node.

-The following briefly describes the steps to start and stop the Spark cluster (all steps are performed at the command line, from the project root directory)
-
-- Initialization
+Before using the cluster, the following initialization must be completed first
#[Create the containers]
@@ -76,8 +74,21 @@ docker-compose up -d
docker-compose exec spark-master hdfs namenode -format
#[Initialize the Hive database. Run only once, before starting the cluster for the first time]
docker-compose exec spark-master schematool -dbType mysql -initSchema
+#[Package the Spark-related jar files into an archive named spark-libs.jar, stored in the /code directory]
+docker-compose exec spark-master jar cv0f /code/spark-libs.jar -C /root/spark/jars/ .
+#[Start HDFS]
+docker-compose exec spark-master start-dfs.sh
+#[Create the /user/spark/share/lib/ directory in HDFS]
+docker-compose exec spark-master hadoop fs -mkdir -p /user/spark/share/lib/
+#[Upload /code/spark-libs.jar to the /user/spark/share/lib/ directory in HDFS]
+docker-compose exec spark-master hadoop fs -put /code/spark-libs.jar /user/spark/share/lib/
+#[Stop HDFS]
+docker-compose exec spark-master stop-dfs.sh
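The spark-libs.jar archive staged in HDFS above is normally consumed through Spark's `spark.yarn.archive` property, which lets YARN jobs reuse the cached Spark jars instead of uploading them with every application. A minimal sketch of the corresponding setting, assuming the images keep their Spark configuration under /root/spark/conf (only /root/spark/jars/ is shown above, so the conf path is an assumption) and that fs.defaultFS points at the cluster's HDFS:

```
# spark-defaults.conf (sketch): reuse the jar archive uploaded to HDFS above
spark.yarn.archive    hdfs:///user/spark/share/lib/spark-libs.jar
```

If the images already ship this property, the upload steps above are all that is required.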
+### 2.4 Starting and stopping the cluster
+
+The following briefly describes the steps to start and stop the Spark cluster (all steps are performed at the command line, from the project root directory)
- Start the cluster processes by executing, in order:
@@ -102,7 +113,7 @@ docker-compose exec spark-master stop-dfs.sh
docker-compose down
-### 4. Using the cluster during development and testing
+### 2.5 Using the cluster during development and testing
The cluster currently uses an allocation of 1 master node and 3 slave nodes. It can be scaled out by adjusting the docker-compose configuration file and the configuration files of the corresponding software; automated scaling is not yet supported.
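As a rough illustration of such a manual scale-out, a fourth slave can be added by duplicating one of the existing slave service definitions and registering the new hostname with Hadoop and Spark. A minimal sketch following the volume and hostname pattern of the existing slaves (see the docker-compose.yml changes further below); the image tag and the network name `spark-net` are assumptions and must match whatever the existing services use:

```yaml
  hadoop-slave4:
    image: twinsen/spark:2.1.0            # assumption: same image as the other slave nodes
    volumes:
      - "./volume/hadoop/work/slave4:/works"
      - "./volume/hadoop/logs/slave4:/root/hadoop/logs/"
      - "./volume/spark/logs/slave4:/root/spark/logs/"
      - "./volume/hadoop/tmp/slave4:/tmp"
      - "./volume/ro_data:/ro_data:ro"
    hostname: hadoop-slave4
    networks:
      - spark-net                         # placeholder: reuse the network the existing services join
```

The new hostname also has to be added to the Hadoop and Spark worker lists (e.g. the slaves files) carried by the images or their mounted configuration before the extra node will join the cluster.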
diff --git a/docker-compose-build-all.yml b/docker-compose-build-all.yml
index 22eba5e..2a67519 100755
--- a/docker-compose-build-all.yml
+++ b/docker-compose-build-all.yml
@@ -1,17 +1,21 @@
version: '2'
services:
os-jvm:
- build: ./os-jvm-docker/centos6-openjdk8
+ build: ./services/os-jvm-docker/centos6-openjdk8
image: twinsen/os-jvm:centos6-openjdk8
+ os7-jvm:
+ build: ./services/os-jvm-docker/centos7-openjdk8
+ image: twinsen/os-jvm:centos7-openjdk8
+
hadoop:
- build: ./hadoop-docker/2.7.2
+ build: ./services/hadoop-docker/2.7.2
image: twinsen/hadoop:2.7.2
hive-2.1.1:
- build: ./hive-docker/2.1.1
+ build: ./services/hive-docker/2.1.1
image: twinsen/hive:2.1.1
spark-2.1.0:
- build: ./spark-docker/2.1.0
+ build: ./services/spark-docker/2.1.0
image: twinsen/spark:2.1.0
\ No newline at end of file
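With the README now documenting `docker pull`, this compose file remains the way to rebuild the images from source; the build contexts have simply moved under ./services/ and a CentOS 7 base variant (os7-jvm) has been added. Usage is unchanged, for example (run from the project root):

```
docker-compose -f docker-compose-build-all.yml build os-jvm       # CentOS 6 + OpenJDK 8 base image
docker-compose -f docker-compose-build-all.yml build os7-jvm      # new CentOS 7 + OpenJDK 8 variant
docker-compose -f docker-compose-build-all.yml build hadoop hive-2.1.1 spark-2.1.0
```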
diff --git a/docker-compose.yml b/docker-compose.yml
index e8335fc..234dca2 100755
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -7,6 +7,7 @@ services:
- "./volume/hadoop/work/slave1:/works"
- "./volume/hadoop/logs/slave1:/root/hadoop/logs/"
- "./volume/spark/logs/slave1:/root/spark/logs/"
+ - "./volume/hadoop/tmp/slave1:/tmp"
- "./volume/ro_data:/ro_data:ro"
hostname: hadoop-slave1
networks:
@@ -22,6 +23,7 @@ services:
- "./volume/hadoop/work/slave2:/works"
- "./volume/hadoop/logs/slave2:/root/hadoop/logs/"
- "./volume/spark/logs/slave2:/root/spark/logs/"
+ - "./volume/hadoop/tmp/slave2:/tmp"
- "./volume/ro_data:/ro_data:ro"
hostname: hadoop-slave2
networks:
@@ -37,6 +39,7 @@ services:
- "./volume/hadoop/work/slave3:/works"
- "./volume/hadoop/logs/slave3:/root/hadoop/logs/"
- "./volume/spark/logs/slave3:/root/spark/logs/"
+ - "./volume/hadoop/tmp/slave3:/tmp"
- "./volume/ro_data:/ro_data:ro"
hostname: hadoop-slave3
networks:
@@ -68,6 +71,7 @@ services:
- "./volume/hadoop/work/master:/works"
- "./volume/hadoop/logs/master:/root/hadoop/logs/"
- "./volume/spark/logs/master:/root/spark/logs/"
+ - "./volume/hadoop/tmp/master:/tmp"
- "./volume/code:/code"
- "./volume/ro_data:/ro_data:ro"
container_name: spark-master
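The new `/tmp` bind mounts persist each node's temporary data on the host, so anything Hadoop writes under its default hadoop.tmp.dir (/tmp/hadoop-${user.name}, unless core-site.xml overrides it) survives container re-creation. A quick sanity check after `docker-compose up -d`, as a sketch:

```
# contents should also appear under ./volume/hadoop/tmp/master on the host
docker-compose exec spark-master ls /tmp
```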
diff --git a/services/hadoop-docker/2.7.2/config/hadoop/core-site.xml b/services/hadoop-docker/2.7.2/config/hadoop/core-site.xml
index a499909..35bed8e 100644
--- a/services/hadoop-docker/2.7.2/config/hadoop/core-site.xml
+++ b/services/hadoop-docker/2.7.2/config/hadoop/core-site.xml
@@ -17,6 +17,10 @@