Commit e7090a6 — Merge pull request #97 from MyYuan/master: update docs of ipfs&dnn
2 parents 8d80600 + ff2e713

File tree: 11 files changed (+94, −32 lines)


docs/source/introduction/concepts.md — 1 addition, 1 deletion

@@ -28,7 +28,7 @@ There are two kinds of tasks in PaddleDTX:
 ## Algorithms
 The algorithms in PaddleDTX are, generally, machine learning algorithms adapted for distributed execution, i.e. federated learning algorithms.
-Currently open-sourced are **vertical federated learning** algorithms, including **multiple linear regression** and **multiple logistic regression**.
+Currently open-sourced are **vertical federated learning** algorithms, including **multiple linear regression**, **multiple logistic regression**, and **neural networks**.

 ## Training samples and prediction datasets
 Training samples and prediction datasets in PaddleDTX are stored as files in the decentralized storage network and are specified by the **requester node** when a training or prediction task is published.

docs/source/others/issues.md — 2 additions, 2 deletions

@@ -26,11 +26,11 @@ A: In real deployments, users can set up nodes according to their needs; storage nodes …

 **Q: Which model training algorithms are currently supported?**

-A: Vertical federated learning algorithms for linear regression and logistic regression have been open-sourced; vertical algorithms such as decision trees and deep neural networks, as well as horizontal federated learning algorithms, will be open-sourced later. Stay tuned.
+A: Vertical federated learning algorithms for linear regression, logistic regression, and neural networks have been open-sourced; a vertical decision-tree algorithm and horizontal federated learning algorithms will be open-sourced later. Stay tuned.

 **Q: Which storage engines does the decentralized storage XuperDB support? Is IPFS supported?**

-A: Only the local file system is supported at the moment; storage nodes will later support NAS, NFS, IPFS, etc., and IPFS support will be open-sourced in the next release.
+A: Yes, it was open-sourced in version 2.0. Users can choose the storage method by changing the type setting under the storage node's storage.mode; the local file system and IPFS are currently supported.

 **Q: Where do the sample data used for model training and prediction come from? How does a data consumer find the data it needs?**
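The type switch described in the answer above amounts to a small change in the storage node's configuration. A minimal sketch selecting IPFS, reusing the path, host, and timeout values that appear in the xdb-config examples of this same commit:

```toml
# Storage mode used by the storage node: `local` or `ipfs`
[storage.mode]
type = "ipfs"

[storage.mode.local]
# Used when type = "local": location of file fragments
rootPath = "/root/xdb/data/slices"

[storage.mode.ipfs]
# Used when type = "ipfs": peers in the IPFS cluster
hosts = [
    "127.0.0.1:5001",
    "127.0.0.1:5002"
]
# Timeout for IPFS requests, in milliseconds
timeout = 5000
```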

docs/source/others/ongoing.md — 1 addition, 1 deletion

@@ -2,7 +2,7 @@

 The main features we plan to support next are:

-1. More machine learning algorithms and their distributed adaptations, mainly including neural networks, decision trees, etc.;
+1. More machine learning algorithms and their distributed adaptations, such as a decision-tree algorithm;
 2. Horizontal federated learning algorithms, starting with adaptations of multiple linear regression and multiple logistic regression;
 3. Performance optimization of Paillier, the additive homomorphic algorithm currently in use;
 4. A load-balancing strategy for the decentralized storage service that, based on each storage node's remaining resources and past performance, finds the optimal node list when distributing files;

docs/source/projectcases/dnn-paddlefl.md — 6 additions, 4 deletions

@@ -24,17 +24,19 @@
 Usage:
 ./paddledtx_test.sh <mode> [-f <sample files>] [-m <model task id>] [-i <task id>]
 <mode> - one of 'upload_sample_files', 'start_vl_linear_train', 'start_vl_linear_predict', 'start_vl_logistic_train'
-    'start_vl_logistic_predict', 'tasklist', 'gettaskbyid'
+    'start_vl_logistic_predict', 'start_vl_dnn_train', 'start_vl_dnn_predict', 'tasklist', 'gettaskbyid'
 - 'upload_sample_files' - save linear and logistic sample files into XuperDB
 - 'start_vl_linear_train' - start vertical linear training task
 - 'start_vl_linear_predict' - start vertical linear prediction task
 - 'start_vl_logistic_train' - start vertical logistic training task
 - 'start_vl_logistic_predict' - start vertical logistic prediction task
-- 'start_vl_dnn_train' - start vertical logistic training task
-- 'start_vl_dnn_predict' - start vertical logistic prediction task
+- 'start_vl_dnn_train' - start vertical paddlefl-dnn training task
+- 'start_vl_dnn_predict' - start vertical paddlefl-dnn prediction task
 - 'tasklist' - list task in PaddleDTX
 - 'gettaskbyid' - get task by id from PaddleDTX
 -f <sample files> - linear or logistic sample files
+-e <model evaluation> - whether to perform model evaluation on the training task, default false; if true, the evaluation rule is 'Cross Validation'
+-l <live model evaluation> - whether to perform live model evaluation, default false
 -m <model task id> - finished train task ID from which to obtain the model, required for predict task
 -i <task id> - training or prediction task id

@@ -46,7 +48,7 @@ Usage:
 ./paddledtx_test.sh start_vl_logistic_train -f b31f53a5-0f8b-4f57-a7ea-956f1c7f7991,f3dddade-1f52-4b9e-9253-835e9fc81901
 ./paddledtx_test.sh start_vl_logistic_predict -f 1e97d684-722f-4798-aaf0-dffe955a94ba,b51a927c-f73e-4b8f-a81c-491b9e938b4d -m d8c8865c-a837-41fd-802b-8bd754b648eb
 ./paddledtx_test.sh start_vl_dnn_train -f 34cf2ee3-81b2-4865-907d-a9eab3c5b384,9dc7e0b7-18dd-4d5a-a3a1-6dace6d04fc8,3eaee2ea-4680-4b0b-bde3-ab4a4949159e
-./paddledtx_test.sh start_vl_dnn_predict -f c21b367f-2cb8-4859-87d8-18c52d397b13,043b9f55-68f6-4587-be8b-2340ea4432c2,b36442b6-ea3d-4530-910a-ec44291cd66c -m 91d9c0b7-996b-4954-86e8-95048e91a3b8
+./paddledtx_test.sh start_vl_dnn_predict -f 25ec6fd0-904e-4737-9bcc-c1cc11df1170,4442acae-90a2-4b92-b05f-cf1503c9d55e,73176b51-07f1-4f50-82c8-2d9d8908849b -m d8c8865c-a837-41fd-802b-8bd754b648eb
 ./paddledtx_test.sh gettaskbyid -i 9b3ff4be-bfcd-4520-a23b-4aa6ea4d59f1
 ./paddledtx_test.sh tasklist
 ```
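A common flow with this script is to run a training command, capture the returned task id, and pass it to the matching predict command via -m. The snippet below sketches only the id-extraction step; the `TaskID:` output format is an assumption for illustration, not the script's documented output:

```shell
#!/bin/sh
# Hypothetical train output; the real paddledtx_test.sh output format may differ.
train_output="TaskID: d8c8865c-a837-41fd-802b-8bd754b648eb"

# Extract the id that follows the "TaskID:" label
model_task_id=$(printf '%s\n' "$train_output" | awk -F': ' '/TaskID/ {print $2}')

# The id would then feed the predict command, e.g.:
# ./paddledtx_test.sh start_vl_dnn_predict -f <prediction files> -m "$model_task_id"
echo "$model_task_id"
```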

docs/source/projectcases/linear.md — 7 additions, 1 deletion

@@ -43,15 +43,19 @@
 Usage:
 ./paddledtx_test.sh <mode> [-f <sample files>] [-m <model task id>] [-i <task id>]
 <mode> - one of 'upload_sample_files', 'start_vl_linear_train', 'start_vl_linear_predict', 'start_vl_logistic_train'
-    'start_vl_logistic_predict', 'tasklist', 'gettaskbyid'
+    'start_vl_logistic_predict', 'start_vl_dnn_train', 'start_vl_dnn_predict', 'tasklist', 'gettaskbyid'
 - 'upload_sample_files' - save linear and logistic sample files into XuperDB
 - 'start_vl_linear_train' - start vertical linear training task
 - 'start_vl_linear_predict' - start vertical linear prediction task
 - 'start_vl_logistic_train' - start vertical logistic training task
 - 'start_vl_logistic_predict' - start vertical logistic prediction task
+- 'start_vl_dnn_train' - start vertical paddlefl-dnn training task
+- 'start_vl_dnn_predict' - start vertical paddlefl-dnn prediction task
 - 'tasklist' - list task in PaddleDTX
 - 'gettaskbyid' - get task by id from PaddleDTX
 -f <sample files> - linear or logistic sample files
+-e <model evaluation> - whether to perform model evaluation on the training task, default false; if true, the evaluation rule is 'Cross Validation'
+-l <live model evaluation> - whether to perform live model evaluation, default false
 -m <model task id> - finished train task ID from which to obtain the model, required for predict task
 -i <task id> - training or prediction task id

@@ -62,6 +66,8 @@ Usage:
 ./paddledtx_test.sh start_vl_linear_predict -f cb40b8ad-db08-447f-a9d9-628b69d01660,2a8a45ab-3c5d-482e-b945-bc45b7e28bf9 -m 9b3ff4be-bfcd-4520-a23b-4aa6ea4d59f1
 ./paddledtx_test.sh start_vl_logistic_train -f b31f53a5-0f8b-4f57-a7ea-956f1c7f7991,f3dddade-1f52-4b9e-9253-835e9fc81901
 ./paddledtx_test.sh start_vl_logistic_predict -f 1e97d684-722f-4798-aaf0-dffe955a94ba,b51a927c-f73e-4b8f-a81c-491b9e938b4d -m d8c8865c-a837-41fd-802b-8bd754b648eb
+./paddledtx_test.sh start_vl_dnn_train -f 34cf2ee3-81b2-4865-907d-a9eab3c5b384,9dc7e0b7-18dd-4d5a-a3a1-6dace6d04fc8,3eaee2ea-4680-4b0b-bde3-ab4a4949159e
+./paddledtx_test.sh start_vl_dnn_predict -f 25ec6fd0-904e-4737-9bcc-c1cc11df1170,4442acae-90a2-4b92-b05f-cf1503c9d55e,73176b51-07f1-4f50-82c8-2d9d8908849b -m d8c8865c-a837-41fd-802b-8bd754b648eb
 ./paddledtx_test.sh gettaskbyid -i 9b3ff4be-bfcd-4520-a23b-4aa6ea4d59f1
 ./paddledtx_test.sh tasklist
 ```

docs/source/quickstart/compile-install.md — 21 additions, 4 deletions

@@ -131,10 +131,19 @@ PaddleDTX is developed in golang; when you compile and install it from source …
 userName = "Admin"
 orgName = "org1"

+[storage.prover]
+localRoot = "/root/xdb/data/prove"
+
 [storage.mode]
-type = "local"
-[storage.mode.local]
-rootPath = "./slices"
+type = "local"
+[storage.mode.local]
+rootPath = "/root/xdb/data/slices"
+[storage.mode.ipfs]
+hosts = [
+    "127.0.0.1:5001",
+    "127.0.0.1:5002"
+]
+timeout = 5000

 [storage.monitor]
 challengingSwitch = "on"

@@ -145,7 +154,7 @@ PaddleDTX is developed in golang; when you compile and install it from source …
 level = "debug"
 path = "./logs"
 ```
-Here, listenAddress and publicAddress specify the address the service listens on and the address it exposes; the blockchain section uses the account mnemonic, contract account, and contract name created when the blockchain network was deployed; rootPath specifies the local path for file storage.
+Here, listenAddress and publicAddress specify the address the service listens on and the address it exposes; the blockchain section specifies the account mnemonic, contract account, and contract name created when the blockchain network was deployed; storage.mode defines the storage method the storage node uses, supporting the local file system and IPFS.

@@ -246,6 +255,7 @@ PaddleDTX is developed in golang; when you compile and install it from source …
 Keep the key pair you created safe; you will use it frequently in later configuration and on the command line.
 !!! note ""
     Note: after a task is published, the executor nodes send file-authorization requests to the data-owner nodes, which can approve or reject the sample-file authorization requests.
+    The currently open-sourced multiple linear regression and multiple logistic regression algorithms use two executor nodes, while the neural-network algorithm needs three; if you want to use neural networks, deploy 3 executor nodes.

 1. Prepare the configuration of the two executor nodes
 ```

@@ -261,6 +271,10 @@ PaddleDTX is developed in golang; when you compile and install it from source …
 # executor1
 listenAddress = ":8184"
 publicAddress = "127.0.0.1:8184"
+# Address of the container that provides the PaddleFL runtime
+paddleFLAddress = "paddlefl-env1:38302"
+paddleFLRole = 0
+
 # private key created by genkey
 keyPath = "./keys"

@@ -293,6 +307,9 @@ PaddleDTX is developed in golang; when you compile and install it from source …
 # executor2
 listenAddress = ":8185"
 publicAddress = "127.0.0.1:8185"
+# Address of the container that provides the PaddleFL runtime
+paddleFLAddress = "paddlefl-env1:38303"
+paddleFLRole = 2
 # private key created by genkey
 keyPath = "./keys"
docs/source/quickstart/quickstart.md — 24 additions, 10 deletions

@@ -15,10 +15,14 @@
 $ git clone git@github.com:PaddlePaddle/PaddleDTX.git
 $ cd PaddleDTX/scripts
 $ sh network_up.sh start
-$ # To support the three-party DNN algorithm you need to start PaddleFL nodes; run the following command instead of the one above
-$ # sh network_up.sh start -p true
+
+# To start DAI on top of the ipfs storage network:
+$ sh network_up.sh start -s ipfs
+# To support the three-party DNN algorithm you need to start PaddleFL nodes; run the following command instead of the ones above:
+$ sh network_up.sh start -p true
 ```

 The script can also tear the network down quickly:
 ```
 $ sh network_up.sh stop

@@ -30,32 +34,38 @@ $ sh network_up.sh stop

 We recommend installing in a Linux environment; if you start on a Mac, raise Docker's resource limits to CPUs > 4, Memory > 4GB, Swap > 4GB.

-After the network starts, docker ps shows the services the script launched: 3 blockchain nodes, 2 data-owner nodes, 3 storage nodes, and 2 trusted computing nodes.
+After the network starts, docker ps shows the services the script launched: 3 blockchain nodes, 3 data-owner nodes, 3 storage nodes, and 3 trusted computing nodes; if you start with `sh network_up.sh start -s ipfs -p true`, one ipfs node and 3 paddlefl nodes are launched as well.

 If you do not need model training, you can start only the decentralized storage network (XuperDB); see [XuperDB service startup and command usage](https://github.com/PaddlePaddle/PaddleDTX/tree/master/xdb/scripts):
 ``` shell
 # Start XuperDB based on XChain
 $ cd PaddleDTX/xdb/scripts
-$ sh network_up.sh start
+$ sh network_up.sh start -b xchain

 # Start XuperDB based on a Fabric network
 $ cd PaddleDTX/xdb/scripts
-$ sh network_up.sh start fabric
+$ sh network_up.sh start -b fabric
+
+# Start XuperDB with the ipfs storage network
+$ cd PaddleDTX/xdb/scripts
+$ sh network_up.sh start -b xchain -s ipfs
 ```

 ### 1.3 Publishing and running tasks
 The ./paddledtx_test.sh script provides shortcuts for uploading and downloading files, publishing training and prediction tasks, and more:
 ``` shell
 Usage:
 ./paddledtx_test.sh <mode> [-f <sample files>] [-m <model task id>] [-i <task id>]
 <mode> - one of 'upload_sample_files', 'start_vl_linear_train', 'start_vl_linear_predict', 'start_vl_logistic_train'
-    'start_vl_logistic_predict', 'tasklist', 'gettaskbyid'
+    'start_vl_logistic_predict', 'start_vl_dnn_train', 'start_vl_dnn_predict', 'tasklist', 'gettaskbyid'
 - 'upload_sample_files' - save linear and logistic sample files into XuperDB
 - 'start_vl_linear_train' - start vertical linear training task
 - 'start_vl_linear_predict' - start vertical linear prediction task
 - 'start_vl_logistic_train' - start vertical logistic training task
 - 'start_vl_logistic_predict' - start vertical logistic prediction task
+- 'start_vl_dnn_train' - start vertical paddlefl-dnn training task
+- 'start_vl_dnn_predict' - start vertical paddlefl-dnn prediction task
 - 'tasklist' - list task in PaddleDTX
 - 'gettaskbyid' - get task by id from PaddleDTX
 -f <sample files> - linear or logistic sample files

@@ -71,26 +81,30 @@ Usage:
 ./paddledtx_test.sh start_vl_linear_predict -f cb40b8ad-db08-447f-a9d9-628b69d01660,2a8a45ab-3c5d-482e-b945-bc45b7e28bf9 -m 9b3ff4be-bfcd-4520-a23b-4aa6ea4d59f1
 ./paddledtx_test.sh start_vl_logistic_train -f b31f53a5-0f8b-4f57-a7ea-956f1c7f7991,f3dddade-1f52-4b9e-9253-835e9fc81901
 ./paddledtx_test.sh start_vl_logistic_predict -f 1e97d684-722f-4798-aaf0-dffe955a94ba,b51a927c-f73e-4b8f-a81c-491b9e938b4d -m d8c8865c-a837-41fd-802b-8bd754b648eb
+./paddledtx_test.sh start_vl_dnn_train -f 34cf2ee3-81b2-4865-907d-a9eab3c5b384,9dc7e0b7-18dd-4d5a-a3a1-6dace6d04fc8,3eaee2ea-4680-4b0b-bde3-ab4a4949159e
+./paddledtx_test.sh start_vl_dnn_predict -f 25ec6fd0-904e-4737-9bcc-c1cc11df1170,4442acae-90a2-4b92-b05f-cf1503c9d55e,73176b51-07f1-4f50-82c8-2d9d8908849b -m d8c8865c-a837-41fd-802b-8bd754b648eb
 ./paddledtx_test.sh gettaskbyid -i 9b3ff4be-bfcd-4520-a23b-4aa6ea4d59f1
 ./paddledtx_test.sh tasklist
 ```
 !!! note "Notes"

     Run cat ./paddledtx_test.sh to see the storage namespaces and upload file list the script creates by default; customize them if you have additional needs;
-    The start_vl_linear_train, start_vl_linear_predict, start_vl_logistic_train, and start_vl_logistic_predict commands essentially walk you through the Boston house-price prediction and iris classification project cases; see [Project cases](../projectcases/linear.md)
+    The start_vl_linear_train, start_vl_linear_predict, start_vl_logistic_train, start_vl_logistic_predict, start_vl_dnn_train, and start_vl_dnn_predict commands essentially walk you through project cases for the multiple linear regression, multiple logistic regression, and neural-network algorithms; see [Project cases](../projectcases/linear.md)

 1. Upload training and prediction sample files
 ```shell
-# upload_sample_files creates storage namespaces for data-owner nodes A/B and uploads the sample files needed for training and prediction tasks
-# This command uploads 8 files in total: the files data owners A/B need to publish vertical linear regression and vertical logistic regression training and prediction tasks
+# upload_sample_files creates storage namespaces for data-owner nodes A/B/C and uploads the sample files needed for training and prediction tasks
+# This command uploads 14 files in total: the 8 sample files data owners A/B need to publish vertical linear and logistic regression training and prediction tasks, plus the 6 sample files data owners A/B/C need to publish vertical deep neural network training and prediction tasks
 ./paddledtx_test.sh upload_sample_files

 # After execution the command returns:
 # Vertical linear train sample files: sample IDs needed by the vertical linear training task
 # Vertical linear prediction sample files: sample IDs needed by the vertical linear prediction task
 # Vertical logistic train sample files: sample IDs needed by the vertical logistic regression training task
 # Vertical logistic prediction sample files: sample IDs needed by the vertical logistic regression prediction task
+# PaddleFL train sample files: sample IDs needed by the vertical deep neural network training task
+# PaddleFL prediction sample files: sample IDs needed by the vertical deep neural network prediction task
 ```
 2. Start a vertical linear regression training task; $vlLinTrainfiles is the "Vertical linear train sample files" value obtained in step 1

docs/source/tutorial/dai-config.md — 7 additions, 1 deletion

@@ -43,6 +43,12 @@ listenAddress = ":8184"
 # If your network mode is 'host', it is the machine's ip and the port in [server].listenAddress in before section.
 publicAddress = "10.144.94.17:8184"

+# PaddleFLAddress is the endpoint of the container which has a running environment of PaddleFL.
+# Containers belonging to different executors constitute an MPC network.
+paddleFLAddress = "paddlefl-env1:38302"
+# PaddleFLRole is the role of the container in the paddlefl MPC network.
+paddleFLRole = 0
+
 # The private key of the trusted computing server.
 # Different keys express different identities.
 # Only need to choose one from 'privateKey' and 'keyPath'; if both exist, 'privateKey' takes precedence over 'keyPath'

@@ -132,7 +138,7 @@ path = "./logs"

 !!! note "Configuration notes"

-    1. The executor node config sets the port the node listens on, its identity, and other startup information;
+    1. The executor node config sets the port the node listens on, its identity, and other startup information; paddleFLAddress defines the address of the container needed to run the neural-network algorithm;
     2. executor.mode specifies the node's computing mode, either proxy or self-computation: in proxy mode data-owner nodes authorize sample data to the executor node for proxy computation, while self-computation mode suits scenarios where the computing node is a client of the data-owner node;
     3. executor.storage defines the storage paths for models, evaluation results, and prediction results; prediction results can be stored encrypted in the decentralized storage network;
     4. executor.blockchain defines the blockchain network configuration the executor node operates on; currently only the XChain network is supported, with Fabric to follow;

docs/source/tutorial/xdb-config.md — 18 additions, 3 deletions

@@ -159,12 +159,26 @@ publicAddress = "10.144.94.17:8122"
 userName = "Admin"
 orgName = "org1"

-# The storage mode used by the storage node, currently only supports local file system.
+# Prover answers challenges from DataOwner to prove that the node is storing the slices
+[storage.prover]
+# local storage path to keep temporary data
+localRoot = "/root/xdb/data/prove"
+
+# The storage mode used by the storage node, currently supports local file system and IPFS.
 [storage.mode]
+# Denotes which mode you choose, `local` or `ipfs`.
 type = "local"
 [storage.mode.local]
 # Location of file fragments
 rootPath = "/root/xdb/data/slices"
+[storage.mode.ipfs]
+# Denotes peers in the IPFS cluster
+hosts = [
+    "127.0.0.1:5001",
+    "127.0.0.1:5002"
+]
+# The timeout for requesting IPFS, in milliseconds
+timeout = 5000

 # The monitor will query new tasks in blockchain regularly, and trigger the task handler's operations
 [storage.monitor]

@@ -190,8 +204,9 @@ path = "./logs"

 !!! note "Configuration notes"

     1. storage.blockchain defines the configuration the node needs to operate on the blockchain network; XChain and Fabric networks are currently supported;
-    2. storage.mode specifies the storage node's storage method; only the local file system is supported at the moment, with IPFS, NAS, etc. to follow;
-    3. storage.monitor enables the storage node's heartbeat detection, configures the file-cleanup interval, and so on;
+    2. storage.prover specifies the local path for temporary data kept while answering storage challenges;
+    3. storage.mode specifies the storage node's storage method; the local file system and IPFS are currently supported;
+    4. storage.monitor enables the storage node's heartbeat detection, configures the file-cleanup interval, and so on;


<br>
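To sanity-check that the peers listed under hosts are reachable, one can probe the go-ipfs HTTP API's version endpoint on each; a minimal sketch, with the actual network probe left commented out so the loop only prints the URLs it would hit:

```shell
#!/bin/sh
# Peers from the [storage.mode.ipfs] hosts list above
hosts="127.0.0.1:5001 127.0.0.1:5002"

for h in $hosts; do
    # go-ipfs exposes its HTTP API via POST, e.g. /api/v0/version
    echo "POST http://$h/api/v0/version"
    # curl -s -X POST "http://$h/api/v0/version"   # uncomment against a running daemon
done
```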
