From 6bd9ddc673ed89c6091978e85512a035577d82b5 Mon Sep 17 00:00:00 2001 From: liuhaichaogithub <40063899+liuhaichaogithub@users.noreply.github.com> Date: Tue, 13 Jun 2023 11:19:50 +0800 Subject: [PATCH 1/2] Update README.md (#890) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 在文档示例中 docker exec -it confs-10000_client_1 bash 命令错误,根据实际部署情况发现 docker 的容器名为 confs-10000-client-1 Signed-off-by: liuhaichaogithub <40063899+liuhaichaogithub@users.noreply.github.com> --- docker-deploy/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-deploy/README.md b/docker-deploy/README.md index 71e04b4e1..72715e0c9 100644 --- a/docker-deploy/README.md +++ b/docker-deploy/README.md @@ -197,7 +197,7 @@ aa0a0002de93 mysql:8.0.28 "docker-entrypoint.s On the target node of each party, a container named `confs-_fateflow_1` should have been created and running the `fate-flow` service. For example, on Party 10000's node, run the following commands to verify the deployment: ```bash -docker exec -it confs-10000_client_1 bash +docker exec -it confs-10000-client-1 bash flow test toy --guest-party-id 10000 --host-party-id 9999 ``` From e552f5ab2554f1916a88773d2d4010b8f4627200 Mon Sep 17 00:00:00 2001 From: Chenlong Ma Date: Mon, 3 Jul 2023 17:55:08 +0800 Subject: [PATCH 2/2] Support FATE v1.11.2 (#898) * Update version tag Signed-off-by: Chenlong Ma * update FATE config Signed-off-by: Chenlong Ma * Add all algorithm adaptations add fix container permissions issue Signed-off-by: Chenlong Ma * Fix docker_deploy.sh --delete when serving_ip_list does not exist Signed-off-by: Chenlong Ma * Update doc of docker compose Signed-off-by: Chenlong Ma * Update chart support llm Signed-off-by: Chenlong Ma * Update fluentd to fluent-bit Signed-off-by: Chenlong Ma * Update docs, fixed #892 Signed-off-by: Chenlong Ma * Fix spark image suffix Signed-off-by: Chenlong Ma * Remove the LLM tag Signed-off-by: Chenlong Ma * add volume for llm 
Signed-off-by: Chenlong Ma --------- Signed-off-by: Chenlong Ma --- docker-deploy/.env | 2 +- docker-deploy/README.md | 37 +-- docker-deploy/README_zh.md | 82 +++++-- docker-deploy/docker_deploy.sh | 12 + docker-deploy/generate_config.sh | 27 +-- docker-deploy/parties.conf | 7 +- .../eggroll/conf/applicationContext-proxy.xml | 2 + .../backends/eggroll/conf/eggroll.properties | 19 +- .../backends/eggroll/conf/log4j2.properties | 32 ++- .../backends/eggroll/conf/whitelist.json | 46 ++-- .../docker-compose-eggroll.yml | 9 +- .../docker-compose-spark-slim.yml | 3 +- .../docker-compose-spark.yml | 3 +- .../public/client/pipeline_conf.yaml | 3 - .../public/fate_flow/conf/service_conf.yaml | 10 + .../fateboard/conf/application.properties | 6 +- .../mysql/init/create-eggroll-meta-tables.sql | 222 +++++++++++------ ...nd_Computational_Acceleration_Selection.md | 13 + docs/Manage_FATE_and_FATE-Serving_Version.md | 8 +- ...ster_in_One_Linux_Machine_with_MiniKube.md | 18 +- ...r_in_One_Linux_Machine_with_MiniKube_zh.md | 18 +- helm-charts/FATE-Exchange/Chart.yaml | 4 +- .../values-template-example.yaml | 2 +- helm-charts/FATE-Exchange/values.yaml | 2 +- helm-charts/FATE/Chart.yaml | 4 +- .../templates/backends/eggroll/_helpers.tpl | 3 + .../templates/backends/eggroll/configmap.yaml | 19 +- .../eggroll/nodemanager/statefulSet.yaml | 13 +- .../backends/eggroll/rollsite/deployment.yaml | 2 +- .../templates/backends/spark/_helpers.tpl | 3 + helm-charts/FATE/templates/core/_helpers.tpl | 3 + .../templates/core/fateboard/configmap.yaml | 3 + .../templates/core/fateflow/configmap.yaml | 30 ++- .../FATE/templates/core/mysql/configmap.yaml | 224 +++++++++++------- .../templates/core/mysql/statefulSet.yaml | 3 + .../FATE/templates/core/python-spark.yaml | 7 +- helm-charts/FATE/values-template-example.yaml | 9 +- helm-charts/FATE/values-template.yaml | 2 +- helm-charts/FATE/values.yaml | 9 +- helm-charts/UpgradeManager/values.yaml | 4 +- k8s-deploy/README.md | 8 +- 
k8s-deploy/README_zh.md | 6 +- k8s-deploy/cluster-spark-pulsar.yaml | 8 +- k8s-deploy/cluster-spark-rabbitmq.yaml | 4 +- k8s-deploy/cluster-spark-slim.yaml | 4 +- k8s-deploy/cluster.yaml | 4 +- k8s-deploy/examples/README.md | 2 +- .../examples/party-10000/cluster-gpu.yaml | 2 +- .../cluster-spark-local-pulsar.yaml | 2 +- .../party-10000/cluster-spark-pulsar.yaml | 2 +- .../party-10000/cluster-spark-rabbitmq.yaml | 2 +- k8s-deploy/examples/party-10000/cluster.yaml | 2 +- .../examples/party-9999/cluster-gpu.yaml | 2 +- .../cluster-spark-local-pulsar.yaml | 2 +- .../party-9999/cluster-spark-pulsar.yaml | 2 +- .../party-9999/cluster-spark-rabbitmq.yaml | 2 +- k8s-deploy/examples/party-9999/cluster.yaml | 2 +- .../examples/party-exchange/rollsite.yaml | 2 +- .../party-exchange/trafficServer.yaml | 2 +- k8s-deploy/examples/party.config | 4 +- k8s-deploy/pkg/job/cluster_install.go | 7 + registry/README.md | 2 +- 62 files changed, 667 insertions(+), 331 deletions(-) delete mode 100644 docker-deploy/training_template/public/client/pipeline_conf.yaml diff --git a/docker-deploy/.env b/docker-deploy/.env index a8c9bd219..48a9ffbc4 100644 --- a/docker-deploy/.env +++ b/docker-deploy/.env @@ -1,5 +1,5 @@ RegistryURI= -TAG=1.11.1-release +TAG=1.11.2-release SERVING_TAG=2.1.6-release SSH_PORT=22 diff --git a/docker-deploy/README.md b/docker-deploy/README.md index 72715e0c9..d673cb483 100644 --- a/docker-deploy/README.md +++ b/docker-deploy/README.md @@ -10,7 +10,7 @@ The nodes (target nodes) to install FATE must meet the following requirements: 2. Docker: 19.03.0+ 3. Docker Compose: 1.27.0+ 4. The deployment machine have access to the Internet, so the hosts can communicate with each other; -5. Network connection to Internet to pull container images from Docker Hub. If network connection to Internet is not available, consider to set up [Harbor as a local registry](../registry/README.md) or use [offline images](https://github.com/FederatedAI/FATE/tree/master/build/docker-build). +5. 
Network connection to Internet to pull container images from Docker Hub. If network connection to Internet is not available, consider to set up [Harbor as a local registry](../registry/README.md) or use [offline images](https://github.com/FederatedAI/FATE-Builder/tree/main/docker-build). 6. A host running FATE is recommended to be with 8 CPUs and 16G RAM. ## Deploying FATE @@ -175,21 +175,30 @@ bash ./docker_deploy.sh 10000 bash ./docker_deploy.sh exchange ``` -Once the commands finish, log in to any host and use `docker ps` to verify the status of the cluster. A sample output is as follows: +Once the commands finish, log in to any host and use `docker compose ps` to verify the status of the cluster. A sample output is as follows: ```bash -CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES -5d2e84ba4c77 federatedai/serving-server:2.1.5-release "/bin/sh -c 'java -c…" 5 minutes ago Up 5 minutes 0.0.0.0:8000->8000/tcp, :::8000->8000/tcp serving-9999_serving-server_1 -3dca43f3c9d5 federatedai/serving-admin:2.1.5-release "/bin/sh -c 'java -c…" 5 minutes ago Up 5 minutes 0.0.0.0:8350->8350/tcp, :::8350->8350/tcp serving-9999_serving-admin_1 -fe924918509b federatedai/serving-proxy:2.1.5-release "/bin/sh -c 'java -D…" 5 minutes ago Up 5 minutes 0.0.0.0:8059->8059/tcp, :::8059->8059/tcp, 0.0.0.0:8869->8869/tcp, :::8869->8869/tcp, 8879/tcp serving-9999_serving-proxy_1 -b62ed8ba42b7 bitnami/zookeeper:3.7.0 "/opt/bitnami/script…" 5 minutes ago Up 5 minutes 0.0.0.0:2181->2181/tcp, :::2181->2181/tcp, 8080/tcp, 0.0.0.0:49226->2888/tcp, :::49226->2888/tcp, 0.0.0.0:49225->3888/tcp, :::49225->3888/tcp serving-9999_serving-zookeeper_1 -3c643324066f federatedai/client:1.11.1-release "/bin/sh -c 'flow in…" 5 minutes ago Up 5 minutes 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp confs-9999_client_1 -3fe0af1ebd71 federatedai/fateboard:1.11.1-release "/bin/sh -c 'java -D…" 5 minutes ago Up 5 minutes 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp confs-9999_fateboard_1 -635b7d99357e 
federatedai/fateflow:1.11.1-release "container-entrypoin…" 5 minutes ago Up 5 minutes (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 8080/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp confs-9999_fateflow_1 -8b515f08add3 federatedai/eggroll:1.11.1-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 8080/tcp, 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp confs-9999_rollsite_1 -108cc061c191 federatedai/eggroll:1.11.1-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 4670/tcp, 8080/tcp confs-9999_clustermanager_1 -f10575e76899 federatedai/eggroll:1.11.1-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 4671/tcp, 8080/tcp confs-9999_nodemanager_1 -aa0a0002de93 mysql:8.0.28 "docker-entrypoint.s…" 5 minutes ago Up 5 minutes 3306/tcp, 33060/tcp confs-9999_mysql_1 +ssh fate@192.168.7.1 +``` + +Verify the instance status using the following command, + +```bash +cd /data/projects/fate/confs-10000 +docker compose ps +```` + +The output is shown as follows. If the status of each component is `Up`, and the status of fateflow is still (healthy), it means that the deployment is successful. 
+ +```bash +NAME IMAGE COMMAND SERVICE CREATED STATUS PORTS +confs-10000-client-1 federatedai/client:1.11.2-release "bash -c 'pipeline i…" client About a minute ago Up About a minute 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp +confs-10000-clustermanager-1 federatedai/eggroll:1.11.2-release "/tini -- bash -c 'j…" clustermanager About a minute ago Up About a minute 4670/tcp +confs-10000-fateboard-1 federatedai/fateboard:1.11.2-release "/bin/sh -c 'java -D…" fateboard About a minute ago Up About a minute 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp +confs-10000-fateflow-1 federatedai/fateflow:1.11.2-release "/bin/bash -c 'set -…" fateflow About a minute ago Up About a minute (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp +confs-10000-mysql-1 mysql:8.0.28 "docker-entrypoint.s…" mysql About a minute ago Up About a minute 3306/tcp, 33060/tcp +confs-10000-nodemanager-1 federatedai/eggroll:1.11.2-release "/tini -- bash -c 'j…" nodemanager About a minute ago Up About a minute 4671/tcp +confs-10000-rollsite-1 federatedai/eggroll:1.11.2-release "/tini -- bash -c 'j…" rollsite About a minute ago Up About a minute 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp ``` ### Verifying the deployment diff --git a/docker-deploy/README_zh.md b/docker-deploy/README_zh.md index 20da699f2..5cabac41a 100644 --- a/docker-deploy/README_zh.md +++ b/docker-deploy/README_zh.md @@ -20,7 +20,7 @@ Compose是用于定义和运行多容器Docker应用程序的工具。通过Comp 2. 所有主机安装Docker 版本 : 19.03.0+; 3. 所有主机安装Docker Compose 版本: 1.27.0+; 4. 部署机可以联网,所以主机相互之间可以网络互通; -5. 运行机已经下载FATE的各组件镜像,如果无法连接dockerhub,请考虑使用harbor([Harbor 作为本地镜像源](../registry/README.md))或者使用离线部署(离线构建镜像参考文档[构建镜像](https://github.com/FederatedAI/FATE/tree/master/build/docker-build))。 +5. 运行机已经下载FATE的各组件镜像,如果无法连接dockerhub,请考虑使用harbor([Harbor 作为本地镜像源](../registry/README.md))或者使用离线部署(离线构建镜像参考文档[构建镜像]( https://github.com/FederatedAI/FATE-Builder/tree/main/docker-build))。 6. 
运行FATE的主机推荐配置8CPUs和16G RAM。 ### 下载部署脚本 @@ -171,44 +171,73 @@ FATE GPU的使用只有fateflow组件,所以每个Party最少需要有一个GP ### 执行部署脚本 +**注意:**在运行以下命令之前,所有目标主机必须 + +* 允许使用 SSH 密钥进行无密码 SSH 访问(否则我们将需要为每个主机多次输入密码)。 +* 满足 [准备工作](#准备工作) 中指定的要求。 + +要将 FATE 部署到所有已配置的目标主机,请使用以下命令: + 以下修改可在任意机器执行。 进入目录`kubeFATE\docker-deploy`,然后运行: ```bash bash ./generate_config.sh # 生成部署文件 -bash ./docker_deploy.sh all # 在各个party上部署FATE ``` 脚本将会生成10000、9999两个组织(Party)的部署文件,然后打包成tar文件。接着把tar文件`confs-.tar`、`serving-.tar`分别复制到party对应的主机上并解包,解包后的文件默认在`/data/projects/fate`目录下。然后脚本将远程登录到这些主机并使用docker compose命令启动FATE实例。 -命令成功执行返回后,登录其中任意一个主机: +默认情况下,脚本会同时启动训练和服务集群。 如果您需要单独启动它们,请将 `--training` 或 `--serving` 添加到 `docker_deploy.sh` 中,如下所示。 + +(可选)要部署各方训练集群,请使用以下命令: + +```bash +bash ./docker_deploy.sh all --training +``` + +(可选)要部署各方服务集群,请使用以下命令: + +```bash +bash ./docker_deploy.sh all --serving +``` + +(可选)要将 FATE 部署到单个目标主机,请使用以下命令和参与方的 ID(下例中为 10000): + +```bash +bash ./docker_deploy.sh 10000 +``` + +(可选)要将交换节点部署到目标主机,请使用以下命令: ```bash -ssh root@192.168.7.1 +bash ./docker_deploy.sh exchange +``` + +命令完成后,登录到任何主机并使用 `docker compose ps` 来验证集群的状态。 示例输出如下: + +```bash +ssh fate@192.168.7.1 ``` 使用以下命令验证实例状态, ```bash -docker ps -```` +cd /data/projects/fate/confs-10000 +docker compose ps +``` -输出显示如下,若各个组件都是运行(up)状态,说明部署成功。 +输出显示如下,若各个组件状态都是`Up`状态,并且fateflow的状态还是(healthy),说明部署成功。 ```bash -CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES -5d2e84ba4c77 federatedai/serving-server:2.1.5-release "/bin/sh -c 'java -c…" 5 minutes ago Up 5 minutes 0.0.0.0:8000->8000/tcp, :::8000->8000/tcp serving-9999_serving-server_1 -3dca43f3c9d5 federatedai/serving-admin:2.1.5-release "/bin/sh -c 'java -c…" 5 minutes ago Up 5 minutes 0.0.0.0:8350->8350/tcp, :::8350->8350/tcp serving-9999_serving-admin_1 -fe924918509b federatedai/serving-proxy:2.1.5-release "/bin/sh -c 'java -D…" 5 minutes ago Up 5 minutes 0.0.0.0:8059->8059/tcp, :::8059->8059/tcp, 0.0.0.0:8869->8869/tcp, :::8869->8869/tcp, 8879/tcp serving-9999_serving-proxy_1 -b62ed8ba42b7 
bitnami/zookeeper:3.7.0 "/opt/bitnami/script…" 5 minutes ago Up 5 minutes 0.0.0.0:2181->2181/tcp, :::2181->2181/tcp, 8080/tcp, 0.0.0.0:49226->2888/tcp, :::49226->2888/tcp, 0.0.0.0:49225->3888/tcp, :::49225->3888/tcp serving-9999_serving-zookeeper_1 -3c643324066f federatedai/client:1.11.1-release "/bin/sh -c 'flow in…" 5 minutes ago Up 5 minutes 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp confs-9999_client_1 -3fe0af1ebd71 federatedai/fateboard:1.11.1-release "/bin/sh -c 'java -D…" 5 minutes ago Up 5 minutes 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp confs-9999_fateboard_1 -635b7d99357e federatedai/fateflow:1.11.1-release "container-entrypoin…" 5 minutes ago Up 5 minutes (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 8080/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp confs-9999_fateflow_1 -8b515f08add3 federatedai/eggroll:1.11.1-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 8080/tcp, 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp confs-9999_rollsite_1 -108cc061c191 federatedai/eggroll:1.11.1-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 4670/tcp, 8080/tcp confs-9999_clustermanager_1 -f10575e76899 federatedai/eggroll:1.11.1-release "/tini -- bash -c 'j…" 5 minutes ago Up 5 minutes 4671/tcp, 8080/tcp confs-9999_nodemanager_1 -aa0a0002de93 mysql:8.0.28 "docker-entrypoint.s…" 5 minutes ago Up 5 minutes 3306/tcp, 33060/tcp confs-9999_mysql_1 +NAME IMAGE COMMAND SERVICE CREATED STATUS PORTS +confs-10000-client-1 federatedai/client:1.11.2-release "bash -c 'pipeline i…" client About a minute ago Up About a minute 0.0.0.0:20000->20000/tcp, :::20000->20000/tcp +confs-10000-clustermanager-1 federatedai/eggroll:1.11.2-release "/tini -- bash -c 'j…" clustermanager About a minute ago Up About a minute 4670/tcp +confs-10000-fateboard-1 federatedai/fateboard:1.11.2-release "/bin/sh -c 'java -D…" fateboard About a minute ago Up About a minute 0.0.0.0:8080->8080/tcp, :::8080->8080/tcp +confs-10000-fateflow-1 federatedai/fateflow:1.11.2-release "/bin/bash -c 
'set -…" fateflow About a minute ago Up About a minute (healthy) 0.0.0.0:9360->9360/tcp, :::9360->9360/tcp, 0.0.0.0:9380->9380/tcp, :::9380->9380/tcp +confs-10000-mysql-1 mysql:8.0.28 "docker-entrypoint.s…" mysql About a minute ago Up About a minute 3306/tcp, 33060/tcp +confs-10000-nodemanager-1 federatedai/eggroll:1.11.2-release "/tini -- bash -c 'j…" nodemanager About a minute ago Up About a minute 4671/tcp +confs-10000-rollsite-1 federatedai/eggroll:1.11.2-release "/tini -- bash -c 'j…" rollsite About a minute ago Up About a minute 0.0.0.0:9370->9370/tcp, :::9370->9370/tcp ``` ### 验证部署 @@ -218,9 +247,12 @@ docker-compose上的FATE启动成功之后需要验证各个服务是否都正 选择192.168.7.1这个节点验证,使用以下命令验证: ```bash -#在192.168.7.1上执行下列命令 -$ docker exec -it confs-10000_client_1 bash #进入client组件容器内部 -$ flow test toy --guest-party-id 10000 --host-party-id 9999 #验证 +# 在192.168.7.1上执行下列命令 + +# 进入client组件容器内部 +$ docker compose exec client bash +# toy 验证 +$ flow test toy --guest-party-id 10000 --host-party-id 9999 ``` 如果测试通过,屏幕将显示类似如下消息: @@ -243,7 +275,8 @@ $ flow test toy --guest-party-id 10000 --host-party-id 9999 #验证 ##### 进入party10000 client容器 ```bash -docker exec -it confs-10000_client_1 bash +cd /data/projects/fate/confs-10000 +docker compose exec client bash ``` ##### 上传host数据 @@ -257,7 +290,8 @@ flow data upload -c fateflow/examples/upload/upload_host.json ##### 进入party9999 client容器 ```bash -docker exec -it confs-9999_client_1 bash +cd /data/projects/fate/confs-9999 +docker compose exec client bash ``` ##### 上传guest数据 diff --git a/docker-deploy/docker_deploy.sh b/docker-deploy/docker_deploy.sh index 7c4c10a39..ddd4f3b24 100755 --- a/docker-deploy/docker_deploy.sh +++ b/docker-deploy/docker_deploy.sh @@ -166,6 +166,9 @@ cd confs-$target_party_id docker compose down docker volume rm -f confs-${target_party_id}_shared_dir_examples docker volume rm -f confs-${target_party_id}_shared_dir_federatedml +docker volume rm -f confs-${target_party_id}_sdownload_dir +docker volume rm -f 
confs-${target_party_id}_fate_flow_logs + docker compose up -d cd ../ rm -f confs-${target_party_id}.tar @@ -239,6 +242,8 @@ DeleteCluster() { fi done fi + + # echo "target_party_ip: $target_party_ip" for ((i = 0; i < ${#party_list[*]}; i++)); do if [ "${party_list[$i]}" = "$target_party_id" ]; then @@ -246,6 +251,9 @@ DeleteCluster() { fi done + # echo "target_party_ip: $target_party_ip" + # echo "cluster_type: $cluster_type" + # delete training cluster if [ "$cluster_type" == "--training" ]; then ssh -p ${SSH_PORT} -tt $user@$target_party_ip <#jdbc:mysql://${db_ip}:3306/${db_name}?useSSL=false\&serverTimezone=UTC\&characterEncoding=utf8\&allowPublicKeyRetrieval=true#g" ./confs-$party_id/confs/eggroll/conf/eggroll.properties + sed -i "s##jdbc:mysql://${db_ip}:3306/${db_name}?useSSL=false\&serverTimezone=${db_serverTimezone}\&characterEncoding=utf8\&allowPublicKeyRetrieval=true#g" ./confs-$party_id/confs/eggroll/conf/eggroll.properties sed -i "s##${db_user}#g" ./confs-$party_id/confs/eggroll/conf/eggroll.properties sed -i "s##${db_password}#g" ./confs-$party_id/confs/eggroll/conf/eggroll.properties @@ -205,10 +206,10 @@ GenerateConfig() { # federation if [ "$federation" == "RabbitMQ" ]; then cp -r training_template/backends/spark/rabbitmq confs-$party_id/confs/ - sed -i '201,215d' confs-$party_id/docker-compose.yml + sed -i '200,214d' confs-$party_id/docker-compose.yml elif [ "$federation" == "Pulsar" ]; then cp -r training_template/backends/spark/pulsar confs-$party_id/confs/ - sed -i '182,199d' confs-$party_id/docker-compose.yml + sed -i '181,198d' confs-$party_id/docker-compose.yml fi fi fi @@ -223,10 +224,10 @@ GenerateConfig() { # federation if [ "$federation" == "RabbitMQ" ]; then cp -r training_template/backends/spark/rabbitmq confs-$party_id/confs/ - sed -i '147,159d' confs-$party_id/docker-compose.yml + sed -i '146,160d' confs-$party_id/docker-compose.yml elif [ "$federation" == "Pulsar" ]; then cp -r training_template/backends/spark/pulsar 
confs-$party_id/confs/ - sed -i '127,143d' confs-$party_id/docker-compose.yml + sed -i '128,144d' confs-$party_id/docker-compose.yml fi fi fi @@ -245,6 +246,8 @@ GenerateConfig() { # algorithm if [ "$algorithm" == "NN" ]; then Suffix=$Suffix"-nn" + elif [ "$algorithm" == "ALL" ]; then + Suffix=$Suffix"-all" fi # device if [ "$device" == "IPCL" ]; then @@ -269,7 +272,7 @@ GenerateConfig() { if [ "$device" == "GPU" ]; then line=0 # line refers to the line number of the fateflow `command` line in docker-compose.yaml if [ "$computing" == "Eggroll" ]; then - line=137 + line=140 fi if [ "$computing" == "Spark" ]; then line=84 @@ -322,18 +325,13 @@ GenerateConfig() { echo >./confs-$party_id/confs/mysql/init/insert-node.sql echo "CREATE DATABASE IF NOT EXISTS ${db_name};" >>./confs-$party_id/confs/mysql/init/insert-node.sql + echo "CREATE DATABASE IF NOT EXISTS fate_flow;" >>./confs-$party_id/confs/mysql/init/insert-node.sql echo "CREATE USER '${db_user}'@'%' IDENTIFIED BY '${db_password}';" >>./confs-$party_id/confs/mysql/init/insert-node.sql echo "GRANT ALL ON *.* TO '${db_user}'@'%';" >>./confs-$party_id/confs/mysql/init/insert-node.sql - + if [[ "$computing" == "Eggroll" ]]; then echo 'USE `'${db_name}'`;' >>./confs-$party_id/confs/mysql/init/insert-node.sql - echo "INSERT INTO server_node (host, port, node_type, status) values ('${clustermanager_ip}', '${clustermanager_port_db}', 'CLUSTER_MANAGER', 'HEALTHY');" >>./confs-$party_id/confs/mysql/init/insert-node.sql - for ((j = 0; j < ${#nodemanager_ip[*]}; j++)); do - echo "INSERT INTO server_node (host, port, node_type, status) values ('${nodemanager_ip[j]}', '${nodemanager_port_db}', 'NODE_MANAGER', 'HEALTHY');" >>./confs-$party_id/confs/mysql/init/insert-node.sql - done - echo "show tables;" >>./confs-$party_id/confs/mysql/init/insert-node.sql - echo "select * from server_node;" >>./confs-$party_id/confs/mysql/init/insert-node.sql - + echo "show tables;" >>./confs-$party_id/confs/mysql/init/insert-node.sql sed -i 
"s/eggroll_meta/${db_name}/g" ./confs-$party_id/confs/mysql/init/create-eggroll-meta-tables.sql else rm -f ./confs-$party_id/confs/mysql/init/create-eggroll-meta-tables.sql @@ -341,6 +339,7 @@ GenerateConfig() { echo mysql module of $party_id done! # fate_flow + sed -i "s/party_id:/party_id: ${party_id}/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml sed -i "s/name: /name: '${db_name}'/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml sed -i "s/user: /user: '${db_user}'/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml sed -i "s/passwd: /passwd: '${db_password}'/g" ./confs-$party_id/confs/fate_flow/conf/service_conf.yaml diff --git a/docker-deploy/parties.conf b/docker-deploy/parties.conf index 8f621862f..bc4b1fb1a 100644 --- a/docker-deploy/parties.conf +++ b/docker-deploy/parties.conf @@ -13,13 +13,13 @@ computing=Eggroll federation=Eggroll # Storage: Eggroll(computing: Eggroll), HDFS(computing: Spark), LocalFS(computing: Spark_local) storage=Eggroll -# Algorithm: Basic, NN +# Algorithm: Basic, NN, ALL algorithm=Basic -# Device: IPCL, CPU +# Device: CPU, IPCL, GPU device=CPU # spark and eggroll -compute_core=4 +compute_core=16 # You only need to configure this parameter when you want to use the GPU, the default value is 1 gpu_count=1 @@ -32,6 +32,7 @@ mysql_ip=mysql mysql_user=fate mysql_password=fate_dev mysql_db=fate_flow +serverTimezone=UTC name_node=hdfs://namenode:9000 diff --git a/docker-deploy/training_template/backends/eggroll/conf/applicationContext-proxy.xml b/docker-deploy/training_template/backends/eggroll/conf/applicationContext-proxy.xml index 55dfcd3f6..c38a2bb99 100644 --- a/docker-deploy/training_template/backends/eggroll/conf/applicationContext-proxy.xml +++ b/docker-deploy/training_template/backends/eggroll/conf/applicationContext-proxy.xml @@ -39,5 +39,7 @@ + + \ No newline at end of file diff --git a/docker-deploy/training_template/backends/eggroll/conf/eggroll.properties 
b/docker-deploy/training_template/backends/eggroll/conf/eggroll.properties index f70e42793..4d7bb2f26 100644 --- a/docker-deploy/training_template/backends/eggroll/conf/eggroll.properties +++ b/docker-deploy/training_template/backends/eggroll/conf/eggroll.properties @@ -1,4 +1,4 @@ -# Copyright 2019-2020 VMware, Inc. +# Copyright 2019-2023 VMware, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,10 +15,10 @@ [eggroll] # core +#eggroll.resourcemanager.clustermanager.jdbc.driver.class.name=org.h2.Driver eggroll.resourcemanager.clustermanager.jdbc.driver.class.name=com.mysql.cj.jdbc.Driver -#eggroll.resourcemanager.clustermanager.jdbc.driver.class.name=com.mysql.cj.jdbc.Driver +#eggroll.resourcemanager.clustermanager.jdbc.url=jdbc:h2:./data/meta_h2/eggroll_meta.h2;AUTO_SERVER=TRUE;MODE=MySQL;DATABASE_TO_LOWER=TRUE;SCHEMA=eggroll_meta; eggroll.resourcemanager.clustermanager.jdbc.url= -#eggroll.resourcemanager.clustermanager.jdbc.url=jdbc:mysql://localhost:3306/eggroll_meta?useSSL=false&serverTimezone=UTC&characterEncoding=utf8&allowPublicKeyRetrieval=true eggroll.resourcemanager.clustermanager.jdbc.username= eggroll.resourcemanager.clustermanager.jdbc.password= @@ -34,7 +34,7 @@ eggroll.bootstrap.root.script=bin/eggroll_boot.sh eggroll.resourcemanager.bootstrap.egg_pair.exepath=bin/roll_pair/egg_pair_bootstrap.sh eggroll.resourcemanager.bootstrap.egg_pair.venv= -eggroll.resourcemanager.bootstrap.egg_pair.pythonpath= +eggroll.resourcemanager.bootstrap.egg_pair.pythonpath=python eggroll.resourcemanager.bootstrap.egg_pair.filepath=python/eggroll/roll_pair/egg_pair.py eggroll.resourcemanager.bootstrap.egg_pair.ld_library_path= @@ -78,3 +78,14 @@ eggroll.rollsite.push.max.retry=3 eggroll.rollsite.push.long.retry=2 eggroll.rollsite.push.batches.per.stream=10 eggroll.rollsite.adapter.sendbuf.size=100000 +# deepspeed +## where deepspeed containers locate, required for deepspeed 
+#eggroll.resourcemanager.nodemanager.containers.data.dir= +## which python exec that deepspeed container used, fallback to eggpair venv/bin/python +#eggroll.container.deepspeed.python.exec= +## provide by submit option for now +#eggroll.container.deepspeed.script.path= +eggroll.container.deepspeed.distributed.backend=nccl +## defaults to cluster manager endpoint +#eggroll.container.deepspeed.distributed.store.host= +#eggroll.container.deepspeed.distributed.store.port= diff --git a/docker-deploy/training_template/backends/eggroll/conf/log4j2.properties b/docker-deploy/training_template/backends/eggroll/conf/log4j2.properties index 00e5514ea..03b1bdfd1 100644 --- a/docker-deploy/training_template/backends/eggroll/conf/log4j2.properties +++ b/docker-deploy/training_template/backends/eggroll/conf/log4j2.properties @@ -46,9 +46,22 @@ appender.errorlog.policies.time.type=TimeBasedTriggeringPolicy appender.errorlog.policies.time.interval=1 appender.errorlog.policies.time.modulate=true appender.errorlog.strategy.type=DefaultRolloverStrategy +# audit +appender.audit.type=RollingFile +appender.audit.name=LOGAUDIT +appender.audit.fileName=${logDir}/${logFile}-audit.log +appender.audit.filePattern=${logDir}/%d{yyyy}/%d{MM}/%d{dd}/${logFile}-audit.log.%d{yyyy-MM-dd-HH} +appender.audit.layout.type=PatternLayout +appender.audit.layout.pattern=${logPattern} +appender.audit.policies.type=Policies +appender.audit.policies.time.type=TimeBasedTriggeringPolicy +appender.audit.policies.time.interval=1 +appender.audit.policies.time.modulate=true +appender.audit.strategy.type=DefaultRolloverStrategy + # loggers -loggers=file, netty +loggers=file, netty, audit, httpclient, httpclientwire # logger - file logger.file.name=file @@ -73,8 +86,23 @@ rootLogger.appenderRef.errorlog.level=ERROR # Uncomment the following line if you always want logs on console. 
# Otherwise you can enable it by setting EGGROLL_LOG_LEVEL<=DEBUG or EGGROLL_LOG_CONSOLE=1 in system env -rootLogger.appenderRef.stdout.ref=STDOUT +#rootLogger.appenderRef.stdout.ref=STDOUT # logger - netty logger.netty.name=io.grpc.netty logger.netty.level=INFO + +# logger - audit +logger.audit.name=audit +logger.audit.level=info +logger.audit.appenderRefs=audit +logger.audit.appenderRef.file.ref=LOGAUDIT +logger.audit.additivity=false + +# logger - HttpClient +logger.httpclient.name=org.apache.commons.httpclient +logger.httpclient.level=INFO + +logger.httpclientwire.name=httpclient.wire +logger.httpclientwire.level=INFO + diff --git a/docker-deploy/training_template/backends/eggroll/conf/whitelist.json b/docker-deploy/training_template/backends/eggroll/conf/whitelist.json index f0d25cd0b..64aed27ca 100644 --- a/docker-deploy/training_template/backends/eggroll/conf/whitelist.json +++ b/docker-deploy/training_template/backends/eggroll/conf/whitelist.json @@ -5,8 +5,8 @@ "set" ], "collections": [ - "OrderedDict", - "defaultdict" + "defaultdict", + "OrderedDict" ], "eggroll.core.transfer_model": [ "ErRollSiteHeader" @@ -20,8 +20,8 @@ "PackingCipherTensorPackage" ], "federatedml.ensemble.basic_algorithms.decision_tree.tree_core.feature_histogram": [ - "FeatureHistogramWeights", - "HistogramBag" + "HistogramBag", + "FeatureHistogramWeights" ], "federatedml.ensemble.basic_algorithms.decision_tree.tree_core.feature_importance": [ "FeatureImportance" @@ -44,10 +44,16 @@ "federatedml.feature.binning.optimal_binning.bucket_info": [ "Bucket" ], + "federatedml.feature.binning.optimal_binning.heap": [ + "MinHeap", + "IvHeapNode", + "GiniHeapNode", + "ChiSquareHeapNode" + ], "federatedml.feature.binning.quantile_summaries": [ - "QuantileSummaries", + "SparseQuantileSummaries", "Stats", - "SparseQuantileSummaries" + "QuantileSummaries" ], "federatedml.feature.fate_element_type": [ "NoneType" @@ -65,12 +71,12 @@ "SparseVector" ], "federatedml.framework.weights": [ - 
"TransferableWeights", - "DictWeights", "NumpyWeights", + "TransferableWeights", + "NumericWeights", "ListWeights", - "OrderDictWeights", - "NumericWeights" + "DictWeights", + "OrderDictWeights" ], "federatedml.linear_model.linear_model_weight": [ "LinearModelWeights" @@ -92,8 +98,8 @@ "CryptoExecutor" ], "federatedml.secureprotol.symmetric_encryption.pohlig_hellman_encryption": [ - "PohligHellmanCipherKey", - "PohligHellmanCiphertext" + "PohligHellmanCiphertext", + "PohligHellmanCipherKey" ], "federatedml.statistic.intersect.intersect_preprocess": [ "BitArray" @@ -105,12 +111,12 @@ "from_binary" ], "numpy": [ - "dtype", - "ndarray" + "ndarray", + "dtype" ], "numpy.core.multiarray": [ - "_reconstruct", - "scalar" + "scalar", + "_reconstruct" ], "numpy.core.numeric": [ "_frombuffer" @@ -121,7 +127,11 @@ "torch._utils": [ "_rebuild_tensor_v2" ], - "torch.storage": [ - "_load_from_bytes" + "ipcl_python.bindings.ipcl_bindings": [ + "ipclPublicKey" + ], + "ipcl_python.ipcl_python": [ + "PaillierPublicKey", + "PaillierEncryptedNumber" ] } diff --git a/docker-deploy/training_template/docker-compose-eggroll.yml b/docker-deploy/training_template/docker-compose-eggroll.yml index 5a7c8ba13..68d53d6f2 100644 --- a/docker-deploy/training_template/docker-compose-eggroll.yml +++ b/docker-deploy/training_template/docker-compose-eggroll.yml @@ -95,16 +95,19 @@ services: - ./shared_dir/data/nodemanager:/data/projects/fate/eggroll/data - /etc/localtime:/etc/localtime:ro - shared_dir_federatedml:/data/projects/fate/fate/python/federatedml + depends_on: + - clustermanager networks: - fate-network + cap_add: + - SYS_PTRACE command: ["bash", "-c", "java -Dlog4j.configurationFile=$${EGGROLL_HOME}/conf/log4j2.properties -cp $${EGGROLL_HOME}/lib/*: com.webank.eggroll.core.Bootstrap --bootstraps com.webank.eggroll.core.resourcemanager.NodeManagerBootstrap -c $${EGGROLL_HOME}/conf/eggroll.properties -p 4671 -s 'EGGROLL_DEAMON'"] fateflow: image: "federatedai/fateflow:${TAG}" environment: - 
PYTHONPATH: "$$PYTHONPATH:/data/projects/fate/fate/python:/data/projects/fate/eggroll/python:/data/projects/fate/fateflow/python:/data/projects/fate/fate/python/fate_client" FATE_PROJECT_BASE: "/data/projects/fate" - FATE_LOG_LEVEL: "INFO" + FATE_LOG_LEVEL: "DEBUG" ports: - "9360:9360" - "9380:9380" @@ -170,6 +173,8 @@ services: - ./shared_dir/data/mysql:/var/lib/mysql - /etc/localtime:/etc/localtime:ro restart: always + cap_add: + - SYS_NICE environment: MYSQL_ALLOW_EMPTY_PASSWORD: "yes" networks: diff --git a/docker-deploy/training_template/docker-compose-spark-slim.yml b/docker-deploy/training_template/docker-compose-spark-slim.yml index bfab5faf4..690ef7eea 100644 --- a/docker-deploy/training_template/docker-compose-spark-slim.yml +++ b/docker-deploy/training_template/docker-compose-spark-slim.yml @@ -89,7 +89,6 @@ services: set -x sleep 5 && python fateflow/python/fate_flow/fate_flow_server.py environment: - PYTHONPATH: "$$PYTHONPATH:/data/projects/fate/fate/python:/data/projects/fate/eggroll/python:/data/projects/fate/fateflow/python:/data/projects/fate/fate/python/fate_client" FATE_PROJECT_BASE: "/data/projects/fate" FATE_FLOW_UPLOAD_MAX_NUM: "1000000" FATE_FLOW_UPLOAD_MAX_BYTES: "104868093952" @@ -108,6 +107,8 @@ services: MYSQL_ALLOW_EMPTY_PASSWORD: "yes" networks: - fate-network + cap_add: + - SYS_NICE nginx: image: "federatedai/nginx:${TAG}" diff --git a/docker-deploy/training_template/docker-compose-spark.yml b/docker-deploy/training_template/docker-compose-spark.yml index eaace9288..f4875c4d2 100644 --- a/docker-deploy/training_template/docker-compose-spark.yml +++ b/docker-deploy/training_template/docker-compose-spark.yml @@ -88,7 +88,6 @@ services: set -x sleep 5 && python fateflow/python/fate_flow/fate_flow_server.py environment: - PYTHONPATH: "$$PYTHONPATH:/data/projects/fate/fate/python:/data/projects/fate/eggroll/python:/data/projects/fate/fateflow/python:/data/projects/fate/fate/python/fate_client" FATE_PROJECT_BASE: "/data/projects/fate" 
FATE_FLOW_UPLOAD_MAX_NUM: "1000000" FATE_FLOW_UPLOAD_MAX_BYTES: "104868093952" @@ -227,6 +226,8 @@ services: MYSQL_ALLOW_EMPTY_PASSWORD: "yes" networks: - fate-network + cap_add: + - SYS_NICE nginx: image: "federatedai/nginx:${TAG}" diff --git a/docker-deploy/training_template/public/client/pipeline_conf.yaml b/docker-deploy/training_template/public/client/pipeline_conf.yaml deleted file mode 100644 index 3dd1407d9..000000000 --- a/docker-deploy/training_template/public/client/pipeline_conf.yaml +++ /dev/null @@ -1,3 +0,0 @@ -ip: python -port: 9380 -log_directory: /fml_manager/Examples/Pipeline/logs diff --git a/docker-deploy/training_template/public/fate_flow/conf/service_conf.yaml b/docker-deploy/training_template/public/fate_flow/conf/service_conf.yaml index e6893425d..b37755c25 100644 --- a/docker-deploy/training_template/public/fate_flow/conf/service_conf.yaml +++ b/docker-deploy/training_template/public/fate_flow/conf/service_conf.yaml @@ -4,6 +4,7 @@ dependent_distribution: false encrypt_password: false encrypt_module: fate_arch.common.encrypt_utils#pwdecrypt private_key: +private_key_file: party_id: hook_module: client_authentication: fate_flow.hook.flow.client_authentication @@ -26,6 +27,15 @@ fateflow: host: 192.167.0.100 http_port: 9380 grpc_port: 9360 + # when you have multiple fateflow server on one party, + # we suggest using nginx for load balancing. 
+ nginx: + host: + http_port: + grpc_port: + # use random instance_id instead of {host}:{http_port} + random_instance_id: false + # support rollsite/nginx/fateflow as a coordination proxy # rollsite support fate on eggroll, use grpc protocol # nginx support fate on eggroll and fate on spark, use http or grpc protocol, default is http diff --git a/docker-deploy/training_template/public/fateboard/conf/application.properties b/docker-deploy/training_template/public/fateboard/conf/application.properties index 2058c6b38..c2fbc0f25 100644 --- a/docker-deploy/training_template/public/fateboard/conf/application.properties +++ b/docker-deploy/training_template/public/fateboard/conf/application.properties @@ -14,12 +14,16 @@ server.tomcat.max-threads=1000 server.tomcat.max-connections=20000 spring.servlet.multipart.max-file-size=10MB spring.servlet.multipart.max-request-size=100MB +spring.servlet.session.timeout=1800s server.compression.enabled=true server.compression.mime-types=application/json,application/xml,text/html,text/xml,text/plain server.board.login.username= server.board.login.password= +server.board.encrypt.private_key= +server.board.encrypt.enable=false +#only [h,m,s] is available server.servlet.session.timeout=4h server.servlet.session.cookie.max-age=4h management.endpoints.web.exposure.exclude=* feign.client.config.default.connectTimeout=10000 -feign.client.config.default.readTimeout=10000 \ No newline at end of file +feign.client.config.default.readTimeout=10000 diff --git a/docker-deploy/training_template/public/mysql/init/create-eggroll-meta-tables.sql b/docker-deploy/training_template/public/mysql/init/create-eggroll-meta-tables.sql index b6e28d1c7..1549c6c96 100644 --- a/docker-deploy/training_template/public/mysql/init/create-eggroll-meta-tables.sql +++ b/docker-deploy/training_template/public/mysql/init/create-eggroll-meta-tables.sql @@ -5,22 +5,24 @@ CREATE DATABASE IF NOT EXISTS `eggroll_meta`; USE `eggroll_meta`; -- store_locator -CREATE TABLE IF NOT 
EXISTS `store_locator` ( - `store_locator_id` SERIAL PRIMARY KEY, - `store_type` VARCHAR(255) NOT NULL, - `namespace` VARCHAR(2000) NOT NULL DEFAULT 'DEFAULT', - `name` VARCHAR(2000) NOT NULL, - `path` VARCHAR(2000) NOT NULL DEFAULT '', - `total_partitions` INT UNSIGNED NOT NULL, - `partitioner` VARCHAR(2000) NOT NULL DEFAULT 'BYTESTRING_HASH', - `serdes` VARCHAR(2000) NOT NULL DEFAULT '', - `version` INT UNSIGNED NOT NULL DEFAULT 0, - `status` VARCHAR(255) NOT NULL, - `created_at` DATETIME DEFAULT CURRENT_TIMESTAMP, - `updated_at` DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP -) DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci; - -CREATE UNIQUE INDEX `idx_u_store_loinsert-node.sqlcator_ns_n` ON `store_locator` (`namespace`(120), `name`(640)); +CREATE TABLE IF NOT EXISTS `store_locator` +( + `store_locator_id` SERIAL PRIMARY KEY, + `store_type` VARCHAR(255) NOT NULL, + `namespace` VARCHAR(2000) NOT NULL DEFAULT 'DEFAULT', + `name` VARCHAR(2000) NOT NULL, + `path` VARCHAR(2000) NOT NULL DEFAULT '', + `total_partitions` INT UNSIGNED NOT NULL, + `partitioner` VARCHAR(2000) NOT NULL DEFAULT 'BYTESTRING_HASH', + `serdes` VARCHAR(2000) NOT NULL DEFAULT '', + `version` INT UNSIGNED NOT NULL DEFAULT 0, + `status` VARCHAR(255) NOT NULL, + `created_at` DATETIME DEFAULT CURRENT_TIMESTAMP, + `updated_at` DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP +) DEFAULT CHARACTER SET latin1 + COLLATE latin1_swedish_ci; + +CREATE UNIQUE INDEX `idx_u_store_locator_ns_n` ON `store_locator` (`namespace`(120), `name`(640)); CREATE INDEX `idx_store_locator_st` ON `store_locator` (`store_type`(255)); CREATE INDEX `idx_store_locator_ns` ON `store_locator` (`namespace`(767)); CREATE INDEX `idx_store_locator_n` ON `store_locator` (`name`(767)); @@ -29,28 +31,32 @@ CREATE INDEX `idx_store_locator_v` ON `store_locator` (`version`); -- store (option) -CREATE TABLE IF NOT EXISTS `store_option` ( - `store_option_id` SERIAL PRIMARY KEY, - `store_locator_id` 
BIGINT UNSIGNED NOT NULL, - `name` VARCHAR(255) NOT NULL, - `data` VARCHAR(2000) NOT NULL DEFAULT '', - `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP -) DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci; +CREATE TABLE IF NOT EXISTS `store_option` +( + `store_option_id` SERIAL PRIMARY KEY, + `store_locator_id` BIGINT UNSIGNED NOT NULL, + `name` VARCHAR(255) NOT NULL, + `data` VARCHAR(2000) NOT NULL DEFAULT '', + `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP +) DEFAULT CHARACTER SET latin1 + COLLATE latin1_swedish_ci; CREATE INDEX `idx_store_option_si` ON `store_option` (`store_locator_id`); -- store_partition -CREATE TABLE IF NOT EXISTS `store_partition` ( - `store_partition_id` SERIAL PRIMARY KEY, -- self-increment sequence - `store_locator_id` BIGINT UNSIGNED NOT NULL, - `node_id` BIGINT UNSIGNED NOT NULL, - `partition_id` INT UNSIGNED NOT NULL, -- partition id of a store - `status` VARCHAR(255) NOT NULL, - `created_at` DATETIME DEFAULT CURRENT_TIMESTAMP, - `updated_at` DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP -) DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci; +CREATE TABLE IF NOT EXISTS `store_partition` +( + `store_partition_id` SERIAL PRIMARY KEY, -- self-increment sequence + `store_locator_id` BIGINT UNSIGNED NOT NULL, + `node_id` BIGINT UNSIGNED NOT NULL, + `partition_id` INT UNSIGNED NOT NULL, -- partition id of a store + `status` VARCHAR(255) NOT NULL, + `created_at` DATETIME DEFAULT CURRENT_TIMESTAMP, + `updated_at` DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP +) DEFAULT CHARACTER SET latin1 + COLLATE latin1_swedish_ci; CREATE UNIQUE INDEX `idx_u_store_partition_si_spi_ni` ON `store_partition` (`store_locator_id`, `store_partition_id`, `node_id`); CREATE INDEX `idx_store_partition_sli` ON `store_partition` 
(`store_locator_id`); @@ -59,18 +65,20 @@ CREATE INDEX `idx_store_partition_s` ON `store_partition` (`status`(255)); -- node -CREATE TABLE IF NOT EXISTS `server_node` ( - `server_node_id` SERIAL PRIMARY KEY, - `name` VARCHAR(2000) NOT NULL DEFAULT '', - `server_cluster_id` BIGINT UNSIGNED NOT NULL DEFAULT 0, - `host` VARCHAR(1000) NOT NULL, - `port` INT NOT NULL, - `node_type` VARCHAR(255) NOT NULL, - `status` VARCHAR(255) NOT NULL, - `last_heartbeat_at` DATETIME DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP, - `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP -) DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci; +CREATE TABLE IF NOT EXISTS `server_node` +( + `server_node_id` SERIAL PRIMARY KEY, + `name` VARCHAR(2000) NOT NULL DEFAULT '', + `server_cluster_id` BIGINT UNSIGNED NOT NULL DEFAULT 0, + `host` VARCHAR(1000) NOT NULL, + `port` INT NOT NULL, + `node_type` VARCHAR(255) NOT NULL, + `status` VARCHAR(255) NOT NULL, + `last_heartbeat_at` DATETIME DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP, + `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP +) DEFAULT CHARACTER SET latin1 + COLLATE latin1_swedish_ci; CREATE INDEX `idx_server_node_h_p_nt` ON `server_node` (`host`(600), `port`, `node_type`(100)); CREATE INDEX `idx_server_node_h` ON `server_node` (`host`(767)); @@ -80,46 +88,108 @@ CREATE INDEX `idx_server_node_s` ON `server_node` (`status`(255)); -- session (main) -CREATE TABLE IF NOT EXISTS `session_main` ( - `session_id` VARCHAR(767) PRIMARY KEY, - `name` VARCHAR(2000) NOT NULL DEFAULT '', - `status` VARCHAR(255) NOT NULL, - `tag` VARCHAR(255), - `total_proc_count` INT, - `active_proc_count` INT, - `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP -) DEFAULT CHARACTER SET latin1 
COLLATE latin1_swedish_ci; +CREATE TABLE IF NOT EXISTS `session_main` +( + `session_id` VARCHAR(767) PRIMARY KEY, + `name` VARCHAR(2000) NOT NULL DEFAULT '', + `status` VARCHAR(255) NOT NULL, + `tag` VARCHAR(255), + `total_proc_count` INT, + `active_proc_count` INT, + `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP +) DEFAULT CHARACTER SET latin1 + COLLATE latin1_swedish_ci; CREATE INDEX `idx_session_main_s` ON `session_main` (`status`); -- session (option) -CREATE TABLE IF NOT EXISTS `session_option` ( - `session_option_id` SERIAL PRIMARY KEY, - `session_id` VARCHAR(2000), - `name` VARCHAR(255) NOT NULL, - `data` VARCHAR(2000) NOT NULL DEFAULT '', - `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP -) DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci; +CREATE TABLE IF NOT EXISTS `session_option` +( + `session_option_id` SERIAL PRIMARY KEY, + `session_id` VARCHAR(2000), + `name` VARCHAR(255) NOT NULL, + `data` VARCHAR(2000) NOT NULL DEFAULT '', + `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP +) DEFAULT CHARACTER SET latin1 + COLLATE latin1_swedish_ci; CREATE INDEX `idx_session_option_si` ON `session_option` (`session_id`(767)); -- session (processor) -CREATE TABLE IF NOT EXISTS `session_processor` ( - `processor_id` SERIAL PRIMARY KEY, - `session_id` VARCHAR(767), - `server_node_id` INT NOT NULL, - `processor_type` VARCHAR(255) NOT NULL, - `status` VARCHAR(255), - `tag` VARCHAR(255), - `command_endpoint` VARCHAR(255), - `transfer_endpoint` VARCHAR(255), - `pid` INT NOT NULL DEFAULT -1, - `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP -) DEFAULT CHARACTER SET latin1 COLLATE 
latin1_swedish_ci; +CREATE TABLE IF NOT EXISTS `session_processor` +( + `processor_id` SERIAL PRIMARY KEY, + `session_id` VARCHAR(767), + `server_node_id` INT NOT NULL, + `processor_type` VARCHAR(255) NOT NULL, + `status` VARCHAR(255), + `tag` VARCHAR(255), + `command_endpoint` VARCHAR(255), + `transfer_endpoint` VARCHAR(255), + `processor_option` VARCHAR(512), + `pid` INT NOT NULL DEFAULT -1, + `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP +) DEFAULT CHARACTER SET latin1 + COLLATE latin1_swedish_ci; CREATE INDEX `idx_session_processor_si` ON `session_processor` (`session_id`(767)); + + +CREATE TABLE IF NOT EXISTS `processor_resource` +( + `id` SERIAL PRIMARY KEY, + `processor_id` BIGINT NOT NULL, + `session_id` VARCHAR(767), + `server_node_id` INT NOT NULL, + `resource_type` VARCHAR(255), + `allocated` BIGINT NOT NULL default 0, + `extention` VARCHAR(512), + `status` VARCHAR(255), + `pid` INT NOT NULL DEFAULT -1, + `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP +) DEFAULT CHARACTER SET latin1 + COLLATE latin1_swedish_ci; +CREATE INDEX `idx_processor_id_processor_resource` ON `processor_resource` (`processor_id`); +CREATE INDEX `idx_node_id_processor_resource` ON `processor_resource` (`server_node_id`); +CREATE INDEX `idx_session_id_processor_resource` ON `processor_resource` (`session_id`); +CREATE INDEX `idx_node_status_processor_resource` ON `processor_resource` (`server_node_id`,`resource_type`,`status`); + + + +CREATE TABLE IF NOT EXISTS `node_resource` +( + `resource_id` SERIAL PRIMARY KEY, + `server_node_id` BIGINT NOT NULL, + `resource_type` VARCHAR(255), + `total` BIGINT NOT NULL default 0, + `used` BIGINT NOT NULL default 0, + `pre_allocated` BIGINT NOT NULL default 0, + `allocated` BIGINT NOT NULL DEFAULT 0, + `extention` VARCHAR(512), + `status` VARCHAR(255), 
+ `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP +) DEFAULT CHARACTER SET latin1 + COLLATE latin1_swedish_ci; +CREATE INDEX `idx_node_id_node_resource` ON `node_resource` (`server_node_id`); +CREATE INDEX `idx_node_status_node_resource` ON `node_resource` (`server_node_id`,`status`); +CREATE UNIQUE INDEX `idx_u_node_resource` ON `node_resource` (`server_node_id`, `resource_type`); + + +CREATE TABLE IF NOT EXISTS `session_ranks` +( + `container_id` SERIAL PRIMARY KEY, + `session_id` VARCHAR(767), + `server_node_id` INT NOT NULL, + `global_rank` INT UNSIGNED NOT NULL, + `local_rank` INT UNSIGNED NOT NULL +) DEFAULT CHARACTER SET latin1 + COLLATE latin1_swedish_ci; + + CREATE INDEX `idx_session_id_session_ranks` ON `session_ranks` (`session_id`); diff --git a/docs/FATE_Algorithm_and_Computational_Acceleration_Selection.md b/docs/FATE_Algorithm_and_Computational_Acceleration_Selection.md index d3b753def..3d5c1ae49 100644 --- a/docs/FATE_Algorithm_and_Computational_Acceleration_Selection.md +++ b/docs/FATE_Algorithm_and_Computational_Acceleration_Selection.md @@ -15,6 +15,8 @@ The choice of algorithm consists of two options: Basic is the default option, which includes dependencies related to the removal of nn (including homo_nn and hetero_nn) algorithms. - `NN` NN contains all the dependencies required for nn to include (homo_nn and hetero_nn). ***NN can only be used when computing is Eggroll*** +- `ALL(LLM)` + ALL represents all algorithms, including basic NN and [FATE-LLM](https://github.com/FederatedAI/FATE-LLM). ## Device @@ -22,5 +24,16 @@ Device selection consists of an option: - `CPU` The CPU is a computing device that uses the CPU as a FATE computing device. +- `IPCL` + IPCL is to use IPCL to speed up FATE. - `GPU` The GPU is a computing device that uses the GPU as a FATE computing device. + +## Support matrix + +Various combinations currently supported by KubeFATE. 
+| Device \ Algorithm | Basic | NN | ALL(LLM) |
+|---|---|---|---|
+| CPU | EggRoll&Spark | EggRoll&Spark | - |
+| IPCL| EggRoll&Spark | - | - |
+| GPU | - | EggRoll&Spark | EggRoll&Spark |
diff --git a/docs/Manage_FATE_and_FATE-Serving_Version.md b/docs/Manage_FATE_and_FATE-Serving_Version.md
index e7f76c3ad..5aaaf2e01 100644
--- a/docs/Manage_FATE_and_FATE-Serving_Version.md
+++ b/docs/Manage_FATE_and_FATE-Serving_Version.md
@@ -30,18 +30,18 @@ The chart can be downloaded in each KubeFATE release, with name `fate-{release_v
 Download it and copy it to the folder to upload.
 
 ```
-$ kubefate chart upload -f ./fate-v1.11.1.tgz
+$ kubefate chart upload -f ./fate-v1.11.2.tgz
 Upload file success
 $ kubefate chart ls
 UUID NAME VERSION APPVERSION
-ca3f7843-749a-4f69-9f6b-4c544a7623ac fate v1.11.1 v1.11.1
+ca3f7843-749a-4f69-9f6b-4c544a7623ac fate v1.11.2 v1.11.2
 ```
-Then, we can deploy the fate cluster of v1.11.1 version. The detail of cluster.yaml please refer to: [FATE Cluster Configuration](./configurations/FATE_cluster_configuration.md)
+Then, we can deploy the fate cluster of v1.11.2 version. The detail of cluster.yaml please refer to: [FATE Cluster Configuration](./configurations/FATE_cluster_configuration.md)
 
 ```
 chartName: fate
-chartVersion: v1.11.1
+chartVersion: v1.11.2
 ```
 
 We can delete the chart with:
diff --git a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md
index c032b1a81..34868d487 100644
--- a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md
+++ b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube.md
@@ -21,14 +21,14 @@ After the tutorial, the deployment architecture looks like the following diagram
 5. Network connectivity to dockerhub or 163 Docker Image Registry, and google gcr.
 6. 
Setup the global KubeFATE version using in the tutorial and create a folder for the whole tutorial. ``` -export fate_version=v1.11.1 && export kubefate_version=v1.4.5 && cd ~ && mkdir demo && cd demo +export fate_version=v1.11.2 && export kubefate_version=v1.4.5 && cd ~ && mkdir demo && cd demo ``` Notes: * When talking about KubeFATE version, usually there are 3 notions: * The KubeFATE CLI version, in this tutorial, it is v1.4.5. * The KubeFATE service version, in this tutorial, it is v1.4.5. - * The FATE version, in this tutorial, it is v1.11.1, it also means the version of the helm chart of FATE, currently we use this version to tag the KubeFATE GitHub master branch. + * The FATE version, in this tutorial, it is v1.11.2, it also means the version of the helm chart of FATE, currently we use this version to tag the KubeFATE GitHub master branch. * **In this tutorial, the IP of the machine we used is 192.168.100.123. Please change it to your machine's IP in all the following commands and config files.** # Start Tutorial @@ -87,7 +87,7 @@ When all the pods are in the ready state, it means your Kubernetes cluster is re ## Setup Kubefate ### Install KubeFATE CLI Go to [KubeFATE Release](https://github.com/FederatedAI/KubeFATE/releases), and find the latest kubefate-k8s release -pack, which is `v1.11.1` as set to ENVs before. (replace ${fate_version} with the newest version available) +pack, which is `v1.11.2` as set to ENVs before. 
(replace ${fate_version} with the newest version available) ``` curl -LO https://github.com/FederatedAI/KubeFATE/releases/download/${fate_version}/kubefate-k8s-${fate_version}.tar.gz && tar -xzf ./kubefate-k8s-${fate_version}.tar.gz ``` @@ -256,7 +256,7 @@ For `/kubefate/examples/party-9999/cluster-spark-pulsar.yaml`, modify it as foll name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 9999 registry: "" pullPolicy: @@ -340,7 +340,7 @@ and for fate-10000: name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 10000 registry: "" pullPolicy: @@ -440,8 +440,8 @@ or watch the clusters till their STATUS changing to `Running`: ``` kubefate@machine:~/kubefate$ watch kubefate cluster ls UUID NAME NAMESPACE REVISION STATUS CHART ChartVERSION AGE -29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v1.11.1 88s -dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v1.11.1 69s +29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v1.11.2 88s +dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v1.11.2 69s ``` We have about 10G Docker images that need to be pulled, this step will take a while for the first time. An alternative way is offline loading the images to the local environment. 
@@ -479,13 +479,13 @@ UUID 29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 Name fate-9999 NameSpace fate-9999 ChartName fate -ChartVersion v1.11.1 +ChartVersion v1.11.2 Revision 1 Age 54m Status Running Spec algorithm: Basic chartName: fate - chartVersion: v1.11.1 + chartVersion: v1.11.2 computing: Spark device: CPU federation: Pulsar diff --git a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md index 24324d887..3e2ac76e1 100644 --- a/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md +++ b/docs/tutorials/Build_Two_Parties_FATE_Cluster_in_One_Linux_Machine_with_MiniKube_zh.md @@ -17,14 +17,14 @@ 5. 要保证安装机器可以正常访问Docker Hub或者网易云镜像仓库,以及Google gcr; 6. 预先创建一个目录,以便整个过程使用该目录作为工作目录,命令如下: ``` -export fate_version=v1.11.1 && export kubefate_version=v1.4.5 && cd ~ && mkdir demo && cd demo +export fate_version=v1.11.2 && export kubefate_version=v1.4.5 && cd ~ && mkdir demo && cd demo ``` Notes: * 当我们提到"KubeFATE的版本",通常来讲会有三个概念: * KubeFATE命令行工具的版本,在本教程中为v1.4.5。 * KubeFATE服务版本,在本教程中为v1.4.5。 - * FATE版本,在本教程中v1.11.1,它也意味着FATE的Helm Chart的版本, 值得注意的是我们用这个版本来给GitHub上的KubeFATE的发布打tag。 + * FATE版本,在本教程中v1.11.2,它也意味着FATE的Helm Chart的版本, 值得注意的是我们用这个版本来给GitHub上的KubeFATE的发布打tag。 * **下文介绍的MiniKube机器IP地址是192.168.100.123。请修改为你准备的实验机器IP地址** # 开始安装 @@ -77,7 +77,7 @@ sudo minikube addons enable ingress ## 安装Kubefate ### 下载KubeFATE命令行工具 -我们从Github上 [KubeFATE Release](https://github.com/FederatedAI/KubeFATE/releases)页面找到Kuberetes部署的下载包,并下载对应版本,如前面环境变量设置`v1.11.1`, +我们从Github上 [KubeFATE Release](https://github.com/FederatedAI/KubeFATE/releases)页面找到Kuberetes部署的下载包,并下载对应版本,如前面环境变量设置`v1.11.2`, ``` curl -LO https://github.com/FederatedAI/KubeFATE/releases/download/${fate_version}/kubefate-k8s-${fate_version}.tar.gz && tar -xzf ./kubefate-k8s-${fate_version}.tar.gz ``` @@ -237,7 +237,7 @@ kubectl -n fate-10000 create secret docker-registry myregistrykey \ 
name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 9999 registry: "" pullPolicy: @@ -322,7 +322,7 @@ pulsar: name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 10000 registry: "" pullPolicy: @@ -418,8 +418,8 @@ create job success, job id=7752db70-e368-41fa-8827-d39411728d1b ``` kubefate@machine:~/kubefate$ watch kubefate cluster ls UUID NAME NAMESPACE REVISION STATUS CHART ChartVERSION AGE -29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v1.11.1 88s -dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v1.11.1 69s +29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 fate-9999 fate-9999 1 Running fate v1.11.2 88s +dacc0549-b9fc-463f-837a-4e7316db2537 fate-10000 fate-10000 1 Running fate v1.11.2 69s ``` 因为这个步骤需要到网易云镜像仓库去下载约10G的镜像,所以第一次执行视乎你的网络情况需要一定时间。 检查下载的进度可以用 @@ -446,13 +446,13 @@ UUID 29878fa9-aeee-4ae5-a5b7-fd4e9eb7c1c3 Name fate-9999 NameSpace fate-9999 ChartName fate -ChartVersion v1.11.1 +ChartVersion v1.11.2 Revision 1 Age 54m Status Running Spec algorithm: Basic chartName: fate - chartVersion: v1.11.1 + chartVersion: v1.11.2 computing: Spark device: CPU federation: Pulsar diff --git a/helm-charts/FATE-Exchange/Chart.yaml b/helm-charts/FATE-Exchange/Chart.yaml index c45169267..473c09abf 100644 --- a/helm-charts/FATE-Exchange/Chart.yaml +++ b/helm-charts/FATE-Exchange/Chart.yaml @@ -1,5 +1,5 @@ apiVersion: v1 -appVersion: v1.11.1 +appVersion: v1.11.2 description: A Helm chart for fate exchange name: fate-exchange -version: v1.11.1 +version: v1.11.2 diff --git a/helm-charts/FATE-Exchange/values-template-example.yaml b/helm-charts/FATE-Exchange/values-template-example.yaml index 81fbb016d..01432621b 100644 --- a/helm-charts/FATE-Exchange/values-template-example.yaml +++ b/helm-charts/FATE-Exchange/values-template-example.yaml @@ -1,7 +1,7 @@ name: fate-exchange namespace: fate-exchange chartName: fate-exchange 
-chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 1 registry: "" pullPolicy: diff --git a/helm-charts/FATE-Exchange/values.yaml b/helm-charts/FATE-Exchange/values.yaml index 2f508dbd2..271bc45db 100644 --- a/helm-charts/FATE-Exchange/values.yaml +++ b/helm-charts/FATE-Exchange/values.yaml @@ -4,7 +4,7 @@ partyName: fate-exchange image: registry: federatedai isThridParty: - tag: 1.11.1-release + tag: 1.11.2-release pullPolicy: IfNotPresent imagePullSecrets: # - name: diff --git a/helm-charts/FATE/Chart.yaml b/helm-charts/FATE/Chart.yaml index 18ea6706c..c4e6e52a1 100644 --- a/helm-charts/FATE/Chart.yaml +++ b/helm-charts/FATE/Chart.yaml @@ -1,8 +1,8 @@ apiVersion: v1 -appVersion: v1.11.1 +appVersion: v1.11.2 description: A Helm chart for fate-training name: fate -version: v1.11.1 +version: v1.11.2 home: https://fate.fedai.org icon: https://aisp-1251170195.cos.ap-hongkong.myqcloud.com/wp-content/uploads/sites/12/2019/09/logo.png sources: diff --git a/helm-charts/FATE/templates/backends/eggroll/_helpers.tpl b/helm-charts/FATE/templates/backends/eggroll/_helpers.tpl index 8abd2d17b..4162ca2ec 100644 --- a/helm-charts/FATE/templates/backends/eggroll/_helpers.tpl +++ b/helm-charts/FATE/templates/backends/eggroll/_helpers.tpl @@ -16,6 +16,9 @@ {{- if eq .Values.algorithm "NN" -}} -nn {{- end -}} +{{- if eq .Values.algorithm "ALL" -}} +-all +{{- end -}} {{- if eq .Values.device "IPCL" -}} -ipcl {{- end -}} diff --git a/helm-charts/FATE/templates/backends/eggroll/configmap.yaml b/helm-charts/FATE/templates/backends/eggroll/configmap.yaml index b8b96b619..7d59888ae 100644 --- a/helm-charts/FATE/templates/backends/eggroll/configmap.yaml +++ b/helm-charts/FATE/templates/backends/eggroll/configmap.yaml @@ -23,8 +23,11 @@ metadata: data: eggroll.properties: | [eggroll] + # core + #eggroll.resourcemanager.clustermanager.jdbc.driver.class.name=org.h2.Driver eggroll.resourcemanager.clustermanager.jdbc.driver.class.name=com.mysql.cj.jdbc.Driver - 
eggroll.resourcemanager.clustermanager.jdbc.url=jdbc:mysql://{{ .Values.externalMysqlIp | default .Values.modules.mysql.ip | default "mysql" }}:{{ .Values.externalMysqlPort | default .Values.modules.mysql.port | default "3306" }}/{{ .Values.externalMysqlDatabase | default .Values.modules.mysql.database | default "eggroll_meta" }}?useSSL=false&serverTimezone=UTC&characterEncoding=utf8&allowPublicKeyRetrieval=true + #eggroll.resourcemanager.clustermanager.jdbc.url=jdbc:h2:./data/meta_h2/eggroll_meta.h2;AUTO_SERVER=TRUE;MODE=MySQL;DATABASE_TO_LOWER=TRUE;SCHEMA=eggroll_meta; + eggroll.resourcemanager.clustermanager.jdbc.url=jdbc:mysql://{{ .Values.externalMysqlIp | default .Values.modules.mysql.ip | default "mysql" }}:{{ .Values.externalMysqlPort | default .Values.modules.mysql.port | default "3306" }}/{{ .Values.externalMysqlDatabase | default .Values.modules.mysql.database | default "eggroll_meta" }}?useSSL=false&serverTimezone={{ .Values.modules.clustermanager.mysqlServerTimezone | default "UTC" }}&characterEncoding=utf8&allowPublicKeyRetrieval=true eggroll.resourcemanager.clustermanager.jdbc.username={{ .Values.externalMysqlUser | default .Values.modules.mysql.user | default "fate" }} eggroll.resourcemanager.clustermanager.jdbc.password={{ .Values.externalMysqlPassword | default .Values.modules.mysql.password | default "fate_dev" }} @@ -39,7 +42,7 @@ data: eggroll.resourcemanager.bootstrap.egg_pair.exepath=bin/roll_pair/egg_pair_bootstrap.sh eggroll.resourcemanager.bootstrap.egg_pair.venv= - eggroll.resourcemanager.bootstrap.egg_pair.pythonpath=/data/projects/fate/python:/data/projects/fate/eggroll/python + eggroll.resourcemanager.bootstrap.egg_pair.pythonpath=python eggroll.resourcemanager.bootstrap.egg_pair.filepath=python/eggroll/roll_pair/egg_pair.py eggroll.resourcemanager.bootstrap.egg_pair.ld_library_path= @@ -95,6 +98,18 @@ data: eggroll.rollsite.polling.server.enabled=true eggroll.rollsite.polling.concurrency= {{ 
.Values.modules.rollsite.polling.concurrency | default 50 }} {{- end }} + + # deepspeed + ## where deepspeed containers locate, required for deepspeed + eggroll.resourcemanager.nodemanager.containers.data.dir=/data/projects/fate/eggroll/containers + ## which python exec that deepspeed container used, fallback to eggpair venv/bin/python + eggroll.container.deepspeed.python.exec=/data/projects/python/venv/bin/python + ## provide by submit option for now + #eggroll.container.deepspeed.script.path= + eggroll.container.deepspeed.distributed.backend=nccl + ## defaults to cluster manager endpoint + #eggroll.container.deepspeed.distributed.store.host= + #eggroll.container.deepspeed.distributed.store.port= {{- if .Values.modules.rollsite.enableTLS }} cert_configs: | eggroll.core.security.secure.cluster.enabled=true diff --git a/helm-charts/FATE/templates/backends/eggroll/nodemanager/statefulSet.yaml b/helm-charts/FATE/templates/backends/eggroll/nodemanager/statefulSet.yaml index b98f3202b..810c71ca1 100644 --- a/helm-charts/FATE/templates/backends/eggroll/nodemanager/statefulSet.yaml +++ b/helm-charts/FATE/templates/backends/eggroll/nodemanager/statefulSet.yaml @@ -36,9 +36,9 @@ spec: containers: - name: nodemanager-eggrollpair {{- if .Values.image.isThridParty }} - image: {{ .Values.image.registry }}/fluentd:v1.12 + image: {{ .Values.image.registry }}/fluent-bit:2.1.4 {{- else }} - image: fluent/fluentd:v1.12 + image: fluent/fluent-bit:2.1.4 {{- end }} volumeMounts: - name: eggroll-log @@ -55,7 +55,13 @@ spec: {{ toYaml $val | indent 14 }} {{- end }} {{- end }} + env: + - name: HUGGINGFACE_HUB_CACHE + value: "/data/projects/fate/llm" name: nodemanager + securityContext: + capabilities: + add: ["SYS_PTRACE"] command: - bash - -c @@ -103,6 +109,9 @@ spec: - name: data-dir mountPath: /data/projects/fate/eggroll/data subPath: {{ .Values.modules.nodemanager.subPath }} + - mountPath: /data/projects/fate/llm + name: data-dir + subPath: llm {{- with 
.Values.modules.nodemanager.nodeSelector }} nodeSelector: {{ toYaml . | indent 8 }} diff --git a/helm-charts/FATE/templates/backends/eggroll/rollsite/deployment.yaml b/helm-charts/FATE/templates/backends/eggroll/rollsite/deployment.yaml index 5c9d0127b..5e611eb69 100644 --- a/helm-charts/FATE/templates/backends/eggroll/rollsite/deployment.yaml +++ b/helm-charts/FATE/templates/backends/eggroll/rollsite/deployment.yaml @@ -92,7 +92,7 @@ spec: volumeMounts: - mountPath: /data/projects/fate/eggroll/conf/route_table/ name: rollsite-confs - - mountPath: /data/projects/fate/eggroll/conf/temp_eggroll.properties + - mountPath: /data/projects/fate/eggroll/conf/eggroll.properties name: eggroll-confs subPath: eggroll.properties {{- if .Values.modules.rollsite.enableTLS }} diff --git a/helm-charts/FATE/templates/backends/spark/_helpers.tpl b/helm-charts/FATE/templates/backends/spark/_helpers.tpl index 80228c561..5a85014ee 100644 --- a/helm-charts/FATE/templates/backends/spark/_helpers.tpl +++ b/helm-charts/FATE/templates/backends/spark/_helpers.tpl @@ -16,6 +16,9 @@ {{- if eq .Values.algorithm "NN" -}} -nn {{- end -}} +{{- if eq .Values.algorithm "ALL" -}} +-all +{{- end -}} {{- if eq .Values.device "IPCL" -}} -ipcl {{- end -}} diff --git a/helm-charts/FATE/templates/core/_helpers.tpl b/helm-charts/FATE/templates/core/_helpers.tpl index 5fbd730ac..0868d57fb 100644 --- a/helm-charts/FATE/templates/core/_helpers.tpl +++ b/helm-charts/FATE/templates/core/_helpers.tpl @@ -18,6 +18,9 @@ {{- if eq .Values.algorithm "NN" -}} -nn {{- end -}} +{{- if eq .Values.algorithm "ALL" -}} +-all +{{- end -}} {{- if eq .Values.device "IPCL" -}} -ipcl {{- end -}} diff --git a/helm-charts/FATE/templates/core/fateboard/configmap.yaml b/helm-charts/FATE/templates/core/fateboard/configmap.yaml index 51098d141..3847d0818 100644 --- a/helm-charts/FATE/templates/core/fateboard/configmap.yaml +++ b/helm-charts/FATE/templates/core/fateboard/configmap.yaml @@ -44,6 +44,9 @@ data: 
server.compression.mime-types=application/json,application/xml,text/html,text/xml,text/plain server.board.login.username={{ .Values.modules.fateboard.username }} server.board.login.password={{ .Values.modules.fateboard.password }} + server.board.encrypt.private_key= + server.board.encrypt.enable=false + #only [h,m,s] is available server.servlet.session.timeout=4h server.servlet.session.cookie.max-age=4h management.endpoints.web.exposure.exclude=* diff --git a/helm-charts/FATE/templates/core/fateflow/configmap.yaml b/helm-charts/FATE/templates/core/fateflow/configmap.yaml index aba22c4ee..7ee41de3f 100644 --- a/helm-charts/FATE/templates/core/fateflow/configmap.yaml +++ b/helm-charts/FATE/templates/core/fateflow/configmap.yaml @@ -48,7 +48,8 @@ data: encrypt_password: false encrypt_module: fate_arch.common.encrypt_utils#pwdecrypt private_key: - party_id: + private_key_file: + party_id: {{ .Values.partyId }} hook_module: client_authentication: fate_flow.hook.flow.client_authentication site_authentication: fate_flow.hook.flow.site_authentication @@ -184,16 +185,25 @@ data: fateboard: host: fateboard port: 8080 - enable_model_store: true + + enable_model_store: false model_store_address: - storage: mysql - database: {{ .Values.externalMysqlDatabase | default .Values.modules.mysql.database | default "eggroll_meta" }} - host: '{{ .Values.externalMysqlIp | default .Values.modules.mysql.ip | default "mysql" }}' - port: {{ .Values.externalMysqlPort | default .Values.modules.mysql.port | default "3306" }} - user: '{{ .Values.externalMysqlUser | default .Values.modules.mysql.user | default "fate" }}' - password: '{{ .Values.externalMysqlPassword | default .Values.modules.mysql.password | default "fate_dev" }}' - max_connections: 10 - stale_timeout: 10 + # use mysql as the model store engine + # storage: mysql + # database: {{ .Values.externalMysqlDatabase | default .Values.modules.mysql.database | default "eggroll_meta" }} + # host: '{{ .Values.externalMysqlIp | default 
.Values.modules.mysql.ip | default "mysql" }}' + # port: {{ .Values.externalMysqlPort | default .Values.modules.mysql.port | default "3306" }} + # user: '{{ .Values.externalMysqlUser | default .Values.modules.mysql.user | default "fate" }}' + # password: '{{ .Values.externalMysqlPassword | default .Values.modules.mysql.password | default "fate_dev" }}' + # max_connections: 10 + # stale_timeout: 10 + + # use tencent cos as model store engine + storage: tencent_cos + Region: + SecretId: + SecretKey: + Bucket: {{- with .Values.modules.serving }} servings: hosts: diff --git a/helm-charts/FATE/templates/core/mysql/configmap.yaml b/helm-charts/FATE/templates/core/mysql/configmap.yaml index e21f5e64b..35af166ba 100644 --- a/helm-charts/FATE/templates/core/mysql/configmap.yaml +++ b/helm-charts/FATE/templates/core/mysql/configmap.yaml @@ -28,20 +28,22 @@ data: USE `{{ .Values.modules.mysql.database }}`; -- store_locator - CREATE TABLE IF NOT EXISTS `store_locator` ( - `store_locator_id` SERIAL PRIMARY KEY, - `store_type` VARCHAR(255) NOT NULL, - `namespace` VARCHAR(2000) NOT NULL DEFAULT 'DEFAULT', - `name` VARCHAR(2000) NOT NULL, - `path` VARCHAR(2000) NOT NULL DEFAULT '', - `total_partitions` INT UNSIGNED NOT NULL, - `partitioner` VARCHAR(2000) NOT NULL DEFAULT 'BYTESTRING_HASH', - `serdes` VARCHAR(2000) NOT NULL DEFAULT '', - `version` INT UNSIGNED NOT NULL DEFAULT 0, - `status` VARCHAR(255) NOT NULL, - `created_at` DATETIME DEFAULT CURRENT_TIMESTAMP, - `updated_at` DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP - ) DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci; + CREATE TABLE IF NOT EXISTS `store_locator` + ( + `store_locator_id` SERIAL PRIMARY KEY, + `store_type` VARCHAR(255) NOT NULL, + `namespace` VARCHAR(2000) NOT NULL DEFAULT 'DEFAULT', + `name` VARCHAR(2000) NOT NULL, + `path` VARCHAR(2000) NOT NULL DEFAULT '', + `total_partitions` INT UNSIGNED NOT NULL, + `partitioner` VARCHAR(2000) NOT NULL DEFAULT 'BYTESTRING_HASH', + `serdes` 
VARCHAR(2000) NOT NULL DEFAULT '', + `version` INT UNSIGNED NOT NULL DEFAULT 0, + `status` VARCHAR(255) NOT NULL, + `created_at` DATETIME DEFAULT CURRENT_TIMESTAMP, + `updated_at` DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP + ) DEFAULT CHARACTER SET latin1 + COLLATE latin1_swedish_ci; CREATE UNIQUE INDEX `idx_u_store_locator_ns_n` ON `store_locator` (`namespace`(120), `name`(640)); CREATE INDEX `idx_store_locator_st` ON `store_locator` (`store_type`(255)); @@ -52,28 +54,32 @@ data: -- store (option) - CREATE TABLE IF NOT EXISTS `store_option` ( - `store_option_id` SERIAL PRIMARY KEY, - `store_locator_id` BIGINT UNSIGNED NOT NULL, - `name` VARCHAR(255) NOT NULL, - `data` VARCHAR(2000) NOT NULL DEFAULT '', - `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP - ) DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci; + CREATE TABLE IF NOT EXISTS `store_option` + ( + `store_option_id` SERIAL PRIMARY KEY, + `store_locator_id` BIGINT UNSIGNED NOT NULL, + `name` VARCHAR(255) NOT NULL, + `data` VARCHAR(2000) NOT NULL DEFAULT '', + `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP + ) DEFAULT CHARACTER SET latin1 + COLLATE latin1_swedish_ci; CREATE INDEX `idx_store_option_si` ON `store_option` (`store_locator_id`); -- store_partition - CREATE TABLE IF NOT EXISTS `store_partition` ( - `store_partition_id` SERIAL PRIMARY KEY, -- self-increment sequence - `store_locator_id` BIGINT UNSIGNED NOT NULL, - `node_id` BIGINT UNSIGNED NOT NULL, - `partition_id` INT UNSIGNED NOT NULL, -- partition id of a store - `status` VARCHAR(255) NOT NULL, - `created_at` DATETIME DEFAULT CURRENT_TIMESTAMP, - `updated_at` DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP - ) DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci; + CREATE TABLE IF NOT EXISTS `store_partition` + ( 
+ `store_partition_id` SERIAL PRIMARY KEY, -- self-increment sequence + `store_locator_id` BIGINT UNSIGNED NOT NULL, + `node_id` BIGINT UNSIGNED NOT NULL, + `partition_id` INT UNSIGNED NOT NULL, -- partition id of a store + `status` VARCHAR(255) NOT NULL, + `created_at` DATETIME DEFAULT CURRENT_TIMESTAMP, + `updated_at` DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP + ) DEFAULT CHARACTER SET latin1 + COLLATE latin1_swedish_ci; CREATE UNIQUE INDEX `idx_u_store_partition_si_spi_ni` ON `store_partition` (`store_locator_id`, `store_partition_id`, `node_id`); CREATE INDEX `idx_store_partition_sli` ON `store_partition` (`store_locator_id`); @@ -82,18 +88,20 @@ data: -- node - CREATE TABLE IF NOT EXISTS `server_node` ( - `server_node_id` SERIAL PRIMARY KEY, - `name` VARCHAR(2000) NOT NULL DEFAULT '', - `server_cluster_id` BIGINT UNSIGNED NOT NULL DEFAULT 0, - `host` VARCHAR(1000) NOT NULL, - `port` INT NOT NULL, - `node_type` VARCHAR(255) NOT NULL, - `status` VARCHAR(255) NOT NULL, - `last_heartbeat_at` DATETIME DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP, - `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP - ) DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci; + CREATE TABLE IF NOT EXISTS `server_node` + ( + `server_node_id` SERIAL PRIMARY KEY, + `name` VARCHAR(2000) NOT NULL DEFAULT '', + `server_cluster_id` BIGINT UNSIGNED NOT NULL DEFAULT 0, + `host` VARCHAR(1000) NOT NULL, + `port` INT NOT NULL, + `node_type` VARCHAR(255) NOT NULL, + `status` VARCHAR(255) NOT NULL, + `last_heartbeat_at` DATETIME DEFAULT NULL ON UPDATE CURRENT_TIMESTAMP, + `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP + ) DEFAULT CHARACTER SET latin1 + COLLATE latin1_swedish_ci; CREATE INDEX `idx_server_node_h_p_nt` ON `server_node` (`host`(600), `port`, `node_type`(100)); CREATE INDEX 
`idx_server_node_h` ON `server_node` (`host`(767)); @@ -103,56 +111,112 @@ data: -- session (main) - CREATE TABLE IF NOT EXISTS `session_main` ( - `session_id` VARCHAR(767) PRIMARY KEY, - `name` VARCHAR(2000) NOT NULL DEFAULT '', - `status` VARCHAR(255) NOT NULL, - `tag` VARCHAR(255), - `total_proc_count` INT, - `active_proc_count` INT, - `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP - ) DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci; + CREATE TABLE IF NOT EXISTS `session_main` + ( + `session_id` VARCHAR(767) PRIMARY KEY, + `name` VARCHAR(2000) NOT NULL DEFAULT '', + `status` VARCHAR(255) NOT NULL, + `tag` VARCHAR(255), + `total_proc_count` INT, + `active_proc_count` INT, + `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP + ) DEFAULT CHARACTER SET latin1 + COLLATE latin1_swedish_ci; CREATE INDEX `idx_session_main_s` ON `session_main` (`status`); -- session (option) - CREATE TABLE IF NOT EXISTS `session_option` ( - `session_option_id` SERIAL PRIMARY KEY, - `session_id` VARCHAR(2000), - `name` VARCHAR(255) NOT NULL, - `data` VARCHAR(2000) NOT NULL DEFAULT '', - `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP - ) DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci; + CREATE TABLE IF NOT EXISTS `session_option` + ( + `session_option_id` SERIAL PRIMARY KEY, + `session_id` VARCHAR(2000), + `name` VARCHAR(255) NOT NULL, + `data` VARCHAR(2000) NOT NULL DEFAULT '', + `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP + ) DEFAULT CHARACTER SET latin1 + COLLATE latin1_swedish_ci; CREATE INDEX `idx_session_option_si` ON `session_option` (`session_id`(767)); -- session (processor) - 
CREATE TABLE IF NOT EXISTS `session_processor` ( - `processor_id` SERIAL PRIMARY KEY, - `session_id` VARCHAR(767), - `server_node_id` INT NOT NULL, - `processor_type` VARCHAR(255) NOT NULL, - `status` VARCHAR(255), - `tag` VARCHAR(255), - `command_endpoint` VARCHAR(255), - `transfer_endpoint` VARCHAR(255), - `pid` INT NOT NULL DEFAULT -1, - `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, - `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP - ) DEFAULT CHARACTER SET latin1 COLLATE latin1_swedish_ci; + CREATE TABLE IF NOT EXISTS `session_processor` + ( + `processor_id` SERIAL PRIMARY KEY, + `session_id` VARCHAR(767), + `server_node_id` INT NOT NULL, + `processor_type` VARCHAR(255) NOT NULL, + `status` VARCHAR(255), + `tag` VARCHAR(255), + `command_endpoint` VARCHAR(255), + `transfer_endpoint` VARCHAR(255), + `processor_option` VARCHAR(512), + `pid` INT NOT NULL DEFAULT -1, + `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP + ) DEFAULT CHARACTER SET latin1 + COLLATE latin1_swedish_ci; CREATE INDEX `idx_session_processor_si` ON `session_processor` (`session_id`(767)); - INSERT INTO server_node (host, port, node_type, status) values ('clustermanager', '4670', 'CLUSTER_MANAGER', 'HEALTHY'); - {{- range .Values.modules.nodemanager.replicas | int | until }} - INSERT INTO server_node (host, port, node_type, status) values ('nodemanager-{{ . 
}}.nodemanager', '4671', 'NODE_MANAGER', 'HEALTHY'); - {{- end }} - show tables; - select * from server_node; + + CREATE TABLE IF NOT EXISTS `processor_resource` + ( + `id` SERIAL PRIMARY KEY, + `processor_id` BIGINT NOT NULL, + `session_id` VARCHAR(767), + `server_node_id` INT NOT NULL, + `resource_type` VARCHAR(255), + `allocated` BIGINT NOT NULL default 0, + `extention` VARCHAR(512), + `status` VARCHAR(255), + `pid` INT NOT NULL DEFAULT -1, + `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP + ) DEFAULT CHARACTER SET latin1 + COLLATE latin1_swedish_ci; + CREATE INDEX `idx_processor_id_processor_resource` ON `processor_resource` (`processor_id`); + CREATE INDEX `idx_node_id_processor_resource` ON `processor_resource` (`server_node_id`); + CREATE INDEX `idx_session_id_processor_resource` ON `processor_resource` (`session_id`); + CREATE INDEX `idx_node_status_processor_resource` ON `processor_resource` (`server_node_id`,`resource_type`,`status`); + + + + CREATE TABLE IF NOT EXISTS `node_resource` + ( + `resource_id` SERIAL PRIMARY KEY, + `server_node_id` BIGINT NOT NULL, + `resource_type` VARCHAR(255), + `total` BIGINT NOT NULL default 0, + `used` BIGINT NOT NULL default 0, + `pre_allocated` BIGINT NOT NULL default 0, + `allocated` BIGINT NOT NULL DEFAULT 0, + `extention` VARCHAR(512), + `status` VARCHAR(255), + `created_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + `updated_at` DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP + ) DEFAULT CHARACTER SET latin1 + COLLATE latin1_swedish_ci; + CREATE INDEX `idx_node_id_node_resource` ON `node_resource` (`server_node_id`); + CREATE INDEX `idx_node_status_node_resource` ON `node_resource` (`server_node_id`,`status`); + CREATE UNIQUE INDEX `idx_u_node_resource` ON `node_resource` (`server_node_id`, `resource_type`); + + + CREATE TABLE IF NOT EXISTS `session_ranks` + ( + `container_id` SERIAL PRIMARY 
KEY, + `session_id` VARCHAR(767), + `server_node_id` INT NOT NULL, + `global_rank` INT UNSIGNED NOT NULL, + `local_rank` INT UNSIGNED NOT NULL + ) DEFAULT CHARACTER SET latin1 + COLLATE latin1_swedish_ci; + + CREATE INDEX `idx_session_id_session_ranks` ON `session_ranks` (`session_id`); + {{- end }} --- {{- end }} \ No newline at end of file diff --git a/helm-charts/FATE/templates/core/mysql/statefulSet.yaml b/helm-charts/FATE/templates/core/mysql/statefulSet.yaml index 10a71c72d..397bd56e5 100644 --- a/helm-charts/FATE/templates/core/mysql/statefulSet.yaml +++ b/helm-charts/FATE/templates/core/mysql/statefulSet.yaml @@ -38,6 +38,9 @@ spec: {{- end }} imagePullPolicy: {{ .Values.image.pullPolicy }} name: mysql + securityContext: + capabilities: + add: ["SYS_NICE"] env: - name: MYSQL_ALLOW_EMPTY_PASSWORD value: "1" diff --git a/helm-charts/FATE/templates/core/python-spark.yaml b/helm-charts/FATE/templates/core/python-spark.yaml index 766a37b68..893c98e37 100644 --- a/helm-charts/FATE/templates/core/python-spark.yaml +++ b/helm-charts/FATE/templates/core/python-spark.yaml @@ -97,10 +97,10 @@ spec: - name: FATE_LOG_LEVEL value: {{ .Values.modules.python.logLevel }} {{- end }} - - name: PYTHONPATH - value: "$PYTHONPATH:/data/projects/fate/fate/python:/data/projects/fate/eggroll/python:/data/projects/fate/fateflow/python:/data/projects/fate/fate/python/fate_client" - name: FATE_PROJECT_BASE value: "/data/projects/fate" + - name: HUGGINGFACE_HUB_CACHE + value: "/data/projects/fate/llm" {{- if eq .Values.computing "Spark" }} - name: FATE_FLOW_UPLOAD_MAX_NUM value: "1000000" @@ -175,6 +175,9 @@ spec: - mountPath: /data/projects/fate/fateflow/model_local_cache name: python-data subPath: model-local-cache + - mountPath: /data/projects/fate/llm + name: python-data + subPath: llm {{- with .Values.modules.python.nodeSelector }} nodeSelector: {{ toYaml . 
| indent 8 }} diff --git a/helm-charts/FATE/values-template-example.yaml b/helm-charts/FATE/values-template-example.yaml index 591d881c5..66d71788b 100644 --- a/helm-charts/FATE/values-template-example.yaml +++ b/helm-charts/FATE/values-template-example.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 9999 registry: "" pullPolicy: @@ -33,7 +33,7 @@ computing: Eggroll federation: Eggroll # Storage: [Eggroll(computing: Eggroll), HDFS(computing: Spark), LocalFS(computing: Spark_local)] storage: Eggroll -# Algorithm: [Basic, NN] +# Algorithm: [Basic, NN, ALL] algorithm: Basic # Device: [IPCL, CPU, GPU] device: CPU @@ -145,6 +145,7 @@ skippedKeys: # nodeSelector: # tolerations: # affinity: +# mysqlServerTimezone: UTC # resources: # requests: # cpu: "1" @@ -274,7 +275,7 @@ skippedKeys: # spark: # master: # Image: "federatedai/spark-master" - # ImageTag: "1.11.1-release" + # ImageTag: "1.11.2-release" # replicas: 1 # resources: # requests: @@ -290,7 +291,7 @@ skippedKeys: # nodePort: 30977 # worker: # Image: "federatedai/spark-worker" - # ImageTag: "1.11.1-release" + # ImageTag: "1.11.2-release" # replicas: 2 # resources: # requests: diff --git a/helm-charts/FATE/values-template.yaml b/helm-charts/FATE/values-template.yaml index 8a5ae4611..9f829ffc7 100644 --- a/helm-charts/FATE/values-template.yaml +++ b/helm-charts/FATE/values-template.yaml @@ -302,7 +302,7 @@ modules: {{- with .clustermanager }} ip: clustermanager type: "ClusterIP" - enableTLS: {{ .enableTLS | default false }} + mysqlServerTimezone: {{ .mysqlServerTimezone }} {{- with .nodeSelector }} nodeSelector: {{ toYaml . 
| indent 6 }} diff --git a/helm-charts/FATE/values.yaml b/helm-charts/FATE/values.yaml index 5ac93be7b..6eca0e4e8 100644 --- a/helm-charts/FATE/values.yaml +++ b/helm-charts/FATE/values.yaml @@ -2,7 +2,7 @@ image: registry: federatedai isThridParty: - tag: 1.11.1-release + tag: 1.11.2-release pullPolicy: IfNotPresent imagePullSecrets: # - name: @@ -16,7 +16,7 @@ computing: Eggroll federation: Eggroll # Storage: Eggroll(computing: Eggroll), HDFS(computing: Spark), LocalFS(computing: Spark_local) storage: Eggroll -# Algorithm: Basic, NN +# Algorithm: Basic, NN, ALL algorithm: Basic # Device: CPU, IPCL, GPU device: IPCL @@ -198,13 +198,14 @@ modules: nodeSelector: tolerations: affinity: + mysqlServerTimezone: UTC nodemanager: include: true replicas: 2 nodeSelector: tolerations: affinity: - sessionProcessorsPerNode: 2 + sessionProcessorsPerNode: 4 subPath: "nodemanager" storageClass: accessMode: ReadWriteOnce @@ -212,7 +213,7 @@ modules: existingClaim: resources: requests: - cpu: "2" + cpu: "4" memory: "4Gi" mysql: diff --git a/helm-charts/UpgradeManager/values.yaml b/helm-charts/UpgradeManager/values.yaml index 55d81da63..e0bf513c6 100644 --- a/helm-charts/UpgradeManager/values.yaml +++ b/helm-charts/UpgradeManager/values.yaml @@ -1,4 +1,4 @@ username: fate password: fate_dev -start: v1.11.1 -target: v1.11.1 \ No newline at end of file +start: v1.11.2 +target: v1.11.2 \ No newline at end of file diff --git a/k8s-deploy/README.md b/k8s-deploy/README.md index d3b0fbb2e..d92bb3d29 100644 --- a/k8s-deploy/README.md +++ b/k8s-deploy/README.md @@ -143,6 +143,10 @@ We support such definition for: 3. Rabbitmq. 4. Pulsar. +### GPU support + +Starting from v1.11.1, KubeFATE can deploy GPU-enabled FATE clusters. Deploying GPU-enabled FATE requires some special preparation and configuration. 
You can check this document[KubeFATE Deploying GPU-enabled FATE](../docs/KubeFATE_deploys_GPU-enabled_FATE.md) + ### Checking the status of "Installing Cluster" job After the above command has finished, a job is created for installing a FATE cluster. Run the command `kubefate job describe` to check the status of the job, until the "Status" turns to `Success`. @@ -188,13 +192,13 @@ UUID 24bb75ff-f636-4c64-8c04-1b9073f89a2f Name fate-9999 NameSpace fate-9999 ChartName fate -ChartVersion v1.11.1 +ChartVersion v1.11.2 Revision 1 Age 15m Status Running Spec algorithm: Basic chartName: fate - chartVersion: v1.11.1 + chartVersion: v1.11.2 computing: Eggroll device: CPU federation: Eggroll diff --git a/k8s-deploy/README_zh.md b/k8s-deploy/README_zh.md index a477d51e1..0c800c010 100644 --- a/k8s-deploy/README_zh.md +++ b/k8s-deploy/README_zh.md @@ -144,7 +144,7 @@ create job success, job id=d92d7a56-7002-46a4-9363-da9c7346e05a ### GPU 支持 -从v1.11.1开始,KubeFATE可以部署支持GPU的FATE集群,部署支持GPU的FATE需要有一些特别准备工作和配置,可以查看这个文档[KubeFATE 部署支持GPU的FATE](../) +从v1.11.1开始,KubeFATE可以部署支持GPU的FATE集群,部署支持GPU的FATE需要有一些特别准备工作和配置,可以查看这个文档[KubeFATE 部署支持GPU的FATE](../docs/KubeFATE_deploys_GPU-enabled_FATE.md) ### 检查安装集群任务的状态 @@ -191,13 +191,13 @@ UUID 24bb75ff-f636-4c64-8c04-1b9073f89a2f Name fate-9999 NameSpace fate-9999 ChartName fate -ChartVersion v1.11.1 +ChartVersion v1.11.2 Revision 1 Age 15m Status Running Spec algorithm: Basic chartName: fate - chartVersion: v1.11.1 + chartVersion: v1.11.2 computing: Eggroll device: CPU federation: Eggroll diff --git a/k8s-deploy/cluster-spark-pulsar.yaml b/k8s-deploy/cluster-spark-pulsar.yaml index 28b79bd55..ab41b48e8 100644 --- a/k8s-deploy/cluster-spark-pulsar.yaml +++ b/k8s-deploy/cluster-spark-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 9999 registry: "" pullPolicy: @@ -29,7 +29,7 @@ computing: Spark federation: Pulsar # Storage: Eggroll(computing: Eggroll), 
HDFS(computing: Spark), LocalFS(computing: Spark_local) storage: HDFS -# Algorithm: Basic, NN +# Algorithm: Basic, NN, ALL algorithm: Basic # Device: CPU, IPCL GPU device: CPU @@ -128,7 +128,7 @@ skippedKeys: # spark: # master: # Image: "federatedai/spark-master" - # ImageTag: "1.11.1-release" + # ImageTag: "1.11.2-release" # replicas: 1 # resources: # requests: @@ -144,7 +144,7 @@ skippedKeys: # nodePort: 30977 # worker: # Image: "federatedai/spark-worker" - # ImageTag: "1.11.1-release" + # ImageTag: "1.11.2-release" # replicas: 2 # resources: # requests: diff --git a/k8s-deploy/cluster-spark-rabbitmq.yaml b/k8s-deploy/cluster-spark-rabbitmq.yaml index 025928409..874e2adfc 100644 --- a/k8s-deploy/cluster-spark-rabbitmq.yaml +++ b/k8s-deploy/cluster-spark-rabbitmq.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 9999 registry: "" pullPolicy: @@ -29,7 +29,7 @@ computing: Spark federation: RabbitMQ # Storage: Eggroll(computing: Eggroll), HDFS(computing: Spark), LocalFS(computing: Spark_local) storage: HDFS -# Algorithm: Basic, NN +# Algorithm: Basic, NN, ALL algorithm: Basic # Device: CPU, IPCL GPU device: CPU diff --git a/k8s-deploy/cluster-spark-slim.yaml b/k8s-deploy/cluster-spark-slim.yaml index eb2145b0f..3181514c2 100644 --- a/k8s-deploy/cluster-spark-slim.yaml +++ b/k8s-deploy/cluster-spark-slim.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 9999 registry: "" pullPolicy: @@ -27,7 +27,7 @@ computing: Spark_local federation: Pulsar # Storage: Eggroll(computing: Eggroll), HDFS(computing: Spark), LocalFS(computing: Spark_local) storage: LocalFS -# Algorithm: Basic, NN +# Algorithm: Basic, NN, ALL algorithm: Basic # Device: CPU, IPCL GPU device: CPU diff --git a/k8s-deploy/cluster.yaml b/k8s-deploy/cluster.yaml index 6fef5aeec..3560d5e54 100644 --- a/k8s-deploy/cluster.yaml +++ b/k8s-deploy/cluster.yaml @@ 
-1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 9999 registry: "" pullPolicy: @@ -28,7 +28,7 @@ computing: Eggroll federation: Eggroll # Storage: Eggroll(computing: Eggroll), HDFS(computing: Spark), LocalFS(computing: Spark_local) storage: Eggroll -# Algorithm: Basic, NN +# Algorithm: Basic, NN, ALL algorithm: Basic # Device: CPU, IPCL GPU device: CPU diff --git a/k8s-deploy/examples/README.md b/k8s-deploy/examples/README.md index 97642f526..ee7fd6988 100644 --- a/k8s-deploy/examples/README.md +++ b/k8s-deploy/examples/README.md @@ -4,4 +4,4 @@ - party-10000 Kubernetes NodeIp: 192.168.10.1 -*Modify it according to your own actual situation.* \ No newline at end of file +*Modify it according to your own actual situation.* diff --git a/k8s-deploy/examples/party-10000/cluster-gpu.yaml b/k8s-deploy/examples/party-10000/cluster-gpu.yaml index 6e2f8a318..d16b4a66b 100644 --- a/k8s-deploy/examples/party-10000/cluster-gpu.yaml +++ b/k8s-deploy/examples/party-10000/cluster-gpu.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-10000/cluster-spark-local-pulsar.yaml b/k8s-deploy/examples/party-10000/cluster-spark-local-pulsar.yaml index 71dc3da91..302657537 100644 --- a/k8s-deploy/examples/party-10000/cluster-spark-local-pulsar.yaml +++ b/k8s-deploy/examples/party-10000/cluster-spark-local-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-10000/cluster-spark-pulsar.yaml b/k8s-deploy/examples/party-10000/cluster-spark-pulsar.yaml index d153fd64a..2178e4b40 100644 --- a/k8s-deploy/examples/party-10000/cluster-spark-pulsar.yaml +++ b/k8s-deploy/examples/party-10000/cluster-spark-pulsar.yaml @@ -1,7 +1,7 
@@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-10000/cluster-spark-rabbitmq.yaml b/k8s-deploy/examples/party-10000/cluster-spark-rabbitmq.yaml index c89bbf221..3797cb641 100644 --- a/k8s-deploy/examples/party-10000/cluster-spark-rabbitmq.yaml +++ b/k8s-deploy/examples/party-10000/cluster-spark-rabbitmq.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-10000/cluster.yaml b/k8s-deploy/examples/party-10000/cluster.yaml index 236111484..85f332997 100644 --- a/k8s-deploy/examples/party-10000/cluster.yaml +++ b/k8s-deploy/examples/party-10000/cluster.yaml @@ -1,7 +1,7 @@ name: fate-10000 namespace: fate-10000 chartName: fate -chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 10000 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster-gpu.yaml b/k8s-deploy/examples/party-9999/cluster-gpu.yaml index 4f1996e2a..a45b27d71 100644 --- a/k8s-deploy/examples/party-9999/cluster-gpu.yaml +++ b/k8s-deploy/examples/party-9999/cluster-gpu.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster-spark-local-pulsar.yaml b/k8s-deploy/examples/party-9999/cluster-spark-local-pulsar.yaml index 468ff102d..f7cb9e570 100644 --- a/k8s-deploy/examples/party-9999/cluster-spark-local-pulsar.yaml +++ b/k8s-deploy/examples/party-9999/cluster-spark-local-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster-spark-pulsar.yaml 
b/k8s-deploy/examples/party-9999/cluster-spark-pulsar.yaml index b075dd1f1..d608bcc6b 100644 --- a/k8s-deploy/examples/party-9999/cluster-spark-pulsar.yaml +++ b/k8s-deploy/examples/party-9999/cluster-spark-pulsar.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster-spark-rabbitmq.yaml b/k8s-deploy/examples/party-9999/cluster-spark-rabbitmq.yaml index 7de974510..dfb6439bd 100644 --- a/k8s-deploy/examples/party-9999/cluster-spark-rabbitmq.yaml +++ b/k8s-deploy/examples/party-9999/cluster-spark-rabbitmq.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-9999/cluster.yaml b/k8s-deploy/examples/party-9999/cluster.yaml index a63b7e972..f5ec6ce56 100644 --- a/k8s-deploy/examples/party-9999/cluster.yaml +++ b/k8s-deploy/examples/party-9999/cluster.yaml @@ -1,7 +1,7 @@ name: fate-9999 namespace: fate-9999 chartName: fate -chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 9999 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-exchange/rollsite.yaml b/k8s-deploy/examples/party-exchange/rollsite.yaml index 74200191a..657461fb8 100644 --- a/k8s-deploy/examples/party-exchange/rollsite.yaml +++ b/k8s-deploy/examples/party-exchange/rollsite.yaml @@ -1,7 +1,7 @@ name: fate-exchange namespace: fate-exchange chartName: fate-exchange -chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 1 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party-exchange/trafficServer.yaml b/k8s-deploy/examples/party-exchange/trafficServer.yaml index a26587233..ece20353a 100644 --- a/k8s-deploy/examples/party-exchange/trafficServer.yaml +++ b/k8s-deploy/examples/party-exchange/trafficServer.yaml @@ -1,7 +1,7 @@ name: fate-exchange namespace: fate-exchange chartName: 
fate-exchange -chartVersion: v1.11.1 +chartVersion: v1.11.2 partyId: 1 registry: "" pullPolicy: diff --git a/k8s-deploy/examples/party.config b/k8s-deploy/examples/party.config index 83762449a..37c5f56b2 100644 --- a/k8s-deploy/examples/party.config +++ b/k8s-deploy/examples/party.config @@ -1,5 +1,5 @@ -fate_chartVersion=v1.11.1 -fate_imageTAG=1.11.1-release +fate_chartVersion=v1.11.2 +fate_imageTAG=1.11.2-release fate_serving_chartVersion=v2.1.6 fate_serving_imageTAG=2.1.6-release party_9999_IP=192.168.9.1 diff --git a/k8s-deploy/pkg/job/cluster_install.go b/k8s-deploy/pkg/job/cluster_install.go index 63d2c5e15..c7fe3b088 100644 --- a/k8s-deploy/pkg/job/cluster_install.go +++ b/k8s-deploy/pkg/job/cluster_install.go @@ -80,6 +80,13 @@ func initJob(clusterArgs *modules.ClusterArgs, method, creator string) (*modules func clusterInstallRun(job *modules.Job) { + defer func() { + if err := recover(); err != nil { + log.Error().Err(err.(error)).Msg("clusterInstallRun panic") + job.SetStatus(modules.JobStatusFailed) + } + }() + log.Debug().Str("jobID", job.Uuid).Msg("job Running") // update status Running err := updateJobStatusToRunning(job) diff --git a/registry/README.md b/registry/README.md index 3bad2e7cf..0cee6f487 100644 --- a/registry/README.md +++ b/registry/README.md @@ -61,7 +61,7 @@ After Harbor has been installed, FATE docker images must be pushed to Harbor reg ## Build images from FATE source code -Refer to [Build Document](https://github.com/FederatedAI/FATE/blob/master/build/docker-build/README.md) to create FATE images. Once images are built, push them to Harbor. +Refer to [Build Document](https://github.com/FederatedAI/FATE-Builder/tree/main/docker-build/README.md) to create FATE images. Once images are built, push them to Harbor. Usually, a user does not need to take the long time to build images from source. It is recommended to use te pre-built docker images of FATE directly. Refer to te below section to for more details.