From f9a36dff3fa62cb5fc12fce7422e623baac5a06d Mon Sep 17 00:00:00 2001 From: lou-lan Date: Wed, 24 Apr 2024 17:17:37 +0800 Subject: [PATCH] Enhance code fix daily e2e failed Signed-off-by: lou-lan --- .github/workflows/call-e2e.yaml | 2 +- test/doc/reliability.md | 4 +- test/doc/reliability_zh.md | 28 +++----- test/e2e/common/calico.go | 7 +- .../egressclusterinfo_test.go | 14 +++- test/e2e/egressgateway/egressgateway_test.go | 65 +++++++++++++++++++ test/e2e/reliability/reliability_test.go | 1 + 7 files changed, 95 insertions(+), 26 deletions(-) diff --git a/.github/workflows/call-e2e.yaml b/.github/workflows/call-e2e.yaml index 3968bf9eb..cea698d8b 100644 --- a/.github/workflows/call-e2e.yaml +++ b/.github/workflows/call-e2e.yaml @@ -124,7 +124,7 @@ jobs: - name: Setup Kind Cluster uses: nick-invision/retry@v3 with: - timeout_minutes: 10 + timeout_minutes: 20 max_attempts: 1 shell: bash command: | diff --git a/test/doc/reliability.md b/test/doc/reliability.md index 4f3aab076..57adc58fe 100644 --- a/test/doc/reliability.md +++ b/test/doc/reliability.md @@ -2,8 +2,8 @@ | Case ID | Title | Priority | Smoke | Status | Other | |---------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|---------|--------|-------| -| R00001 | Use `kwok` to create 10 
`Node`, create `Deployment` with 1000 replicas, create `Policy` and set `PodSelector` to match `Deployment`,
all matched `Pod`'s egress IP in the real node is `eip` | p3 | false | done | | -| R00002 | Use `kwok` to create 10 `Node`, create `Deployment` with 1000 replicas, create `Policy` and set `PodSelector` to match `Deployment`,
After restarting `Deployment` successfully, all matched `Pod`'s egress IP in the real node is `eip` | p3 | false | done | | +| R00001 | Use `kwok` to create 10 `Node`, create `Deployment` with 1000 replicas, create `Policy` and set `PodSelector` to match `Deployment`,
all matched `Pod`'s egress IP in the real node is `eip` | p3 | false | | | +| R00002 | Use `kwok` to create 10 `Node`, create `Deployment` with 1000 replicas, create `Policy` and set `PodSelector` to match `Deployment`,
After restarting `Deployment` successfully, all matched `Pod`'s egress IP in the real node is `eip` | p3 | false | | | | R00005 | When the node where `eip` takes effect is shut down, `eip` will take effect to another node matching `NodeSelector`, and `egressGatewayStatus` and `EgressClusterStatus` are updated as expected, and the `EgressTunnel` corresponding to the shutdown node ` will be deleted and the egress IP will be accessed as expected | p3 | false | done | | | R00006 | After shutting down all nodes matched by `NodeSelector` in `egressGateway`,
`Pod`’s egress IP will be changed from `eip` to non-`eip`, `egressGatewayStatus.NodeList` will be empty, and the related `EgressIgnoreCIDR.NodeIP` will be deleted and the `EgressTunnel` corresponding to the shutdown node will be deleted.
After one of the `node` is turned on, `egressgateway` will recover in a short time and record the recovery time, and `eip` will be revalidated as the egress IP of `Pod`, and the `nodeIP` will be added to `EgressIgnoreCIDR.NodeIP` and `node` related information in `egressGatewayStatus.NodeList` is updated correctly,
after all boots, `eip` will only take effect on the first recovered `node`, and `EgressIgnoreCIDR.NodeIP` is updated correct | p3 | false | done | | | R00007 | Restart each component in the cluster (including calico, kube-proxy) `Pod` in turn. During the restart process, the access IP to outside the cluster is the set `eip` before, and the traffic cannot be interrupted. After the cluster returns to normal, `egressgateway` The individual `cr` state of the component is correct | p1 | false | done | | diff --git a/test/doc/reliability_zh.md b/test/doc/reliability_zh.md index 85da78e4e..c4aa18f84 100644 --- a/test/doc/reliability_zh.md +++ b/test/doc/reliability_zh.md @@ -1,22 +1,10 @@ - # Reliability E2E 用例 -| 用例编号 | 标题 | 优先级 | 冒烟 | 状态 | 其他 | -|--------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----|-------|-----|-----| -| R00001 | 使用 `kwok` 创建 10 个 `Node`,创建 1000 个副本的 `Deployment`,创建 `Policy` 并设置 `PodSelector`,使之与 `Deployment` 匹配,
真实节点中匹配到的所有 `Pod` 的出口 IP 为 `eip` | p3 | false | done | | -| R00002 | 使用 `kwok` 创建 10 个 `Node`,创建 1000 个副本的 `Deployment`,创建 `Policy` 并设置 `PodSelector`,使之与 `Deployment` 匹配,
重启 `Deployment` 成功后, 真实节点中匹配到的所有 `Pod` 的出口 IP 为 `eip` | p3 | false | done | | -| R00005 | 当关机 `eip` 生效的节点后,`eip` 会生效到另外匹配 `NodeSelector` 的节点上,
并且 `egressGatewayStatus` 及 `EgressClusterStatus` 如预期更新,与被关机的节点对应的 `EgressTunnel` 将被删除,出口 IP 如预期访问 | p3 | false | done | | -| R00006 | 当关机 `egressGateway` 中 `NodeSelector` 匹配的所有节点后,
`Pod` 的出口 IP 将由 `eip` 改为非 `eip`,`egressGatewayStatus.NodeList` 将为空,相关的 `EgressIgnoreCIDR.NodeIP` 将被删除,与被关机的节点对应的 `EgressTunnel` 将被删除。
将其中一个 `node` 开机后,`egressgateway` 会在短时间内恢复并记录恢复时间,并且 `eip` 重新生效为 `Pod` 的出口 IP,`EgressIgnoreCIDR.NodeIP` 将对应的 `nodeIP` 添加并且 `egressGatewayStatus.NodeList` 中 `node` 相关信息更新正确,
全部开机最后 `eip` 只会生效在第一个恢复的 `node` 上,`EgressIgnoreCIDR.NodeIP` 更新正确 | p3 | false | done | | -| R00007 | 依次重启集群中各个组件(包含 calico,kube-proxy)`Pod`, 重启过程中访问集群外部的出口 IP 为设置好的 `eip`,并且业务不能断流, 等待集群恢复正常后,`egressgateway` 组件的各个 `cr` 状态正确 | p1 | false | done | | -| R00008 | 创建 `egressGateway` 分配有 100 个 IP 的池,创建 120 个 policy,做多次删除和创建操作之后,期望 `egressGateway` 及 `Policy` 状态正确, `pod` 的出口 IP 符合预期 | p1 | true | done | | \ No newline at end of file +| 用例编号 | 标题 | 优先级 | 冒烟 | 状态 | 其他 | +|--------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----|-------|------|----| +| R00001 | 使用 `kwok` 创建 10 个 `Node`,创建 1000 个副本的 `Deployment`,创建 `Policy` 并设置 `PodSelector`,使之与 `Deployment` 匹配,
真实节点中匹配到的所有 `Pod` 的出口 IP 为 `eip` | p3 | false | | | +| R00002 | 使用 `kwok` 创建 10 个 `Node`,创建 1000 个副本的 `Deployment`,创建 `Policy` 并设置 `PodSelector`,使之与 `Deployment` 匹配,
重启 `Deployment` 成功后, 真实节点中匹配到的所有 `Pod` 的出口 IP 为 `eip` | p3 | false | | | +| R00005 | 当关机 `eip` 生效的节点后,`eip` 会生效到另外匹配 `NodeSelector` 的节点上,
并且 `egressGatewayStatus` 及 `EgressClusterStatus` 如预期更新,与被关机的节点对应的 `EgressTunnel` 将被删除,出口 IP 如预期访问 | p3 | false | done | | +| R00006 | 当关机 `egressGateway` 中 `NodeSelector` 匹配的所有节点后,
`Pod` 的出口 IP 将由 `eip` 改为非 `eip`,`egressGatewayStatus.NodeList` 将为空,相关的 `EgressIgnoreCIDR.NodeIP` 将被删除,与被关机的节点对应的 `EgressTunnel` 将被删除。
将其中一个 `node` 开机后,`egressgateway` 会在短时间内恢复并记录恢复时间,并且 `eip` 重新生效为 `Pod` 的出口 IP,`EgressIgnoreCIDR.NodeIP` 将对应的 `nodeIP` 添加并且 `egressGatewayStatus.NodeList` 中 `node` 相关信息更新正确,
全部开机最后 `eip` 只会生效在第一个恢复的 `node` 上,`EgressIgnoreCIDR.NodeIP` 更新正确 | p3 | false | done | | +| R00007 | 依次重启集群中各个组件(包含 calico,kube-proxy)`Pod`, 重启过程中访问集群外部的出口 IP 为设置好的 `eip`,并且业务不能断流, 等待集群恢复正常后,`egressgateway` 组件的各个 `cr` 状态正确 | p1 | false | done | | +| R00008 | 创建 `egressGateway` 分配有 100 个 IP 的池,创建 120 个 policy,做多次删除和创建操作之后,期望 `egressGateway` 及 `Policy` 状态正确, `pod` 的出口 IP 符合预期 | p1 | true | done | | \ No newline at end of file diff --git a/test/e2e/common/calico.go b/test/e2e/common/calico.go index a5693a661..20b789fdb 100644 --- a/test/e2e/common/calico.go +++ b/test/e2e/common/calico.go @@ -5,10 +5,12 @@ package common import ( "context" + goerrors "errors" "github.com/go-faker/faker/v4" calicov1 "github.com/tigera/operator/pkg/apis/crd.projectcalico.org/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/apiutil" ) func CreateCalicoIPPool(ctx context.Context, cli client.Client, @@ -49,7 +51,10 @@ func ListCalicoIPPool(ctx context.Context, cli client.Client) ([]string, error) list := &calicov1.IPPoolList{} err := cli.List(ctx, list) if err != nil { - return res, err + rdfErr := &apiutil.ErrResourceDiscoveryFailed{} + if !goerrors.As(err, &rdfErr) { + return res, err + } } for _, item := range list.Items { diff --git a/test/e2e/egressclusterinfo/egressclusterinfo_test.go b/test/e2e/egressclusterinfo/egressclusterinfo_test.go index 53f800c5b..a5f1f315e 100644 --- a/test/e2e/egressclusterinfo/egressclusterinfo_test.go +++ b/test/e2e/egressclusterinfo/egressclusterinfo_test.go @@ -5,6 +5,8 @@ package egressclusterinfo_test import ( "context" + goerrors "errors" + "os" "strings" . 
"github.com/onsi/ginkgo/v2"
@@ -14,6 +16,7 @@ import (
 	calicov1 "github.com/tigera/operator/pkg/apis/crd.projectcalico.org/v1"
 	"k8s.io/apimachinery/pkg/types"
 	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
 
 	egressv1 "github.com/spidernet-io/egressgateway/pkg/k8s/apis/v1beta1"
 	"github.com/spidernet-io/egressgateway/test/e2e/common"
@@ -41,8 +44,12 @@ var _ = Describe("EgressClusterInfo", Label("EgressClusterInfo"), Serial, func()
 
 		list := &calicov1.IPPoolList{}
 		err := cli.List(ctx, list)
-		Expect(err).NotTo(HaveOccurred())
-
+		if err != nil {
+			rdfErr := &apiutil.ErrResourceDiscoveryFailed{}
+			if !goerrors.As(err, &rdfErr) {
+				Expect(err).NotTo(HaveOccurred())
+			}
+		}
 		for _, p := range list.Items {
 			if strings.Contains(p.Name, calicoIPv4Prefix) ||
 				strings.Contains(p.Name, calicoIPv6Prefix) {
@@ -67,6 +74,9 @@ var _ = Describe("EgressClusterInfo", Label("EgressClusterInfo"), Serial, func()
 	})
 
 	It("Create or update calico IPPool", Serial, Label("I00006"), func() {
+		if os.Getenv("cni") != "calico" {
+			Skip("test only for calico")
+		}
 		if egressConfig.EnableIPv6 {
 			createOrUpdateCalicoIPPoolAndCheck(
 				ctx, cli, eci, calicoIPv6Prefix, "112", common.RandomIPPoolV6Cidr)
diff --git a/test/e2e/egressgateway/egressgateway_test.go b/test/e2e/egressgateway/egressgateway_test.go
index f53713e1e..11a069aa4 100644
--- a/test/e2e/egressgateway/egressgateway_test.go
+++ b/test/e2e/egressgateway/egressgateway_test.go
@@ -724,6 +724,71 @@ var _ = Describe("Operate EgressGateway", Label("EgressGateway"), Ordered, func(
 	})
 })
 
+// This container checks the IP usage reported by an EgressGateway whose node
+// selector is intended to match no node (per the spec title): the gateway is
+// created with fixed IPv4/IPv6 pools and the spec waits until Status.IPUsage
+// shows the expected totals for every enabled IP family.
+var _ = Describe("Check EgressGateway usage when not mach node", Label("EgressGateway", "EgressGatewayUsage"), Ordered, func() {
+	var (
+		// expIPv4Count/expIPv6Count stay 0 for an IP family that is disabled.
+		gateway                        *egressv1.EgressGateway
+		singleIpv4Pool, singleIpv6Pool []string
+		expIPv4Count, expIPv6Count     int
+	)
+
+	BeforeEach(func() {
+		// Each pool is one single address plus a ten-address range: 11 IPs.
+		if egressConfig.EnableIPv4 {
+			singleIpv4Pool = []string{"10.6.1.21", "10.6.1.11-10.6.1.20"}
+			expIPv4Count = 11
+		}
+		if egressConfig.EnableIPv6 {
+			singleIpv6Pool = []string{"fd00::1", "fd01::1-fd01::a"}
+			expIPv6Count = 11
+		}
+
+		var err error
+		gateway, err = common.CreateGatewayCustom(context.Background(), cli, func(egw *egressv1.EgressGateway) {
+			// Presumably no node carries this label, so no node is selected.
+			egw.Spec.NodeSelector = egressv1.NodeSelector{
+				Policy: common.AVERAGE_SELECTION,
+				Selector: &metav1.LabelSelector{
+					MatchLabels: map[string]string{"not-match-label": "not-match-label"},
+				},
+			}
+			egw.Spec.Ippools.IPv4 = singleIpv4Pool
+			egw.Spec.Ippools.IPv6 = singleIpv6Pool
+		})
+		Expect(err).NotTo(HaveOccurred())
+		// Delete the gateway again once the spec has finished.
+		DeferCleanup(func() {
+			if gateway != nil {
+				GinkgoWriter.Printf("Delete egw: %s\n", gateway.Name)
+				Expect(common.DeleteEgressGateway(context.Background(), cli, gateway, time.Minute/2)).NotTo(HaveOccurred())
+			}
+		})
+	})
+
+	It("should correctly manage gateway IP usage", func() {
+		// Bound the API calls made by this spec to one minute.
+		ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
+		defer cancel()
+
+		// Poll once per second; on timeout Eventually fails the spec and
+		// reports the last unmet expectation instead of a bare "timeout".
+		Eventually(func(g Gomega) {
+			// A failed Get is retried like any other unmet expectation.
+			g.Expect(cli.Get(ctx, types.NamespacedName{Name: gateway.Name}, gateway)).To(Succeed())
+			if egressConfig.EnableIPv4 {
+				g.Expect(gateway.Status.IPUsage.IPv4Total).To(Equal(expIPv4Count))
+			}
+			if egressConfig.EnableIPv6 {
+				g.Expect(gateway.Status.IPUsage.IPv6Total).To(Equal(expIPv6Count))
+			}
+		}).WithTimeout(time.Minute).WithPolling(time.Second).Should(Succeed())
+	})
+})
+
 func createEgressGateway(ctx context.Context) (egw *egressv1.EgressGateway) {
 	// create gateway
 	GinkgoWriter.Println("Create EgressGateway")
diff --git a/test/e2e/reliability/reliability_test.go b/test/e2e/reliability/reliability_test.go
index 2ca101000..495811da8 100644
--- a/test/e2e/reliability/reliability_test.go
+++ b/test/e2e/reliability/reliability_test.go
@@ -330,6 +330,7 @@ var _ = Describe("Reliability", Serial, Label("Reliability"), func() {
 		)
 
 		BeforeEach(func() {
+			Skip("kwok not support egress
agent mock") ctx = context.Background() kNodesNum = 10 // deploy