diff --git a/.github/workflows/gcp-full-infra-test-apply.yaml b/.github/workflows/gcp-full-infra-test-apply.yaml deleted file mode 100644 index 5e74b1ef..00000000 --- a/.github/workflows/gcp-full-infra-test-apply.yaml +++ /dev/null @@ -1,165 +0,0 @@ -name: GCP Full Infra Creation/Deletion Test [APPLY] - -on: - workflow_dispatch: - inputs: - pr_number: - description: "Pull Request Number" - required: true - pull_request_review: - types: [submitted] - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -permissions: - contents: "read" - id-token: "write" - -env: - PR_NUMBER: ${{ github.event.pull_request.number }} - PLAN_WORKFLOW_NAME: gcp-full-infra-test-plan.yaml - tenant_group_name: "workflow-pr-${{ github.event.pull_request.number }}" - tenant_name: "t-1" - TF_VAR_falkordb_version: "edge" - -jobs: - apply: - if: github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request_review' && github.event.review.state == 'approved') - runs-on: ubuntu-latest - environment: testing - steps: - - uses: actions/checkout@v4 - - - name: Set PR_NUMBER env variable - if: github.event_name == 'workflow_dispatch' - run: | - # If event is workflow_dispatch, use the event.workflow_dispatch.inputs.pr_number - echo "PR_NUMBER=${{ github.event.inputs.pr_number }}" >> $GITHUB_ENV - echo "tenant_group_name=workflow-pr-${{ github.event.inputs.pr_number }}" >> $GITHUB_ENV - - - name: Setup GCloud - uses: "google-github-actions/auth@v2.1.2" - with: - workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} - service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} - - - name: Set up Cloud SDK - uses: "google-github-actions/setup-gcloud@v2.1.0" - with: - version: ">= 363.0.0" - install_components: "gke-gcloud-auth-plugin" - - - name: Set up Tofu - uses: opentofu/setup-opentofu@v1.0.3 - with: - tofu_wrapper: false - - - name: Retrieve artifacts from last plan - uses: 
dawidd6/action-download-artifact@v3 - with: - workflow: ${{ env.PLAN_WORKFLOW_NAME }} - pr: ${{ env.PR_NUMBER }} - name: artifacts-${{ env.PR_NUMBER }} - path: artifacts - - - name: Apply GCP Test Environment - id: apply_tenant_group - timeout-minutes: 20 - continue-on-error: true - working-directory: ./tofu/gcp/test_env - run: | - tofu init -backend-config="bucket=${{ vars.GCP_STATE_BUCKET_NAME }}" -backend-config="prefix=test_env/$PR_NUMBER" - tofu apply -auto-approve ${GITHUB_WORKSPACE}/artifacts/test_env.tfplan - - - name: Get link for Tenant Group state - run: echo "https://console.cloud.google.com/storage/browser/${{ vars.GCP_STATE_BUCKET_NAME }}/test_env/$PR_NUMBER?project=${{ vars.GCP_PROJECT_ID }}" - - - name: Get cluster credentials - if: steps.apply_tenant_group.outcome == 'success' - id: get_cluster_credentials - continue-on-error: true - working-directory: ./tofu/gcp/test_env - run: | - gcloud container clusters get-credentials $(tofu output -json | jq -r '.cluster_name.value') --region ${{ vars.GCP_REGION }} --project ${{ vars.GCP_PROJECT_ID }} - - - name: Check connection to cluster - if: steps.get_cluster_credentials.outcome == 'success' - continue-on-error: true - run: | - kubectl get pods --all-namespaces - - - name: Set tenant connection variables - if: steps.apply_tenant_group.outcome == 'success' - continue-on-error: true - working-directory: ./tofu/gcp/test_env - run: | - - echo "FALKORDB_STANDALONE_HOST=$(tofu output -json | jq -r '.ip_address.value')" >> $GITHUB_ENV - echo "FALKORDB_STANDALONE_PORT=$(tofu output -json | jq -r '.falkordb_standalone_tenant_redis_port.value')" >> $GITHUB_ENV - - echo "FALKORDB_SINGLE_ZONE_HOST=$(tofu output -json | jq -r '.ip_address.value')" >> $GITHUB_ENV - echo "FALKORDB_SINGLE_ZONE_PORT=$(tofu output -json | jq -r '.falkordb_single_zone_tenant_redis_port.value')" >> $GITHUB_ENV - echo "FALKORDB_SINGLE_ZONE_TENANT_NAMESPACE=$(tofu output -json | jq -r '.falkordb_single_zone_tenant_namespace.value')" >> 
$GITHUB_ENV - - echo "FALKORDB_PASSWORD=${{ secrets.TF_VAR_FALKORDB_PASSWORD }}" >> $GITHUB_ENV - - - name: Setup Python environment - if: steps.apply_tenant_group.outcome == 'success' - id: setup_python - continue-on-error: true - run: | - python3 -m venv .venv - source .venv/bin/activate - pip install -r requirements.txt - - - name: Run Standalone tests - if: steps.apply_tenant_group.outcome == 'success' && steps.setup_python.outcome == 'success' - id: python_tests_standalone - continue-on-error: true - run: | - source .venv/bin/activate - pytest -s tofu/gcp/test_env/__tests__/standalone/main.py --hostname ${{ env.FALKORDB_STANDALONE_HOST }} --port ${{ env.FALKORDB_STANDALONE_PORT }} --password ${{ secrets.TF_VAR_FALKORDB_PASSWORD }} - - - name: Run Replica tests - if: steps.apply_tenant_group.outcome == 'success' && steps.setup_python.outcome == 'success' - id: python_tests_replica - continue-on-error: true - run: | - source .venv/bin/activate - pytest -s tofu/gcp/test_env/__tests__/replica/main.py --hostname ${{ env.FALKORDB_SINGLE_ZONE_HOST }} --port ${{ env.FALKORDB_SINGLE_ZONE_PORT }} --namespace ${{ env.FALKORDB_SINGLE_ZONE_TENANT_NAMESPACE }} --password ${{ secrets.TF_VAR_FALKORDB_PASSWORD }} - - - name: Destroy GCP Test Environment - if: always() - continue-on-error: true - working-directory: ./tofu/gcp/test_env - env: - TF_VAR_project_id: ${{ vars.GCP_PROJECT_ID }} - TF_VAR_region: ${{ vars.GCP_REGION }} - TF_VAR_tenant_group_name: ${{ env.tenant_group_name }} - TF_VAR_force_destroy_backup_bucket: ${{ vars.GCP_FORCE_DESTROY_BACKUP_BUCKET }} - TF_VAR_subnet_cidr: ${{ vars.GCP_SUBNET_CIDR }} - TF_VAR_ip_range_pods: ${{ vars.GCP_IP_RANGE_PODS }} - TF_VAR_ip_range_services: ${{ vars.GCP_IP_RANGE_SERVICES }} - TF_VAR_tenant_provision_sa: ${{ vars.GCP_TENANT_PROVISION_SA }} - TF_VAR_backup_retention_policy_days: ${{ vars.GCP_BACKUP_RETENTION_POLICY_DAYS }} - TF_VAR_cluster_deletion_protection: ${{ vars.GCP_CLUSTER_DELETION_PROTECTION }} - TF_VAR_tenant_name: 
${{ env.tenant_name }} - TF_VAR_falkordb_password: ${{ secrets.TF_VAR_FALKORDB_PASSWORD }} - TF_VAR_falkordb_cpu: ${{ vars.TF_VAR_FALKORDB_CPU }} - TF_VAR_falkordb_memory: ${{ vars.TF_VAR_FALKORDB_MEMORY }} - TF_VAR_persistence_size: ${{ vars.TF_VAR_PERSISTANCE_SIZE }} - TF_VAR_falkordb_replicas: ${{ vars.TF_VAR_FALKORDB_REPLICAS }} - TF_VAR_backup_schedule: ${{ vars.TF_VAR_BACKUP_SCHEDULE }} - TF_VAR_dns_domain: "${{ env.tenant_name }}.${{ vars.TF_VAR_DNS_DOMAIN }}" - run: | - tofu destroy -auto-approve - - - name: Fail if tests did not pass - if: always() - run: | - # Check if python tests conclusion is success - if [ ${{ steps.python_tests_standalone.outcome }} != 'success' ] || [ ${{ steps.python_tests_replica.outcome }} != 'success' ]; then - exit 1 - fi diff --git a/.github/workflows/gcp-full-infra-test-plan.yaml b/.github/workflows/gcp-full-infra-test-plan.yaml deleted file mode 100644 index 4817a9c5..00000000 --- a/.github/workflows/gcp-full-infra-test-plan.yaml +++ /dev/null @@ -1,81 +0,0 @@ -name: GCP Full Infra Creation/Deletion Test [PLAN] - -on: - workflow_dispatch: - inputs: - pr_number: - description: "Pull Request Number" - required: true - pull_request: - branches: [main] - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -permissions: - contents: "read" - id-token: "write" - -env: - PR_NUMBER: ${{ github.event.pull_request.number }} - tenant_group_name: "workflow-pr-${{ github.event.pull_request.number }}" - tenant_name: "t-1" - TF_VAR_falkordb_version: 'edge' - -jobs: - plan: - runs-on: ubuntu-latest - environment: testing - steps: - - uses: actions/checkout@v4 - - - name: Set PR_NUMBER env variable - if: github.event_name == 'workflow_dispatch' - run: | - # If event is workflow_dispatch, use the event.workflow_dispatch.inputs.pr_number - echo "PR_NUMBER=${{ github.event.inputs.pr_number }}" >> $GITHUB_ENV - echo "tenant_group_name=workflow-pr-${{ 
github.event.inputs.pr_number }}" >> $GITHUB_ENV - - - name: Setup GCloud - uses: "google-github-actions/auth@v2.1.2" - with: - workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} - service_account: ${{ secrets.GCP_SERVICE_ACCOUNT }} - - - name: Set up Cloud SDK - uses: "google-github-actions/setup-gcloud@v2.1.0" - with: - version: ">= 363.0.0" - - - name: Set up Tofu - uses: opentofu/setup-opentofu@v1.0.3 - - - name: Plan GCP Test Environment - env: - TF_VAR_project_id: ${{ vars.GCP_PROJECT_ID }} - TF_VAR_region: ${{ vars.GCP_REGION }} - TF_VAR_tenant_group_name: ${{ env.tenant_group_name }} - TF_VAR_force_destroy_backup_bucket: ${{ vars.GCP_FORCE_DESTROY_BACKUP_BUCKET }} - TF_VAR_subnet_cidr: ${{ vars.GCP_SUBNET_CIDR }} - TF_VAR_ip_range_pods: ${{ vars.GCP_IP_RANGE_PODS }} - TF_VAR_ip_range_services: ${{ vars.GCP_IP_RANGE_SERVICES }} - TF_VAR_tenant_provision_sa: ${{ vars.GCP_TENANT_PROVISION_SA }} - TF_VAR_backup_retention_policy_days: ${{ vars.GCP_BACKUP_RETENTION_POLICY_DAYS }} - TF_VAR_cluster_deletion_protection: ${{ vars.GCP_CLUSTER_DELETION_PROTECTION }} - TF_VAR_tenant_name: ${{ env.tenant_name }} - TF_VAR_falkordb_password: ${{ secrets.TF_VAR_FALKORDB_PASSWORD }} - TF_VAR_backup_schedule: ${{ vars.TF_VAR_BACKUP_SCHEDULE }} - TF_VAR_dns_domain: "${{ env.tenant_group_name }}.${{ vars.TF_VAR_DNS_DOMAIN }}" - working-directory: ./tofu/gcp/test_env - run: | - mkdir -p ${GITHUB_WORKSPACE}/artifacts - tofu init -backend-config="bucket=${{ vars.GCP_STATE_BUCKET_NAME }}" -backend-config="prefix=test_env/$PR_NUMBER" - tofu test - tofu plan -out ${GITHUB_WORKSPACE}/artifacts/test_env.tfplan - - - name: Archive artifacts - uses: actions/upload-artifact@v4 - with: - name: artifacts-${{ env.PR_NUMBER }} - path: artifacts diff --git a/.github/workflows/testing-apply-aws-infra.yaml b/.github/workflows/testing-apply-aws-infra.yaml deleted file mode 100644 index 561f0cf4..00000000 --- a/.github/workflows/testing-apply-aws-infra.yaml +++ /dev/null @@ 
-1,181 +0,0 @@ -name: TESTING Apply AWS infrastructure - -on: - # pull_request_review: - # types: [submitted] - workflow_dispatch: - inputs: - pr_number: - description: "Pull request number" - required: true - -defaults: - run: - working-directory: ./scripts - -env: - PR_NUMBER: ${{github.event.pull_request.number }} - PLAN_WORKFLOW_NAME: testing-plan-aws-infra.yaml - # TF_CACHE_DIR: ${{ github.workspace }}/tofu/.terraform - -jobs: - apply-test-aws-k8s-infra: - if: github.event.review.state == 'approved' || github.event_name == 'workflow_dispatch' - runs-on: ubuntu-latest - environment: testing - steps: - - uses: actions/checkout@v4 - - - name: Set PR_NUMBER env variable - if: github.event_name == 'workflow_dispatch' - run: | - # If event is workflow_dispatch, use the event.workflow_dispatch.inputs.pr_number - echo "PR_NUMBER=${{ github.event.inputs.pr_number }}" >> $GITHUB_ENV - - - name: Init AWS credentials - uses: aws-actions/configure-aws-credentials@v4.0.2 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_KEY }} - aws-region: ${{ vars.TF_VAR_REGION }} - - - name: Set up Tofu - uses: opentofu/setup-opentofu@v1.0.3 - with: - tofu_wrapper: false - - # - name: Create Terraform Plugin Cache Dir - # run: mkdir --parents ${{ env.TF_CACHE_DIR }} - - # - name: Cache OpenTofu plugins - # uses: actions/cache@v2 - # id: cache_opentofu_plugins - # with: - # path: ${{ env.TF_CACHE_DIR }} - # key: tofu-${{hashFiles('./tofu/.terraform.lock.hcl')}} - - - name: Get artifacts - uses: dawidd6/action-download-artifact@v3 - with: - workflow: ${{ env.PLAN_WORKFLOW_NAME }} - pr: ${{ env.PR_NUMBER }} - workflow_conclusion: success - name: artifacts-${{ env.PR_NUMBER }} - path: artifacts - - - name: Copy artifacts - working-directory: ./artifacts - run: | - cp plan-aws.out ../tofu/aws/plan-aws - cp terraform.tfvars ../tofu/terraform.tfvars - - - name: Apply AWS infrastructure - id: apply_infrastructure - timeout-minutes: 20 - 
continue-on-error: true - working-directory: ./tofu/aws - run: | - tofu init - tofu apply -auto-approve -state-out ../../state/state-aws plan-aws - - - name: Set AWS infra output variables - continue-on-error: true - working-directory: ./tofu/aws - run: | - echo "TF_VAR_falkordb_eks_cluster_name=$(tofu output -state=../../state/state-aws -raw falkordb_eks_cluster_name)" >> $GITHUB_ENV - echo "TF_VAR_falkordb_s3_backup_name=$(tofu output -state=../../state/state-aws -raw falkordb_s3_backup_name)" >> $GITHUB_ENV - echo "TF_VAR_falkordb_eks_cluster_oidc_issuer_url=$(tofu output -state=../../state/state-aws -raw falkordb_eks_cluster_oidc_issuer_url)" >> $GITHUB_ENV - echo "TF_VAR_falkordb_eks_cluster_oidc_issuer_arn=$(tofu output -state=../../state/state-aws -raw falkordb_eks_cluster_oidc_issuer_arn)" >> $GITHUB_ENV - echo "TF_VAR_falkordb_eks_cluster_role_arn=$(tofu output -state=../../state/state-aws -raw falkordb_eks_cluster_role_arn)" >> $GITHUB_ENV - echo "TF_VAR_falkordb_eks_cluster_endpoint=$(tofu output -state=../../state/state-aws -raw falkordb_eks_cluster_endpoint)" >> $GITHUB_ENV - echo "TF_VAR_falkordb_eks_cluster_certificate_autority=$(tofu output -state=../../state/state-aws -raw falkordb_eks_cluster_certificate_autority)" >> $GITHUB_ENV - - - name: Archive AWS target state - uses: actions/upload-artifact@v4 - with: - name: state-aws-${{ env.PR_NUMBER }} - path: state/state-aws - if-no-files-found: error - - - name: Get EKS cluster credentials - id: get_eks_credentials - if: steps.apply_infrastructure.outcome == 'success' - continue-on-error: true - run: | - ./aws_update_kubeconfig.sh testing-cluster-${{ env.PR_NUMBER }} ${{ secrets.TF_VAR_ASSUME_ROLE_ARN }} - - - name: Check connection to EKS cluster - if: steps.get_eks_credentials.outcome == 'success' - continue-on-error: true - run: | - kubectl get nodes - - - name: Apply k8s module - if: steps.get_eks_credentials.outcome == 'success' - id: apply_k8s_module - timeout-minutes: 20 - continue-on-error: 
true - working-directory: ./tofu/k8s - run: | - tofu init - tofu plan -out=local-k8s -var-file=../terraform.tfvars -var "assume_role_arn=${{ secrets.TF_VAR_ASSUME_ROLE_ARN }}" -var "falkordb_password=${{ secrets.TF_VAR_FALKORDB_PASSWORD }}" - tofu apply -auto-approve -state-out ../../state/state-k8s local-k8s - - - name: Archive K8S target state - uses: actions/upload-artifact@v4 - with: - name: state-k8s-${{ env.PR_NUMBER }} - path: state/state-k8s - if-no-files-found: error - - - name: Get FalkorDB endpoint - if: steps.apply_k8s_module.outcome == 'success' - continue-on-error: true - id: lb - run: | - FALKORDB_HOST=$(kubectl get svc -n falkordb falkordb-redis -o jsonpath='{.status.loadBalancer.ingress[0].hostname}') - # If FALKORDB_HOST is empty, wait 30 seconds and try again - if [ -z "$FALKORDB_HOST" ]; then - sleep 30 - FALKORDB_HOST=$(kubectl get svc -n falkordb falkordb-redis -o jsonpath='{.status.loadBalancer.ingress[0].hostname}') - fi - - # If FALKORDB_HOST is still empty, exit with error - if [ -z "$FALKORDB_HOST" ]; then - echo "FALKORDB_HOST is empty" - echo "$(kubectl get svc -n falkordb falkordb-redis -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')" - exit 1 - fi - - echo "FALKORDB_HOST=$FALKORDB_HOST" >> $GITHUB_ENV - echo "FALKORDB_PORT=6379" >> $GITHUB_ENV - echo "FALKORDB_PASSWORD=${{ secrets.TF_VAR_FALKORDB_PASSWORD }}" >> $GITHUB_ENV - - - name: Run python tests - id: python_tests - if: steps.lb.outcome == 'success' - working-directory: . 
- continue-on-error: true - run: | - python3 -m venv .venv - source .venv/bin/activate - pip install -r requirements.txt - pytest - - - name: Destroy K8S infrastructure - if: always() - working-directory: ./tofu/k8s - run: tofu destroy -auto-approve -state ../../state/state-k8s -var-file=../terraform.tfvars -var "assume_role_arn=${{ secrets.TF_VAR_ASSUME_ROLE_ARN }}" -var "falkordb_password=${{ secrets.TF_VAR_FALKORDB_PASSWORD }}" - - - name: Destroy AWS infrastructure - if: always() - working-directory: ./tofu/aws - run: tofu destroy -auto-approve -state ../../state/state-aws -var-file=../terraform.tfvars -var "assume_role_arn=${{ secrets.TF_VAR_ASSUME_ROLE_ARN }}" -var "eks_auth_role=${{ secrets.TF_VAR_EKS_AUTH_ROLE }}" - - - name: Fail if tests did not pass - if: always() - run: | - # Check if python tests conclusion is success - if [ ${{ steps.python_tests.outcome }} != 'success' ]; then - exit 1 - fi diff --git a/.github/workflows/testing-plan-aws-infra.yaml b/.github/workflows/testing-plan-aws-infra.yaml deleted file mode 100644 index 867f7d71..00000000 --- a/.github/workflows/testing-plan-aws-infra.yaml +++ /dev/null @@ -1,108 +0,0 @@ -name: TESTING Plan AWS infrastructure - -on: - # pull_request: - # branches: [main] - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -defaults: - run: - working-directory: ./scripts - -env: - PR_NUMBER: ${{ github.event.pull_request.number }} - # TF_CACHE_DIR: ${{ github.workspace }}/tofu/.terraform - -jobs: - plan-infra: - runs-on: ubuntu-latest - environment: testing - steps: - - uses: actions/checkout@v4 - - - name: Init AWS credentials - uses: aws-actions/configure-aws-credentials@v4.0.2 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_KEY }} - aws-region: ${{ vars.TF_VAR_REGION }} - - - name: Set up testing environment variables - env: - name: testing-cluster-${{ 
env.PR_NUMBER }} - tenant_name: testing-tenant-${{ env.PR_NUMBER }} - region: ${{ vars.TF_VAR_REGION }} - k8s_version: ${{ vars.TF_VAR_K8S_VERSION }} - k8s_instance_type: ${{ vars.TF_VAR_K8S_INSTANCE_TYPE }} - k8s_node_count: ${{ vars.TF_VAR_K8S_NODE_COUNT }} - k8s_node_min_count: ${{ vars.TF_VAR_K8S_NODE_MIN_COUNT }} - k8s_node_max_count: ${{ vars.TF_VAR_K8S_NODE_MAX_COUNT }} - backup_retention_period: ${{ vars.TF_VAR_BACKUP_RETENTION_PERIOD }} - falkordb_version: v4.0.3 - falkordb_cpu: ${{ vars.TF_VAR_FALKORDB_CPU }} - falkordb_memory: ${{ vars.TF_VAR_FALKORDB_MEMORY }} - persistance_size: ${{ vars.TF_VAR_PERSISTANCE_SIZE }} - falkordb_replicas: ${{ vars.TF_VAR_FALKORDB_REPLICAS }} - grafana_admin_password: ${{ vars.TF_VAR_GRAFANA_ADMIN_PASSWORD }} - backup_schedule: ${{ vars.TF_VAR_BACKUP_SCHEDULE }} - # falkordb_domain: ${{ vars.TF_VAR_FALKORDB_DOMAIN }} - - # If we add these here, they will be exported in the tfvars file - # assume_role_arn: ${{ secrets.TF_VAR_ASSUME_ROLE_ARN }} - # eks_auth_role: ${{ secrets.TF_VAR_EKS_AUTH_ROLE }} - # falkordb_hosted_zone_id: ${{ secrets.TF_VAR_FALKORDB_HOSTED_ZONE_ID }} - # falkordb_password: ${{ secrets.TF_VAR_FALKORDB_PASSWORD }} - run: | - ./create_tfvars_from_env.sh - mkdir -p ${GITHUB_WORKSPACE}/artifacts - cp ../tofu/terraform.tfvars ${GITHUB_WORKSPACE}/artifacts/terraform.tfvars - - - name: Set up Tofu - uses: opentofu/setup-opentofu@v1.0.3 - - # - name: Create Tofu Plugin Cache Dir - # run: | - # echo 'plugin_cache_dir="$HOME/.terraform.d/plugin-cache"' >~/.terraformrc - # mkdir --parents ~/.terraform.d/plugin-cache - - # - name: Cache OpenTofu modules - # uses: actions/cache@v2 - # id: cache_opentofu_plugins - # with: - # path: ${{ env.TF_CACHE_DIR }} - # key: tofu-${{hashFiles('./tofu/.terraform.lock.hcl')}} - - # - name: Init infrastructure - # # if: steps.cache_opentofu_plugins.outputs.cache-hit != 'true' - # run: ./tofu_init.sh - - # - name: Run tests - # run: ./tofu_test.sh - - - name: Plan AWS module - id: 
plan_aws - working-directory: ./tofu/aws - run: | - tofu init - tofu test - tofu plan -out ../../local-aws -var "assume_role_arn=${{ secrets.TF_VAR_ASSUME_ROLE_ARN }}" -var "eks_auth_role=${{ secrets.TF_VAR_EKS_AUTH_ROLE }}" - cp ../../local-aws ${GITHUB_WORKSPACE}/artifacts/plan-aws.out - - - name: Plan K8S module - id: plan_k8s - working-directory: ./tofu/k8s - run: | - tofu init - tofu test - tofu plan -out ../../local-k8s -var "falkordb_s3_backup_name='test-backup-bucket-s3'" -var "assume_role_arn=${{ secrets.TF_VAR_ASSUME_ROLE_ARN }}" -var "falkordb_password=${{ secrets.TF_VAR_FALKORDB_PASSWORD }}" -var "falkordb_eks_cluster_oidc_issuer_url=''" -var "falkordb_eks_cluster_oidc_issuer_arn=''" -var "falkordb_eks_cluster_endpoint=''" -var "falkordb_eks_cluster_certificate_autority=dGVzdA==" - cp ../../local-k8s ${GITHUB_WORKSPACE}/artifacts/plan-k8s.out - - - name: Archive artifacts - uses: actions/upload-artifact@v4 - with: - name: artifacts-${{ env.PR_NUMBER }} - path: artifacts diff --git a/README.md b/README.md index fb6dde7c..124fba19 100644 --- a/README.md +++ b/README.md @@ -15,28 +15,29 @@ This repository contains OpenTofu templates to deploy FalkorDB on cloud. Scripts to help work with this repository -1. tofu_init.sh: Initialize OpenTofu working directory. -2. tofu_upgrade.sh: Upgrade OpenTofu dependency. -3. tofu_new_workspace.sh: Create workspace for deployment. -4. tofu_select_workspace.sh: Switch between workspaces. -5. tofu_delete_workspace.sh: Delete workspace. -6. tofu_list_workspace.sh: List available workspaces. -7. tofu_plan.sh: Generate execution plan to be deployed. -8. tofu_plan_aws.sh: Generate execution plan to be deployed for the AWS target. -9. tofu_plan_k8s.sh: Generate execution plan to be deployed for the K8S target. -10. tofu_apply.sh: Deploy the plan to the cloud provider. -11. tofu_apply_aws.sh: Deploy the AWS target to the cloud provider. -12. tofu_apply_k8s.sh: Deploy the K8S target to the cloud provider. -13. 
tofu_destroy.sh: Delete the deployment from the cloud provider. -14. tofu_output.sh: Show deployment output. -15. tofu_show.sh: Show the state configuration. -16. tofu_test.sh: Run Tofu tests. -17. aws_update_kubeconfig.sh: Update kubectl config. -18. kubectl_connect_falkordb_master.sh: Port forward into the FalkorDB master node. -19. kubectl_connect_grafana.sh: Port forward into the grafana gui. -20. kubectl_connect_prometheus.sh: Port forward into the prometheus gui. -21. kubectl_connect_alertmanager.sh: Port forward into the alert manager gui. -22. gcp_update_kubeconfig.sh: Update kubectl config. Args: 1=cluster-name, 2=region, 3=project-name +1. add_cluster.sh: Add a new application plane cluster to ArgoCD. +2. tofu_init.sh: Initialize OpenTofu working directory. +3. tofu_upgrade.sh: Upgrade OpenTofu dependencies. +4. tofu_new_workspace.sh: Create workspace for deployment. +5. tofu_select_workspace.sh: Switch between workspaces. +6. tofu_delete_workspace.sh: Delete workspace. +7. tofu_list_workspace.sh: List available workspaces. +8. tofu_plan.sh: Generate execution plan to be deployed. +9. tofu_plan_aws.sh: Generate execution plan to be deployed for the AWS target. +10. tofu_plan_k8s.sh: Generate execution plan to be deployed for the K8S target. +11. tofu_apply.sh: Deploy the plan to the cloud provider. +12. tofu_apply_aws.sh: Deploy the AWS target to the cloud provider. +13. tofu_apply_k8s.sh: Deploy the K8S target to the cloud provider. +14. tofu_destroy.sh: Delete the deployment from the cloud provider. +15. tofu_output.sh: Show deployment output. +16. tofu_show.sh: Show the state configuration. +17. tofu_test.sh: Run Tofu tests. +18. aws_update_kubeconfig.sh: Update kubectl config. +19. kubectl_connect_falkordb_master.sh: Port forward into the FalkorDB master node. +20. kubectl_connect_grafana.sh: Port forward into the Grafana GUI. +21. kubectl_connect_prometheus.sh: Port forward into the Prometheus GUI. +22.
kubectl_connect_alertmanager.sh: Port forward into the Alertmanager GUI. +23. gcp_update_kubeconfig.sh: Update kubectl config. Args: 1=cluster-name, 2=region, 3=project-name # Tofu diff --git a/argocd/app_plane/README b/argocd/app_plane/README new file mode 100644 index 00000000..9ce625f5 --- /dev/null +++ b/argocd/app_plane/README @@ -0,0 +1,17 @@ +# Observability Stack - Application plane + +Defines the resources that will be deployed in each application cluster to monitor its contents. + +The application plane for the observability stack is composed of the following components: + +- VictoriaMetrics: Store metrics + - Alertmanager +- Grafana Alloy: Scrape metrics from applications, and write them to the control plane +- Pod Monitor: Scrape metrics from pods +- Grafana Operator: Deploy the Grafana instance and dashboards +- Kube State Metrics: Scrape metrics from Kubernetes +- VMRules + +## Adding clusters to be monitored + +Use the script `scripts/add_cluster.sh` to add a new application plane cluster to ArgoCD. \ No newline at end of file diff --git a/argocd/app_plane/dev/alloy.yaml b/argocd/app_plane/dev/alloy.yaml new file mode 100644 index 00000000..4a4c231c --- /dev/null +++ b/argocd/app_plane/dev/alloy.yaml @@ -0,0 +1,317 @@ +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + name: grafana-alloy + namespace: argocd +spec: + goTemplate: true + goTemplateOptions: ["missingkey=error"] + generators: + - clusters: + selector: + matchLabels: + role: app-plane + template: + metadata: + name: '{{ regexFind "h?c-[A-Za-z0-9]+" .name }}-alloy' + spec: + project: default + source: + chart: alloy + repoURL: https://grafana.github.io/helm-charts + targetRevision: 0.11.0 + helm: + valuesObject: + fullnameOverride: alloy + alloy: + configMap: + content: |- + prometheus.remote_write "in_cluster" { + endpoint { + url = "http://vmsingle-vm-victoria-metrics-k8s-stack.observability.svc.cluster.local:8429/prometheus/api/v1/write" + } + } + + prometheus.remote_write "ctrl_plane" { + endpoint { + url =
"https://vmauth.observability.dev.internal.falkordb.cloud/prometheus/api/v1/write" + + basic_auth { + username = "{{ regexFind "h?c-[A-Za-z0-9]+" .name }}-user" + password_file = "/var/run/secrets/victoriametrics/vmuser/password" + } + + tls_config { + insecure_skip_verify = true + } + } + } + + logging { + level = "info" + format = "logfmt" + } + + livedebugging { + enabled = true + } + + import.git "ksm" { + repository = "https://github.com/grafana/alloy-modules.git" + revision = "main" + path = "modules/kubernetes/kube-state-metrics/metrics.alloy" + pull_frequency = "15m" + } + + prometheus.operator.podmonitors "pods" { + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + namespaces = ["observability"] + } + + discovery.kubernetes "pods" { + role = "pod" + } + + discovery.kubernetes "nodes" { + role = "node" + } + + ksm.kubernetes "targets" { } + + prometheus.exporter.unix "node_exporter" { } + + prometheus.exporter.self "alloy" { } + + discovery.relabel "metrics_cadvisor" { + targets = discovery.kubernetes.nodes.targets + + rule { + action = "labelmap" + regex = "__meta_kubernetes_node_label_(.+)" + } + + rule { + action = "replace" + target_label = "__address__" + replacement = "kubernetes.default.svc.cluster.local:443" + } + + rule { + source_labels = ["__meta_kubernetes_node_name"] + regex = "(.+)" + action = "replace" + replacement = "/api/v1/nodes/${1}/proxy/metrics/cadvisor" + target_label = "__metrics_path__" + } + } + + discovery.relabel "metrics_kubelet" { + targets = discovery.kubernetes.nodes.targets + + rule { + action = "replace" + target_label = "__address__" + replacement = "kubernetes.default.svc.cluster.local:443" + } + + rule { + source_labels = ["__meta_kubernetes_node_name"] + regex = "(.+)" + action = "replace" + replacement = "/api/v1/nodes/${1}/proxy/metrics" + target_label = "__metrics_path__" + } + } + + ksm.scrape "metrics" { + targets = ksm.kubernetes.targets.output + forward_to = 
[prometheus.relabel.kube_state_metrics.receiver] + } + + prometheus.scrape "pods" { + scheme = "https" + + tls_config { + server_name = "kubernetes" + ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + insecure_skip_verify = false + } + bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" + targets = discovery.kubernetes.pods.targets + scrape_interval = "60s" + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + job_name = "pods" + } + + prometheus.scrape "cadvisor" { + scheme = "https" + + tls_config { + server_name = "kubernetes" + ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + insecure_skip_verify = false + } + bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" + targets = discovery.relabel.metrics_cadvisor.output + scrape_interval = "60s" + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + job_name = "cadvisor" + } + + prometheus.scrape "kubelet" { + scheme = "https" + + tls_config { + server_name = "kubernetes" + ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + insecure_skip_verify = false + } + bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" + targets = discovery.relabel.metrics_kubelet.output + scrape_interval = "60s" + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + job_name = "kubernetes-kubelet" + } + + prometheus.scrape "node_exporter" { + scheme = "https" + + tls_config { + server_name = "kubernetes" + ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + insecure_skip_verify = false + } + bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" + targets = prometheus.exporter.unix.node_exporter.targets + scrape_interval = "60s" + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + job_name = "node-exporter" + } + + prometheus.scrape "alloy" { + scheme = "https" + + tls_config { + server_name = "kubernetes" + ca_file = 
"/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + insecure_skip_verify = false + } + bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" + targets = prometheus.exporter.self.alloy.targets + scrape_interval = "60s" + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + job_name = "alloy" + } + + prometheus.scrape "vm_operator" { + targets = [ + {__address__ = "vm-victoria-metrics-operator.observability.svc.cluster.local:8080"}, + ] + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + job_name = "vm-operator" + } + + prometheus.scrape "vm_single" { + targets = [ + {__address__ = "vmsingle-vm-victoria-metrics-k8s-stack.observability.svc.cluster.local:8429"}, + ] + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + job_name = "vm-single" + } + + prometheus.scrape "vm_alertmanager" { + targets = [ + {__address__ = "vmalertmanager-vm-victoria-metrics-k8s-stack.observability.svc.cluster.local:9093"}, + ] + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + job_name = "vm-alertmanager" + } + + prometheus.scrape "vm_alert" { + targets = [ + {__address__ = "vmalert-vm-victoria-metrics-k8s-stack.observability.svc.cluster.local:8080"}, + ] + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + job_name = "vm-alert" + } + + prometheus.relabel "kube_state_metrics" { + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + rule { + replacement = "kube-state-metrics" + target_label = "job" + } + } + + prometheus.relabel "drop_unwanted_metrics" { + forward_to = [prometheus.relabel.add_cluster_label.receiver] + + rule { + source_labels = ["__name__"] + regex = 
"^(ALERTS|up|_container_status_restarts_total|___customer_redis_blocked_clients|___customer_redis_commands_duration_seconds_total|___customer_redis_commands_total|___customer_redis_connected_clients|___customer_redis_db_keys|___customer_redis_memory_max_bytes|___customer_redis_memory_used_bytes|___customer_redis_net_input_bytes_total|___customer_redis_net_output_bytes_total|___customer_redis_uptime_in_seconds|alertmanager_alerts|alertmanager_alerts_invalid_total|alertmanager_alerts_received_total|alertmanager_notification_latency_seconds_bucket|alertmanager_notification_latency_seconds_count|alertmanager_notification_latency_seconds_sum|alertmanager_notifications_failed_total|alertmanager_notifications_total|alertname|changes|chip_name|cluster|container|container!|container_cpu_cfs_throttled_seconds_total|container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_receive_errors_total|container_network_receive_packets_dropped_total|container_network_receive_packets_total|container_network_transmit_bytes_total|container_network_transmit_errors_total|container_network_transmit_packets_dropped_total|container_network_transmit_packets_total|container_oom_events_total|controller|controller_runtime_active_workers|controller_runtime_max_concurrent_reconciles|controller_runtime_reconcile_errors_total|controller_runtime_reconcile_time_seconds_bucket|controller_runtime_reconcile_total|created_by_name|device|event_type|go_gc_cpu_seconds_total|go_gc_duration_seconds_count|go_gc_duration_seconds_sum|go_goroutines|go_memstats_heap_inuse_bytes|go_memstats_stack_inuse_bytes|go_memstats_sys_bytes|go_sched_latencies_seconds_bucket|grafana_alerting_result_total|grafana_build_info|grafana_http_request_duration_seconds_bucket|grafana_http_request_duration_seconds_count|grafana_http_request_duration_seconds_sum|grafana_stat_totals_dashboard|group_left|histogram_quantile|ignoring|image!|increase|instance|integration|interval
|iowait|kube_configmap_info|kube_daemonset_labels|kube_deployment_labels|kube_endpoint_info|kube_hpa_labels|kube_ingress_info|kube_namespace_labels|kube_networkpolicy_labels|kube_node_info|kube_persistentvolumeclaim_info|kube_pod_container_info|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_container_status_last_terminated_exitcode|kube_pod_container_status_last_terminated_reason|kube_pod_container_status_ready|kube_pod_container_status_restarts_total|kube_pod_container_status_running|kube_pod_container_status_terminated|kube_pod_container_status_waiting|kube_pod_info|kube_pod_status_phase|kube_pod_status_qos_class|kube_pod_status_reason|kube_secret_info|kube_service_info|kube_statefulset_labels|kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_inodes|kubelet_volume_stats_inodes_used|kubelet_volume_stats_used_bytes|label_values|leader_election_master_status|location|machine_cpu_cores|machine_memory_bytes|memory|method|namespace|node_arp_entries|node_boot_time_seconds|node_context_switches_total|node_cooling_device_cur_state|node_cooling_device_max_state|node_cpu_core_throttles_total|node_cpu_guest_seconds_total|node_cpu_scaling_frequency_hertz|node_cpu_scaling_frequency_max_hertz|node_cpu_scaling_frequency_min_hertz|node_cpu_seconds_total|node_disk_discard_time_seconds_total|node_disk_discards_completed_total|node_disk_discards_merged_total|node_disk_io_now|node_disk_io_time_seconds_total|node_disk_io_time_weighted_seconds_total|node_disk_read_bytes_total|node_disk_read_time_seconds_total|node_disk_reads_completed_total|node_disk_reads_merged_total|node_disk_write_time_seconds_total|node_disk_writes_completed_total|node_disk_writes_merged_total|node_disk_written_bytes_total|node_entropy_available_bits|node_filefd_allocated|node_filefd_maximum|node_filesystem_avail_bytes|node_filesystem_device_error|node_filesystem_files|node_filesystem_files_free|node_filesystem_free_bytes|node_filesystem_readonly|node_filesystem_size_bytes|n
ode_forks_total|node_hwmon_chip_names|node_hwmon_temp_celsius|node_hwmon_temp_crit_alarm_celsius|node_hwmon_temp_crit_celsius|node_hwmon_temp_crit_hyst_celsius|node_hwmon_temp_max_celsius|node_interrupts_total|node_intr_total|node_load1|node_load15|node_load5|node_memory_Active_anon_bytes|node_memory_Active_bytes|node_memory_Active_file_bytes|node_memory_AnonHugePages_bytes|node_memory_AnonPages_bytes|node_memory_Bounce_bytes|node_memory_Buffers_bytes|node_memory_Cached_bytes|node_memory_CommitLimit_bytes|node_memory_Committed_AS_bytes|node_memory_DirectMap1G_bytes|node_memory_DirectMap2M_bytes|node_memory_DirectMap4k_bytes|node_memory_Dirty_bytes|node_memory_HardwareCorrupted_bytes|node_memory_HugePages_Free|node_memory_HugePages_Rsvd|node_memory_HugePages_Surp|node_memory_HugePages_Total|node_memory_Hugepagesize_bytes|node_memory_Inactive_anon_bytes|node_memory_Inactive_bytes|node_memory_Inactive_file_bytes|node_memory_KernelStack_bytes|node_memory_Mapped_bytes|node_memory_MemAvailable_bytes|node_memory_MemFree_bytes|node_memory_MemTotal_bytes|node_memory_Mlocked_bytes|node_memory_NFS_Unstable_bytes|node_memory_PageTables_bytes|node_memory_Percpu_bytes|node_memory_SReclaimable_bytes|node_memory_SUnreclaim_bytes|node_memory_ShmemHugePages_bytes|node_memory_ShmemPmdMapped_bytes|node_memory_Shmem_bytes|node_memory_Slab_bytes|node_memory_SwapCached_bytes|node_memory_SwapFree_bytes|node_memory_SwapTotal_bytes|node_memory_Unevictable_bytes|node_memory_VmallocChunk_bytes|node_memory_VmallocTotal_bytes|node_memory_VmallocUsed_bytes|node_memory_WritebackTmp_bytes|node_memory_Writeback_bytes|node_netstat_Icmp_InErrors|node_netstat_Icmp_InMsgs|node_netstat_Icmp_OutMsgs|node_netstat_IpExt_InOctets|node_netstat_IpExt_OutOctets|node_netstat_Ip_Forwarding|node_netstat_TcpExt_ListenDrops|node_netstat_TcpExt_ListenOverflows|node_netstat_TcpExt_SyncookiesFailed|node_netstat_TcpExt_SyncookiesRecv|node_netstat_TcpExt_SyncookiesSent|node_netstat_TcpExt_TCPOFOQueue|node_netstat_TcpExt_
TCPRcvQDrop|node_netstat_TcpExt_TCPSynRetrans|node_netstat_Tcp_ActiveOpens|node_netstat_Tcp_CurrEstab|node_netstat_Tcp_InErrs|node_netstat_Tcp_InSegs|node_netstat_Tcp_MaxConn|node_netstat_Tcp_OutRsts|node_netstat_Tcp_OutSegs|node_netstat_Tcp_PassiveOpens|node_netstat_Tcp_RetransSegs|node_netstat_UdpLite_InErrors|node_netstat_Udp_InDatagrams|node_netstat_Udp_InErrors|node_netstat_Udp_NoPorts|node_netstat_Udp_OutDatagrams|node_netstat_Udp_RcvbufErrors|node_netstat_Udp_SndbufErrors|node_network_carrier|node_network_mtu_bytes|node_network_receive_bytes_total|node_network_receive_compressed_total|node_network_receive_drop_total|node_network_receive_errs_total|node_network_receive_fifo_total|node_network_receive_frame_total|node_network_receive_multicast_total|node_network_receive_packets_total|node_network_speed_bytes|node_network_transmit_bytes_total|node_network_transmit_carrier_total|node_network_transmit_colls_total|node_network_transmit_compressed_total|node_network_transmit_drop_total|node_network_transmit_errs_total|node_network_transmit_fifo_total|node_network_transmit_packets_total|node_network_transmit_queue_length|node_network_up|node_nf_conntrack_entries|node_nf_conntrack_entries_limit|node_power_supply_online|node_pressure_cpu_waiting_seconds_total|node_pressure_io_stalled_seconds_total|node_pressure_io_waiting_seconds_total|node_pressure_memory_stalled_seconds_total|node_pressure_memory_waiting_seconds_total|node_processes_max_processes|node_processes_max_threads|node_processes_pids|node_processes_state|node_processes_threads|node_procs_blocked|node_procs_running|node_schedstat_running_seconds_total|node_schedstat_timeslices_total|node_schedstat_waiting_seconds_total|node_scrape_collector_duration_seconds|node_scrape_collector_success|node_sockstat_FRAG_inuse|node_sockstat_FRAG_memory|node_sockstat_RAW_inuse|node_sockstat_TCP_alloc|node_sockstat_TCP_inuse|node_sockstat_TCP_mem|node_sockstat_TCP_mem_bytes|node_sockstat_TCP_orphan|node_sockstat_TCP_tw|node_so
ckstat_UDPLITE_inuse|node_sockstat_UDP_inuse|node_sockstat_UDP_mem|node_sockstat_UDP_mem_bytes|node_sockstat_sockets_used|node_softnet_dropped_total|node_softnet_processed_total|node_softnet_times_squeezed_total|node_systemd_socket_accepted_connections_total|node_systemd_units|node_tcp_connection_states|node_textfile_scrape_error|node_time_seconds|node_timex_estimated_error_seconds|node_timex_frequency_adjustment_ratio|node_timex_loop_time_constant|node_timex_maxerror_seconds|node_timex_offset_seconds|node_timex_sync_status|node_timex_tai_offset_seconds|node_timex_tick_seconds|node_uname_info|node_vmstat_oom_kill|node_vmstat_pgfault|node_vmstat_pgmajfault|node_vmstat_pgpgin|node_vmstat_pgpgout|node_vmstat_pswpin|node_vmstat_pswpout|object_type_name|offset|operator_controller_objects_count|operator_log_messages_total|operator_prometheus_converter_active_watchers|operator_prometheus_converter_watch_events_total|operator_reconcile_throttled_events_total|persistentvolumeclaim|process_cpu_cores_available|process_cpu_seconds_total|process_io_read_syscalls_total|process_io_storage_read_bytes_total|process_io_storage_written_bytes_total|process_io_write_syscalls_total|process_max_fds|process_num_threads|process_open_fds|process_resident_memory_anon_bytes|process_resident_memory_bytes|process_resident_memory_max_bytes|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus|prometheus_build_info|prometheus_tsdb_head_series|protocol|qos_class|reason|recording|resource|rest_client_request_duration_seconds_bucket|rest_client_requests_total|scalar|scheduler_binding_duration_seconds_bucket|scheduler_binding_duration_seconds_count|scheduler_e2e_scheduling_duration_seconds_bucket|scheduler_e2e_scheduling_duration_seconds_count|scheduler_scheduling_algorithm_duration_seconds_bucket|scheduler_scheduling_algorithm_duration_seconds_count|scheduler_volume_scheduling_duration_seconds_bucket|scheduler_volume_scheduling_duration_seconds_count|scrape_job|scrape_series_added|
service|short_version|softirq|status_code|system|topk_max|unless|vector|version|vm_active_merges|vm_app_start_timestamp|vm_app_uptime_seconds|vm_app_version|vm_assisted_merges_total|vm_available_cpu_cores|vm_available_memory_bytes|vm_cache_entries|vm_cache_misses_total|vm_cache_requests_total|vm_cache_size_bytes|vm_cache_size_max_bytes|vm_concurrent_insert_capacity|vm_concurrent_insert_current|vm_data_size_bytes|vm_free_disk_space_bytes|vm_free_disk_space_limit_bytes|vm_http_request_errors_total|vm_http_requests_total|vm_ingestserver_request_errors_total|vm_ingestserver_requests_total|vm_log_messages_total|vm_new_timeseries_created_total|vm_parts|vm_pending_rows|vm_persistentqueue_bytes_dropped_total|vm_persistentqueue_read_duration_seconds_total|vm_persistentqueue_write_duration_seconds_total|vm_promscrape_conn_bytes_read_total|vm_promscrape_dial_errors_total|vm_promscrape_max_scrape_size_exceeded_errors_total|vm_promscrape_scrape_duration_seconds_bucket|vm_promscrape_scrape_pool_targets|vm_promscrape_scrape_response_size_bytes_bucket|vm_promscrape_scraped_samples_sum|vm_promscrape_scrapes_failed_total|vm_promscrape_scrapes_gunzip_failed_total|vm_promscrape_scrapes_timed_out_total|vm_promscrape_scrapes_total|vm_promscrape_targets|vm_protoparser_read_errors_total|vm_protoparser_rows_read_total|vm_protoparser_unmarshal_errors_total|vm_request_duration_seconds|vm_rows|vm_rows_added_to_storage_total|vm_rows_ignored_total|vm_rows_inserted_total|vm_rows_invalid_total|vm_rows_merged_total|vm_rows_read_per_query_bucket|vm_rows_read_per_series_bucket|vm_rows_scanned_per_query_bucket|vm_series_read_per_query_bucket|vm_slow_queries_total|vm_slow_row_inserts_total|vm_streamaggr_dedup_flush_duration_seconds_bucket|vm_streamaggr_dedup_flush_timeouts_total|vm_streamaggr_flush_timeouts_total|vm_streamaggr_ignored_samples_total|vm_streamaggr_labels_compressor_items_count|vm_streamaggr_labels_compressor_size_bytes|vm_streamaggr_matched_samples_total|vm_streamaggr_output_samples_tota
l|vm_streamaggr_samples_lag_seconds_bucket|vm_tcplistener_accepts_total|vm_tcplistener_conns|vm_tcplistener_read_bytes_total|vm_tcplistener_written_bytes_total|vmagent_daily_series_limit_current_series|vmagent_daily_series_limit_max_series|vmagent_hourly_series_limit_current_series|vmagent_hourly_series_limit_max_series|vmagent_http_request_errors_total|vmagent_http_requests_total|vmagent_remotewrite_block_size_rows_sum|vmagent_remotewrite_conn_bytes_written_total|vmagent_remotewrite_conns|vmagent_remotewrite_packets_dropped_total|vmagent_remotewrite_pending_data_bytes|vmagent_remotewrite_queues|vmagent_remotewrite_relabel_metrics_dropped_total|vmagent_remotewrite_requests_total|vmagent_remotewrite_retries_count_total|vmagent_remotewrite_send_duration_seconds_total|vmagent_rows_inserted_total|vmalert_alerting_rules_errors_total|vmalert_alerting_rules_last_evaluation_samples|vmalert_alerts_fired_total|vmalert_alerts_firing|vmalert_alerts_pending|vmalert_alerts_send_errors_total|vmalert_alerts_sent_total|vmalert_config_last_reload_successful|vmalert_execution_errors_total|vmalert_execution_total|vmalert_iteration_duration_seconds_count|vmalert_iteration_duration_seconds_sum|vmalert_iteration_missed_total|vmalert_iteration_total|vmalert_recording_rules_errors_total|vmalert_recording_rules_last_evaluation_samples|vmalert_remotewrite_conn_bytes_written_total|vmalert_remotewrite_conns|vmalert_remotewrite_dropped_rows_total|vmalert_remotewrite_sent_rows_total|vmrange|without|workqueue_depth|mmcblk)$" + action = "keep" + } + + } + + prometheus.relabel "add_cluster_label" { + forward_to = [prometheus.relabel.remove_unwanted_labels.receiver] + + rule { + target_label = "cluster" + replacement = "{{ regexFind "h?c-[A-Za-z0-9]+" .name }}" + } + } + + prometheus.relabel "remove_unwanted_labels" { + forward_to = [prometheus.remote_write.in_cluster.receiver, prometheus.remote_write.ctrl_plane.receiver] + + rule { + action = "labeldrop" + regex = 
"^(cloud_google_com|omnistrate_com).*" + } + } + clustering: + enabled: true + name: alloy + resources: + requests: + cpu: 500m + memory: 2Gi + limits: + cpu: "2" + memory: 6Gi + mounts: + extra: + - name: vmuser + mountPath: /var/run/secrets/victoriametrics/vmuser + controller: + type: statefulset + nodeSelector: + node_pool: observability + volumes: + extra: + - name: vmuser + secret: + secretName: vmuser + destination: + server: "{{.server}}" + namespace: observability + syncPolicy: + syncOptions: + - CreateNamespace=true + - ServerSideApply=true + automated: + prune: true + selfHeal: true diff --git a/argocd/app_plane/dev/applicationset.yaml b/argocd/app_plane/dev/applicationset.yaml new file mode 100644 index 00000000..c9acac59 --- /dev/null +++ b/argocd/app_plane/dev/applicationset.yaml @@ -0,0 +1,33 @@ +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + name: observability-stack + namespace: argocd +spec: + goTemplate: true + goTemplateOptions: ["missingkey=error"] + generators: + - clusters: + selector: + matchLabels: + role: app-plane + template: + metadata: + name: '{{ regexFind "h?c-[A-Za-z0-9]+" .name }}-observability-stack' # prefix is the "c-<id>" / "hc-<id>" token matched in the cluster name + spec: + project: default + source: + repoURL: https://github.com/FalkorDB/falkordb-dbaas.git + targetRevision: dev + path: argocd/app_plane/dev/manifests # plain manifests, picked up recursively (directory.recurse below) + directory: + recurse: true + destination: + server: "{{.server}}" + namespace: observability + syncPolicy: + syncOptions: + - CreateNamespace=true + automated: + prune: true + selfHeal: true diff --git a/argocd/app_plane/dev/grafana-dashboards.yaml b/argocd/app_plane/dev/grafana-dashboards.yaml new file mode 100644 index 00000000..d1d3f5eb --- /dev/null +++ b/argocd/app_plane/dev/grafana-dashboards.yaml @@ -0,0 +1,44 @@ +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + name: grafana-dashboards + namespace: argocd +spec: + goTemplate: true + goTemplateOptions: ["missingkey=error"] + generators: + - clusters: + selector: + matchLabels: + role: 
app-plane + template: + metadata: + name: '{{ regexFind "h?c-[A-Za-z0-9]+" .name }}-grafana-dashb' + spec: + project: default + source: + path: observability/grafana + repoURL: https://github.com/FalkorDB/falkordb-dbaas.git + targetRevision: dev + kustomize: + patches: + # Remove the vmauth dashboard: the "$patch: delete" directive below drops it from the kustomize output + - target: + kind: GrafanaDashboard + name: vmauth + patch: |- + $patch: delete + apiVersion: grafana.integreatly.org/v1beta1 + kind: GrafanaDashboard + metadata: + name: vmauth + destination: + server: "{{.server}}" + namespace: observability + syncPolicy: + syncOptions: + - CreateNamespace=true + - ServerSideApply=true + automated: + prune: true + selfHeal: true diff --git a/argocd/app_plane/dev/grafana-operator.yaml b/argocd/app_plane/dev/grafana-operator.yaml new file mode 100644 index 00000000..9d3c0796 --- /dev/null +++ b/argocd/app_plane/dev/grafana-operator.yaml @@ -0,0 +1,37 @@ +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + name: grafana-operator + namespace: argocd +spec: + goTemplate: true + goTemplateOptions: ["missingkey=error"] + generators: + - clusters: + selector: + matchLabels: + role: app-plane + template: + metadata: + name: '{{ regexFind "h?c-[A-Za-z0-9]+" .name }}-grafana-op' + spec: + project: default + source: + chart: grafana-operator + repoURL: ghcr.io/grafana/helm-charts + targetRevision: v5.16.0 + helm: + valuesObject: + fullnameOverride: grafana-operator + nodeSelector: + node_pool: observability + destination: + server: "{{.server}}" + namespace: observability + syncPolicy: + syncOptions: + - CreateNamespace=true + - ServerSideApply=true + automated: + prune: true + selfHeal: true diff --git a/argocd/app_plane/dev/ksm.yaml b/argocd/app_plane/dev/ksm.yaml new file mode 100644 index 00000000..0c1d035b --- /dev/null +++ b/argocd/app_plane/dev/ksm.yaml @@ -0,0 +1,36 @@ +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + name: kube-state-metrics + namespace: argocd +spec: + goTemplate: true + 
goTemplateOptions: ["missingkey=error"] + generators: + - clusters: + selector: + matchLabels: + role: app-plane + template: + metadata: + name: '{{ regexFind "h?c-[A-Za-z0-9]+" .name }}-ksm' + spec: + project: default + source: + chart: kube-state-metrics + repoURL: https://prometheus-community.github.io/helm-charts + targetRevision: 5.30.0 + helm: + valuesObject: + fullnameOverride: ksm + nodeSelector: + node_pool: observability + destination: + server: "{{.server}}" + namespace: observability + syncPolicy: + syncOptions: + - CreateNamespace=true + automated: + prune: true + selfHeal: true diff --git a/argocd/app_plane/dev/manifests/grafana-datasource.yaml b/argocd/app_plane/dev/manifests/grafana-datasource.yaml new file mode 100644 index 00000000..762b670b --- /dev/null +++ b/argocd/app_plane/dev/manifests/grafana-datasource.yaml @@ -0,0 +1,20 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDatasource +metadata: + name: victoriametrics + namespace: observability +spec: + instanceSelector: + matchLabels: + dashboards: grafana # selects Grafana instances labelled dashboards=grafana (the label set on the Grafana CR in manifests/grafana.yaml) + allowCrossNamespaceImport: false + datasource: + isDefault: true + access: proxy + database: prometheus + jsonData: + timeInterval: 5s + tlsSkipVerify: true + name: VictoriaMetrics + type: prometheus + url: http://vmsingle-vm-victoria-metrics-k8s-stack.observability.svc.cluster.local:8429 diff --git a/argocd/app_plane/dev/manifests/grafana.yaml b/argocd/app_plane/dev/manifests/grafana.yaml new file mode 100644 index 00000000..bd9c301e --- /dev/null +++ b/argocd/app_plane/dev/manifests/grafana.yaml @@ -0,0 +1,58 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: Grafana +metadata: + name: grafana + labels: + dashboards: "grafana" +spec: + persistentVolumeClaim: + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi + deployment: + spec: + replicas: 1 + template: + spec: + nodeSelector: + node_pool: observability + securityContext: # Pod-level security context 
(important for initContainer) + runAsUser: 0 # Pod default is root so the init container can chown; the grafana container overrides this below + runAsGroup: 0 + initContainers: + - name: init-grafana-data + image: busybox:1.36 # Pinned tag for reproducible pulls; any image that provides chown works + command: ["chown", "-R", "1001:1001", "/var/lib/grafana"] + volumeMounts: + - name: grafana-data + mountPath: /var/lib/grafana + containers: + - name: grafana + readinessProbe: + failureThreshold: 3 + resources: + requests: + cpu: "500m" + memory: "256Mi" + limits: + cpu: "2" + memory: "2Gi" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + runAsUser: 1001 + runAsGroup: 1001 + runAsNonRoot: true + volumes: + - name: grafana-data + persistentVolumeClaim: + claimName: grafana-pvc + config: + log: + mode: "console" + auth: + disable_login_form: "false" diff --git a/argocd/app_plane/dev/manifests/podmonitor.yaml b/argocd/app_plane/dev/manifests/podmonitor.yaml new file mode 100644 index 00000000..551db90e --- /dev/null +++ b/argocd/app_plane/dev/manifests/podmonitor.yaml @@ -0,0 +1,16 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: instance-monitor + namespace: observability +spec: + podMetricsEndpoints: + - bearerTokenSecret: + key: "" + name: "" + port: metrics + selector: + matchLabels: + app.kubernetes.io/managed-by: omnistrate + namespaceSelector: + any: true diff --git a/argocd/app_plane/dev/victoriametrics.yaml b/argocd/app_plane/dev/victoriametrics.yaml new file mode 100644 index 00000000..58fa438d --- /dev/null +++ b/argocd/app_plane/dev/victoriametrics.yaml @@ -0,0 +1,134 @@ +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + name: victoriametrics + namespace: argocd +spec: + goTemplate: true + goTemplateOptions: ["missingkey=error"] + generators: + - clusters: + selector: + matchLabels: + role: app-plane + template: + metadata: + name: '{{ regexFind "h?c-[A-Za-z0-9]+" .name }}-victoriametrics' + spec: + project: default + source: + chart: victoria-metrics-k8s-stack + repoURL: 
https://victoriametrics.github.io/helm-charts + targetRevision: 0.35.1 + helm: + releaseName: vm + valuesObject: + defaultDashboards: + enabled: true + annotations: + argocd.argoproj.io/sync-options: ServerSideApply=true + dashboards: + victoriametrics-vmagent: + enabled: false + grafanaOperator: + enabled: true + spec: + allowCrossNamespaceImport: false + instanceSelector: + matchLabels: + dashboards: grafana + defaultRules: + create: false + victoria-metrics-operator: + operator: + disable_prometheus_converter: true + nodeSelector: + node_pool: observability + vmsingle: + spec: + retentionPeriod: 3d + logFormat: json + nodeSelector: + node_pool: observability + storage: + resources: + requests: + storage: 50Gi + alertmanager: + spec: + nodeSelector: + node_pool: observability + secrets: + - pagerduty-service-key + configSecret: null + config: + global: + resolve_timeout: 5m + route: + group_by: ["alertname", "namespace", "pod"] + group_wait: 30s + group_interval: 5m + repeat_interval: 12h + receiver: "pagerduty" + routes: + - match: + alertname: Watchdog + receiver: "blackhole" # receiver with no integrations, so Watchdog alerts are discarded + - receiver: "pagerduty" + receivers: + - name: blackhole + - name: "pagerduty" + pagerduty_configs: + - service_key_file: "/etc/vm/secrets/pagerduty-service-key/api-key" + url: https://events.pagerduty.com/generic/2010-04-15/create_event.json + send_resolved: true + vmalert: + spec: + nodeSelector: + node_pool: observability + vmagent: # NOTE(review): presumably off because Grafana Alloy does the scraping - confirm + enabled: false + grafana: # Grafana is provided by grafana-operator and the Grafana CR in manifests/grafana.yaml + enabled: false + prometheus-node-exporter: + enabled: false + kube-state-metrics: # installed by the separate kube-state-metrics ApplicationSet (ksm.yaml) + enabled: false + kubeScheduler: + enabled: false + kubelet: + enabled: false + kubeApiServer: + enabled: false + kubeControllerManager: + enabled: false + kubeDns: + enabled: false + coreDns: + enabled: false + kubeEtcd: + enabled: false + kubeProxy: + enabled: false + destination: + server: "{{.server}}" + namespace: observability + syncPolicy: + syncOptions: + - CreateNamespace=true + - 
RespectIgnoreDifferences=true + automated: + prune: true + selfHeal: true + ignoreDifferences: # fields excluded from diffing; RespectIgnoreDifferences=true above makes sync honour them too + - group: "" + kind: Secret + name: victoria-metrics-k8s-stack-vm-validation + namespace: observability + jsonPointers: + - /data + - group: admissionregistration.k8s.io + kind: ValidatingWebhookConfiguration + name: victoria-metrics-k8s-stack-vm-admission + jqPathExpressions: + - ".webhooks[]?.clientConfig.caBundle" diff --git a/argocd/app_plane/dev/vmrules.yaml b/argocd/app_plane/dev/vmrules.yaml new file mode 100644 index 00000000..bd73a6d5 --- /dev/null +++ b/argocd/app_plane/dev/vmrules.yaml @@ -0,0 +1,33 @@ +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + name: vmrules + namespace: argocd +spec: + goTemplate: true + goTemplateOptions: ["missingkey=error"] + generators: + - clusters: + selector: + matchLabels: + role: app-plane + template: + metadata: + name: '{{ regexFind "h?c-[A-Za-z0-9]+" .name }}-vmrules' + spec: + project: default + source: + repoURL: https://github.com/FalkorDB/falkordb-dbaas.git + targetRevision: dev + path: observability/rules + directory: + recurse: true + destination: + server: "{{.server}}" + namespace: observability + syncPolicy: + syncOptions: + - CreateNamespace=true + automated: + prune: true + selfHeal: true diff --git a/argocd/ctrl_plane/dev/alloy.yaml b/argocd/ctrl_plane/dev/alloy.yaml new file mode 100644 index 00000000..9fb4ff85 --- /dev/null +++ b/argocd/ctrl_plane/dev/alloy.yaml @@ -0,0 +1,302 @@ +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + name: grafana-alloy-ctrl-plane + namespace: argocd +spec: + goTemplate: true + goTemplateOptions: ["missingkey=error"] + generators: + - clusters: + selector: + matchLabels: + role: ctrl-plane + template: + metadata: + name: 'alloy' + spec: + project: default + source: + chart: alloy + repoURL: https://grafana.github.io/helm-charts + targetRevision: 0.11.0 + helm: + valuesObject: + fullnameOverride: alloy + alloy: + 
configMap: + content: |- + prometheus.remote_write "in_cluster" { + endpoint { + url = "http://vmsingle-vm.observability.svc.cluster.local:8429/prometheus/api/v1/write" + } + } + + logging { + level = "info" + format = "logfmt" + } + + livedebugging { + enabled = true + } + + import.git "ksm" { + repository = "https://github.com/grafana/alloy-modules.git" + revision = "main" + path = "modules/kubernetes/kube-state-metrics/metrics.alloy" + pull_frequency = "15m" + } + + prometheus.operator.podmonitors "pods" { + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + namespaces = ["observability"] + } + + discovery.kubernetes "pods" { + role = "pod" + } + + discovery.kubernetes "nodes" { + role = "node" + } + + ksm.kubernetes "targets" { } + + prometheus.exporter.unix "node_exporter" { } + + prometheus.exporter.self "alloy" { } + + discovery.relabel "metrics_cadvisor" { + targets = discovery.kubernetes.nodes.targets + + rule { + action = "labelmap" + regex = "__meta_kubernetes_node_label_(.+)" + } + + rule { + action = "replace" + target_label = "__address__" + replacement = "kubernetes.default.svc.cluster.local:443" + } + + rule { + source_labels = ["__meta_kubernetes_node_name"] + regex = "(.+)" + action = "replace" + replacement = "/api/v1/nodes/${1}/proxy/metrics/cadvisor" + target_label = "__metrics_path__" + } + } + + discovery.relabel "metrics_kubelet" { + targets = discovery.kubernetes.nodes.targets + + rule { + action = "replace" + target_label = "__address__" + replacement = "kubernetes.default.svc.cluster.local:443" + } + + rule { + source_labels = ["__meta_kubernetes_node_name"] + regex = "(.+)" + action = "replace" + replacement = "/api/v1/nodes/${1}/proxy/metrics" + target_label = "__metrics_path__" + } + } + + ksm.scrape "metrics" { + targets = ksm.kubernetes.targets.output + forward_to = [prometheus.relabel.kube_state_metrics.receiver] + } + + prometheus.scrape "pods" { + scheme = "https" + + tls_config { + server_name = "kubernetes" + 
ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + insecure_skip_verify = false + } + bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" + targets = discovery.kubernetes.pods.targets + scrape_interval = "60s" + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + job_name = "pods" + } + + prometheus.scrape "cadvisor" { + scheme = "https" + + tls_config { + server_name = "kubernetes" + ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + insecure_skip_verify = false + } + bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" + targets = discovery.relabel.metrics_cadvisor.output + scrape_interval = "60s" + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + job_name = "cadvisor" + } + + prometheus.scrape "kubelet" { + scheme = "https" + + tls_config { + server_name = "kubernetes" + ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + insecure_skip_verify = false + } + bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" + targets = discovery.relabel.metrics_kubelet.output + scrape_interval = "60s" + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + job_name = "kubernetes-kubelet" + } + + prometheus.scrape "node_exporter" { + scheme = "https" + + tls_config { + server_name = "kubernetes" + ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + insecure_skip_verify = false + } + bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" + targets = prometheus.exporter.unix.node_exporter.targets + scrape_interval = "60s" + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + job_name = "node-exporter" + } + + prometheus.scrape "alloy" { + scheme = "https" + + tls_config { + server_name = "kubernetes" + ca_file = "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + insecure_skip_verify = false + } + bearer_token_file = 
"/var/run/secrets/kubernetes.io/serviceaccount/token" + targets = prometheus.exporter.self.alloy.targets + scrape_interval = "60s" + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + job_name = "alloy" + } + + prometheus.scrape "vm_operator" { + targets = [ + {__address__ = "vm-victoria-metrics-operator.observability.svc.cluster.local:8080"}, + ] + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + job_name = "vm-operator" + } + + prometheus.scrape "vm_single" { + targets = [ + {__address__ = "vmsingle-vm.observability.svc.cluster.local:8429"}, + ] + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + job_name = "vm-single" + } + + prometheus.scrape "vm_alertmanager" { + targets = [ + {__address__ = "vmalertmanager-vm.observability.svc.cluster.local:9093"}, + ] + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + job_name = "vm-alertmanager" + } + + prometheus.scrape "vm_alert" { + targets = [ + {__address__ = "vmalert-vm.observability.svc.cluster.local:8080"}, + ] + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + job_name = "vm-alert" + } + + prometheus.scrape "vm_auth" { + targets = [ + {__address__ = "vmauth-vm.observability.svc.cluster.local:8427"}, + ] + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + job_name = "vm-auth" + } + + prometheus.relabel "kube_state_metrics" { + forward_to = [prometheus.relabel.drop_unwanted_metrics.receiver] + + rule { + replacement = "kube-state-metrics" + target_label = "job" + } + } + + prometheus.relabel "drop_unwanted_metrics" { + forward_to = [prometheus.relabel.add_cluster_label.receiver] + + rule { + source_labels = ["__name__"] + regex = 
"^(ALERTS|up|_container_status_restarts_total|alertmanager_alerts_received_total|alertmanager_notification_latency_seconds_bucket|alertmanager_notification_latency_seconds_count|alertmanager_notification_latency_seconds_sum|alertmanager_notifications_failed_total|alertmanager_notifications_total|alertname|changes|chip_name|cluster|container|container!|container_cpu_cfs_throttled_seconds_total|container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_receive_errors_total|container_network_receive_packets_dropped_total|container_network_receive_packets_total|container_network_transmit_bytes_total|container_network_transmit_errors_total|container_network_transmit_packets_dropped_total|container_network_transmit_packets_total|container_oom_events_total|controller|controller_runtime_active_workers|controller_runtime_max_concurrent_reconciles|controller_runtime_reconcile_errors_total|controller_runtime_reconcile_time_seconds_bucket|controller_runtime_reconcile_total|created_by_name|device|event_type|go_gc_cpu_seconds_total|go_gc_duration_seconds_count|go_gc_duration_seconds_sum|go_goroutines|go_memstats_heap_inuse_bytes|go_memstats_stack_inuse_bytes|go_memstats_sys_bytes|go_sched_latencies_seconds_bucket|grafana_alerting_result_total|grafana_build_info|grafana_http_request_duration_seconds_bucket|grafana_http_request_duration_seconds_count|grafana_http_request_duration_seconds_sum|grafana_stat_totals_dashboard|group_left|histogram_quantile|ignoring|image!|increase|instance|integration|interval|iowait|kube_configmap_info|kube_daemonset_labels|kube_deployment_labels|kube_endpoint_info|kube_hpa_labels|kube_ingress_info|kube_namespace_labels|kube_networkpolicy_labels|kube_node_info|kube_persistentvolumeclaim_info|kube_pod_container_info|kube_pod_container_resource_limits|kube_pod_container_resource_requests|kube_pod_container_status_last_terminated_exitcode|kube_pod_container_status_last_terminated_reason|kube
_pod_container_status_ready|kube_pod_container_status_restarts_total|kube_pod_container_status_running|kube_pod_container_status_terminated|kube_pod_container_status_waiting|kube_pod_info|kube_pod_status_phase|kube_pod_status_qos_class|kube_pod_status_reason|kube_secret_info|kube_service_info|kube_statefulset_labels|kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_inodes|kubelet_volume_stats_inodes_used|kubelet_volume_stats_used_bytes|label_values|leader_election_master_status|location|machine_cpu_cores|machine_memory_bytes|memory|method|namespace|node_arp_entries|node_boot_time_seconds|node_context_switches_total|node_cooling_device_cur_state|node_cooling_device_max_state|node_cpu_core_throttles_total|node_cpu_guest_seconds_total|node_cpu_scaling_frequency_hertz|node_cpu_scaling_frequency_max_hertz|node_cpu_scaling_frequency_min_hertz|node_cpu_seconds_total|node_disk_discard_time_seconds_total|node_disk_discards_completed_total|node_disk_discards_merged_total|node_disk_io_now|node_disk_io_time_seconds_total|node_disk_io_time_weighted_seconds_total|node_disk_read_bytes_total|node_disk_read_time_seconds_total|node_disk_reads_completed_total|node_disk_reads_merged_total|node_disk_write_time_seconds_total|node_disk_writes_completed_total|node_disk_writes_merged_total|node_disk_written_bytes_total|node_entropy_available_bits|node_filefd_allocated|node_filefd_maximum|node_filesystem_avail_bytes|node_filesystem_device_error|node_filesystem_files|node_filesystem_files_free|node_filesystem_free_bytes|node_filesystem_readonly|node_filesystem_size_bytes|node_forks_total|node_hwmon_chip_names|node_hwmon_temp_celsius|node_hwmon_temp_crit_alarm_celsius|node_hwmon_temp_crit_celsius|node_hwmon_temp_crit_hyst_celsius|node_hwmon_temp_max_celsius|node_interrupts_total|node_intr_total|node_load1|node_load15|node_load5|node_memory_Active_anon_bytes|node_memory_Active_bytes|node_memory_Active_file_bytes|node_memory_AnonHugePages_bytes|node_memory_AnonPages_bytes|node_memory_Boun
ce_bytes|node_memory_Buffers_bytes|node_memory_Cached_bytes|node_memory_CommitLimit_bytes|node_memory_Committed_AS_bytes|node_memory_DirectMap1G_bytes|node_memory_DirectMap2M_bytes|node_memory_DirectMap4k_bytes|node_memory_Dirty_bytes|node_memory_HardwareCorrupted_bytes|node_memory_HugePages_Free|node_memory_HugePages_Rsvd|node_memory_HugePages_Surp|node_memory_HugePages_Total|node_memory_Hugepagesize_bytes|node_memory_Inactive_anon_bytes|node_memory_Inactive_bytes|node_memory_Inactive_file_bytes|node_memory_KernelStack_bytes|node_memory_Mapped_bytes|node_memory_MemAvailable_bytes|node_memory_MemFree_bytes|node_memory_MemTotal_bytes|node_memory_Mlocked_bytes|node_memory_NFS_Unstable_bytes|node_memory_PageTables_bytes|node_memory_Percpu_bytes|node_memory_SReclaimable_bytes|node_memory_SUnreclaim_bytes|node_memory_ShmemHugePages_bytes|node_memory_ShmemPmdMapped_bytes|node_memory_Shmem_bytes|node_memory_Slab_bytes|node_memory_SwapCached_bytes|node_memory_SwapFree_bytes|node_memory_SwapTotal_bytes|node_memory_Unevictable_bytes|node_memory_VmallocChunk_bytes|node_memory_VmallocTotal_bytes|node_memory_VmallocUsed_bytes|node_memory_WritebackTmp_bytes|node_memory_Writeback_bytes|node_netstat_Icmp_InErrors|node_netstat_Icmp_InMsgs|node_netstat_Icmp_OutMsgs|node_netstat_IpExt_InOctets|node_netstat_IpExt_OutOctets|node_netstat_Ip_Forwarding|node_netstat_TcpExt_ListenDrops|node_netstat_TcpExt_ListenOverflows|node_netstat_TcpExt_SyncookiesFailed|node_netstat_TcpExt_SyncookiesRecv|node_netstat_TcpExt_SyncookiesSent|node_netstat_TcpExt_TCPOFOQueue|node_netstat_TcpExt_TCPRcvQDrop|node_netstat_TcpExt_TCPSynRetrans|node_netstat_Tcp_ActiveOpens|node_netstat_Tcp_CurrEstab|node_netstat_Tcp_InErrs|node_netstat_Tcp_InSegs|node_netstat_Tcp_MaxConn|node_netstat_Tcp_OutRsts|node_netstat_Tcp_OutSegs|node_netstat_Tcp_PassiveOpens|node_netstat_Tcp_RetransSegs|node_netstat_UdpLite_InErrors|node_netstat_Udp_InDatagrams|node_netstat_Udp_InErrors|node_netstat_Udp_NoPorts|node_netstat_Udp_OutDatagra
ms|node_netstat_Udp_RcvbufErrors|node_netstat_Udp_SndbufErrors|node_network_carrier|node_network_mtu_bytes|node_network_receive_bytes_total|node_network_receive_compressed_total|node_network_receive_drop_total|node_network_receive_errs_total|node_network_receive_fifo_total|node_network_receive_frame_total|node_network_receive_multicast_total|node_network_receive_packets_total|node_network_speed_bytes|node_network_transmit_bytes_total|node_network_transmit_carrier_total|node_network_transmit_colls_total|node_network_transmit_compressed_total|node_network_transmit_drop_total|node_network_transmit_errs_total|node_network_transmit_fifo_total|node_network_transmit_packets_total|node_network_transmit_queue_length|node_network_up|node_nf_conntrack_entries|node_nf_conntrack_entries_limit|node_power_supply_online|node_pressure_cpu_waiting_seconds_total|node_pressure_io_stalled_seconds_total|node_pressure_io_waiting_seconds_total|node_pressure_memory_stalled_seconds_total|node_pressure_memory_waiting_seconds_total|node_processes_max_processes|node_processes_max_threads|node_processes_pids|node_processes_state|node_processes_threads|node_procs_blocked|node_procs_running|node_schedstat_running_seconds_total|node_schedstat_timeslices_total|node_schedstat_waiting_seconds_total|node_scrape_collector_duration_seconds|node_scrape_collector_success|node_sockstat_FRAG_inuse|node_sockstat_FRAG_memory|node_sockstat_RAW_inuse|node_sockstat_TCP_alloc|node_sockstat_TCP_inuse|node_sockstat_TCP_mem|node_sockstat_TCP_mem_bytes|node_sockstat_TCP_orphan|node_sockstat_TCP_tw|node_sockstat_UDPLITE_inuse|node_sockstat_UDP_inuse|node_sockstat_UDP_mem|node_sockstat_UDP_mem_bytes|node_sockstat_sockets_used|node_softnet_dropped_total|node_softnet_processed_total|node_softnet_times_squeezed_total|node_systemd_socket_accepted_connections_total|node_systemd_units|node_tcp_connection_states|node_textfile_scrape_error|node_time_seconds|node_timex_estimated_error_seconds|node_timex_frequency_adjustment_rati
o|node_timex_loop_time_constant|node_timex_maxerror_seconds|node_timex_offset_seconds|node_timex_sync_status|node_timex_tai_offset_seconds|node_timex_tick_seconds|node_uname_info|node_vmstat_oom_kill|node_vmstat_pgfault|node_vmstat_pgmajfault|node_vmstat_pgpgin|node_vmstat_pgpgout|node_vmstat_pswpin|node_vmstat_pswpout|object_type_name|offset|operator_controller_objects_count|operator_log_messages_total|operator_prometheus_converter_active_watchers|operator_prometheus_converter_watch_events_total|operator_reconcile_throttled_events_total|persistentvolumeclaim|process_cpu_cores_available|process_cpu_seconds_total|process_io_read_syscalls_total|process_io_storage_read_bytes_total|process_io_storage_written_bytes_total|process_io_write_syscalls_total|process_max_fds|process_num_threads|process_open_fds|process_resident_memory_anon_bytes|process_resident_memory_bytes|process_resident_memory_max_bytes|process_virtual_memory_bytes|process_virtual_memory_max_bytes|prometheus|prometheus_build_info|prometheus_tsdb_head_series|protocol|qos_class|reason|recording|resource|rest_client_request_duration_seconds_bucket|rest_client_requests_total|scalar|scheduler_binding_duration_seconds_bucket|scheduler_binding_duration_seconds_count|scheduler_e2e_scheduling_duration_seconds_bucket|scheduler_e2e_scheduling_duration_seconds_count|scheduler_scheduling_algorithm_duration_seconds_bucket|scheduler_scheduling_algorithm_duration_seconds_count|scheduler_volume_scheduling_duration_seconds_bucket|scheduler_volume_scheduling_duration_seconds_count|scrape_job|scrape_series_added|service|short_version|softirq|status_code|system|topk_max|unless|vector|version|vm_active_merges|vm_app_start_timestamp|vm_app_uptime_seconds|vm_app_version|vm_assisted_merges_total|vm_available_cpu_cores|vm_available_memory_bytes|vm_cache_entries|vm_cache_misses_total|vm_cache_requests_total|vm_cache_size_bytes|vm_cache_size_max_bytes|vm_concurrent_insert_capacity|vm_concurrent_insert_current|vm_data_size_bytes|vm_fr
ee_disk_space_bytes|vm_free_disk_space_limit_bytes|vm_http_request_errors_total|vm_http_requests_total|vm_ingestserver_request_errors_total|vm_ingestserver_requests_total|vm_log_messages_total|vm_new_timeseries_created_total|vm_parts|vm_pending_rows|vm_persistentqueue_bytes_dropped_total|vm_persistentqueue_read_duration_seconds_total|vm_persistentqueue_write_duration_seconds_total|vm_promscrape_conn_bytes_read_total|vm_promscrape_dial_errors_total|vm_promscrape_max_scrape_size_exceeded_errors_total|vm_promscrape_scrape_duration_seconds_bucket|vm_promscrape_scrape_pool_targets|vm_promscrape_scrape_response_size_bytes_bucket|vm_promscrape_scraped_samples_sum|vm_promscrape_scrapes_failed_total|vm_promscrape_scrapes_gunzip_failed_total|vm_promscrape_scrapes_timed_out_total|vm_promscrape_scrapes_total|vm_promscrape_targets|vm_protoparser_read_errors_total|vm_protoparser_rows_read_total|vm_protoparser_unmarshal_errors_total|vm_request_duration_seconds|vm_rows|vm_rows_added_to_storage_total|vm_rows_ignored_total|vm_rows_inserted_total|vm_rows_invalid_total|vm_rows_merged_total|vm_rows_read_per_query_bucket|vm_rows_read_per_series_bucket|vm_rows_scanned_per_query_bucket|vm_series_read_per_query_bucket|vm_slow_queries_total|vm_slow_row_inserts_total|vm_streamaggr_dedup_flush_duration_seconds_bucket|vm_streamaggr_dedup_flush_timeouts_total|vm_streamaggr_flush_timeouts_total|vm_streamaggr_ignored_samples_total|vm_streamaggr_labels_compressor_items_count|vm_streamaggr_labels_compressor_size_bytes|vm_streamaggr_matched_samples_total|vm_streamaggr_output_samples_total|vm_streamaggr_samples_lag_seconds_bucket|vm_tcplistener_accepts_total|vm_tcplistener_conns|vm_tcplistener_read_bytes_total|vm_tcplistener_written_bytes_total|vmagent_daily_series_limit_current_series|vmagent_daily_series_limit_max_series|vmagent_hourly_series_limit_current_series|vmagent_hourly_series_limit_max_series|vmagent_http_request_errors_total|vmagent_http_requests_total|vmagent_remotewrite_block_size_rows_s
um|vmagent_remotewrite_conn_bytes_written_total|vmagent_remotewrite_conns|vmagent_remotewrite_packets_dropped_total|vmagent_remotewrite_pending_data_bytes|vmagent_remotewrite_queues|vmagent_remotewrite_relabel_metrics_dropped_total|vmagent_remotewrite_requests_total|vmagent_remotewrite_retries_count_total|vmagent_remotewrite_send_duration_seconds_total|vmagent_rows_inserted_total|vmalert_alerting_rules_errors_total|vmalert_alerting_rules_last_evaluation_samples|vmalert_alerts_fired_total|vmalert_alerts_firing|vmalert_alerts_pending|vmalert_alerts_send_errors_total|vmalert_alerts_sent_total|vmalert_config_last_reload_successful|vmalert_execution_errors_total|vmalert_execution_total|vmalert_iteration_duration_seconds_count|vmalert_iteration_duration_seconds_sum|vmalert_iteration_missed_total|vmalert_iteration_total|vmalert_recording_rules_errors_total|vmalert_recording_rules_last_evaluation_samples|vmalert_remotewrite_conn_bytes_written_total|vmalert_remotewrite_conns|vmalert_remotewrite_dropped_rows_total|vmalert_remotewrite_sent_rows_total|vmrange|vmauth_user_concurrent_requests_capacity|vmauth_concurrent_requests_limit_reached_total|vmauth_config_last_reload_successful|vmauth_http_request_errors_total|vmauth_unauthorized_user_concurrent_requests_limit_reached_total|vmauth_unauthorized_user_request_duration_seconds|vmauth_unauthorized_user_requests_total|vmauth_user_concurrent_requests_current|vmauth_user_concurrent_requests_limit_reached_total|vmauth_user_request_duration_seconds|vmauth_user_requests_total|without|workqueue_depth|mmcblk)$" + action = "keep" + } + + } + + prometheus.relabel "add_cluster_label" { + forward_to = [prometheus.relabel.remove_unwanted_labels.receiver] + + rule { + target_label = "cluster" + replacement = "{{ .name }}" + } + } + + prometheus.relabel "remove_unwanted_labels" { + forward_to = [prometheus.remote_write.in_cluster.receiver] + + rule { + action = "labeldrop" + regex = "^(cloud_google_com).*" + } + } + clustering: + enabled: 
true + name: alloy + resources: + requests: + cpu: 500m + memory: 2Gi + limits: + cpu: "2" + memory: 6Gi + controller: + type: statefulset + nodeSelector: + node_pool: observability-resources + destination: + server: "{{.server}}" + namespace: observability + syncPolicy: + syncOptions: + - CreateNamespace=true + - ServerSideApply=true + automated: + prune: true + selfHeal: true diff --git a/argocd/ctrl_plane/dev/certmanager.yaml b/argocd/ctrl_plane/dev/certmanager.yaml new file mode 100644 index 00000000..83d913ba --- /dev/null +++ b/argocd/ctrl_plane/dev/certmanager.yaml @@ -0,0 +1,25 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application # Argo CD Application that installs the cert-manager Helm chart +metadata: + name: cert-manager + namespace: argocd +spec: + destination: + namespace: cert-manager + server: https://kubernetes.default.svc + project: default + source: + chart: cert-manager + repoURL: https://charts.jetstack.io + targetRevision: v1.17.0 # fixed chart version + helm: + valuesObject: + nodeSelector: + node_pool: observability-resources # restrict scheduling to nodes labelled node_pool=observability-resources + parameters: + - name: installCRDs # NOTE(review): presumably makes the chart install its CRDs; confirm against the chart values + value: "true" + syncPolicy: + automated: {} # automated sync is on; prune and selfHeal are not set here + syncOptions: + - CreateNamespace=true \ No newline at end of file diff --git a/argocd/ctrl_plane/dev/grafana-dashboards.yaml b/argocd/ctrl_plane/dev/grafana-dashboards.yaml new file mode 100644 index 00000000..5db23199 --- /dev/null +++ b/argocd/ctrl_plane/dev/grafana-dashboards.yaml @@ -0,0 +1,21 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application # Argo CD Application that syncs the observability/grafana path of the repository +metadata: + name: grafana-dashboards + namespace: argocd +spec: + project: default + source: + path: observability/grafana + repoURL: https://github.com/FalkorDB/falkordb-dbaas.git + targetRevision: dev # follows the dev revision of the repository + destination: + server: https://kubernetes.default.svc + namespace: observability + syncPolicy: + syncOptions: + - CreateNamespace=true + - ServerSideApply=true + automated: + prune: true + selfHeal: true diff --git a/argocd/ctrl_plane/dev/grafana-operator.yaml b/argocd/ctrl_plane/dev/grafana-operator.yaml new file mode 100644 index 00000000..c412edf5 --- /dev/null +++ 
b/argocd/ctrl_plane/dev/grafana-operator.yaml @@ -0,0 +1,26 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application # Argo CD Application that installs the grafana-operator Helm chart +metadata: + name: grafana-operator + namespace: argocd +spec: + project: default + source: + chart: grafana-operator + repoURL: ghcr.io/grafana/helm-charts # NOTE(review): no URL scheme, presumably an OCI registry path; confirm + targetRevision: v5.16.0 # fixed chart version + helm: + valuesObject: + fullnameOverride: grafana-operator + nodeSelector: + node_pool: observability-resources # restrict scheduling to nodes labelled node_pool=observability-resources + destination: + server: https://kubernetes.default.svc + namespace: observability + syncPolicy: + syncOptions: + - CreateNamespace=true + - ServerSideApply=true + automated: + prune: true + selfHeal: true diff --git a/argocd/ctrl_plane/dev/ksm.yaml b/argocd/ctrl_plane/dev/ksm.yaml new file mode 100644 index 00000000..c39f42e3 --- /dev/null +++ b/argocd/ctrl_plane/dev/ksm.yaml @@ -0,0 +1,25 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application # Argo CD Application that installs the kube-state-metrics Helm chart +metadata: + name: kube-state-metrics + namespace: argocd +spec: + project: default + source: + chart: kube-state-metrics + repoURL: https://prometheus-community.github.io/helm-charts + targetRevision: 5.30.0 # fixed chart version + helm: + valuesObject: + fullnameOverride: ksm + nodeSelector: + node_pool: observability-resources + destination: + server: https://kubernetes.default.svc + namespace: observability + syncPolicy: + syncOptions: + - CreateNamespace=true + automated: + prune: true + selfHeal: true diff --git a/argocd/ctrl_plane/dev/kubernetes-secret-generator.yaml b/argocd/ctrl_plane/dev/kubernetes-secret-generator.yaml new file mode 100644 index 00000000..ebde9e4f --- /dev/null +++ b/argocd/ctrl_plane/dev/kubernetes-secret-generator.yaml @@ -0,0 +1,25 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application # Argo CD Application that installs the kubernetes-secret-generator Helm chart +metadata: + name: kubernetes-secret-generator + namespace: argocd +spec: + project: default + source: + chart: kubernetes-secret-generator + repoURL: https://helm.mittwald.de + targetRevision: 3.4.0 # fixed chart version + helm: + valuesObject: + nodeSelector: + node_pool: observability-resources + destination: + server: 
https://kubernetes.default.svc + namespace: observability + syncPolicy: + syncOptions: + - CreateNamespace=true + - RespectIgnoreDifferences=true + automated: + prune: true + selfHeal: true diff --git a/argocd/ctrl_plane/dev/kustomize/vmuser/kustomization.yaml b/argocd/ctrl_plane/dev/kustomize/vmuser/kustomization.yaml new file mode 100644 index 00000000..d8687bda --- /dev/null +++ b/argocd/ctrl_plane/dev/kustomize/vmuser/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization # base rendered by the vmusers ApplicationSet, which patches the names per cluster +metadata: + name: vmuser-kustomization +namespace: observability +resources: + - vmuser.yaml + - string-secret.yaml \ No newline at end of file diff --git a/argocd/ctrl_plane/dev/kustomize/vmuser/string-secret.yaml b/argocd/ctrl_plane/dev/kustomize/vmuser/string-secret.yaml new file mode 100644 index 00000000..c571abdf --- /dev/null +++ b/argocd/ctrl_plane/dev/kustomize/vmuser/string-secret.yaml @@ -0,0 +1,11 @@ +apiVersion: secretgenerator.mittwald.de/v1alpha1 +kind: StringSecret # generated credential; its "password" field is what the VMUser passwordRef reads +metadata: + name: vmuser-secret + namespace: observability +spec: + forceRegenerate: false + fields: + - fieldName: "password" + encoding: "base64" + length: "16" diff --git a/argocd/ctrl_plane/dev/kustomize/vmuser/vmuser.yaml b/argocd/ctrl_plane/dev/kustomize/vmuser/vmuser.yaml new file mode 100644 index 00000000..bd0ea4d5 --- /dev/null +++ b/argocd/ctrl_plane/dev/kustomize/vmuser/vmuser.yaml @@ -0,0 +1,20 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMUser +metadata: + name: vmuser + namespace: observability +spec: + name: vmuser + username: vmuser + passwordRef: + key: password + name: vmuser-secret # matches the StringSecret name in this base; the vmusers ApplicationSet replaces it per cluster + targetRefs: + - crd: + kind: VMSingle + name: vm + namespace: observability + paths: + - "/api/v1/write" + - "/api/v1/query" + - "/api/v1/query_range" diff --git a/argocd/ctrl_plane/dev/manifests.yaml b/argocd/ctrl_plane/dev/manifests.yaml new file mode 100644 index 00000000..0801b0fe --- /dev/null +++ b/argocd/ctrl_plane/dev/manifests.yaml @@ -0,0 +1,29 @@ 
+apiVersion: argoproj.io/v1alpha1 +kind: Application # Argo CD Application that syncs every manifest under argocd/ctrl_plane/dev/manifests +metadata: + name: observability-stack + namespace: argocd +spec: + project: default + source: + repoURL: https://github.com/FalkorDB/falkordb-dbaas.git + targetRevision: dev + path: argocd/ctrl_plane/dev/manifests + directory: + recurse: true + destination: + server: https://kubernetes.default.svc + namespace: observability + syncPolicy: + syncOptions: + - CreateNamespace=true + - RespectIgnoreDifferences=true # without this a sync re-applies the empty vmauth-tls Secret over the fields ignored below + automated: + prune: true + selfHeal: true + ignoreDifferences: # vmauth-tls is committed with empty values; NOTE(review): presumably filled in by cert-manager, confirm + - jsonPointers: + - /stringData + - /data + kind: Secret + name: vmauth-tls diff --git a/argocd/ctrl_plane/dev/manifests/grafana-datasource.yaml b/argocd/ctrl_plane/dev/manifests/grafana-datasource.yaml new file mode 100644 index 00000000..01bf59e7 --- /dev/null +++ b/argocd/ctrl_plane/dev/manifests/grafana-datasource.yaml @@ -0,0 +1,20 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDatasource +metadata: + name: victoriametrics + namespace: observability +spec: + instanceSelector: + matchLabels: + dashboards: grafana + allowCrossNamespaceImport: false + datasource: + isDefault: true + access: proxy + database: prometheus + jsonData: + timeInterval: 5s + tlsSkipVerify: true + name: VictoriaMetrics + type: prometheus + url: http://vmsingle-vm.observability.svc.cluster.local:8429 \ No newline at end of file diff --git a/argocd/ctrl_plane/dev/manifests/grafana-managed-certificate.yaml b/argocd/ctrl_plane/dev/manifests/grafana-managed-certificate.yaml new file mode 100644 index 00000000..d8955a80 --- /dev/null +++ b/argocd/ctrl_plane/dev/manifests/grafana-managed-certificate.yaml @@ -0,0 +1,7 @@ +apiVersion: networking.gke.io/v1 +kind: ManagedCertificate +metadata: + name: grafana-managed-cert +spec: + domains: + - "grafana.observability.dev.internal.falkordb.cloud" diff --git a/argocd/ctrl_plane/dev/manifests/grafana.yaml b/argocd/ctrl_plane/dev/manifests/grafana.yaml new file mode 100644 index 00000000..1a83d801 --- /dev/null +++ 
b/argocd/ctrl_plane/dev/manifests/grafana.yaml @@ -0,0 +1,75 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: Grafana +metadata: + name: grafana + labels: + dashboards: "grafana" +spec: + persistentVolumeClaim: + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi + deployment: + spec: + replicas: 1 + template: + spec: + nodeSelector: + node_pool: observability-resources + securityContext: # Pod-level security context (important for initContainer) + runAsUser: 0 # Run initContainer as root + runAsGroup: 0 + initContainers: + - name: init-grafana-data + image: busybox:1.36 # Pinned tag (any image with chown works); an untagged image resolves to latest + command: ["chown", "-R", "1001:1001", "/var/lib/grafana"] + volumeMounts: + - name: grafana-data + mountPath: /var/lib/grafana + containers: + - name: grafana + readinessProbe: + failureThreshold: 3 + resources: + requests: + cpu: "500m" + memory: "256Mi" + limits: + cpu: "2" + memory: "2Gi" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + runAsUser: 1001 # same uid/gid the init container chowns the data directory to + runAsGroup: 1001 + runAsNonRoot: true + volumes: + - name: grafana-data + persistentVolumeClaim: + claimName: grafana-pvc + config: + log: + mode: "console" + auth: + disable_login_form: "false" + ingress: + metadata: + annotations: + networking.gke.io/managed-certificates: grafana-managed-cert + kubernetes.io/ingress.class: "gce" # Updated annotation + spec: + rules: + - host: grafana.observability.dev.internal.falkordb.cloud + http: + paths: + - backend: + service: + name: grafana-service + port: + number: 3000 + path: / + pathType: Prefix diff --git a/argocd/ctrl_plane/dev/manifests/letsencrypt-issuer.yaml b/argocd/ctrl_plane/dev/manifests/letsencrypt-issuer.yaml new file mode 100644 index 00000000..cb6afd86 --- /dev/null +++ b/argocd/ctrl_plane/dev/manifests/letsencrypt-issuer.yaml @@ -0,0 +1,15 @@ +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: letsencrypt + namespace: observability +spec: + acme: + email: devops+dev@falkordb.com 
+ server: https://acme-staging-v02.api.letsencrypt.org/directory + privateKeySecretRef: + name: letsencrypt-private-key + solvers: + - http01: + ingress: + name: "vmauth-vm" diff --git a/argocd/ctrl_plane/dev/manifests/vmauth-tls.yaml b/argocd/ctrl_plane/dev/manifests/vmauth-tls.yaml new file mode 100644 index 00000000..c43908c6 --- /dev/null +++ b/argocd/ctrl_plane/dev/manifests/vmauth-tls.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: Secret +metadata: + name: vmauth-tls + namespace: observability +type: kubernetes.io/tls +stringData: + tls.key: "" + tls.crt: "" \ No newline at end of file diff --git a/argocd/ctrl_plane/dev/victoriametrics.yaml b/argocd/ctrl_plane/dev/victoriametrics.yaml new file mode 100644 index 00000000..8cdded12 --- /dev/null +++ b/argocd/ctrl_plane/dev/victoriametrics.yaml @@ -0,0 +1,152 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application # Argo CD Application that installs the victoria-metrics-k8s-stack Helm chart as release "vm" +metadata: + name: victoriametrics + namespace: argocd +spec: + project: default + source: + chart: victoria-metrics-k8s-stack + repoURL: https://victoriametrics.github.io/helm-charts + targetRevision: 0.35.1 + helm: + releaseName: vm + valuesObject: + fullnameOverride: "vm" + defaultDashboards: + enabled: true + annotations: + argocd.argoproj.io/sync-options: ServerSideApply=true + dashboards: + victoriametrics-vmagent: + enabled: false + grafanaOperator: + enabled: true + spec: + allowCrossNamespaceImport: false + instanceSelector: + matchLabels: + dashboards: grafana + defaultRules: + create: false + victoria-metrics-operator: + operator: + disable_prometheus_converter: true + nodeSelector: + node_pool: observability-resources + vmsingle: + spec: + retentionPeriod: 365d + logFormat: json + nodeSelector: + node_pool: observability-resources + storage: + resources: + requests: + storage: 200Gi + serviceSpec: + metadata: + annotations: + cloud.google.com/load-balancer-type: Internal + spec: + type: LoadBalancer + alertmanager: + spec: + nodeSelector: + node_pool: observability-resources + secrets: + - 
pagerduty-service-key + configSecret: null + config: + global: + resolve_timeout: 5m + route: + group_by: ["alertname", "namespace", "pod"] + group_wait: 30s + group_interval: 5m + repeat_interval: 12h + receiver: "pagerduty" + routes: + - match: + alertname: Watchdog + receiver: "blackhole" + - receiver: "pagerduty" + receivers: + - name: blackhole + - name: "pagerduty" + pagerduty_configs: + - service_key_file: "/etc/vm/secrets/pagerduty-service-key/api-key" + url: https://events.pagerduty.com/generic/2010-04-15/create_event.json + send_resolved: true + vmalert: + spec: + nodeSelector: + node_pool: observability-resources + vmauth: + enabled: true + spec: + selectAllByDefault: true + userNamespaceSelector: {} + userSelector: {} + nodeSelector: + node_pool: public-pool + ingress: + annotations: + cert-manager.io/issuer: letsencrypt + kubernetes.io/ingress.class: gce + kubernetes.io/ingress.allow-http: "false" + tlsSecretName: vmauth-tls + tlsHosts: + - "vmauth.observability.dev.internal.falkordb.cloud" + vmagent: + enabled: false + grafana: + enabled: false + prometheus-node-exporter: + enabled: false + kube-state-metrics: + enabled: false + kubelet: + enabled: false + kubeApiServer: + enabled: false + kubeControllerManager: + enabled: false + kubeDns: + enabled: false + coreDns: + enabled: false + kubeEtcd: + enabled: false + kubeScheduler: # was a stray key named "enabled", which left the scheduler section at chart defaults + enabled: false + kubeProxy: + enabled: false + destination: + server: https://kubernetes.default.svc + namespace: observability + syncPolicy: + syncOptions: + - CreateNamespace=true + - RespectIgnoreDifferences=true + automated: + prune: true + selfHeal: true + ignoreDifferences: + - group: "" + kind: Secret + name: vm-victoria-metrics-operator-validation + namespace: observability + jsonPointers: + - /data + - group: admissionregistration.k8s.io + kind: ValidatingWebhookConfiguration + name: vm-victoria-metrics-operator-admission + jqPathExpressions: + - ".webhooks[]?.clientConfig.caBundle" + - group: apps # Deployments belong to the apps API group; the empty core group never matched + kind: Deployment + 
name: vm-grafana + namespace: observability + jsonPointers: + - /spec/template/metadata/annotations/checksum~1secret diff --git a/argocd/ctrl_plane/dev/vmrules.yaml b/argocd/ctrl_plane/dev/vmrules.yaml new file mode 100644 index 00000000..90712a93 --- /dev/null +++ b/argocd/ctrl_plane/dev/vmrules.yaml @@ -0,0 +1,22 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: vmrules + namespace: argocd +spec: + project: default + source: + repoURL: https://github.com/FalkorDB/falkordb-dbaas.git + targetRevision: dev + path: observability/rules + directory: + recurse: true + destination: + server: https://kubernetes.default.svc + namespace: observability + syncPolicy: + syncOptions: + - CreateNamespace=true + automated: + prune: true + selfHeal: true diff --git a/argocd/ctrl_plane/dev/vmusers.yaml b/argocd/ctrl_plane/dev/vmusers.yaml new file mode 100644 index 00000000..d9104f3f --- /dev/null +++ b/argocd/ctrl_plane/dev/vmusers.yaml @@ -0,0 +1,56 @@ +apiVersion: argoproj.io/v1alpha1 +kind: ApplicationSet +metadata: + name: vmusers + namespace: argocd +spec: + goTemplate: true + goTemplateOptions: ["missingkey=error"] + generators: + - clusters: + selector: + matchLabels: + role: app-plane + template: + metadata: + name: '{{ regexFind "h?c-[A-Za-z0-9]+" .name }}-vmuser' + spec: + project: default + source: + repoURL: https://github.com/FalkorDB/falkordb-dbaas.git + targetRevision: dev + path: argocd/ctrl_plane/dev/kustomize/vmuser + kustomize: + patches: + - target: + kind: VMUser + name: vmuser + patch: |- + - op: replace + path: /metadata/name + value: '{{ regexFind "h?c-[A-Za-z0-9]+" .name }}-user' + - op: replace + path: /spec/name + value: '{{ regexFind "h?c-[A-Za-z0-9]+" .name }}' + - op: replace + path: /spec/username + value: '{{ regexFind "h?c-[A-Za-z0-9]+" .name }}-user' + - op: replace + path: /spec/passwordRef/name + value: '{{ regexFind "h?c-[A-Za-z0-9]+" .name }}-vmuser' + - target: + kind: StringSecret + name: vmuser-secret + patch: |- + 
- op: replace + path: /metadata/name + value: '{{ regexFind "h?c-[A-Za-z0-9]+" .name }}-vmuser' + destination: + server: https://kubernetes.default.svc + namespace: observability + syncPolicy: + syncOptions: + - CreateNamespace=true + automated: + prune: true + selfHeal: true diff --git a/observability/grafana/base/grafana-dashboards.yaml b/observability/grafana/base/grafana-dashboards.yaml new file mode 100644 index 00000000..271c3b75 --- /dev/null +++ b/observability/grafana/base/grafana-dashboards.yaml @@ -0,0 +1,72 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard # one document per dashboard; every document in this file has the same shape +metadata: + name: cluster-overview +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" # same label value the Grafana resource in manifests/grafana.yaml carries + configMapRef: + name: dashboards-k8s-views-cluster # ConfigMap the dashboard is read from + key: json # key inside that ConfigMap +--- +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: pod-overview +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + configMapRef: + name: dashboards-k8s-views-pods + key: json +--- +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: namespace-overview +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + configMapRef: + name: dashboards-k8s-views-namespaces + key: json +--- +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: falkordb-cloud +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + configMapRef: + name: dashboards-k8s-views-falkordb + key: json +--- +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: prometheus-dashboard +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + configMapRef: + name: dashboards-k8s-views-prometheus + key: json +--- +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + name: vmauth +spec: + instanceSelector: + matchLabels: + dashboards: "grafana" + configMapRef: + name: dashboards-k8s-views-vmauth + key: json +--- \ No newline at end of file diff 
--git a/observability/grafana/base/kustomization.yaml b/observability/grafana/base/kustomization.yaml new file mode 100644 index 00000000..1befb76b --- /dev/null +++ b/observability/grafana/base/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - grafana-dashboards.yaml \ No newline at end of file diff --git a/observability/grafana/dashboards/cluster-overview.json b/observability/grafana/dashboards/cluster-overview.json new file mode 100644 index 00000000..fa8a4940 --- /dev/null +++ b/observability/grafana/dashboards/cluster-overview.json @@ -0,0 +1,3527 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "This is a modern 'Global View' dashboard for your Kubernetes cluster(s). Made for kube-prometheus-stack and take advantage of the latest Grafana features. 
GitHub repository: https://github.com/dotdc/grafana-dashboards-kubernetes", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 67, + "panels": [], + "title": "Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 77, + "interval": "2m", + "options": { + "displayMode": "lcd", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "avg(sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])))", + "interval": "", + "legendFormat": "Real Linux", + "range": true, + "refId": "Real Linux" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "avg(sum by (core) (rate(windows_cpu_time_total{mode!=\"idle\", cluster=\"$cluster\"}[$__rate_interval])))", + "hide": false, + "interval": "", + "legendFormat": "Real Windows", + 
"range": true, + "refId": "Real Windows" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_requests{resource=\"cpu\", cluster=\"$cluster\"}) / sum(machine_cpu_cores{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Requests", + "range": true, + "refId": "Requests" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_limits{resource=\"cpu\", cluster=\"$cluster\"}) / sum(machine_cpu_cores{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Limits", + "range": true, + "refId": "Limits" + } + ], + "title": "Global CPU Usage", + "transformations": [ + { + "id": "calculateField", + "options": { + "alias": "Real", + "mode": "reduceRow", + "reduce": { + "include": [ + "Real Linux", + "Real Windows" + ], + "reducer": "mean" + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Real Linux": true, + "Real Windows": true, + "Time": true + }, + "indexByName": { + "Limits": 5, + "Real": 1, + "Real Linux": 2, + "Real Windows": 3, + "Requests": 4, + "Time": 0 + }, + "renameByName": {} + } + } + ], + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "decimals": 2, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 78, + "options": { + "displayMode": "lcd", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 10, + "minVizWidth": 0, + "namePlacement": "auto", + 
"orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "text": {}, + "valueMode": "color" + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"} - node_memory_MemAvailable_bytes{cluster=\"$cluster\", job=\"$job\"}) / sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"})", + "hide": false, + "interval": "", + "legendFormat": "Real Linux", + "range": true, + "refId": "Real Linux" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(windows_memory_available_bytes{cluster=\"$cluster\"} + windows_memory_cache_bytes{cluster=\"$cluster\"}) / sum(windows_os_visible_memory_bytes{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Real Windows", + "range": true, + "refId": "Real Windows" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_requests{resource=\"memory\", cluster=\"$cluster\"}) / sum(machine_memory_bytes{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Requests", + "range": true, + "refId": "Requests" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_limits{resource=\"memory\", cluster=\"$cluster\"}) / sum(machine_memory_bytes{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Limits", + "range": true, + "refId": "Limits" + } + ], + "title": "Global RAM Usage", + "transformations": [ + { + "id": "calculateField", + "options": { + "alias": "Real", + "mode": "reduceRow", + "reduce": { + "include": [ + "Real Linux", + "Real Windows" + ], + 
"reducer": "mean" + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Real Linux": true, + "Real Windows": true, + "Time": true + }, + "includeByName": {}, + "indexByName": { + "Limits": 5, + "Real": 3, + "Real Linux": 1, + "Real Windows": 2, + "Requests": 4, + "Time": 0 + }, + "renameByName": {} + } + } + ], + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 12, + "y": 1 + }, + "id": 63, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "count(count by (node) (kube_node_info{cluster=\"$cluster\"}))", + "interval": "", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Nodes", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + 
"scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 9, + "x": 15, + "y": 1 + }, + "id": 52, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(kube_namespace_labels{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Namespaces", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_pod_container_status_running{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Running Containers", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Running Pods", + "refId": "O" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_service_info{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Services", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_endpoint_info{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Endpoints", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_ingress_info{cluster=\"$cluster\"})", + 
"interval": "", + "legendFormat": "Ingresses", + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_deployment_labels{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Deployments", + "refId": "F" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_statefulset_labels{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Statefulsets", + "refId": "G" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_daemonset_labels{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Daemonsets", + "refId": "H" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_persistentvolumeclaim_info{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Persistent Volume Claims", + "refId": "I" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_hpa_labels{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Horizontal Pod Autoscalers", + "refId": "J" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_configmap_info{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Configmaps", + "refId": "K" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_secret_info{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Secrets", + "refId": "L" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_networkpolicy_labels{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Network Policies", + "refId": "M" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "count(count by (node) (kube_node_info{cluster=\"$cluster\"}))", + "hide": false, + "interval": "", + "legendFormat": 
"Nodes", + "refId": "N" + } + ], + "title": "Kubernetes Resource Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 12, + "y": 4 + }, + "id": 62, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_pod_status_phase{phase=\"Running\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Running Pods", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 12, + "y": 7 + }, + "id": 59, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "count(count(up) by (namespace))", + "interval": "", + 
"legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Namespaces", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 9 + }, + "id": 37, + "interval": "2m", + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "Real Linux", + "range": true, + "refId": "Real Linux" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(windows_cpu_time_total{mode!=\"idle\", cluster=\"$cluster\"}[$__rate_interval]))", + "hide": false, + "interval": "", + "legendFormat": "Real Windows", + "range": true, + "refId": "Real Windows" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_requests{resource=\"cpu\", cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Requests", + "range": true, + "refId": "Requests" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": 
"sum(kube_pod_container_resource_limits{resource=\"cpu\", cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Limits", + "range": true, + "refId": "Limits" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(machine_cpu_cores{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Total", + "range": true, + "refId": "Total" + } + ], + "title": "CPU Usage", + "transformations": [ + { + "id": "calculateField", + "options": { + "alias": "Real", + "mode": "reduceRow", + "reduce": { + "include": [ + "Real Linux", + "Real Windows" + ], + "reducer": "sum" + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Real Linux": true, + "Real Windows": true, + "Time": true, + "Total Linux": true, + "Total Windows": true + }, + "indexByName": { + "Limits": 5, + "Real": 3, + "Real Linux": 1, + "Real Windows": 2, + "Requests": 4, + "Time": 0, + "Total": 8, + "Total Linux": 6, + "Total Windows": 7 + }, + "renameByName": {} + } + } + ], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 9 + }, + "id": 39, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": 
"sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"} - node_memory_MemAvailable_bytes{cluster=\"$cluster\", job=\"$job\"})", + "interval": "", + "legendFormat": "Real Linux", + "range": true, + "refId": "Real Linux" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(windows_os_visible_memory_bytes{cluster=\"$cluster\"} - windows_memory_available_bytes{cluster=\"$cluster\"} - windows_memory_cache_bytes{cluster=\"$cluster\"})", + "hide": false, + "interval": "", + "legendFormat": "Real Windows", + "range": true, + "refId": "Real Windows" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_requests{resource=\"memory\", cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Requests", + "range": true, + "refId": "Requests" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_limits{resource=\"memory\", cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Limits", + "range": true, + "refId": "Limits" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(machine_memory_bytes{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Total", + "range": true, + "refId": "Total" + } + ], + "title": "RAM Usage", + "transformations": [ + { + "id": "calculateField", + "options": { + "alias": "Real", + "mode": "reduceRow", + "reduce": { + "include": [ + "Real Linux", + "Real Windows" + ], + "reducer": "mean" + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Real Linux": true, + "Real Windows": true, + "Time": true + }, + "includeByName": {}, + "indexByName": { + "Limits": 5, + "Real": 3, + "Real Linux": 1, + "Real Windows": 2, + "Requests": 4, + "Time": 0, + "Total": 6 + }, + 
"renameByName": {} + } + } + ], + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "fe6kx1tpffym8a" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 12, + "y": 10 + }, + "id": 89, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "editorMode": "code", + "expr": "count(count(up{namespace=~\"instance-.+\"}) by (namespace))", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Instances", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 13 + }, + "id": 71, + "panels": [], + "title": "Resources", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd", + "seriesBy": "last" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU %", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": 
{ + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 0.5 + }, + { + "color": "red", + "value": 0.7 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 72, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "avg(sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])))", + "interval": "$resolution", + "legendFormat": "Linux", + "range": true, + "refId": "Linux" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "1 - avg(rate(windows_cpu_time_total{cluster=\"$cluster\",mode=\"idle\"}[$__rate_interval]))", + "hide": false, + "interval": "$resolution", + "legendFormat": "Windows", + "range": true, + "refId": "Windows" + } + ], + "title": "Cluster CPU Utilization", + "transformations": [ + { + "id": "calculateField", + "options": { + "alias": "CPU usage in %", + "mode": "reduceRow", + "reduce": { + "reducer": "mean" + }, + "replaceFields": true + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "MEMORY", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + 
"gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 0.5 + }, + { + "color": "red", + "value": 0.7 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 55, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "hidden", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"} - node_memory_MemAvailable_bytes{cluster=\"$cluster\", job=\"$job\"}) / sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"})", + "interval": "$resolution", + "legendFormat": "Linux", + "range": true, + "refId": "Linux" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(windows_os_visible_memory_bytes{cluster=\"$cluster\"} - windows_memory_available_bytes{cluster=\"$cluster\"}) / sum(windows_os_visible_memory_bytes{cluster=\"$cluster\"})", + "hide": false, + "interval": "$resolution", + "legendFormat": "Windows", + "range": true, + "refId": "Windows" + } + ], + "title": "Cluster Memory Utilization", + "transformations": [ + { + "id": "calculateField", + "options": { + "alias": "Memory 
usage in %", + "mode": "reduceRow", + "reduce": { + "reducer": "mean" + }, + "replaceFields": true + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU CORES", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 22 + }, + "id": 46, + "interval": "2m", + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_cpu_usage_seconds_total{image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (namespace)", + "format": "time_series", + "hide": false, + "interval": "$resolution", + "legendFormat": "{{ namespace }}", + "range": true, + "refId": "A" + } + ], + "title": 
"CPU Utilization by namespace", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 22 + }, + "id": 50, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(container_memory_working_set_bytes{image!=\"\", cluster=\"$cluster\"}) by (namespace)", + "interval": "$resolution", + "legendFormat": "{{ namespace }}", + "range": true, + "refId": "A" + } + ], + "title": "Memory Utilization by namespace", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { 
+ "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU %", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 54, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "avg(sum by (instance, cpu) (rate(node_cpu_seconds_total{mode!~\"idle|iowait|steal\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval]))) by (instance)", + "interval": "$resolution", + "legendFormat": "{{ node }}", + "range": true, + "refId": "Linux" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "avg(sum by (instance,core) (rate(windows_cpu_time_total{mode!=\"idle\", cluster=\"$cluster\"}[$__rate_interval]))) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "{{ node }}", + "range": true, + "refId": "Windows" + } + ], + "title": "CPU 
Utilization by instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "MEMORY", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 30 + }, + "id": 73, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(node_memory_MemTotal_bytes{cluster=\"$cluster\", job=\"$job\"} - node_memory_MemAvailable_bytes{cluster=\"$cluster\", job=\"$job\"}) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "{{ instance }}", + "range": true, + "refId": "Linux" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": 
"sum(windows_os_visible_memory_bytes{cluster=\"$cluster\"} - windows_memory_available_bytes{cluster=\"$cluster\"}) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "{{ instance }}", + "range": true, + "refId": "Windows" + } + ], + "title": "Memory Utilization by instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "No data is generally a good thing here.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "SECONDS", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 38 + }, + "id": 82, + "interval": "2m", + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": 
"sum(rate(container_cpu_cfs_throttled_seconds_total{image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (namespace) > 0", + "interval": "$resolution", + "legendFormat": "{{ namespace }}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Throttled seconds by namespace", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "No data is generally a good thing here.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "NB", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 38 + }, + "id": 83, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_cpu_core_throttles_total{cluster=\"$cluster\", 
job=\"$job\"}[$__rate_interval])) by (instance)", + "interval": "$resolution", + "legendFormat": "{{ instance }}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Core Throttled by instance", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 46 + }, + "id": 86, + "panels": [], + "title": "Kubernetes", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 47 + }, + "id": 84, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(kube_pod_status_qos_class{cluster=\"$cluster\"}) by (qos_class)", + "interval": "", + "legendFormat": "{{ qos_class }} pods", + 
"range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_info{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Total pods", + "range": true, + "refId": "B" + } + ], + "title": "Kubernetes Pods QoS classes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 47 + }, + "id": 85, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(kube_pod_status_reason{cluster=\"$cluster\"}) by (reason)", + "interval": "", + "legendFormat": "{{ reason }}", + "range": true, + "refId": "A" + } + ], + "title": 
"Kubernetes Pods Status Reason", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "No data is generally a good thing here.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "points", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 56 + }, + "id": 87, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(container_oom_events_total{cluster=\"$cluster\"}[$__rate_interval])) by (namespace) > 0", + "interval": "", + "legendFormat": "{{ namespace }}", + "range": true, + "refId": "A" + } + ], + "title": "OOM Events by namespace", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "No data is generally a good thing here.", 
+ "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "points", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 56 + }, + "id": 88, + "interval": "2m", + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(kube_pod_container_status_restarts_total{cluster=\"$cluster\"}[$__rate_interval])) by (namespace) > 0", + "interval": "", + "legendFormat": "{{ namespace }}", + "range": true, + "refId": "A" + } + ], + "title": "Container Restarts by namespace", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 65 + }, + "id": 69, + "panels": [], + "title": "Network", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Dropped noisy virtual devices for 
readability.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "BANDWIDTH", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 66 + }, + "id": 44, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"(veth|azv|lxc).*\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (device)", + "interval": "$resolution", + "legendFormat": "Received : {{ device }}", + "range": true, + "refId": "Linux Received" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(node_network_transmit_bytes_total{device!~\"(veth|azv|lxc).*\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (device)", + "interval": "$resolution", + "legendFormat": "Transmitted : {{ device }}", + "range": true, + 
"refId": "Linux Transmitted" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(windows_net_bytes_received_total{cluster=\"$cluster\"}[$__rate_interval])) by (nic)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Received : {{ nic }}", + "range": true, + "refId": "Windows Received" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(windows_net_bytes_sent_total{cluster=\"$cluster\"}[$__rate_interval])) by (nic)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Transmitted : {{ device }}", + "range": true, + "refId": "Windows Transmitted" + } + ], + "title": "Global Network Utilization by device", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "DROPPED PACKETS", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 66 + }, + "id": 53, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + 
"placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_network_receive_drop_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Linux Packets dropped (receive)", + "range": true, + "refId": "Linux Packets dropped (receive)" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(node_network_transmit_drop_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Linux Packets dropped (transmit)", + "range": true, + "refId": "Linux Packets dropped (transmit)" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(windows_net_packets_received_discarded_total{cluster=\"$cluster\"}[$__rate_interval]))", + "hide": false, + "interval": "$resolution", + "legendFormat": "Windows Packets dropped (receive)", + "range": true, + "refId": "Windows Packets dropped (receive)" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(windows_net_packets_outbound_discarded_total{cluster=\"$cluster\"}[$__rate_interval]))", + "hide": false, + "interval": "$resolution", + "legendFormat": "Windows Packets dropped (transmit)", + "range": true, + "refId": "Windows Packets dropped (transmit)" + } + ], + "title": "Network Saturation - Packets dropped", + "transformations": [ + { + "id": "calculateField", + "options": { + "alias": "Packets dropped (receive)", + "mode": "reduceRow", + "reduce": { + "include": [ + "Linux Packets dropped (receive)", + "Windows Packets dropped 
(receive)" + ], + "reducer": "mean" + } + } + }, + { + "id": "calculateField", + "options": { + "alias": "Packets dropped (transmit)", + "mode": "reduceRow", + "reduce": { + "include": [ + "Linux Packets dropped (transmit)", + "Windows Packets dropped (transmit)" + ], + "reducer": "mean" + } + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Linux Packets dropped (receive)": true, + "Linux Packets dropped (transmit)": true, + "Time": false, + "Windows Packets dropped (receive)": true, + "Windows Packets dropped (transmit)": true + }, + "includeByName": {}, + "indexByName": {}, + "renameByName": {} + } + } + ], + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "BANDWIDTH", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 74 + }, + "id": 79, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": 
"prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\"}[$__rate_interval])) by (namespace)", + "interval": "$resolution", + "legendFormat": "Received : {{ namespace }}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "- (sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\"}[$__rate_interval])) by (namespace))", + "hide": false, + "interval": "$resolution", + "legendFormat": "Transmitted : {{ namespace }}", + "range": true, + "refId": "B" + } + ], + "title": "Network Received by namespace", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "BANDWIDTH", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 74 + }, + "id": 80, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + 
"pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_network_receive_bytes_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "interval": "$resolution", + "legendFormat": "Received bytes in {{ instance }}", + "range": true, + "refId": "Linux Received bytes" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "- sum(rate(node_network_transmit_bytes_total{cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Transmitted bytes in {{ instance }}", + "range": true, + "refId": "Linux Transmitted bytes" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(windows_net_bytes_received_total{cluster=\"$cluster\"}[$__rate_interval])) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Received bytes in {{ instance }}", + "range": true, + "refId": "Windows Received bytes" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "- sum(rate(windows_net_bytes_sent_total{cluster=\"$cluster\"}[$__rate_interval])) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Transmitted bytes in {{ instance }}", + "range": true, + "refId": "Windows Transmitted bytes" + } + ], + "title": "Total Network Received (with all virtual devices) by instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Dropped noisy virtual devices for readability.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": 
"text", + "axisLabel": "BANDWIDTH", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 82 + }, + "id": 56, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_network_receive_bytes_total{device!~\"(veth|azv|lxc|lo).*\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "interval": "$resolution", + "legendFormat": "Received bytes in {{ instance }}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "- sum(rate(node_network_transmit_bytes_total{device!~\"(veth|azv|lxc|lo).*\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Transmitted bytes in {{ instance }}", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": 
"sum(rate(windows_net_bytes_received_total{nic!~\".*Virtual.*\",cluster=\"$cluster\"}[$__rate_interval])) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Received bytes in {{ instance }}", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(windows_net_bytes_sent_total{nic!~\".*Virtual.*\",cluster=\"$cluster\"}[$__rate_interval])) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Transmitted bytes in {{ instance }}", + "range": true, + "refId": "D" + } + ], + "title": "Network Received (without loopback) by instance", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Dropped noisy virtual devices for readability.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "BANDWIDTH", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 82 + }, + "id": 81, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom", + "showLegend": false + }, + 
"tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(node_network_receive_bytes_total{device=\"lo\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "interval": "$resolution", + "legendFormat": "Received bytes in {{ instance }}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "- sum(rate(node_network_transmit_bytes_total{device=\"lo\", cluster=\"$cluster\", job=\"$job\"}[$__rate_interval])) by (instance)", + "hide": false, + "interval": "$resolution", + "legendFormat": "Transmitted bytes in {{ instance }}", + "range": true, + "refId": "B" + } + ], + "title": "Network Received (loopback only) by instance", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "30s", + "schemaVersion": 40, + "tags": [ + "Kubernetes", + "Prometheus" + ], + "templating": { + "list": [ + { + "current": { + "text": "prometheus", + "value": "fe6kx1tpffym8a" + }, + "includeAll": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": { + "text": "c-hcjx5tis6bc", + "value": "c-hcjx5tis6bc" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_node_info,cluster)", + "includeAll": false, + "name": "cluster", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kube_node_info,cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "text": "30s", + "value": "30s" + }, + "includeAll": false, + "name": "resolution", + "options": [ + { + "selected": false, + "text": "1s", + "value": "1s" + }, + { + 
"selected": false, + "text": "15s", + "value": "15s" + }, + { + "selected": true, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "3m", + "value": "3m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + } + ], + "query": "1s, 15s, 30s, 1m, 3m, 5m", + "type": "custom" + }, + { + "current": { + "text": [ + "node-exporter" + ], + "value": [ + "node-exporter" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(node_cpu_seconds_total{cluster=\"$cluster\"},job)", + "includeAll": false, + "multi": true, + "name": "job", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(node_cpu_seconds_total{cluster=\"$cluster\"},job)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Kubernetes / Views / Global", + "uid": "k8s_views_global", + "version": 4, + "weekStart": "" +} \ No newline at end of file diff --git a/observability/grafana/dashboards/falkordb-cloud.json b/observability/grafana/dashboards/falkordb-cloud.json new file mode 100644 index 00000000..7cdda7cc --- /dev/null +++ b/observability/grafana/dashboards/falkordb-cloud.json @@ -0,0 +1,1155 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "FalkorDB Dashboard", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "rgb(31, 120, 193)", + 
"mode": "fixed" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 9, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "max(max_over_time(___customer_redis_uptime_in_seconds{pod=~\"$pod\"}[$__interval]))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "range": true, + "refId": "A", + "step": 1800 + } + ], + "title": "Max Uptime", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "rgb(31, 120, 193)", + "mode": "fixed" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 4, + "y": 0 + }, + "hideTimeOverride": true, + "id": 12, + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + 
"percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(___customer_redis_connected_clients{pod=~\"$pod\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 2 + } + ], + "timeFrom": "1m", + "title": "Clients", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 8, + "y": 0 + }, + "hideTimeOverride": true, + "id": 11, + "maxDataPoints": 100, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(100 * sum(___customer_redis_memory_used_bytes{pod=~\"$pod\"}) / sum(___customer_redis_memory_max_bytes{pod=~\"$pod\"}))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "range": true, + "refId": "A", + 
"step": 2 + } + ], + "timeFrom": "1m", + "title": "Memory Usage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 18, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(___customer_redis_commands_total{pod=~\"$pod\"} [1m])) by (cmd)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ cmd }}", + "metric": "___customer_redis_command_calls_total", + "refId": "A", + "step": 240 + } + ], + "title": "Total Commands / sec", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "max" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#BF1B00", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "___customer_redis_memory_used_bytes{pod=~\"$pod\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "used, {{ pod }}", + "metric": "", + "refId": "A", + "step": 240, + "target": "" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "___customer_redis_memory_max_bytes{pod=~\"$pod\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "max, {{ pod }}", + "refId": "B", + "step": 240 + } + ], + "title": "Total Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(___customer_redis_net_input_bytes_total{pod=~\"$pod\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ input }}", + "refId": "A", + "step": 240 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(___customer_redis_net_output_bytes_total{pod=~\"$pod\"}[5m]))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ output }}", + "refId": "B", + "step": 240 + } + ], + "title": "Network I/O", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { 
+ "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 70, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsZero", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum (___customer_redis_db_keys{pod=~\"$pod\"}) by (db, pod)", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ db }}, {{ pod }}", + "refId": "A", + "step": 240, + "target": "" + } + ], + "title": "Total Items per DB", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(___customer_redis_connected_clients{pod=~\"$pod\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "connected", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(___customer_redis_blocked_clients{pod=~\"$pod\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "blocked", + "refId": "B" + } + ], + "title": "Connected/Blocked Clients", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + 
"axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [ + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsZero", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 20, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(irate(___customer_redis_commands_duration_seconds_total{pod =~ \"$pod\"}[1m])) by (cmd)\n /\nsum(irate(___customer_redis_commands_total{pod =~ \"$pod\"}[1m])) by (cmd)\n", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ cmd }}", + "metric": "___customer_redis_command_calls_total", + "refId": "A", + "step": 240 + } + ], + "title": "Average Time Spent by Command / sec", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + 
"axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(irate(___customer_redis_commands_duration_seconds_total{pod=~\"$pod\"}[1m])) by (cmd) != 0", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ cmd }}", + "metric": "___customer_redis_command_calls_total", + "refId": "A", + "step": 240 + } + ], + "title": "Total Time Spent by Command / sec", + "type": "timeseries" + } + ], + "refresh": "", + "schemaVersion": 40, + "tags": [ + "prometheus" + ], + "templating": { + "list": [ + { + "current": { + "text": "prometheus", + "value": "fe6kx1tpffym8a" + }, + "includeAll": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" 
+ }, + "definition": "label_values(___customer_redis_uptime_in_seconds, namespace)", + "includeAll": false, + "name": "namespace", + "options": [], + "query": "label_values(___customer_redis_uptime_in_seconds, namespace)", + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(___customer_redis_uptime_in_seconds{namespace=~\"$namespace\"}, pod)", + "includeAll": false, + "multi": true, + "name": "pod", + "options": [], + "query": "label_values(___customer_redis_uptime_in_seconds{namespace=~\"$namespace\"}, pod)", + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "FalkorDB Dashboard", + "uid": "e008bc3f-81a2-40f9-baf2-a33fd8dec7eg", + "version": 1, + "weekStart": "" +} \ No newline at end of file diff --git a/observability/grafana/dashboards/namespace-overview.json b/observability/grafana/dashboards/namespace-overview.json new file mode 100644 index 00000000..73002fc7 --- /dev/null +++ b/observability/grafana/dashboards/namespace-overview.json @@ -0,0 +1,2754 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "This is a modern 'Namespaces View' dashboard for your Kubernetes cluster(s). Made for kube-prometheus-stack and take advantage of the latest Grafana features. 
GitHub repository: https://github.com/dotdc/grafana-dashboards-kubernetes", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": 2, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 38, + "panels": [], + "title": "Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 50 + }, + { + "color": "red", + "value": 70 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 46, + "interval": "2m", + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\", image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) / sum(machine_cpu_cores{cluster=\"$cluster\"})", + "instant": true, + "interval": "", + "legendFormat": "", + "range": false, + "refId": "A" + } + ], + "title": "Namespace(s) usage on total cluster CPU in %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 50 + }, + { + "color": "red", + 
"value": 70 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 48, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto", + "text": {} + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(container_memory_working_set_bytes{namespace=~\"$namespace\", image!=\"\", cluster=\"$cluster\"}) / sum(machine_memory_bytes{cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Namespace(s) usage on total cluster RAM in %", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 32, + "options": { + "legend": { + "calcs": [ + "min", 
+ "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_pod_info{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Running Pods", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_service_info{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Services", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_ingress_info{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Ingresses", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_deployment_labels{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Deployments", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_statefulset_labels{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Statefulsets", + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_daemonset_labels{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Daemonsets", + "refId": "F" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_persistentvolumeclaim_info{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Persistent Volume Claims", + "refId": "G" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": 
"sum(kube_hpa_labels{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Horizontal Pod Autoscalers", + "refId": "H" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_configmap_info{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Configmaps", + "refId": "I" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_secret_info{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Secrets", + "refId": "J" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_networkpolicy_labels{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Network Policies", + "refId": "K" + } + ], + "title": "Kubernetes Resource Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 8 + }, + "id": 62, + "interval": "2m", + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\", image!=\"\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "", + 
"legendFormat": "Real", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_requests{namespace=~\"$namespace\", resource=\"cpu\", cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Requests", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_limits{namespace=~\"$namespace\", resource=\"cpu\", cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Limits", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(machine_cpu_cores{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Cluster Total", + "range": true, + "refId": "D" + } + ], + "title": "Namespace(s) CPU Usage in cores", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 8 + }, + "id": 64, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(container_memory_working_set_bytes{namespace=~\"$namespace\", image!=\"\", cluster=\"$cluster\"})", + 
"interval": "", + "legendFormat": "Real", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_requests{namespace=~\"$namespace\", resource=\"memory\", cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Requests", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_resource_limits{namespace=~\"$namespace\", resource=\"memory\", cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Limits", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(machine_memory_bytes{cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Cluster Total", + "range": true, + "refId": "D" + } + ], + "title": "Namespace(s) RAM Usage in bytes", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 40, + "panels": [], + "title": "Resources", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU CORES", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + 
"thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 13 + }, + "id": 29, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\", image!=\"\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "CPU usage by Pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 13 + }, 
+ "id": 30, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(container_memory_working_set_bytes{namespace=~\"$namespace\", image!=\"\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "Memory usage by Pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "SECONDS", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 68, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": 
true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_cpu_cfs_throttled_seconds_total{namespace=~\"$namespace\", image!=\"\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (pod) > 0", + "interval": "$resolution", + "legendFormat": "{{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Throttled seconds by pod", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 29 + }, + "id": 73, + "panels": [], + "title": "Kubernetes", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 70, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": 
true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(kube_pod_status_qos_class{namespace=~\"$namespace\", cluster=\"$cluster\"}) by (qos_class)", + "interval": "", + "legendFormat": "{{ qos_class }} pods", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_info{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "hide": false, + "legendFormat": "Total pods", + "range": true, + "refId": "B" + } + ], + "title": "Kubernetes Pods QoS classes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 30 + }, + "id": 72, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + 
"sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(kube_pod_status_reason{cluster=\"$cluster\"}) by (reason)", + "interval": "", + "legendFormat": "{{ reason }}", + "range": true, + "refId": "A" + } + ], + "title": "Kubernetes Pods Status Reason", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "No data is generally a good thing here.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "points", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 39 + }, + "id": 74, + "interval": "2m", + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + 
"uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(container_oom_events_total{namespace=~\"${namespace}\", cluster=\"$cluster\"}[$__rate_interval])) by (namespace, pod) > 0", + "interval": "", + "legendFormat": "namespace: {{ namespace }} - pod: {{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "OOM Events by namespace, pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "No data is generally a good thing here.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "points", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 39 + }, + "id": 75, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": 
"sum(increase(kube_pod_container_status_restarts_total{namespace=~\"${namespace}\", cluster=\"$cluster\"}[$__rate_interval])) by (namespace, pod) > 0", + "interval": "", + "legendFormat": "namespace: {{ namespace }} - pod: {{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "Container Restarts by namespace, pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 48 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_status_ready{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Ready", + "range": true, + "refId": "A" + }, + { + "datasource": { + 
"type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_status_running{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Running", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_pod_container_status_waiting{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Waiting", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_pod_container_status_restarts_total{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Restarts Total", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_pod_container_status_terminated{namespace=~\"$namespace\", cluster=\"$cluster\"})", + "interval": "", + "legendFormat": "Terminated", + "refId": "E" + } + ], + "title": "Nb of pods by state", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": 
"green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 48 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_container_info{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}) by (pod)", + "interval": "", + "legendFormat": "{{ pod }}", + "range": true, + "refId": "A" + } + ], + "title": "Nb of containers by pod", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "List of pods that are not in Running or Succeeded status.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 56 + }, + "id": 83, + "options": { + "legend": 
{ + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(kube_pod_status_phase{phase!~\"Running|Succeeded\", namespace=~\"$namespace\", cluster=\"$cluster\"}) by (pod) > 0\n", + "interval": "", + "legendFormat": "{{ deployment }}", + "range": true, + "refId": "A" + } + ], + "title": "Pods with unexpected status", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "bars", + "fillOpacity": 25, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 56 + }, + "id": 82, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "last" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"${datasource}" + }, + "editorMode": "code", + "expr": "count(rate(container_cpu_usage_seconds_total{namespace=~\"$namespace\", image!=\"\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (image)", + "interval": "", + "legendFormat": "{{ image }}", + "range": true, + "refId": "A" + } + ], + "title": "Container Image Used", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 64 + }, + "id": 42, + "panels": [], + "title": "Kubernetes Storage", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 65 + }, + "id": 65, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": 
"sum(kubelet_volume_stats_used_bytes{namespace=~\"$namespace\", cluster=\"$cluster\"}) by (persistentvolumeclaim) / sum(kubelet_volume_stats_capacity_bytes{namespace=~\"$namespace\", cluster=\"$cluster\"}) by (persistentvolumeclaim)", + "interval": "", + "legendFormat": "{{ persistentvolumeclaim }}", + "range": true, + "refId": "A" + } + ], + "title": "Persistent Volumes - Capacity and usage in %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 65 + }, + "id": 66, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(kubelet_volume_stats_used_bytes{namespace=~\"$namespace\", cluster=\"$cluster\"}) by (persistentvolumeclaim)", + 
"interval": "", + "legendFormat": "{{ persistentvolumeclaim }} - Used", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(kubelet_volume_stats_capacity_bytes{namespace=~\"$namespace\", cluster=\"$cluster\"}) by (persistentvolumeclaim)", + "hide": false, + "interval": "", + "legendFormat": "{{ persistentvolumeclaim }} - Capacity", + "refId": "B" + } + ], + "title": "Persistent Volumes - Capacity and usage in bytes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 73 + }, + "id": 27, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "1 - 
sum(kubelet_volume_stats_inodes_used{namespace=~\"$namespace\", cluster=\"$cluster\"}) by (persistentvolumeclaim) / sum(kubelet_volume_stats_inodes{namespace=~\"$namespace\", cluster=\"$cluster\"}) by (persistentvolumeclaim)", + "interval": "", + "legendFormat": "{{ persistentvolumeclaim }}", + "range": true, + "refId": "A" + } + ], + "title": "Persistent Volumes - Inodes", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 81 + }, + "id": 76, + "panels": [], + "title": "Network", + "type": "row" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 82 + }, + "id": 78, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + 
"expr": "sum(rate(container_network_receive_bytes_total{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "Received - {{ pod }}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_bytes_total{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "Transmitted - {{ pod }}", + "range": true, + "refId": "B" + } + ], + "title": "Network - Bandwidth by pod", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 82 + }, + "id": 79, + "interval": "2m", + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": 
"multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_network_receive_packets_total{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "Received - {{ pod }}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_packets_total{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "Transmitted - {{ pod }}", + "range": true, + "refId": "B" + } + ], + "title": "Network - Packets Rate by pod", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, 
+ "x": 0, + "y": 90 + }, + "id": 80, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_network_receive_packets_dropped_total{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "Received - {{ pod }}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_packets_dropped_total{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "Transmitted - {{ pod }}", + "range": true, + "refId": "B" + } + ], + "title": "Network - Packets Dropped by pod", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + 
"mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 90 + }, + "id": 81, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_network_receive_errors_total{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "Received - {{ pod }}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_errors_total{namespace=~\"$namespace\", pod=~\"${created_by}.*\", cluster=\"$cluster\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "Transmitted - {{ pod }}", + "range": true, + "refId": "B" + } + ], + "title": "Network - Errors by pod", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "30s", + "schemaVersion": 40, + "tags": [ + "Kubernetes", + "Prometheus" + ], + "templating": { + "list": [ + { + "current": { + "text": "prometheus", + "value": "fe6kx1tpffym8a" + }, + "includeAll": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": { + "text": "c-hcjx5tis6bc", + "value": "c-hcjx5tis6bc" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_node_info,cluster)", + "includeAll": false, + "name": "cluster", + "options": [], + "query": { + "qryType": 1, + 
"query": "label_values(kube_node_info,cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": { + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "includeAll": true, + "multi": true, + "name": "namespace", + "options": [], + "query": { + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "text": "5m", + "value": "5m" + }, + "includeAll": false, + "name": "resolution", + "options": [ + { + "selected": false, + "text": "1s", + "value": "1s" + }, + { + "selected": false, + "text": "15s", + "value": "15s" + }, + { + "selected": false, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "3m", + "value": "3m" + }, + { + "selected": true, + "text": "5m", + "value": "5m" + } + ], + "query": "1s, 15s, 30s, 1m, 3m, 5m", + "type": "custom" + }, + { + "current": { + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_pod_info{namespace=~\"$namespace\", cluster=\"$cluster\"},created_by_name)", + "description": "Can be used to filter on a specific deployment, statefulset or deamonset (only relevant panels).", + "includeAll": true, + "multi": true, + "name": "created_by", + "options": [], + "query": { + "query": "label_values(kube_pod_info{namespace=~\"$namespace\", cluster=\"$cluster\"},created_by_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": 
"now" + }, + "timepicker": {}, + "timezone": "", + "title": "Kubernetes / Views / Namespaces", + "uid": "k8s_views_ns", + "version": 2, + "weekStart": "" +} \ No newline at end of file diff --git a/observability/grafana/dashboards/pod-overview.json b/observability/grafana/dashboards/pod-overview.json new file mode 100644 index 00000000..ea8f182a --- /dev/null +++ b/observability/grafana/dashboards/pod-overview.json @@ -0,0 +1,2494 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "This is a modern 'Pods View' dashboard for your Kubernetes cluster(s). Made for kube-prometheus-stack and take advantage of the latest Grafana features. GitHub repository: https://github.com/dotdc/grafana-dashboards-kubernetes", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": 4, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 43, + "panels": [], + "title": "Information", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Panel only works when a single pod is selected.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "fillOpacity": 70, + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [ + { + "options": { + "0": { + "color": "purple", + "index": 0 + }, + "1": { + "color": "blue", + "index": 1 + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + 
"unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 63, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "kube_pod_status_phase{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}", + "format": "table", + "instant": false, + "interval": "", + "legendFormat": "{{ phase }}", + "range": true, + "refId": "A" + } + ], + "title": "Status", + "transformations": [ + { + "id": "groupingToMatrix", + "options": { + "columnField": "phase", + "rowField": "Time", + "valueField": "Value" + } + }, + { + "id": "convertFieldType", + "options": { + "conversions": [ + { + "destinationType": "time", + "targetField": "Time\\phase" + } + ], + "fields": {} + } + } + ], + "type": "state-timeline" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Panel only works when a single pod is selected.", + "fieldConfig": { + "defaults": { + "links": [ + { + "title": "", + "url": "/d/k8s_views_nodes/kubernetes-views-nodes?var-datasource=${datasource}&var-node=${__field.labels.node}" + } + ], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 33, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + 
"textMode": "name", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "kube_pod_info{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}", + "instant": true, + "interval": "", + "legendFormat": "{{ node }}", + "refId": "A" + } + ], + "title": "Running on", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Panel only works when a single pod is selected.", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(255, 255, 255)", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 41, + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "kube_pod_info{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}", + "instant": true, + "interval": "", + "legendFormat": "{{ pod_ip }}", + "refId": "A" + } + ], + "title": "Pod IP", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Panel only works when a single pod is selected.", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 
6, + "x": 12, + "y": 4 + }, + "id": 56, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "kube_pod_container_status_last_terminated_reason{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}", + "instant": true, + "interval": "", + "legendFormat": "{{ reason }}", + "refId": "A" + } + ], + "title": "Last Terminated Reason", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Panel only works when a single pod is selected.", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + }, + { + "color": "red", + "value": 1 + }, + { + "color": "#EAB839", + "value": 2 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 4 + }, + "id": 57, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [], + "fields": "", + "values": true + }, + "showPercentChange": false, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "kube_pod_container_status_last_terminated_exitcode{namespace=\"$namespace\", pod=\"$pod\", cluster=\"$cluster\"}", + "instant": true, + "interval": "", + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + 
"title": "Last Terminated Exit Code", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 47, + "panels": [], + "title": "Resources", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "fixed" + }, + "decimals": 2, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 60 + }, + { + "color": "red", + "value": 75 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 3, + "x": 0, + "y": 9 + }, + "id": 39, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) / sum(kube_pod_container_resource_requests{namespace=\"$namespace\", pod=~\"$pod\", resource=\"cpu\", job=~\"$job\", cluster=\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "Requests", + "refId": "A" + } + ], + "title": "Total pod CPU Requests usage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 60 + }, + { + "color": "red", + "value": 75 
+ } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 3, + "x": 3, + "y": 9 + }, + "id": 48, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) / sum(kube_pod_container_resource_limits{namespace=\"$namespace\", pod=~\"$pod\", resource=\"cpu\", job=~\"$job\", cluster=\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "Limits", + "refId": "A" + } + ], + "title": "Total pod CPU Limits usage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "fixed" + }, + "decimals": 2, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "blue", + "value": null + }, + { + "color": "#EAB839", + "value": 80 + }, + { + "color": "red", + "value": 99 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 3, + "x": 6, + "y": 9 + }, + "id": 40, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": 
"sum(container_memory_working_set_bytes{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", cluster=\"$cluster\"}) / sum(kube_pod_container_resource_requests{namespace=\"$namespace\", pod=~\"$pod\", resource=\"memory\", job=~\"$job\", cluster=\"$cluster\"})", + "instant": true, + "interval": "$resolution", + "legendFormat": "Requests", + "refId": "A" + } + ], + "title": "Total pod RAM Requests usage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 60 + }, + { + "color": "red", + "value": 75 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 3, + "x": 9, + "y": 9 + }, + "id": 49, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(container_memory_working_set_bytes{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", cluster=\"$cluster\"}) / sum(kube_pod_container_resource_limits{namespace=\"$namespace\", pod=~\"$pod\", resource=\"memory\", job=~\"$job\", cluster=\"$cluster\"}) ", + "instant": true, + "interval": "$resolution", + "legendFormat": "Limits", + "refId": "B" + } + ], + "title": "Total pod RAM Limits usage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": 
false, + "minWidth": 100 + }, + "decimals": 4, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "blue", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Memory Requests" + }, + "properties": [ + { + "id": "unit", + "value": "bytes" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Memory Limits" + }, + "properties": [ + { + "id": "unit", + "value": "bytes" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Memory Used" + }, + "properties": [ + { + "id": "unit", + "value": "bytes" + }, + { + "id": "decimals", + "value": 2 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 38, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(kube_pod_container_resource_requests{namespace=\"$namespace\", pod=~\"$pod\", resource=\"cpu\", job=~\"$job\", cluster=\"$cluster\"}) by (container)", + "format": "table", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(kube_pod_container_resource_limits{namespace=\"$namespace\", pod=~\"$pod\", resource=\"cpu\", job=~\"$job\", cluster=\"$cluster\"}) by (container)", + "format": "table", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": 
false, + "expr": "sum(kube_pod_container_resource_requests{namespace=\"$namespace\", pod=~\"$pod\", resource=\"memory\", job=~\"$job\", cluster=\"$cluster\"}) by (container)", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(kube_pod_container_resource_limits{namespace=\"$namespace\", pod=~\"$pod\", resource=\"memory\", job=~\"$job\", cluster=\"$cluster\"}) by (container)", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", container!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (container)", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "E" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(container_memory_working_set_bytes{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", container!=\"\", cluster=\"$cluster\"}) by (container)", + "format": "table", + "hide": false, + "instant": true, + "range": false, + "refId": "F" + } + ], + "title": "Resources by container", + "transformations": [ + { + "id": "seriesToColumns", + "options": { + "byField": "container" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Time 1": true, + "Time 2": true, + "Time 4": true, + "__name__": true, + "__name__ 1": true, + "__name__ 2": true, + "__name__ 3": true, + "__name__ 4": true, + "container": false, + "endpoint": true, + "endpoint 2": true, + "endpoint 3": true, + "endpoint 4": true, + "instance": true, + "instance 2": 
true, + "instance 3": true, + "instance 4": true, + "job": true, + "job 2": true, + "job 3": true, + "job 4": true, + "namespace": true, + "namespace 2": true, + "namespace 3": true, + "namespace 4": true, + "node": true, + "node 2": true, + "node 3": true, + "node 4": true, + "pod": true, + "pod 2": true, + "pod 3": true, + "pod 4": true, + "resource 1": true, + "resource 2": true, + "resource 3": true, + "resource 4": true, + "service": true, + "service 2": true, + "service 3": true, + "service 4": true, + "uid 1": true, + "uid 2": true, + "uid 3": true, + "uid 4": true, + "unit 1": true, + "unit 2": true, + "unit 3": true, + "unit 4": true + }, + "indexByName": { + "Time 1": 7, + "Time 2": 8, + "Time 3": 9, + "Time 4": 10, + "Time 5": 11, + "Time 6": 12, + "Value #A": 2, + "Value #B": 3, + "Value #C": 5, + "Value #D": 6, + "Value #E": 1, + "Value #F": 4, + "container": 0 + }, + "renameByName": { + "Value #A": "CPU Requests", + "Value #B": "CPU Limits", + "Value #C": "Memory Requests", + "Value #D": "Memory Limits", + "Value #E": "CPU Used", + "Value #F": "Memory Used", + "container": "Container" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Percent", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "max": 1, + "min": 0, + 
"thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 20 + }, + { + "color": "green", + "value": 30 + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 50, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (container) / sum(kube_pod_container_resource_requests{namespace=\"$namespace\", pod=~\"$pod\", resource=\"cpu\", job=~\"$job\", cluster=\"$cluster\"}) by (container)", + "interval": "$resolution", + "legendFormat": "{{ container }} REQUESTS", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (container) / sum(kube_pod_container_resource_limits{namespace=\"$namespace\", pod=~\"$pod\", resource=\"cpu\", job=~\"$job\", cluster=\"$cluster\"}) by (container)", + "hide": false, + "legendFormat": "{{ container }} LIMITS", + "range": true, + "refId": "B" + } + ], + "title": "CPU Usage / Requests & Limits by container", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "thresholds" + }, + "custom": { + "axisBorderShow": 
false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Percent", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 20 + }, + { + "color": "green", + "value": 30 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 17 + }, + "id": 30, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(container_memory_working_set_bytes{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", cluster=\"$cluster\"}) by (container) / sum(kube_pod_container_resource_requests{namespace=\"$namespace\", pod=~\"$pod\", resource=\"memory\", job=~\"$job\", cluster=\"$cluster\"}) by (container)", + "interval": "", + "legendFormat": "{{ container }} REQUESTS", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": 
"sum(container_memory_working_set_bytes{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", cluster=\"$cluster\"}) by (container) / sum(kube_pod_container_resource_limits{namespace=\"$namespace\", pod=~\"$pod\", resource=\"memory\", job=~\"$job\", cluster=\"$cluster\"}) by (container)", + "hide": false, + "legendFormat": "{{ container }} LIMITS", + "range": true, + "refId": "B" + } + ], + "title": "Memory Usage / Requests & Limits by container", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU Cores", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 4, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "limit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F2495C", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 25 + }, + "id": 29, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": 
"desc" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", container!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (container, id)", + "interval": "$resolution", + "legendFormat": "{{ container }}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Usage by container", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 25 + }, + "id": 51, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + 
}, + "editorMode": "code", + "exemplar": true, + "expr": "sum(container_memory_working_set_bytes{namespace=\"$namespace\", pod=~\"$pod\", image!=\"\", container!=\"\", cluster=\"$cluster\"}) by (container, id)", + "interval": "", + "legendFormat": "{{ container }}", + "range": true, + "refId": "A" + } + ], + "title": "Memory Usage by container", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "SECONDS", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 33 + }, + "id": 59, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": 
"sum(rate(container_cpu_cfs_throttled_seconds_total{namespace=~\"$namespace\", pod=~\"$pod\", image!=\"\", container!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (container)", + "interval": "$resolution", + "legendFormat": "{{ container }}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Throttled seconds by container", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 41 + }, + "id": 62, + "panels": [], + "title": "Kubernetes", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "thresholds" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Percent", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 20 + }, + { + "color": "green", + "value": 30 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 60, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, 
+ "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(container_oom_events_total{namespace=\"${namespace}\", pod=\"${pod}\", container!=\"\", cluster=\"$cluster\"}[$__rate_interval])) by (container)", + "interval": "", + "legendFormat": "{{ container }}", + "range": true, + "refId": "A" + } + ], + "title": "OOM Events by container", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "blue", + "mode": "thresholds" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Percent", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "area" + } + }, + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 20 + }, + { + "color": "green", + "value": 30 + }, + { + "color": "#EAB839", + "value": 70 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 42 + }, + "id": 61, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.0", + 
"targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=~\"${namespace}\", pod=\"${pod}\", container!=\"\", job=~\"$job\", cluster=\"$cluster\"}[$__rate_interval])) by (container)", + "interval": "", + "legendFormat": "{{ container }}", + "range": true, + "refId": "A" + } + ], + "title": "Container Restarts by container", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 50 + }, + "id": 45, + "panels": [], + "title": "Network", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 51 + }, + "id": 31, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": 
"prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\", pod=~\"$pod\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Received", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_bytes_total{namespace=\"$namespace\", pod=~\"$pod\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Transmitted", + "refId": "B" + } + ], + "title": "Network - Bandwidth", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 51 + }, + "id": 34, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": 
"prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(container_network_receive_packets_total{namespace=\"$namespace\", pod=~\"$pod\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Received", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_packets_total{namespace=\"$namespace\", pod=~\"$pod\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Transmitted", + "refId": "B" + } + ], + "title": "Network - Packets Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 59 + }, + "id": 36, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + 
"type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(container_network_receive_packets_dropped_total{namespace=\"$namespace\", pod=~\"$pod\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Received", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_packets_dropped_total{namespace=\"$namespace\", pod=~\"$pod\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Transmitted", + "refId": "B" + } + ], + "title": "Network - Packets Dropped", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 59 + }, + "id": 37, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": 
[ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "sum(rate(container_network_receive_errors_total{namespace=\"$namespace\", pod=~\"$pod\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Received", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_errors_total{namespace=\"$namespace\", pod=~\"$pod\", cluster=\"$cluster\"}[$__rate_interval]))", + "interval": "$resolution", + "legendFormat": "Transmitted", + "refId": "B" + } + ], + "title": "Network - Errors", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "30s", + "schemaVersion": 40, + "tags": [ + "Kubernetes", + "Prometheus" + ], + "templating": { + "list": [ + { + "current": { + "text": "prometheus", + "value": "fe6kx1tpffym8a" + }, + "includeAll": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": { + "text": "c-hcjx5tis6bc", + "value": "c-hcjx5tis6bc" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_node_info,cluster)", + "includeAll": false, + "name": "cluster", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kube_node_info,cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "text": "instance-4xvmf1fdb", + "value": "instance-4xvmf1fdb" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "includeAll": false, + "name": "namespace", + "options": [], + "query": { + "query": "label_values(kube_pod_info{cluster=\"$cluster\"}, namespace)", + "refId": "Prometheus-namespace-Variable-Query" + }, 
+ "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": { + "text": [ + "cluster-mz-0" + ], + "value": [ + "cluster-mz-0" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_pod_info{namespace=\"$namespace\", cluster=\"$cluster\"}, pod)", + "includeAll": true, + "multi": true, + "name": "pod", + "options": [], + "query": { + "query": "label_values(kube_pod_info{namespace=\"$namespace\", cluster=\"$cluster\"}, pod)", + "refId": "Prometheus-pod-Variable-Query" + }, + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "text": "5m", + "value": "5m" + }, + "includeAll": false, + "name": "resolution", + "options": [ + { + "selected": false, + "text": "1s", + "value": "1s" + }, + { + "selected": false, + "text": "15s", + "value": "15s" + }, + { + "selected": false, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "3m", + "value": "3m" + }, + { + "selected": true, + "text": "5m", + "value": "5m" + } + ], + "query": "1s, 15s, 30s, 1m, 3m, 5m", + "type": "custom" + }, + { + "current": { + "text": [ + "kube-state-metrics" + ], + "value": [ + "kube-state-metrics" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_pod_info{namespace=\"$namespace\", cluster=\"$cluster\"},job)", + "includeAll": false, + "multi": true, + "name": "job", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kube_pod_info{namespace=\"$namespace\", cluster=\"$cluster\"},job)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Kubernetes / Views / Pods", + "uid": "k8s_views_pods", + "version": 6, + 
"weekStart": "" +} \ No newline at end of file diff --git a/observability/grafana/dashboards/prometheus-dashboard.json b/observability/grafana/dashboards/prometheus-dashboard.json new file mode 100644 index 00000000..2e1c146f --- /dev/null +++ b/observability/grafana/dashboards/prometheus-dashboard.json @@ -0,0 +1,1428 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "This is a modern 'Prometheus' dashboard for your Kubernetes cluster(s). Made for kube-prometheus-stack and take advantage of the latest Grafana features. GitHub repository: https://github.com/dotdc/grafana-dashboards-kubernetes", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": 5, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 89, + "panels": [], + "title": "Information", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "?", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "orange", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 78, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, 
+ "editorMode": "code", + "exemplar": false, + "expr": "prometheus_build_info{pod=~\"$pod\", cluster=~\"$cluster\"}", + "instant": true, + "interval": "", + "legendFormat": "{{ version }}", + "range": false, + "refId": "A" + } + ], + "title": "Prometheus version", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 92, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "up{pod=~\"$pod\", cluster=~\"$cluster\", job=\"$job\", namespace=\"$namespace\"} < 1", + "instant": true, + "interval": "", + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Instance Down", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "noValue": "0", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 72, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [], + "fields": "", + "values": 
false + }, + "showPercentChange": false, + "text": {}, + "textMode": "value", + "wideLayout": true + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(prometheus_tsdb_head_series{pod=~\"$pod\", cluster=~\"$cluster\"}) by (pod)", + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "TSDB Head Series", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 47, + "panels": [], + "title": "Resources", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU Cores", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 4, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "limit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F2495C", + "mode": "fixed" + } + }, + { + "id": "custom.fillOpacity", + "value": 0 + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 6 + }, + "id": 29, + "options": { + "legend": { + 
"calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$pod\", image!=\"\", container!=\"\", cluster=~\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (pod, container)", + "interval": "$resolution", + "legendFormat": "{{ pod }} - {{ container }}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Usage by pod, container", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Bytes", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 51, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": false 
+ }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(container_memory_working_set_bytes{pod=~\"$pod\", image!=\"\", container!=\"\", cluster=~\"$cluster\", namespace=\"$namespace\"}) by (pod, container)", + "interval": "", + "legendFormat": "{{ pod }} - {{ container }}", + "range": true, + "refId": "A" + } + ], + "title": "Memory Usage by container", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 66, + "panels": [], + "title": "Storage", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 62, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + 
"tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(kubelet_volume_stats_used_bytes{persistentvolumeclaim=~\".*prom.*\", cluster=~\"$cluster\",namespace=~\"$namespace\"}) by (persistentvolumeclaim) / sum(kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~\".*prom.*\", cluster=~\"$cluster\", namespace=~\"$namespace\"}) by (persistentvolumeclaim)", + "interval": "", + "legendFormat": "{{ persistentvolumeclaim }}", + "range": true, + "refId": "A" + } + ], + "title": "Persistent Volumes - Capacity and usage in %", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 87, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + 
"tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(kubelet_volume_stats_used_bytes{persistentvolumeclaim=~\".*prom.*\", cluster=~\"$cluster\",namespace=~\"$namespace\"}) by (persistentvolumeclaim)", + "interval": "", + "legendFormat": "{{ persistentvolumeclaim }} - Used", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~\".*prom.*\", cluster=~\"$cluster\",namespace=~\"$namespace\"}) by (persistentvolumeclaim)", + "hide": false, + "legendFormat": "{{ persistentvolumeclaim }} - Capacity", + "range": true, + "refId": "B" + } + ], + "title": "Persistent Volumes - Capacity and usage in bytes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": 
[] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 23 + }, + "id": 68, + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "1 - sum(kubelet_volume_stats_inodes_used{persistentvolumeclaim=~\".*prom.*\", cluster=~\"$cluster\",namespace=~\"$namespace\"}) by (persistentvolumeclaim) / sum(kubelet_volume_stats_inodes{persistentvolumeclaim=~\".*prom.*\", cluster=~\"$cluster\",namespace=~\"$namespace\"}) by (persistentvolumeclaim)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ persistentvolumeclaim }}", + "range": true, + "refId": "A", + "step": 240 + } + ], + "title": "Persistent Volumes - Inodes", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 45, + "panels": [], + "title": "Network", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + 
{ + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 31, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_network_receive_bytes_total{pod=~\"$pod\", cluster=~\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }} - Received", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_bytes_total{pod=~\"$pod\", cluster=~\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }} - Transmitted", + "range": true, + "refId": "B" + } + ], + "title": "Network - Bandwidth by pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + 
"mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 34, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_network_receive_packets_total{pod=~\"$pod\", cluster=~\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }} - Received", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_packets_total{pod=~\"$pod\", cluster=~\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }} - Transmitted", + "range": true, + "refId": "B" + } + ], + "title": "Network - Packets rate by pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + 
"pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 36, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_network_receive_packets_dropped_total{pod=~\"$pod\", cluster=~\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }} - Received", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_packets_dropped_total{pod=~\"$pod\", cluster=~\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }} - Transmitted", + "range": true, + "refId": "B" + } + ], + "title": "Network - Packets Dropped by pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": 
"opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 37, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.3.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_network_receive_errors_total{pod=~\"$pod\", cluster=~\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }} - Received", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_errors_total{pod=~\"$pod\", cluster=~\"$cluster\",namespace=~\"$namespace\"}[$__rate_interval])) by (pod)", + "interval": "$resolution", + "legendFormat": "{{ pod }} - Transmitted", + "range": true, + "refId": "B" + } + ], + "title": "Network - Errors by pod", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "30s", + "schemaVersion": 40, + "tags": [ + "Kubernetes", + "Prometheus" + ], + "templating": { + "list": [ + { + "current": { + "text": "prometheus", + "value": "fe6kx1tpffym8a" + }, + "includeAll": false, + "name": "datasource", + 
"options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": { + "text": "c-hcjx5tis6bc", + "value": "c-hcjx5tis6bc" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(kube_node_info,cluster)", + "includeAll": false, + "name": "cluster", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(kube_node_info,cluster)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "allValue": "(.*)", + "current": { + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(prometheus_build_info{cluster=\"$cluster\"}, pod)", + "includeAll": true, + "name": "pod", + "options": [], + "query": { + "query": "label_values(prometheus_build_info{cluster=\"$cluster\"}, pod)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "text": "30s", + "value": "30s" + }, + "includeAll": false, + "name": "resolution", + "options": [ + { + "selected": false, + "text": "1s", + "value": "1s" + }, + { + "selected": false, + "text": "15s", + "value": "15s" + }, + { + "selected": true, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "3m", + "value": "3m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + } + ], + "query": "1s, 15s, 30s, 1m, 3m, 5m", + "type": "custom" + }, + { + "current": { + "text": "observability-test", + "value": "observability-test" + }, + "definition": "label_values(namespace)", + "name": "namespace", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(namespace)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "time": 
{ + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Prometheus", + "uid": "k8s_addons_prometheus", + "version": 3, + "weekStart": "" +} \ No newline at end of file diff --git a/observability/grafana/dashboards/vmauth.json b/observability/grafana/dashboards/vmauth.json new file mode 100644 index 00000000..d1228465 --- /dev/null +++ b/observability/grafana/dashboards/vmauth.json @@ -0,0 +1,2576 @@ +{ + "__inputs": [], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "10.4.2" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "Overview for VictoriaMetrics vmauth v1.80.0 or higher", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [ + { + "asDropdown": false, + "icon": "doc", + "includeVars": false, + "keepTime": false, + "tags": [], + "targetBlank": true, + "title": "vmauth docs", + "tooltip": "vmauth docs", + "type": "link", + "url": "https://docs.victoriametrics.com/vmauth.html" + }, + { + "asDropdown": false, + "icon": "question", + "includeVars": false, + "keepTime": false, + "tags": [], + "targetBlank": true, + "title": "Found a bug?", + "tooltip": "Found a bug?", + "type": "link", + "url": 
"https://github.com/VictoriaMetrics/VictoriaMetrics/issues" + }, + { + "asDropdown": false, + "icon": "info", + "includeVars": false, + "keepTime": false, + "tags": [], + "targetBlank": true, + "title": "New releases", + "tooltip": "New releases", + "type": "link", + "url": "https://github.com/VictoriaMetrics/VictoriaMetrics/releases" + } + ], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [], + "title": "Stats", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "stepAfter", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 11, + "x": 0, + "y": 1 + }, + "id": 32, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "pluginVersion": "9.2.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": false, 
+ "expr": "sum(min_over_time(vm_app_uptime_seconds{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by (job)", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{job}}", + "refId": "A" + } + ], + "title": "Uptime", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows if the last configuration update was successful. \"Not Ok\" means there was an unsuccessful attempt to update the configuration due to some error. Check the log for details.", + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "0": { + "color": "green", + "index": 0, + "text": "Ok" + } + }, + "type": "value" + }, + { + "options": { + "from": 1, + "result": { + "color": "red", + "index": 1, + "text": "Not Ok" + }, + "to": 999999 + }, + "type": "range" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 11, + "y": 1 + }, + "id": 30, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "exemplar": false, + "expr": "count(vmauth_config_last_reload_successful{job=~\"$job\", instance=~\"$instance\"} < 1 ) or 0", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Config update", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows the rate of requests.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + 
"value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 7, + "x": 17, + "y": 1 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": false, + "expr": "(sum(rate(vmauth_user_requests_total{job=~\"$job\", instance=~\"$instance\", username=~\"$user\"}[$__rate_interval])) or 0) + (sum(rate(vmauth_unauthorized_user_requests_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) or 0)", + "instant": true, + "range": false, + "refId": "A" + } + ], + "title": "Requests rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows the total number of users defined at configuration file.", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 11, + "y": 4 + }, + "id": 31, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": false, + "expr": "count(vmauth_user_concurrent_requests_capacity{job=~\"$job\", instance=~\"$instance\"})", + "interval": "", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Users count", + 
"type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows the rate of request errors.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 7, + "x": 17, + "y": 4 + }, + "id": 36, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(rate(vmauth_http_request_errors_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))", + "instant": true, + "range": false, + "refId": "A" + } + ], + "title": "Errors rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false, + "minWidth": 50 + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "displayName", + "value": "Count" + } + ] + } + ] + }, + "gridPos": { + "h": 4, + "w": 11, + "x": 0, + "y": 5 + }, + "id": 6, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + 
], + "show": false + }, + "frameIndex": 0, + "showHeader": true + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(vm_app_version{job=~\"$job\", instance=~\"$instance\"}) by (job, short_version)", + "format": "table", + "instant": true, + "range": false, + "refId": "A" + } + ], + "title": "Version", + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 13, + "panels": [], + "title": "Overview", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(vmauth_user_requests_total{job=~\"$job\", instance=~\"$instance\", 
username=~\"$user\"}[$__rate_interval])) by(username)", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(vmauth_unauthorized_user_requests_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + } + ], + "title": "Requests rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows percent utilization of per concurrent requests capacity.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "dashed" + } + }, + "mappings": [], + "max": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 0.9 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": 
"max(\nmax_over_time(vmauth_user_concurrent_requests_current{job=~\"$job\", instance=~\"$instance\", username=~\"$user\"}[$__rate_interval])\n/ \nvmauth_user_concurrent_requests_capacity{job=~\"$job\", instance=~\"$instance\", username=~\"$user\"}\n) by(username) > 0\n", + "hide": false, + "interval": "5m", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "User concurrent requests usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows the rate of rejected requests by a reason.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(vmauth_http_request_errors_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by (reason)", + "hide": false, + "legendFormat": "__auto", + "range": 
true, + "refId": "A" + } + ], + "title": "Requests rejected rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": " The number of concurrent connections processed by vmauth reached one of limits. Possible solutions:\n- increase global limit with flag -maxConcurrentRequests\n- increase limit with flag: -maxConcurrentPerUserRequests for all users or with config option `max_concurrent_requests` per user.\n- deploy additional vmauth replicas\n- check requests latency at backend service and allocate resources to it if needed", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "bars", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 19 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(vmauth_user_concurrent_requests_limit_reached_total{job=~\"$job\", instance=~\"$instance\", username=~\"$user\"}[$__rate_interval])) by(username) > 0", + 
"interval": "1m", + "legendFormat": "__auto", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(vmauth_unauthorized_user_concurrent_requests_limit_reached_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) > 0", + "hide": false, + "legendFormat": "unauthorized", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(vmauth_concurrent_requests_limit_reached_total[$__rate_interval])) > 0", + "hide": false, + "legendFormat": "global at {{ $instance }}", + "range": true, + "refId": "C" + } + ], + "title": "Concurrent limit reached", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows duration in seconds of user requests by quantile.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 19, + "options": { + "legend": { + "calcs": [ + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + 
"showLegend": true, + "sortBy": "Max", + "sortDesc": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "max(vmauth_user_request_duration_seconds{job=~\"$job\", instance=~\"$instance\", username=~\"$user\", quantile=~\"(0.99|0.5)\"}) by (quantile, username) > 0", + "hide": false, + "legendFormat": "user: {{username}} q: {{ quantile}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "max(vmauth_unauthorized_user_request_duration_seconds{job=~\"$job\", instance=~\"$instance\", quantile=~\"(0.99|0.5)\"}) by (quantile) > 0", + "hide": false, + "legendFormat": "user: unauthorized q: {{ quantile}}", + "range": true, + "refId": "B" + } + ], + "title": "User requests duration", + "type": "timeseries" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 37 + }, + "id": 22, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Percentage of used memory (resident).\nThe application's performance will significantly degrade when memory usage is close to 100%.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + 
"thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 11 + }, + "id": 25, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": false, + "expr": "max(\n max_over_time(process_resident_memory_bytes{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n /\n vm_available_memory_bytes{job=~\"$job\", instance=~\"$instance\"}\n) by(job)", + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "RSS memory % usage ($instance)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": 
"percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 11 + }, + "id": 26, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": false, + "expr": "max(\n rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n /\n vm_available_cpu_cores{job=~\"$job\", instance=~\"$instance\"}\n) by(job)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "CPU % usage ($instance)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 
27, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "expr": "sum(go_memstats_sys_bytes{job=~\"$job\", instance=~\"$instance\"}) + sum(vm_cache_size_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "requested from system", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "expr": "sum(go_memstats_heap_inuse_bytes{job=~\"$job\", instance=~\"$instance\"}) + sum(vm_cache_size_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "heap inuse", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "expr": "sum(go_memstats_stack_inuse_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "stack inuse", + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "expr": "sum(process_resident_memory_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "resident", + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "exemplar": false, + "expr": "sum(process_resident_memory_anon_bytes{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "resident anonymous", + "refId": "E" + } + ], + "title": "Memory usage ($instance)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + 
"uid": "$ds" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Limit" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#F2495C", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 19 + }, + "id": 28, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "expr": "rate(process_cpu_seconds_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "CPU cores used", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "exemplar": false, + "expr": "process_cpu_cores_available{job=~\"$job\", instance=~\"$instance\"}", + "format": 
"time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Limit", + "refId": "B" + } + ], + "title": "CPU ($instance)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 27 + }, + "id": 23, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "expr": "sum(vm_tcplistener_conns{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "connections", + "refId": "A" + } + ], + "title": "TCP connections ($instance)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "", + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 27 + }, + "id": 24, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(rate(vm_tcplistener_accepts_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "connections", + "range": true, + "refId": "A" + } + ], + "title": "TCP connections rate ($instance)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows the percentage of open file descriptors compared to the limit set in the OS.\nReaching the limit of open files can cause various issues and must be prevented.\n\nSee how to change limits here 
https://medium.com/@muhammadtriwibowo/set-permanently-ulimit-n-open-files-in-ubuntu-4d61064429a", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "max" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#C4162A", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 35 + }, + "id": 20, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "max_over_time(process_open_fds{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n/\nprocess_max_fds{job=~\"$job\", instance=~\"$instance\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{job}}", + "range": true, + "refId": "A" + } + 
], + "title": "Open FDs ($instance)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 35 + }, + "id": 21, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "expr": "sum(go_goroutines{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "goroutines", + "refId": "A" + } + ], + "title": "Goroutines ($instance)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", 
+ "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 43 + }, + "id": 29, + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last *", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "expr": "sum(process_num_threads{job=~\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "threads", + "refId": "A" + } + ], + "title": "Threads ($instance)", + "type": "timeseries" + } + ], + "title": "Resource usage", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 38 + }, + "id": 35, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 
80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 39 + }, + "id": 34, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "job" + } + ] + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(flag{is_set=\"true\", job=~\"$job\", instance=~\"$instance\"}) by(job, instance, name, value)", + "format": "table", + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "A" + } + ], + "title": "Non-default flags", + "transformations": [ + { + "id": "groupBy", + "options": { + "fields": { + "instance": { + "aggregations": [] + }, + "job": { + "aggregations": [] + }, + "name": { + "aggregations": [], + "operation": "groupby" + }, + "value": { + "aggregations": [], + "operation": "groupby" + } + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "description": "Shows number of generated error and warning messages in logs. 
Non-zero value may be a sign of connectivity or misconfiguration errors.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 39 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "max", + "last", + "mean" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "editorMode": "code", + "expr": "sum(increase(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[$__rate_interval])) by (level)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Log errors", + "type": "timeseries" + } + ], + "title": "Troubleshooting", + "type": "row" + } + ], + "refresh": "30s", + "revision": 1, + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "VictoriaMetrics - cluster", + "value": "PAF93674D0B4E9963" + }, + "hide": 0, + "includeAll": false, + "multi": false, + "name": "ds", + "options": [], + "query": "prometheus", + "queryValue": "", + 
"refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "definition": "label_values(vm_app_version{version=~\"^vmauth.*\"}, job)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "job", + "options": [], + "query": { + "query": "label_values(vm_app_version{version=~\"^vmauth.*\"}, job)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "allValue": ".*", + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "definition": "label_values(vm_app_version{job=~\"$job\"}, instance)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "instance", + "options": [], + "query": { + "query": "label_values(vm_app_version{job=~\"$job\"}, instance)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "definition": "label_values(vmauth_user_requests_total{job=~\"$job\", instance=~\"$instance\"}, username)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "user", + "options": [], + "query": { + "query": "label_values(vmauth_user_requests_total{job=~\"$job\", instance=~\"$instance\"}, username)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "victoriametrics-datasource", + "uid": "P38648FE0F8C5BEA2" + }, + "filters": [], + "hide": 0, + "name": "adhoc", + "skipUrlSync": false, + "type": "adhoc" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "VictoriaMetrics - vmauth", + "uid": "nbuo5Mr4k", + "version": 1, + "weekStart": "", + "gnetId": 21394 +} \ No newline at end of file diff --git 
a/observability/grafana/kustomization.yaml b/observability/grafana/kustomization.yaml new file mode 100644 index 00000000..56fe4f3a --- /dev/null +++ b/observability/grafana/kustomization.yaml @@ -0,0 +1,34 @@ +# Global options +generatorOptions: + disableNameSuffixHash: true + labels: + grafana_dashboard: "1" +commonAnnotations: + grafana_folder: "Kubernetes" + +resources: + - base + +# Generate a ConfigMap for each dashboard +configMapGenerator: + ################################################# + # Views Dashboards + ################################################# + + - name: dashboards-k8s-views-cluster + files: [json=./dashboards/cluster-overview.json] + + - name: dashboards-k8s-views-namespaces + files: [json=./dashboards/namespace-overview.json] + + - name: dashboards-k8s-views-pods + files: [json=./dashboards/pod-overview.json] + + - name: dashboards-k8s-views-falkordb + files: [json=./dashboards/falkordb-cloud.json] + + - name: dashboards-k8s-views-prometheus + files: [json=./dashboards/prometheus-dashboard.json] + + - name: dashboards-k8s-views-vmauth + files: [json=./dashboards/vmauth.json] diff --git a/observability/rules/alertmanager.rules.yml b/observability/rules/alertmanager.rules.yml new file mode 100644 index 00000000..76f6b4bb --- /dev/null +++ b/observability/rules/alertmanager.rules.yml @@ -0,0 +1,134 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: alertmanager.rules +spec: + groups: + - name: alertmanager.rules + params: {} + rules: + - alert: AlertmanagerFailedReload + annotations: + description: Configuration has failed to load for {{ $labels.namespace }}/{{ $labels.pod}}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedreload + summary: Reloading an Alertmanager configuration has failed. 
+ expr: |- + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_config_last_reload_successful{job="vm-alertmanager",namespace="default"}[5m]) == 0 + for: 10m + labels: + severity: critical + - alert: AlertmanagerMembersInconsistent + annotations: + description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only found {{ $value }} members of the {{$labels.job}} cluster. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagermembersinconsistent + summary: A member of an Alertmanager cluster has not found all other cluster members. + expr: |- + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_cluster_members{job="vm-alertmanager",namespace="default"}[5m]) + < on (namespace,service,cluster) group_left + count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="vm-alertmanager",namespace="default"}[5m])) + for: 15m + labels: + severity: critical + - alert: AlertmanagerFailedToSendAlerts + annotations: + description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerfailedtosendalerts + summary: An Alertmanager instance failed to send notifications. 
+ expr: |- + ( + rate(alertmanager_notifications_failed_total{job="vm-alertmanager",namespace="default"}[5m]) + / + ignoring (reason) group_left rate(alertmanager_notifications_total{job="vm-alertmanager",namespace="default"}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts + summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration. + expr: |- + min by (namespace,service,integration,cluster) ( + rate(alertmanager_notifications_failed_total{job="vm-alertmanager",namespace="default", integration=~`.*`}[5m]) + / + ignoring (reason) group_left rate(alertmanager_notifications_total{job="vm-alertmanager",namespace="default", integration=~`.*`}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: critical + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterfailedtosendalerts + summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration. 
+ expr: |- + min by (namespace,service,integration,cluster) ( + rate(alertmanager_notifications_failed_total{job="vm-alertmanager",namespace="default", integration!~`.*`}[5m]) + / + ignoring (reason) group_left rate(alertmanager_notifications_total{job="vm-alertmanager",namespace="default", integration!~`.*`}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning + - alert: AlertmanagerConfigInconsistent + annotations: + description: Alertmanager instances within the {{$labels.job}} cluster have different configurations. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerconfiginconsistent + summary: Alertmanager instances within the same cluster have different configurations. + expr: |- + count by (namespace,service,cluster) ( + count_values by (namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="vm-alertmanager",namespace="default"}) + ) + != 1 + for: 20m + labels: + severity: critical + - alert: AlertmanagerClusterDown + annotations: + description: "{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m." + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclusterdown + summary: Half or more of the Alertmanager instances within the same cluster are down. + expr: |- + ( + count by (namespace,service,cluster) ( + avg_over_time(up{job="vm-alertmanager",namespace="default"}[5m]) < 0.5 + ) + / + count by (namespace,service,cluster) ( + up{job="vm-alertmanager",namespace="default"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: critical + - alert: AlertmanagerClusterCrashlooping + annotations: + description: "{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m." 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/alertmanager/alertmanagerclustercrashlooping + summary: Half or more of the Alertmanager instances within the same cluster are crashlooping. + expr: |- + ( + count by (namespace,service,cluster) ( + changes(process_start_time_seconds{job="vm-alertmanager",namespace="default"}[10m]) > 4 + ) + / + count by (namespace,service,cluster) ( + up{job="vm-alertmanager",namespace="default"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: critical diff --git a/observability/rules/containercpulimits.rules.yml b/observability/rules/containercpulimits.rules.yml new file mode 100644 index 00000000..679dc205 --- /dev/null +++ b/observability/rules/containercpulimits.rules.yml @@ -0,0 +1,33 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: k8s.rules.containercpulimits +spec: + groups: + - name: k8s.rules.container_cpu_limits + params: {} + rules: + - annotations: {} + expr: |- + kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace,pod,cluster) + group_left() max by (namespace,pod,cluster) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + labels: {} + record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits + - annotations: {} + expr: |- + sum by (namespace,cluster) ( + sum by (namespace,pod,cluster) ( + max by (namespace,pod,container,cluster) ( + kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} + ) * on (namespace,pod,cluster) group_left() max by (namespace,pod,cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + labels: {} + record: namespace_cpu:kube_pod_container_resource_limits:sum diff --git a/observability/rules/containercpurequests.rules.yml b/observability/rules/containercpurequests.rules.yml new file mode 100644 index 00000000..0bc61aa6 --- /dev/null 
+++ b/observability/rules/containercpurequests.rules.yml @@ -0,0 +1,33 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: k8s.rules.containercpurequests +spec: + groups: + - name: k8s.rules.container_cpu_requests + params: {} + rules: + - annotations: {} + expr: |- + kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace,pod,cluster) + group_left() max by (namespace,pod,cluster) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + labels: {} + record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests + - annotations: {} + expr: |- + sum by (namespace,cluster) ( + sum by (namespace,pod,cluster) ( + max by (namespace,pod,container,cluster) ( + kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} + ) * on (namespace,pod,cluster) group_left() max by (namespace,pod,cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + labels: {} + record: namespace_cpu:kube_pod_container_resource_requests:sum diff --git a/observability/rules/containercpuusagesecondstotal.rules.yml b/observability/rules/containercpuusagesecondstotal.rules.yml new file mode 100644 index 00000000..6f271b3b --- /dev/null +++ b/observability/rules/containercpuusagesecondstotal.rules.yml @@ -0,0 +1,21 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: k8s.rules.containercpuusagesecondstotal +spec: + groups: + - name: k8s.rules.container_cpu_usage_seconds_total + params: {} + rules: + - annotations: {} + expr: |- + sum by (namespace,pod,container,cluster) ( + irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m]) + ) * on (namespace,pod,cluster) group_left(node) topk by 
(namespace,pod,cluster) ( + 1, max by (namespace,pod,node,cluster) (kube_pod_info{node!=""}) + ) + labels: {} + record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate diff --git a/observability/rules/containermemorycache.rules.yml b/observability/rules/containermemorycache.rules.yml new file mode 100644 index 00000000..4378daf6 --- /dev/null +++ b/observability/rules/containermemorycache.rules.yml @@ -0,0 +1,20 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: k8s.rules.containermemorycache +spec: + groups: + - name: k8s.rules.container_memory_cache + params: {} + rules: + - annotations: {} + expr: |- + container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace,pod,cluster) group_left(node) topk by (namespace,pod,cluster) (1, + max by (namespace,pod,node,cluster) (kube_pod_info{node!=""}) + ) + labels: {} + record: node_namespace_pod_container:container_memory_cache diff --git a/observability/rules/containermemorylimits.rules.yml b/observability/rules/containermemorylimits.rules.yml new file mode 100644 index 00000000..9a40c790 --- /dev/null +++ b/observability/rules/containermemorylimits.rules.yml @@ -0,0 +1,33 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: k8s.rules.containermemorylimits +spec: + groups: + - name: k8s.rules.container_memory_limits + params: {} + rules: + - annotations: {} + expr: |- + kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace,pod,cluster) + group_left() max by (namespace,pod,cluster) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + labels: {} + record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits + - annotations: {} + expr: 
|- + sum by (namespace,cluster) ( + sum by (namespace,pod,cluster) ( + max by (namespace,pod,container,cluster) ( + kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} + ) * on (namespace,pod,cluster) group_left() max by (namespace,pod,cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + labels: {} + record: namespace_memory:kube_pod_container_resource_limits:sum diff --git a/observability/rules/containermemoryrequests.rules.yml b/observability/rules/containermemoryrequests.rules.yml new file mode 100644 index 00000000..c8264f80 --- /dev/null +++ b/observability/rules/containermemoryrequests.rules.yml @@ -0,0 +1,33 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: k8s.rules.containermemoryrequests +spec: + groups: + - name: k8s.rules.container_memory_requests + params: {} + rules: + - annotations: {} + expr: |- + kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace,pod,cluster) + group_left() max by (namespace,pod,cluster) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + labels: {} + record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests + - annotations: {} + expr: |- + sum by (namespace,cluster) ( + sum by (namespace,pod,cluster) ( + max by (namespace,pod,container,cluster) ( + kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} + ) * on (namespace,pod,cluster) group_left() max by (namespace,pod,cluster) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + labels: {} + record: namespace_memory:kube_pod_container_resource_requests:sum diff --git a/observability/rules/containermemoryrss.rules.yml b/observability/rules/containermemoryrss.rules.yml new file mode 100644 index 00000000..d923f56c --- /dev/null +++ 
b/observability/rules/containermemoryrss.rules.yml @@ -0,0 +1,20 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: k8s.rules.containermemoryrss +spec: + groups: + - name: k8s.rules.container_memory_rss + params: {} + rules: + - annotations: {} + expr: |- + container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace,pod,cluster) group_left(node) topk by (namespace,pod,cluster) (1, + max by (namespace,pod,node,cluster) (kube_pod_info{node!=""}) + ) + labels: {} + record: node_namespace_pod_container:container_memory_rss diff --git a/observability/rules/containermemoryswap.rules.yml b/observability/rules/containermemoryswap.rules.yml new file mode 100644 index 00000000..f6720e85 --- /dev/null +++ b/observability/rules/containermemoryswap.rules.yml @@ -0,0 +1,20 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: k8s.rules.containermemoryswap +spec: + groups: + - name: k8s.rules.container_memory_swap + params: {} + rules: + - annotations: {} + expr: |- + container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace,pod,cluster) group_left(node) topk by (namespace,pod,cluster) (1, + max by (namespace,pod,node,cluster) (kube_pod_info{node!=""}) + ) + labels: {} + record: node_namespace_pod_container:container_memory_swap diff --git a/observability/rules/containermemoryworkingsetbytes.rules.yml b/observability/rules/containermemoryworkingsetbytes.rules.yml new file mode 100644 index 00000000..6f27a504 --- /dev/null +++ b/observability/rules/containermemoryworkingsetbytes.rules.yml @@ -0,0 +1,20 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: 
+ namespace: observability + name: k8s.rules.containermemoryworkingsetbytes +spec: + groups: + - name: k8s.rules.container_memory_working_set_bytes + params: {} + rules: + - annotations: {} + expr: |- + container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace,pod,cluster) group_left(node) topk by (namespace,pod,cluster) (1, + max by (namespace,pod,node,cluster) (kube_pod_info{node!=""}) + ) + labels: {} + record: node_namespace_pod_container:container_memory_working_set_bytes diff --git a/observability/rules/general.rules.yml b/observability/rules/general.rules.yml new file mode 100644 index 00000000..191f23b6 --- /dev/null +++ b/observability/rules/general.rules.yml @@ -0,0 +1,48 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: general.rules +spec: + groups: + - name: general.rules + params: {} + rules: + - alert: TargetDown + annotations: + description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown + summary: One or more targets are unreachable. + expr: 100 * (count(up{namespace!~"instance-.*"} == 0) BY (job,namespace,service,cluster) / count(up{namespace!~"instance-.*"}) BY (job,namespace,service,cluster)) > 10 + for: 10m + labels: + severity: warning + - alert: Watchdog + annotations: + description: | + This is an alert meant to ensure that the entire alerting pipeline is functional. + This alert is always firing, therefore it should always be firing in Alertmanager + and always fire against a receiver. There are integrations with various notification + mechanisms that send a notification when this alert is not firing. For example the + "DeadMansSnitch" integration in PagerDuty. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog + summary: An alert that should always be firing to certify that Alertmanager is working properly. + expr: vector(1) + labels: + severity: none + - alert: InfoInhibitor + annotations: + description: | + This is an alert that is used to inhibit info alerts. + By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with + other alerts. + This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a + severity of 'warning' or 'critical' starts firing on the same namespace. + This alert should be routed to a null receiver and configured to inhibit alerts with severity="info". + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor + summary: Info-level alert inhibition. + expr: ALERTS{severity = "info"} == 1 unless on (namespace,cluster) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1 + labels: + severity: none diff --git a/observability/rules/kube-state-metrics.rules.yml b/observability/rules/kube-state-metrics.rules.yml new file mode 100644 index 00000000..1bfd8582 --- /dev/null +++ b/observability/rules/kube-state-metrics.rules.yml @@ -0,0 +1,60 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: kube-state-metrics +spec: + groups: + - name: kube-state-metrics + params: {} + rules: + - alert: KubeStateMetricsListErrors + annotations: + description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors + summary: kube-state-metrics is experiencing errors in list operations. + expr: |- + (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster) + / + sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster)) + > 0.01 + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsWatchErrors + annotations: + description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors + summary: kube-state-metrics is experiencing errors in watch operations. + expr: |- + (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster) + / + sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster)) + > 0.01 + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsShardingMismatch + annotations: + description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch + summary: kube-state-metrics sharding is misconfigured. + expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0 + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsShardsMissing + annotations: + description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing + summary: kube-state-metrics shards are missing. + expr: |- + 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1 + - + sum( 2 ^ max by (shard_ordinal,cluster) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster) + != 0 + for: 15m + labels: + severity: critical diff --git a/observability/rules/kubernetes-apps.rules.yml b/observability/rules/kubernetes-apps.rules.yml new file mode 100644 index 00000000..ba9c5991 --- /dev/null +++ b/observability/rules/kubernetes-apps.rules.yml @@ -0,0 +1,250 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: kubernetes-apps +spec: + groups: + - name: kubernetes-apps + params: {} + rules: + - alert: KubePodCrashLooping + annotations: + description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: "CrashLoopBackOff").' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping + summary: Pod is crash looping. + expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics", namespace=~".*"}[5m]) >= 1 + for: 15m + labels: + severity: warning + - alert: KubePodNotReady + annotations: + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready + summary: Pod has been in a non-ready state for more than 15 minutes. 
+ expr: |- + sum by (namespace,pod,cluster) ( + max by (namespace,pod,cluster) ( + kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown|Failed"} + ) * on (namespace,pod,cluster) group_left(owner_kind) topk by (namespace,pod,cluster) ( + 1, max by (namespace,pod,owner_kind,cluster) (kube_pod_owner{owner_kind!="Job"}) + ) + ) > 0 + for: 15m + labels: + severity: warning + - alert: KubeDeploymentGenerationMismatch + annotations: + description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch + summary: Deployment generation mismatch due to possible roll-back + expr: |- + kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~".*"} + != + kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~".*"} + for: 15m + labels: + severity: warning + - alert: KubeDeploymentReplicasMismatch + annotations: + description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch + summary: Deployment has not matched the expected number of replicas. + expr: |- + ( + kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~".*"} + > + kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~".*"} + ) and ( + changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeDeploymentRolloutStuck + annotations: + description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck + summary: Deployment rollout is not progressing. + expr: |- + kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~".*"} + != 0 + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetReplicasMismatch + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch + summary: StatefulSet has not matched the expected number of replicas. + expr: |- + ( + kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~".*"} + != + kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~".*"} + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetGenerationMismatch + annotations: + description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch + summary: StatefulSet generation mismatch due to possible roll-back + expr: |- + kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~".*"} + != + kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~".*"} + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetUpdateNotRolledOut + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout + summary: StatefulSet update has not been rolled out. + expr: |- + ( + max by (namespace,statefulset,job,cluster) ( + kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~".*"} + unless + kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~".*"} + ) + * + ( + kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"} + != + kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"} + ) + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeDaemonSetRolloutStuck + annotations: + description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15m. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck + summary: DaemonSet rollout is stuck. 
+ expr: |- + ( + ( + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} + ) or ( + kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"} + != + 0 + ) or ( + kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} + ) or ( + kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~".*"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} + ) + ) and ( + changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeContainerWaiting + annotations: + description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: "{{ $labels.reason }}").' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting + summary: Pod container waiting longer than 1 hour + expr: kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", job="kube-state-metrics", namespace=~".*"} > 0 + for: 1h + labels: + severity: warning + - alert: KubeDaemonSetNotScheduled + annotations: + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled + summary: DaemonSet pods are not scheduled. 
+ expr: |- + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} + - + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} > 0 + for: 10m + labels: + severity: warning + - alert: KubeDaemonSetMisScheduled + annotations: + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled + summary: DaemonSet pods are misscheduled. + expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"} > 0 + for: 15m + labels: + severity: warning + - alert: KubeJobNotCompleted + annotations: + description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "43200" | humanizeDuration }} to complete. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted + summary: Job did not complete in time + expr: |- + time() - max by (namespace,job_name,cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"} + and + kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200 + labels: + severity: warning + - alert: KubeJobFailed + annotations: + description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed + summary: Job failed to complete. + expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0 + for: 15m + labels: + severity: warning + - alert: KubeHpaReplicasMismatch + annotations: + description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch + summary: HPA has not matched desired number of replicas. + expr: |- + (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~".*"} + != + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}) + and + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"} + > + kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~".*"}) + and + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"} + < + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"}) + and + changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}[15m]) == 0 + for: 15m + labels: + severity: warning + - alert: KubeHpaMaxedOut + annotations: + description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout + summary: HPA is running at max replicas + expr: |- + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"} + == + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"} + for: 15m + labels: + severity: warning diff --git a/observability/rules/kubernetes-resources.rules.yml b/observability/rules/kubernetes-resources.rules.yml new file mode 100644 index 00000000..98c70087 --- /dev/null +++ b/observability/rules/kubernetes-resources.rules.yml @@ -0,0 +1,114 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: kubernetes-resources +spec: + groups: + - name: kubernetes-resources + params: {} + rules: + - alert: KubeCPUOvercommit + annotations: + description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit + summary: Cluster has overcommitted CPU resource requests. + expr: |- + sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 + and + (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 + for: 10m + labels: + severity: warning + - alert: KubeMemoryOvercommit + annotations: + description: Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure. 
+ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit + summary: Cluster has overcommitted memory resource requests. + expr: |- + sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0 + and + (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0 + for: 10m + labels: + severity: warning + - alert: KubeCPUQuotaOvercommit + annotations: + description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Namespaces. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit + summary: Cluster has overcommitted CPU resource requests. + expr: |- + sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster) + / + sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeMemoryQuotaOvercommit + annotations: + description: Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Namespaces. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit + summary: Cluster has overcommitted memory resource requests. 
+ expr: |- + sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster) + / + sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeQuotaAlmostFull + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull + summary: Namespace quota is going to be full. + expr: |- + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 0.9 < 1 + for: 15m + labels: + severity: info + - alert: KubeQuotaFullyUsed + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused + summary: Namespace quota is fully used. + expr: |- + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + == 1 + for: 15m + labels: + severity: info + - alert: KubeQuotaExceeded + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded + summary: Namespace quota has exceeded the limits. 
+ expr: |- + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 1 + for: 15m + labels: + severity: warning + - alert: CPUThrottlingHigh + annotations: + description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh + summary: Processes experience elevated CPU throttling. + expr: |- + sum(increase(container_cpu_cfs_throttled_periods_total{container!="", job="kubelet", metrics_path="/metrics/cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) + / + sum(increase(container_cpu_cfs_periods_total{job="kubelet", metrics_path="/metrics/cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) + > ( 25 / 100 ) + for: 15m + labels: + severity: info diff --git a/observability/rules/kubernetes-storage.rules.yml b/observability/rules/kubernetes-storage.rules.yml new file mode 100644 index 00000000..a072cf2e --- /dev/null +++ b/observability/rules/kubernetes-storage.rules.yml @@ -0,0 +1,105 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: kubernetes-storage +spec: + groups: + - name: kubernetes-storage + params: {} + rules: + - alert: KubePersistentVolumeFillingUp + annotations: + description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} is only {{ $value | humanizePercentage }} free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. 
+ expr: |- + ( + kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} + ) < 0.03 + and + kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 + unless on (namespace,persistentvolumeclaim,cluster) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on (namespace,persistentvolumeclaim,cluster) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1m + labels: + severity: critical + - alert: KubePersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. 
+ expr: |- + ( + kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} + ) < 0.15 + and + kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 + and + predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 + unless on (namespace,persistentvolumeclaim,cluster) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on (namespace,persistentvolumeclaim,cluster) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1h + labels: + severity: warning + - alert: KubePersistentVolumeInodesFillingUp + annotations: + description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup + summary: PersistentVolumeInodes are filling up. 
+ expr: |- + ( + kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"} + / + kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"} + ) < 0.03 + and + kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 + unless on (namespace,persistentvolumeclaim,cluster) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on (namespace,persistentvolumeclaim,cluster) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1m + labels: + severity: critical + - alert: KubePersistentVolumeInodesFillingUp + annotations: + description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} is expected to run out of inodes within four days. Currently {{ $value | humanizePercentage }} of its inodes are free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup + summary: PersistentVolumeInodes are filling up. 
+ expr: |- + ( + kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"} + / + kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"} + ) < 0.15 + and + kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 + and + predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 + unless on (namespace,persistentvolumeclaim,cluster) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on (namespace,persistentvolumeclaim,cluster) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1h + labels: + severity: warning + - alert: KubePersistentVolumeErrors + annotations: + description: The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors + summary: PersistentVolume is having issues with provisioning. 
+ expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 + for: 5m + labels: + severity: critical diff --git a/observability/rules/node-exporter.rules.yml b/observability/rules/node-exporter.rules.yml new file mode 100644 index 00000000..9316e4d0 --- /dev/null +++ b/observability/rules/node-exporter.rules.yml @@ -0,0 +1,399 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: node-exporter +spec: + groups: + - name: node-exporter.rules + params: {} + rules: + - annotations: {} + expr: |- + count without (cpu, mode) ( + node_cpu_seconds_total{job="node-exporter",mode="idle"} + ) + labels: {} + record: instance:node_num_cpu:sum + - annotations: {} + expr: |- + 1 - avg without (cpu) ( + sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m])) + ) + labels: {} + record: instance:node_cpu_utilisation:rate5m + - annotations: {} + expr: |- + ( + node_load1{job="node-exporter"} + / + instance:node_num_cpu:sum{job="node-exporter"} + ) + labels: {} + record: instance:node_load1_per_cpu:ratio + - annotations: {} + expr: |- + 1 - ( + ( + node_memory_MemAvailable_bytes{job="node-exporter"} + or + ( + node_memory_Buffers_bytes{job="node-exporter"} + + + node_memory_Cached_bytes{job="node-exporter"} + + + node_memory_MemFree_bytes{job="node-exporter"} + + + node_memory_Slab_bytes{job="node-exporter"} + ) + ) + / + node_memory_MemTotal_bytes{job="node-exporter"} + ) + labels: {} + record: instance:node_memory_utilisation:ratio + - annotations: {} + expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) + labels: {} + record: instance:node_vmstat_pgmajfault:rate5m + - annotations: {} + expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) + labels: {} + record: 
instance_device:node_disk_io_time_seconds:rate5m + - annotations: {} + expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) + labels: {} + record: instance_device:node_disk_io_time_weighted_seconds:rate5m + - annotations: {} + expr: |- + sum without (device) ( + rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m]) + ) + labels: {} + record: instance:node_network_receive_bytes_excluding_lo:rate5m + - annotations: {} + expr: |- + sum without (device) ( + rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m]) + ) + labels: {} + record: instance:node_network_transmit_bytes_excluding_lo:rate5m + - annotations: {} + expr: |- + sum without (device) ( + rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m]) + ) + labels: {} + record: instance:node_network_receive_drop_excluding_lo:rate5m + - annotations: {} + expr: |- + sum without (device) ( + rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m]) + ) + labels: {} + record: instance:node_network_transmit_drop_excluding_lo:rate5m + - name: node-exporter + params: {} + rules: + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 24 hours. 
+ expr: |- + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 4 hours. + expr: |- + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace + summary: Filesystem has less than 5% space left. 
+ expr: |- + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 30m + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace + summary: Filesystem has less than 3% space left. + expr: |- + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 30m + labels: + severity: critical + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 24 hours. 
+ expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 4 hours. + expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles + summary: Filesystem has less than 5% inodes left. 
+ expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles + summary: Filesystem has less than 3% inodes left. + expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeNetworkReceiveErrs + annotations: + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs + summary: Network interface is reporting many receive errors. + expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) > 0.01 + for: 1h + labels: + severity: warning + - alert: NodeNetworkTransmitErrs + annotations: + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs + summary: Network interface is reporting many transmit errors. 
+ expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) > 0.01 + for: 1h + labels: + severity: warning + - alert: NodeHighNumberConntrackEntriesUsed + annotations: + description: "{{ $labels.instance }} {{ $value | humanizePercentage }} of conntrack entries are used." + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused + summary: Number of conntrack are getting close to the limit. + expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) > 0.75 + labels: + severity: warning + - alert: NodeTextFileCollectorScrapeError + annotations: + description: Node Exporter text file collector on {{ $labels.instance }} failed to scrape. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror + summary: Node Exporter text file collector failed to scrape. + expr: node_textfile_scrape_error{job="node-exporter"} == 1 + labels: + severity: warning + - alert: NodeClockSkewDetected + annotations: + description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected + summary: Clock skew detected. + expr: |- + ( + node_timex_offset_seconds{job="node-exporter"} > 0.05 + and + deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0 + ) + or + ( + node_timex_offset_seconds{job="node-exporter"} < -0.05 + and + deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 + ) + for: 10m + labels: + severity: warning + - alert: NodeClockNotSynchronising + annotations: + description: Clock at {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising + summary: Clock not synchronising. 
+ expr: |- + min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 + and + node_timex_maxerror_seconds{job="node-exporter"} >= 16 + for: 10m + labels: + severity: warning + - alert: NodeRAIDDegraded + annotations: + description: RAID array '{{ $labels.device }}' at {{ $labels.instance }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded + summary: RAID Array is degraded. + expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0 + for: 15m + labels: + severity: critical + - alert: NodeRAIDDiskFailure + annotations: + description: At least one device in RAID array at {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure + summary: Failed device in RAID array. + expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0 + labels: + severity: warning + - alert: NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. 
+ expr: |- + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70 + ) + for: 15m + labels: + severity: warning + - alert: NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. + expr: |- + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 + ) + for: 15m + labels: + severity: critical + - alert: NodeCPUHighUsage + annotations: + description: | + CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage + summary: High CPU usage. + expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!~"idle|iowait"}[2m]))) * 100 > 90 + for: 15m + labels: + severity: info + - alert: NodeSystemSaturation + annotations: + description: | + System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. + This might indicate this instance resources saturation and can cause it becoming unresponsive. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation + summary: System saturated, load per core is very high. + expr: |- + node_load1{job="node-exporter"} + / count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2 + for: 15m + labels: + severity: warning + - alert: NodeMemoryMajorPagesFaults + annotations: + description: | + Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. 
+ Please check that there is enough memory available at this instance. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults + summary: Memory major page faults are occurring at very high rate. + expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500 + for: 15m + labels: + severity: warning + - alert: NodeMemoryHighUtilization + annotations: + description: | + Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization + summary: Host is running out of memory. + expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90 + for: 15m + labels: + severity: warning + - alert: NodeDiskIOSaturation + annotations: + description: | + Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}. + This symptom might indicate disk saturation. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation + summary: Disk IO queue is high. + expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) > 10 + for: 30m + labels: + severity: warning + - alert: NodeSystemdServiceFailed + annotations: + description: Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }} + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed + summary: Systemd service has entered failed state. 
+ expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1 + for: 5m + labels: + severity: warning + - alert: NodeBondingDegraded + annotations: + description: Bonding interface {{ $labels.master }} on {{ $labels.instance }} is in degraded state due to one or more slave failures. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded + summary: Bonding interface is degraded + expr: (node_bonding_slaves - node_bonding_active) != 0 + for: 5m + labels: + severity: warning diff --git a/observability/rules/node-network.rules.yml b/observability/rules/node-network.rules.yml new file mode 100644 index 00000000..d511c9f3 --- /dev/null +++ b/observability/rules/node-network.rules.yml @@ -0,0 +1,21 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: node-network +spec: + groups: + - name: node-network + params: {} + rules: + - alert: NodeNetworkInterfaceFlapping + annotations: + description: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }} + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping + summary: Network interface is often changing its status + expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 + for: 2m + labels: + severity: warning diff --git a/observability/rules/node.rules.yml b/observability/rules/node.rules.yml new file mode 100644 index 00000000..5e3c1b1c --- /dev/null +++ b/observability/rules/node.rules.yml @@ -0,0 +1,58 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: node.rules +spec: + groups: + - name: node.rules + params: {} + rules: + - annotations: {} + expr: |- + topk by 
(namespace,pod,cluster) (1, + max by (node,namespace,pod,cluster) ( + label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)") + )) + labels: {} + record: 'node_namespace_pod:kube_pod_info:' + - annotations: {} + expr: |- + count by (node,cluster) ( + node_cpu_seconds_total{mode="idle",job="node-exporter"} + * on (namespace,pod,cluster) group_left(node) + topk by (namespace,pod,cluster) (1, node_namespace_pod:kube_pod_info:) + ) + labels: {} + record: node:node_num_cpu:sum + - annotations: {} + expr: |- + sum( + node_memory_MemAvailable_bytes{job="node-exporter"} or + ( + node_memory_Buffers_bytes{job="node-exporter"} + + node_memory_Cached_bytes{job="node-exporter"} + + node_memory_MemFree_bytes{job="node-exporter"} + + node_memory_Slab_bytes{job="node-exporter"} + ) + ) by (cluster) + labels: {} + record: :node_memory_MemAvailable_bytes:sum + - annotations: {} + expr: |- + avg by (node,cluster) ( + sum without (mode) ( + rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m]) + ) + ) + labels: {} + record: node:node_cpu_utilization:ratio_rate5m + - annotations: {} + expr: |- + avg by (cluster) ( + node:node_cpu_utilization:ratio_rate5m + ) + labels: {} + record: cluster:node_cpu:ratio_rate5m diff --git a/observability/rules/podowner.rules.yml b/observability/rules/podowner.rules.yml new file mode 100644 index 00000000..6c65a8f3 --- /dev/null +++ b/observability/rules/podowner.rules.yml @@ -0,0 +1,63 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: k8s.rules.podowner +spec: + groups: + - name: k8s.rules.pod_owner + params: {} + rules: + - annotations: {} + expr: |- + max by (namespace,workload,pod,cluster) ( + label_replace( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, + "replicaset", "$1", "owner_name", "(.*)" + ) * on 
(replicaset,namespace,cluster) group_left(owner_name) topk by (replicaset,namespace,cluster) ( + 1, max by (replicaset,namespace,owner_name,cluster) ( + kube_replicaset_owner{job="kube-state-metrics"} + ) + ), + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: deployment + record: namespace_workload_pod:kube_pod_owner:relabel + - annotations: {} + expr: |- + max by (namespace,workload,pod,cluster) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: daemonset + record: namespace_workload_pod:kube_pod_owner:relabel + - annotations: {} + expr: |- + max by (namespace,workload,pod,cluster) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: statefulset + record: namespace_workload_pod:kube_pod_owner:relabel + - annotations: {} + expr: |- + max by (namespace,workload,pod,cluster) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="Job"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: job + record: namespace_workload_pod:kube_pod_owner:relabel diff --git a/observability/rules/prometheus-general.rules.yml b/observability/rules/prometheus-general.rules.yml new file mode 100644 index 00000000..becd143a --- /dev/null +++ b/observability/rules/prometheus-general.rules.yml @@ -0,0 +1,20 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: kube-prometheus-general.rules +spec: + groups: + - name: kube-prometheus-general.rules + params: {} + rules: + - annotations: {} + expr: count without(instance, pod, node) (up == 1) + labels: {} + record: count:up1 + - annotations: {} + expr: count without(instance, pod, node) (up == 0) + labels: {} + record: count:up0 
diff --git a/observability/rules/prometheus-node-recording.rules.yml b/observability/rules/prometheus-node-recording.rules.yml new file mode 100644 index 00000000..c9c4356c --- /dev/null +++ b/observability/rules/prometheus-node-recording.rules.yml @@ -0,0 +1,36 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: kube-prometheus-node-recording.rules +spec: + groups: + - name: kube-prometheus-node-recording.rules + params: {} + rules: + - annotations: {} + expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance,cluster) + labels: {} + record: instance:node_cpu:rate:sum + - annotations: {} + expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance,cluster) + labels: {} + record: instance:node_network_receive_bytes:rate:sum + - annotations: {} + expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance,cluster) + labels: {} + record: instance:node_network_transmit_bytes:rate:sum + - annotations: {} + expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON (instance,cluster) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance,cpu,cluster)) BY (instance,cluster) + labels: {} + record: instance:node_cpu:ratio + - annotations: {} + expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) BY (cluster) + labels: {} + record: cluster:node_cpu:sum_rate5m + - annotations: {} + expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance,cpu,cluster)) BY (cluster) + labels: {} + record: cluster:node_cpu:ratio diff --git a/observability/rules/vm-health.rules.yml b/observability/rules/vm-health.rules.yml new file mode 100644 index 00000000..00059b99 --- /dev/null +++ b/observability/rules/vm-health.rules.yml @@ -0,0 +1,109 @@ +--- +# Source: 
victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: vm-health +spec: + groups: + - name: vm-health + params: {} + rules: + - alert: TooManyRestarts + annotations: + description: | + Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes. It might be crashlooping. + summary: '{{ $labels.job }} too many restarts (instance {{ $labels.instance }})' + expr: changes(process_start_time_seconds{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"}[15m]) > 2 + labels: + severity: critical + - alert: ServiceDown + annotations: + description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes.' + summary: Service {{ $labels.job }} is down on {{ $labels.instance }} + expr: up{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"} == 0 + for: 2m + labels: + severity: critical + - alert: ProcessNearFDLimits + annotations: + description: | + Exhausting OS file descriptors limit can cause severe degradation of the process. + Consider to increase the limit as fast as possible. + summary: Number of free file descriptors is less than 100 for "{{ $labels.job }}"("{{ $labels.instance }}") for the last 5m + expr: (process_max_fds - process_open_fds) < 100 + for: 5m + labels: + severity: critical + - alert: TooHighMemoryUsage + annotations: + description: | + Too high memory usage may result into multiple issues such as OOMs or degraded performance. + Consider to either increase available memory or decrease the load on the process. 
+ summary: It is more than 80% of memory used by "{{ $labels.job }}"("{{ $labels.instance }}") + expr: (min_over_time(process_resident_memory_anon_bytes[10m]) / vm_available_memory_bytes) > 0.8 + for: 5m + labels: + severity: critical + - alert: TooHighCPUUsage + annotations: + description: | + Too high CPU usage may be a sign of insufficient resources and make process unstable. Consider to either increase available CPU resources or decrease the load on the process. + summary: More than 90% of CPU is used by "{{ $labels.job }}"("{{ $labels.instance }}") during the last 5m + expr: rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9 + for: 5m + labels: + severity: critical + - alert: TooHighGoroutineSchedulingLatency + annotations: + description: | + Go runtime is unable to schedule goroutines execution in acceptable time. This is usually a sign of insufficient CPU resources or CPU throttling. Verify that service has enough CPU resources. Otherwise, the service could work unreliably with delays in processing. + summary: '"{{ $labels.job }}"("{{ $labels.instance }}") has insufficient CPU resources for >15m' + expr: histogram_quantile(0.99, sum(rate(go_sched_latencies_seconds_bucket[5m])) by (le,job,instance,cluster)) > 0.1 + for: 15m + labels: + severity: critical + - alert: TooManyLogs + annotations: + description: | + Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m. Worth to check logs for specific error messages. + summary: Too many logs printed for job "{{ $labels.job }}" ({{ $labels.instance }}) + expr: sum(increase(vm_log_messages_total{level="error"}[5m])) without (app_version, location) > 0 + for: 15m + labels: + severity: warning + - alert: TooManyTSIDMisses + annotations: + description: | + The rate of TSID misses during query lookups is too high for \"{{ $labels.job }}\" ({{ $labels.instance }}). + Make sure you're running VictoriaMetrics of v1.85.3 or higher. 
+ Related issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3502 + summary: Too many TSID misses for job "{{ $labels.job }}" ({{ $labels.instance }}) + expr: rate(vm_missing_tsids_for_metric_id_total[5m]) > 0 + for: 10m + labels: + severity: critical + - alert: ConcurrentInsertsHitTheLimit + annotations: + description: "The limit of concurrent inserts on instance {{ $labels.instance }} depends on the number of CPUs.\nUsually, when component constantly hits the limit it is likely the component is overloaded and requires more CPU.\nIn some cases for components like vmagent or vminsert the alert might trigger if there are too many clients\nmaking write attempts. If vmagent's or vminsert's CPU usage and network saturation are at normal level, then \nit might be worth adjusting `-maxConcurrentInserts` cmd-line flag.\n" + summary: '{{ $labels.job }} on instance {{ $labels.instance }} is constantly hitting concurrent inserts limit' + expr: avg_over_time(vm_concurrent_insert_current[1m]) >= vm_concurrent_insert_capacity + for: 15m + labels: + severity: warning + - alert: IndexDBRecordsDrop + annotations: + description: "VictoriaMetrics could skip registering new timeseries during ingestion if they fail the validation process. \nFor example, `reason=too_long_item` means that time series cannot exceed 64KB. Please, reduce the number \nof labels or label values for such series. Or enforce these limits via `-maxLabelsPerTimeseries` and \n`-maxLabelValueLen` command-line flags.\n" + summary: IndexDB skipped registering items during data ingestion with reason={{ $labels.reason }}. 
+ expr: increase(vm_indexdb_items_dropped_total[5m]) > 0 + labels: + severity: critical + - alert: RowsRejectedOnIngestion + annotations: + description: 'Ingested rows on instance "{{ $labels.instance }}" are rejected due to the following reason: "{{ $labels.reason }}"' + summary: Some rows are rejected on "{{ $labels.instance }}" on ingestion attempt + expr: rate(vm_rows_ignored_total[5m]) > 0 + for: 15m + labels: + severity: warning diff --git a/observability/rules/vmoperator.rules.yml b/observability/rules/vmoperator.rules.yml new file mode 100644 index 00000000..b5dbc0a8 --- /dev/null +++ b/observability/rules/vmoperator.rules.yml @@ -0,0 +1,75 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: vmoperator +spec: + groups: + - name: vmoperator + params: {} + rules: + - alert: LogErrors + annotations: + dashboard: '{{ $externalURL }}/d/1H179hunk/victoriametrics-operator?ds={{ $labels.dc }}&orgId=1&viewPanel=16' + description: 'Operator has too many errors at logs: {{ $value}}, check operator logs' + summary: 'Too many errors at logs of operator: {{ $value}}' + expr: |- + sum( + rate( + operator_log_messages_total{ + level="error",job=~".*((victoria.*)|vm)-?operator" + }[5m] + ) + ) by (cluster) > 0 + for: 15m + labels: + severity: warning + show_at: dashboard + - alert: ReconcileErrors + annotations: + dashboard: '{{ $externalURL }}/d/1H179hunk/victoriametrics-operator?ds={{ $labels.dc }}&orgId=1&viewPanel=10' + description: 'Operator cannot parse response from k8s api server, possible bug: {{ $value }}, check operator logs' + summary: 'Too many errors at reconcile loop of operator: {{ $value}}' + expr: |- + sum( + rate( + controller_runtime_reconcile_errors_total{ + job=~".*((victoria.*)|vm)-?operator" + }[5m] + ) + ) by (cluster) > 0 + for: 10m + labels: + severity: warning + show_at: dashboard + - alert: HighQueueDepth + 
annotations: + dashboard: '{{ $externalURL }}/d/1H179hunk/victoriametrics-operator?ds={{ $labels.dc }}&orgId=1&viewPanel=20' + description: 'Operator cannot handle reconciliation load for controller: `{{- $labels.name }}`, current depth: {{ $value }}' + summary: 'Too many `{{- $labels.name }}` in queue: {{ $value }}' + expr: |- + sum( + workqueue_depth{ + job=~".*((victoria.*)|vm)-?operator", + name=~"(vmagent|vmalert|vmalertmanager|vmauth|vmcluster|vmnodescrape|vmpodscrape|vmprobe|vmrule|vmservicescrape|vmsingle|vmstaticscrape)" + } + ) by (name,cluster) > 10 + for: 15m + labels: + severity: warning + show_at: dashboard + - alert: BadObjects + annotations: + dashboard: '{{ $externalURL }}/d/1H179hunk/victoriametrics-operator?ds={{ $labels.dc }}&orgId=1' + description: Operator got incorrect resources in controller {{ $labels.controller }}, check operator logs + summary: Incorrect `{{ $labels.controller }}` resources in the cluster + expr: |- + sum( + operator_controller_bad_objects_count{job=~".*((victoria.*)|vm)-?operator"} + ) by (controller,cluster) > 0 + for: 15m + labels: + severity: warning + show_at: dashboard diff --git a/observability/rules/vmsingle.rules.yml b/observability/rules/vmsingle.rules.yml new file mode 100644 index 00000000..ea00a9fd --- /dev/null +++ b/observability/rules/vmsingle.rules.yml @@ -0,0 +1,121 @@ +--- +# Source: victoria-metrics-k8s-stack/templates/rules/rule.yaml +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + namespace: observability + name: vmsingle +spec: + groups: + - concurrency: 2 + interval: 30s + name: vmsingle + params: {} + rules: + - alert: DiskRunsOutOfSpaceIn3Days + annotations: + dashboard: grafana.domain.com/d/wNf0q_kZk?viewPanel=73&var-instance={{ $labels.instance }} + description: |- + Taking into account current ingestion rate, free disk space will be enough only for {{ $value | humanizeDuration }} on instance {{ $labels.instance }}. 
+ Consider to limit the ingestion rate, decrease retention or scale the disk space if possible. + summary: Instance {{ $labels.instance }} will run out of disk space soon + expr: |- + sum(vm_free_disk_space_bytes) without(path) / + ( + rate(vm_rows_added_to_storage_total[1d]) * ( + sum(vm_data_size_bytes{type!~"indexdb.*"}) without(type) / + sum(vm_rows{type!~"indexdb.*"}) without(type) + ) + ) < 3 * 24 * 3600 > 0 + for: 30m + labels: + severity: critical + - alert: NodeBecomesReadonlyIn3Days + annotations: + dashboard: grafana.domain.com/d/oS7Bi_0Wz?viewPanel=113&var-instance={{ $labels.instance }} + description: |- + Taking into account current ingestion rate and free disk space instance {{ $labels.instance }} is writable for {{ $value | humanizeDuration }}. + Consider to limit the ingestion rate, decrease retention or scale the disk space up if possible. + summary: Instance {{ $labels.instance }} will become read-only in 3 days + expr: |- + sum(vm_free_disk_space_bytes - vm_free_disk_space_limit_bytes) without(path) / + ( + rate(vm_rows_added_to_storage_total[1d]) * ( + sum(vm_data_size_bytes{type!~"indexdb.*"}) without(type) / + sum(vm_rows{type!~"indexdb.*"}) without(type) + ) + ) < 3 * 24 * 3600 > 0 + for: 30m + labels: + severity: warning + - alert: DiskRunsOutOfSpace + annotations: + dashboard: grafana.domain.com/d/wNf0q_kZk?viewPanel=53&var-instance={{ $labels.instance }} + description: |- + Disk utilisation on instance {{ $labels.instance }} is more than 80%. + Having less than 20% of free disk space could cripple merge processes and overall performance. Consider to limit the ingestion rate, decrease retention or scale the disk space if possible. 
+ summary: Instance {{ $labels.instance }} (job={{ $labels.job }}) will run out of disk space soon + expr: |- + sum(vm_data_size_bytes) by (job,instance,cluster) / + ( + sum(vm_free_disk_space_bytes) by (job,instance,cluster) + + sum(vm_data_size_bytes) by (job,instance,cluster) + ) > 0.8 + for: 30m + labels: + severity: critical + - alert: RequestErrorsToAPI + annotations: + dashboard: grafana.domain.com/d/wNf0q_kZk?viewPanel=35&var-instance={{ $labels.instance }} + description: Requests to path {{ $labels.path }} are receiving errors. Please verify if clients are sending correct requests. + summary: Too many errors served for path {{ $labels.path }} (instance {{ $labels.instance }}) + expr: increase(vm_http_request_errors_total[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: TooHighChurnRate + annotations: + dashboard: grafana.domain.com/d/wNf0q_kZk?viewPanel=66&var-instance={{ $labels.instance }} + description: |- + VM constantly creates new time series on "{{ $labels.instance }}". + This effect is known as Churn Rate. + High Churn Rate tightly connected with database performance and may result in unexpected OOM's or slow queries. + summary: Churn rate is more than 10% on "{{ $labels.instance }}" for the last 15m + expr: |- + ( + sum(rate(vm_new_timeseries_created_total[5m])) by (instance,cluster) + / + sum(rate(vm_rows_inserted_total[5m])) by (instance,cluster) + ) > 0.1 + for: 15m + labels: + severity: warning + - alert: TooHighChurnRate24h + annotations: + dashboard: grafana.domain.com/d/wNf0q_kZk?viewPanel=66&var-instance={{ $labels.instance }} + description: |- + The number of created new time series over last 24h is 3x times higher than current number of active series on "{{ $labels.instance }}". + This effect is known as Churn Rate. + High Churn Rate tightly connected with database performance and may result in unexpected OOM's or slow queries. 
+ summary: Too high number of new series on "{{ $labels.instance }}" created over last 24h + expr: |- + sum(increase(vm_new_timeseries_created_total[24h])) by (instance,cluster) + > + (sum(vm_cache_entries{type="storage/hour_metric_ids"}) by (instance,cluster) * 3) + for: 15m + labels: + severity: warning + - alert: TooHighSlowInsertsRate + annotations: + dashboard: grafana.domain.com/d/wNf0q_kZk?viewPanel=68&var-instance={{ $labels.instance }} + description: High rate of slow inserts on "{{ $labels.instance }}" may be a sign of resource exhaustion for the current load. It is likely more RAM is needed for optimal handling of the current number of active time series. See also https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3976#issuecomment-1476883183 + summary: Percentage of slow inserts is more than 5% on "{{ $labels.instance }}" for the last 15m + expr: |- + ( + sum(rate(vm_slow_row_inserts_total[5m])) by (instance,cluster) + / + sum(rate(vm_rows_inserted_total[5m])) by (instance,cluster) + ) > 0.05 + for: 15m + labels: + severity: warning
diff --git a/scripts/add_cluster.sh b/scripts/add_cluster.sh
new file mode 100755
index 00000000..fac9e3ba
--- /dev/null
+++ b/scripts/add_cluster.sh
@@ -0,0 +1,218 @@
+#!/bin/bash
+#
+# Interactively onboards an application-plane cluster (GCP or AWS) into the
+# observability stack:
+#   1. collects connection details through gum prompts,
+#   2. makes sure an "observability" node pool / node group exists,
+#   3. logs in to ArgoCD with the admin password read from the control plane,
+#   4. creates the observability namespace and the PagerDuty secret,
+#   5. registers the cluster with ArgoCD,
+#   6. copies the vmuser password from the control plane once it appears.
+#
+# Optional environment:
+#   VMUSER_WAIT_TIMEOUT  seconds to wait for the <cluster>-vmuser secret (default: 600)
+
+# Require gum (https://github.com/charmbracelet/gum)
+if ! command -v gum &> /dev/null
+then
+    echo "gum could not be found. Please install it before running this script."
+    echo "Installation instructions:"
+    echo "- macOS: brew install charmbracelet/tap/gum"
+    echo "- Linux: sudo apt install gum (if available) or download the binary from https://github.com/charmbracelet/gum/releases"
+    echo "- Windows: Use Scoop: scoop install gum or download from https://github.com/charmbracelet/gum/releases"
+    # A missing prerequisite is a failure; a bare `exit` would report success.
+    exit 1
+fi
+
+# Enable error handling
+set -euo pipefail
+trap 'echo "Error on line $LINENO: $(tail -n +"$LINENO" "$0" | head -n 1)"; exit 1' ERR
+# 130 = 128 + SIGINT, the conventional status for an interrupted script.
+trap "echo 'Script interrupted by user'; exit 130" SIGINT
+
+# Upper bound, in seconds, for the vmuser secret wait loop further down.
+VMUSER_WAIT_TIMEOUT="${VMUSER_WAIT_TIMEOUT:-600}"
+
+# prompt VAR PLACEHOLDER [extra gum input flags...]
+# Asks for a value, aborts when the prompt is cancelled or left empty, and
+# exports the answer as VAR.
+prompt() {
+    local var_name=$1 placeholder=$2 answer
+    shift 2
+    answer=$(gum input --placeholder "$placeholder" "$@") || {
+        echo "Input cancelled." >&2
+        exit 1
+    }
+    if [ -z "$answer" ]; then
+        echo "A value is required: $placeholder" >&2
+        exit 1
+    fi
+    export "$var_name=$answer"
+}
+
+# Main menu
+PLATFORM=$(gum choose "GCP" "AWS")
+
+# Environment variables
+prompt ARGOCD_SERVER "Enter ArgoCD Server"
+# --password masks the key while it is typed.
+prompt PAGERDUTY_API_KEY "Enter PagerDuty API Key" --password
+prompt CTRL_PLANE_CTX "Enter Control Plane Context Name"
+
+if [ "$PLATFORM" == "GCP" ]; then
+    prompt PROJECT "Enter GCP Project ID"
+    prompt REGION "Enter GCP Region"
+    prompt CLUSTER "Enter GCP App Plane Cluster Name"
+    prompt APP_PLANE_CTX "Enter App Plane Cluster Context Name"
+else
+    prompt REGION "Enter AWS Region"
+    prompt CLUSTER "Enter AWS App Plane Cluster Name"
+    prompt APP_PLANE_CTX "Enter App Plane Cluster Context Name"
+    prompt NODE_ROLE "Enter AWS Node Role"
+    prompt SUBNETS "Enter AWS Subnets (comma-separated)"
+    # The profile may be left blank, so it bypasses prompt's non-empty check.
+    AWS_PROFILE=$(gum input --placeholder "Enter AWS Profile (leave blank for default)" --value "default")
+    AWS_PROFILE="${AWS_PROFILE:-default}"
+    # Pass each subnet ID as its own argument; split on commas and/or spaces.
+    IFS=', ' read -r -a SUBNET_IDS <<< "$SUBNETS"
+    if [ "${#SUBNET_IDS[@]}" -eq 0 ]; then
+        echo "No subnet IDs found in: $SUBNETS" >&2
+        exit 1
+    fi
+fi
+
+# Review inputs
+{
+    echo "Review your inputs:"
+    echo "Platform: $PLATFORM"
+    echo "ArgoCD Server: $ARGOCD_SERVER"
+    echo "PagerDuty API Key: [HIDDEN]"
+    echo "Control Plane Context: $CTRL_PLANE_CTX"
+    if [ "$PLATFORM" == "GCP" ]; then
+        echo "Project: $PROJECT"
+        echo "Region: $REGION"
+        echo "Cluster: $CLUSTER"
+        echo "App Plane Context: $APP_PLANE_CTX"
+    else
+        echo "Region: $REGION"
+        echo "Cluster: $CLUSTER"
+        echo "App Plane Context: $APP_PLANE_CTX"
+        echo "Node Role: $NODE_ROLE"
+        echo "Subnets: $SUBNETS"
+        echo "AWS Profile: $AWS_PROFILE"
+    fi
+    echo ""
+    echo "Press ESC to continue"
+} | gum pager
+gum confirm "Are the above details correct?" || exit 1
+
+if [ "$PLATFORM" == "GCP" ]; then
+    # Look the pool up by exact name rather than grepping the list output.
+    if ! gcloud container node-pools describe observability \
+            --cluster="$CLUSTER" --region="$REGION" --project="$PROJECT" &> /dev/null; then
+        gum spin --spinner dot --title "Creating node pool..." --show-error -- \
+            gcloud container node-pools create observability \
+                --cluster="$CLUSTER" \
+                --region="$REGION" \
+                --machine-type=e2-standard-2 \
+                --disk-size=50 \
+                --enable-autoscaling \
+                --max-nodes=10 \
+                --project="$PROJECT" \
+                --node-labels=node_pool=observability
+    else
+        echo "Node pool 'observability' already exists, skipping creation."
+    fi
+
+    gum spin --spinner dot --title "Setting current context to $CLUSTER..." --show-error -- \
+        gcloud container clusters get-credentials "$CLUSTER" --region="$REGION" --project="$PROJECT"
+
+elif [ "$PLATFORM" == "AWS" ]; then
+    # --region is passed explicitly so the node group is looked up and created in
+    # the region entered above, matching update-kubeconfig below.
+    if ! aws eks describe-nodegroup --cluster-name "$CLUSTER" --nodegroup-name observability \
+            --region "$REGION" --profile "$AWS_PROFILE" &> /dev/null; then
+        gum spin --spinner dot --title "Creating node group..." --show-error -- \
+            aws eks create-nodegroup \
+                --cluster-name "$CLUSTER" \
+                --nodegroup-name observability \
+                --node-role "$NODE_ROLE" \
+                --subnets "${SUBNET_IDS[@]}" \
+                --instance-types m5.large \
+                --disk-size 50 \
+                --scaling-config minSize=1,maxSize=10,desiredSize=1 \
+                --labels node_pool=observability \
+                --region "$REGION" \
+                --profile "$AWS_PROFILE"
+    else
+        echo "Node group 'observability' already exists, skipping creation."
+    fi
+
+    gum spin --spinner dot --title "Setting current context to $CLUSTER..." --show-error -- \
+        aws eks update-kubeconfig --name "$CLUSTER" --region "$REGION" --profile "$AWS_PROFILE"
+fi
+
+# Login to ArgoCD
+# Read the password first so a failed lookup aborts the script instead of
+# attempting a login with an empty password.
+ARGOCD_PASSWORD=$(kubectl get secret argocd-initial-admin-secret -n argocd -o jsonpath="{.data.password}" --context "$CTRL_PLANE_CTX" | base64 --decode)
+gum spin --spinner dot --title "Logging in to ArgoCD..." --show-error -- \
+    argocd login "$ARGOCD_SERVER" --username admin --password "$ARGOCD_PASSWORD" --insecure --plaintext
+
+# Create observability namespace
+if ! kubectl get namespace observability &> /dev/null; then
+    gum spin --spinner dot --title "Creating observability namespace..." --show-error -- \
+        kubectl create namespace observability
+else
+    echo "Observability namespace already exists, skipping creation."
+fi
+
+# Create PagerDuty secret
+if ! kubectl get secret pagerduty-service-key --namespace=observability &> /dev/null; then
+    gum spin --spinner dot --title "Creating PagerDuty secret..." --show-error -- \
+        kubectl create secret generic pagerduty-service-key \
+            --from-literal=api-key="$PAGERDUTY_API_KEY" \
+            --namespace=observability
+else
+    echo "PagerDuty secret already exists, skipping creation."
+fi
+
+# Add cluster credentials
+# The list is captured first: piping into `grep -q` can trip pipefail when grep
+# exits before argocd has finished writing.
+REGISTERED_CLUSTERS=$(argocd cluster list)
+if ! grep -qF -- "$APP_PLANE_CTX" <<< "$REGISTERED_CLUSTERS"; then
+    gum spin --spinner dot --title "Adding cluster credentials to control plane..." --show-error -- \
+        argocd cluster add "$APP_PLANE_CTX" --server "$ARGOCD_SERVER" --label role=app-plane --label cloud_provider="$(echo "$PLATFORM" | tr '[:upper:]' '[:lower:]')"
+else
+    echo "Cluster credentials for '$APP_PLANE_CTX' already added, skipping."
+fi
+
+# Wait for vmuser secret
+echo "Waiting for vmuser secret..."
+waited=0
+while ! kubectl get secret "${CLUSTER}-vmuser" -n observability --context "$CTRL_PLANE_CTX" &> /dev/null; do
+    if [ "$waited" -ge "$VMUSER_WAIT_TIMEOUT" ]; then
+        echo ""
+        echo "Timed out after ${VMUSER_WAIT_TIMEOUT}s waiting for secret ${CLUSTER}-vmuser." >&2
+        exit 1
+    fi
+    sleep 2
+    waited=$((waited + 2))
+    echo -n "."
+done
+
+echo "vmuser secret created."
+
+# Create vmuser secret
+if ! kubectl get secret vmuser --namespace=observability &> /dev/null; then
+    VMUSER_PASSWORD=$(kubectl get secret "${CLUSTER}-vmuser" -n observability -o jsonpath="{.data.password}" --context "$CTRL_PLANE_CTX" | base64 --decode)
+    gum spin --spinner dot --title "Creating vmuser secret..." --show-error -- \
+        kubectl create secret generic vmuser \
+            --from-literal=password="$VMUSER_PASSWORD" \
+            --namespace=observability
+else
+    echo "vmuser secret already exists, skipping creation."
+fi
+
+echo "Observability stack setup complete for $PLATFORM cluster."
diff --git a/tofu/aws/.terraform.lock.hcl b/tofu/aws/.terraform.lock.hcl deleted file mode 100644 index 7695fb2a..00000000 --- a/tofu/aws/.terraform.lock.hcl +++ /dev/null @@ -1,110 +0,0 @@ -# This file is maintained automatically by "tofu init". -# Manual edits may be lost in future updates.
- -provider "registry.opentofu.org/hashicorp/aws" { - version = "5.34.0" - constraints = ">= 4.0.0, >= 4.9.0, >= 4.33.0, >= 4.36.0, >= 4.47.0, >= 4.57.0, >= 5.0.0, >= 5.20.0" - hashes = [ - "h1:Ofv10Aw1nrPZ4amBcpge3lodQ7acaY7H/DeuUA4EsOU=", - "zh:01ab6b6ae075e2d09d67578af05c325117c40d407f1f3602caa95d31b52218bd", - "zh:18b938e0272e52e18f79eb8e355bb81397d859e7ac98d66fba1c142d142314b6", - "zh:225cad47a8a224bf5fc7ac47bc46746f9143f8ad39a2b2f0b5306bd580a5994a", - "zh:35fcfacd4f8ed71a6f9854eeb862431ca832cb732e97fb02e3ea3f764610db3f", - "zh:4018cd69689857968afe31cf67ef4796b45f08c27012daff56201618621a690b", - "zh:442ec76a21e9a55c3d3e38c5d57087f5c5127326237af10463ec26c2355f3102", - "zh:8417f0a78697223f2a38dd3d1df88d69891a6b2168aabcb4916afd6008cf1609", - "zh:c48b6103112efc02711f046625a60f76912be5f3f590c8bf68d94895c0d45f1c", - "zh:d11f4cec70f382b716241104dabfd1c4cc6b95c836e7a8c469ab64f62a8824a6", - "zh:d52b6d37066685eb3e3a4bd32fcc7bc68aad1e81cbd3240feaa138313c740e81", - ] -} - -provider "registry.opentofu.org/hashicorp/cloudinit" { - version = "2.3.3" - constraints = ">= 2.0.0" - hashes = [ - "h1:M19EHqgAqMHiVYlS3qEQOy+MZWOurTXyUq3kjnMhh+M=", - "zh:132f1782bb198a635892ea4b116fd69ffabcf4b6b11f86c57faf53b19575c23d", - "zh:21e7ab6820990f314de03be87af71cb4bae2409fa18007d11cfa60066a7f924c", - "zh:2a6a71194f3923ba6136c8a17765f505fa3e20624f4cd1078f36bdb92cafbe00", - "zh:2cf69cac676eb20e5f82b1dbb739c30b963fd6010e430e1b0bf3dfedc6554000", - "zh:3c508f6ef48fc8073d2e4ebd1ea1532b52e4d7ac679908d73891e8f4b451a71d", - "zh:5ab08771183c7dd6070ae95be84154540f15c41b34606e55fe87639e0bfddc0c", - "zh:af20410183201bbbe4e13f7fa69f0a57eea5b925e3092036f1aaa2767f1a7516", - "zh:bbb60400a2c17aa31728b348d4f7ba2de8a20b014b0c0658c7ff1f54a4e1f776", - "zh:e8cd5c617707b5e5f78a2dba45e864b7690930f39aa6c84e9455e9f3943cb83c", - "zh:f43b99f6b6d581d2745e4f0cfdeb0425f381c113bebf2cc95c08c8f8c2d6506b", - ] -} - -provider "registry.opentofu.org/hashicorp/helm" { - version = "2.12.1" - constraints = ">= 2.9.0" - hashes = [ 
- "h1:mRK57Pn5YGikn9jT4GyZtB1zf5gvu9ynNbwWq6YuPyA=", - "zh:0349149992646530c33314cb973eba68757606a037017ba47e56db695d4b3afe", - "zh:3138ffe23c481b01419a4a21adf83538efe6e698b421c4a8f7d142b198518709", - "zh:44658e3070405b88fbd76161ecddde62f478dc31aaebee3b93c2f2783a6d45f9", - "zh:5600a3407dfb8b77da7561490157afa8ad505c864a5dd35ed8d678e9ad8378ca", - "zh:6445e359c813ecbb7c2edf722ed0d1f33dfb171b6a7b470f40cf1e24045b7441", - "zh:7973054604c7f5a51600f6e63fa0327d05b29fac2bffd222c21660cbdd2939f9", - "zh:7c59e2d4602ab5d9de0ba8e442ec1fc425c8f143581018d1e7f645298a124f01", - "zh:8c0fb411dd5de664ac5e801d70507781790c4fc196518a56966d66d0963c240c", - "zh:a6a988c91bbf1828a8fc55001f10c7d06c5c53dc718ee7cd6814bdfa2e6652e0", - "zh:b7935d7dacd7e5a91ff9d17cfb04ce88c9100e563fd88487d14519e8d8d8b2e1", - ] -} - -provider "registry.opentofu.org/hashicorp/kubernetes" { - version = "2.25.2" - constraints = ">= 2.10.0, >= 2.20.0" - hashes = [ - "h1:0rrxDjtJb63VSS9npSlikqJGH2L726byPwnP7fd3B/4=", - "zh:38d35c069a7f5a7c360ce2ddf6da7f22fd25290f76697d5ab0adbc9e7cae4db8", - "zh:4b1967b873c6262d8c8f57e8cea2cd029c43912555f688cdff4f824193be4e50", - "zh:57e6cfda60c084bb141ea1d8f0ad5881b1bbda92dfcb090e752b09d2cc153b84", - "zh:5af23ccba04c2fef7cff9cb17ed5c10ac9d89098706244d1be4d2acbf44f2ae2", - "zh:67cd0b02deba8361b2689afe4b2f67e38ca68539a89eb88ceaef7fae6f7e6f0b", - "zh:7b4d1fa9d9612919e29a9424a316af6b4c5de766debd8a823b8ad0f28ac2d9c6", - "zh:b7a9ca5d1a734675de32c9bbdc78e8569a10aef140fe6d176efed393e64e1d71", - "zh:c97777211b3f46ede86fce794ab515185e1145af6a4cd57effdfcddc3cccdcf7", - "zh:e7921548f82f68694ad51974bad7cc6c853134a9851912a04da4cdba34eb6214", - "zh:fa7b37554ed32422a239b707380ae778a8918e4917a76aadd92fe559a353a10c", - ] -} - -provider "registry.opentofu.org/hashicorp/time" { - version = "0.10.0" - constraints = ">= 0.9.0" - hashes = [ - "h1:58xhugSr4hDVmzUqkAuVQIJjJ0E7cyOmnqTM0TQboEs=", - "zh:0997e9e82dbf3b01b783d75e4ad14dfa135d7dea9ef2e6ccd48348ed9dd31c27", - 
"zh:2794dc8a5b79f331ad780b757dd7a04e539551cc8b8c50d25ebeb43994c7fe1d", - "zh:2f5e81ebc5c4d9329c392de67483fb2144d2c0cae4001e6ec2c83e1ab71d62ca", - "zh:5cda8141aeeb594ca7a59e3ada5d9c8d8901a3467c07e263f2c0a4a08170ea53", - "zh:816aaa6eee3a053f29db48bed081e5f3be218336e41d1da385c29592916fb7c7", - "zh:8606f3ce3784927e732516368e69712cf127c5c71334384d6be84da4c555558c", - "zh:8b87a1472271a1dc359c4c1069ba62248c56240009c37598af75e4b888172cd7", - "zh:dc1617df212201ded78038634d5f68b7551b5763361d3a20ab8b5a55640557eb", - "zh:f9b10a794c6d3760cfeb21d3c4db551385a528837dcb523ba46b59f43f1935ac", - "zh:fa51a2259a691ad24ff1426a9b460dfa6d293835544e53915731c858608db024", - ] -} - -provider "registry.opentofu.org/hashicorp/tls" { - version = "4.0.5" - constraints = ">= 3.0.0" - hashes = [ - "h1:ILGm1+RP2+eIDc+YQ+xWgNX7Dcb9cD9OuvJHqUxtjmE=", - "zh:05a7dc3ac92005485714f87541ad6d0d478988b478c5774227a7d39b01660050", - "zh:547e0def44080456169bf77c21037aa6dc9e7f3e644a8f6a2c5fc3e6c15cf560", - "zh:6842b03d050ae1a4f1aaed2a2b1ca707eae84ae45ae492e4bb57c3d48c26e1f1", - "zh:6ced0a9eaaba12377f3a9b08df2fd9b83ae3cb357f859eb6aecf24852f718d9a", - "zh:766bcdf71a7501da73d4805d05764dcb7c848619fa7c04b3b9bd514e5ce9e4aa", - "zh:84cc8617ce0b9a3071472863f43152812e5e8544802653f636c866ef96f1ed34", - "zh:b1939e0d44c89315173b78228c1cf8660a6924604e75ced7b89e45196ce4f45e", - "zh:ced317916e13326766427790b1d8946c4151c4f3b0efd8f720a3bc24abe065fa", - "zh:ec9ff3412cf84ba81ca88328b62c17842b803ef406ae19152c13860b356b259c", - "zh:ff064f0071e98702e542e1ce00c0465b7cd186782fe9ccab8b8830cac0f10dd4", - ] -} diff --git a/tofu/aws/1-bootstrap/.terraform.lock.hcl b/tofu/aws/1-bootstrap/.terraform.lock.hcl new file mode 100644 index 00000000..b9c29d84 --- /dev/null +++ b/tofu/aws/1-bootstrap/.terraform.lock.hcl @@ -0,0 +1,37 @@ +# This file is maintained automatically by "tofu init". +# Manual edits may be lost in future updates. 
+ +provider "registry.opentofu.org/hashicorp/aws" { + version = "5.62.0" + constraints = "5.62.0" + hashes = [ + "h1:osfvuI1zhXvYITS+pLYHD7qVmxS1LiGF4x1nb2meI4g=", + "zh:2cb519ce7f3cbcb88b2e93dd3b3424ad85a347fc0e7429661945da5df8a20fda", + "zh:2fc7ed911cceaa1652d1f4090eaa91e8463aba86873910bccf16601260379886", + "zh:395b32d157adeb92571a0efd230c73bbee01744782a50356fb16e8946bd63ffb", + "zh:43303d36af40a568cd40bd54dc9e8430e18c4a4d78682b459dca8c755c717a0c", + "zh:65b2c6e955deeeffb9d9cd4ed97e8c532a453ba690d0e3d88c740f9036bccc4d", + "zh:a9d09dc9daf33b16894ed7d192ceb4c402261da58cded503a3ffa1dd2373e3fb", + "zh:c5e9f8bc4397c2075b6dc62458be51b93322517affd760c161633d56b0b9a334", + "zh:db0921c091402179edd549f8aa4f12dce18aab09d4302e800c67d6ec6ff88a86", + "zh:e7d13f9c0891446d03c29e4fcd60de633f71bbf1bc9786fca47a0ee356ac979a", + "zh:f128a725dbdbd31b9ed8ea478782152339c9fab4d635485763c8da2a477fe3f6", + ] +} + +provider "registry.opentofu.org/hashicorp/random" { + version = "3.6.2" + hashes = [ + "h1:cwVnVdQqyli6MhRE74KtT70s6MepGHFFQu+oKcbETP4=", + "zh:1f27612f7099441526d8af59f5b4bdcc35f46915df5d243043d7337ea5a3e38a", + "zh:2a58e66502825db8b4b96116c04bd0323bca1cf1f5752bdd8f9c26feb84d3b1e", + "zh:4f0a4fa479e29de0c3c90146fd58799c097f7a55401cb00560dd4e9b1e6fad9d", + "zh:9c93c0fe6ef685513734527e0c8078636b2cc07591427502a7260f4744b1af1d", + "zh:a466ff5219beb77fb3b18a3d7e7fe30e7edd4d95c8e5c87f4f4e3fe3eeb8c2d7", + "zh:ab33e6176d0c757ddb31e40e01a941e6918ad10f7a786c8e8e4f35e5cff81c96", + "zh:b6eabf377a1c12cb3f9ddd97aacdd5b49c1646dc959074124f81d40fcd216d7e", + "zh:ccec5d03d0d1c0f354be299cdd6a417b2700f1a6781df36bcce77246b2f57e50", + "zh:d2a7945eeb691fdd2b1474da76ddc2d1655e2aedbb14b57f06d4f5123d47adf9", + "zh:ed62351f4ad9d1469c6798b77dee5f63b18b29c473620a0046ba3d4f111b621d", + ] +}
diff --git a/tofu/aws/1-bootstrap/backend.tf b/tofu/aws/1-bootstrap/backend.tf
new file mode 100644
index 00000000..2758d2b8
--- /dev/null
+++ b/tofu/aws/1-bootstrap/backend.tf
@@ -0,0 +1,9 @@
+terraform {
+  # backend "local" {
+  #   path = "bootstrap.tfstate"
+  # }
+
+  backend "s3" {
+
+  }
+}
diff --git a/tofu/aws/1-bootstrap/main.tf b/tofu/aws/1-bootstrap/main.tf
new file mode 100644
index 00000000..0f0d8959
--- /dev/null
+++ b/tofu/aws/1-bootstrap/main.tf
@@ -0,0 +1,69 @@
+# Bootstrap stage: creates the seed AWS account under an existing OU and, inside
+# that account, a versioned S3 bucket named "tf-state-<suffix>".
+
+provider "aws" {
+  region = var.region
+}
+
+# random suffix
+resource "random_id" "suffix" {
+  byte_length = 4
+}
+
+# The OU is looked up, not created; it must already exist under ou_parent_id.
+data "aws_organizations_organizational_unit" "ou" {
+  name      = var.ou_name
+  parent_id = var.ou_parent_id
+}
+
+resource "aws_organizations_account" "account" {
+  name              = var.account_name
+  email             = var.account_email
+  parent_id         = data.aws_organizations_organizational_unit.ou.id
+  role_name         = "OrganizationAccountAccessRole"
+  close_on_deletion = false
+}
+
+# Second provider configuration that assumes the access role of the new account.
+# The role name in the ARN must stay in sync with role_name above.
+provider "aws" {
+  alias  = "seed-account"
+  region = var.region
+  assume_role {
+    role_arn = "arn:aws:iam::${aws_organizations_account.account.id}:role/OrganizationAccountAccessRole"
+  }
+}
+
+resource "aws_s3_bucket" "terraform_state" {
+  bucket = "tf-state-${random_id.suffix.hex}"
+
+  provider = aws.seed-account
+}
+
+resource "aws_s3_bucket_versioning" "example" {
+  bucket = aws_s3_bucket.terraform_state.id
+
+  versioning_configuration {
+    status = "Enabled"
+  }
+
+  provider = aws.seed-account
+}
+
+# Default retention: new object versions are locked in COMPLIANCE mode for 14 days.
+resource "aws_s3_bucket_object_lock_configuration" "state_lock" {
+  bucket = aws_s3_bucket.terraform_state.id
+
+  rule {
+    default_retention {
+      mode = "COMPLIANCE"
+      days = 14
+    }
+  }
+
+  # S3 rejects an Object Lock configuration until versioning is enabled, so make
+  # the ordering explicit instead of leaving the two resources to race.
+  depends_on = [aws_s3_bucket_versioning.example]
+
+  provider = aws.seed-account
+}
diff --git a/tofu/aws/1-bootstrap/outputs.tf b/tofu/aws/1-bootstrap/outputs.tf
new file mode 100644
index 00000000..b32da75c
--- /dev/null
+++ b/tofu/aws/1-bootstrap/outputs.tf
@@ -0,0 +1,3 @@
+output "ou_id" {
+  value = data.aws_organizations_organizational_unit.ou.id
+}
\ No newline at end of file
diff --git a/tofu/aws/1-bootstrap/providers.tf b/tofu/aws/1-bootstrap/providers.tf
new file mode 100644
index 00000000..f2d23808
--- /dev/null
+++ b/tofu/aws/1-bootstrap/providers.tf
@@ -0,0 +1,8 @@
+terraform {
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "5.62.0"
+    }
+  }
+}
diff --git a/tofu/aws/1-bootstrap/variables.tf b/tofu/aws/1-bootstrap/variables.tf
new file mode 100644
index 00000000..bc4aa4fa
--- /dev/null
+++ b/tofu/aws/1-bootstrap/variables.tf
@@ -0,0 +1,26 @@
+
+
+variable "ou_name" {
+  type        = string
+  description = "The name of the existing OU that will contain the account."
+}
+
+variable "ou_parent_id" {
+  type        = string
+  description = "The ID of the parent OU."
+}
+
+variable "account_name" {
+  type        = string
+  description = "The name of the account to create."
+}
+
+variable "account_email" {
+  type        = string
+  description = "The email of the account to create."
+}
+
+variable "region" {
+  type        = string
+  description = "The region to create the bucket in."
+}
diff --git a/tofu/aws/2-org/.terraform.lock.hcl b/tofu/aws/2-org/.terraform.lock.hcl new file mode 100644 index 00000000..2ec19c90 --- /dev/null +++ b/tofu/aws/2-org/.terraform.lock.hcl @@ -0,0 +1,20 @@ +# This file is maintained automatically by "tofu init". +# Manual edits may be lost in future updates.
+ +provider "registry.opentofu.org/hashicorp/aws" { + version = "5.62.0" + constraints = "5.62.0" + hashes = [ + "h1:osfvuI1zhXvYITS+pLYHD7qVmxS1LiGF4x1nb2meI4g=", + "zh:2cb519ce7f3cbcb88b2e93dd3b3424ad85a347fc0e7429661945da5df8a20fda", + "zh:2fc7ed911cceaa1652d1f4090eaa91e8463aba86873910bccf16601260379886", + "zh:395b32d157adeb92571a0efd230c73bbee01744782a50356fb16e8946bd63ffb", + "zh:43303d36af40a568cd40bd54dc9e8430e18c4a4d78682b459dca8c755c717a0c", + "zh:65b2c6e955deeeffb9d9cd4ed97e8c532a453ba690d0e3d88c740f9036bccc4d", + "zh:a9d09dc9daf33b16894ed7d192ceb4c402261da58cded503a3ffa1dd2373e3fb", + "zh:c5e9f8bc4397c2075b6dc62458be51b93322517affd760c161633d56b0b9a334", + "zh:db0921c091402179edd549f8aa4f12dce18aab09d4302e800c67d6ec6ff88a86", + "zh:e7d13f9c0891446d03c29e4fcd60de633f71bbf1bc9786fca47a0ee356ac979a", + "zh:f128a725dbdbd31b9ed8ea478782152339c9fab4d635485763c8da2a477fe3f6", + ] +}
diff --git a/tofu/aws/2-org/backend.tf b/tofu/aws/2-org/backend.tf
new file mode 100644
index 00000000..c38c6138
--- /dev/null
+++ b/tofu/aws/2-org/backend.tf
@@ -0,0 +1,5 @@
+terraform {
+  backend "s3" {
+
+  }
+}
diff --git a/tofu/aws/2-org/main.tf b/tofu/aws/2-org/main.tf
new file mode 100644
index 00000000..a975e82c
--- /dev/null
+++ b/tofu/aws/2-org/main.tf
@@ -0,0 +1,21 @@
+# Create Workloads OU
+resource "aws_organizations_organizational_unit" "workloads" {
+  name      = var.workloads_ou_name
+  parent_id = var.workloads_ou_parent_id
+}
+
+# Application plane account, created inside the Workloads OU. Referencing the OU
+# id in parent_id already orders the two resources, so no depends_on is needed.
+resource "aws_organizations_account" "account" {
+  name              = var.app_plane_account_name
+  email             = var.app_plane_account_email
+  parent_id         = aws_organizations_organizational_unit.workloads.id
+  role_name         = "OrganizationAccountAccessRole"
+  close_on_deletion = false
+
+  lifecycle {
+    # NOTE(review): presumably ignored so that out-of-band renames and the
+    # write-only role_name do not show up as drift - confirm.
+    ignore_changes = [role_name, name]
+  }
+}
diff --git a/tofu/aws/2-org/providers.tf b/tofu/aws/2-org/providers.tf
new file mode 100644
index 00000000..e69de29b
diff --git a/tofu/aws/2-org/variables.tf b/tofu/aws/2-org/variables.tf
new file mode 100644
index 00000000..f49abaf2
--- /dev/null
+++ b/tofu/aws/2-org/variables.tf
@@ -0,0 +1,19 @@
+variable "workloads_ou_name" {
+  type        = string
+  description = "Name of the OU for workloads"
+}
+
+variable "workloads_ou_parent_id" {
+  type        = string
+  description = "Parent OU ID for workloads"
+}
+
+variable "app_plane_account_name" {
+  type        = string
+  description = "Name of the application plane account"
+}
+
+variable "app_plane_account_email" {
+  type        = string
+  description = "Email of the application plane account"
+}
diff --git a/tofu/aws/3-application_plane/.terraform.lock.hcl b/tofu/aws/3-application_plane/.terraform.lock.hcl new file mode 100644 index 00000000..103e75a4 --- /dev/null +++ b/tofu/aws/3-application_plane/.terraform.lock.hcl @@ -0,0 +1,37 @@ +# This file is maintained automatically by "tofu init". +# Manual edits may be lost in future updates. + +provider "registry.opentofu.org/hashicorp/aws" { + version = "5.62.0" + constraints = "5.62.0" + hashes = [ + "h1:osfvuI1zhXvYITS+pLYHD7qVmxS1LiGF4x1nb2meI4g=", + "zh:2cb519ce7f3cbcb88b2e93dd3b3424ad85a347fc0e7429661945da5df8a20fda", + "zh:2fc7ed911cceaa1652d1f4090eaa91e8463aba86873910bccf16601260379886", + "zh:395b32d157adeb92571a0efd230c73bbee01744782a50356fb16e8946bd63ffb", + "zh:43303d36af40a568cd40bd54dc9e8430e18c4a4d78682b459dca8c755c717a0c", + "zh:65b2c6e955deeeffb9d9cd4ed97e8c532a453ba690d0e3d88c740f9036bccc4d", + "zh:a9d09dc9daf33b16894ed7d192ceb4c402261da58cded503a3ffa1dd2373e3fb", + "zh:c5e9f8bc4397c2075b6dc62458be51b93322517affd760c161633d56b0b9a334", + "zh:db0921c091402179edd549f8aa4f12dce18aab09d4302e800c67d6ec6ff88a86", + "zh:e7d13f9c0891446d03c29e4fcd60de633f71bbf1bc9786fca47a0ee356ac979a", + "zh:f128a725dbdbd31b9ed8ea478782152339c9fab4d635485763c8da2a477fe3f6", + ] +} + +provider "registry.opentofu.org/hashicorp/random" { + version = "3.6.3" + hashes = [ + "h1:32/UZofQoXk8zPj9vpIDiSEmERA3Mx2VPvk1lHTTHvw=", + "zh:1bfd2e54b4eee8c761a40b6d99d45880b3a71abc18a9a7a5319204da9c8363b2", + "zh:21a15ac74adb8ba499aab989a4248321b51946e5431219b56fc827e565776714", + "zh:221acfac3f7a5bcd6cb49f79a1fca99da7679bde01017334bad1f951a12d85ba", + "zh:3026fcdc0c1258e32ab519df878579160b1050b141d6f7883b39438244e08954", + "zh:50d07a7066ea46873b289548000229556908c3be746059969ab0d694e053ee4c", + "zh:54280cdac041f2c2986a585f62e102bc59ef412cad5f4ebf7387c2b3a357f6c0", + "zh:632adf40f1f63b0c5707182853c10ae23124c00869ffff05f310aef2ed26fcf3", + "zh:b8c2876cce9a38501d14880a47e59a5182ee98732ad7e576e9a9ce686a46d8f5", + "zh:f27e6995e1e9fe3914a2654791fc8d67cdce44f17bf06e614ead7dfd2b13d3ae", + "zh:f423f2b7e5c814799ad7580b5c8ae23359d8d342264902f821c357ff2b3c6d3d", + ] +}
diff --git a/tofu/aws/3-application_plane/backend.tf b/tofu/aws/3-application_plane/backend.tf
new file mode 100644
index 00000000..c38c6138
--- /dev/null
+++ b/tofu/aws/3-application_plane/backend.tf
@@ -0,0 +1,5 @@
+terraform {
+  backend "s3" {
+
+  }
+}
diff --git a/tofu/aws/3-application_plane/main.tf b/tofu/aws/3-application_plane/main.tf
new file mode 100644
index 00000000..f778d61b
--- /dev/null
+++ b/tofu/aws/3-application_plane/main.tf
@@ -0,0 +1,116 @@
+# Application plane stage: locates the app plane account inside the workloads OU
+# and creates two private S3 buckets in it, one whose policy grants write access
+# to cloudtrail.amazonaws.com and one that grants it to delivery.logs.amazonaws.com.
+
+data "aws_organizations_organizational_unit" "ou" {
+  name      = var.workloads_ou_name
+  parent_id = var.workloads_ou_parent_id
+}
+
+data "aws_organizations_organizational_unit_child_accounts" "children" {
+  parent_id = data.aws_organizations_organizational_unit.ou.id
+}
+
+locals {
+  # Child accounts of the OU keyed by account name; falls back to an empty map
+  # if the map cannot be built.
+  workload_accounts = try(
+    tomap({
+      for account in data.aws_organizations_organizational_unit_child_accounts.children.accounts : account.name => account
+    }),
+    {}
+  )
+  app_plane_account = local.workload_accounts[var.app_plane_account_name]
+
+  # Account name lowercased with spaces replaced by dashes; shared by both bucket names.
+  app_plane_account_slug = lower(replace(var.app_plane_account_name, " ", "-"))
+
+  app_plane_trail_bucket_name       = nonsensitive("${local.app_plane_account_slug}-cloudtrail-${random_bytes.suffix.hex}")
+  app_plane_access_logs_bucket_name = nonsensitive("${local.app_plane_account_slug}-access-logs-${random_bytes.suffix.hex}")
+}
+
+resource "random_bytes" "suffix" {
+  length = 4
+}
+
+# Provider configuration that assumes the access role of the app plane account.
+provider "aws" {
+  alias = "app-plane-account"
+  assume_role {
+    role_arn = "arn:aws:iam::${local.app_plane_account.id}:role/OrganizationAccountAccessRole"
+  }
+}
+
+# Bucket policy to allow CloudTrail to write logs to the bucket
+data "aws_iam_policy_document" "cloudtrail_bucket_policy" {
+  statement {
+    effect    = "Allow"
+    actions   = ["s3:GetBucketAcl"]
+    resources = ["arn:aws:s3:::${local.app_plane_trail_bucket_name}"]
+    principals {
+      type        = "Service"
+      identifiers = ["cloudtrail.amazonaws.com"]
+    }
+  }
+
+  statement {
+    effect    = "Allow"
+    actions   = ["s3:PutObject"]
+    resources = ["arn:aws:s3:::${local.app_plane_trail_bucket_name}/*"]
+    principals {
+      type        = "Service"
+      identifiers = ["cloudtrail.amazonaws.com"]
+    }
+  }
+}
+
+# NOTE(review): no version is pinned for this registry module - confirm and pin.
+module "aws-s3-bucket" {
+  source                   = "trussworks/s3-private-bucket/aws"
+  bucket                   = local.app_plane_trail_bucket_name
+  use_account_alias_prefix = false
+  enable_analytics         = false
+
+  custom_bucket_policy = data.aws_iam_policy_document.cloudtrail_bucket_policy.json
+
+  providers = {
+    aws = aws.app-plane-account
+  }
+}
+
+# Bucket policy to allow delivery.logs.amazonaws.com to write to the access logs bucket
+data "aws_iam_policy_document" "access_logs_bucket_policy" {
+  statement {
+    effect    = "Allow"
+    actions   = ["s3:GetBucketAcl"]
+    resources = ["arn:aws:s3:::${local.app_plane_access_logs_bucket_name}"]
+    principals {
+      type        = "Service"
+      identifiers = ["delivery.logs.amazonaws.com"]
+    }
+  }
+
+  statement {
+    effect    = "Allow"
+    actions   = ["s3:PutObject"]
+    resources = ["arn:aws:s3:::${local.app_plane_access_logs_bucket_name}/*"]
+    principals {
+      type        = "Service"
+      identifiers = ["delivery.logs.amazonaws.com"]
+    }
+  }
+}
+
+# NOTE(review): no version is pinned for this registry module - confirm and pin.
+module "aws-s3-bucket-access-logs" {
+  source                   = "trussworks/s3-private-bucket/aws"
+  bucket                   = local.app_plane_access_logs_bucket_name
+  use_account_alias_prefix = false
+  enable_analytics         = false
+
+  custom_bucket_policy = data.aws_iam_policy_document.access_logs_bucket_policy.json
+
+  providers = {
+    aws = aws.app-plane-account
+  }
+}
diff --git a/tofu/aws/3-application_plane/outputs.tf b/tofu/aws/3-application_plane/outputs.tf
new file mode 100644
index 00000000..e69de29b
diff --git a/tofu/aws/3-application_plane/variables.tf b/tofu/aws/3-application_plane/variables.tf
new file mode 100644
index 00000000..226259a4
--- /dev/null
+++ b/tofu/aws/3-application_plane/variables.tf
@@ -0,0 +1,28 @@
+variable "workloads_ou_name" {
+  type        = string
+  description = "Name of the OU for workloads"
+}
+
+variable "workloads_ou_parent_id" {
+  type        = string
+  description = "Parent OU ID for workloads"
+}
+
+variable "app_plane_account_name" {
+  type        = string
+  description = "Name of the application plane account"
+}
+
+# NOTE(review): not referenced by this stage's main.tf - confirm it is still needed.
+variable "cloudtrail_retention_days" {
+  type        = number
+  description = "Number of days to retain CloudTrail logs"
+  default     = 90
+}
+
+# NOTE(review): not referenced by this stage's main.tf - confirm it is still needed.
+variable "app_plane_lb_bucket_access_allow_list" {
+  type        = list(string)
+  description = "List of ARNs of load balancers that are allowed to write to the access logs bucket"
+  default     = []
+}
diff --git a/tofu/aws/main.tf b/tofu/aws/main.tf deleted file mode 100644 index f9cc6640..00000000 --- a/tofu/aws/main.tf +++ /dev/null @@ -1,212 +0,0 @@ -provider "aws" { - region = var.region - assume_role { - role_arn = var.assume_role_arn - } -} -data "aws_caller_identity" "current" { -} - -data "aws_availability_zones" "available" { -} - - -locals { - vpc_cidr = "10.0.0.0/16" - azs = slice(data.aws_availability_zones.available.names, 0, 3) - - tags = { - customer = var.name - } -} - -################################################################################ -# Cluster -################################################################################ - -module "eks" { - source = "terraform-aws-modules/eks/aws" - version = "~> 19.21" - - cluster_name = var.name - cluster_version = var.k8s_version - cluster_endpoint_public_access = true - - aws_auth_accounts = [ -
data.aws_caller_identity.current.account_id - ] - - aws_auth_roles = [ - var.eks_auth_role - ] - - vpc_id = module.vpc.vpc_id - subnet_ids = module.vpc.private_subnets - - eks_managed_node_groups = { - initial = { - instance_types = [var.k8s_instance_type] - - min_size = var.k8s_node_min_count - max_size = var.k8s_node_max_count - desired_size = var.k8s_node_count - } - } - - tags = local.tags - -} - -################################################################################ -# EKS Blueprints Addons -################################################################################ - -module "eks_blueprints_addons" { - source = "aws-ia/eks-blueprints-addons/aws" - version = "~> 1.0" - - cluster_name = module.eks.cluster_name - cluster_endpoint = module.eks.cluster_endpoint - cluster_version = module.eks.cluster_version - oidc_provider_arn = module.eks.oidc_provider_arn - - create_delay_dependencies = [for group in module.eks.eks_managed_node_groups : group.node_group_arn] - - eks_addons = { - aws-ebs-csi-driver = { - service_account_role_arn = module.ebs_csi_driver_irsa.iam_role_arn - } - coredns = {} - vpc-cni = {} - kube-proxy = {} - } - - tags = local.tags - -} - -data "aws_iam_policy_document" "assume_role" { - statement { - effect = "Allow" - - principals { - type = "Service" - identifiers = ["pods.eks.amazonaws.com"] - } - - actions = [ - "sts:AssumeRole", - "sts:TagSession" - ] - } -} - -resource "aws_iam_role" "falkordb_backup_role" { - name = "${var.name}-falkordb_backup_role" - assume_role_policy = data.aws_iam_policy_document.assume_role.json -} - -resource "aws_iam_role_policy_attachment" "falkordb_backup_role_policy_attachment" { - policy_arn = "arn:aws:iam::aws:policy/AmazonS3FullAccess" - role = aws_iam_role.falkordb_backup_role.name -} - -resource "aws_eks_pod_identity_association" "falkordb_backup_association" { - cluster_name = module.eks.cluster_name - namespace = "falkordb-backup" - service_account = "default" - role_arn = 
aws_iam_role.falkordb_backup_role.arn -} - -resource "aws_eks_addon" "eks_pod_identity" { - cluster_name = module.eks.cluster_name - addon_name = "eks-pod-identity-agent" - addon_version = "v1.1.0-eksbuild.1" - resolve_conflicts_on_update = "OVERWRITE" -} - -################################################################################ -# Supporting Resources -################################################################################ - -module "vpc" { - source = "terraform-aws-modules/vpc/aws" - version = "~> 5.0" - - name = var.name - cidr = local.vpc_cidr - - azs = local.azs - private_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 4, k)] - public_subnets = [for k, v in local.azs : cidrsubnet(local.vpc_cidr, 8, k + 48)] - - enable_nat_gateway = true - single_nat_gateway = true - - public_subnet_tags = { - "kubernetes.io/role/elb" = 1 - } - - private_subnet_tags = { - "kubernetes.io/role/internal-elb" = 1 - } - - tags = local.tags -} - -#tfsec:ignore:* -module "falkordb_backup_s3_bucket" { - source = "terraform-aws-modules/s3-bucket/aws" - version = "~> 3.0" - - bucket = "${var.name}-backup" - - attach_deny_insecure_transport_policy = true - attach_require_latest_tls_policy = true - - acl = "private" - - block_public_acls = true - block_public_policy = true - ignore_public_acls = true - restrict_public_buckets = true - - control_object_ownership = true - object_ownership = "BucketOwnerPreferred" - - versioning = { - status = true - mfa_delete = false - } - - server_side_encryption_configuration = { - rule = { - apply_server_side_encryption_by_default = { - sse_algorithm = "AES256" - } - } - } - - force_destroy = true - - tags = local.tags - -} - -module "ebs_csi_driver_irsa" { - source = "terraform-aws-modules/iam/aws//modules/iam-role-for-service-accounts-eks" - version = "~> 5.20" - - role_name_prefix = module.eks.cluster_name - - attach_ebs_csi_policy = true - - oidc_providers = { - main = { - provider_arn = module.eks.oidc_provider_arn - 
namespace_service_accounts = ["kube-system:ebs-csi-controller-sa"] - } - } - - tags = local.tags -} diff --git a/tofu/aws/outputs.tf b/tofu/aws/outputs.tf deleted file mode 100644 index 5faeddb6..00000000 --- a/tofu/aws/outputs.tf +++ /dev/null @@ -1,37 +0,0 @@ -output "configure_kubectl" { - description = "Configure kubectl: make sure you're logged in with the correct AWS profile and run the following command to update your kubeconfig" - value = "aws eks --region ${var.region} update-kubeconfig --name ${module.eks.cluster_name}" -} -output "falkordb_eks_cluster_name" { - description = "EKS cluster name" - value = module.eks.cluster_name -} -output "falkordb_s3_backup_name" { - description = "Backup bucket name" - value = module.falkordb_backup_s3_bucket.s3_bucket_id -} - -output "falkordb_eks_cluster_endpoint" { - description = "EKS cluster endpoint" - value = module.eks.cluster_endpoint -} - -output "falkordb_eks_cluster_certificate_autority" { - description = "EKS cluster certificate autority" - value = module.eks.cluster_certificate_authority_data -} - -output "falkordb_eks_cluster_role_arn" { - description = "EKS cluster role ARN" - value = module.eks.cluster_iam_role_arn -} - -output "falkordb_eks_cluster_oidc_issuer_url" { - description = "EKS cluster OIDC issuer URL" - value = module.eks.cluster_oidc_issuer_url -} - -output "falkordb_eks_cluster_oidc_issuer_arn" { - description = "EKS cluster OIDC issuer ARN" - value = module.eks.oidc_provider_arn -} \ No newline at end of file diff --git a/tofu/aws/variables.tf b/tofu/aws/variables.tf deleted file mode 100644 index be374076..00000000 --- a/tofu/aws/variables.tf +++ /dev/null @@ -1,36 +0,0 @@ - -variable "name" { - type = string -} - -variable "region" { - type = string -} - -variable "assume_role_arn" { - type = string -} - -variable "eks_auth_role" { - type = string -} - -variable "k8s_version" { - type = string -} - -variable "k8s_instance_type" { - type = string -} - -variable "k8s_node_count" { - type 
= number -} - -variable "k8s_node_min_count" { - type = number -} - -variable "k8s_node_max_count" { - type = number -} diff --git a/tofu/gcp/bootstrap/seed_project/main.tf b/tofu/gcp/bootstrap/seed_project/main.tf index d10080d0..1d074baf 100644 --- a/tofu/gcp/bootstrap/seed_project/main.tf +++ b/tofu/gcp/bootstrap/seed_project/main.tf @@ -30,7 +30,7 @@ module "bootstrap" { create_terraform_sa = true state_bucket_name = var.state_bucket_name_prefix - + } locals { diff --git a/tofu/gcp/observability_stack/control_plane/README b/tofu/gcp/observability_stack/control_plane/README new file mode 100644 index 00000000..3495a7e0 --- /dev/null +++ b/tofu/gcp/observability_stack/control_plane/README @@ -0,0 +1,12 @@ +# Observability Stack - Control Plane + +Defines a GKE cluster that will centralize observability on all customer deployments. + +The control plane for the observability stack is composed of the following components: + +- GKE Cluster + - ArgoCD: Sync all observability components in all clusters + - VictoriaMetrics: Store and query metrics across all clusters + - Grafana: Visualize metrics + - Grafana Ingress: Expose Grafana to the internet + - VMAuth Ingress: Proxy VictoriaMetrics requests diff --git a/tofu/gcp/observability_stack/control_plane/infra/.terraform.lock.hcl b/tofu/gcp/observability_stack/control_plane/infra/.terraform.lock.hcl new file mode 100644 index 00000000..dcc97965 --- /dev/null +++ b/tofu/gcp/observability_stack/control_plane/infra/.terraform.lock.hcl @@ -0,0 +1,74 @@ +# This file is maintained automatically by "tofu init". +# Manual edits may be lost in future updates. 
+ +provider "registry.opentofu.org/hashicorp/google" { + version = "5.45.0" + constraints = ">= 3.33.0, >= 3.53.0, >= 3.83.0, >= 4.25.0, >= 4.64.0, >= 5.0.0, 5.45.0, < 6.0.0, < 7.0.0" + hashes = [ + "h1:i+v9EZbFRgLj1/Q+j71Z/V79ghSJnINQFq+uZEzDvTg=", + "zh:3abbc9944211da9027c69dde78b00430a7e2171ceb0c725c5b0d095f3f47f116", + "zh:42e4f1ccde4798b0ef769931f64fdc72ee6b6da33e560d697bd11ce2a94a973a", + "zh:56bb20c9ecfb4f8bd54b0673ec98798fbcebd9523503715ab7717160f3ed443d", + "zh:5a8894c3e568c851bb48ec8220850625fdb9c4ad645d847a3edcf66698f966e5", + "zh:66f964b2602210c98e9eb3621ad7dcfb0c0cc41fea5b85fdada9dfbf83d473ee", + "zh:9e9d23ef67272aee885797602dc7d58040d7ea9c7d3203ed9b07d94b4ff2d99e", + "zh:a28df1a130307ed6c5da5b0724bd31f0c12bd523839b7898f69436c02eadf367", + "zh:bf8a3de01d287c5a4277afd3108c95415a69677a93f26959a424e31c8bafc511", + "zh:c24d8805ecb92a605288204dd8a2ba209d590f872a6ca83e0bc204af0cc34cd4", + "zh:ec3face7efdef260965f9d7380c38ddd36c090adb775a09ab12ea7bd69c84897", + ] +} + +provider "registry.opentofu.org/hashicorp/google-beta" { + version = "6.20.0" + constraints = ">= 4.64.0, < 7.0.0" + hashes = [ + "h1:aLZkZyKHdOPUapzl9NSIv6QCcwf6onny86wxr3MUrGg=", + "zh:088f8b7b1c17df88a8e86a8a5fba02c8150f03a31e8eb47de7eeba2228a79dae", + "zh:1bf6a798f1d5be3019d5387c792940c324d72adf8020285561eea6428cad9d60", + "zh:2f5b18f40fb75b81dd51463510beccbea20bf9c629a98f675da3d80bf3cab305", + "zh:757b7b6b2248dcdbeb5a4e95c53387115f5a7cbb433e8fdfbc2be2f95817a82b", + "zh:7d0b2068ed7996d89bd00aec6e0311e864b73ce90b33584a56b4fdda6c3181e5", + "zh:82041f7ee70a4d20ed122acfd22ef22d9aa05a06ac61aba6414c4361da56de92", + "zh:8abf8bee31408c40514ee27ab4dc750f608e5f7f07194886d377cd3078758c90", + "zh:8b44ffa12cf0fec3a24accd0f8d25a877b857136e4d91388571845505022fc05", + "zh:a9045e5d9c8cc55742c4e0597d9decb32094e434603ab3773b95b771aeb2891f", + "zh:ccbfa676b40dba46163ff26b918ceb501bdeedd72e3808b3caf46a5adde9818a", + ] +} + +provider "registry.opentofu.org/hashicorp/kubernetes" { + version = "2.35.1" + 
constraints = "~> 2.10" + hashes = [ + "h1:B2Z0iwnPv88NPJuJSswPo87ZCqXmVMv/G/tbfwEt/EI=", + "zh:0a569918d9e81755bdacb2380e70ed304c442e957a029984cbcd9ec88e5d3635", + "zh:1d4d1241cf51d7d4a036c774add1384bb1ba9ca16146334d17c730e1b41ad3e0", + "zh:243219f415f5d8caf32a4e6b6bf596c11cf7db5501ccb4ae77cc0b084bb5d108", + "zh:2f3a33cba73918adc6f580c76b252881f22beb75277df8ca26a01eb5411348f9", + "zh:3b5247f69e72d1e94ac965fa570f448436cedb278f3f29836f6a345aa1bbd5b6", + "zh:4206bca7bf30708e235535af50529565b14f30262dc43142153a1774ee5086af", + "zh:490c80454b8808bb937498aea98e4076a74887446b05feb6e200015613b5e065", + "zh:5e39824289f7b29711681bce98fbb6c27ed221b071a8c78fd0de7f6c2dae4371", + "zh:a7bf7892217bdb0464664f62485d89d014874b0dfb564e99c364fc6dd20c6a3b", + "zh:e8251170bad1c3e2d9c22d0f4dae7239f1a364f05732f7dff5c8e4ec76a95c5a", + ] +} + +provider "registry.opentofu.org/hashicorp/random" { + version = "3.6.3" + constraints = ">= 2.1.0" + hashes = [ + "h1:ohM08k4QVd81oVSJnFI53wJjPcH23XlYG4WslS9og2Q=", + "zh:1bfd2e54b4eee8c761a40b6d99d45880b3a71abc18a9a7a5319204da9c8363b2", + "zh:21a15ac74adb8ba499aab989a4248321b51946e5431219b56fc827e565776714", + "zh:221acfac3f7a5bcd6cb49f79a1fca99da7679bde01017334bad1f951a12d85ba", + "zh:3026fcdc0c1258e32ab519df878579160b1050b141d6f7883b39438244e08954", + "zh:50d07a7066ea46873b289548000229556908c3be746059969ab0d694e053ee4c", + "zh:54280cdac041f2c2986a585f62e102bc59ef412cad5f4ebf7387c2b3a357f6c0", + "zh:632adf40f1f63b0c5707182853c10ae23124c00869ffff05f310aef2ed26fcf3", + "zh:b8c2876cce9a38501d14880a47e59a5182ee98732ad7e576e9a9ce686a46d8f5", + "zh:f27e6995e1e9fe3914a2654791fc8d67cdce44f17bf06e614ead7dfd2b13d3ae", + "zh:f423f2b7e5c814799ad7580b5c8ae23359d8d342264902f821c357ff2b3c6d3d", + ] +} diff --git a/tofu/gcp/observability_stack/control_plane/infra/backend.tf b/tofu/gcp/observability_stack/control_plane/infra/backend.tf new file mode 100644 index 00000000..9f9728d1 --- /dev/null +++ b/tofu/gcp/observability_stack/control_plane/infra/backend.tf @@ 
-0,0 +1,5 @@ +terraform { + backend "gcs" { + prefix = "observability_stack_control_plane_infra" + } +} diff --git a/tofu/gcp/observability_stack/control_plane/infra/main.tf b/tofu/gcp/observability_stack/control_plane/infra/main.tf new file mode 100644 index 00000000..b26ca099 --- /dev/null +++ b/tofu/gcp/observability_stack/control_plane/infra/main.tf @@ -0,0 +1,208 @@ +provider "google" { + project = var.project_id + region = var.region +} + +module "vpc" { + source = "terraform-google-modules/network/google" + version = "~> 9.0" + + project_id = var.project_id + + network_name = "observability-stack-network" + routing_mode = "REGIONAL" + auto_create_subnetworks = false + + subnets = [{ + subnet_name = "observability-stack-subnet" + subnet_region = var.region + subnet_ip = var.ip_range_subnet + subnet_private_access = true + }] + + secondary_ranges = { + "observability-stack-subnet" = [{ + range_name = "pods" + ip_cidr_range = var.ip_range_pods + }, + { + range_name = "services" + ip_cidr_range = var.ip_range_services + }], + } + +} + +resource "google_compute_router" "router" { + name = "observability-stack-router" + region = var.region + project = var.project_id + + network = module.vpc.network_name + + bgp { + asn = 64514 + } +} + +resource "google_compute_router_nat" "nat" { + name = "observability-stack-nat" + region = var.region + project = var.project_id + + router = google_compute_router.router.name + + nat_ip_allocate_option = "AUTO_ONLY" + source_subnetwork_ip_ranges_to_nat = "ALL_SUBNETWORKS_ALL_IP_RANGES" + +} + +# Reserve premium IP Address for the Grafana Load Balancer +module "lb_ip" { + source = "terraform-google-modules/address/google" + version = "~> 3.2" + + project_id = var.project_id + region = var.region + + global = false + address_type = "EXTERNAL" + network_tier = "PREMIUM" + + names = ["falkordb-grafana-ip"] +} + +resource "random_string" "cluster_suffix" { + keepers = { + project_id = var.project_id + } + + upper = false + special = 
false + lower = true + + length = 4 +} + +module "gke" { + source = "terraform-google-modules/kubernetes-engine/google//modules/private-cluster" + version = "~> 29.0.0" + project_id = var.project_id + name = "observability-stack-${random_string.cluster_suffix.result}" + region = var.region + network = module.vpc.network_name + subnetwork = "observability-stack-subnet" + ip_range_pods = "pods" + ip_range_services = "services" + regional = true + create_service_account = true + service_account_name = "gke-obs-${random_string.cluster_suffix.result}-nodes-sa" + remove_default_node_pool = true + gce_pd_csi_driver = true + network_policy = false + monitoring_enable_managed_prometheus = false + enable_cost_allocation = false + horizontal_pod_autoscaling = false + filestore_csi_driver = false + disable_legacy_metadata_endpoints = false + deletion_protection = false + enable_private_endpoint = false + enable_private_nodes = true + http_load_balancing = true + + default_max_pods_per_node = 110 + + monitoring_enabled_components = ["SYSTEM_COMPONENTS"] + + security_posture_mode = "BASIC" + security_posture_vulnerability_mode = "VULNERABILITY_BASIC" + + node_pools = [ + { + name = "default-pool" + machine_type = "e2-medium" + disk_size_gb = 30 + min_count = 0 + max_count = 100 + image_type = "COS_CONTAINERD" + initial_node_count = 0 + }, + { + name = "observability-resources" + machine_type = "e2-standard-2" + disk_size_gb = 30 + min_count = 0 + max_count = 20 + image_type = "COS_CONTAINERD" + initial_node_count = 0 + + }, + ] +} + +# Public node pool +resource "google_container_node_pool" "public" { + project = var.project_id + name = "public-pool" + location = var.region + cluster = module.gke.name + node_count = 0 + + node_config { + machine_type = "e2-standard-2" + disk_size_gb = 30 + image_type = "COS_CONTAINERD" + service_account = module.gke.service_account + labels = { + "node_pool" = "public-pool" + } + } + + autoscaling { + min_node_count = 0 + max_node_count = 220 + 
+ } + network_config { + enable_private_nodes = false + } + +} + +# Storage bucket for metrics +resource "google_storage_bucket" "metrics_bucket" { + name = "falkordb-observability-metrics" + location = var.region + project = var.project_id + force_destroy = true + public_access_prevention = "enforced" + + lifecycle_rule { + action { + type = "SetStorageClass" + storage_class = "NEARLINE" + } + condition { + age = 30 + } + } + + lifecycle_rule { + action { + type = "SetStorageClass" + storage_class = "COLDLINE" + } + condition { + age = 90 + } + } + + lifecycle_rule { + action { + type = "Delete" + } + condition { + age = 365 + } + } +} diff --git a/tofu/gcp/observability_stack/control_plane/infra/outputs.tf b/tofu/gcp/observability_stack/control_plane/infra/outputs.tf new file mode 100644 index 00000000..a4d89cef --- /dev/null +++ b/tofu/gcp/observability_stack/control_plane/infra/outputs.tf @@ -0,0 +1,17 @@ +output "cluster_endpoint" { + value = module.gke.endpoint + sensitive = true +} + +output "cluster_ca_certificate" { + value = module.gke.ca_certificate + sensitive = true +} + +output "cluster_name" { + value = module.gke.name +} + +output "metrics_bucket" { + value = google_storage_bucket.metrics_bucket.name +} \ No newline at end of file diff --git a/tofu/gcp/observability_stack/control_plane/infra/providers.tf b/tofu/gcp/observability_stack/control_plane/infra/providers.tf new file mode 100644 index 00000000..7ece0f96 --- /dev/null +++ b/tofu/gcp/observability_stack/control_plane/infra/providers.tf @@ -0,0 +1,8 @@ +terraform { + required_providers { + google = { + source = "hashicorp/google" + version = "5.45.0" + } + } +} diff --git a/tofu/gcp/observability_stack/control_plane/infra/variables.tf b/tofu/gcp/observability_stack/control_plane/infra/variables.tf new file mode 100644 index 00000000..03e10700 --- /dev/null +++ b/tofu/gcp/observability_stack/control_plane/infra/variables.tf @@ -0,0 +1,19 @@ +variable "project_id" { + type = string +} + 
+variable "region" { + type = string +} + +variable "ip_range_subnet" { + type = string +} + +variable "ip_range_pods" { + type = string +} + +variable "ip_range_services" { + type = string +} diff --git a/tofu/gcp/observability_stack/control_plane/k8s/.terraform.lock.hcl b/tofu/gcp/observability_stack/control_plane/k8s/.terraform.lock.hcl new file mode 100644 index 00000000..cf413c91 --- /dev/null +++ b/tofu/gcp/observability_stack/control_plane/k8s/.terraform.lock.hcl @@ -0,0 +1,97 @@ +# This file is maintained automatically by "tofu init". +# Manual edits may be lost in future updates. + +provider "registry.opentofu.org/hashicorp/google" { + version = "5.45.0" + constraints = "5.45.0" + hashes = [ + "h1:i+v9EZbFRgLj1/Q+j71Z/V79ghSJnINQFq+uZEzDvTg=", + "zh:3abbc9944211da9027c69dde78b00430a7e2171ceb0c725c5b0d095f3f47f116", + "zh:42e4f1ccde4798b0ef769931f64fdc72ee6b6da33e560d697bd11ce2a94a973a", + "zh:56bb20c9ecfb4f8bd54b0673ec98798fbcebd9523503715ab7717160f3ed443d", + "zh:5a8894c3e568c851bb48ec8220850625fdb9c4ad645d847a3edcf66698f966e5", + "zh:66f964b2602210c98e9eb3621ad7dcfb0c0cc41fea5b85fdada9dfbf83d473ee", + "zh:9e9d23ef67272aee885797602dc7d58040d7ea9c7d3203ed9b07d94b4ff2d99e", + "zh:a28df1a130307ed6c5da5b0724bd31f0c12bd523839b7898f69436c02eadf367", + "zh:bf8a3de01d287c5a4277afd3108c95415a69677a93f26959a424e31c8bafc511", + "zh:c24d8805ecb92a605288204dd8a2ba209d590f872a6ca83e0bc204af0cc34cd4", + "zh:ec3face7efdef260965f9d7380c38ddd36c090adb775a09ab12ea7bd69c84897", + ] +} + +provider "registry.opentofu.org/hashicorp/helm" { + version = "2.17.0" + constraints = ">= 2.12.0" + hashes = [ + "h1:ojHGbVqPy4ShrUnNL7jif6AnEwgc8vC8sP7f37/VBC8=", + "zh:02690815e35131a42cb9851f63a3369c216af30ad093d05b39001d43da04b56b", + "zh:27a62f12b29926387f4d71aeeee9f7ffa0ccb81a1b6066ee895716ad050d1b7a", + "zh:2d0a5babfa73604b3fefc9dab9c87f91c77fce756c2e32b294e9f1290aed26c0", + "zh:3976400ceba6dda4636e1d297e3097e1831de5628afa534a166de98a70d1dcbe", + 
"zh:54440ef14f342b41d75c1aded7487bfcc3f76322b75894235b47b7e89ac4bfa4", + "zh:6512e2ab9f2fa31cbb90d9249647b5c5798f62eb1215ec44da2cdaa24e38ad25", + "zh:795f327ca0b8c5368af0ed03d5d4f6da7260692b4b3ca0bd004ed542e683464d", + "zh:ba659e1d94f224bc3f1fd34cbb9d2663e3a8e734108e5a58eb49eda84b140978", + "zh:c5c8575c4458835c2acbc3d1ed5570589b14baa2525d8fbd04295c097caf41eb", + "zh:e0877a5dac3de138e61eefa26b2f5a13305a17259779465899880f70e11314e0", + ] +} + +provider "registry.opentofu.org/hashicorp/kubernetes" { + version = "2.35.1" + constraints = ">= 2.27.0" + hashes = [ + "h1:B2Z0iwnPv88NPJuJSswPo87ZCqXmVMv/G/tbfwEt/EI=", + "zh:0a569918d9e81755bdacb2380e70ed304c442e957a029984cbcd9ec88e5d3635", + "zh:1d4d1241cf51d7d4a036c774add1384bb1ba9ca16146334d17c730e1b41ad3e0", + "zh:243219f415f5d8caf32a4e6b6bf596c11cf7db5501ccb4ae77cc0b084bb5d108", + "zh:2f3a33cba73918adc6f580c76b252881f22beb75277df8ca26a01eb5411348f9", + "zh:3b5247f69e72d1e94ac965fa570f448436cedb278f3f29836f6a345aa1bbd5b6", + "zh:4206bca7bf30708e235535af50529565b14f30262dc43142153a1774ee5086af", + "zh:490c80454b8808bb937498aea98e4076a74887446b05feb6e200015613b5e065", + "zh:5e39824289f7b29711681bce98fbb6c27ed221b071a8c78fd0de7f6c2dae4371", + "zh:a7bf7892217bdb0464664f62485d89d014874b0dfb564e99c364fc6dd20c6a3b", + "zh:e8251170bad1c3e2d9c22d0f4dae7239f1a364f05732f7dff5c8e4ec76a95c5a", + ] +} + +provider "registry.opentofu.org/hashicorp/tls" { + version = "4.0.6" + constraints = ">= 4.0.0" + hashes = [ + "h1:nbB85V/P7q9ZPajmun379YAaURjhrXb7QbzFR//YQvA=", + "zh:4b53b372767e5068d9bbfc89199201c1ae4283dde2f0c301974f8abb4215791f", + "zh:5b4c308bd074c6d0bd560220e6ee10a9859ca9a1f29a59367b0477a740ff265e", + "zh:674dd6bc85597677e160ee601d88b21c5a974759a658769812d2904bd94bc042", + "zh:6ccc1c448349b56677ba66112aec7e0a58eb827f66209ca5f4077b81cce240fb", + "zh:8aa6e13a5d722b74230937ea21e8b4994e53340d95b5691cf6cf3518b9f38e6e", + "zh:8b27e55e4c7fa887774860113b95c8f7f68804b002fa47f0eb8e3a485997287e", + 
"zh:a430b5a3e8753d8f61784de49e538ac4abed19fb665fccd8a10b55402fe9f076", + "zh:b07c978c335ae9fc12f9c221629610775e4ae36691ed4e7ba258d275dd58a243", + "zh:bbec8cb1efc84ee3026c793956a4a4cd0ece20b89d2d4f7d954c68e7f6d596d0", + "zh:e684e247424188dc3b500a543b1a8046d1c0ec08c2a90aedca0c4f6bb56bedbd", + ] +} + +provider "registry.opentofu.org/integrations/github" { + version = "6.5.0" + constraints = ">= 6.1.0" + hashes = [ + "h1:KN6W+TRczQXMQLAI5Cn/xpvJzq8r+/AQCZaxGURXQ3A=", + "zh:3088bfd30c51ebfcb7c8d829465ec7b3c19af684cf1aff1ea1111ad3c6421c11", + "zh:34f9054b0123f9fa7ab8ebc73591d2cf502f1cc75e7594bde42ce799fcac32b6", + "zh:406dc2e63d43a24ac4f1b004e5c60ada3347207ea750bbd51e6199eb7f044f9f", + "zh:43e7b6cb7e5062d9b7b7cf4d23f6ea99fb9605fb014fede62cda307051063c05", + "zh:6a0923ebcc09cb98c488c11582375d2145ba965d1e6f2f69c077be8e1224020b", + "zh:a2331f06b7ed57e83eadb784211067d675826f67cf0ed051c8ab20335d83de9a", + "zh:a3f82213c98319f20438bdb92145ce1b0407cd8b8eec9745c036db10deb3d3a2", + "zh:b4b8db8537d8e6fb3f05ed875726823e1dc6925c479db8749016e71568ebafc4", + "zh:cdcf76f6f6f5c638db540490ab35bb1aacfc27204f1197004da5e950024afc06", + "zh:de36cea60efe2b74cec958f88ec5c39d467ad9443c9c9e311424c3db229c4e78", + "zh:dfb8949edc6722da66c78a19ccb1b81ac855439a28ca3badfdac5c10bbf2190d", + "zh:e1a81734cc81f4f51dd11ca8a62b420f68e72d00835ed54f84d71bd56d19f37f", + "zh:ec0d51640c3e3cf933c73d0ed79ba8b395d1b94fed8117a6438dba872aa5561f", + "zh:ec59b7c420a2358e9750e9c6a8a5ef26ccbb8a2cae417e115e86d63520759ea5", + "zh:fbd1fee2c9df3aa19cf8851ce134dea6e45ea01cb85695c1726670c285797e25", + ] +} diff --git a/tofu/gcp/observability_stack/control_plane/k8s/backend.tf b/tofu/gcp/observability_stack/control_plane/k8s/backend.tf new file mode 100644 index 00000000..0a9b7e0b --- /dev/null +++ b/tofu/gcp/observability_stack/control_plane/k8s/backend.tf @@ -0,0 +1,5 @@ +terraform { + backend "gcs" { + prefix = "observability_stack_control_plane_k8s" + } +} diff --git a/tofu/gcp/observability_stack/control_plane/k8s/main.tf 
b/tofu/gcp/observability_stack/control_plane/k8s/main.tf new file mode 100644 index 00000000..9110d82a --- /dev/null +++ b/tofu/gcp/observability_stack/control_plane/k8s/main.tf @@ -0,0 +1,88 @@ +data "google_client_config" "default" {} + +data "google_project" "this" { + project_id = var.project_id +} + +provider "kubernetes" { + host = "https://${var.cluster_endpoint}" + token = data.google_client_config.default.access_token + cluster_ca_certificate = base64decode(var.cluster_ca_certificate) + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "gcloud" + args = [ + "container", + "clusters", + "get-credentials", + var.cluster_name, + "--region", + var.region, + "--project", + var.project_id, + ] + } +} + +provider "helm" { + kubernetes { + host = "https://${var.cluster_endpoint}" + token = data.google_client_config.default.access_token + cluster_ca_certificate = base64decode(var.cluster_ca_certificate) + exec { + api_version = "client.authentication.k8s.io/v1beta1" + command = "gcloud" + args = [ + "container", + "clusters", + "get-credentials", + var.cluster_name, + "--region", + var.region, + "--project", + var.project_id, + ] + } + } +} + +provider "github" { + owner = var.github_organization +} + +resource "github_repository" "this" { + name = var.github_repository + visibility = "public" + auto_init = true + vulnerability_alerts = true +} + +resource "tls_private_key" "flux" { + algorithm = "ECDSA" + ecdsa_curve = "P256" +} + +resource "github_repository_deploy_key" "this" { + title = "Flux" + repository = github_repository.this.name + key = tls_private_key.flux.public_key_openssh + read_only = "false" +} + +resource "helm_release" "argocd" { + name = "argocd" + + repository = "https://argoproj.github.io/argo-helm" + chart = "argo-cd" + namespace = "argocd" + create_namespace = true + version = "7.7.15" + + values = var.environment == "development" ? 
[file("./values/dev/argocd.yaml")] : [file("./values/prod/argocd.yaml")] +} + +resource "kubernetes_namespace" "observability" { + metadata { + name = "observability" + } +} diff --git a/tofu/gcp/observability_stack/control_plane/k8s/providers.tf b/tofu/gcp/observability_stack/control_plane/k8s/providers.tf new file mode 100644 index 00000000..ac1da532 --- /dev/null +++ b/tofu/gcp/observability_stack/control_plane/k8s/providers.tf @@ -0,0 +1,24 @@ +terraform { + required_providers { + google = { + source = "hashicorp/google" + version = "5.45.0" + } + github = { + source = "integrations/github" + version = ">= 6.1" + } + helm = { + source = "hashicorp/helm" + version = ">= 2.12" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = ">= 2.27" + } + tls = { + source = "hashicorp/tls" + version = ">= 4.0" + } + } +} diff --git a/tofu/gcp/observability_stack/control_plane/k8s/values/dev/argocd.yaml b/tofu/gcp/observability_stack/control_plane/k8s/values/dev/argocd.yaml new file mode 100644 index 00000000..8d1c6076 --- /dev/null +++ b/tofu/gcp/observability_stack/control_plane/k8s/values/dev/argocd.yaml @@ -0,0 +1,44 @@ +global: + image: + tag: "v2.13.3" + logging: + format: json +server: + extraArgs: + - --insecure + +cmp: + repositories: + falkordb-dbaas: + url: https://github.com/FalkorDB/falkordb-dbaas + +extraObjects: + - apiVersion: argoproj.io/v1alpha1 + kind: Application + metadata: + name: ctrl-plane-observability-stack + spec: + project: default + source: + repoURL: https://github.com/FalkorDB/falkordb-dbaas.git + targetRevision: dev + path: argocd/ctrl_plane/dev + destination: + server: https://kubernetes.default.svc + syncPolicy: + automated: {} + + - apiVersion: argoproj.io/v1alpha1 + kind: Application + metadata: + name: app-plane-observability-stack + spec: + project: default + source: + repoURL: https://github.com/FalkorDB/falkordb-dbaas.git + targetRevision: dev + path: argocd/app_plane/dev + destination: + server: 
https://kubernetes.default.svc + syncPolicy: + automated: {} diff --git a/tofu/gcp/observability_stack/control_plane/k8s/values/prod/argocd.yaml b/tofu/gcp/observability_stack/control_plane/k8s/values/prod/argocd.yaml new file mode 100644 index 00000000..8c87e407 --- /dev/null +++ b/tofu/gcp/observability_stack/control_plane/k8s/values/prod/argocd.yaml @@ -0,0 +1,44 @@ +global: + image: + tag: "v2.13.3" + logging: + format: json +server: + extraArgs: + - --insecure + +cmp: + repositories: + falkordb-dbaas: + url: https://github.com/FalkorDB/falkordb-dbaas + +extraObjects: + - apiVersion: argoproj.io/v1alpha1 + kind: Application + metadata: + name: ctrl-plane-observability-stack + spec: + project: default + source: + repoURL: https://github.com/FalkorDB/falkordb-dbaas.git + targetRevision: main + path: argocd/ctrl_plane/prod + destination: + server: https://kubernetes.default.svc + syncPolicy: + automated: {} + + - apiVersion: argoproj.io/v1alpha1 + kind: Application + metadata: + name: app-plane-observability-stack + spec: + project: default + source: + repoURL: https://github.com/FalkorDB/falkordb-dbaas.git + targetRevision: main + path: argocd/app_plane/prod + destination: + server: https://kubernetes.default.svc + syncPolicy: + automated: {} diff --git a/tofu/gcp/observability_stack/control_plane/k8s/variables.tf b/tofu/gcp/observability_stack/control_plane/k8s/variables.tf new file mode 100644 index 00000000..cc89ed53 --- /dev/null +++ b/tofu/gcp/observability_stack/control_plane/k8s/variables.tf @@ -0,0 +1,39 @@ +variable "project_id" { + type = string +} + +variable "region" { + type = string +} + +variable "cluster_endpoint" { + type = string +} + +variable "cluster_ca_certificate" { + type = string +} + +variable "cluster_name" { + type = string +} + +variable "github_organization" { + type = string + default = "FalkorDB" +} + +variable "github_repository" { + type = string + default = "falkordb-observability-cluster" +} + +variable "environment" { + type = 
string + default = "production" + + validation { + condition = var.environment == "production" || var.environment == "development" + error_message = "Environment must be either 'production' or 'development'" + } +}