From b20b2996bddbc61a48fef31b7a9703653ee288e1 Mon Sep 17 00:00:00 2001 From: Joshua Hoblitt Date: Mon, 28 Oct 2024 15:08:21 -0700 Subject: [PATCH 01/11] (TESTING) IT-5665/ceph-tuning branch --- fleet/lib/fleet-conf/overlays/cp/gitrepo-elqui.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fleet/lib/fleet-conf/overlays/cp/gitrepo-elqui.yaml b/fleet/lib/fleet-conf/overlays/cp/gitrepo-elqui.yaml index aa0343e57..15a123d17 100644 --- a/fleet/lib/fleet-conf/overlays/cp/gitrepo-elqui.yaml +++ b/fleet/lib/fleet-conf/overlays/cp/gitrepo-elqui.yaml @@ -6,7 +6,7 @@ metadata: namespace: fleet-default spec: repo: https://github.com/lsst-it/k8s-cookbook - branch: cp_production + branch: IT-5665/ceph-tuning keepResources: true paths: - fleet/s/cp/c/elqui/* From 908043d350f2b02aa63493e913107bfe9c3188d4 Mon Sep 17 00:00:00 2001 From: Joshua Hoblitt Date: Thu, 3 Oct 2024 13:33:39 -0700 Subject: [PATCH 02/11] (elqui) add rubinobs-raw-latiss bucket --- ...objectbucketclaim-rubinobs-raw-latiss.yaml | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 fleet/lib/rook-ceph-conf/charts/elqui/templates/objectbucketclaim-rubinobs-raw-latiss.yaml diff --git a/fleet/lib/rook-ceph-conf/charts/elqui/templates/objectbucketclaim-rubinobs-raw-latiss.yaml b/fleet/lib/rook-ceph-conf/charts/elqui/templates/objectbucketclaim-rubinobs-raw-latiss.yaml new file mode 100644 index 000000000..a7e5e99e7 --- /dev/null +++ b/fleet/lib/rook-ceph-conf/charts/elqui/templates/objectbucketclaim-rubinobs-raw-latiss.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: rubinobs-raw-latiss +provisioner: rook-ceph.ceph.rook.io/bucket +parameters: + objectStoreName: lfa + objectStoreNamespace: rook-ceph +reclaimPolicy: Retain +--- +apiVersion: objectbucket.io/v1alpha1 +kind: ObjectBucketClaim +metadata: + name: rubinobs-raw-latiss + namespace: rook-ceph +spec: + bucketName: rubinobs-raw-latiss + storageClassName: rubinobs-raw-latiss + additionalConfig: + maxSize: 1Ti # quota for BTS; XXX increase for cp From a0edf5b0c339180cb62c5aba5ce3328a7621cdaa Mon Sep 17 00:00:00 2001 From: Joshua Hoblitt Date: Fri, 4 Oct 2024 10:35:41 -0700 Subject: [PATCH 03/11] (elqui) add rubinobs-butler-latiss bucket --- ...ectbucketclaim-rubinobs-butler-latiss.yaml | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 fleet/lib/rook-ceph-conf/charts/elqui/templates/objectbucketclaim-rubinobs-butler-latiss.yaml diff --git a/fleet/lib/rook-ceph-conf/charts/elqui/templates/objectbucketclaim-rubinobs-butler-latiss.yaml b/fleet/lib/rook-ceph-conf/charts/elqui/templates/objectbucketclaim-rubinobs-butler-latiss.yaml new file mode 100644 index 000000000..6b3fd8288 --- /dev/null +++ b/fleet/lib/rook-ceph-conf/charts/elqui/templates/objectbucketclaim-rubinobs-butler-latiss.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: rubinobs-butler-latiss +provisioner: rook-ceph.ceph.rook.io/bucket +parameters: + objectStoreName: lfa + objectStoreNamespace: rook-ceph +reclaimPolicy: Retain +--- +apiVersion: objectbucket.io/v1alpha1 +kind: ObjectBucketClaim +metadata: + name: rubinobs-butler-latiss + namespace: rook-ceph +spec: + bucketName: rubinobs-butler-latiss + storageClassName: rubinobs-butler-latiss + additionalConfig: + maxSize: 1Ti # quota for BTS; XXX increase for cp From 37a1a6609be581e81123bd00ff038fc8b3f54070 Mon Sep 17 00:00:00 2001 From: Joshua Hoblitt Date: Fri, 4 Oct 2024 10:37:20 -0700 Subject: [PATCH 04/11] (elqui) add rubinobs-raw-lsstcam bucket --- ...bjectbucketclaim-rubinobs-raw-lsstcam.yaml | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 fleet/lib/rook-ceph-conf/charts/elqui/templates/objectbucketclaim-rubinobs-raw-lsstcam.yaml diff --git a/fleet/lib/rook-ceph-conf/charts/elqui/templates/objectbucketclaim-rubinobs-raw-lsstcam.yaml b/fleet/lib/rook-ceph-conf/charts/elqui/templates/objectbucketclaim-rubinobs-raw-lsstcam.yaml new file mode 100644 index 000000000..2c7f0b67f --- /dev/null +++ b/fleet/lib/rook-ceph-conf/charts/elqui/templates/objectbucketclaim-rubinobs-raw-lsstcam.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: rubinobs-raw-lsstcam +provisioner: rook-ceph.ceph.rook.io/bucket +parameters: + objectStoreName: lfa + objectStoreNamespace: rook-ceph +reclaimPolicy: Retain +--- +apiVersion: objectbucket.io/v1alpha1 +kind: ObjectBucketClaim +metadata: + name: rubinobs-raw-lsstcam + namespace: rook-ceph +spec: + bucketName: rubinobs-raw-lsstcam + storageClassName: rubinobs-raw-lsstcam + additionalConfig: + maxSize: 6Ti # quota for BTS; XXX increase for cp From d3cf1f983073a2e5ecd05e05dc9d721ae6ff79dd Mon Sep 17 00:00:00 2001 From: Joshua Hoblitt Date: Fri, 4 Oct 2024 10:37:44 -0700 Subject: [PATCH 05/11] (elqui) add rubinobs-butler-lsstcam bucket --- ...ctbucketclaim-rubinobs-butler-lsstcam.yaml | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 fleet/lib/rook-ceph-conf/charts/elqui/templates/objectbucketclaim-rubinobs-butler-lsstcam.yaml diff --git a/fleet/lib/rook-ceph-conf/charts/elqui/templates/objectbucketclaim-rubinobs-butler-lsstcam.yaml b/fleet/lib/rook-ceph-conf/charts/elqui/templates/objectbucketclaim-rubinobs-butler-lsstcam.yaml new file mode 100644 index 000000000..7518358ff --- /dev/null +++ b/fleet/lib/rook-ceph-conf/charts/elqui/templates/objectbucketclaim-rubinobs-butler-lsstcam.yaml @@ -0,0 +1,21 @@ +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: rubinobs-butler-lsstcam +provisioner: rook-ceph.ceph.rook.io/bucket +parameters: + objectStoreName: lfa + objectStoreNamespace: rook-ceph +reclaimPolicy: Retain +--- +apiVersion: objectbucket.io/v1alpha1 +kind: ObjectBucketClaim +metadata: + name: rubinobs-butler-lsstcam + namespace: rook-ceph +spec: + bucketName: rubinobs-butler-lsstcam + storageClassName: rubinobs-butler-lsstcam + additionalConfig: + maxSize: 34Ti # quota for BTS; XXX increase for cp From 5a463e389ba753500f10f4125a378b120e870d4f Mon Sep 17 00:00:00 2001 From: Joshua Hoblitt Date: Thu, 31 Oct 2024 13:44:20 -0700 Subject: [PATCH 06/11] (elqui) add obs-env nfs --- .../elqui/templates/cephnfs-obs-env.yaml | 75 +++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 fleet/lib/rook-ceph-conf/charts/elqui/templates/cephnfs-obs-env.yaml diff --git a/fleet/lib/rook-ceph-conf/charts/elqui/templates/cephnfs-obs-env.yaml b/fleet/lib/rook-ceph-conf/charts/elqui/templates/cephnfs-obs-env.yaml new file mode 100644 index 000000000..7c033fe44 --- /dev/null +++ b/fleet/lib/rook-ceph-conf/charts/elqui/templates/cephnfs-obs-env.yaml @@ -0,0 +1,75 @@ +--- +apiVersion: ceph.rook.io/v1 +kind: CephFilesystem +metadata: + name: obs-env + namespace: rook-ceph +spec: + metadataPool: + failureDomain: host + replicated: + size: 3 + quotas: + maxSize: 10Gi + dataPools: + - failureDomain: host + replicated: + size: 3 + quotas: + maxSize: 250Gi + metadataServer: + activeCount: 3 + activeStandby: true + resources: + limits: + cpu: "4" + memory: 4Gi + requests: + cpu: "1" + memory: 4Gi + preserveFilesystemOnDelete: false +--- +apiVersion: ceph.rook.io/v1 +kind: CephNFS +metadata: + name: obs-env + namespace: rook-ceph +spec: + rados: + pool: obs-env-data0 + server: + active: 1 + resources: + limits: + cpu: "3" + memory: 8Gi + requests: + cpu: "1" + memory: 8Gi +--- +apiVersion: v1 +kind: Service +metadata: + labels: + app: rook-ceph-nfs + ceph_daemon_type: nfs + ceph_nfs: obs-env + instance: a + rook_cluster: rook-ceph + name: rook-ceph-nfs-obs-env + namespace: rook-ceph + annotations: + metallb.universe.tf/loadBalancerIPs: 139.229.181.22 +spec: + ports: + - name: nfs + port: 2049 + protocol: TCP + targetPort: 2049 + selector: + app: rook-ceph-nfs + ceph_daemon_type: nfs + ceph_nfs: obs-env + instance: a + rook_cluster: rook-ceph + type: LoadBalancer From 668b05531164cc6d92a758423a7545c3d835093c Mon Sep 17 00:00:00 2001 From: Joshua Hoblitt Date: Fri, 1 Nov 2024 13:28:10 -0700 Subject: [PATCH 07/11] (TESTING) enable csi nfs --- fleet/lib/rook-ceph/values.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fleet/lib/rook-ceph/values.yaml b/fleet/lib/rook-ceph/values.yaml index 6f1186f91..3f0087b4a 100644 --- a/fleet/lib/rook-ceph/values.yaml +++ b/fleet/lib/rook-ceph/values.yaml @@ -26,6 +26,8 @@ csi: cephcsi: image: quay.io/cephcsi/cephcsi:v3.12.2 enableLiveness: true + nfs: + enabled: true provisionerTolerations: - <<: *storage_node_tol provisionerNodeAffinity: *storage_node_aff From 921afff0cac64a4c19b96ac322679a25b7fec59c Mon Sep 17 00:00:00 2001 From: Joshua Hoblitt Date: Fri, 1 Nov 2024 15:18:46 -0700 Subject: [PATCH 08/11] (elqui) disable ceph pool autoscaling for cephfilesystem --- .../charts/elqui/templates/cephnfs-obs-env.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fleet/lib/rook-ceph-conf/charts/elqui/templates/cephnfs-obs-env.yaml b/fleet/lib/rook-ceph-conf/charts/elqui/templates/cephnfs-obs-env.yaml index 7c033fe44..5f5721aa4 100644 --- a/fleet/lib/rook-ceph-conf/charts/elqui/templates/cephnfs-obs-env.yaml +++ b/fleet/lib/rook-ceph-conf/charts/elqui/templates/cephnfs-obs-env.yaml @@ -11,12 +11,22 @@ spec: size: 3 quotas: maxSize: 10Gi + parameters: + nodelete: "true" + nosizechange: "true" + pg_autoscale_mode: "off" + pg_num: "64" dataPools: - failureDomain: host replicated: size: 3 quotas: maxSize: 250Gi + parameters: + nodelete: "true" + nosizechange: "true" + pg_autoscale_mode: "off" + pg_num: "128" metadataServer: activeCount: 3 activeStandby: true From a627e8a6e73bcc3eb425fe71738c8fc037833da8 Mon Sep 17 00:00:00 2001 From: Joshua Hoblitt Date: Fri, 1 Nov 2024 15:33:59 -0700 Subject: [PATCH 09/11] (elqui) change cephfilesystem to single mds --- .../rook-ceph-conf/charts/elqui/templates/cephnfs-obs-env.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fleet/lib/rook-ceph-conf/charts/elqui/templates/cephnfs-obs-env.yaml b/fleet/lib/rook-ceph-conf/charts/elqui/templates/cephnfs-obs-env.yaml index 5f5721aa4..7f4672a34 100644 --- a/fleet/lib/rook-ceph-conf/charts/elqui/templates/cephnfs-obs-env.yaml +++ b/fleet/lib/rook-ceph-conf/charts/elqui/templates/cephnfs-obs-env.yaml @@ -28,7 +28,7 @@ spec: pg_autoscale_mode: "off" pg_num: "128" metadataServer: - activeCount: 3 + activeCount: 1 activeStandby: true resources: limits: From 49f82b98d0d58e345e67750b9f90834c95636071 Mon Sep 17 00:00:00 2001 From: Joshua Hoblitt Date: Fri, 1 Nov 2024 15:58:37 -0700 Subject: [PATCH 10/11] (elqui) permanently disable ceph orch --- fleet/lib/rook-ceph-conf/charts/elqui/templates/cm-cephcli.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/fleet/lib/rook-ceph-conf/charts/elqui/templates/cm-cephcli.yaml b/fleet/lib/rook-ceph-conf/charts/elqui/templates/cm-cephcli.yaml index 27c7e56b5..9d12f7bd6 100644 --- a/fleet/lib/rook-ceph-conf/charts/elqui/templates/cm-cephcli.yaml +++ b/fleet/lib/rook-ceph-conf/charts/elqui/templates/cm-cephcli.yaml @@ -10,8 +10,6 @@ data: ceph orch set backend "" ceph mgr module disable rook - ceph mgr module enable rook - ceph orch set backend rook ceph device monitoring on ceph config set global device_failure_prediction_mode local ceph telemetry on --license sharing-1-0 From 7a905744a7dcbb5a052011952b2ee07e7b804463 Mon Sep 17 00:00:00 2001 From: Joshua Hoblitt Date: Fri, 8 Nov 2024 12:18:28 -0700 Subject: [PATCH 11/11] (TESTING) (elqui) set rgw_multipart_min_part_size to 1MiB --- fleet/lib/rook-ceph-cluster/overlays/elqui/values.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/fleet/lib/rook-ceph-cluster/overlays/elqui/values.yaml b/fleet/lib/rook-ceph-cluster/overlays/elqui/values.yaml index 22c668d41..e8f06fcc1 100644 --- a/fleet/lib/rook-ceph-cluster/overlays/elqui/values.yaml +++ b/fleet/lib/rook-ceph-cluster/overlays/elqui/values.yaml @@ -7,6 +7,7 @@ cephClusterSpec: osd_pool_default_pg_autoscale_mode: warn rgw_override_bucket_index_max_shards: "401" rgw_enable_usage_log: "false" + rgw_multipart_min_part_size: "1048576" mgr: mgr/balancer/upmap_max_deviation: "1" osd: