## config.yaml

## Global settings affect all services
global:
  ## Whether to use the combined db - if this is true all the databases used
  ## by magda will be consolidated into a single pod. If it's false, they'll
  ## all run in separate pods unless some managed postgres solution (e.g.
  ## useCloudSql) is specified elsewhere.
  ## Running in separate pods scales better, but requires more resources.
  ## Will be overwritten to `true` if deployed through `terraform`
  # useCombinedDb: true

  ## Whether to use the Google Cloud SQL managed postgres solution. Only set
  ## this to true if you already have cloud sql set up and be sure to look at
  ## settings for the cloud-sql-proxy service below.
  ## Will be overwritten to `false` if deployed through `terraform`
  # useCloudSql: false

  ## The external URL of the installation, e.g. "https://search.data.gov.au".
  ## This is used in a number of places, including linking back to the site
  ## from emails, creating an absolute sitemap URL for robots.txt, etc.
  externalUrl: https://example.com

  ## How to handle rolling updates during a deployment. If set to 0, kubernetes
  ## will happily take down pods while it replaces them (useful for dev setups).
  ## If set to 1, kubernetes will make sure one pod is available to take requests
  ## during upgrades. 0 by default.
  ## NOTE(review): on a Kubernetes Deployment, `maxUnavailable` is the maximum
  ## number of pods that may be unavailable during an update, which reads as the
  ## opposite of the description above - confirm how the chart uses this value.
  # rollingUpdate:
  #   maxUnavailable: 0

  ## Settings section for openfaas
  # openfaas:
  #   # turn off auth over openfaas gateway for ease of debugging
  #   allowAdminOnly: false

  ## Image settings shared by all magda services.
  ## `image` is deliberately an explicit empty mapping: a bare `image:` with only
  ## commented-out children parses as null rather than as an empty mapping.
  ## When you uncomment any key below, remove the `{}` first.
  image: {}
    ## What tag of the magda images to get. Look at
    ## https://github.com/magda-io/magda/releases/latest for versions.
    ## By default this is the same version as the helm chart.
    # tag:
    ## The docker repository to get the images from - defaults to the official
    ## data61 docker hub repo
    # repository: data61
    ## The imagePullPolicy to use for images - generally unless you're actively
    ## trying to track development this should be "IfNotPresent", otherwise if
    ## you are it can be "Always". Defaults to "IfNotPresent"
    # pullPolicy: IfNotPresent
    ## Kubernetes secret to use if you're fetching images from a private repository
    # imagePullSecret: ""

  ## Specify the shared common settings for all connectors
  # connectors:

    ## Whether to run an initial job to crawl all these on install
    # includeInitialJobs: false

    ## Whether to run recurring jobs on the schedule.
    # includeCronJobs: true

    ## Set default image setting for all connectors.
    ## You may explicitly set all connectors to use images from docker hub while
    ## loading magda-core images from a private docker registry
    # image:
    #   repository: docker.io/data61
    #   tag: 0.0.57-0
    #   pullPolicy: IfNotPresent
    #   imagePullSecret: false

  ## Specify the shared common settings for all minions
  # minions:
    ## Set default image setting for all minions.
    ## You may explicitly set all minions to use images from docker hub while
    ## loading magda-core images from a private docker registry
    # image:
    #   repository: docker.io/data61
    #   tag: 0.0.57-0
    #   pullPolicy: IfNotPresent
    #   imagePullSecret: false

  ## Whether to expose the node ports of the pods in the cluster - useful for
  ## development, because it allows you to connect directly to pods without using
  ## kubectl proxy
  # exposeNodePorts: false

  ## Whether to enable kubernetes liveness probes - these continually hit pods in
  ## order to see whether they're still alive and restart them upon too many failures.
  ## This is essential for prod, but tends to create a lot of overhead in development
  # enableLivenessProbes: false

  ## The admin to use as the root admin user in the database. Defaults to the very first
  ## uuidv4
  # defaultAdminUserId: "00000000-0000-4000-8000-000000000000"

  ## Default log level to use across the project. Currently this only affects scala-based
  ## services
  # logLevel: INFO

  ## Whether to disable database authentication - useful in development, a very very bad
  ## idea in production
  # noDbAuth: false

  ## Whether to give the pods priority classes - these ensure that essential pods like
  ## ElasticSearch, Databases and the Gateway are scheduled ahead of less essential pods
  ## like minions, but tend to cause problems if you're running Magda over multiple
  ## namespaces. Make sure that the priorities tag is true if you turn this on.
  # enablePriorityClass: true

  ## Whether to enable multi-tenancy (pre-alpha)
  # enableMultiTenants: false

## Tags allow for services to be turned on and off on the fly... e.g. if you don't need
## to send emails, turn the correspondence-api off
tags:
  ## Shortcut that switches on every service except the minions.
  all: false

  ## NOTE: when combined-db is in use, both the combined-db tag and the tag of every
  ## database you rely on (e.g. authorization-db) have to be true.
  ## Each service is explained further down in this file.

  ## --- API services ---
  admin-api: true
  authorization-api: true
  content-api: true
  ## Left off because it cannot run without an SMTP server
  correspondence-api: false
  registry-api: true
  search-api: true
  ## storage-api is used to store dataset data files
  storage-api: true
  ## Tenant api (pre-alpha)
  tenant-api: false

  ## --- Databases ---
  authorization-db: true
  combined-db: true
  content-db: true
  registry-db: true
  session-db: true
  ## Tenant db (pre-alpha)
  tenant-db: false
  ## Will be overwritten to `false` if deployed through `terraform`
  # cloud-sql-proxy: true

  ## --- Other general services ---
  apidocs-server: true
  elasticsearch: true
  gateway: true
  indexer: true
  preview-map: true
  web-server: true
  ## Open Policy Agent is used to determine what's authorised.
  opa: true
  ## openfaas is used to manage & run serverless workload
  openfaas: true

  ## An ingress is required for HTTPS and for using the Google CDN with Google
  ## Kubernetes Engine. It is off by default, in which case the cluster is reached
  ## through a Kubernetes LoadBalancer service.
  ## Will be overwritten to `false` if deployed through `terraform`
  # ingress: false

  ## Priority classes - see the explanation under "enablePriorityClass" above
  priorities: true

  ## --- Minions ---
  ## These listen to changes in the registry and add extra metadata
  minion-broken-link: true
  minion-format: true
  minion-linked-data-rating: true
  minion-visualization: true

  ## --- Connectors ---
  ## External jobs that pull data in from other data portals. By default, all
  ## connectors are turned on; here they are all turned off for the initial deployment.
  connectors: false
  ## You may want to turn the DGA connector on after the initial deployment to
  ## generate some data
  # connector-dga: true

## This is the config for the DGA connector. Only take effect if the connector is turned on with the `connector-dga` tag above
## More config examples can be found from [here](https://github.com/magda-io/magda/blob/master/deploy/helm/magda/values.yaml#L26)
connector-dga:
  ## Run a one-off crawl job for this connector at install time?
  # includeInitialJobs: false

  ## Run the recurring jobs on the schedule specified per connector?
  # includeCronJobs: true

  ## Resource requirements can be customised individually, as per
  ## https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/
  # resources:
  #   requests:
  #     cpu: 0m
  #     memory: 0Mi
  #   limits:
  #     cpu: 0m
  #     memory: 0Mi

  ## Image details (except name) can also be set for all connectors at this level - this
  ## overrides what's in .global but is itself overridden by the individual connector config.
  # image:
  #   tag: vx.x.x
  #   pullPolicy: IfNotPresent
  #   imagePullSecret: ""
  config:
    ## Unique id identifying this connector and the records harvested from it
    id: dga
    ## Human-friendly name
    name: 'data.gov.au'
    ## Base URL of the place the data is sourced from
    sourceUrl: 'https://data.gov.au/'
    ## Size, in records, of each individual request while crawling from beginning to end
    pageSize: 1000
    ## Crontab schedule for how often the crawl should happen
    schedule: '0 * * * *'
    ## CKAN-specific config: harvest sources to ignore. * ignores everything that has been
    ## harvested from another portal
    ignoreHarvestSources:
      - '*'

    ## CKAN-only: restrict to datasets matching this organisation name - only one is possible
    ## due to limitations in the CKAN API; for multiple organisations make multiple connectors.
    # allowedOrganisationName: "doee"

    ## Extra data that will be added to the 'source' aspect
    # extras:
    #   a: 122
    #   b:
    #     - 1

    ## Extra data that will be merged into the aspects that are received from the connector
    # presetRecordAspects:
    #   - id: "dcat-dataset-strings"
    #     recordType: "dataset"
    #     data:
    #       creation:
    #         isOpenData: true

# All core components are configurable under magda-core instead
# connectors & minions are not core components anymore
magda-core:
  ## Settings for the ingress. You need one of these if you want to do SSL or use the Google CDN on GKE
  ## Otherwise using the built-in Kubernetes LoadBalancer works ok.
  # ingress:
    ## The hostname of the site
    # hostname: example.com
    ## The name of the external ip to use - used when using the GCE ingress type
    # ipName:
    ## The class of the ingress - if running on GKE, gce will automatically work, otherwise you
    ## can use "nginx" which requires extra setup
    # ingressClass: gce
    ## Whether to enable TLS - requires a TLS secret to be set:
    ## https://kubernetes.io/docs/tasks/tls/managing-tls-in-a-cluster/
    # enableTls: false
    ## Name of the tls secret, defaults to "magda-cert-tls"
    # tlsSecretName: "magda-cert-tls"
    ## Domains to attempt to obtain an SSL certificate for from let's encrypt, if using certmanager
    ## (requires extra setup)
    # domains:
    #   - example.com

  ## Below are individual settings for services. All services can be customised as follows:
  # servicename:
    ## The number of replicas to run
    # replicas: 1

    ## The service's image as per the global options under "global.image" like so:
    # image:
      # tag: vx.x.x
      # pullPolicy: IfNotPresent
      # imagePullSecret: ""

    ## They can also have their resource requirements customised individually as per
    ## https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/ like so:
    # resources:
      # requests:
        # cpu: 0m
        # memory: 0Mi
      # limits:
        # cpu: 0m
        # memory: 0Mi

  ## Furthermore, all the stateful sets (elasticsearch.data and anything ending in -db) can
  ## have the volume that they claim customised like so:
  # databasename-db:
    # data:
      ## The amount of storage to allocate - be generous because it's often not easy to resize
      # storage: 200Gi
      ## The storage class - needs to correspond to a storage class configured in kubernetes
      # storageClass: "ssd"

  ## Database pods (anything ending in -db) can also be configured with streaming backups via
  ## WAL-E (https://github.com/wal-e/wal-e)
  # databasename-db:
    # waleBackup:
      ## The method to use for backup - either set to WAL (write-ahead-log) or leave blank
      ## for no backups
      # method: WAL

      ## The time (in the server's timezone) to do a full backup of the system
      # executionTime: 03:00

      ## Whether to only read backups but not write them
      # readOnly: false

      ## The WAL-E recovery mode to use - generally leave this out, or set to "immediate" for
      ## the quickest possible recovery at the expense of some data loss
      # recoveryMode:

      ## Settings for pushing backups to Google Storage
      ## The prefix of the place to push the backups
      # gsPrefix: "gs://magda-postgres-backups-asia/dev"

      ## Credentials for authenticating
      # googleApplicationCreds:
      ## The secret to look for the credentials json file in
      #   secretName: storage-account-credentials
      ## The filename of the credentials json mounted in the secret
      #   fileName: TerriaJS-5e042b649f8a.json

  ## Settings for the apidocs server, which serves the documentation for the api from
  ## /api/v0/apidocs/index.html
  # apidocs-server:

  ## Settings for the authorization-api, which determines what each user's permissions are:
  # authorization-api:

  ## Settings for the authorization-db, which serves the authorization-api
  # authorization-db:

  ## A pod for proxying to Google Cloud SQL, the GCE managed database service.
  # cloud-sql-proxy:
    ## The cloud sql instance to connect to
    # instanceConnectionName: project:region:instanceid

  ## A pod that combines all the databases if global.useCombinedDb is set to true
  # combined-db:

  ## A pod that controls the content in magda's UI
  # content-api:
    ## Override the SCSS variables used by Magda's SCSS files. Look at the "_variables.scss"
    ## file in the magda code for details. Most customisation can be performed by setting
    ## magda-color-primary and magda-color-secondary
    # scssVars: '{"magda-color-primary": "#272727", "magda-color-secondary": "#726a5c"}'

  ## The database behind content-api
  # content-db:

  ## A service that renders emails and forwards them to an SMTP server
  correspondence-api:
    ## Fallback address that receives mail when there's no other choice - e.g. a
    ## question about a dataset for which there's no organisational email
    defaultRecipient: 'test@example.com'

    ## Hostname of the SMTP server to send through - the password should be in a secret
    smtpHostname: 'example.com'

    ## SMTP port to connect to - beware, default SMTP ports are blocked on most cloud
    ## providers
    ## NOTE(review): 80 looks like a placeholder value - confirm the real port before
    ## switching the correspondence-api tag on
    smtpPort: 80

  ## The elasticsearch setup - this can consist of up to three different kinds of pods -
  ## data nodes, master nodes and client nodes.
  # elasticsearch:
    ## Whether to use client and master nodes, or just use data nodes.
    # production: false

    ## All three kinds of nodes can be customised with these settings like so:
    # nodetype:
      ## The java heap size - this should be half the total memory request
      # heapSize: 3000m
      ## Elasticsearch plugins to install
      # pluginsInstall: "repository-gcs"

    ## Settings for the data nodes - this is a statefulset that actually holds the data,
    ## and is the only kind of node that's strictly necessary. You can have multiple
    ## replicas of these, they will discover each other and balance shards and replicas
    ## between them.
    # data:

    ## Settings for the client nodes - this accepts HTTP connections from search-api and
    ## indexer and forwards them to the data nodes.
    # client:

    ## Settings for the master nodes - these keep track of the data nodes as they go up
    ## and down and keep track of what the current state of the data should be.
    # master:

  ## The gateway accepts incoming connections and directs them to the appropriate api pod,
  ## or the web server pod
  gateway:
    service:
      ## Kubernetes service type for the gateway - "LoadBalancer" exposes an external
      ## IP, "NodePort" is the one to use when an ingress is in place
      ## Will be overwritten to `NodePort` if deployed through `terraform`
      type: LoadBalancer

      ## IP to try to use when the type is LoadBalancer. Leave unset to get an automatic one
      # loadBalancerIP:

    ## When Magda replaces a CKAN installation, CKAN paths without a direct magda
    ## equivalent (e.g. the API) are redirected to a ckan installation at this domain and path
    enableCkanRedirection: false
    # ckanRedirectionDomain: "example.com"
    # ckanRedirectionPath: "/data"

    ## Autoscaler configuration
    # autoscaler:
    #   ## whether to enable autoscaling
    #   enabled: false
    #   ## The minimum number of replicas to have
    #   minReplicas: 2
    #   ## The maximum number of replicas to have
    #   maxReplicas: 4

    ## Client ids to use for OAuth authentication
    auth:
      ## Client id for facebook
      # facebookClientId: ""
      ## Client id for google
      # googleClientId: ""
      ## Base URL of the ckan instance, if ckan should be offered as a way to log in
      ckanAuthenticationUrl: 'https://data.gov.au/data'
      ## Details for the Australian Department of Industry's VANguard service, if you want to use it:
      # vanguardWsFedIdpUrl: https://authentication.business.gov.au/fas/v2/wsfed12/authenticate
      # vanguardWsFedRealm: your-realm-here

    ## Content Security Policy to set on all responses - the contents are converted
    ## to JSON and passed into https://github.com/helmetjs/csp
    ## NOTE(review): the doubled single quotes inside the double-quoted values look like
    ## escaping for a single-quoted context further down the pipeline - confirm against
    ## the gateway chart template before changing them.
    csp:
      directives:
        scriptSrc:
          - "''self''"
          - "''unsafe-inline''"
          - browser-update.org
        objectSrc:
          - "''none''"

    ## helmet.js configuration - converted to JSON and passed to
    ## https://github.com/helmetjs/helmet
    # helmet:

    ## Cross-origin resource sharing configuration - converted to JSON and passed
    ## to https://www.npmjs.com/package/cors#configuring-cors
    # cors:

    ## Enables the auth endpoint: a list of authorization options at /auth/providers
    ## and a simple form at /auth for testing login and logout functionality
    ## without needing a web pod
    enableAuthEndpoint: true

    ## Redirect HTTP urls to HTTPS urls
    enableHttpsRedirection: false

    ## Turn on HTTP basic authentication - this uses secrets created by the create-secret script
    enableWebAccessControl: false

  ## Configures the service that puts datasets and organisations into elasticsearch
  # indexer:
    # elasticsearch:
      ## How many shards and replicas to use when creating a new index. Generally, more
      ## shards mean that the index will be split among more data nodes, and more replicas
      ## allows elasticsearch to copy those shards among different nodes.
      ## In general you want a number of shards equal to the lowest number of data nodes
      ## possible, and a number of replicas equal to the highest number.
      # shards: 1
      # replicas: 0

  ## Configures the TerriaJS server responsible for serving preview maps.
  # preview-map:
  #   image:
  #     repository: docker.io/data61
  #     tag: 0.0.57-0
  #     pullPolicy: IfNotPresent
  #     imagePullSecret: false

  ## Configures the registry, which holds information about datasets, distributions,
  ## organisations etc and distributes it.
  # registry-api:
    ## Log level to use
    # logLevel: INFO

    ## Stops checking for authorization on methods that change the state of the registry.
    ## Useful when running locally - DO NOT TURN ON IN PRODUCTION
    # skipAuthorization: false

    ## Configures the liveness probe that checks if the registry is still alive. This
    ## is configurable separately because currently the registry api has a tendency to
    ## slow down over time, and needs to be restarted when this occurs.
    # livenessProbe:
      # timeoutSeconds: 10

    ## The registry api is able to be split into two kinds of deployments: full, which does everything
    ## but can only ever run one pod at a time, and read-only, which can only do read-only requests but
    ## is stateless and can be horizontally scaled
    # deployments:
      # full:
        ## Resources for the full deployment, as per other resource declarations
        # resources:

      # readOnly:
        ## Whether to enable the read-only deployment - if not all traffic will go to the full deployment,
        ## otherwise traffic through /api/v0/registry will come here and only traffic to /api/v0/registry-auth
        ## will go to the full deployment
        # enable: true
        
        ## Number of replicas
        # replicas: 2

        ## Resources for the read-only deployment, as per other resource declarations
        # resources:

  ## Configures the registry database, if a separate one is being used.
  # registry-db:

  ## Configures the API that connects to ElasticSearch to perform searches.
  # search-api:

  ## Configures the database used to store session information by passport js, if
  ## one is being used.
  # session-db:

  ## The server used for serving the web application to users.
  web-server:
    ## Autoscaler settings - the min and max number of pods to use
    # autoscaler:
    #   minReplicas: 2
    #   maxReplicas: 4

    ## Set this to true to remove the ability to log in from the UI - use it when
    ## you don't want any account features.
    # disableAuthenticationFeatures: true

    ## URL of an existing system that is being replaced - when set, a banner is shown
    ## that lets people go back if they don't like the new system.
    # fallbackUrl: "https://data.gov.au"

    ## Google Analytics Ids to collect analytics data from the frontend
    # gapiIds:
    #   - "ABC123"

    ## The search score after which to show a "suggest a dataset" form
    # datasetSearchSuggestionScoreThreshold: 65

    ## The id of the organization to display by default in the Add Dataset flow.
    # defaultOrganizationId: "abc"

    featureFlags:
      cataloguing: true
      previewAddDataset: true

    vocabularyApiEndpoints:
      - 'https://vocabs.ands.org.au/repository/api/lda/abares/australian-land-use-and-management-classification/version-8/concept.json'
      - 'https://vocabs.ands.org.au/repository/api/lda/neii/australian-landscape-water-balance/version-1/concept.json'
      - 'https://vocabs.ands.org.au/repository/api/lda/ands-nc/controlled-vocabulary-for-resource-type-genres/version-1-1/concept.json'
  
  # Settings for the admin API, which allows for changes to be made to the magda
  # installation at runtime... e.g. creating new connector jobs
  # admin-api:

## Settings for the broken link minion, which crawls URLs of distributions in order
## to determine whether they're still available.
# minion-broken-link:

## Settings for the format minion, which tries to determine the true format of a
## distribution based on a number of criteria.
# minion-format:

## Settings for the linked data rating minion, which determines a rating for a dataset
## based off the availability, license and format of a dataset
# minion-linked-data-rating:

## Settings for the visualization minion, which looks at tabular data in order to
## determine which columns can be charted.
# minion-visualization: