From ae97496b102e055babc35cf8d9e4fd232216c626 Mon Sep 17 00:00:00 2001 From: Seth Call Date: Thu, 11 Dec 2025 22:57:03 -0600 Subject: [PATCH] Adding loki support --- k8s/DEBUGGING.md | 102 ++ k8s/activate-stg | 1 + k8s/loki/default-values.yaml | 1625 ++++++++++++++++++ k8s/loki/templates/podlogs.yaml | 46 + k8s/loki/values.yaml | 21 + k8s/monitoring/templates/loki-dashboard.yaml | 1140 ++++++++++-- k8s/monitoring/values-production.yaml | 1 + k8s/monitoring/values-staging.yaml | 1 + 8 files changed, 2806 insertions(+), 131 deletions(-) create mode 100644 k8s/DEBUGGING.md create mode 100644 k8s/activate-stg create mode 100644 k8s/loki/default-values.yaml create mode 100644 k8s/loki/templates/podlogs.yaml diff --git a/k8s/DEBUGGING.md b/k8s/DEBUGGING.md new file mode 100644 index 0000000..2ac2b56 --- /dev/null +++ b/k8s/DEBUGGING.md @@ -0,0 +1,102 @@ +# Debugging Guide + +This guide provides common commands and procedures for debugging the Kubernetes infrastructure, with a focus on Monitoring, Loki, and Grafana. + +## Prerequisites + +Ensure you have your environment activated: +```bash +source ~/bin/activate-stg # or activate-prod +``` + +## Monitoring & Logging (Loki, Grafana, Prometheus) + +### 1. Quick Status Check +Check if all pods are running in the relevant namespaces: +```bash +kubectl get pods -n monitoring +kubectl get pods -n loki +``` + +### 2. Verifying Loki Log Ingestion + +**Check if Loki is receiving logs:** +Paradoxically, "duplicate entry" errors are a *good* sign that logs are reaching Loki (it just means retries are happening). +```bash +kubectl -n loki logs -l app.kubernetes.io/name=loki --tail=50 +``` + +**Check if the Grafana Agent is sending logs:** +The Agent runs as a DaemonSet. Check the logs of one of the agent pods: +```bash +kubectl -n loki logs -l app.kubernetes.io/name=grafana-agent -c grafana-agent --tail=50 +``` +Look for errors like `401 Unauthorized` or `403 Forbidden`. + +**Inspect Agent Configuration:** +Verify the Agent is actually configured to scrape what you expect: +```bash +kubectl -n loki get secret loki-logs-config -o jsonpath='{.data.agent\.yml}' | base64 -d +``` + +### 3. Debugging Grafana + +**Check Grafana Logs:** +Look for datasource provisioning errors or plugin issues: +```bash +kubectl -n monitoring logs deployment/monitoring-grafana --tail=100 +``` + +**Verify Datasource Provisioning:** +Grafana uses a sidecar to watch secrets and provision datasources. Check its logs: +```bash +kubectl -n monitoring logs deployment/monitoring-grafana -c grafana-sc-datasources --tail=100 +``` + +**Inspect Provisioned Datasource File:** +Check the actual file generated inside the Grafana pod to ensure `uid`, `url`, etc., are correct: +```bash +kubectl -n monitoring exec deployment/monitoring-grafana -c grafana -- cat /etc/grafana/provisioning/datasources/datasource.yaml +``` + +**Restart Grafana:** +If you suspect configuration hasn't been picked up: +```bash +kubectl -n monitoring rollout restart deployment/monitoring-grafana +``` + +### 4. Connectivity Verification (The "Nuclear" Option) + +If the dashboard is empty but you think everything is working, run a query **directly from the Grafana pod** to Loki. This bypasses the UI and confirms network connectivity and data availability. + +**Test Connectivity:** +```bash +kubectl -n monitoring exec deployment/monitoring-grafana -- curl -s "http://loki.loki.svc:3100/loki/api/v1/labels" +``` + +**Query Actual Logs:** +This asks Loki for the last 10 log lines for *any* job. If this returns JSON data, Loki is working perfectly. +```bash +kubectl -n monitoring exec deployment/monitoring-grafana -- curl -G -s "http://loki.loki.svc:3100/loki/api/v1/query_range" --data-urlencode 'query={job=~".+"}' --data-urlencode 'limit=10' +``` + +## Common Issues & Fixes + +### "Datasource not found" in Dashboard +* **Cause**: The dashboard expects a specific Datasource UID (e.g., `uid: loki`), but Grafana generated a random one. +* **Fix**: Ensure `values.yaml` explicitly sets the UID: + ```yaml + additionalDataSources: + - name: Loki + uid: loki # <--- Critical + ``` + +### Logs not showing in Loki +* **Cause**: The `PodLogs` resource might be missing, or the Agent doesn't have permissions. +* **Check**: + 1. Ensure `ClusterRole` has `pods/log` permission. + 2. Ensure `LogsInstance` selector matches your `PodLogs` definition (or use `{}` to match all). + +### ArgoCD Out of Sync +* **Cause**: Sometimes ArgoCD doesn't auto-prune resources or fails to update Secrets. +* **Fix**: Sync manually with "Prune" enabled, or delete the conflicting resource manually if safe. diff --git a/k8s/activate-stg b/k8s/activate-stg new file mode 100644 index 0000000..e74b658 --- /dev/null +++ b/k8s/activate-stg @@ -0,0 +1 @@ +export KUBECONFIG=~/workspace/jamkazam/k8s/stg-video-cluster-kubeconfig.yaml diff --git a/k8s/loki/default-values.yaml b/k8s/loki/default-values.yaml new file mode 100644 index 0000000..7de0913 --- /dev/null +++ b/k8s/loki/default-values.yaml @@ -0,0 +1,1625 @@ +global: + image: + # -- Overrides the Docker registry globally for all images + registry: null + # -- Overrides the priorityClassName for all pods + priorityClassName: null + # -- configures cluster domain ("cluster.local" by default) + clusterDomain: "cluster.local" + # -- configures DNS service name + dnsService: "kube-dns" + # -- configures DNS service namespace + dnsNamespace: "kube-system" +# -- Overrides the chart's name +nameOverride: null +# -- Overrides the chart's computed fullname +fullnameOverride: null +# -- Overrides the chart's cluster label +clusterLabelOverride: null +# -- Image pull secrets for Docker images +imagePullSecrets: [] +kubectlImage: + # -- The Docker registry + registry: docker.io + # -- Docker image repository + repository: bitnami/kubectl + # -- Overrides the image tag whose default is the chart's appVersion + tag: null + # -- Overrides the image tag with an image digest + digest: null + # -- Docker image pull policy + pullPolicy: IfNotPresent +loki: + # Configures the readiness probe for all of the Loki pods + readinessProbe: + httpGet: + path: /ready + port: http-metrics + initialDelaySeconds: 30 + timeoutSeconds: 1 + image: + # -- The Docker registry + registry: docker.io + # -- Docker image repository + repository: grafana/loki + # -- Overrides the image tag whose default is the chart's appVersion + # TODO: needed for 3rd target backend functionality + # revert to null or latest once this behavior is relased + tag: null + # -- Overrides the image tag with an image digest + digest: null + # -- Docker image pull policy + pullPolicy: IfNotPresent + # -- Common annotations for all deployments/StatefulSets + annotations: {} + # -- Common annotations for all pods + podAnnotations: {} + # -- Common labels for all pods + podLabels: {} + # -- Common annotations for all services + serviceAnnotations: {} + # -- Common labels for all services + serviceLabels: {} + # -- The number of old ReplicaSets to retain to allow rollback + revisionHistoryLimit: 10 + # -- The SecurityContext for Loki pods + podSecurityContext: + fsGroup: 10001 + runAsGroup: 10001 + runAsNonRoot: true + runAsUser: 10001 + # -- The SecurityContext for Loki containers + containerSecurityContext: + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + allowPrivilegeEscalation: false + # -- Should enableServiceLinks be enabled. Default to enable + enableServiceLinks: true + # -- Specify an existing secret containing loki configuration. If non-empty, overrides `loki.config` + existingSecretForConfig: "" + # -- Defines what kind of object stores the configuration, a ConfigMap or a Secret. + # In order to move sensitive information (such as credentials) from the ConfigMap/Secret to a more secure location (e.g. vault), it is possible to use [environment variables in the configuration](https://grafana.com/docs/loki/latest/configuration/#use-environment-variables-in-the-configuration). + # Such environment variables can be then stored in a separate Secret and injected via the global.extraEnvFrom value. For details about environment injection from a Secret please see [Secrets](https://kubernetes.io/docs/concepts/configuration/secret/#use-case-as-container-environment-variables). + configStorageType: ConfigMap + # -- Name of the Secret or ConfigMap that contains the configuration (used for naming even if config is internal). + externalConfigSecretName: '{{ include "loki.name" . }}' + # -- Config file contents for Loki + # @default -- See values.yaml + config: | + {{- if .Values.enterprise.enabled}} + {{- tpl .Values.enterprise.config . }} + {{- else }} + auth_enabled: {{ .Values.loki.auth_enabled }} + {{- end }} + + {{- with .Values.loki.server }} + server: + {{- toYaml . | nindent 2}} + {{- end}} + + memberlist: + {{- if .Values.loki.memberlistConfig }} + {{- toYaml .Values.loki.memberlistConfig | nindent 2 }} + {{- else }} + {{- if .Values.loki.extraMemberlistConfig}} + {{- toYaml .Values.loki.extraMemberlistConfig | nindent 2}} + {{- end }} + join_members: + - {{ include "loki.memberlist" . }} + {{- with .Values.migrate.fromDistributed }} + {{- if .enabled }} + - {{ .memberlistService }} + {{- end }} + {{- end }} + {{- end }} + + {{- with .Values.loki.ingester }} + ingester: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- if .Values.loki.commonConfig}} + common: + {{- toYaml .Values.loki.commonConfig | nindent 2}} + storage: + {{- include "loki.commonStorageConfig" . | nindent 4}} + {{- end}} + + {{- with .Values.loki.limits_config }} + limits_config: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + runtime_config: + file: /etc/loki/runtime-config/runtime-config.yaml + + {{- with .Values.loki.memcached.chunk_cache }} + {{- if and .enabled (or .host .addresses) }} + chunk_store_config: + chunk_cache_config: + memcached: + batch_size: {{ .batch_size }} + parallelism: {{ .parallelism }} + memcached_client: + {{- if .host }} + host: {{ .host }} + {{- end }} + {{- if .addresses }} + addresses: {{ .addresses }} + {{- end }} + service: {{ .service }} + {{- end }} + {{- end }} + + {{- if .Values.loki.schemaConfig }} + schema_config: + {{- toYaml .Values.loki.schemaConfig | nindent 2}} + {{- else }} + schema_config: + configs: + - from: 2022-01-11 + store: boltdb-shipper + object_store: {{ .Values.loki.storage.type }} + schema: v12 + index: + prefix: loki_index_ + period: 24h + {{- end }} + + {{ include "loki.rulerConfig" . }} + + {{- if or .Values.tableManager.retention_deletes_enabled .Values.tableManager.retention_period }} + table_manager: + retention_deletes_enabled: {{ .Values.tableManager.retention_deletes_enabled }} + retention_period: {{ .Values.tableManager.retention_period }} + {{- end }} + + {{- with .Values.loki.memcached.results_cache }} + query_range: + align_queries_with_step: true + {{- if and .enabled (or .host .addresses) }} + cache_results: {{ .enabled }} + results_cache: + cache: + default_validity: {{ .default_validity }} + memcached_client: + {{- if .host }} + host: {{ .host }} + {{- end }} + {{- if .addresses }} + addresses: {{ .addresses }} + {{- end }} + service: {{ .service }} + timeout: {{ .timeout }} + {{- end }} + {{- end }} + + {{- with .Values.loki.storage_config }} + storage_config: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- with .Values.loki.query_scheduler }} + query_scheduler: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- with .Values.loki.compactor }} + compactor: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- with .Values.loki.analytics }} + analytics: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- with .Values.loki.querier }} + querier: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- with .Values.loki.index_gateway }} + index_gateway: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- with .Values.loki.frontend }} + frontend: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- with .Values.loki.frontend_worker }} + frontend_worker: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + {{- with .Values.loki.distributor }} + distributor: + {{- tpl (. | toYaml) $ | nindent 4 }} + {{- end }} + + tracing: + enabled: {{ .Values.loki.tracing.enabled }} + # Should authentication be enabled + auth_enabled: true + # -- memberlist configuration (overrides embedded default) + memberlistConfig: {} + # -- Extra memberlist configuration + extraMemberlistConfig: {} + # -- Tenants list to be created on nginx htpasswd file, with name and password keys + tenants: [] + # -- Check https://grafana.com/docs/loki/latest/configuration/#server for more info on the server configuration. + server: + http_listen_port: 3100 + grpc_listen_port: 9095 + # -- Limits config + limits_config: + reject_old_samples: true + reject_old_samples_max_age: 168h + max_cache_freshness_per_query: 10m + split_queries_by_interval: 15m + # -- Provides a reloadable runtime configuration file for some specific configuration + runtimeConfig: {} + # -- Check https://grafana.com/docs/loki/latest/configuration/#common_config for more info on how to provide a common configuration + commonConfig: + path_prefix: /var/loki + replication_factor: 3 + compactor_address: '{{ include "loki.compactorAddress" . }}' + # -- Storage config. Providing this will automatically populate all necessary storage configs in the templated config. + storage: + bucketNames: + chunks: chunks + ruler: ruler + admin: admin + type: s3 + s3: + s3: null + endpoint: null + region: null + secretAccessKey: null + accessKeyId: null + signatureVersion: null + s3ForcePathStyle: false + insecure: false + http_config: {} + # -- Check https://grafana.com/docs/loki/latest/configure/#s3_storage_config for more info on how to provide a backoff_config + backoff_config: {} + gcs: + chunkBufferSize: 0 + requestTimeout: "0s" + enableHttp2: true + azure: + accountName: null + accountKey: null + connectionString: null + useManagedIdentity: false + useFederatedToken: false + userAssignedId: null + requestTimeout: null + endpointSuffix: null + swift: + auth_version: null + auth_url: null + internal: null + username: null + user_domain_name: null + user_domain_id: null + user_id: null + password: null + domain_id: null + domain_name: null + project_id: null + project_name: null + project_domain_id: null + project_domain_name: null + region_name: null + container_name: null + max_retries: null + connect_timeout: null + request_timeout: null + filesystem: + chunks_directory: /var/loki/chunks + rules_directory: /var/loki/rules + # -- Configure memcached as an external cache for chunk and results cache. Disabled by default + # must enable and specify a host for each cache you would like to use. + memcached: + chunk_cache: + enabled: false + host: "" + service: "memcached-client" + batch_size: 256 + parallelism: 10 + results_cache: + enabled: false + host: "" + service: "memcached-client" + timeout: "500ms" + default_validity: "12h" + # -- Check https://grafana.com/docs/loki/latest/configuration/#schema_config for more info on how to configure schemas + schemaConfig: {} + # -- Check https://grafana.com/docs/loki/latest/configuration/#ruler for more info on configuring ruler + rulerConfig: {} + # -- Structured loki configuration, takes precedence over `loki.config`, `loki.schemaConfig`, `loki.storageConfig` + structuredConfig: {} + # -- Additional query scheduler config + query_scheduler: {} + # -- Additional storage config + storage_config: + hedging: + at: "250ms" + max_per_second: 20 + up_to: 3 + # -- Optional compactor configuration + compactor: {} + # -- Optional analytics configuration + analytics: {} + # -- Optional querier configuration + querier: {} + # -- Optional ingester configuration + ingester: {} + # -- Optional index gateway configuration + index_gateway: + mode: ring + frontend: + scheduler_address: '{{ include "loki.querySchedulerAddress" . }}' + frontend_worker: + scheduler_address: '{{ include "loki.querySchedulerAddress" . }}' + # -- Optional distributor configuration + distributor: {} + # -- Enable tracing + tracing: + enabled: false +enterprise: + # Enable enterprise features, license must be provided + enabled: false + # Default verion of GEL to deploy + version: v1.8.4 + # -- Optional name of the GEL cluster, otherwise will use .Release.Name + # The cluster name must match what is in your GEL license + cluster_name: null + # -- Grafana Enterprise Logs license + # In order to use Grafana Enterprise Logs features, you will need to provide + # the contents of your Grafana Enterprise Logs license, either by providing the + # contents of the license.jwt, or the name Kubernetes Secret that contains your + # license.jwt. + # To set the license contents, use the flag `--set-file 'enterprise.license.contents=./license.jwt'` + license: + contents: "NOTAVALIDLICENSE" + # -- Set to true when providing an external license + useExternalLicense: false + # -- Name of external license secret to use + externalLicenseName: null + # -- Name of the external config secret to use + externalConfigName: "" + # -- If enabled, the correct admin_client storage will be configured. If disabled while running enterprise, + # make sure auth is set to `type: trust`, or that `auth_enabled` is set to `false`. + adminApi: + enabled: true + # enterprise specific sections of the config.yaml file + config: | + {{- if .Values.enterprise.adminApi.enabled }} + {{- if or .Values.minio.enabled (eq .Values.loki.storage.type "s3") (eq .Values.loki.storage.type "gcs") (eq .Values.loki.storage.type "azure") }} + admin_client: + storage: + s3: + bucket_name: {{ .Values.loki.storage.bucketNames.admin }} + {{- end }} + {{- end }} + auth: + type: {{ .Values.enterprise.adminApi.enabled | ternary "enterprise" "trust" }} + auth_enabled: {{ .Values.loki.auth_enabled }} + cluster_name: {{ include "loki.clusterName" . }} + license: + path: /etc/loki/license/license.jwt + image: + # -- The Docker registry + registry: docker.io + # -- Docker image repository + repository: grafana/enterprise-logs + # -- Docker image tag + tag: null + # -- Overrides the image tag with an image digest + digest: null + # -- Docker image pull policy + pullPolicy: IfNotPresent + adminToken: + # -- Alternative name for admin token secret, needed by tokengen and provisioner jobs + secret: null + # -- Additional namespace to also create the token in. Useful if your Grafana instance + # is in a different namespace + additionalNamespaces: [] + # -- Alternative name of the secret to store token for the canary + canarySecret: null + # -- Configuration for `tokengen` target + tokengen: + # -- Whether the job should be part of the deployment + enabled: true + # -- Comma-separated list of Loki modules to load for tokengen + targetModule: "tokengen" + # -- Additional CLI arguments for the `tokengen` target + extraArgs: [] + # -- Additional Kubernetes environment + env: [] + # -- Additional labels for the `tokengen` Job + labels: {} + # -- Additional annotations for the `tokengen` Job + annotations: {} + # -- Tolerations for tokengen Job + tolerations: [] + # -- Additional volumes for Pods + extraVolumes: [] + # -- Additional volume mounts for Pods + extraVolumeMounts: [] + # -- Run containers as user `enterprise-logs(uid=10001)` + securityContext: + runAsNonRoot: true + runAsGroup: 10001 + runAsUser: 10001 + fsGroup: 10001 + # -- Environment variables from secrets or configmaps to add to the tokengen pods + extraEnvFrom: [] + # -- The name of the PriorityClass for tokengen Pods + priorityClassName: "" + # -- Configuration for `provisioner` target + provisioner: + # -- Whether the job should be part of the deployment + enabled: true + # -- Name of the secret to store provisioned tokens in + provisionedSecretPrefix: null + # -- Additional tenants to be created. Each tenant will get a read and write policy + # and associated token. Tenant must have a name and a namespace for the secret containting + # the token to be created in. For example + # additionalTenants: + # - name: loki + # secretNamespace: grafana + additionalTenants: [] + # -- Additional Kubernetes environment + env: [] + # -- Additional labels for the `provisioner` Job + labels: {} + # -- Additional annotations for the `provisioner` Job + annotations: {} + # -- The name of the PriorityClass for provisioner Job + priorityClassName: null + # -- Run containers as user `enterprise-logs(uid=10001)` + securityContext: + runAsNonRoot: true + runAsGroup: 10001 + runAsUser: 10001 + fsGroup: 10001 + # -- Provisioner image to Utilize + image: + # -- The Docker registry + registry: docker.io + # -- Docker image repository + repository: grafana/enterprise-logs-provisioner + # -- Overrides the image tag whose default is the chart's appVersion + tag: null + # -- Overrides the image tag with an image digest + digest: null + # -- Docker image pull policy + pullPolicy: IfNotPresent + # -- Volume mounts to add to the provisioner pods + extraVolumeMounts: [] +# -- Options that may be necessary when performing a migration from another helm chart +migrate: + # -- When migrating from a distributed chart like loki-distributed or enterprise-logs + fromDistributed: + # -- Set to true if migrating from a distributed helm chart + enabled: false + # -- If migrating from a distributed service, provide the distributed deployment's + # memberlist service DNS so the new deployment can join its ring. + memberlistService: "" +serviceAccount: + # -- Specifies whether a ServiceAccount should be created + create: true + # -- The name of the ServiceAccount to use. + # If not set and create is true, a name is generated using the fullname template + name: null + # -- Image pull secrets for the service account + imagePullSecrets: [] + # -- Annotations for the service account + annotations: {} + # -- Labels for the service account + labels: {} + # -- Set this toggle to false to opt out of automounting API credentials for the service account + automountServiceAccountToken: true +# RBAC configuration +rbac: + # -- If pspEnabled true, a PodSecurityPolicy is created for K8s that use psp. + pspEnabled: false + # -- For OpenShift set pspEnabled to 'false' and sccEnabled to 'true' to use the SecurityContextConstraints. + sccEnabled: false + # -- Specify PSP annotations + # Ref: https://kubernetes.io/docs/reference/access-authn-authz/psp-to-pod-security-standards/#podsecuritypolicy-annotations + pspAnnotations: {} + # seccomp.security.alpha.kubernetes.io/allowedProfileNames: '*' + # seccomp.security.alpha.kubernetes.io/defaultProfileName: 'docker/default' + # apparmor.security.beta.kubernetes.io/defaultProfileName: 'runtime/default' + # -- Whether to install RBAC in the namespace only or cluster-wide. Useful if you want to watch ConfigMap globally. + namespaced: false +# -- Section for configuring optional Helm test +test: + enabled: true + # -- Address of the prometheus server to query for the test + prometheusAddress: "http://prometheus:9090" + # -- Number of times to retry the test before failing + timeout: 1m + # -- Additional labels for the test pods + labels: {} + # -- Additional annotations for test pods + annotations: {} + # -- Image to use for loki canary + image: + # -- The Docker registry + registry: docker.io + # -- Docker image repository + repository: grafana/loki-helm-test + # -- Overrides the image tag whose default is the chart's appVersion + tag: null + # -- Overrides the image tag with an image digest + digest: null + # -- Docker image pull policy + pullPolicy: IfNotPresent +# Monitoring section determines which monitoring features to enable +monitoring: + # Dashboards for monitoring Loki + dashboards: + # -- If enabled, create configmap with dashboards for monitoring Loki + enabled: true + # -- Alternative namespace to create dashboards ConfigMap in + namespace: null + # -- Additional annotations for the dashboards ConfigMap + annotations: {} + # -- Labels for the dashboards ConfigMap + labels: + grafana_dashboard: "1" + # Recording rules for monitoring Loki, required for some dashboards + rules: + # -- If enabled, create PrometheusRule resource with Loki recording rules + enabled: true + # -- Include alerting rules + alerting: true + # -- Specify which individual alerts should be disabled + # -- Instead of turning off each alert one by one, set the .monitoring.rules.alerting value to false instead. + # -- If you disable all the alerts and keep .monitoring.rules.alerting set to true, the chart will fail to render. + disabled: {} + # LokiRequestErrors: true + # LokiRequestPanics: true + # -- Alternative namespace to create PrometheusRule resources in + namespace: null + # -- Additional annotations for the rules PrometheusRule resource + annotations: {} + # -- Additional labels for the rules PrometheusRule resource + labels: {} + # -- Additional labels for PrometheusRule alerts + additionalRuleLabels: {} + # -- Additional groups to add to the rules file + additionalGroups: [] + # - name: additional-loki-rules + # rules: + # - record: job:loki_request_duration_seconds_bucket:sum_rate + # expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job) + # - record: job_route:loki_request_duration_seconds_bucket:sum_rate + # expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, job, route) + # - record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate + # expr: sum(rate(container_cpu_usage_seconds_total[1m])) by (node, namespace, pod, container) + # ServiceMonitor configuration + serviceMonitor: + # -- If enabled, ServiceMonitor resources for Prometheus Operator are created + enabled: true + # -- Namespace selector for ServiceMonitor resources + namespaceSelector: {} + # -- ServiceMonitor annotations + annotations: {} + # -- Additional ServiceMonitor labels + labels: {} + # -- ServiceMonitor scrape interval + # Default is 15s because included recording rules use a 1m rate, and scrape interval needs to be at + # least 1/4 rate interval. + interval: 15s + # -- ServiceMonitor scrape timeout in Go duration format (e.g. 15s) + scrapeTimeout: null + # -- ServiceMonitor relabel configs to apply to samples before scraping + # https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#relabelconfig + relabelings: [] + # -- ServiceMonitor metric relabel configs to apply to samples before ingestion + # https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api.md#endpoint + metricRelabelings: [] + # -- ServiceMonitor will use http by default, but you can pick https as well + scheme: http + # -- ServiceMonitor will use these tlsConfig settings to make the health check requests + tlsConfig: null + # -- If defined, will create a MetricsInstance for the Grafana Agent Operator. + metricsInstance: + # -- If enabled, MetricsInstance resources for Grafana Agent Operator are created + enabled: true + # -- MetricsInstance annotations + annotations: {} + # -- Additional MetricsInstance labels + labels: {} + # -- If defined a MetricsInstance will be created to remote write metrics. + remoteWrite: null + # Self monitoring determines whether Loki should scrape its own logs. + # This feature currently relies on the Grafana Agent Operator being installed, + # which is installed by default using the grafana-agent-operator sub-chart. + # It will create custom resources for GrafanaAgent, LogsInstance, and PodLogs to configure + # scrape configs to scrape its own logs with the labels expected by the included dashboards. + selfMonitoring: + enabled: true + # -- Tenant to use for self monitoring + tenant: + # -- Name of the tenant + name: "self-monitoring" + # -- Namespace to create additional tenant token secret in. Useful if your Grafana instance + # is in a separate namespace. Token will still be created in the canary namespace. + secretNamespace: "{{ .Release.Namespace }}" + # Grafana Agent configuration + grafanaAgent: + # -- Controls whether to install the Grafana Agent Operator and its CRDs. + # Note that helm will not install CRDs if this flag is enabled during an upgrade. + # In that case install the CRDs manually from https://github.com/grafana/agent/tree/main/production/operator/crds + installOperator: true + # -- Grafana Agent annotations + annotations: {} + # -- Additional Grafana Agent labels + labels: {} + # -- Enable the config read api on port 8080 of the agent + enableConfigReadAPI: false + # -- The name of the PriorityClass for GrafanaAgent pods + priorityClassName: null + # -- Tolerations for GrafanaAgent pods + tolerations: [] + # PodLogs configuration + podLogs: + # -- PodLogs version + apiVersion: monitoring.grafana.com/v1alpha1 + # -- PodLogs annotations + annotations: {} + # -- Additional PodLogs labels + labels: {} + # -- PodLogs relabel configs to apply to samples before scraping + # https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#relabelconfig + relabelings: [] + # LogsInstance configuration + logsInstance: + # -- LogsInstance annotations + annotations: {} + # -- Additional LogsInstance labels + labels: {} + # -- Additional clients for remote write + clients: null + # The Loki canary pushes logs to and queries from this loki installation to test + # that it's working correctly + lokiCanary: + enabled: true + # -- The name of the label to look for at loki when doing the checks. + labelname: pod + # -- Additional annotations for the `loki-canary` Daemonset + annotations: {} + # -- Additional labels for each `loki-canary` pod + podLabels: {} + service: + # -- Annotations for loki-canary Service + annotations: {} + # -- Additional labels for loki-canary Service + labels: {} + # -- Additional CLI arguments for the `loki-canary' command + extraArgs: [] + # -- Environment variables to add to the canary pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the canary pods + extraEnvFrom: [] + # -- Resource requests and limits for the canary + resources: {} + # -- DNS config for canary pods + dnsConfig: {} + # -- Node selector for canary pods + nodeSelector: {} + # -- Tolerations for canary pods + tolerations: [] + # -- The name of the PriorityClass for loki-canary pods + priorityClassName: null + # -- Image to use for loki canary + image: + # -- The Docker registry + registry: docker.io + # -- Docker image repository + repository: grafana/loki-canary + # -- Overrides the image tag whose default is the chart's appVersion + tag: null + # -- Overrides the image tag with an image digest + digest: null + # -- Docker image pull policy + pullPolicy: IfNotPresent + # -- Update strategy for the `loki-canary` Daemonset pods + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 1 +# Configuration for the write pod(s) +write: + # -- Number of replicas for the write + replicas: 3 + autoscaling: + # -- Enable autoscaling for the write. + enabled: false + # -- Minimum autoscaling replicas for the write. + minReplicas: 2 + # -- Maximum autoscaling replicas for the write. + maxReplicas: 6 + # -- Target CPU utilisation percentage for the write. + targetCPUUtilizationPercentage: 60 + # -- Target memory utilization percentage for the write. + targetMemoryUtilizationPercentage: + # -- Behavior policies while scaling. + behavior: + # -- see https://github.com/grafana/loki/blob/main/docs/sources/operations/storage/wal.md#how-to-scale-updown for scaledown details + scaleUp: + policies: + - type: Pods + value: 1 + periodSeconds: 900 + scaleDown: + policies: + - type: Pods + value: 1 + periodSeconds: 1800 + stabilizationWindowSeconds: 3600 + image: + # -- The Docker registry for the write image. Overrides `loki.image.registry` + registry: null + # -- Docker image repository for the write image. Overrides `loki.image.repository` + repository: null + # -- Docker image tag for the write image. Overrides `loki.image.tag` + tag: null + # -- The name of the PriorityClass for write pods + priorityClassName: null + # -- Annotations for write StatefulSet + annotations: {} + # -- Annotations for write pods + podAnnotations: {} + # -- Additional labels for each `write` pod + podLabels: {} + # -- Additional selector labels for each `write` pod + selectorLabels: {} + service: + # -- Annotations for write Service + annotations: {} + # -- Additional labels for write Service + labels: {} + # -- Comma-separated list of Loki modules to load for the write + targetModule: "write" + # -- Additional CLI args for the write + extraArgs: [] + # -- Environment variables to add to the write pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the write pods + extraEnvFrom: [] + # -- Lifecycle for the write container + lifecycle: {} + # -- The default /flush_shutdown preStop hook is recommended as part of the ingester + # scaledown process so it's added to the template by default when autoscaling is enabled, + # but it's disabled to optimize rolling restarts in instances that will never be scaled + # down or when using chunks storage with WAL disabled. + # https://github.com/grafana/loki/blob/main/docs/sources/operations/storage/wal.md#how-to-scale-updown + # -- Init containers to add to the write pods + initContainers: [] + # -- Containers to add to the write pods + extraContainers: [] + # -- Volume mounts to add to the write pods + extraVolumeMounts: [] + # -- Volumes to add to the write pods + extraVolumes: [] + # -- volumeClaimTemplates to add to StatefulSet + extraVolumeClaimTemplates: [] + # -- Resource requests and limits for the write + resources: {} + # -- Grace period to allow the write to shutdown before it is killed. Especially for the ingester, + # this must be increased. It must be long enough so writes can be gracefully shutdown flushing/transferring + # all data and to successfully leave the member ring on shutdown. + terminationGracePeriodSeconds: 300 + # -- Affinity for write pods. Passed through `tpl` and, thus, to be configured as string + # @default -- Hard node and soft zone anti-affinity + affinity: | + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + {{- include "loki.writeSelectorLabels" . | nindent 10 }} + topologyKey: kubernetes.io/hostname + # -- DNS config for write pods + dnsConfig: {} + # -- Node selector for write pods + nodeSelector: {} + # -- Topology Spread Constraints for write pods + topologySpreadConstraints: [] + # -- Tolerations for write pods + tolerations: [] + # -- The default is to deploy all pods in parallel. + podManagementPolicy: "Parallel" + persistence: + # -- Enable volume claims in pod spec + volumeClaimsEnabled: true + # -- Parameters used for the `data` volume when volumeClaimEnabled if false + dataVolumeParameters: + emptyDir: {} + # -- Enable StatefulSetAutoDeletePVC feature + enableStatefulSetAutoDeletePVC: false + # -- Size of persistent disk + size: 10Gi + # -- Storage class to be used. + # If defined, storageClassName: . + # If set to "-", storageClassName: "", which disables dynamic provisioning. + # If empty or set to null, no storageClassName spec is + # set, choosing the default provisioner (gp2 on AWS, standard on GKE, AWS, and OpenStack). + storageClass: null + # -- Selector for persistent disk + selector: null +# Configuration for the table-manager +tableManager: + # -- Specifies whether the table-manager should be enabled + enabled: false + image: + # -- The Docker registry for the table-manager image. Overrides `loki.image.registry` + registry: null + # -- Docker image repository for the table-manager image. Overrides `loki.image.repository` + repository: null + # -- Docker image tag for the table-manager image. Overrides `loki.image.tag` + tag: null + # -- Command to execute instead of defined in Docker image + command: null + # -- The name of the PriorityClass for table-manager pods + priorityClassName: null + # -- Labels for table-manager pods + podLabels: {} + # -- Annotations for table-manager deployment + annotations: {} + # -- Annotations for table-manager pods + podAnnotations: {} + service: + # -- Annotations for table-manager Service + annotations: {} + # -- Additional labels for table-manager Service + labels: {} + # -- Additional CLI args for the table-manager + extraArgs: [] + # -- Environment variables to add to the table-manager pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the table-manager pods + extraEnvFrom: [] + # -- Volume mounts to add to the table-manager pods + extraVolumeMounts: [] + # -- Volumes to add to the table-manager pods + extraVolumes: [] + # -- Resource requests and limits for the table-manager + resources: {} + # -- Containers to add to the table-manager pods + extraContainers: [] + # -- Grace period to allow the table-manager to shutdown before it is killed + terminationGracePeriodSeconds: 30 + # -- Affinity for table-manager pods. Passed through `tpl` and, thus, to be configured as string + # @default -- Hard node and soft zone anti-affinity + affinity: | + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + {{- include "loki.tableManagerSelectorLabels" . | nindent 10 }} + topologyKey: kubernetes.io/hostname + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchLabels: + {{- include "loki.tableManagerSelectorLabels" . | nindent 12 }} + topologyKey: failure-domain.beta.kubernetes.io/zone + # -- DNS config table-manager pods + dnsConfig: {} + # -- Node selector for table-manager pods + nodeSelector: {} + # -- Tolerations for table-manager pods + tolerations: [] + # -- Enable deletes by retention + retention_deletes_enabled: false + # -- Set retention period + retention_period: 0 +# Configuration for the read pod(s) +read: + # -- Number of replicas for the read + replicas: 3 + autoscaling: + # -- Enable autoscaling for the read, this is only used if `queryIndex.enabled: true` + enabled: false + # -- Minimum autoscaling replicas for the read + minReplicas: 2 + # -- Maximum autoscaling replicas for the read + maxReplicas: 6 + # -- Target CPU utilisation percentage for the read + targetCPUUtilizationPercentage: 60 + # -- Target memory utilisation percentage for the read + targetMemoryUtilizationPercentage: + # -- Behavior policies while scaling. + behavior: {} + # scaleUp: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Pods + # value: 1 + # periodSeconds: 60 + # scaleDown: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Pods + # value: 1 + # periodSeconds: 180 + image: + # -- The Docker registry for the read image. Overrides `loki.image.registry` + registry: null + # -- Docker image repository for the read image. Overrides `loki.image.repository` + repository: null + # -- Docker image tag for the read image. Overrides `loki.image.tag` + tag: null + # -- The name of the PriorityClass for read pods + priorityClassName: null + # -- Annotations for read deployment + annotations: {} + # -- Annotations for read pods + podAnnotations: {} + # -- Additional labels for each `read` pod + podLabels: {} + # -- Additional selector labels for each `read` pod + selectorLabels: {} + service: + # -- Annotations for read Service + annotations: {} + # -- Additional labels for read Service + labels: {} + # -- Comma-separated list of Loki modules to load for the read + targetModule: "read" + # -- Whether or not to use the 2 target type simple scalable mode (read, write) or the + # 3 target type (read, write, backend). Legacy refers to the 2 target type, so true will + # run two targets, false will run 3 targets. + legacyReadTarget: false + # -- Additional CLI args for the read + extraArgs: [] + # -- Environment variables to add to the read pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the read pods + extraEnvFrom: [] + # -- Lifecycle for the read container + lifecycle: {} + # -- Volume mounts to add to the read pods + extraVolumeMounts: [] + # -- Volumes to add to the read pods + extraVolumes: [] + # -- Resource requests and limits for the read + resources: {} + # -- Grace period to allow the read to shutdown before it is killed + terminationGracePeriodSeconds: 30 + # -- Affinity for read pods. Passed through `tpl` and, thus, to be configured as string + # @default -- Hard node and soft zone anti-affinity + affinity: | + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + {{- include "loki.readSelectorLabels" . | nindent 10 }} + topologyKey: kubernetes.io/hostname + # -- DNS config for read pods + dnsConfig: {} + # -- Node selector for read pods + nodeSelector: {} + # -- Topology Spread Constraints for read pods + topologySpreadConstraints: [] + # -- Tolerations for read pods + tolerations: [] + # -- The default is to deploy all pods in parallel. + podManagementPolicy: "Parallel" + persistence: + # -- Enable StatefulSetAutoDeletePVC feature + enableStatefulSetAutoDeletePVC: true + # -- Size of persistent disk + size: 10Gi + # -- Storage class to be used. + # If defined, storageClassName: . + # If set to "-", storageClassName: "", which disables dynamic provisioning. + # If empty or set to null, no storageClassName spec is + # set, choosing the default provisioner (gp2 on AWS, standard on GKE, AWS, and OpenStack). + storageClass: null + # -- Selector for persistent disk + selector: null +# Configuration for the backend pod(s) +backend: + # -- Number of replicas for the backend + replicas: 3 + autoscaling: + # -- Enable autoscaling for the backend. + enabled: false + # -- Minimum autoscaling replicas for the backend. + minReplicas: 2 + # -- Maximum autoscaling replicas for the backend. + maxReplicas: 6 + # -- Target CPU utilization percentage for the backend. + targetCPUUtilizationPercentage: 60 + # -- Target memory utilization percentage for the backend. + targetMemoryUtilizationPercentage: + # -- Behavior policies while scaling. + behavior: {} + # scaleUp: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Pods + # value: 1 + # periodSeconds: 60 + # scaleDown: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Pods + # value: 1 + # periodSeconds: 180 + image: + # -- The Docker registry for the backend image. Overrides `loki.image.registry` + registry: null + # -- Docker image repository for the backend image. Overrides `loki.image.repository` + repository: null + # -- Docker image tag for the backend image. Overrides `loki.image.tag` + tag: null + # -- The name of the PriorityClass for backend pods + priorityClassName: null + # -- Annotations for backend StatefulSet + annotations: {} + # -- Annotations for backend pods + podAnnotations: {} + # -- Additional labels for each `backend` pod + podLabels: {} + # -- Additional selector labels for each `backend` pod + selectorLabels: {} + service: + # -- Annotations for backend Service + annotations: {} + # -- Additional labels for backend Service + labels: {} + # -- Comma-separated list of Loki modules to load for the read + targetModule: "backend" + # -- Additional CLI args for the backend + extraArgs: [] + # -- Environment variables to add to the backend pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the backend pods + extraEnvFrom: [] + # -- Init containers to add to the backend pods + initContainers: [] + # -- Volume mounts to add to the backend pods + extraVolumeMounts: [] + # -- Volumes to add to the backend pods + extraVolumes: [] + # -- Resource requests and limits for the backend + resources: {} + # -- Grace period to allow the backend to shutdown before it is killed. Especially for the ingester, + # this must be increased. It must be long enough so backends can be gracefully shutdown flushing/transferring + # all data and to successfully leave the member ring on shutdown. + terminationGracePeriodSeconds: 300 + # -- Affinity for backend pods. Passed through `tpl` and, thus, to be configured as string + # @default -- Hard node and soft zone anti-affinity + affinity: | + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + {{- include "loki.backendSelectorLabels" . | nindent 10 }} + topologyKey: kubernetes.io/hostname + # -- DNS config for backend pods + dnsConfig: {} + # -- Node selector for backend pods + nodeSelector: {} + # -- Topology Spread Constraints for backend pods + topologySpreadConstraints: [] + # -- Tolerations for backend pods + tolerations: [] + # -- The default is to deploy all pods in parallel. + podManagementPolicy: "Parallel" + persistence: + # -- Enable volume claims in pod spec + volumeClaimsEnabled: true + # -- Parameters used for the `data` volume when volumeClaimEnabled if false + dataVolumeParameters: + emptyDir: {} + # -- Enable StatefulSetAutoDeletePVC feature + enableStatefulSetAutoDeletePVC: true + # -- Size of persistent disk + size: 10Gi + # -- Storage class to be used. + # If defined, storageClassName: . + # If set to "-", storageClassName: "", which disables dynamic provisioning. + # If empty or set to null, no storageClassName spec is + # set, choosing the default provisioner (gp2 on AWS, standard on GKE, AWS, and OpenStack). + storageClass: null + # -- Selector for persistent disk + selector: null +# Configuration for the single binary node(s) +singleBinary: + # -- Number of replicas for the single binary + replicas: 0 + autoscaling: + # -- Enable autoscaling + enabled: false + # -- Minimum autoscaling replicas for the single binary + minReplicas: 1 + # -- Maximum autoscaling replicas for the single binary + maxReplicas: 3 + # -- Target CPU utilisation percentage for the single binary + targetCPUUtilizationPercentage: 60 + # -- Target memory utilisation percentage for the single binary + targetMemoryUtilizationPercentage: + image: + # -- The Docker registry for the single binary image. Overrides `loki.image.registry` + registry: null + # -- Docker image repository for the single binary image. Overrides `loki.image.repository` + repository: null + # -- Docker image tag for the single binary image. Overrides `loki.image.tag` + tag: null + # -- The name of the PriorityClass for single binary pods + priorityClassName: null + # -- Annotations for single binary StatefulSet + annotations: {} + # -- Annotations for single binary pods + podAnnotations: {} + # -- Additional labels for each `single binary` pod + podLabels: {} + # -- Additional selector labels for each `single binary` pod + selectorLabels: {} + service: + # -- Annotations for single binary Service + annotations: {} + # -- Additional labels for single binary Service + labels: {} + # -- Comma-separated list of Loki modules to load for the single binary + targetModule: "all" + # -- Labels for single binary service + extraArgs: [] + # -- Environment variables to add to the single binary pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the single binary pods + extraEnvFrom: [] + # -- Extra containers to add to the single binary loki pod + extraContainers: [] + # -- Init containers to add to the single binary pods + initContainers: [] + # -- Volume mounts to add to the single binary pods + extraVolumeMounts: [] + # -- Volumes to add to the single binary pods + extraVolumes: [] + # -- Resource requests and limits for the single binary + resources: {} + # -- Grace period to allow the single binary to shutdown before it is killed + terminationGracePeriodSeconds: 30 + # -- Affinity for single binary pods. Passed through `tpl` and, thus, to be configured as string + # @default -- Hard node and soft zone anti-affinity + affinity: | + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + {{- include "loki.singleBinarySelectorLabels" . | nindent 10 }} + topologyKey: kubernetes.io/hostname + # -- DNS config for single binary pods + dnsConfig: {} + # -- Node selector for single binary pods + nodeSelector: {} + # -- Tolerations for single binary pods + tolerations: [] + persistence: + # -- Enable StatefulSetAutoDeletePVC feature + enableStatefulSetAutoDeletePVC: true + # -- Enable persistent disk + enabled: true + # -- Size of persistent disk + size: 10Gi + # -- Storage class to be used. + # If defined, storageClassName: . + # If set to "-", storageClassName: "", which disables dynamic provisioning. + # If empty or set to null, no storageClassName spec is + # set, choosing the default provisioner (gp2 on AWS, standard on GKE, AWS, and OpenStack). + storageClass: null + # -- Selector for persistent disk + selector: null +# Use either this ingress or the gateway, but not both at once. +# If you enable this, make sure to disable the gateway. +# You'll need to supply authn configuration for your ingress controller. +ingress: + enabled: false + ingressClassName: "" + annotations: {} + # nginx.ingress.kubernetes.io/auth-type: basic + # nginx.ingress.kubernetes.io/auth-secret: loki-distributed-basic-auth + # nginx.ingress.kubernetes.io/auth-secret-type: auth-map + # nginx.ingress.kubernetes.io/configuration-snippet: | + # proxy_set_header X-Scope-OrgID $remote_user; + labels: {} + # blackbox.monitoring.exclude: "true" + paths: + write: + - /api/prom/push + - /loki/api/v1/push + read: + - /api/prom/tail + - /loki/api/v1/tail + - /loki/api + - /api/prom/rules + - /loki/api/v1/rules + - /prometheus/api/v1/rules + - /prometheus/api/v1/alerts + singleBinary: + - /api/prom/push + - /loki/api/v1/push + - /api/prom/tail + - /loki/api/v1/tail + - /loki/api + - /api/prom/rules + - /loki/api/v1/rules + - /prometheus/api/v1/rules + - /prometheus/api/v1/alerts + # -- Hosts configuration for the ingress, passed through the `tpl` function to allow templating + hosts: + - loki.example.com + # -- TLS configuration for the ingress. Hosts passed through the `tpl` function to allow templating + tls: [] +# - hosts: +# - loki.example.com +# secretName: loki-distributed-tls + +# Configuration for the memberlist service +memberlist: + service: + publishNotReadyAddresses: false +# Configuration for the gateway +gateway: + # -- Specifies whether the gateway should be enabled + enabled: true + # -- Number of replicas for the gateway + replicas: 1 + # -- Enable logging of 2xx and 3xx HTTP requests + verboseLogging: true + autoscaling: + # -- Enable autoscaling for the gateway + enabled: false + # -- Minimum autoscaling replicas for the gateway + minReplicas: 1 + # -- Maximum autoscaling replicas for the gateway + maxReplicas: 3 + # -- Target CPU utilisation percentage for the gateway + targetCPUUtilizationPercentage: 60 + # -- Target memory utilisation percentage for the gateway + targetMemoryUtilizationPercentage: + # -- See `kubectl explain deployment.spec.strategy` for more + # -- ref: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy + # -- Behavior policies while scaling. + behavior: {} + # scaleUp: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Pods + # value: 1 + # periodSeconds: 60 + # scaleDown: + # stabilizationWindowSeconds: 300 + # policies: + # - type: Pods + # value: 1 + # periodSeconds: 180 + deploymentStrategy: + type: RollingUpdate + image: + # -- The Docker registry for the gateway image + registry: docker.io + # -- The gateway image repository + repository: nginxinc/nginx-unprivileged + # -- The gateway image tag + tag: 1.24-alpine + # -- Overrides the gateway image tag with an image digest + digest: null + # -- The gateway image pull policy + pullPolicy: IfNotPresent + # -- The name of the PriorityClass for gateway pods + priorityClassName: null + # -- Annotations for gateway deployment + annotations: {} + # -- Annotations for gateway pods + podAnnotations: {} + # -- Additional labels for gateway pods + podLabels: {} + # -- Additional CLI args for the gateway + extraArgs: [] + # -- Environment variables to add to the gateway pods + extraEnv: [] + # -- Environment variables from secrets or configmaps to add to the gateway pods + extraEnvFrom: [] + # -- Lifecycle for the gateway container + lifecycle: {} + # -- Volumes to add to the gateway pods + extraVolumes: [] + # -- Volume mounts to add to the gateway pods + extraVolumeMounts: [] + # -- The SecurityContext for gateway containers + podSecurityContext: + fsGroup: 101 + runAsGroup: 101 + runAsNonRoot: true + runAsUser: 101 + # -- The SecurityContext for gateway containers + containerSecurityContext: + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + allowPrivilegeEscalation: false + # -- Resource requests and limits for the gateway + resources: {} + # -- Containers to add to the gateway pods + extraContainers: [] + # -- Grace period to allow the gateway to shutdown before it is killed + terminationGracePeriodSeconds: 30 + # -- Affinity for gateway pods. Passed through `tpl` and, thus, to be configured as string + # @default -- Hard node and soft zone anti-affinity + affinity: | + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchLabels: + {{- include "loki.gatewaySelectorLabels" . | nindent 10 }} + topologyKey: kubernetes.io/hostname + # -- DNS config for gateway pods + dnsConfig: {} + # -- Node selector for gateway pods + nodeSelector: {} + # -- Topology Spread Constraints for gateway pods + topologySpreadConstraints: [] + # -- Tolerations for gateway pods + tolerations: [] + # Gateway service configuration + service: + # -- Port of the gateway service + port: 80 + # -- Type of the gateway service + type: ClusterIP + # -- ClusterIP of the gateway service + clusterIP: null + # -- (int) Node port if service type is NodePort + nodePort: null + # -- Load balancer IPO address if service type is LoadBalancer + loadBalancerIP: null + # -- Annotations for the gateway service + annotations: {} + # -- Labels for gateway service + labels: {} + # Gateway ingress configuration + ingress: + # -- Specifies whether an ingress for the gateway should be created + enabled: false + # -- Ingress Class Name. MAY be required for Kubernetes versions >= 1.18 + ingressClassName: "" + # -- Annotations for the gateway ingress + annotations: {} + # -- Labels for the gateway ingress + labels: {} + # -- Hosts configuration for the gateway ingress, passed through the `tpl` function to allow templating + hosts: + - host: gateway.loki.example.com + paths: + - path: / + # -- pathType (e.g. ImplementationSpecific, Prefix, .. etc.) might also be required by some Ingress Controllers + # pathType: Prefix + # -- TLS configuration for the gateway ingress. Hosts passed through the `tpl` function to allow templating + tls: + - secretName: loki-gateway-tls + hosts: + - gateway.loki.example.com + # Basic auth configuration + basicAuth: + # -- Enables basic authentication for the gateway + enabled: false + # -- The basic auth username for the gateway + username: null + # -- The basic auth password for the gateway + password: null + # -- Uses the specified users from the `loki.tenants` list to create the htpasswd file + # if `loki.tenants` is not set, the `gateway.basicAuth.username` and `gateway.basicAuth.password` are used + # The value is templated using `tpl`. Override this to use a custom htpasswd, e.g. in case the default causes + # high CPU load. + htpasswd: >- + {{ if .Values.loki.tenants }} + + {{- range $t := .Values.loki.tenants }} + {{ htpasswd (required "All tenants must have a 'name' set" $t.name) (required "All tenants must have a 'password' set" $t.password) }} + + {{- end }} + {{ else }} {{ htpasswd (required "'gateway.basicAuth.username' is required" .Values.gateway.basicAuth.username) (required "'gateway.basicAuth.password' is required" .Values.gateway.basicAuth.password) }} {{ end }} + # -- Existing basic auth secret to use. Must contain '.htpasswd' + existingSecret: null + # Configures the readiness probe for the gateway + readinessProbe: + httpGet: + path: / + port: http + initialDelaySeconds: 15 + timeoutSeconds: 1 + nginxConfig: + # -- Enable listener for IPv6, disable on IPv4-only systems + enableIPv6: true + # -- NGINX log format + logFormat: |- + main '$remote_addr - $remote_user [$time_local] $status ' + '"$request" $body_bytes_sent "$http_referer" ' + '"$http_user_agent" "$http_x_forwarded_for"'; + # -- Allows appending custom configuration to the server block + serverSnippet: "" + # -- Allows appending custom configuration to the http block, passed through the `tpl` function to allow templating + httpSnippet: >- + {{ if .Values.loki.tenants }}proxy_set_header X-Scope-OrgID $remote_user;{{ end }} + # -- Override Read URL + customReadUrl: null + # -- Override Write URL + customWriteUrl: null + # -- Override Backend URL + customBackendUrl: null + # -- Allows overriding the DNS resolver address nginx will use. + resolver: "" + # -- Config file contents for Nginx. Passed through the `tpl` function to allow templating + # @default -- See values.yaml + file: | + {{- include "loki.nginxFile" . | indent 2 -}} +networkPolicy: + # -- Specifies whether Network Policies should be created + enabled: false + # -- Specifies whether the policies created will be standard Network Policies (flavor: kubernetes) + # or Cilium Network Policies (flavor: cilium) + flavor: kubernetes + metrics: + # -- Specifies the Pods which are allowed to access the metrics port. + # As this is cross-namespace communication, you also need the namespaceSelector. + podSelector: {} + # -- Specifies the namespaces which are allowed to access the metrics port + namespaceSelector: {} + # -- Specifies specific network CIDRs which are allowed to access the metrics port. + # In case you use namespaceSelector, you also have to specify your kubelet networks here. + # The metrics ports are also used for probes. + cidrs: [] + ingress: + # -- Specifies the Pods which are allowed to access the http port. + # As this is cross-namespace communication, you also need the namespaceSelector. + podSelector: {} + # -- Specifies the namespaces which are allowed to access the http port + namespaceSelector: {} + alertmanager: + # -- Specify the alertmanager port used for alerting + port: 9093 + # -- Specifies the alertmanager Pods. + # As this is cross-namespace communication, you also need the namespaceSelector. + podSelector: {} + # -- Specifies the namespace the alertmanager is running in + namespaceSelector: {} + externalStorage: + # -- Specify the port used for external storage, e.g. AWS S3 + ports: [] + # -- Specifies specific network CIDRs you want to limit access to + cidrs: [] + discovery: + # -- (int) Specify the port used for discovery + port: null + # -- Specifies the Pods labels used for discovery. + # As this is cross-namespace communication, you also need the namespaceSelector. + podSelector: {} + # -- Specifies the namespace the discovery Pods are running in + namespaceSelector: {} +# ------------------------------------- +# Configuration for `minio` child chart +# ------------------------------------- +minio: + enabled: false + replicas: 1 + # Minio requires 2 to 16 drives for erasure code (drivesPerNode * replicas) + # https://docs.min.io/docs/minio-erasure-code-quickstart-guide + # Since we only have 1 replica, that means 2 drives must be used. + drivesPerNode: 2 + rootUser: enterprise-logs + rootPassword: supersecret + buckets: + - name: chunks + policy: none + purge: false + - name: ruler + policy: none + purge: false + - name: admin + policy: none + purge: false + persistence: + size: 5Gi + resources: + requests: + cpu: 100m + memory: 128Mi +# Create extra manifests via values. Would be passed through `tpl` for templating +extraObjects: [] +# - apiVersion: v1 +# kind: ConfigMap +# metadata: +# name: loki-alerting-rules +# data: +# loki-alerting-rules.yaml: |- +# groups: +# - name: example +# rules: +# - alert: example +# expr: | +# sum(count_over_time({app="loki"} |~ "error")) > 0 +# for: 3m +# labels: +# severity: warning +# category: logs +# annotations: +# message: "loki has encountered errors" + +sidecar: + image: + # -- The Docker registry and image for the k8s sidecar + repository: kiwigrid/k8s-sidecar + # -- Docker image tag + tag: 1.24.3 + # -- Docker image sha. If empty, no sha will be used + sha: "" + # -- Docker image pull policy + pullPolicy: IfNotPresent + # -- Resource requests and limits for the sidecar + resources: {} + # limits: + # cpu: 100m + # memory: 100Mi + # requests: + # cpu: 50m + # memory: 50Mi + # -- The SecurityContext for the sidecar. + securityContext: {} + # -- Set to true to skip tls verification for kube api calls. + skipTlsVerify: false + # -- Ensure that rule files aren't conflicting and being overwritten by prefixing their name with the namespace they are defined in. + enableUniqueFilenames: false + # -- Readiness probe definition. Probe is disabled on the sidecar by default. + readinessProbe: {} + # -- Liveness probe definition. Probe is disabled on the sidecar by default. + livenessProbe: {} + rules: + # -- Whether or not to create a sidecar to ingest rule from specific ConfigMaps and/or Secrets. + enabled: true + # -- Label that the configmaps/secrets with rules will be marked with. + label: loki_rule + # -- Label value that the configmaps/secrets with rules will be set to. + labelValue: "" + # -- Folder into which the rules will be placed. + folder: /rules + # -- Comma separated list of namespaces. If specified, the sidecar will search for config-maps/secrets inside these namespaces. + # Otherwise the namespace in which the sidecar is running will be used. + # It's also possible to specify 'ALL' to search in all namespaces. + searchNamespace: null + # -- Method to use to detect ConfigMap changes. With WATCH the sidecar will do a WATCH request, with SLEEP it will list all ConfigMaps, then sleep for 60 seconds. + watchMethod: WATCH + # -- Search in configmap, secret, or both. + resource: both + # -- Absolute path to the shell script to execute after a configmap or secret has been reloaded. + script: null + # -- WatchServerTimeout: request to the server, asking it to cleanly close the connection after that. + # defaults to 60sec; much higher values like 3600 seconds (1h) are feasible for non-Azure K8S. + watchServerTimeout: 60 + # + # -- WatchClientTimeout: is a client-side timeout, configuring your local socket. + # If you have a network outage dropping all packets with no RST/FIN, + # this is how long your client waits before realizing & dropping the connection. + # Defaults to 66sec. + watchClientTimeout: 60 + # -- Log level of the sidecar container. + logLevel: INFO + diff --git a/k8s/loki/templates/podlogs.yaml b/k8s/loki/templates/podlogs.yaml new file mode 100644 index 0000000..54cbdd7 --- /dev/null +++ b/k8s/loki/templates/podlogs.yaml @@ -0,0 +1,46 @@ +apiVersion: monitoring.grafana.com/v1alpha1 +kind: PodLogs +metadata: + name: all-logs-fixed + labels: + app.kubernetes.io/name: loki + app.kubernetes.io/instance: loki +spec: + namespaceSelector: + any: true + selector: + matchLabels: {} + relabelings: + - action: replace + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: __host__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + replacement: $1 + separator: '-' + sourceLabels: + - __meta_kubernetes_pod_label_app_kubernetes_io_name + - __meta_kubernetes_pod_label_app_kubernetes_io_component + targetLabel: __service__ + - action: replace + replacement: $1 + separator: / + sourceLabels: + - __meta_kubernetes_namespace + - __service__ + targetLabel: job + - action: replace + sourceLabels: + - __meta_kubernetes_pod_container_name + targetLabel: container + - action: replace + sourceLabels: + - __meta_kubernetes_namespace + targetLabel: namespace + - action: replace + replacement: loki + targetLabel: cluster + pipelineStages: + - cri: {} diff --git a/k8s/loki/values.yaml b/k8s/loki/values.yaml index a55ece0..36a42ea 100644 --- a/k8s/loki/values.yaml +++ b/k8s/loki/values.yaml @@ -39,6 +39,27 @@ loki: retention_deletes_enabled: true retention_period: 672h + monitoring: + selfMonitoring: + # -- Grafana Agent annotations + annotations: {} + # -- Additional Grafana Agent labels + labels: {} + # -- Enable the config read api on port 8080 of the agent + enableConfigReadAPI: false + extraClusterRoleRules: + - apiGroups: [""] + resources: ["pods/log"] + verbs: ["get", "list", "watch"] + grafanaAgent: + podLogs: + enabled: false + namespaceSelector: + any: true + logsInstance: + podLogsSelector: + matchLabels: {} + singleBinary: replicas: 1 persistence: diff --git a/k8s/monitoring/templates/loki-dashboard.yaml b/k8s/monitoring/templates/loki-dashboard.yaml index 2790fba..c537cb9 100644 --- a/k8s/monitoring/templates/loki-dashboard.yaml +++ b/k8s/monitoring/templates/loki-dashboard.yaml @@ -1,183 +1,1061 @@ apiVersion: v1 kind: ConfigMap metadata: - name: loki-verification-dashboard + name: loki-logs-dashboard labels: grafana_dashboard: "1" data: - loki-verification.json: | + loki-logs.json: | { "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "target": { - "limit": 100, - "matchAny": false, - "tags": [], - "type": "dashboard" - }, - "type": "dashboard" - } - ] + "list": [] }, "editable": true, - "fiscalYearStartMonth": 0, + "gnetId": null, "graphTooltip": 0, + "hideControls": false, "id": null, - "links": [], - "liveNow": false, + "iteration": 1583185057230, + "links": [ + { + "asDropdown": true, + "icon": "external link", + "includeVars": true, + "keepTime": true, + "tags": [ + "loki" + ], + "targetBlank": false, + "title": "Loki Logs (Fixed v8)", + "type": "dashboards" + } + ], "panels": [ { - "datasource": { - "type": "loki", - "uid": "loki" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$logs", + "fill": 1, + "fillGradient": 0, "gridPos": { - "h": 8, - "w": 24, + "h": 4, + "w": 3, "x": 0, "y": 0 }, - "id": 2, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } + "hiddenSeries": false, + "id": null, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, "targets": [ { - "datasource": { - "type": "loki", - "uid": "loki" - }, - "editorMode": "builder", - "expr": "sum(count_over_time({job=~\".+\"}[1m]))", - "legendFormat": "", - "queryType": "range", + "expr": "sum(go_goroutines{cluster=~\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"})", "refId": "A" } ], - "title": "Log Volume", - "type": "timeseries" + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "goroutines", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { - "datasource": { - "type": "loki", - "uid": "loki" - }, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$logs", + "fill": 1, + "fillGradient": 0, "gridPos": { - "h": 16, + "h": 4, + "w": 3, + "x": 3, + "y": 0 + }, + "hiddenSeries": false, + "id": null, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(go_gc_duration_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}) by (quantile)", + "legendFormat": "{{quantile}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "gc duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$logs", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 4, + "w": 3, + "x": 6, + "y": 0 + }, + "hiddenSeries": false, + "id": null, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\", container=~\"$container\"}[5m]))", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "cpu", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$logs", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 4, + "w": 3, + "x": 9, + "y": 0 + }, + "hiddenSeries": false, + "id": null, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\", container=~\"$container\"})", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "working set", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$logs", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 4, + "w": 3, + "x": 12, + "y": 0 + }, + "hiddenSeries": false, + "id": null, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[5m]))", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "tx", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$logs", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 4, + "w": 3, + "x": 15, + "y": 0 + }, + "hiddenSeries": false, + "id": null, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(container_network_receive_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\"}[5m]))", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "rx", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$logs", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 4, + "w": 3, + "x": 18, + "y": 0 + }, + "hiddenSeries": false, + "id": null, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(kube_pod_container_status_last_terminated_reason{cluster=~\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\", container=~\"$container\"}[30m]) > 0", + "legendFormat": "{{reason}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "restarts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$logs", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 4, + "w": 3, + "x": 21, + "y": 0 + }, + "hiddenSeries": false, + "id": null, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(promtail_custom_bad_words_total{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", exported_exported_pod=~\"$pod\", container=~\"$container\"}[5m])) by (level)", + "legendFormat": "{{level}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "bad words", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$logs", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, "w": 24, "x": 0, - "y": 8 + "y": 4 }, - "id": 1, + "hiddenSeries": false, + "id": null, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "warn", + "color": "#FF780A" + }, + { + "alias": "error", + "color": "#E02F44" + }, + { + "alias": "info", + "color": "#56A64B" + }, + { + "alias": "debug", + "color": "#3274D9" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate({cluster=~\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\", container=~\"$container\" } |logfmt| level=~\"$level\" |= \"$filter\" [5m])) by (level)", + "intervalFactor": 3, + "legendFormat": "{{level}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Log Rate", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": false, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "datasource": "$logs", + "gridPos": { + "h": 19, + "w": 24, + "x": 0, + "y": 6 + }, + "id": null, + "maxDataPoints": "", "options": { - "dedupStrategy": "none", - "enableLogDetails": true, - "prettifyLogMessage": false, - "showCommonLabels": false, "showLabels": false, "showTime": true, "sortOrder": "Descending", - "wrapLogMessage": false + "wrapLogMessage": true }, "targets": [ { - "datasource": { - "type": "loki", - "uid": "loki" - }, - "editorMode": "builder", - "expr": "{job=~\".+\"}", - "queryType": "range", + "expr": "{cluster=~\"$cluster\", namespace=~\"$namespace\", pod=~\"$pod\", container=~\"$container\"} | logfmt | level=~\"$level\" |= \"$filter\"", "refId": "A" } ], + "timeFrom": null, + "timeShift": null, "title": "Logs", "type": "logs" } ], - "refresh": "", - "schemaVersion": 38, + "refresh": "10s", + "rows": [], + "schemaVersion": 14, "style": "dark", - "tags": [], + "tags": [ + "loki" + ], "templating": { - "list": [] + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": ".+", + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$logs", + "hide": 0, + "includeAll": true, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [], + "query": "label_values(cluster)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$logs", + "hide": 0, + "includeAll": true, + "label": "namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": "label_values(namespace)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "hide": 0, + "label": null, + "name": "logs", + "options": [], + "query": "loki", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": ".+", + "current": {}, + "datasource": "$logs", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "pod", + "options": [], + "query": "label_values(pod)", + "refresh": 0, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": {}, + "datasource": "$logs", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "container", + "options": [], + "query": "label_values(container)", + "refresh": 0, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { + "selected": true, + "text": "", + "value": "" + }, + "hide": 0, + "includeAll": true, + "label": "", + "multi": true, + "name": "level", + "options": [ + { + "selected": false, + "text": "debug", + "value": "debug" + }, + { + "selected": false, + "text": "info", + "value": "info" + }, + { + "selected": false, + "text": "warn", + "value": "warn" + }, + { + "selected": false, + "text": "error", + "value": "error" + } + ], + "query": "debug,info,warn,error", + "refresh": 0, + "type": "custom" + }, + { + "current": { + "selected": false, + "text": "", + "value": "" + }, + "label": "LogQL Filter", + "name": "filter", + "query": "", + "type": "textbox" + } + ] }, "time": { "from": "now-1h", "to": "now" }, - "timepicker": {}, - "timezone": "", - "title": "Loki Verification", - "uid": "loki-verification", - "version": 0, - "weekStart": "" + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Loki Logs (Fixed v10)", + "uid": "loki-logs-fixed-v10", + "version": 0 } diff --git a/k8s/monitoring/values-production.yaml b/k8s/monitoring/values-production.yaml index 2415807..3366b26 100644 --- a/k8s/monitoring/values-production.yaml +++ b/k8s/monitoring/values-production.yaml @@ -192,6 +192,7 @@ kube-prometheus-stack: additionalDataSources: - name: Loki type: loki + uid: loki url: http://loki.loki.svc:3100 access: proxy diff --git a/k8s/monitoring/values-staging.yaml b/k8s/monitoring/values-staging.yaml index d49d4e3..5badd4d 100644 --- a/k8s/monitoring/values-staging.yaml +++ b/k8s/monitoring/values-staging.yaml @@ -192,6 +192,7 @@ kube-prometheus-stack: additionalDataSources: - name: Loki type: loki + uid: loki url: http://loki.loki.svc:3100 access: proxy