From b173bed6bf7d88614051d310f9672f2ec70a83d4 Mon Sep 17 00:00:00 2001
From: Michele Cereda
Date: Tue, 22 Apr 2025 23:43:07 +0200
Subject: [PATCH] chore(mimir): try to run on aws ecs

---
 knowledge base/cloud computing/aws/efs.md |  25 +++++
 knowledge base/mimir.md                   | 127 +++++++++++++++++++++-
 knowledge base/prometheus/README.md       |  51 +++++++--
 snippets/aws/other commands.fish          |  17 +++
 4 files changed, 207 insertions(+), 13 deletions(-)

diff --git a/knowledge base/cloud computing/aws/efs.md b/knowledge base/cloud computing/aws/efs.md
index 1f7cf1d..3db9e74 100644
--- a/knowledge base/cloud computing/aws/efs.md
+++ b/knowledge base/cloud computing/aws/efs.md
@@ -108,6 +108,31 @@ mount -t 'nfs' -o 'nfsvers=4,tcp,rwsize=1048576,hard,timeo=600,retrans=2,noresvp
+
+<details>
+  <summary>Example: mount an EFS volume and change a file in it</summary>
+
+```sh
+$ aws efs describe-file-systems --query 'FileSystems[].FileSystemId' --output 'text' --creation-token 'mimir'
+fs-abcdef0123456789a
+$ dig 'A' +short '@172.16.0.2' 'fs-abcdef0123456789a.efs.eu-west-1.amazonaws.com'
+172.16.1.20
+$ mkdir -p "$HOME/tmp/efs"
+$ mount -t 'nfs' -o 'nfsvers=4.0,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport' \
+    '172.16.1.20:/' "$HOME/tmp/efs"
+$ mount -t 'nfs'
+172.16.1.20:/ on /Users/someuser/tmp/efs (nfs, nodev, nosuid, mounted by someuser)
+$ sudo cp -iv 'config.yaml' "$HOME/tmp/efs/"  # EFS permissions require `sudo` here
+config.yaml -> /Users/someuser/tmp/efs/config.yaml
+$ ls -l "$HOME/tmp/efs/"
+total 1
+-rw-r--r--@ 1 root  wheel  254 Apr 17 17:58 config.yaml
+$ cat "$HOME/tmp/efs/config.yaml"
+$ vim "$HOME/tmp/efs/config.yaml"
+$ umount "$HOME/tmp/efs"
+```
+
+</details>
+
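+The mount targets' IP addresses can also be retrieved without DNS resolution.
+A minimal sketch, reusing the file system ID from the example above:
+
+```sh
+aws efs describe-mount-targets --file-system-id 'fs-abcdef0123456789a' \
+  --query 'MountTargets[].IpAddress' --output 'text'
+```
+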
+ ## Further readings - [Amazon Web Services] diff --git a/knowledge base/mimir.md b/knowledge base/mimir.md index 53900b8..dc36812 100644 --- a/knowledge base/mimir.md +++ b/knowledge base/mimir.md @@ -14,9 +14,13 @@ and set up alerting rules across multiple tenants to leverage tenant federation. 1. [Microservices mode](#microservices-mode) 1. [Storage](#storage) 1. [Object storage](#object-storage) +1. [Authentication and authorization](#authentication-and-authorization) 1. [APIs](#apis) 1. [Deduplication of data from multiple Prometheus scrapers](#deduplication-of-data-from-multiple-prometheus-scrapers) 1. [Migrate to Mimir](#migrate-to-mimir) +1. [Troubleshooting](#troubleshooting) + 1. [HTTP status 401 Unauthorized: no org id](#http-status-401-unauthorized-no-org-id) + 1. [HTTP status 500 Internal Server Error: send data to ingesters: at least 2 live replicas required, could only find 1](#http-status-500-internal-server-error-send-data-to-ingesters-at-least-2-live-replicas-required-could-only-find-1) 1. [Further readings](#further-readings) 1. [Sources](#sources) @@ -32,6 +36,9 @@ Such blocks are the same that Prometheus and Thanos use, though each application uses slightly different metadata files for them. Mimir supports multiple tenants, and stores blocks on a **per-tenant** level.
+Multi-tenancy is enabled by default, and can be disabled using the `-auth.multitenancy-enabled=false` option.<br/>
+If enabled, **every API request must** carry the `X-Scope-OrgID` header, with its value set to the ID of the tenant
+one is acting for.<br/>
When multi-tenancy is **disabled**, it will only manage a single tenant going by the name `anonymous`. Blocks can be uploaded using the `mimirtool` utility, so that Mimir can access them.
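+
+A minimal sketch of an upload (host, port, and block path are assumptions; block upload must also be enabled
+server-side, as per [Configure TSDB block upload]):
+
+```sh
+# Upload local TSDB blocks for a given tenant.
+mimirtool backfill --address 'http://mimir.example.org:8080' --id 'anonymous' 'path/to/block'
+
+# Query that tenant's data back.
+curl -H 'X-Scope-OrgID: anonymous' 'http://mimir.example.org:8080/prometheus/api/v1/query?query=up'
+```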
@@ -177,7 +184,12 @@ It will **not** scrape metrics itself.

```yaml
remote_write:
-  - url: http://mimir.example.org:9009/api/v1/push
+  - url: http://mimir.example.org:8080/api/v1/push
+    headers:
+      X-Scope-OrgID:
+        # required unless multi-tenancy is disabled
+        # set it to the correct tenant's ID; 'anonymous' is the default
+        anonymous
```

@@ -254,7 +266,10 @@ Refer [Configure Grafana Mimir object storage backend].

Blocks storage must be located under a **different** prefix or bucket than both the ruler's and AlertManager's stores.
Mimir **will** fail to start if that is the case.

-To avoid that, it is suggested to override the `bucket_name` setting in the specific configurations:
+To avoid that, override the `bucket_name` setting in the specific configurations.
+
+<details>
+  <summary>Different buckets</summary>

```yaml
common:
  storage:
    backend: s3
    s3:
      endpoint: s3.us-east-2.amazonaws.com
      region: us-east-2
      bucket_name: mimir

blocks_storage:
  s3:
    bucket_name: mimir-blocks

alertmanager_storage:
  s3:
    bucket_name: mimir-alertmanager

@@ -277,6 +292,42 @@ ruler_storage:
  s3:
    bucket_name: mimir-ruler
```
+
+</details>
+
+<details>
+  <summary>Same bucket, different prefixes</summary>
+
+```yaml
+common:
+  storage:
+    backend: s3
+    s3:
+      endpoint: s3.us-east-2.amazonaws.com
+      region: us-east-2
+      bucket_name: mimir
+
+blocks_storage:
+  storage_prefix: blocks
+
+alertmanager_storage:
+  storage_prefix: alertmanager
+
+ruler_storage:
+  storage_prefix: ruler
+```
+
+</details>
+
+The WAL is **only** retained on local disk, **not** persisted to the object storage.
+
+Metrics data is uploaded to the object storage every 2 hours, typically when a block is cut from the in-memory TSDB
+head.<br/>
+After the metrics data block is uploaded, its related WAL is truncated too. + +## Authentication and authorization + +Refer [Grafana Mimir authentication and authorization]. + ## APIs Refer [Grafana Mimir HTTP API]. @@ -287,7 +338,72 @@ Refer [Configure Grafana Mimir high-availability deduplication]. ## Migrate to Mimir -Refer [Migrate from Thanos or Prometheus to Grafana Mimir]. +Refer [Configure TSDB block upload] and [Migrate from Thanos or Prometheus to Grafana Mimir]. + +## Troubleshooting + +### HTTP status 401 Unauthorized: no org id + +**Context**: Prometheus servers get this error when trying to push metrics. + +**Root cause**: The push request is missing the `X-Scope-OrgID` header that would specify the tenancy for the data. + +**Solution**: + +Configure Prometheus to add the `X-Scope-OrgID` header to the data pushed.
+The header is required whenever multi-tenancy is enabled.
+When it is disabled, data is assigned to the default `anonymous` tenant and the header can be omitted:
+
+```yaml
+remote_write:
+  - url: http://mimir.example.org:8080/api/v1/push
+    headers:
+      X-Scope-OrgID:
+        # required unless multi-tenancy is disabled
+        # set it to the correct tenant's ID; 'anonymous' is the default
+        anonymous
+```
+
+### HTTP status 500 Internal Server Error: send data to ingesters: at least 2 live replicas required, could only find 1
+
+**Context**:
+
+Mimir is running on [AWS ECS] in monolithic mode for evaluation.<br/>
+It is loading the following configuration from a mounted [AWS EFS] volume:
+
+```yml
+multitenancy_enabled: false
+
+common:
+  storage:
+    backend: s3
+blocks_storage:
+  s3:
+    bucket_name: my-mimir-blocks
+```
+
+The service is backed by a load balancer, which correctly lets requests reach the task serving Mimir.
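+
+The state of the ingesters' ring can be checked from Mimir's admin pages. A sketch, using this setup's host name:
+
+```sh
+curl 'http://mimir.dev.somecompany.com:8080/ingester/ring'
+```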
+
+A Prometheus server is configured to send data to Mimir:
+
+```yml
+remote_write:
+  - url: http://mimir.dev.somecompany.com:8080/api/v1/push
+    headers:
+      X-Scope-OrgID: anonymous
+```
+
+The push request passes Mimir's validation for the above header.
+
+Both Mimir and Prometheus print this error when Prometheus tries to push metrics.
+
+**Root cause**:
+
+Mimir's ingester ring defaults to a replication factor of 3, which makes writes require a quorum of 2 live
+ingesters.
+This requirement applies in monolithic mode too, and a single instance only provides 1 ingester.
+
+**Solution**:
+
+Lowering the ingester ring's replication factor should allow a single instance to satisfy writes.
+For evaluation deployments, set `ingester.ring.replication_factor: 1` (the `-ingester.ring.replication-factor`
+option) in the configuration.
+
+Production deployments should instead run at least as many instances as the replication factor (3 by default), all
+joined to the same ring.

## Further readings

@@ -307,6 +423,7 @@ Alternatives:
- [Migrate from Thanos or Prometheus to Grafana Mimir]
- [Configure Grafana Mimir object storage backend]
- [Grafana Mimir configuration parameters]
+- [Grafana Mimir authentication and authorization]

+[aws ecs]: cloud%20computing/aws/ecs.md
+[aws efs]: cloud%20computing/aws/efs.md
[cortex]: cortex.md
[grafana]: grafana.md
[prometheus]: prometheus/README.md
@@ -325,7 +444,9 @@ Alternatives:
[codebase]: https://github.com/grafana/mimir
[configure grafana mimir high-availability deduplication]: https://grafana.com/docs/mimir/latest/configure/configure-high-availability-deduplication/
[configure grafana mimir object storage backend]: https://grafana.com/docs/mimir/latest/configure/configure-object-storage-backend/
+[configure tsdb block upload]: https://grafana.com/docs/mimir/latest/configure/configure-tsdb-block-upload/
[documentation]: https://grafana.com/docs/mimir/latest/
+[grafana mimir authentication and authorization]: https://grafana.com/docs/mimir/latest/manage/secure/authentication-and-authorization/
[grafana mimir configuration parameters]: https://grafana.com/docs/mimir/latest/configure/configuration-parameters/
[grafana mimir http api]: https://grafana.com/docs/mimir/latest/references/http-api/
[helm chart]: https://github.com/grafana/mimir/tree/main/operations/helm/charts/mimir-distributed
diff --git a/knowledge base/prometheus/README.md b/knowledge base/prometheus/README.md
index e33adc0..b97f08a 100644
--- a/knowledge base/prometheus/README.md
+++ b/knowledge base/prometheus/README.md
@@ -16,7 +16,8 @@ are observed.
1. [Local storage](#local-storage)
1. [External storage](#external-storage)
1. [Backfilling](#backfilling)
-1. [Write to remote Prometheus servers](#write-to-remote-prometheus-servers)
+1. [Send metrics to other Prometheus servers](#send-metrics-to-other-prometheus-servers)
+1. [Exporters](#exporters)
1. [Management API](#management-api)
1. [Take snapshots of the current data](#take-snapshots-of-the-current-data)
1. [High availability](#high-availability)
@@ -42,6 +43,20 @@ Prometheus to scrape from.
Exporters are small and purpose-built applications that collect their objects' metrics in different ways, then expose them in an HTTP endpoint in their place. +
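+Scraping an exporter is then just a matter of adding it as a target.
+A minimal sketch, assuming a node exporter listening on its default port:
+
+```yml
+scrape_configs:
+  - job_name: 'node'
+    static_configs:
+      - targets: [ 'localhost:9100' ]
+```
+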
+<details>
+  <summary>Setup</summary>
+
+```sh
+docker pull 'prom/prometheus'
+docker run -p '9090:9090' -v "$PWD/config/dir:/etc/prometheus" -v 'prometheus-data:/prometheus' 'prom/prometheus'
+```
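+
+Sanity-check the instance via its management endpoints (assuming the port mapping above):
+
+```sh
+curl 'http://localhost:9090/-/healthy'
+curl 'http://localhost:9090/-/ready'
+```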
+
+</details>
+
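+Validate configuration files before starting or reloading the server.
+A sketch; the path assumes the directory mounted in the setup example above:
+
+```sh
+promtool check config "$PWD/config/dir/prometheus.yml"
+```
+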
+<details>
+  <summary>Usage</summary>
+
```sh
# Start the process.
prometheus
@@ -57,6 +72,8 @@ kill -s 'SIGTERM' '3969'
pkill --signal 'TERM' 'prometheus'
```
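+
+Reloads of the configuration do not require a restart.
+The HTTP endpoint below assumes the server was started with the `--web.enable-lifecycle` option:
+
+```sh
+# Reload the configuration.
+kill -s 'SIGHUP' "$(pgrep 'prometheus')"
+curl -X 'POST' 'http://localhost:9090/-/reload'
+```
+
+</details>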
+ ## Components Prometheus is composed by its **server**, the **Alertmanager** and its **exporters**. @@ -355,7 +372,7 @@ TODO TODO -## Write to remote Prometheus servers +## Send metrics to other Prometheus servers Also see [How to set up and experiment with Prometheus remote-write]. @@ -376,6 +393,23 @@ remote_write: region: eu-east-1 ``` +## Exporters + +Refer [Exporters and integrations]. + +Exporters are libraries and web servers that gather metrics from third-party systems, then either send them to +Prometheus servers or expose them as Prometheus metrics. + +They are used in cases where it is not feasible to instrument systems to send or expose Prometheus metrics directly. + +Exporters of interest: + +| Exporter | Summary | +| -------------------------------------- | ----------------------------------- | +| [BOINC exporter][ordaa/boinc_exporter] | Metrics for BOINC client | +| [Node exporter] | OS-related metrics | +| [SNMP exporter] | Basically SNMP in Prometheus format | + ## Management API ### Take snapshots of the current data @@ -416,9 +450,10 @@ The snapshot now exists at `/snapshots/20171210T211224Z-2be650b6d019eb Typically achieved by: 1. Running multiple Prometheus replicas.
-   Replicas could each focus on a subset of the whole data, or just duplicate it.
+   Replicas could each focus _on a subset_ of the whole data, or all scrape the same targets and leave the
+   deduplication to other tools.
1. Running a separate AlertManager instance.<br/>
-   This would handle alerts from all the Prometheus instances, automatically managing eventually duplicated data.
+   This would handle alerts from **all** the Prometheus instances, automatically managing any duplicated alerts.
1. Using tools like [Thanos], [Cortex], or Grafana's [Mimir] to aggregate and deduplicate data.
1. Directing visualizers like Grafana to the aggregator instead of the Prometheus replicas.

@@ -439,12 +474,7 @@ Typically achieved by:

- [Cortex]
- [Thanos]
- Grafana's [Mimir]
-
-Exporters:
-
-- [Node exporter]
-- [SNMP exporter]
-- [`ordaa/boinc_exporter`][ordaa/boinc_exporter]
+- [Exporters and integrations]

### Sources

@@ -481,6 +511,7 @@ Exporters:

[codebase]: https://github.com/prometheus/prometheus
[documentation]: https://prometheus.io/docs/
+[exporters and integrations]: https://prometheus.io/docs/instrumenting/exporters/
[functions]: https://prometheus.io/docs/prometheus/latest/querying/functions/
[helm chart]: https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus
[metric_relabel_configs]: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs
diff --git a/snippets/aws/other commands.fish b/snippets/aws/other commands.fish
index ddf19b7..f5994ad 100644
--- a/snippets/aws/other commands.fish
+++ b/snippets/aws/other commands.fish
@@ -120,6 +120,7 @@ aws ecr list-images --registry-id '012345678901' --repository-name 'cache/docker

# ------------------

###
+# List tasks given a service name
aws ecs list-tasks --query 'taskArns' --output 'text' --cluster 'testCluster' --service-name 'testService'

aws ecs list-tasks --output 'text' --query 'taskArns' --cluster 'testCluster' --family 'testService' \
@@ -163,6 +164,10 @@ aws ecs describe-tasks --cluster 'staging' --tasks 'ef6260ed8aab49cf926667ab0c52
aws ecs execute-command --cluster 'staging' --task 'e242654518cf42a7be13a8551e0b3c27' --container 'echo-server' \
--interactive --command 'nc -vz 127.0.0.1 28080'

+# Stop tasks given a service name
+aws ecs list-tasks --cluster 'staging' --service-name 'mimir' --query 'taskArns' --output 'text' \
+| xargs -n '1' aws ecs stop-task --cluster 'staging' --output 'text' --query 'task.lastStatus' --task
+
###
# EFS

@@ -197,6 +202,18 @@ mount -t 'nfs' -o 'nfsvers=4.0,rsize=1048576,wsize=1048576,hard,timeo=600,retran
mount -t 'nfs' -o 'nfsvers=4,tcp,rwsize=1048576,hard,timeo=600,retrans=2,noresvport' \
'10.20.30.42:/export-name' "$HOME/efs/export"

+# Update a file in an EFS volume, then stop the ECS tasks using it so that new ones start with the updated file.
+mkdir -p "$HOME/tmp/efs" \
+&& aws efs describe-file-systems --query 'FileSystems[].FileSystemId' --output 'text' --creation-token 'mimir' \
+  | xargs -I '%%' dig 'A' +short '@172.16.0.2' "%%.efs.eu-west-1.amazonaws.com" \
+  | xargs -I '%%' mount -t 'nfs' -o 'nfsvers=4.0,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport' \
+      "%%:/" "$HOME/tmp/efs" \
+&& sudo cp -iv 'config.yaml' "$HOME/tmp/efs/" \
+&& diff -q 'config.yaml' "$HOME/tmp/efs/config.yaml" \
+&& umount "$HOME/tmp/efs" \
+&& aws --profile 'ro' ecs list-tasks --cluster 'staging' --service-name 'mimir' --query 'taskArns' --output 'text' \
+  | xargs -n '1' aws --profile 'rw' ecs stop-task --cluster 'staging' --output 'text' --query 'task.lastStatus' --task
+
###
# EKS