From b173bed6bf7d88614051d310f9672f2ec70a83d4 Mon Sep 17 00:00:00 2001
From: Michele Cereda
Date: Tue, 22 Apr 2025 23:43:07 +0200
Subject: [PATCH] chore(mimir): try to run on aws ecs

---
 knowledge base/cloud computing/aws/efs.md |  25 +++++
 knowledge base/mimir.md                   | 127 +++++++++++++++++++++-
 knowledge base/prometheus/README.md       |  51 +++++++--
 snippets/aws/other commands.fish          |  17 +++
 4 files changed, 207 insertions(+), 13 deletions(-)

diff --git a/knowledge base/cloud computing/aws/efs.md b/knowledge base/cloud computing/aws/efs.md
index 1f7cf1d..3db9e74 100644
--- a/knowledge base/cloud computing/aws/efs.md
+++ b/knowledge base/cloud computing/aws/efs.md
@@ -108,6 +108,31 @@ mount -t 'nfs' -o 'nfsvers=4,tcp,rwsize=1048576,hard,timeo=600,retrans=2,noresvp
+
+<details>
+  <summary>Example: mount an EFS volume and change a file in it</summary>
+
+```sh
+$ aws efs describe-file-systems --query 'FileSystems[].FileSystemId' --output 'text' --creation-token 'mimir'
+fs-abcdef0123456789a
+$ dig 'A' +short '@172.16.0.2' 'fs-abcdef0123456789a.efs.eu-west-1.amazonaws.com'
+172.16.1.20
+$ mkdir -p "$HOME/tmp/efs"
+$ mount -t 'nfs' -o 'nfsvers=4.0,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport' \
+    '172.16.1.20:/' "$HOME/tmp/efs"
+$ mount -t 'nfs'
+172.16.1.20:/ on /Users/someuser/tmp/efs (nfs, nodev, nosuid, mounted by someuser)
+$ sudo cp -iv 'config.yaml' "$HOME/tmp/efs/"  # EFS permissions require `sudo` here
+config.yaml -> /Users/someuser/tmp/efs/config.yaml
+$ ls -l "$HOME/tmp/efs/"
+total 1
+-rw-r--r--@ 1 root  wheel  254 Apr 17 17:58 config.yaml
+$ cat "$HOME/tmp/efs/config.yaml"
+$ vim "$HOME/tmp/efs/config.yaml"
+$ umount "$HOME/tmp/efs"
+```
+
+</details>
+
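+The mount targets' IP addresses can also be retrieved without DNS resolution.
+A minimal sketch, reusing the file system ID from the example above:
+
+```sh
+aws efs describe-mount-targets --file-system-id 'fs-abcdef0123456789a' \
+  --query 'MountTargets[].IpAddress' --output 'text'
+```
+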
+ ## Further readings - [Amazon Web Services] diff --git a/knowledge base/mimir.md b/knowledge base/mimir.md index 53900b8..dc36812 100644 --- a/knowledge base/mimir.md +++ b/knowledge base/mimir.md @@ -14,9 +14,13 @@ and set up alerting rules across multiple tenants to leverage tenant federation. 1. [Microservices mode](#microservices-mode) 1. [Storage](#storage) 1. [Object storage](#object-storage) +1. [Authentication and authorization](#authentication-and-authorization) 1. [APIs](#apis) 1. [Deduplication of data from multiple Prometheus scrapers](#deduplication-of-data-from-multiple-prometheus-scrapers) 1. [Migrate to Mimir](#migrate-to-mimir) +1. [Troubleshooting](#troubleshooting) + 1. [HTTP status 401 Unauthorized: no org id](#http-status-401-unauthorized-no-org-id) + 1. [HTTP status 500 Internal Server Error: send data to ingesters: at least 2 live replicas required, could only find 1](#http-status-500-internal-server-error-send-data-to-ingesters-at-least-2-live-replicas-required-could-only-find-1) 1. [Further readings](#further-readings) 1. [Sources](#sources) @@ -32,6 +36,9 @@ Such blocks are the same that Prometheus and Thanos use, though each application uses slightly different metadata files for them. Mimir supports multiple tenants, and stores blocks on a **per-tenant** level.
+Multi-tenancy is enabled by default, and can be disabled using the `-auth.multitenancy-enabled=false` option.<br/>
+If enabled, **every API request must** carry the `X-Scope-OrgID` header, with its value set to the ID of the tenant
+one is acting for.<br/>
When multi-tenancy is **disabled**, it will only manage a single tenant going by the name `anonymous`. Blocks can be uploaded using the `mimirtool` utility, so that Mimir can access them.
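+
+A minimal sketch of an upload (host, port, and block path are assumptions; block upload must also be enabled
+server-side, as per [Configure TSDB block upload]):
+
+```sh
+# Upload local TSDB blocks for a given tenant.
+mimirtool backfill --address 'http://mimir.example.org:8080' --id 'anonymous' 'path/to/block'
+
+# Query that tenant's data back.
+curl -H 'X-Scope-OrgID: anonymous' 'http://mimir.example.org:8080/prometheus/api/v1/query?query=up'
+```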
@@ -177,7 +184,12 @@ It will **not** scrape metrics itself.

```yaml
remote_write:
-  - url: http://mimir.example.org:9009/api/v1/push
+  - url: http://mimir.example.org:8080/api/v1/push
+    headers:
+      X-Scope-OrgID:
+        # required unless multi-tenancy is disabled
+        # set it to the correct tenant's ID; 'anonymous' is the default
+        anonymous
```

@@ -254,7 +266,10 @@ Refer [Configure Grafana Mimir object storage backend].

Blocks storage must be located under a **different** prefix or bucket than both the ruler's and AlertManager's stores.
Mimir **will** fail to start if that is the case.

-To avoid that, it is suggested to override the `bucket_name` setting in the specific configurations:
+To avoid that, override the `bucket_name` setting in the specific configurations.
+
+<details>
+  <summary>Different buckets</summary>

```yaml
common:
  storage:
    backend: s3
    s3:
      endpoint: s3.us-east-2.amazonaws.com
      region: us-east-2
      bucket_name: mimir

blocks_storage:
  s3:
    bucket_name: mimir-blocks

alertmanager_storage:
  s3:
    bucket_name: mimir-alertmanager

@@ -277,6 +292,42 @@ ruler_storage:
  s3:
    bucket_name: mimir-ruler
```
+
+</details>
+
+<details>
+  <summary>Same bucket, different prefixes</summary>
+
+```yaml
+common:
+  storage:
+    backend: s3
+    s3:
+      endpoint: s3.us-east-2.amazonaws.com
+      region: us-east-2
+      bucket_name: mimir
+
+blocks_storage:
+  storage_prefix: blocks
+
+alertmanager_storage:
+  storage_prefix: alertmanager
+
+ruler_storage:
+  storage_prefix: ruler
+```
+
+</details>
+
+The WAL is **only** retained on local disk, **not** persisted to the object storage.
+
+Metrics data is uploaded to the object storage every 2 hours, typically when a block is cut from the in-memory TSDB
+head.<br/>
+After the metrics data block is uploaded, its related WAL is truncated too. + +## Authentication and authorization + +Refer [Grafana Mimir authentication and authorization]. + ## APIs Refer [Grafana Mimir HTTP API]. @@ -287,7 +338,72 @@ Refer [Configure Grafana Mimir high-availability deduplication]. ## Migrate to Mimir -Refer [Migrate from Thanos or Prometheus to Grafana Mimir]. +Refer [Configure TSDB block upload] and [Migrate from Thanos or Prometheus to Grafana Mimir]. + +## Troubleshooting + +### HTTP status 401 Unauthorized: no org id + +**Context**: Prometheus servers get this error when trying to push metrics. + +**Root cause**: The push request is missing the `X-Scope-OrgID` header that would specify the tenancy for the data. + +**Solution**: + +Configure Prometheus to add the `X-Scope-OrgID` header to the data pushed.
+The header is required whenever multi-tenancy is enabled.
+When it is disabled, data is assigned to the default `anonymous` tenant and the header can be omitted:
+
+```yaml
+remote_write:
+  - url: http://mimir.example.org:8080/api/v1/push
+    headers:
+      X-Scope-OrgID:
+        # required unless multi-tenancy is disabled
+        # set it to the correct tenant's ID; 'anonymous' is the default
+        anonymous
+```
+
+### HTTP status 500 Internal Server Error: send data to ingesters: at least 2 live replicas required, could only find 1
+
+**Context**:
+
+Mimir is running on [AWS ECS] in monolithic mode for evaluation.<br/>
+It is loading the following configuration from a mounted [AWS EFS] volume:
+
+```yml
+multitenancy_enabled: false
+
+common:
+  storage:
+    backend: s3
+blocks_storage:
+  s3:
+    bucket_name: my-mimir-blocks
+```
+
+The service is backed by a load balancer, which correctly lets requests reach the task serving Mimir.
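+
+The state of the ingesters' ring can be checked from Mimir's admin pages. A sketch, using this setup's host name:
+
+```sh
+curl 'http://mimir.dev.somecompany.com:8080/ingester/ring'
+```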
+
+A Prometheus server is configured to send data to Mimir:
+
+```yml
+remote_write:
+  - url: http://mimir.dev.somecompany.com:8080/api/v1/push
+    headers:
+      X-Scope-OrgID: anonymous
+```
+
+The push request passes Mimir's validation for the above header.
+
+Both Mimir and Prometheus print this error when Prometheus tries to push metrics.
+
+**Root cause**:
+
+Mimir's ingester ring defaults to a replication factor of 3, which makes writes require a quorum of 2 live
+ingesters.
+This requirement applies in monolithic mode too, and a single instance only provides 1 ingester.
+
+**Solution**:
+
+Lowering the ingester ring's replication factor should allow a single instance to satisfy writes.
+For evaluation deployments, set `ingester.ring.replication_factor: 1` (the `-ingester.ring.replication-factor`
+option) in the configuration.
+
+Production deployments should instead run at least as many instances as the replication factor (3 by default), all
+joined to the same ring.

## Further readings

@@ -307,6 +423,7 @@ Alternatives:
- [Migrate from Thanos or Prometheus to Grafana Mimir]
- [Configure Grafana Mimir object storage backend]
- [Grafana Mimir configuration parameters]
+- [Grafana Mimir authentication and authorization]

+[aws ecs]: cloud%20computing/aws/ecs.md
+[aws efs]: cloud%20computing/aws/efs.md
[cortex]: cortex.md
[grafana]: grafana.md
[prometheus]: prometheus/README.md
@@ -325,7 +444,9 @@ Alternatives:
[codebase]: https://github.com/grafana/mimir
[configure grafana mimir high-availability deduplication]: https://grafana.com/docs/mimir/latest/configure/configure-high-availability-deduplication/
[configure grafana mimir object storage backend]: https://grafana.com/docs/mimir/latest/configure/configure-object-storage-backend/
+[configure tsdb block upload]: https://grafana.com/docs/mimir/latest/configure/configure-tsdb-block-upload/
[documentation]: https://grafana.com/docs/mimir/latest/
+[grafana mimir authentication and authorization]: https://grafana.com/docs/mimir/latest/manage/secure/authentication-and-authorization/
[grafana mimir configuration parameters]: https://grafana.com/docs/mimir/latest/configure/configuration-parameters/
[grafana mimir http api]: https://grafana.com/docs/mimir/latest/references/http-api/
[helm chart]: https://github.com/grafana/mimir/tree/main/operations/helm/charts/mimir-distributed
diff --git a/knowledge base/prometheus/README.md b/knowledge base/prometheus/README.md
index e33adc0..b97f08a 100644
--- a/knowledge base/prometheus/README.md
+++ b/knowledge base/prometheus/README.md
@@ -16,7 +16,8 @@ are observed.
1. [Local storage](#local-storage)
1. [External storage](#external-storage)
1. [Backfilling](#backfilling)
-1. [Write to remote Prometheus servers](#write-to-remote-prometheus-servers)
+1. [Send metrics to other Prometheus servers](#send-metrics-to-other-prometheus-servers)
+1. [Exporters](#exporters)
1. [Management API](#management-api)
1. [Take snapshots of the current data](#take-snapshots-of-the-current-data)
1. [High availability](#high-availability)
@@ -42,6 +43,20 @@ Prometheus to scrape from.
Exporters are small and purpose-built applications that collect their objects' metrics in different ways, then expose them in an HTTP endpoint in their place. +
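+Scraping an exporter is then just a matter of adding it as a target.
+A minimal sketch, assuming a node exporter listening on its default port:
+
+```yml
+scrape_configs:
+  - job_name: 'node'
+    static_configs:
+      - targets: [ 'localhost:9100' ]
+```
+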
+<details>
+  <summary>Setup</summary>
+
+```sh
+docker pull 'prom/prometheus'
+docker run -p '9090:9090' -v "$PWD/config/dir:/etc/prometheus" -v 'prometheus-data:/prometheus' 'prom/prometheus'
+```
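+
+Sanity-check the instance via its management endpoints (assuming the port mapping above):
+
+```sh
+curl 'http://localhost:9090/-/healthy'
+curl 'http://localhost:9090/-/ready'
+```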
+
+</details>
+
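+Validate configuration files before starting or reloading the server.
+A sketch; the path assumes the directory mounted in the setup example above:
+
+```sh
+promtool check config "$PWD/config/dir/prometheus.yml"
+```
+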
+<details>
+  <summary>Usage</summary>
+
```sh
# Start the process.
prometheus
@@ -57,6 +72,8 @@ kill -s 'SIGTERM' '3969'
pkill --signal 'TERM' 'prometheus'
```
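+
+Reloads of the configuration do not require a restart.
+The HTTP endpoint below assumes the server was started with the `--web.enable-lifecycle` option:
+
+```sh
+# Reload the configuration.
+kill -s 'SIGHUP' "$(pgrep 'prometheus')"
+curl -X 'POST' 'http://localhost:9090/-/reload'
+```
+
+</details>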
+ ## Components Prometheus is composed by its **server**, the **Alertmanager** and its **exporters**. @@ -355,7 +372,7 @@ TODO TODO -## Write to remote Prometheus servers +## Send metrics to other Prometheus servers Also see [How to set up and experiment with Prometheus remote-write]. @@ -376,6 +393,23 @@ remote_write: region: eu-east-1 ``` +## Exporters + +Refer [Exporters and integrations]. + +Exporters are libraries and web servers that gather metrics from third-party systems, then either send them to +Prometheus servers or expose them as Prometheus metrics. + +They are used in cases where it is not feasible to instrument systems to send or expose Prometheus metrics directly. + +Exporters of interest: + +| Exporter | Summary | +| -------------------------------------- | ----------------------------------- | +| [BOINC exporter][ordaa/boinc_exporter] | Metrics for BOINC client | +| [Node exporter] | OS-related metrics | +| [SNMP exporter] | Basically SNMP in Prometheus format | + ## Management API ### Take snapshots of the current data @@ -416,9 +450,10 @@ The snapshot now exists at `/snapshots/20171210T211224Z-2be650b6d019eb Typically achieved by: 1. Running multiple Prometheus replicas.
-   Replicas could each focus on a subset of the whole data, or just duplicate it.
+   Replicas could each focus _on a subset_ of the whole data, or all scrape the same targets and leave the
+   deduplication to other tools.
1. Running a separate AlertManager instance.<br/>
-   This would handle alerts from all the Prometheus instances, automatically managing eventually duplicated data.
+   This would handle alerts from **all** the Prometheus instances, automatically managing any duplicated alerts.
1. Using tools like [Thanos], [Cortex], or Grafana's [Mimir] to aggregate and deduplicate data.
1. Directing visualizers like Grafana to the aggregator instead of the Prometheus replicas.

@@ -439,12 +474,7 @@ Typically achieved by:

- [Cortex]
- [Thanos]
- Grafana's [Mimir]
-
-Exporters:
-
-- [Node exporter]
-- [SNMP exporter]
-- [`ordaa/boinc_exporter`][ordaa/boinc_exporter]
+- [Exporters and integrations]

### Sources

@@ -481,6 +511,7 @@ Exporters:

[codebase]: https://github.com/prometheus/prometheus
[documentation]: https://prometheus.io/docs/
+[exporters and integrations]: https://prometheus.io/docs/instrumenting/exporters/
[functions]: https://prometheus.io/docs/prometheus/latest/querying/functions/
[helm chart]: https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus
[metric_relabel_configs]: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs
diff --git a/snippets/aws/other commands.fish b/snippets/aws/other commands.fish
index ddf19b7..f5994ad 100644
--- a/snippets/aws/other commands.fish
+++ b/snippets/aws/other commands.fish
@@ -120,6 +120,7 @@ aws ecr list-images --registry-id '012345678901' --repository-name 'cache/docker

# ------------------

###
+# List tasks given a service name
aws ecs list-tasks --query 'taskArns' --output 'text' --cluster 'testCluster' --service-name 'testService'

aws ecs list-tasks --output 'text' --query 'taskArns' --cluster 'testCluster' --family 'testService' \
@@ -163,6 +164,10 @@ aws ecs describe-tasks --cluster 'staging' --tasks 'ef6260ed8aab49cf926667ab0c52
aws ecs execute-command --cluster 'staging' --task 'e242654518cf42a7be13a8551e0b3c27' --container 'echo-server' \
--interactive --command 'nc -vz 127.0.0.1 28080'

+# Stop tasks given a service name
+aws ecs list-tasks --cluster 'staging' --service-name 'mimir' --query 'taskArns' --output 'text' \
+| xargs -n '1' aws ecs stop-task --cluster 'staging' --output 'text' --query 'task.lastStatus' --task
+
###
# EFS

@@ -197,6 +202,18 @@ mount -t 'nfs' -o 'nfsvers=4.0,rsize=1048576,wsize=1048576,hard,timeo=600,retran
mount -t 'nfs' -o 'nfsvers=4,tcp,rwsize=1048576,hard,timeo=600,retrans=2,noresvport' \
'10.20.30.42:/export-name' "$HOME/efs/export"

+# Update a file in an EFS volume, then stop the ECS tasks using it so that new ones start with the updated file.
+mkdir -p "$HOME/tmp/efs" \
+&& aws efs describe-file-systems --query 'FileSystems[].FileSystemId' --output 'text' --creation-token 'mimir' \
+  | xargs -I '%%' dig 'A' +short '@172.16.0.2' "%%.efs.eu-west-1.amazonaws.com" \
+  | xargs -I '%%' mount -t 'nfs' -o 'nfsvers=4.0,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport' \
+      "%%:/" "$HOME/tmp/efs" \
+&& sudo cp -iv 'config.yaml' "$HOME/tmp/efs/" \
+&& diff -q 'config.yaml' "$HOME/tmp/efs/config.yaml" \
+&& umount "$HOME/tmp/efs" \
+&& aws --profile 'ro' ecs list-tasks --cluster 'staging' --service-name 'mimir' --query 'taskArns' --output 'text' \
+  | xargs -n '1' aws --profile 'rw' ecs stop-task --cluster 'staging' --output 'text' --query 'task.lastStatus' --task
+
###
# EKS