chore(mimir): try to run on aws ecs
@@ -108,6 +108,31 @@ mount -t 'nfs' -o 'nfsvers=4,tcp,rwsize=1048576,hard,timeo=600,retrans=2,noresvp

</details>

<details>
<summary>Example: mount an EFS volume and change a file in it</summary>

```sh
$ aws efs describe-file-systems --query 'FileSystems[].FileSystemId' --output 'text' --creation-token 'mimir'
fs-abcdef0123456789a
$ dig 'A' +short '@172.16.0.2' 'fs-abcdef0123456789a.efs.eu-west-1.amazonaws.com'
172.16.1.20
$ mkdir -p "$HOME/tmp/efs"
$ mount -t 'nfs' -o 'nfsvers=4.0,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport' \
    '172.16.1.20:/' "$HOME/tmp/efs"
$ mount -t 'nfs'
172.16.1.20:/ on /Users/someuser/tmp/efs (nfs, nodev, nosuid, mounted by someuser)
$ sudo cp -iv 'config.yaml' "$HOME/tmp/efs/"  # EFS permissions require one to use `sudo` here
config.yaml -> /Users/someuser/tmp/efs/config.yaml
$ ls -l "$HOME/tmp/efs/"
total 1
-rw-r--r--@ 1 root wheel 254 Apr 17 17:58 config.yaml
$ cat "$HOME/tmp/efs/config.yaml"
$ vim "$HOME/tmp/efs/config.yaml"
$ umount "$HOME/tmp/efs"
```

</details>
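
The mount target's IP address can also be retrieved without DNS; a sketch using the EFS API and the file system ID
from the example above:

```sh
# List the IP addresses of the file system's mount targets.
aws efs describe-mount-targets --file-system-id 'fs-abcdef0123456789a' \
  --query 'MountTargets[].IpAddress' --output 'text'
```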

## Further readings

- [Amazon Web Services]
@@ -14,9 +14,13 @@ and set up alerting rules across multiple tenants to leverage tenant federation.

1. [Microservices mode](#microservices-mode)
1. [Storage](#storage)
   1. [Object storage](#object-storage)
1. [Authentication and authorization](#authentication-and-authorization)
1. [APIs](#apis)
1. [Deduplication of data from multiple Prometheus scrapers](#deduplication-of-data-from-multiple-prometheus-scrapers)
1. [Migrate to Mimir](#migrate-to-mimir)
1. [Troubleshooting](#troubleshooting)
   1. [HTTP status 401 Unauthorized: no org id](#http-status-401-unauthorized-no-org-id)
   1. [HTTP status 500 Internal Server Error: send data to ingesters: at least 2 live replicas required, could only find 1](#http-status-500-internal-server-error-send-data-to-ingesters-at-least-2-live-replicas-required-could-only-find-1)
1. [Further readings](#further-readings)
1. [Sources](#sources)
@@ -32,6 +36,9 @@ Such blocks are the same that Prometheus and Thanos use, though each application

uses slightly different metadata files for them.

Mimir supports multiple tenants, and stores blocks on a **per-tenant** level.<br/>
Multi-tenancy is enabled by default, and can be disabled using the `-auth.multitenancy-enabled=false` option.<br/>
When enabled, multi-tenancy **requires every API request** to carry the `X-Scope-OrgID` header, with its value set to
the tenant ID one is authenticating for.<br/>
When multi-tenancy is **disabled**, Mimir only manages a single tenant going by the name `anonymous`.

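As a sketch, querying Mimir's Prometheus-compatible API as the `team-a` tenant would then look like the following
(host, port, and tenant ID are assumptions for illustration):

```sh
curl -H 'X-Scope-OrgID: team-a' 'http://mimir.example.org:8080/prometheus/api/v1/query?query=up'
```
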
Blocks can be uploaded using the `mimirtool` utility, so that Mimir can access them.<br/>
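
A minimal sketch of such an upload, assuming block upload is enabled on the target instance (see
[Configure TSDB block upload]) and that `path/to/block` stands in for a real block directory:

```sh
# Upload a local TSDB block for the 'anonymous' tenant.
mimirtool backfill --address='http://mimir.example.org:8080' --id='anonymous' 'path/to/block'
```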
@@ -177,7 +184,12 @@ It will **not** scrape metrics itself.

```yaml
remote_write:
  - url: http://mimir.example.org:8080/api/v1/push
    headers:
      X-Scope-OrgID:
        # required unless multi-tenancy is disabled
        # set it to the correct tenant ID; `anonymous` is the default
        anonymous
```

</details>
@@ -254,7 +266,10 @@ Refer [Configure Grafana Mimir object storage backend].

Blocks storage must be located under a **different** prefix or bucket than both the ruler's and AlertManager's stores.
Mimir **will** fail to start if they overlap.

To avoid that, it is suggested to override the `bucket_name` setting in the specific configurations.

<details style="padding: 0 0 0 1rem">
<summary>Different buckets</summary>

```yaml
common:
@@ -277,6 +292,42 @@ ruler_storage:
  bucket_name: mimir-ruler
```

</details>

<details style="padding: 0 0 1rem 1rem">
<summary>Same bucket, different prefixes</summary>

```yaml
common:
  storage:
    backend: s3
    s3:
      endpoint: s3.us-east-2.amazonaws.com
      region: us-east-2
      bucket_name: mimir

blocks_storage:
  storage_prefix: blocks

alertmanager_storage:
  storage_prefix: alertmanager

ruler_storage:
  storage_prefix: ruler
```

</details>

The WAL is **only** retained on local disk, **not** persisted to the object storage.

Metrics data is uploaded to the object storage every 2 hours, typically when a block is cut from the in-memory TSDB
head.<br/>
After the metrics data block is uploaded, its related WAL is truncated too.

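The 2-hour cadence follows the TSDB block range period. As a sketch, the setting governing it, shown with its default
value; treat this as an assumption to verify against [Grafana Mimir configuration parameters]:

```yaml
blocks_storage:
  tsdb:
    # Blocks are cut from the TSDB head and uploaded once per range period.
    block_ranges_period: [ 2h ]
```
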
## Authentication and authorization

Refer [Grafana Mimir authentication and authorization].

## APIs

Refer [Grafana Mimir HTTP API].
@@ -287,7 +338,72 @@ Refer [Configure Grafana Mimir high-availability deduplication].

## Migrate to Mimir

Refer [Configure TSDB block upload] and [Migrate from Thanos or Prometheus to Grafana Mimir].

## Troubleshooting

### HTTP status 401 Unauthorized: no org id

**Context**: Prometheus servers get this error when trying to push metrics.

**Root cause**: The push request is missing the `X-Scope-OrgID` header that would specify the tenancy for the data.

**Solution**:

Configure Prometheus to add the `X-Scope-OrgID` header to the data pushed.<br/>
The header is required when multi-tenancy is enabled. When it is disabled, requests fall back to the default
`anonymous` tenant:

```yaml
remote_write:
  - url: http://mimir.example.org:8080/api/v1/push
    headers:
      X-Scope-OrgID:
        # required unless multi-tenancy is disabled
        # set it to the correct tenant ID; `anonymous` is the default
        anonymous
```

### HTTP status 500 Internal Server Error: send data to ingesters: at least 2 live replicas required, could only find 1

**Context**:

Mimir is running on [AWS ECS] in monolithic mode for evaluation.<br/>
It is loading the following configuration from a mounted [AWS EFS] volume:

```yml
multitenancy_enabled: false

common:
  storage:
    backend: s3
blocks_storage:
  s3:
    bucket_name: my-mimir-blocks
```

The service is backed by a load balancer.<br/>
The load balancer is correctly allowing requests to reach the task serving Mimir.

A Prometheus server is configured to send data to Mimir:

```yml
remote_write:
  - url: http://mimir.dev.somecompany.com:8080/api/v1/push
    headers:
      X-Scope-OrgID: anonymous
```

The push request passes Mimir's validation for the above header.

Both Mimir and Prometheus print this error when Prometheus tries to push metrics.

**Root cause**:

It seems Mimir's ingester ring defaults to a replication factor of 3, which requires a write quorum of 2 live
ingester replicas; a single monolithic instance only provides 1, so writes fail even in monolithic mode.

**Solution**:

TODO
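
One avenue worth trying is lowering the ingester ring's replication factor, which defaults to 3 and thus requires a
write quorum of 2 live ingesters. A configuration sketch for single-instance evaluation only, to be verified against
[Grafana Mimir configuration parameters]:

```yml
ingester:
  ring:
    # Default is 3, requiring a quorum of 2 live ingesters.
    # 1 lets a single monolithic instance accept writes.
    replication_factor: 1
```
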
## Further readings
@@ -307,6 +423,7 @@ Alternatives:

- [Migrate from Thanos or Prometheus to Grafana Mimir]
- [Configure Grafana Mimir object storage backend]
- [Grafana Mimir configuration parameters]
- [Grafana Mimir authentication and authorization]

<!--
Reference
@@ -315,6 +432,8 @@ Alternatives:

<!-- In-article sections -->
<!-- Knowledge base -->
[aws ecs]: cloud%20computing/aws/ecs.md
[aws efs]: cloud%20computing/aws/efs.md
[cortex]: cortex.md
[grafana]: grafana.md
[prometheus]: prometheus/README.md
@@ -325,7 +444,9 @@ Alternatives:
[codebase]: https://github.com/grafana/mimir
[configure grafana mimir high-availability deduplication]: https://grafana.com/docs/mimir/latest/configure/configure-high-availability-deduplication/
[configure grafana mimir object storage backend]: https://grafana.com/docs/mimir/latest/configure/configure-object-storage-backend/
[configure tsdb block upload]: https://grafana.com/docs/mimir/latest/configure/configure-tsdb-block-upload/
[documentation]: https://grafana.com/docs/mimir/latest/
[grafana mimir authentication and authorization]: https://grafana.com/docs/mimir/next/manage/secure/authentication-and-authorization/
[grafana mimir configuration parameters]: https://grafana.com/docs/mimir/latest/configure/configuration-parameters/
[grafana mimir http api]: https://grafana.com/docs/mimir/latest/references/http-api/
[helm chart]: https://github.com/grafana/mimir/tree/main/operations/helm/charts/mimir-distributed
@@ -16,7 +16,8 @@ are observed.

1. [Local storage](#local-storage)
1. [External storage](#external-storage)
   1. [Backfilling](#backfilling)
1. [Send metrics to other Prometheus servers](#send-metrics-to-other-prometheus-servers)
1. [Exporters](#exporters)
1. [Management API](#management-api)
   1. [Take snapshots of the current data](#take-snapshots-of-the-current-data)
1. [High availability](#high-availability)
@@ -42,6 +43,20 @@ Prometheus to scrape from.<br/>

Exporters are small, purpose-built applications that collect their targets' metrics in different ways, then expose
them on an HTTP endpoint on the targets' behalf.

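As a sketch, scraping one such endpoint only takes pointing Prometheus at it; the job name and port here are
assumptions for illustration:

```yaml
scrape_configs:
  - job_name: 'node'
    static_configs:
      - targets: [ 'localhost:9100' ]
```
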
<details>
<summary>Setup</summary>

```sh
docker pull 'prom/prometheus'
docker run -p '9090:9090' -v "$PWD/config/dir:/etc/prometheus" -v 'prometheus-data:/prometheus' 'prom/prometheus'
```

</details>

<details>
<summary>Usage</summary>

```sh
# Start the process.
prometheus
@@ -57,6 +72,8 @@ kill -s 'SIGTERM' '3969'
pkill --signal 'TERM' 'prometheus'
```

</details>

## Components

Prometheus is composed of its **server**, the **Alertmanager**, and its **exporters**.
@@ -355,7 +372,7 @@ TODO

TODO

## Send metrics to other Prometheus servers

Also see [How to set up and experiment with Prometheus remote-write].
@@ -376,6 +393,23 @@ remote_write:
      region: eu-east-1
```

## Exporters

Refer [Exporters and integrations].

Exporters are libraries and web servers that gather metrics from third-party systems, then either send them to
Prometheus servers or expose them as Prometheus metrics.

They are used in cases where it is not feasible to instrument systems to send or expose Prometheus metrics directly.

Exporters of interest:

| Exporter                                | Summary                                |
| --------------------------------------- | -------------------------------------- |
| [BOINC exporter][ordaa/boinc_exporter]  | Metrics for the BOINC client           |
| [Node exporter]                         | OS-related metrics                     |
| [SNMP exporter]                         | Exposes SNMP data in Prometheus format |

## Management API

### Take snapshots of the current data
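
A sketch of the snapshot call, assuming the server was started with the `--web.enable-admin-api` flag:

```sh
# Ask the TSDB to snapshot the current data to <data-dir>/snapshots/.
curl -X 'POST' 'http://localhost:9090/api/v1/admin/tsdb/snapshot'
```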
@@ -416,9 +450,10 @@ The snapshot now exists at `<data-dir>/snapshots/20171210T211224Z-2be650b6d019eb

Typically achieved by:

1. Running multiple Prometheus replicas.<br/>
   Replicas could each focus _on a subset_ of the whole data, or just scrape the targets multiple times and leave the
   deduplication to other tools.
1. Running a separate AlertManager instance.<br/>
   This would handle alerts from **all** the Prometheus instances, automatically managing any duplicated data.
1. Using tools like [Thanos], [Cortex], or Grafana's [Mimir] to aggregate and deduplicate data.
1. Directing visualizers like Grafana to the aggregator instead of the Prometheus replicas.

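When leaving deduplication to such tools, the replicas usually identify themselves through external labels; a sketch
using the `cluster` and `__replica__` labels that Mimir's high-availability deduplication expects by default:

```yaml
global:
  external_labels:
    cluster: 'prometheus-ha'
    __replica__: 'replica-1'  # unique per replica
```
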
@@ -439,12 +474,7 @@ Typically achieved by:

- [Cortex]
- [Thanos]
- Grafana's [Mimir]

Exporters:

- [Node exporter]
- [SNMP exporter]
- [`ordaa/boinc_exporter`][ordaa/boinc_exporter]
- [Exporters and integrations]

### Sources
@@ -481,6 +511,7 @@ Exporters:

<!-- Upstream -->
[codebase]: https://github.com/prometheus/prometheus
[documentation]: https://prometheus.io/docs/
[exporters and integrations]: https://prometheus.io/docs/instrumenting/exporters/
[functions]: https://prometheus.io/docs/prometheus/latest/querying/functions/
[helm chart]: https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus
[metric_relabel_configs]: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs
@@ -120,6 +120,7 @@ aws ecr list-images --registry-id '012345678901' --repository-name 'cache/docker
# ------------------
###

# List tasks given a service name
aws ecs list-tasks --query 'taskArns' --output 'text' --cluster 'testCluster' --service-name 'testService'

aws ecs list-tasks --output 'text' --query 'taskArns' --cluster 'testCluster' --family 'testService' \
@@ -163,6 +164,10 @@ aws ecs describe-tasks --cluster 'staging' --tasks 'ef6260ed8aab49cf926667ab0c52
aws ecs execute-command --cluster 'staging' --task 'e242654518cf42a7be13a8551e0b3c27' --container 'echo-server' \
  --interactive --command 'nc -vz 127.0.0.1 28080'

# Stop tasks given a service name
aws ecs list-tasks --cluster 'staging' --service-name 'mimir' --query 'taskArns' --output 'text' \
| xargs aws ecs stop-task --cluster 'staging' --output 'text' --query 'task.lastStatus' --task


###
# EFS
@@ -197,6 +202,18 @@ mount -t 'nfs' -o 'nfsvers=4.0,rsize=1048576,wsize=1048576,hard,timeo=600,retran
mount -t 'nfs' -o 'nfsvers=4,tcp,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport' \
  '10.20.30.42:/export-name' "$HOME/efs/export"

# Update a file in an EFS volume, then stop the ECS tasks using it so new ones can start with the updated file.
mkdir -p "$HOME/tmp/efs" \
&& aws efs describe-file-systems --query 'FileSystems[].FileSystemId' --output 'text' --creation-token 'mimir' \
   | xargs -I '%%' dig 'A' +short '@172.16.0.2' "%%.efs.eu-west-1.amazonaws.com" \
   | xargs -I '%%' mount -t 'nfs' -o 'nfsvers=4.0,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport' \
     "%%:/" "$HOME/tmp/efs" \
&& sudo cp -iv 'config.yaml' "$HOME/tmp/efs/" \
&& diff -q 'config.yaml' "$HOME/tmp/efs/config.yaml" \
&& umount "$HOME/tmp/efs" \
&& aws --profile 'ro' ecs list-tasks --cluster 'staging' --service-name 'mimir' --query 'taskArns' --output 'text' \
   | xargs -n '1' aws --profile 'rw' ecs stop-task --cluster 'staging' --output 'text' --query 'task.lastStatus' --task


###
# EKS