From ae885a940f6f8a7184f44a41b9d9728d40652486 Mon Sep 17 00:00:00 2001 From: Michele Cereda Date: Wed, 2 Apr 2025 22:22:28 +0200 Subject: [PATCH] chore(opensearch): move api usage examples to their parent section, add index patterns and snippets --- knowledge base/cloud computing/aws/awscurl.md | 19 +- knowledge base/opensearch.md | 445 ++++++++++++------ snippets/aws/awscli.fish | 13 + snippets/opensearch.fish | 48 ++ 4 files changed, 368 insertions(+), 157 deletions(-) create mode 100644 snippets/aws/awscli.fish create mode 100644 snippets/opensearch.fish diff --git a/knowledge base/cloud computing/aws/awscurl.md b/knowledge base/cloud computing/aws/awscurl.md index 0bcc6d7..7a9f36e 100644 --- a/knowledge base/cloud computing/aws/awscurl.md +++ b/knowledge base/cloud computing/aws/awscurl.md @@ -22,15 +22,15 @@ brew install 'awscurl' ```sh # Credentials are inferred from the default profile if none is given. -awscurl -X 'POST' --region 'eu-south-1' --service 'aps' \ - 'https://aps.workspace.url/api/v1/query?query=up' -awscurl … --profile 'work' -awscurl … --access_key 'access-key-id' --secret_key 'secret-key' +awscurl --service 'es' 'https://search-domain.eu-west-1.es.amazonaws.com/_cluster/health?pretty' +awscurl --region 'eu-south-1' --service 'aps' -X 'POST' 'https://aps.workspace.url/api/v1/query?query=up' +awscurl --profile 'work' … +awscurl --access_key 'access-key-id' --secret_key 'secret-key' … # Set query data out of the URL. -awscurl … 'https://aps.workspace.url/api/v1/query/api/v1/query' \ +awscurl … --service 'aps' 'https://aps.workspace.url/api/v1/query/api/v1/query' \ -d 'query=up' -d 'time=1652382537' -d 'stats=all' -awscurl … 'https://aps.workspace.url/api/v1/query/api/v1/query_range' \ +awscurl … --service 'aps' 'https://aps.workspace.url/api/v1/query/api/v1/query_range' \ -d 'query=sum+%28rate+%28go_gc_duration_seconds_count%5B1m%5D%29%29' \ -d 'start=1652382537' -d 'end=1652384705' -d 'step=1000' -d 'stats=all' @@ -46,14 +46,15 @@ docker run --rm -it 'okigan/awscurl' \ ## Further readings - [Amazon Web Services] -- [Github] +- [Codebase] ### Sources - [Using awscurl to query Prometheus-compatible APIs] @@ -61,7 +62,7 @@ docker run --rm -it 'okigan/awscurl' \ [curl]: ../../curl.md -[github]: https://github.com/okigan/awscurl +[codebase]: https://github.com/okigan/awscurl [using awscurl to query prometheus-compatible apis]: https://docs.aws.amazon.com/prometheus/latest/userguide/AMP-compatible-APIs.html diff --git a/knowledge base/opensearch.md b/knowledge base/opensearch.md index 5986436..0bfbd8a 100644 --- a/knowledge base/opensearch.md +++ b/knowledge base/opensearch.md @@ -19,6 +19,7 @@ Use cases: application search, log analytics, data observability, data ingestion 1. [Bulk indexing](#bulk-indexing) 1. [Re-index data](#re-index-data) 1. [Data streams](#data-streams) +1. [Index patterns](#index-patterns) 1. [APIs](#apis) 1. [Further readings](#further-readings) 1. [Sources](#sources) @@ -30,7 +31,7 @@ Documents are stored in the JSON format, and returned when related information i Documents are immutable. However, they can be updated by retrieving them, updating the information in them, and re-indexing them using the same document IDs. -_Indexes_ are collections of documents.
+[_Indexes_][indexes] are collections of documents.
Their contents are queried when information is searched for. _Nodes_ are servers that store data and process search requests.
@@ -115,6 +116,21 @@ This reduces the overall number of segments on each shard, frees up disk space, Eventually, segments reach a maximum allowed size and are no longer merged into larger segments.
_Merge policies_ specify the segments' maximum size and how often merge operations are performed. +Interaction with the cluster is done via REST [APIs]. + +If indexes do not already exist, OpenSearch automatically creates them while [ingesting data][ingest data]. + +
+ Typical setup order of operations + +1. \[optional] Create [index templates]. +1. \[optional] Create [data streams]. +1. \[optional] Create [indexes]. +1. [Ingest data]. +1. Create [index patterns] for the search dashboard to use. + +
+ ## Node types | Node type | Description | Best practices for production | @@ -142,12 +158,14 @@ and query metrics for these tests improve upon the architecture. Refer [Managing indexes]. -Before one can search through one's data, documents must be indexed.
-Indexing is how search engines organize data for fast retrieval. The resulting structure is called _index_. +Indexes are collections of documents that one wants to make searchable.
+They organize the data for fast retrieval.

-Within an index, OpenSearch identifies each document using a **unique** _document ID_.

+To maximize one's ability to search and analyze documents, one can define how documents and their fields are stored and
+indexed.

-Data is indexed using the [APIs].<br/>
+Before one can search through one's data, documents **must** be ingested and indexed.
+Data is ingested and indexed using the [APIs].
There are two _indexing APIs_: - The _Index API_, which adds documents individually as they arrive.
@@ -159,8 +177,9 @@ There are two _indexing APIs_: Enormous documents are still better indexed **individually**. -When indexing documents, the document's `_id` must be **up to** 512 bytes in size.
-Should one **not** provide an ID for the document, OpenSearch generates a document ID itself. +Within indexes, OpenSearch identifies each document using a **unique** _document ID_.
+The document's `_id` must be **up to** 512 bytes in size.
+Should one **not** provide an ID for the document during ingestion, OpenSearch generates a document ID itself.

 Upon receiving indexing requests, OpenSearch:

@@ -413,7 +432,7 @@ Refer [Set up a hot-warm architecture].

 ## Index templates

-Refer [Index templates].
+Refer [Index templates][documentation index templates].

 Index templates allow to initialize new indexes with predefined mappings and settings.

@@ -505,22 +524,121 @@ The entries in the items array are in the same order as the actions specified in

 Refer [Reindex data].

-When needing to make an extensive change (e.g., adding a new field to every document, or combining multiple indexes to
-form a new one), one can use the `reindex` operation instead of deleting the old indexes, making the change offline, and
-then indexing the data again.
+The `_reindex` operation copies documents, selected through a query, from one index over to another.

-The `reindex` operation copies documents, that one selects through a query, over to another index.
+When needing to make an extensive change (e.g., adding a new field to every document, moving documents between indexes,
+or combining multiple indexes into a new one), one can use the `_reindex` operation instead of deleting the old indexes,
+making the change offline, and then indexing the data again.

-Reindexing can be an expensive operation depending on the size of the source index.<br/>
-It is recommended to disable replicas in the destination index by setting `number_of_replicas` to `0`, and re-enable -them once the `reindex` process is complete. +Re-indexing can be an expensive operation depending on the size of the source index.
+It is recommended to disable replicas in the destination index by setting its `number_of_replicas` to `0`, and re-enable +them once the re-indexing process is complete. -`reindex` is a `POST` operation.
+`_reindex` is a `POST` operation.
+In its most basic form, it requires specifying a source index and a destination index.
+
+Should the destination index not exist, `_reindex` creates a new index **with default configurations**.<br/>
+If the destination index requires field mappings or custom settings, (re)create the destination index **beforehand** +with the desired ones. + +
+ Reindex all documents + +Copy **all** documents from one index to another. + +```plaintext +POST _reindex +{ + "source": { + "index": "sourceIndex" + }, + "dest": { + "index": "destinationIndex" + } +} +``` + +```json +{ + "took": 1350, + "timed_out": false, + "total": 30, + "updated": 0, + "created": 30, + "deleted": 0, + "batches": 1, + "version_conflicts": 0, + "noops": 0, + "retries": { + "bulk": 0, + "search": 0 + }, + "throttled_millis": 0, + "requests_per_second": -1, + "throttled_until_millis": 0, + "failures": [] +} +``` + +
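+
+Large re-index operations can take a while. One can run them asynchronously and follow their progress through the tasks
+API. A minimal sketch (`taskId` is illustrative, and comes from the response of the first request):
+
+```plaintext
+POST _reindex?wait_for_completion=false
+{
+  "source": {
+    "index": "sourceIndex"
+  },
+  "dest": {
+    "index": "destinationIndex"
+  }
+}
+
+GET _tasks/taskId
+```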
+ +
+ Reindex only unique documents + +Copy **only** documents **missing** from a destination index by setting the `op_type` option to `create`. + +If a document with the same ID already exists, the operation ignores the one from the source index.
+To ignore all version conflicts, set the `conflicts` option to `proceed`.
+
+> `conflicts` is a **top-level** option: keep it at the root of the request's data, outside of the `source` and `dest`
+> objects.
+
+```plaintext
+POST _reindex
+{
+  "conflicts": "proceed",
+  "source": {
+    "index": "sourceIndex"
+  },
+  "dest": {
+    "index": "destinationIndex",
+    "op_type": "create"
+  }
+}
+```
+
+</details>
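+
+To follow the replicas recommendation above, drop the destination index's replicas just before re-indexing and restore
+them once done. A minimal sketch, with illustrative names and values:
+
+```plaintext
+PUT destinationIndex/_settings
+{
+  "index": {
+    "number_of_replicas": 0
+  }
+}
+```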
+ +
+ Combine indexes + +Combine **all** documents from one or more indexes into another by adding the source indexes as a list. + +> The number of shards for your source and destination indexes **must be the same**. + +```plaintext +POST _reindex +{ + "source": { + "index": [ + "sourceIndex_1", + "sourceIndex_2" + ] + }, + "dest": { + "index": "destinationIndex" + } +} +``` + +
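+
+One can sanity-check the result of a combination (or of any re-index) by comparing document counts. A sketch, with
+illustrative index names:
+
+```plaintext
+GET sourceIndex_1/_count
+GET destinationIndex/_count
+```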
 ## Data streams

-Data streams simplify the management of time-series data.
+Data streams are **managed** indexes that are highly optimized for **time-series and append-only data** (typically,
+logs and observability data in general).
+
+They work like any other index, but OpenSearch simplifies some management operations (e.g., rollovers) and stores them
+in a more efficient way.

 They are internally composed of multiple _backing_ indexes.<br/>
Search requests are routed to **all** backing indexes, while indexing requests are routed only to the **latest** write @@ -529,6 +647,7 @@ index. ISM policies allow to automatically handle index rollover or deletion.
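+
+ISM policies are managed through the ISM plugin's API. A hedged sketch of a rollover-then-delete policy; names and
+thresholds are illustrative:
+
+```plaintext
+PUT _plugins/_ism/policies/logs-rollover
+{
+  "policy": {
+    "description": "Roll indexes over, then delete the old ones",
+    "default_state": "hot",
+    "states": [
+      {
+        "name": "hot",
+        "actions": [ { "rollover": { "min_size": "30gb", "min_index_age": "7d" } } ],
+        "transitions": [ { "state_name": "delete", "conditions": { "min_index_age": "30d" } } ]
+      },
+      {
+        "name": "delete",
+        "actions": [ { "delete": {} } ],
+        "transitions": []
+      }
+    ],
+    "ism_template": [ { "index_patterns": [ "logs-*" ] } ]
+  }
+}
+```
+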
+ Create data streams 1. Create an index template containing `index_pattern: []` and `data_stream: {}`.
This template will configure all indexes matching the defined patterns as a data stream. @@ -622,6 +741,151 @@ ISM policies allow to automatically handle index rollover or deletion.
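+
+Documents are added to a data stream through its write index, and **must** carry the stream's timestamp field
+(`@timestamp` by default). A minimal ingestion sketch, with illustrative names and values:
+
+```plaintext
+POST logs-nginx/_doc
+{
+  "@timestamp": "2025-04-02T22:22:28Z",
+  "message": "GET /index.html HTTP/1.1 200"
+}
+```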
+
+ Create templates for data streams + +```plaintext +PUT _index_template/logs-template +{ + "data_stream": {}, + "index_patterns": [ + "logs-*" + ] +} +``` + +```json +{ + "acknowledged": true +} +``` + +
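+
+One can verify the result with the corresponding `GET` request (a sketch):
+
+```plaintext
+GET _index_template/logs-template
+```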
+ +
+ Explicitly create data streams + +```plaintext +PUT _data_stream/logs-nginx +``` + +```json +{ + "acknowledged": true +} +``` + +
+ +
+ Get information about data streams + +```plaintext +GET _data_stream/logs-nginx +``` + +```json +{ + "data_streams": [ + { + "name": "logs-nginx", + "timestamp_field": { + "name": "@timestamp" + }, + "indices": [ + { + "index_name": ".ds-logs-nginx-000002", + "index_uuid": "UjUVr7haTWePKAfDz2q4Xg" + }, + { + "index_name": ".ds-logs-nginx-000004", + "index_uuid": "gi372IUBSDO-pkaj7klLiQ" + }, + { + "index_name": ".ds-logs-nginx-000005", + "index_uuid": "O60_VDzBStCaVGl8Sud2BA" + } + ], + "generation": 5, + "status": "GREEN", + "template": "logs-template" + } + ] +} +``` + +
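+
+The `generation` counter and the numbered backing indexes above are the result of rollovers. A rollover can also be
+triggered manually (a sketch):
+
+```plaintext
+POST logs-nginx/_rollover
+```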
+ +
+ Get statistics about data streams + +```plaintext +GET _data_stream/logs-nginx/_stats +``` + +```json +{ + "_shards": { + "total": 2, + "successful": 2, + "failed": 0 + }, + "data_stream_count": 1, + "backing_indices": 1, + "total_store_size_bytes": 416, + "data_streams": [ + { + "data_stream": "logs-nginx", + "backing_indices": 1, + "store_size_bytes": 416, + "maximum_timestamp": 0 + } + ] +} +``` + +
+ +
+ Delete data streams + +```plaintext +DELETE _data_stream/logs-nginx +``` + +```json +{ + "acknowledged": true +} +``` + +
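+
+> Deleting a data stream also deletes its backing indexes.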
+ +## Index patterns + +Index patterns reference one or more indexes, data streams, or index aliases.
+They are mostly used in dashboards and in the _Discover_ tab to select the indexes to gather data from.
+
+Index patterns can only be created **after** matching data has been indexed.
+
+<details>
+ Create index patterns + +1. Go to OpenSearch Dashboards. +1. In the _Management_ section of the side menu, select _Dashboards Management_. +1. Select _Index patterns_, then _Create index pattern_. +1. Define the pattern by entering a name in the Index pattern name field.
+   Dashboards automatically appends a wildcard (`*`) to make the pattern match multiple sources or indexes.
+1. Specify the time field to use when filtering documents by time.<br/>
+ Unless otherwise specified in the source or index properties, `@timestamp` will pop up in the dropdown menu. + + Should one **not** want to use a time filter, select that option from the dropdown menu.
+ This will make OpenSearch return **all** the data in **all** the indexes that match the index pattern. + +1. Select _Create index pattern_. + +
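+
+Index patterns can also be created programmatically through Dashboards' saved objects API. A hedged sketch, assuming
+Dashboards listens on its default port (`5601`) and exposes Kibana-style saved objects endpoints (verify against the
+running version); names are illustrative:
+
+```sh
+curl --insecure --user 'admin:someCustomStr0ng!Password' --request 'POST' \
+  'https://localhost:5601/api/saved_objects/index-pattern/logs-pattern' \
+  --header 'Content-Type: application/json' --header 'osd-xsrf: true' \
+  --data '{"attributes":{"title":"logs-*","timeFieldName":"@timestamp"}}'
+```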
+ ## APIs OpenSearch clusters offer a REST API.
@@ -629,25 +893,22 @@ It allows almost everything - changing most settings, modify indexes, check clus One can interact with the API using every method that can send HTTP requests.
One can also send HTTP requests in the Dev Tools console in OpenSearch Dashboards. It uses a simpler syntax to format -REST requests compared to other tools like `curl`. +REST requests compared to other tools like [cURL]. OpenSearch returns responses in **flat** JSON format by default.
Provide the `pretty` query parameter to obtain response bodies in human-readable form: ```sh -curl 'https://localhost:9200/_cluster/health?pretty' +curl --insecure --user 'admin:someCustomStr0ng!Password' 'https://localhost:9200/_cluster/health?pretty' +awscurl --service 'es' 'https://search-domain.eu-west-1.es.amazonaws.com/_cluster/health?pretty' ``` -Requests that contain a body must specify the `Content-Type` header, and provide the request payload: +Requests that contain a body **must** specify the `Content-Type` header, **and** provide the request's payload: ```sh -curl 'https://localhost:9200/students/_search?pretty' \ +curl … \ -H 'Content-Type: application/json' \ - -d '{ - "query": { - "match_all": {} - } - }' + -d '{"query":{"match_all":{}}}' ``` [REST API reference] @@ -972,126 +1233,6 @@ PUT /prometheus-logs-20231205/_settings DELETE /students ``` -```json -{ - "acknowledged": true -} -``` - - - -
- Create templates for data streams - -```plaintext -PUT _index_template/logs-template -{ - "data_stream": {}, - "index_patterns": [ - "logs-*" - ] -} -``` - -```json -{ - "acknowledged": true -} -``` - -
- -
- Explicitly create data streams - -```plaintext -PUT _data_stream/logs-nginx -``` - -```json -{ - "acknowledged": true -} -``` - -
- -
- Get information about data streams - -```plaintext -GET _data_stream/logs-nginx -``` - -```json -{ - "data_streams": [ - { - "name": "logs-nginx", - "timestamp_field": { - "name": "@timestamp" - }, - "indices": [ - { - "index_name": ".ds-logs-nginx-000002", - "index_uuid": "UjUVr7haTWePKAfDz2q4Xg" - }, - { - "index_name": ".ds-logs-nginx-000004", - "index_uuid": "gi372IUBSDO-pkaj7klLiQ" - }, - { - "index_name": ".ds-logs-nginx-000005", - "index_uuid": "O60_VDzBStCaVGl8Sud2BA" - } - ], - "generation": 5, - "status": "GREEN", - "template": "logs-template" - } - ] -} -``` - -
- -
- Get statistics about data streams - -```plaintext -GET _data_stream/logs-nginx/_stats -``` - -```json -{ - "_shards": { - "total": 2, - "successful": 2, - "failed": 0 - }, - "data_stream_count": 1, - "backing_indices": 1, - "total_store_size_bytes": 416, - "data_streams": [ - { - "data_stream": "logs-nginx", - "backing_indices": 1, - "store_size_bytes": 416, - "maximum_timestamp": 0 - } - ] -} -``` - -
- -
- Delete data streams - -```plaintext -DELETE _data_stream/logs-nginx -``` - ```json { "acknowledged": true @@ -1129,8 +1270,9 @@ DELETE _data_stream/logs-nginx - [Stepping up for a truly open source Elasticsearch] - [Managing indexes] - [Reindex data] -- [Index templates] +- [Index templates][documentation index templates] - [OpenSearch Data Streams] +- [OpenSearch Indexes and Data streams] [apis]: #apis +[data streams]: #data-streams [hot-warm architecture]: #hot-warm-architecture +[index patterns]: #index-patterns +[index templates]: #index-templates +[indexes]: #indexes +[ingest data]: #ingest-data [aws' managed opensearch]: cloud%20computing/aws/opensearch.md +[curl]: curl.md [codebase]: https://github.com/opensearch-project [creating a cluster]: https://opensearch.org/docs/latest/tuning-your-cluster/ [data prepper]: https://opensearch.org/docs/latest/data-prepper/ +[documentation index templates]: https://opensearch.org/docs/latest/im-plugin/index-templates/ [documentation]: https://opensearch.org/docs/latest/ [index management]: https://opensearch.org/docs/latest/dashboards/im-dashboards/index-management/ [index settings]: https://opensearch.org/docs/latest/install-and-configure/configuring-opensearch/index-settings/ -[index templates]: https://opensearch.org/docs/latest/im-plugin/index-templates/ [managing indexes]: https://opensearch.org/docs/latest/im-plugin/ [reindex data]: https://opensearch.org/docs/latest/im-plugin/reindex-data/ [rest api reference]: https://opensearch.org/docs/latest/api-reference/ @@ -1169,6 +1317,7 @@ DELETE _data_stream/logs-nginx [lucene]: https://lucene.apache.org/ [okapi bm25]: https://en.wikipedia.org/wiki/Okapi_BM25 [opensearch data streams]: https://opster.com/guides/opensearch/opensearch-machine-learning/opensearch-data-streams/ +[opensearch indexes and data streams]: https://stackoverflow.com/questions/75394622/opensearch-indexes-and-data-streams#75494264 [setting up hot-warm architecture for ism in opensearch]: https://opster.com/guides/opensearch/opensearch-data-architecture/setting-up-hot-warm-architecture-for-ism/ [stepping up for a truly open source elasticsearch]: https://aws.amazon.com/blogs/opensource/stepping-up-for-a-truly-open-source-elasticsearch/ [top 14 elk alternatives in 2024]: https://signoz.io/blog/elk-alternatives/ diff --git a/snippets/aws/awscli.fish b/snippets/aws/awscli.fish new file mode 100644 index 0000000..ed1d615 --- /dev/null +++ b/snippets/aws/awscli.fish @@ -0,0 +1,13 @@ +#!/usr/bin/env fish + +docker run --rm -it 'okigan/awscurl' \ + -- \ + --region 'eu-south-1' --service 'aps' \ + --access_key "$AWS_ACCESS_KEY_ID" --secret_key "$AWS_SECRET_ACCESS_KEY" \ + 'https://aps.workspace.url/api/v1/query/api/v1/query?query=up' + +awscurl --service 'es' 'https://search-domain.eu-west-1.es.amazonaws.com/_cluster/health?pretty' + +awscurl --region 'eu-south-1' --service 'aps' -X 'POST' 'https://aps.workspace.url/api/v1/query?query=up' +awscurl --service 'aps' 'https://aps.workspace.url/api/v1/query/api/v1/query' \ + -d 'query=up' -d 'time=1652382537' -d 'stats=all' diff --git a/snippets/opensearch.fish b/snippets/opensearch.fish new file mode 100644 index 0000000..1dd9130 --- /dev/null +++ b/snippets/opensearch.fish @@ -0,0 +1,48 @@ +#!/usr/bin/env fish + +# Connect to the API +curl --insecure --user 'admin:someCustomStr0ng!Password' 'https://localhost:9200/_cluster/health?pretty' +awscurl --service 'es' 'https://search-domain.eu-west-1.es.amazonaws.com/_cluster/health?pretty' +# If sending data, also set the 'Content-Type' 
+# header
+curl 'https://localhost:9200/someIndex/_search?pretty' \
+  -ku 'admin:someCustomStr0ng!Password' \
+  --header 'Content-Type: application/json' \
+  --data '{"query":{"match_all":{}}}'
+
+
+# Copy *all* documents from an index to another
+curl 'https://localhost:9200/_reindex?pretty' --request 'POST' \
+  -ku 'admin:someCustomStr0ng!Password' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "source": {"index": "sourceIndex"},
+    "dest": {"index": "destinationIndex"}
+  }'
+
+# Copy *only missing* documents from an index to another
+curl 'https://localhost:9200/_reindex?pretty' -X 'POST' \
+  -ku 'admin:someCustomStr0ng!Password' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "conflicts": "proceed",
+    "source": {"index": "sourceIndex"},
+    "dest": {
+      "index": "destinationIndex",
+      "op_type": "create"
+    }
+  }'
+
+# Combine indexes into one
+curl 'https://localhost:9200/_reindex?pretty' -X 'POST' \
+  -ku 'admin:someCustomStr0ng!Password' \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "source": {
+      "index": [
+        "sourceIndex_1",
+        …
+        "sourceIndex_N"
+      ]
+    },
+    "dest": {"index": "destinationIndex"}
+  }'
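+
+# Sanity-check a reindex by comparing document counts between indexes
+# (a sketch: index names are illustrative)
+curl 'https://localhost:9200/sourceIndex/_count?pretty' -ku 'admin:someCustomStr0ng!Password'
+curl 'https://localhost:9200/destinationIndex/_count?pretty' -ku 'admin:someCustomStr0ng!Password'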