diff --git a/knowledge base/cloud computing/aws/ssm.md b/knowledge base/cloud computing/aws/ssm.md
index f082984..89cb4ae 100644
--- a/knowledge base/cloud computing/aws/ssm.md
+++ b/knowledge base/cloud computing/aws/ssm.md
@@ -204,6 +204,18 @@ Pitfalls:
> as. Remote commands will often default to running as the `ssm-agent` user, however this will also depend on how SSM
> has been configured.
+- SSM sessions' duration is limited by SSM's settings.
+ That might impact tasks that need to run for more than said duration.
+
+
+
+ Some modules (e.g.: `community.postgresql.postgresql_db`) got their session terminated and SSM retried the task,
+ killing and restarting the running process.
+ Since the process lasted more than the sessions' duration, it kept having its sessions terminated. The task failed
+ when the SSM reached the set retries for the connection.
+
+
+
- Since [SSM starts shell sessions under `/usr/bin`][gotchas], one must explicitly set Ansible's temporary directory to
a folder the remote user can write to ([source][ansible temp dir change]).
@@ -253,10 +265,28 @@ Pitfalls:
-- When using `async` tasks, SSM will fire the task and disconnect; this makes the task **fail**, but the module will
- still run on the target host.
- Fire these tasks with `poll` set to `0` and forcing a specific failure test, then use a different task to check up on
- them.
+- When using `async` tasks, SSM will fire the task and disconnect.
+ This makes the task **fail**, but the process will still run on the target host.
+
+
+
+ ```json
+ {
+ "changed": false,
+ "module_stderr": "",
+ "module_stdout": "\u001b]0;@ip-172-31-42-42:/usr/bin\u0007{\"failed\": 0, \"started\": 1, \"finished\": 0, \"ansible_job_id\": \"j604343782826.4885\", \"results_file\": \"/tmp/.ansible-ssm-user/async/j604343782826.4885\", \"_ansible_suppress_tmpdir_delete\": true}\r\r",
+ "msg": "MODULE FAILURE\nSee stdout/stderr for the exact error",
+ "rc": 0
+ }
+ ```
+
+
+
+ Fire these tasks with `poll` set to `0` and forcing a specific failure test.
+ Then, use a different task to check up on them.
+
+  > When checking up on tasks with `ansible.builtin.async_status`, SSM will use a single connection.
+ > Said connection must be kept alive until the end of the task.
@@ -264,30 +294,34 @@ Pitfalls:
- name: Dump a DB from an RDS instance
vars:
ansible_connection: community.aws.aws_ssm
- ansible_remote_tmp: /tmp/.ansible-ssm-user/tmp #-\
+ ansible_remote_tmp: /tmp/.ansible-ssm-user/tmp #-- see previous gotchas
ansible_async_dir: /tmp/.ansible-ssm-user/async #-- see previous gotchas
wanted_pattern_in_module_output: >-
{{ '"failed": 0, "started": 1, "finished": 0' | regex_escape() }}
community.postgresql.postgresql_db: { … }
- async: "{{ 60 * 60 * 2 }}" #-- wait up to 2 hours
- poll: 0 #-- fire and forget; ssm would not check anyways
+ async: "{{ 60 * 60 * 2 }}" #-- wait up to 2 hours ( 60s * 60m * 2h )
+ poll: 0 #-- fire and forget; ssm would not check anyways
register: dump
changed_when:
- dump.rc == 0
- dump.module_stderr == ''
- "'started' | extract(dump.module_stdout | regex_search('{.*}') | from_json) == 1"
- "'failed' | extract(dump.module_stdout | regex_search('{.*}') | from_json) == 0"
- failed_when: dump.rc != 0 #-- specify the failure yourself
+ failed_when: dump.rc != 0 #-- specify the failure yourself
- name: Check on the dump task
vars:
+ max_wait: "{{ 60 * 60 * 2 }}" #-- wait for the async task to end
+ ansible_aws_ssm_timeout: "{{ max_wait }}" #-- keep active the ssm connection the whole time
+ ansible_remote_tmp: /tmp/.ansible-ssm-user/tmp #-- see previous gotchas
+ ansible_async_dir: /tmp/.ansible-ssm-user/async #-- see previous gotchas
dump_stdout_as_obj: "{{ dump.module_stdout | regex_search('{.*}') | from_json }}"
ansible_job_id: "{{ dump_stdout_as_obj.ansible_job_id }}"
ansible.builtin.async_status:
jid: "{{ ansible_job_id }}"
register: dump_result
until: dump_result.finished
- retries: "{{ 60 * 2 }}" #-- wait up to 2 hours like the task before
- delay: 60 #-/
+  retries: "{{ (max_wait | int / 60) | int }}" #-- ( ( ( max_wait/60s ) * 1/( delay/60s ) ) | int )
+ delay: 60 #-- set high to avoid overloading the ssm agent with sessions
```
diff --git a/snippets/ansible/tasks.yml b/snippets/ansible/tasks.yml
index 3b8844e..c1e54dd 100644
--- a/snippets/ansible/tasks.yml
+++ b/snippets/ansible/tasks.yml
@@ -815,13 +815,14 @@
image:
id: "{{ source_ami.image_id }}"
- name: Long-running tasks via SSM
+ vars:
+ ansible_connection: community.aws.aws_ssm
+ ansible_remote_tmp: /tmp/.ansible-ssm-user/tmp
+ ansible_async_dir: /tmp/.ansible-ssm-user/async
block:
- name: Dump a DB from an RDS instance to a temporary file
when: rds_instance.endpoint is defined
vars:
- ansible_connection: community.aws.aws_ssm
- ansible_remote_tmp: /tmp/.ansible-ssm-user/tmp
- ansible_async_dir: /tmp/.ansible-ssm-user/async
wanted_pattern_in_module_output: >-
{{ '"failed": 0, "started": 1, "finished": 0' | regex_escape() }}
community.postgresql.postgresql_db:
@@ -831,14 +832,18 @@
login_password: "{{ db_password }}"
name: sales
state: dump
- target: "{{ temp_file_for_dump.path }}"
+        target: >-
+          {{ [
+            ansible_user_dir,
+            'dump.' ~ db_instance_identifier ~ '.' ~ ansible_date_time.iso8601_basic_short ~ '.dir',
+          ] | path_join }}
target_opts: >-
--exclude-table …
--exclude-schema archived
--no-publications
- --format c
- async: "{{ 60 * 60 * 2 }}" # wait up to 2 hours
- poll: 0 # fire and forget, since it would not check anyways
+ --format d --jobs $(nproc)
+      async: "{{ 60 * 60 * 12 }}" # wait up to 12 hours -- 60 secs * 60 mins * 12 hours
+ poll: 0 # fire and forget, since ssm would not allow self-checking anyways
register: dump
changed_when:
- dump.rc == 0
@@ -848,14 +853,16 @@
failed_when: dump.rc != 0
- name: Check on the dump task
vars:
+        max_wait: "{{ 60 * 60 * 12 }}" # wait up to 12 hours, in seconds -- 60 secs * 60 mins * 12 hours
+        ansible_aws_ssm_timeout: "{{ max_wait }}" # ssm uses a single connection, keep active until the end
dump_stdout_as_obj: "{{ dump.module_stdout | regex_search('{.*}') | from_json }}"
ansible_job_id: "{{ dump_stdout_as_obj.ansible_job_id }}"
ansible.builtin.async_status:
jid: "{{ ansible_job_id }}"
register: dump_result
until: dump_result.finished
- retries: "{{ 60 * 2 }}"
- delay: 60
+      retries: "{{ (60 / 5 * 12) | int }}" # wait up to 12 hours -- (60 mins / 5 min delay) * 12 hours
+      delay: 300 # check once every 5 minutes to avoid overloading the ssm agent
- name: RDS-specific operations
block:
- name: Create an instance's snapshot
diff --git a/snippets/postgres/commands.sh b/snippets/postgres/commands.sh
index 7081794..1dd07c7 100644
--- a/snippets/postgres/commands.sh
+++ b/snippets/postgres/commands.sh
@@ -64,6 +64,7 @@ pg_dump -h 'host.fqnd' -p '5432' -U 'admin' -d 'postgres' -W
pg_dump -U 'postgres' -d 'sales' -F 'custom' -f 'sales.bak' --schema-only
pg_dump … -T 'customers,orders' -t 'salespeople,performances'
pg_dump … -s --format 'custom'
+pg_dump … -F'd' --jobs '3'
# Dump DBs' schema only
pg_dump --host 'host.fqnd' --port '5432' --username 'postgres' --dbname 'postgres' --password --schema-only
@@ -75,7 +76,7 @@ pg_dumpall -h 'host.fqnd' -p '5432' -U 'postgres' -l 'postgres' -Wrf 'roles.sql'
# Restore backups
pg_restore -U 'postgres' -d 'sales' 'sales.dump'
-pg_restore -h 'host.fqdn' -U 'master' -d 'sales' -Oxj 8 'sales.dump'
+pg_restore -h 'host.fqdn' -U 'master' -d 'sales' -Oxj '8' 'sales.dump'
# Initialize a test DB
pgbench -i 'test-db'