diff --git a/knowledge base/cloud computing/aws/ssm.md b/knowledge base/cloud computing/aws/ssm.md index f082984..89cb4ae 100644 --- a/knowledge base/cloud computing/aws/ssm.md +++ b/knowledge base/cloud computing/aws/ssm.md @@ -204,6 +204,18 @@ Pitfalls: > as. Remote commands will often default to running as the `ssm-agent` user, however this will also depend on how SSM > has been configured. +- SSM sessions' duration is limited by SSM's settings.
+ That might impact tasks that need to run for more than said duration. + +
+ + Some modules (e.g.: `community.postgresql.postgresql_db`) got their session terminated and SSM retried the task, + killing and restarting the running process.
+ Since the process lasted more than the sessions' duration, it kept having its sessions terminated. The task failed + when SSM reached the set retries for the connection. + +
+ - Since [SSM starts shell sessions under `/usr/bin`][gotchas], one must explicitly set Ansible's temporary directory to a folder the remote user can write to ([source][ansible temp dir change]). @@ -253,10 +265,28 @@ Pitfalls: -- When using `async` tasks, SSM will fire the task and disconnect; this makes the task **fail**, but the module will - still run on the target host.
- Fire these tasks with `poll` set to `0` and forcing a specific failure test, then use a different task to check up on - them. +- When using `async` tasks, SSM will fire the task and disconnect.
+ This makes the task **fail**, but the process will still run on the target host. + +
+ + ```json + { + "changed": false, + "module_stderr": "", + "module_stdout": "\u001b]0;@ip-172-31-42-42:/usr/bin\u0007{\"failed\": 0, \"started\": 1, \"finished\": 0, \"ansible_job_id\": \"j604343782826.4885\", \"results_file\": \"/tmp/.ansible-ssm-user/async/j604343782826.4885\", \"_ansible_suppress_tmpdir_delete\": true}\r\r", + "msg": "MODULE FAILURE\nSee stdout/stderr for the exact error", + "rc": 0 + } + ``` + +
+ + Fire these tasks with `poll` set to `0` and forcing a specific failure test.
+ Then, use a different task to check up on them. + + > When checking up on tasks with `ansible.builtin.async_status`, SSM will use a single connection.
+ > Said connection must be kept alive until the end of the task.
@@ -264,30 +294,34 @@ Pitfalls: - name: Dump a DB from an RDS instance vars: ansible_connection: community.aws.aws_ssm - ansible_remote_tmp: /tmp/.ansible-ssm-user/tmp #-\ + ansible_remote_tmp: /tmp/.ansible-ssm-user/tmp #-- see previous gotchas ansible_async_dir: /tmp/.ansible-ssm-user/async #-- see previous gotchas wanted_pattern_in_module_output: >- {{ '"failed": 0, "started": 1, "finished": 0' | regex_escape() }} community.postgresql.postgresql_db: { … } - async: "{{ 60 * 60 * 2 }}" #-- wait up to 2 hours - poll: 0 #-- fire and forget; ssm would not check anyways + async: "{{ 60 * 60 * 2 }}" #-- wait up to 2 hours ( 60s * 60m * 2h ) + poll: 0 #-- fire and forget; ssm would not check anyways register: dump changed_when: - dump.rc == 0 - dump.module_stderr == '' - "'started' | extract(dump.module_stdout | regex_search('{.*}') | from_json) == 1" - "'failed' | extract(dump.module_stdout | regex_search('{.*}') | from_json) == 0" - failed_when: dump.rc != 0 #-- specify the failure yourself + failed_when: dump.rc != 0 #-- specify the failure yourself - name: Check on the dump task vars: + max_wait: "{{ 60 * 60 * 2 }}" #-- wait for the async task to end + ansible_aws_ssm_timeout: "{{ max_wait }}" #-- keep active the ssm connection the whole time + ansible_remote_tmp: /tmp/.ansible-ssm-user/tmp #-- see previous gotchas + ansible_async_dir: /tmp/.ansible-ssm-user/async #-- see previous gotchas dump_stdout_as_obj: "{{ dump.module_stdout | regex_search('{.*}') | from_json }}" ansible_job_id: "{{ dump_stdout_as_obj.ansible_job_id }}" ansible.builtin.async_status: jid: "{{ ansible_job_id }}" register: dump_result until: dump_result.finished - retries: "{{ 60 * 2 }}" #-- wait up to 2 hours like the task before - delay: 60 #-/ + retries: "{{ (max_wait/60) | int }}" #-- ( ( ( max_wait/60s ) * 1/( delay/60s ) ) | int ) + delay: 60 #-- set high to avoid overloading the ssm agent with sessions ```
diff --git a/snippets/ansible/tasks.yml b/snippets/ansible/tasks.yml index 3b8844e..c1e54dd 100644 --- a/snippets/ansible/tasks.yml +++ b/snippets/ansible/tasks.yml @@ -815,13 +815,14 @@ image: id: "{{ source_ami.image_id }}" - name: Long-running tasks via SSM + vars: + ansible_connection: community.aws.aws_ssm + ansible_remote_tmp: /tmp/.ansible-ssm-user/tmp + ansible_async_dir: /tmp/.ansible-ssm-user/async block: - name: Dump a DB from an RDS instance to a temporary file when: rds_instance.endpoint is defined vars: - ansible_connection: community.aws.aws_ssm - ansible_remote_tmp: /tmp/.ansible-ssm-user/tmp - ansible_async_dir: /tmp/.ansible-ssm-user/async wanted_pattern_in_module_output: >- {{ '"failed": 0, "started": 1, "finished": 0' | regex_escape() }} community.postgresql.postgresql_db: @@ -831,14 +832,18 @@ login_password: "{{ db_password }}" name: sales state: dump - target: "{{ temp_file_for_dump.path }}" + target: >- + {{ [ + ansible_user_dir, + 'dump.' ~ db_instance_identifier ~ '.' ~ ansible_date_time.iso8601_basic_short ~ '.dir' + ] | path_join }} target_opts: >- --exclude-table … --exclude-schema archived --no-publications - --format c - async: "{{ 60 * 60 * 2 }}" # wait up to 2 hours - poll: 0 # fire and forget, since it would not check anyways + --format d --jobs $(nproc) + async: "{{ 60 * 60 * 12 }}" # wait up to 12 hours -- 60 secs * 60 mins * 12 hours + poll: 0 # fire and forget, since ssm would not allow self-checking anyways register: dump changed_when: - dump.rc == 0 @@ -848,14 +853,16 @@ failed_when: dump.rc != 0 - name: Check on the dump task vars: + max_wait: "{{ (60 / 5 * 12) | int }}" # retries needed for 12 hours -- 60 mins / (delay/60) * 12 hours + ansible_aws_ssm_timeout: "{{ max_wait | int * 300 }}" # seconds (retries * delay); ssm uses a single connection, keep active until the end dump_stdout_as_obj: "{{ dump.module_stdout | regex_search('{.*}') | from_json }}" ansible_job_id: "{{ dump_stdout_as_obj.ansible_job_id }}" ansible.builtin.async_status: jid: "{{ ansible_job_id }}" register: 
dump_result until: dump_result.finished - retries: "{{ 60 * 2 }}" - delay: 60 + retries: "{{ max_wait }}" + delay: 300 # check once every 5 minutes to avoid overloading the ssm agent - name: RDS-specific operations block: - name: Create an instance's snapshot diff --git a/snippets/postgres/commands.sh b/snippets/postgres/commands.sh index 7081794..1dd07c7 100644 --- a/snippets/postgres/commands.sh +++ b/snippets/postgres/commands.sh @@ -64,6 +64,7 @@ pg_dump -h 'host.fqnd' -p '5432' -U 'admin' -d 'postgres' -W pg_dump -U 'postgres' -d 'sales' -F 'custom' -f 'sales.bak' --schema-only pg_dump … -T 'customers,orders' -t 'salespeople,performances' pg_dump … -s --format 'custom' +pg_dump … -F'd' --jobs '3' # Dump DBs' schema only pg_dump --host 'host.fqnd' --port '5432' --username 'postgres' --dbname 'postgres' --password --schema-only @@ -75,7 +76,7 @@ pg_dumpall -h 'host.fqnd' -p '5432' -U 'postgres' -l 'postgres' -Wrf 'roles.sql' # Restore backups pg_restore -U 'postgres' -d 'sales' 'sales.dump' -pg_restore -h 'host.fqdn' -U 'master' -d 'sales' -Oxj 8 'sales.dump' +pg_restore -h 'host.fqdn' -U 'master' -d 'sales' -Oxj '8' 'sales.dump' # Initialize a test DB pgbench -i 'test-db'