diff --git a/.config/cspell-words.txt b/.config/cspell-words.txt
index a9f1badd..57f34a78 100644
--- a/.config/cspell-words.txt
+++ b/.config/cspell-words.txt
@@ -27,10 +27,12 @@ Hana
 srmode
 opmode
 sapenv
+sapstart
 hdbnsutil
 SFAIL
 SAPSYSTEMNAME
 SAPSYSTEM
+INSTANCENAME
 appserver
 ABAP
 saphostcontrol
diff --git a/ansible_collections/sap/cluster_qa/playbooks/test09.yml b/ansible_collections/sap/cluster_qa/playbooks/test09.yml
new file mode 100644
index 00000000..7e88d6a4
--- /dev/null
+++ b/ansible_collections/sap/cluster_qa/playbooks/test09.yml
@@ -0,0 +1,11 @@
+---
+- name: Running TEST09 test role on the S4/HANA Cluster
+  hosts: all
+  gather_facts: false
+  become: true
+  become_user: root
+  any_errors_fatal: false
+  tasks:
+    - name: Running TEST09 test role on the S4/HANA Cluster
+      ansible.builtin.include_role:
+        name: sap.cluster_qa.test09
diff --git a/ansible_collections/sap/cluster_qa/roles/pcs_find_ascs/README.md b/ansible_collections/sap/cluster_qa/roles/pcs_find_ascs/README.md
index a2fc7dde..2f43e1fc 100644
--- a/ansible_collections/sap/cluster_qa/roles/pcs_find_ascs/README.md
+++ b/ansible_collections/sap/cluster_qa/roles/pcs_find_ascs/README.md
@@ -13,7 +13,17 @@ No other resource managed by ocf:heartbeat:SAPInstance should have the same keyw
 Role Variables
 --------------
 
-None
+This role sets the following facts:
+
+- `sap_ascs_resource_name` - The name of the ASCS resource in the cluster
+- `sap_ascs_node_name` - The node where ASCS is currently running
+- `sap_ascs_sid` - The SAP system ID (SID) of the ASCS instance
+- `sap_ascs_instance_name` - The instance name of the ASCS
+- `sap_ascs_instance_number` - The instance number of the ASCS
+- `sap_ascs_start_profile` - The path to the ASCS start profile file
+- `sap_sid` - General SAP system ID (same as sap_ascs_sid)
+- `sap_instance_name` - General instance name (same as sap_ascs_instance_name)
+- `sap_instance_number` - General instance number (same as sap_ascs_instance_number)
 
 Dependencies
 ------------
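The updated README above documents the facts exported by `pcs_find_ascs`. As a quick illustration only (the play below is not part of this change; its name and debug wording are made up here), a consumer of the role could look like this:

```
---
- name: Show where the ASCS instance runs (illustrative only)
  hosts: all
  gather_facts: false
  become: true
  tasks:
    - name: Discover the ASCS resource and export its facts
      ansible.builtin.include_role:
        name: sap.cluster_qa.pcs_find_ascs

    - name: Report the discovered facts
      ansible.builtin.debug:
        msg: >-
          {{ sap_ascs_resource_name }} ({{ sap_sid }}/{{ sap_ascs_instance_name }},
          instance number {{ sap_ascs_instance_number }}) is running on {{ sap_ascs_node_name }}
```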
diff --git a/ansible_collections/sap/cluster_qa/roles/pcs_find_ascs/tasks/main.yml b/ansible_collections/sap/cluster_qa/roles/pcs_find_ascs/tasks/main.yml
index f1c62834..591596e9 100644
--- a/ansible_collections/sap/cluster_qa/roles/pcs_find_ascs/tasks/main.yml
+++ b/ansible_collections/sap/cluster_qa/roles/pcs_find_ascs/tasks/main.yml
@@ -1,33 +1,38 @@
 ---
-- name: Acquiring ASCS resource name
-  ansible.builtin.shell: |-
-    set -o pipefail |
-    pcs resource config |
-    grep SAPInstance |
-    grep -i ascs |
-    awk '{print $2}' |
-    xargs
-  register: __pcs_find_ascs_sap_ascs_resource_name
-  changed_when: false
-  failed_when: __pcs_find_ascs_sap_ascs_resource_name.rc != 0
+- name: Acquiring ASCS resource name and setting as fact
+  block:
+    - name: Get ASCS resource name
+      ansible.builtin.shell: |-
+        set -o pipefail
+        pcs resource config |
+        grep SAPInstance |
+        grep -i ascs |
+        awk '{print $2}' |
+        xargs
+      register: __pcs_find_ascs_sap_ascs_resource_name
+      changed_when: false
+      failed_when: __pcs_find_ascs_sap_ascs_resource_name.rc != 0
 
-- name: Setting ASCS resource name as fact
-  ansible.builtin.set_fact:
-    sap_ascs_resource_name: "{{ __pcs_find_ascs_sap_ascs_resource_name.stdout }}"
+    - name: Set ASCS resource name as fact
+      ansible.builtin.set_fact:
+        sap_ascs_resource_name: "{{ __pcs_find_ascs_sap_ascs_resource_name.stdout }}"
 
-- name: Acquiring ASCS node name
-  ansible.builtin.shell: |-
-    set -o pipefail |
-    pcs resource status {{ sap_ascs_resource_name }} |
-    grep SAPInstance |
-    awk '{print $5}'
-  register: __pcs_find_ascs_sap_ascs_node_name
-  changed_when: false
-  failed_when: __pcs_find_ascs_sap_ascs_node_name.rc != 0
-- name: Setting ASCS Node name as fact
-  ansible.builtin.set_fact:
-    sap_ascs_node_name: "{{ __pcs_find_ascs_sap_ascs_node_name.stdout }}"
+- name: Acquiring ASCS node name and setting as fact
+  block:
+    - name: Get ASCS node name
+      ansible.builtin.shell: |-
+        set -o pipefail
+        pcs resource status {{ sap_ascs_resource_name }} |
+        grep SAPInstance |
+        awk '{print $5}'
+      register: __pcs_find_ascs_sap_ascs_node_name
+      changed_when: false
+      failed_when: __pcs_find_ascs_sap_ascs_node_name.rc != 0
+
+    - name: Set ASCS node name as fact
+      ansible.builtin.set_fact:
+        sap_ascs_node_name: "{{ __pcs_find_ascs_sap_ascs_node_name.stdout }}"
 
 - name: Acquiring ASCS start profile
   ansible.builtin.shell: |-
     set -o pipefail |
@@ -51,12 +56,9 @@
   changed_when: false
   failed_when: __pcs_find_ascs_sap_ascs_sid.rc != 0
 
-- name: Setting ASCS SID as a separate fact
+- name: Setting ASCS SID as facts
   ansible.builtin.set_fact:
     sap_ascs_sid: "{{ __pcs_find_ascs_sap_ascs_sid.stdout }}"
-
-- name: Setting ASCS SID as a general fact
-  ansible.builtin.set_fact:
     sap_sid: "{{ __pcs_find_ascs_sap_ascs_sid.stdout }}"
 
 - name: Acquiring ASCS Instance Name
@@ -70,12 +72,9 @@
   changed_when: false
   failed_when: __pcs_find_ascs_sap_instance_name.rc != 0
 
-- name: Setting ASCS Instance Name as a separate fact
+- name: Setting ASCS Instance Name as facts
   ansible.builtin.set_fact:
     sap_ascs_instance_name: "{{ __pcs_find_ascs_sap_instance_name.stdout }}"
-
-- name: Setting ASCS Instance Name as a general fact
-  ansible.builtin.set_fact:
     sap_instance_name: "{{ __pcs_find_ascs_sap_instance_name.stdout }}"
 
 - name: Acquiring ASCS Instance number
@@ -89,10 +88,11 @@
   changed_when: false
   failed_when: __pcs_find_ascs_sap_instance_number.rc != 0
 
-- name: Setting ASCS Instance number as fact
+- name: Setting ASCS Instance number as facts
   ansible.builtin.set_fact:
     sap_ascs_instance_number: "{{ __pcs_find_ascs_sap_instance_number.stdout }}"
+    sap_instance_number: "{{ __pcs_find_ascs_sap_instance_number.stdout }}"
 
-- name: Setting ASCS Instance number as fact
+- name: Setting ASCS start profile path as fact
   ansible.builtin.set_fact:
-    sap_instance_number: "{{ __pcs_find_ascs_sap_instance_number.stdout }}"
+    sap_ascs_start_profile: "{{ __pcs_find_ascs_sap_ascs_start_profile.stdout }}"
diff --git a/ansible_collections/sap/cluster_qa/roles/pcs_find_ers/README.md b/ansible_collections/sap/cluster_qa/roles/pcs_find_ers/README.md
index 25f7d0f1..18822fcf 100644
--- a/ansible_collections/sap/cluster_qa/roles/pcs_find_ers/README.md
+++ b/ansible_collections/sap/cluster_qa/roles/pcs_find_ers/README.md
@@ -13,7 +13,17 @@ No other resource managed by ocf:heartbeat:SAPInstance should have the same keyw
 Role Variables
 --------------
 
-None
+This role sets the following facts:
+
+- `sap_ers_resource_name` - The name of the ERS resource in the cluster
+- `sap_ers_node_name` - The node where ERS is currently running
+- `sap_ers_sid` - The SAP system ID (SID) of the ERS instance
+- `sap_ers_instance_name` - The instance name of the ERS
+- `sap_ers_instance_number` - The instance number of the ERS
+- `sap_ers_start_profile` - The path to the ERS start profile file
+- `sap_sid` - General SAP system ID (same as sap_ers_sid)
+- `sap_instance_name` - General instance name (same as sap_ers_instance_name)
+- `sap_instance_number` - General instance number (same as sap_ers_instance_number)
 
 Dependencies
 ------------
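One note on the discovery pipelines in both roles: `set -o pipefail` only takes effect when it is issued as its own statement before the pipeline; written as the first stage of the pipeline it runs in a subshell, and a failing `grep` is then masked by the exit status of the final `xargs`. A minimal sketch of the intended pattern, using the ERS filter from the tasks that follow:

```
#!/usr/bin/env bash
# Set pipefail first, on its own line, so a failing grep fails the whole task.
set -o pipefail

# Extract the ERS resource name from the cluster configuration.
pcs resource config \
  | grep SAPInstance \
  | grep -i ers \
  | awk '{print $2}' \
  | xargs
```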
diff --git a/ansible_collections/sap/cluster_qa/roles/pcs_find_ers/tasks/main.yml b/ansible_collections/sap/cluster_qa/roles/pcs_find_ers/tasks/main.yml
index 8155ef0a..893f7010 100644
--- a/ansible_collections/sap/cluster_qa/roles/pcs_find_ers/tasks/main.yml
+++ b/ansible_collections/sap/cluster_qa/roles/pcs_find_ers/tasks/main.yml
@@ -1,36 +1,40 @@
 ---
-- name: Acquiring ERS resource name
-  ansible.builtin.shell: |
-    set -o pipefail |
-    pcs resource config |
-    grep SAPInstance |
-    grep -i ers |
-    awk '{print $2}' |
-    xargs
-  register: __pcs_find_ers_sap_ers_resource_name
-  changed_when: false
-  failed_when: __pcs_find_ers_sap_ers_resource_name.rc != 0
+- name: Acquiring ERS resource name and setting as fact
+  block:
+    - name: Get ERS resource name
+      ansible.builtin.shell: |-
+        set -o pipefail
+        pcs resource config |
+        grep SAPInstance |
+        grep -i ers |
+        awk '{print $2}' |
+        xargs
+      register: __pcs_find_ers_sap_ers_resource_name
+      changed_when: false
+      failed_when: __pcs_find_ers_sap_ers_resource_name.rc != 0
 
-- name: Setting ERS resource name as fact
-  ansible.builtin.set_fact:
-    sap_ers_resource_name: "{{ __pcs_find_ers_sap_ers_resource_name.stdout }}"
+    - name: Set ERS resource name as fact
+      ansible.builtin.set_fact:
+        sap_ers_resource_name: "{{ __pcs_find_ers_sap_ers_resource_name.stdout }}"
 
-- name: Acquiring ERS node name
-  ansible.builtin.shell: |
-    set -o pipefail |
-    pcs resource status "{{ sap_ers_resource_name }}" |
-    grep SAPInstance |
-    awk '{print $5}'
-  register: __pcs_find_ers_sap_ers_node_name
-  changed_when: false
-  failed_when: __pcs_find_ers_sap_ers_node_name.rc != 0
+- name: Acquiring ERS node name and setting as fact
+  block:
+    - name: Get ERS node name
+      ansible.builtin.shell: |-
+        set -o pipefail
+        pcs resource status "{{ sap_ers_resource_name }}" |
+        grep SAPInstance |
+        awk '{print $5}'
+      register: __pcs_find_ers_sap_ers_node_name
+      changed_when: false
+      failed_when: __pcs_find_ers_sap_ers_node_name.rc != 0
 
-- name: Setting ERS Node name as fact
-  ansible.builtin.set_fact:
-    sap_ers_node_name: "{{ __pcs_find_ers_sap_ers_node_name.stdout }}"
+    - name: Set ERS node name as fact
+      ansible.builtin.set_fact:
+        sap_ers_node_name: "{{ __pcs_find_ers_sap_ers_node_name.stdout }}"
 
 - name: Acquiring ERS start profile
-  ansible.builtin.shell: |
+  ansible.builtin.shell: |-
     set -o pipefail |
     pcs resource config "{{ sap_ers_resource_name }}" |
     grep START_PROFILE |
@@ -41,7 +45,7 @@
   failed_when: __pcs_find_ers_sap_ers_start_profile.rc != 0
 
 - name: Acquiring ERS SID
-  ansible.builtin.shell: |
+  ansible.builtin.shell: |-
    set -o pipefail |
    cat "{{ __pcs_find_ers_sap_ers_start_profile.stdout }}" |
    grep -w 'SAPSYSTEMNAME ' |
@@ -51,16 +55,13 @@
   changed_when: false
   failed_when: __pcs_find_ers_sap_ers_sid.rc != 0
 
-- name: Setting ERS SID as a separate fact
-  ansible.builtin.set_fact:
-    sap_ers_sid: "{{ __pcs_find_ers_sap_ers_sid.stdout }}"
-
-- name: Setting ERS SID as a general fact
+- name: Setting ERS SID as facts
   ansible.builtin.set_fact:
-    sap_sid: "{{ __pcs_find_ers_sap_ers_sid.stdout }}"
+    sap_ers_sid: "{{ __pcs_find_ers_sap_ers_sid.stdout }}"
+    sap_sid: "{{ __pcs_find_ers_sap_ers_sid.stdout }}"
 
 - name: Acquiring ERS Instance Name
-  ansible.builtin.shell: |
+  ansible.builtin.shell: |-
    set -o pipefail |
    cat "{{ __pcs_find_ers_sap_ers_start_profile.stdout }}" |
    grep -w 'INSTANCE_NAME =' |
@@ -70,16 +71,13 @@
   changed_when: false
   failed_when: __pcs_find_ers_sap_instance_name.rc != 0
 
-- name: Setting ERS Instance Name as a separate fact
-  ansible.builtin.set_fact:
-    sap_ers_instance_name:
"{{ __pcs_find_ers_sap_instance_name.stdout }}" - -- name: Setting ERS Instance Name as a general fact +- name: Setting ERS Instance Name as facts ansible.builtin.set_fact: - sap_instance_name: "{{ __pcs_find_ers_sap_instance_name.stdout }}" + sap_ers_instance_name: "{{ __pcs_find_ers_sap_instance_name.stdout }}" + sap_instance_name: "{{ __pcs_find_ers_sap_instance_name.stdout }}" - name: Acquiring ERS Instance number - ansible.builtin.shell: | + ansible.builtin.shell: |- set -o pipefail | cat "{{ __pcs_find_ers_sap_ers_start_profile.stdout }}" | grep -w SAPSYSTEM | @@ -89,10 +87,11 @@ changed_when: false failed_when: __pcs_find_ers_sap_instance_number.rc != 0 -- name: Setting ERS Instance number as fact separately +- name: Setting ERS Instance number as facts ansible.builtin.set_fact: - sap_ers_instance_number: "{{ __pcs_find_ers_sap_instance_number.stdout }}" + sap_ers_instance_number: "{{ __pcs_find_ers_sap_instance_number.stdout }}" + sap_instance_number: "{{ __pcs_find_ers_sap_instance_number.stdout }}" -- name: Setting ERS Instance number as fact without appserver name +- name: Setting ERS start profile path as fact ansible.builtin.set_fact: - sap_instance_number: "{{ __pcs_find_ers_sap_instance_number.stdout }}" + sap_ers_start_profile: "{{ __pcs_find_ers_sap_ers_start_profile.stdout }}" diff --git a/ansible_collections/sap/cluster_qa/roles/test04/tasks/main.yml b/ansible_collections/sap/cluster_qa/roles/test04/tasks/main.yml index 02ea41a7..5d0bd014 100644 --- a/ansible_collections/sap/cluster_qa/roles/test04/tasks/main.yml +++ b/ansible_collections/sap/cluster_qa/roles/test04/tasks/main.yml @@ -10,7 +10,7 @@ - name: Acquiring latest enq_admin information on ASCS before move sap.sap_operations.enq_admin_info: - profile_filepath: "{{ __pcs_find_ascs_sap_ascs_start_profile.stdout }}" + profile_filepath: "{{ sap_ascs_start_profile }}" become: true become_user: "{{ sap_ascs_sid | lower }}adm" become_flags: -i @@ -19,7 +19,7 @@ - name: Removing 10 new locks via command line before move ansible.builtin.shell: | set -o pipefail | - enq_admin --release_locks=10:X:DIAG::TAB:%u pf={{ __pcs_find_ascs_sap_ascs_start_profile.stdout }} + enq_admin --release_locks=10:X:DIAG::TAB:%u pf={{ sap_ascs_start_profile }} become: true become_user: "{{ sap_ascs_sid | lower }}adm" become_flags: -i @@ -28,7 +28,7 @@ - name: Creating 10 new locks via command line before move ansible.builtin.shell: | set -o pipefail | - enq_admin --set_locks=10:X:DIAG::TAB:%u pf={{ __pcs_find_ascs_sap_ascs_start_profile.stdout }} + enq_admin --set_locks=10:X:DIAG::TAB:%u pf={{ sap_ascs_start_profile }} become: true become_user: "{{ sap_ascs_sid | lower }}adm" become_flags: -i @@ -36,7 +36,7 @@ - name: Setting 10 additional new locks using enq_admin_lock before move sap.sap_operations.enq_admin_lock: - profile_filepath: "{{ __pcs_find_ascs_sap_ascs_start_profile.stdout }}" + profile_filepath: "{{ sap_ascs_start_profile }}" lock_type: X state: absent owner1: DIAG @@ -50,7 +50,7 @@ - name: Acquiring latest enq_admin lock information on ASCS before move sap.sap_operations.enq_admin_locks_info: - profile_filepath: "{{ __pcs_find_ascs_sap_ascs_start_profile.stdout }}" + profile_filepath: "{{ sap_ascs_start_profile }}" become: true become_user: "{{ sap_ascs_sid | lower }}adm" become_flags: -i @@ -68,7 +68,7 @@ - name: Acquiring latest enq_admin lock information on ERS before move sap.sap_operations.enq_admin_locks_info: - profile_filepath: "{{ __pcs_find_ers_sap_ers_start_profile.stdout }}" + profile_filepath: "{{ 
sap_ers_start_profile }}"
   become: true
   become_user: "{{ sap_ers_sid | lower }}adm"
   become_flags: -i
@@ -104,7 +104,7 @@
 
 - name: Acquiring latest enq_admin lock information on the new ASCS node after move
   sap.sap_operations.enq_admin_locks_info:
-    profile_filepath: "{{ __pcs_find_ascs_sap_ascs_start_profile.stdout }}"
+    profile_filepath: "{{ sap_ascs_start_profile }}"
   become: true
   become_user: "{{ sap_ascs_sid | lower }}adm"
   become_flags: -i
@@ -128,7 +128,7 @@
 
 - name: Acquiring latest enq_admin lock information on new ERS node after move
   sap.sap_operations.enq_admin_locks_info:
-    profile_filepath: "{{ __pcs_find_ers_sap_ers_start_profile.stdout }}"
+    profile_filepath: "{{ sap_ers_start_profile }}"
   become: true
   become_user: "{{ sap_ers_sid | lower }}adm"
   become_flags: -i
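For context on the `profile_filepath` simplification above: test04 drives the standalone enqueue server with `enq_admin`, releasing and re-creating batches of locks against the ASCS start profile. A rough equivalent of those calls run by hand (the profile path below is purely illustrative; the role resolves it from `sap_ascs_start_profile` and runs as the `<sid>adm` user):

```
# Illustrative profile path; test04 resolves it from the cluster resource.
PF=/sapmnt/S4H/profile/S4H_ASCS20_ascsnode

# Release, then re-create, 10 exclusive DIAG/TAB locks, as test04 does.
enq_admin --release_locks=10:X:DIAG::TAB:%u pf="${PF}"
enq_admin --set_locks=10:X:DIAG::TAB:%u pf="${PF}"
```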
diff --git a/ansible_collections/sap/cluster_qa/roles/test09/README.md b/ansible_collections/sap/cluster_qa/roles/test09/README.md
new file mode 100644
index 00000000..deab89d5
--- /dev/null
+++ b/ansible_collections/sap/cluster_qa/roles/test09/README.md
@@ -0,0 +1,95 @@
+test09
+=========
+
+This role tests the SAP Message Server automatic restart mechanism and its interaction with the HA solution. It verifies that recoverable Message Server outages are handled correctly by the SAP Start Service and that unrecoverable failures trigger appropriate HA responses.
+
+**Test Purpose:**
+- **Verify Restart_Program parameter configuration** for Message Server (the test fails early if it is missing)
+- Verify Message Server automatic restart functionality via SAP Start Service
+- Test interaction between SAP automatic restart and HA solution
+- Ensure HA solution responds appropriately when automatic restart fails
+- Validate that ASCS failover respects ERS location constraints
+
+**Test Procedure:**
+1. **Validate Restart_Program parameter** is configured in ASCS profile (fail early if it is missing)
+2. Kill Message Server process repeatedly (up to 6 times by default)
+3. Monitor SAP Start Service automatic restart behavior
+4. Verify HA solution response when automatic restart threshold is exceeded
+5. Ensure ASCS and ERS remain on different nodes throughout
+
+Requirements
+------------
+
+A pacemaker cluster of 3 or more nodes managing S4/HANA ASCS and ERS instances with the `SAPInstance` resource agent and the SAP HA interface for SAP ABAP application server instances, as described in: https://access.redhat.com/solutions/3606101.
+
+**Prerequisites:**
+- **SAP Profile Parameter "Restart_Program" must be configured for Message Server** (the test fails early if it is not)
+- SAP system running in stable mode with HA solution activated
+- 3+ node cluster setup required
+
+**Reference:** [SAP Support Content: Message Server Restart](https://help.sap.com/docs/SUPPORT_CONTENT/si/3362959619.html?locale=en-US)
+
+**Restart_Program Configuration Example:**
+```
+Restart_Program_01 = local $(DIR_EXECUTABLE)/msg_server pf=$(DIR_PROFILE)/$(SAPSYSTEMNAME)_$(INSTANCE_NAME)_$(HOSTNAME)
+```
+
+Role Variables
+--------------
+
+This role relies on facts set by the `sap.cluster_qa.pcs_find_ascs` and `sap.cluster_qa.pcs_find_ers` roles, plus one control variable of its own:
+- `sap_ascs_node_name` - The node where ASCS is currently running
+- `sap_ers_node_name` - The node where ERS is currently running
+- `sap_ascs_resource_name` - The name of the ASCS resource in the cluster
+- `sap_ascs_instance_number` - The ASCS instance number
+- `max_kill_attempts` - Maximum Message Server kill attempts (default: 6)
+- `sap_ascs_start_profile` - Path to ASCS profile file (used for Restart_Program validation)
+
+**Expected Outcomes:**
+- **Restart_Program parameter validation passes** (the test stops before any kill attempt if it fails)
+- Message Server restarts automatically via SAP Start Service (recoverable errors)
+- Process ID changes with each restart
+- Restart events logged in sapstartsrv.log/sapstart.log
+- HA solution triggers ASCS restart/failover after restart threshold exceeded
+- ASCS never moves to ERS node
+
+**Missing Parameter Handling:**
+If the `Restart_Program` parameter is not found for the Message Server, the test:
+- Checks that the ASCS start profile referenced by the cluster resource exists
+- Searches the profile for a `Restart_Program` entry that references `_MS`
+- Fails the prerequisite assertion with configuration guidance and stops
+
+No profile is modified by this role; configure the parameter manually and re-run the test.
+
+**Important Note:** The prerequisite check runs before any Message Server process is killed. If the parameter is missing, the test stops without modifying the profile or disturbing the cluster.
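As an aside, the prerequisite described above can be checked by hand before running the test. A minimal sketch, assuming an illustrative profile path (test09 itself takes the path from `sap_ascs_start_profile`):

```
# Illustrative profile path; substitute the START_PROFILE of your ASCS resource.
PROFILE=/sapmnt/S4H/profile/S4H_ASCS20_ascsnode
# The prerequisite is met when a Restart_Program entry referencing the
# message server variable (_MS) is present in the profile.
grep 'Restart_Program' "${PROFILE}" | grep '_MS'
```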
+
+**Manual Configuration:**
+- If the profile already has `Start_Program_XX = local $(_MS) pf=$(_PF)`, change it to `Restart_Program_00 = local $(_MS) pf=$(_PF)`
+- If no Start_Program entry exists, add both the `_MS` variable and the `Restart_Program_00` parameter
+
+Dependencies
+------------
+
+- `sap.cluster_qa.pcs_find_ascs` - Required to locate the ASCS node and resource information
+- `sap.cluster_qa.pcs_find_ers` - Required to locate the ERS node and resource information
+- `sap.sap_operations` - Required for host_info and pcs_status_info modules
+
+Example Playbook
+----------------
+
+An example of how to include this role in a playbook:
+
+    - hosts: servers
+      roles:
+        - sap.cluster_qa.test09
+
+License
+-------
+
+GPLv3
+
+Author Information
+------------------
+
+Amir Memon (@amemon-redhat)
+Kirill Satarin (@kksat)
\ No newline at end of file
diff --git a/ansible_collections/sap/cluster_qa/roles/test09/meta/main.yml b/ansible_collections/sap/cluster_qa/roles/test09/meta/main.yml
new file mode 100644
index 00000000..02b62d8c
--- /dev/null
+++ b/ansible_collections/sap/cluster_qa/roles/test09/meta/main.yml
@@ -0,0 +1,14 @@
+---
+galaxy_info:
+  author: Amir Memon (@amemon-redhat)
+  description: Run test09 - Message Server automatic restart and HA interaction test
+  license: GPL-3.0-only
+  min_ansible_version: "2.15"
+  platforms:
+    - name: EL
+      versions:
+        - "8"
+        - "9"
+        - "10"
+  galaxy_tags: []
+dependencies: []
diff --git a/ansible_collections/sap/cluster_qa/roles/test09/tasks/kill_message_server.yml b/ansible_collections/sap/cluster_qa/roles/test09/tasks/kill_message_server.yml
new file mode 100644
index 00000000..e315e9ca
--- /dev/null
+++ b/ansible_collections/sap/cluster_qa/roles/test09/tasks/kill_message_server.yml
@@ -0,0 +1,126 @@
+---
+- name: Get current Message Server process info
+  sap.sap_operations.host_info:
+  register: current_ascs_host_info
+  when: ansible_hostname == sap_ascs_node_name_initial
+
+- name: Store current Message Server PID
+  ansible.builtin.set_fact:
+    current_msg_server_pid: >-
+      {{ (msg_server_process_list | selectattr('name', 'equalto', 'msg_server') | first)['pid']
+      if msg_server_process_list | length > 0 else 'NO_INSTANCE' }}
+    previous_msg_server_pid: "{{ previous_msg_server_pid | default('none') }}"
+    ascs_instance_found: "{{ ascs_instance_list | length > 0 }}"
+  vars:
+    ascs_instance_list: >-
+      {{ current_ascs_host_info.instances | selectattr('mSystemNumber', 'equalto', sap_ascs_instance_number) | list }}
+    msg_server_process_list: >-
+      {{ (ascs_instance_list | first)['ProcessList'] | selectattr('name', 'equalto', 'msg_server') | list
+      if ascs_instance_list | length > 0 else [] }}
+  when:
+    - ansible_hostname == sap_ascs_node_name_initial
+    - current_ascs_host_info is defined
+    - current_ascs_host_info.instances is defined
+
+- name: Handle case when ASCS instance not found
+  ansible.builtin.set_fact:
+    current_msg_server_pid: "NO_INSTANCE"
+    ascs_instance_found: false
+    msg_server_restarted: false
+  when:
+    - ansible_hostname == sap_ascs_node_name_initial
+    - current_ascs_host_info is defined
+    - (current_ascs_host_info.instances | default([]) |
+       selectattr('mSystemNumber', 'equalto', sap_ascs_instance_number) | list | length == 0)
+
+- name: Display Message Server PID info
+  ansible.builtin.debug:
+    msg: |
+      Kill attempt {{ kill_attempt }}:
+      - ASCS instance found: {{ ascs_instance_found | default(false) }}
+      - Current PID: {{ current_msg_server_pid | default('N/A') }}
+      - Previous PID: {{
previous_msg_server_pid | default('none') }} + {% if not (ascs_instance_found | default(false)) %} + - WARNING: ASCS instance {{ sap_ascs_instance_number }} not found in process list + {% endif %} + when: ansible_hostname == sap_ascs_node_name_initial + +- name: Killing the Message Server process + ansible.builtin.command: "kill -9 {{ current_msg_server_pid }}" + changed_when: true + when: + - ansible_hostname == sap_ascs_node_name_initial + - current_msg_server_pid is defined + - current_msg_server_pid != "NO_INSTANCE" + - ascs_instance_found | default(false) + +- name: Update kill counter + ansible.builtin.set_fact: + message_server_kill_count: "{{ kill_attempt }}" + previous_msg_server_pid: "{{ current_msg_server_pid | default('unknown') }}" + +- name: Wait for SAP automatic restart or HA intervention + ansible.builtin.pause: + seconds: 30 + prompt: >- + Waiting for SAP automatic restart or HA intervention after kill attempt {{ kill_attempt }} + +- name: Check if ASCS resource is still running on original node + sap.sap_operations.pcs_status_info: + register: ascs_status_check + run_once: true + +- name: Verify ASCS resource status + ansible.builtin.set_fact: + ascs_still_on_original_node: >- + {{ ascs_status_check | sap.sap_operations.pcs_resources_from_status(role='Started', id=sap_ascs_resource_name) | length > 0 }} + run_once: true + +- name: Check if Message Server process is running again + sap.sap_operations.host_info: + register: restart_check_host_info + failed_when: false + when: + - ansible_hostname == sap_ascs_node_name_initial + - ascs_still_on_original_node | bool + +- name: Determine if Message Server restarted automatically + ansible.builtin.set_fact: + msg_server_restarted: "{{ (restart_msg_server_list | length > 0) }}" + restart_ascs_instance_found: "{{ restart_ascs_instance_list | length > 0 }}" + vars: + restart_ascs_instance_list: >- + {{ restart_check_host_info.instances | default([]) | + selectattr('mSystemNumber', 'equalto', sap_ascs_instance_number) | list }} + restart_msg_server_list: >- + {{ (restart_ascs_instance_list | first)['ProcessList'] | + selectattr('name', 'equalto', 'msg_server') | list if restart_ascs_instance_list | length > 0 else [] }} + when: + - ansible_hostname == sap_ascs_node_name_initial + - ascs_still_on_original_node | bool + - restart_check_host_info is defined + - not (restart_check_host_info.failed | default(false)) + +- name: Set restart status to false if ASCS moved + ansible.builtin.set_fact: + msg_server_restarted: false + when: + - not (ascs_still_on_original_node | bool) + +- name: Display restart status + ansible.builtin.debug: + msg: | + After kill {{ kill_attempt }}: + - Message Server restarted: {{ msg_server_restarted | default(false) }} + - ASCS on original node: {{ ascs_still_on_original_node }} + - ASCS instance found during kill: {{ ascs_instance_found | default(false) }} + - ASCS instance found during restart check: {{ restart_ascs_instance_found | default(false) }} + when: ansible_hostname == sap_ascs_node_name_initial + +- name: Set global fact to stop further iterations if Message Server stopped restarting + ansible.builtin.set_fact: + msg_server_restarted: false + when: + - (not (msg_server_restarted | default(false) | bool)) or + (not (ascs_instance_found | default(true) | bool)) + - kill_attempt | int >= 2 diff --git a/ansible_collections/sap/cluster_qa/roles/test09/tasks/main.yml b/ansible_collections/sap/cluster_qa/roles/test09/tasks/main.yml new file mode 100644 index 00000000..1857317f --- /dev/null +++ 
b/ansible_collections/sap/cluster_qa/roles/test09/tasks/main.yml
@@ -0,0 +1,136 @@
+---
+- name: Clean-up of cluster
+  ansible.builtin.shell: |-
+    set -o pipefail
+    pcs resource cleanup
+  run_once: true
+
+- name: Collect minimal facts
+  ansible.builtin.setup:
+    gather_subset:
+      - min
+
+- name: Finding ASCS node name
+  ansible.builtin.include_role:
+    name: sap.cluster_qa.pcs_find_ascs
+
+- name: Store original ASCS node name before any modifications
+  ansible.builtin.set_fact:
+    sap_ascs_node_name_original: "{{ sap_ascs_node_name }}"
+
+# ============================================================================
+# PREREQUISITE VERIFICATION - FAIL EARLY IF NOT CONFIGURED
+# ============================================================================
+
+- name: Check if ASCS profile file exists
+  ansible.builtin.stat:
+    path: "{{ sap_ascs_start_profile }}"
+  register: profile_file_stat
+  run_once: true
+  delegate_to: "{{ sap_ascs_node_name }}"
+
+- name: Check for Restart_Program parameter in ASCS profile
+  ansible.builtin.shell: |
+    grep "Restart_Program" "{{ sap_ascs_start_profile }}" | grep "_MS" || echo "NOT_FOUND"
+  register: restart_program_check
+  changed_when: false
+  run_once: true
+  delegate_to: "{{ sap_ascs_node_name }}"
+  when: profile_file_stat.stat.exists
+
+- name: Verify Restart_Program parameter is configured for Message Server
+  ansible.builtin.assert:
+    that:
+      - profile_file_stat.stat.exists
+      - restart_program_check.stdout != "NOT_FOUND"
+      - restart_program_check.stdout | length > 0
+    fail_msg: |
+      PREREQUISITE FAILED: Restart_Program parameter for Message Server not found in ASCS profile.
+      Profile checked: {{ sap_ascs_start_profile }}
+      Profile exists: {{ profile_file_stat.stat.exists | default(false) }}
+
+      Please manually configure it according to: https://help.sap.com/docs/SUPPORT_CONTENT/si/3362959619.html
+
+      Expected configuration format:
+      _MS = ms.sap$(SAPSYSTEMNAME)_$(INSTANCE_NAME)
+      Restart_Program_00 = local $(_MS) pf=$(_PF)
+
+      Or the expanded form:
+      Restart_Program_00 = local ms.sapSYSTEMNAME_INSTANCENAME pf=/path/to/profile
+
+      TEST WILL STOP HERE - Fix the configuration and re-run the test.
+    success_msg: |
+      ✓ PREREQUISITE VERIFIED: Restart_Program parameter found for Message Server.
+      Configuration: {{ restart_program_check.stdout }}
+      ✓ Proceeding with Message Server kill test...
+ run_once: true + +# ============================================================================ +# MAIN TEST LOGIC - ALL TASKS BELOW ASSUME RESTART_PROGRAM IS CONFIGURED +# ============================================================================ + +- name: Finding ERS node name + ansible.builtin.include_role: + name: sap.cluster_qa.pcs_find_ers + +- name: Setting initial facts + ansible.builtin.set_fact: + sap_ascs_node_name_initial: "{{ sap_ascs_node_name }}" + sap_ers_node_name_initial: "{{ sap_ers_node_name }}" + message_server_kill_count: 0 + max_kill_attempts: 6 + msg_server_restarted: true + +- name: Verify ASCS and ERS are on different nodes initially + ansible.builtin.assert: + that: sap_ascs_node_name_initial != sap_ers_node_name_initial + fail_msg: "ASCS and ERS are on the same node initially, which violates HA setup requirements" + success_msg: "ASCS on {{ sap_ascs_node_name_initial }}, ERS on {{ sap_ers_node_name_initial }} - proper HA setup confirmed" + +- name: Execute Message Server kill attempts + ansible.builtin.include_tasks: kill_message_server.yml + loop: "{{ range(1, max_kill_attempts | int + 1) | list }}" + loop_control: + loop_var: kill_attempt + when: msg_server_restarted | bool + +- name: Wait for HA solution to respond to unrecoverable Message Server failure + ansible.builtin.pause: + seconds: 60 + prompt: "Waiting for HA solution to respond to repeated Message Server failures" + +- name: Check final ASCS location after HA intervention + ansible.builtin.include_role: + name: sap.cluster_qa.pcs_find_ascs + +- name: Check final ERS location + ansible.builtin.include_role: + name: sap.cluster_qa.pcs_find_ers + +- name: Set final location facts + ansible.builtin.set_fact: + sap_ascs_node_name_final: "{{ sap_ascs_node_name }}" + sap_ers_node_name_final: "{{ sap_ers_node_name }}" + +- name: Verify HA solution responded appropriately + ansible.builtin.assert: + that: + - sap_ascs_node_name_final != sap_ers_node_name_final + fail_msg: "HA solution failed: ASCS and ERS ended up on the same node ({{ sap_ascs_node_name_final }})" + success_msg: "HA solution succeeded: ASCS on {{ sap_ascs_node_name_final }}, ERS on {{ sap_ers_node_name_final }}" + +- name: Display test summary + ansible.builtin.debug: + msg: + - "===============================================" + - " TEST09 SUMMARY" + - "===============================================" + - "Message Server killed: {{ message_server_kill_count }} times" + - "Initial ASCS location: {{ sap_ascs_node_name_initial }}" + - "Final ASCS location: {{ sap_ascs_node_name_final }}" + - "Initial ERS location: {{ sap_ers_node_name_initial }}" + - "Final ERS location: {{ sap_ers_node_name_final }}" + - "HA Action taken: {{ 'ASCS Failover' if sap_ascs_node_name_initial != sap_ascs_node_name_final else 'ASCS Restart on same node' }}" + - "ASCS/ERS separation maintained: {{ 'YES' if sap_ascs_node_name_final != sap_ers_node_name_final else 'NO' }}" + - "===============================================" + run_once: true
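Finally, a hedged example of invoking the new test09 playbook added at the top of this change. The inventory path is an assumption; since the play targets `hosts: all`, the inventory should contain only the cluster nodes under test:

```
# Inventory file name is illustrative.
ansible-playbook -i inventory/s4hana_cluster.ini \
  ansible_collections/sap/cluster_qa/playbooks/test09.yml
```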