sap-linuxlab · amemon-redhat · Jul 25, 2025 · Sep 16, 2025 · Sep 16, 2025 · Sep 16, 2025
diff --git a/ansible_collections/sap/cluster_qa/playbooks/test09.yml b/ansible_collections/sap/cluster_qa/playbooks/test09.yml
@@ -0,0 +1,11 @@
+---
+- name: Running TEST09 test role on the S4/HANA Cluster
+  hosts: all
+  gather_facts: false  # NOTE(review): role tasks read ansible_hostname; confirm facts are gathered elsewhere
+  become: true
+  become_user: root
+  any_errors_fatal: false  # a failure on one host does not abort the play for the others
+  tasks:
+    - name: Running TEST09 test role on the S4/HANA Cluster
+      ansible.builtin.include_role:
+        name: sap.cluster_qa.test09
diff --git a/ansible_collections/sap/cluster_qa/roles/test09/README.md b/ansible_collections/sap/cluster_qa/roles/test09/README.md
@@ -0,0 +1,100 @@
+test09
+=========
+
+This role tests the SAP Message Server automatic restart mechanism and its interaction with the HA solution. It verifies that recoverable Message Server outages are handled correctly by the SAP Start Service and that unrecoverable failures trigger appropriate HA responses.
+
+**Test Purpose:**
+- **Verify Restart_Program parameter configuration** for Message Server (auto-configure if missing)
+- Verify Message Server automatic restart functionality via SAP Start Service
+- Test interaction between SAP automatic restart and HA solution
+- Ensure HA solution responds appropriately when automatic restart fails
+- Validate that ASCS failover respects ERS location constraints
+
+**Test Procedure:**
+1. **Validate Restart_Program parameter** is configured in ASCS profile (auto-insert if missing)
+2. Kill Message Server process repeatedly (up to 6 times by default)
+3. Monitor SAP Start Service automatic restart behavior
+4. Verify HA solution response when automatic restart threshold is exceeded
+5. Ensure ASCS and ERS remain on different nodes throughout
+
+Requirements
+------------
+
+A Pacemaker cluster with three or more nodes managing S/4HANA ASCS and ERS instances using the `SAPInstance` resource agent with the SAP HA interface for SAP ABAP application server instances, as described in https://access.redhat.com/solutions/3606101.
+
+**Prerequisites:**
+- **SAP Profile Parameter "Restart_Program" must be configured for Message Server** (auto-configured by test if missing)
+- SAP system running in stable mode with HA solution activated
+- 3+ node cluster setup required
+
+**Reference:** [SAP Support Content: Message Server Restart](https://help.sap.com/docs/SUPPORT_CONTENT/si/3362959619.html?locale=en-US)
+
+**Restart_Program Configuration Example:**
+```
+Restart_Program_01 = local $(DIR_EXECUTABLE)/msg_server pf=$(DIR_PROFILE)/$(SAPSYSTEMNAME)_$(INSTANCE_NAME)_$(HOSTNAME)
+```
+
+Role Variables
+--------------
+
+This role uses the following variables. Except for `max_kill_attempts`, which is a setting of this role, they are provided by the `sap.cluster_qa.pcs_find_ascs` and `sap.cluster_qa.pcs_find_ers` roles:
+- `sap_ascs_node_name` - The node where ASCS is currently running
+- `sap_ers_node_name` - The node where ERS is currently running
+- `sap_ascs_resource_name` - The name of the ASCS resource in the cluster
+- `sap_ascs_instance_number` - The ASCS instance number
+- `max_kill_attempts` - Maximum Message Server kill attempts (default: 6)
+- `__pcs_find_ascs_sap_ascs_start_profile.stdout` - Path to ASCS profile file (used for Restart_Program validation)
+
+**Expected Outcomes:**
+- **Restart_Program parameter validation passes** (auto-configured if missing)
+- Message Server restarts automatically via SAP Start Service (recoverable errors)
+- Process ID changes with each restart
+- Restart events logged in sapstartsrv.log/sapstart.log
+- HA solution triggers ASCS restart/failover after restart threshold exceeded
+- ASCS never moves to ERS node
+
+**Auto-Configuration Feature:**
+If the `Restart_Program` parameter is not found, the test will automatically:
+- Check for existing `Start_Program` parameter for Message Server
+- **Replace `Start_Program` with `Restart_Program`** if found (to avoid conflicts)
+- Insert `_MS = ms.sap$(SAPSYSTEMNAME)_$(INSTANCE_NAME)` variable definition if needed
+- Add `Restart_Program_00 = local $(_MS) pf=$(_PF)` parameter if no existing Start_Program
+- Create backup of original profile before modification
+- **Restart sapstartsrv service** to apply the new configuration
+- **Wait for cluster to detect ASCS resource failures** after service restart
+- **Wait for ASCS resource to be fully started** by the cluster
+- **Re-discover ASCS location** after cluster recovery (may cause failover)
+- Verify successful configuration before proceeding
+
+**Important Note:** When the `Restart_Program` parameter is automatically configured, the sapstartsrv service is restarted. This causes the cluster to detect resource failures and may trigger an ASCS failover to another node. The test waits for the cluster to recover completely and re-discovers the ASCS location before proceeding.
+
+**Configuration Logic:**
+- If `Start_Program_XX = local $(_MS) pf=$(_PF)` exists → Replace with `Restart_Program_00 = local $(_MS) pf=$(_PF)`
+- If no Start_Program exists → Add both `_MS` variable and `Restart_Program_00` parameter
+
+Dependencies
+------------
+
+- `sap.cluster_qa.pcs_find_ascs` - Required to locate the ASCS node and resource information
+- `sap.cluster_qa.pcs_find_ers` - Required to locate the ERS node and resource information
+- `sap.sap_operations` - Required for host_info and pcs_status_info modules
+
+Example Playbook
+----------------
+
+The following example runs the role against the cluster nodes:
+
+    - hosts: servers
+      roles:
+         - sap.cluster_qa.test09
+
+License
+-------
+
+GPLv3
+
+Author Information
+------------------
+
+- Amir Memon (@amemon-redhat)
+- Kirill Satarin (@kksat)
diff --git a/ansible_collections/sap/cluster_qa/roles/test09/meta/main.yml b/ansible_collections/sap/cluster_qa/roles/test09/meta/main.yml
@@ -0,0 +1,14 @@
+---
+galaxy_info:
+  author: Amir Memon (@amemon-redhat)
+  description: Run test09 - Message Server automatic restart and HA interaction test
+  license: GPL-3.0-only  # SPDX license identifier
+  min_ansible_version: "2.15"  # quoted so it stays a string, not a float
+  platforms:
+    - name: EL
+      versions:
+        - "8"
+        - "9"
+        - "10"
+  galaxy_tags: []
+dependencies: []
diff --git a/ansible_collections/sap/cluster_qa/roles/test09/tasks/kill_message_server.yml b/ansible_collections/sap/cluster_qa/roles/test09/tasks/kill_message_server.yml
@@ -0,0 +1,139 @@
+---
+- name: Check if ASCS node is responsive before attempting kill
+  when: ansible_hostname == sap_ascs_node_name_initial  # runs only on the host named by sap_ascs_node_name_initial
+  ansible.builtin.ping:
+  failed_when: false  # NOTE(review): does not cover UNREACHABLE hosts; confirm that is acceptable
+  register: ascs_node_ping  # later tasks test ascs_node_ping.ping == 'pong'
+
+- name: Skip kill attempt if ASCS node is unresponsive
+  ansible.builtin.set_fact:
+    msg_server_restarted: false  # an unresponsive node cannot have restarted the Message Server
+  when:
+    - ansible_hostname == sap_ascs_node_name_initial
+    - ascs_node_ping is defined
+    - ascs_node_ping.ping | default('') != 'pong'  # .failed is always false here because the ping task sets failed_when: false
+
+- name: Display node responsiveness status
+  when: ansible_hostname == sap_ascs_node_name_initial  # reported only from the host named by sap_ascs_node_name_initial
+  ansible.builtin.debug:
+    msg: "Kill attempt {{ kill_attempt }}: ASCS node {{ sap_ascs_node_name_initial }} responsiveness = {{ 'RESPONSIVE' if (ascs_node_ping.ping | default('') == 'pong') else 'UNRESPONSIVE' }}"
+
+- name: Get current Message Server process info
+  when: >-
+    ansible_hostname == sap_ascs_node_name_initial
+    and ascs_node_ping.ping is defined
+    and ascs_node_ping.ping == 'pong'
+  sap.sap_operations.host_info:
+  register: current_ascs_host_info  # read afterwards through its .instances list
+
+- name: Store current Message Server PID
+  when:
+    - ansible_hostname == sap_ascs_node_name_initial
+    - current_ascs_host_info is defined
+    - current_ascs_host_info.instances is defined
+  vars:
+    ascs_instance_list: "{{ current_ascs_host_info.instances | selectattr('mSystemNumber', 'equalto', sap_ascs_instance_number) | list }}"  # instances whose mSystemNumber equals sap_ascs_instance_number
+    msg_server_process_list: "{{ (ascs_instance_list | first)['ProcessList'] | selectattr('name', 'equalto', 'msg_server') | list if ascs_instance_list | length > 0 else [] }}"  # msg_server entries of the first match, or []
+  ansible.builtin.set_fact:
+    ascs_instance_found: "{{ ascs_instance_list | length > 0 }}"
+    previous_msg_server_pid: "{{ previous_msg_server_pid | default('none') }}"  # keeps an already-set value, otherwise 'none'
+    current_msg_server_pid: "{{ (msg_server_process_list | first)['pid'] if msg_server_process_list | length > 0 else 'NO_INSTANCE' }}"
+
+- name: Handle case when ASCS instance not found
+  when:
+    - ansible_hostname == sap_ascs_node_name_initial
+    - current_ascs_host_info is defined
+    - current_ascs_host_info.instances | default([]) | selectattr('mSystemNumber', 'equalto', sap_ascs_instance_number) | list | length == 0
+  ansible.builtin.set_fact:
+    msg_server_restarted: false
+    ascs_instance_found: false
+    current_msg_server_pid: "NO_INSTANCE"  # sentinel value that the kill task excludes
+
+- name: Display Message Server PID info
+  ansible.builtin.debug:
+    msg: |
+      Kill attempt {{ kill_attempt }}:
+      - ASCS instance found: {{ ascs_instance_found | default(false) }}
+      - Current PID: {{ current_msg_server_pid | default('N/A') }}
+      - Previous PID: {{ previous_msg_server_pid | default('none') }}
+      {% if not (ascs_instance_found | default(false)) %}
+      - WARNING: ASCS instance {{ sap_ascs_instance_number }} not found in process list
+      {% endif %}
+  when: ansible_hostname == sap_ascs_node_name_initial  # reported only from the host named by sap_ascs_node_name_initial
+
+- name: Killing the Message Server process
+  when:
+    - ansible_hostname == sap_ascs_node_name_initial
+    - current_msg_server_pid is defined
+    - current_msg_server_pid != "NO_INSTANCE"
+    - ascs_instance_found | default(false)
+    - ascs_node_ping.ping is defined
+    - ascs_node_ping.ping == 'pong'
+  ansible.builtin.command: "kill -9 {{ current_msg_server_pid }}"  # SIGKILL: the process cannot shut down cleanly
+  changed_when: true  # always reported as changed whenever the task runs
+
+- name: Update kill counter
+  ansible.builtin.set_fact:
+    previous_msg_server_pid: "{{ current_msg_server_pid | default('unknown') }}"  # 'unknown' where no PID was recorded on this host
+    message_server_kill_count: "{{ kill_attempt }}"  # no when clause, so this is set on every host
+
+- name: Wait for SAP automatic restart or HA intervention
+  ansible.builtin.pause:
+    prompt: "Waiting for SAP automatic restart or HA intervention after kill attempt {{ kill_attempt }}"
+    seconds: 30  # fixed wait before the status checks that follow
+
+- name: Check if ASCS resource is still running on original node
+  run_once: true  # executed once; the registered result is applied to all hosts in the batch
+  sap.sap_operations.pcs_status_info:
+  register: ascs_status_check
+
+- name: Verify ASCS resource status
+  run_once: true
+  ansible.builtin.set_fact:
+    ascs_still_on_original_node: "{{ ascs_status_check | sap.sap_operations.pcs_resources_from_status(role='Started', id=sap_ascs_resource_name) | length > 0 }}"  # NOTE(review): filter arguments name only role and id, not a node; confirm this detects a move
+
+- name: Check if Message Server process is running again
+  when:
+    - ansible_hostname == sap_ascs_node_name_initial
+    - ascs_still_on_original_node | bool
+  sap.sap_operations.host_info:
+  failed_when: false  # a module failure is never reported as a task failure
+  register: restart_check_host_info
+
+- name: Determine if Message Server restarted automatically
+  ansible.builtin.set_fact:
+    msg_server_restarted: "{{ (restart_msg_server_list | length > 0) }}"  # NOTE(review): true whenever a msg_server entry is listed; the PID is not compared with the pre-kill PID
+    restart_ascs_instance_found: "{{ restart_ascs_instance_list | length > 0 }}"
+  vars:
+    restart_ascs_instance_list: "{{ restart_check_host_info.instances | default([]) | selectattr('mSystemNumber', 'equalto', sap_ascs_instance_number) | list }}"  # instances whose mSystemNumber equals sap_ascs_instance_number
+    restart_msg_server_list: "{{ (restart_ascs_instance_list | first)['ProcessList'] | selectattr('name', 'equalto', 'msg_server') | list if restart_ascs_instance_list | length > 0 else [] }}"  # msg_server entries of the first match, or [] when none matched
+  when:
+    - ansible_hostname == sap_ascs_node_name_initial
+    - ascs_still_on_original_node | bool
+    - restart_check_host_info is defined
+    - not (restart_check_host_info.failed | default(false))
+
+- name: Set restart status to false if ASCS moved or node unresponsive
+  ansible.builtin.set_fact:
+    msg_server_restarted: false  # ping result is tested via .ping; .failed is always false because the ping task sets failed_when: false
+  when:
+    - not (ascs_still_on_original_node | bool) or
+      (ansible_hostname == sap_ascs_node_name_initial and (ascs_node_ping.ping | default('') != 'pong'))
+
+- name: Display restart status
+  when: ansible_hostname == sap_ascs_node_name_initial  # reported only from the host named by sap_ascs_node_name_initial
+  ansible.builtin.debug:
+    msg: |
+      After kill {{ kill_attempt }}:
+      - Message Server restarted: {{ msg_server_restarted | default(false) }}
+      - ASCS on original node: {{ ascs_still_on_original_node }}
+      - ASCS instance found during kill: {{ ascs_instance_found | default(false) }}
+      - ASCS instance found during restart check: {{ restart_ascs_instance_found | default(false) }}
+
+- name: Set global fact to stop further iterations if Message Server stopped restarting
+  when:
+    - not ((msg_server_restarted | default(false) | bool) and
+      (ascs_instance_found | default(true) | bool))
+    - kill_attempt | int >= 2
+  ansible.builtin.set_fact:
+    msg_server_restarted: false  # from attempt 2 on: forced false when no restart was seen or the ASCS instance was not found