Popular Posts

Saturday, March 28, 2026

DGMGRL monitoring script for Oracle 19c Data Guard

#!/bin/bash
# ------------------------------------------------------------------
# Data Guard Status Report Script (Oracle 19c)
# Production-ready with centralized control panel, monitoring, alerts,
# visual dashboards, and controlled auto-healing capabilities
# ------------------------------------------------------------------

# =========================================================================
# CENTRALIZED CONTROL PANEL - TOGGLE FEATURES HERE
# =========================================================================

# 1. Global Monitoring Switches
ENABLE_MONITORING=true            # Master switch for the entire script
ENABLE_AUTO_HEAL=false            # Toggle Auto-Healing capabilities
ENABLE_EMAIL_ALERTS=true          # Toggle Email notifications
ENABLE_PROM_METRICS=true          # Toggle Prometheus .prom file generation
ENABLE_HTML_REPORT=true           # Toggle HTML dashboard generation
ENABLE_HISTORICAL_DATA=true       # Toggle historical data tracking
ENABLE_CSV_EXPORT=true            # Toggle CSV export for Excel
GLOBAL_DRY_RUN=false              # Global dry-run mode (overrides all actions)

# 2. Connection & Security Settings
# Options: "WALLET" (uses /@alias) or "USER" (requires credentials below)
CONNECTION_METHOD="WALLET"
DGMGRL_OPTIONS="-silent"          # Add -xml if you prefer parsing XML output

# For USER method - set credentials (use secure method like env vars in production)
# NOTE(review): in USER mode the password is interpolated into the dgmgrl
# connect string and is therefore visible in `ps` output — prefer WALLET.
DG_MONITOR_USER="dg_monitor"
DG_MONITOR_PASSWORD=""            # Set via environment variable or secure file
DG_MONITOR_PASSWORD_FILE="/etc/oracle/dg_monitor.pwd"  # Alternative secure file

# 3. Environment Context
# Location of the centralized TNS_ADMIN if not in default OH/network/admin
export TNS_ADMIN="/u01/app/oracle/network/admin"
export ORACLE_HOME="/u01/app/oracle/product/19.0.0/dbhome_1"
export PATH=$ORACLE_HOME/bin:$PATH
export LD_LIBRARY_PATH=$ORACLE_HOME/lib:$LD_LIBRARY_PATH

# 4. Target Databases (Central Inventory)
# Add all Primary TNS Aliases here. The script will discover Standbys automatically.
DB_LIST=("PRIM_PROD_KWT" "PRIM_DR_KWT" "PRIM_DEV_CORE")

# 5. Performance & Concurrency
MAX_PARALLEL_JOBS=5               # Increase for large fleets
PARALLEL_SLEEP_INTERVAL=0.5       # Seconds between parallel job checks
SCRIPT_TIMEOUT=600                # Maximum execution time in seconds (10 min)
DGMGRL_TIMEOUT=30                 # Seconds before killing a hung connection
DGMGRL_KILL_TIMEOUT=5             # Seconds after timeout before force kill (timeout -k)
DGMGRL_RETRIES=2                  # Number of retries for transient errors
MAX_HISTORY_LINES=50000           # Rotate history file after this many lines

# 6. Alert Thresholds (in seconds)
WARNING_LAG_THRESHOLD=300         # 5 minutes
CRITICAL_LAG_THRESHOLD=900        # 15 minutes
ALERT_COOLDOWN=900                # 15 minutes between alerts per database

# 7. Auto-Heal Configuration
AUTO_HEAL_DRY_RUN=true            # true = log only, no execution
AUTO_HEAL_COOLDOWN=600            # seconds (10 min) between auto-heal attempts
MAX_AUTO_HEAL_ATTEMPTS=3          # Maximum attempts per standby before giving up
AUTO_HEAL_EXCLUDE=()              # List of standbys to exclude from auto-heal (e.g., ("DR_TEST" "STBY_OLD"))

# 8. Directories
# NOTE(review): /var/log/dataguard usually needs pre-created ownership for
# the monitoring account — confirm mkdir below can succeed unprivileged.
LOG_DIR="/var/log/dataguard"
REPORT_DIR="/tmp/dataguard_reports"
TEMP_DIR="/tmp/dataguard_temp"
CSV_DIR="/tmp/dataguard_csv"

# 9. Email Configuration
MAIL_TO="dba_team@yourcompany.com"
MAIL_FROM="dataguard@$(hostname)"

# 10. Logging Format (json or plain)
LOG_FORMAT="plain"                # Options: "plain" or "json"

# 11. Console Output Colors
ENABLE_COLORS=true                # Toggle colored console output
# =========================================================================
# INTERNAL VARIABLES - DO NOT MODIFY BELOW THIS LINE
# =========================================================================

# Color definitions
# Color definitions.
# Use ANSI-C quoting ($'...') so each variable holds the real ESC byte.
# The previous form ("\e[31m") stored a literal backslash-e, which plain
# `echo "$RED"` (used by get_colored_status etc.) prints verbatim instead
# of rendering the color. Colors are only enabled when requested AND
# stdout is a terminal.
if [ "$ENABLE_COLORS" = true ] && [ -t 1 ]; then
    RED=$'\e[31m'
    GREEN=$'\e[32m'
    YELLOW=$'\e[33m'
    BLUE=$'\e[34m'
    MAGENTA=$'\e[35m'
    CYAN=$'\e[36m'
    BOLD=$'\e[1m'
    NC=$'\e[0m'
else
    RED=""; GREEN=""; YELLOW=""; BLUE=""; MAGENTA=""; CYAN=""; BOLD=""; NC=""
fi

# Derived file paths (timestamped per run so concurrent artifacts never collide)
LOG_FILE="$LOG_DIR/dataguard_report_$(date +%Y%m%d).log"
REPORT_FILE="$REPORT_DIR/dataguard_report_$(date +%Y%m%d_%H%M%S).txt"
RAW_DATA_FILE="$REPORT_DIR/dataguard_raw_$(date +%Y%m%d_%H%M%S).dat"
HISTORICAL_DATA_FILE="$REPORT_DIR/dataguard_history.dat"  # shared across runs
HTML_REPORT="$REPORT_DIR/dataguard_report_$(date +%Y%m%d_%H%M%S).html"
CSV_FILE="$CSV_DIR/dataguard_report_$(date +%Y%m%d_%H%M%S).csv"
PROM_FILE="/tmp/dataguard_metrics.prom"  # presumably a textfile-collector target — confirm scrape config

# Auto-heal state files (one counter file per primary/standby pair)
AUTO_HEAL_COUNT_DIR="/var/tmp/dataguard_heal_counts"
AUTO_HEAL_LOG="$LOG_DIR/dataguard_autoheal.log"

# Alert cooldown files (per database)
ALERT_COOLDOWN_BASE="/var/tmp/dataguard_last_alert"
ALERT_LOCK_BASE="/var/tmp/dataguard_alert_lock"

# Lock files for thread-safe operations. $$ scopes them to this run's PID:
# parallel workers of the same run share them, separate runs do not.
LOG_LOCKFILE="/tmp/dataguard_log_$$.lock"
AUTOHEAL_LOG_LOCKFILE="/tmp/dataguard_autoheallog_$$.lock"
OUTPUT_LOCKFILE="/tmp/dataguard_output_$$.lock"
HTML_LOCKFILE="/tmp/dataguard_html_$$.lock"
PROM_LOCKFILE="/tmp/dataguard_prom_$$.lock"
HISTORY_LOCKFILE="/tmp/dataguard_history_$$.lock"
SCRIPT_LOCK="/tmp/dataguard_main.lock"

# Watchdog PID (populated later; empty until a watchdog is spawned)
WATCHDOG_PID=""
MAIN_PID=$$

# Start timestamp (epoch seconds)
START_TIME=$(date +%s)

# =========================================================================
# INITIALIZATION
# =========================================================================

# Create working directories and shared lock files up front so later
# flock redirections never fail on a missing path.
mkdir -p "$LOG_DIR" "$REPORT_DIR" "$TEMP_DIR" "$AUTO_HEAL_COUNT_DIR" "$CSV_DIR"
touch "$LOG_LOCKFILE" "$AUTOHEAL_LOG_LOCKFILE" "$OUTPUT_LOCKFILE" "$HTML_LOCKFILE" "$PROM_LOCKFILE" "$HISTORY_LOCKFILE"

# Script lock to prevent duplicate runs. FD 200 stays open for the whole
# script lifetime, so the lock is held until exit.
# NOTE(review): helper functions temporarily redirect FD 200 for their own
# locks; bash restores the descriptor after each block, but a distinct FD
# here would be less fragile — confirm before changing.
exec 200>"$SCRIPT_LOCK"
flock -n 200 || {
    echo "ERROR: Another instance of this script is already running. Exiting."
    exit 1
}

# Initialize historical file with its pipe-delimited header on first use
if [ "$ENABLE_HISTORICAL_DATA" = true ] && [ ! -f "$HISTORICAL_DATA_FILE" ]; then
    echo "PrimaryDB|StandbyDB|TransportLagSec|ApplyLagSec|Error|SwitchoverReady|Enabled|ConfigStatus|OverallStatus|Trend|Timestamp" > "$HISTORICAL_DATA_FILE"
fi

# Initialize CSV file (fresh file each run, header row first)
if [ "$ENABLE_CSV_EXPORT" = true ]; then
    echo "\"PrimaryDB\",\"StandbyDB\",\"TransportLag\",\"ApplyLag\",\"Error\",\"SwitchoverReady\",\"Enabled\",\"ConfigStatus\",\"OverallStatus\",\"Trend\",\"Timestamp\"" > "$CSV_FILE"
fi

# Initialize Prometheus metrics file: truncate, then write HELP/TYPE headers
if [ "$ENABLE_PROM_METRICS" = true ]; then
    : > "$PROM_FILE"
    {
        echo "# HELP dataguard_transport_lag_seconds Data Guard transport lag in seconds"
        echo "# TYPE dataguard_transport_lag_seconds gauge"
        echo "# HELP dataguard_apply_lag_seconds Data Guard apply lag in seconds"
        echo "# TYPE dataguard_apply_lag_seconds gauge"
        echo "# HELP dataguard_status Data Guard status (0=error, 1=warning, 2=ok)"
        echo "# TYPE dataguard_status gauge"
        echo "# HELP dataguard_configuration_status Configuration status (0=error, 1=warning, 2=ok)"
        echo "# TYPE dataguard_configuration_status gauge"
    } >> "$PROM_FILE"
fi

# =========================================================================
# UTILITY FUNCTIONS
# =========================================================================

# Escape a value for CSV embedding by doubling embedded double-quotes
# (RFC 4180). printf is used instead of echo so values such as "-n" or
# strings containing backslashes are emitted verbatim.
escape_csv() {
    printf '%s\n' "$1" | sed 's/"/""/g'
}

# Structured logger: writes to the console and to $LOG_FILE under an
# exclusive flock so lines from parallel workers never interleave.
# Args: $1 = message, $2 = level (default INFO; only used in json format).
# No-op when ENABLE_MONITORING is not true.
log_message() {
    [ "$ENABLE_MONITORING" != true ] && return
    local msg="$1"
    local level="${2:-INFO}"
    local timestamp
    timestamp=$(date -Iseconds)

    if [ "$LOG_FORMAT" = "json" ]; then
        # Escape backslashes BEFORE quotes — otherwise the backslash added
        # by quote-escaping would itself get doubled, corrupting the JSON.
        local esc="${msg//\\/\\\\}"
        esc="${esc//\"/\\\"}"
        local json_msg
        json_msg=$(printf '{"timestamp":"%s","level":"%s","message":"%s","host":"%s"}\n' \
            "$timestamp" "$level" "$esc" "$(hostname)")
        {
            flock -x 200
            echo "$json_msg" >> "$LOG_FILE"
        } 200>"$LOG_LOCKFILE"
        # Also print the raw message to the console for readability
        echo "$msg"
    else
        {
            flock -x 200
            echo "[$(date '+%Y-%m-%d %H:%M:%S')] $msg" | tee -a "$LOG_FILE"
        } 200>"$LOG_LOCKFILE"
    fi
}

# Thread-safe auto-heal logger: appends to $AUTO_HEAL_LOG under an
# exclusive flock. No console echo (file-only).
# Args: $1 = message, $2 = level (default INFO; only used in json format).
# No-op when ENABLE_AUTO_HEAL is not true.
log_autoheal() {
    [ "$ENABLE_AUTO_HEAL" != true ] && return
    local msg="$1"
    local level="${2:-INFO}"
    local timestamp
    timestamp=$(date -Iseconds)

    if [ "$LOG_FORMAT" = "json" ]; then
        # Escape backslashes BEFORE quotes so the JSON stays well-formed
        # (quote-escaping introduces backslashes of its own).
        local esc="${msg//\\/\\\\}"
        esc="${esc//\"/\\\"}"
        local json_msg
        json_msg=$(printf '{"timestamp":"%s","level":"%s","message":"%s","component":"autoheal"}\n' \
            "$timestamp" "$level" "$esc")
        {
            flock -x 200
            echo "$json_msg" >> "$AUTO_HEAL_LOG"
        } 200>"$AUTOHEAL_LOG_LOCKFILE"
    else
        {
            flock -x 200
            echo "[$(date '+%Y-%m-%d %H:%M:%S')] $msg" >> "$AUTO_HEAL_LOG"
        } 200>"$AUTOHEAL_LOG_LOCKFILE"
    fi
}

# Thread-safe writer for one result row. Sends the row to the console and
# the text report (and, when enabled, the raw-data and CSV files) under a
# single exclusive lock so rows from parallel workers never interleave.
# Args: $1 = console line, $2 = report line,
#       $3 = raw pipe-delimited line (may be empty to skip),
#       $4 = CSV line (may be empty to skip).
write_output() {
    local console_line="$1"
    local report_line="$2"
    local raw_data_line="$3"
    local csv_line="$4"

    # Write to console and main report files
    {
        flock -x 200
        echo "$console_line"
        echo "$report_line" >> "$REPORT_FILE"
        if [ -n "$raw_data_line" ] && [ "$ENABLE_HISTORICAL_DATA" = true ]; then
            echo "$raw_data_line" >> "$RAW_DATA_FILE"
        fi
        if [ -n "$csv_line" ] && [ "$ENABLE_CSV_EXPORT" = true ]; then
            echo "$csv_line" >> "$CSV_FILE"
        fi
    } 200>"$OUTPUT_LOCKFILE"

    # Write to historical data file with separate lock — readers in
    # detect_trend take this same lock file in shared mode.
    if [ -n "$raw_data_line" ] && [ "$ENABLE_HISTORICAL_DATA" = true ]; then
        {
            flock -x 200
            echo "$raw_data_line" >> "$HISTORICAL_DATA_FILE"
        } 200>"$HISTORY_LOCKFILE"
    fi
}

# Append one (or several newline-joined) pre-formatted metric lines to the
# Prometheus textfile, serialized with an exclusive lock so parallel
# workers never interleave partial writes. No-op when metrics are disabled.
write_prometheus() {
    if [ "$ENABLE_PROM_METRICS" != true ]; then
        return
    fi
    local metric_block="$1"
    {
        flock -x 200
        echo "$metric_block" >> "$PROM_FILE"
    } 200>"$PROM_LOCKFILE"
}

# Append one pre-rendered HTML table row to the dashboard file under an
# exclusive lock (parallel-safe). No-op when the HTML report is disabled.
write_html_row() {
    if [ "$ENABLE_HTML_REPORT" != true ]; then
        return
    fi
    local row_markup="$1"
    {
        flock -x 200
        echo "$row_markup" >> "$HTML_REPORT"
    } 200>"$HTML_LOCKFILE"
}

# =========================================================================
# REMOTE DGMGRL EXECUTION FUNCTIONS
# =========================================================================

# Compose the DGMGRL connect string for a TNS alias.
# WALLET mode emits "/@alias" (external password store). Any other mode is
# treated as USER: the password comes from $DG_MONITOR_PASSWORD or, failing
# that, from $DG_MONITOR_PASSWORD_FILE (newlines stripped). Returns 1 with
# a message on stderr when no password can be found.
# NOTE(review): in USER mode the password ends up on the dgmgrl command
# line and is visible in `ps` — wallet mode is the safer choice.
get_connection_string() {
    local alias="$1"

    case "$CONNECTION_METHOD" in
        WALLET)
            echo "/@$alias"
            ;;
        *)
            local pwd=""
            if [ -n "$DG_MONITOR_PASSWORD" ]; then
                pwd="$DG_MONITOR_PASSWORD"
            elif [ -f "$DG_MONITOR_PASSWORD_FILE" ]; then
                # Strip newlines so a trailing \n in the file cannot break the string
                pwd=$(tr -d '\n' < "$DG_MONITOR_PASSWORD_FILE" 2>/dev/null)
            fi

            if [ -z "$pwd" ]; then
                echo "ERROR: No password available for USER connection method" >&2
                return 1
            fi
            echo "${DG_MONITOR_USER}/$pwd@$alias"
            ;;
    esac
}

# Run DGMGRL commands against a target alias with timeout and retry.
# Args: $1 = TNS alias, $2 = DGMGRL command text (may be multi-line).
# Output: dgmgrl's combined stdout/stderr on stdout.
# Returns: dgmgrl's exit status; 124/137 come from timeout(1) kills.
run_remote_dgmgrl() {
    local target_alias="$1"
    local sql_commands="$2"
    local connection_str=""

    # The command-substitution's exit status survives the plain assignment,
    # so $? below reflects get_connection_string's result.
    connection_str=$(get_connection_string "$target_alias")
    if [ $? -ne 0 ] || [ -z "$connection_str" ]; then
        echo "ERROR: Failed to build connection string for $target_alias"
        return 1
    fi

    local output=""
    local rc=0

    # One initial attempt plus up to DGMGRL_RETRIES retries; only transient
    # failures (timeout kills or listener/connection errors) are retried.
    for ((i=1; i<=DGMGRL_RETRIES+1; i++)); do
        output=$(timeout -k ${DGMGRL_KILL_TIMEOUT}s ${DGMGRL_TIMEOUT}s dgmgrl $DGMGRL_OPTIONS "$connection_str" <<EOF
$sql_commands
exit;
EOF
) 2>&1
        rc=$?

        if [ $rc -eq 0 ]; then
            echo "$output"
            return 0
        fi

        # Check for transient errors: 124 = timeout expiry, 137 = SIGKILL,
        # plus known lost-connection / listener ORA-/TNS- codes in the output.
        local is_transient=0
        if [ $rc -eq 124 ] || [ $rc -eq 137 ]; then
            is_transient=1
        elif grep -q "ORA-03113\|ORA-03114\|ORA-12541\|ORA-12514\|TNS-12541" <<< "$output"; then
            is_transient=1
        fi

        if [ $i -gt $DGMGRL_RETRIES ] || [ $is_transient -eq 0 ]; then
            echo "$output"
            return $rc
        fi

        log_message "Retry $i/$DGMGRL_RETRIES for $target_alias (exit code: $rc)"
        sleep 2
    done

    echo "$output"
    return 1
}

# Alias for backward compatibility with earlier revisions of this script.
# Forwards exactly two arguments: $1 = TNS alias, $2 = DGMGRL command text.
run_dgmgrl() {
    run_remote_dgmgrl "$1" "$2"
}

# =========================================================================
# LAG PROCESSING FUNCTIONS
# =========================================================================

# Convert a human-readable DGMGRL lag string ("2 days", "15 minutes", ...)
# into total seconds. Accepts singular and plural unit names ("1 hour" as
# well as "3 hours") and any combination of units; zero-padded numbers are
# forced to base 10 so "08"/"09" cannot trip bash's octal parsing.
# Echoes -1 for empty, "N/A", or unknown values so callers can tell
# "no data" apart from zero lag.
lag_to_seconds() {
    local lag_value="$1"
    local seconds=0

    if [[ -z "$lag_value" ]] || [[ "$lag_value" == "N/A" ]] || [[ "$lag_value" =~ unknown|UNKNOWN ]]; then
        echo -1
        return
    fi

    if [[ "$lag_value" =~ ([0-9]+)\ days? ]]; then
        seconds=$((seconds + 10#${BASH_REMATCH[1]} * 86400))
    fi
    if [[ "$lag_value" =~ ([0-9]+)\ hours? ]]; then
        seconds=$((seconds + 10#${BASH_REMATCH[1]} * 3600))
    fi
    if [[ "$lag_value" =~ ([0-9]+)\ minutes? ]]; then
        seconds=$((seconds + 10#${BASH_REMATCH[1]} * 60))
    fi
    if [[ "$lag_value" =~ ([0-9]+)\ seconds? ]]; then
        seconds=$((seconds + 10#${BASH_REMATCH[1]}))
    fi

    echo $seconds
}

# Normalize a DGMGRL interval lag ("+DD HH:MM:SS") into a single
# human-friendly value using the largest applicable unit
# (days > hours > minutes > seconds). Values not matching the interval
# format are passed through unchanged; empty/"N/A" yields "N/A".
parse_lag() {
    local lag_value="$1"

    if [[ -z "$lag_value" ]] || [[ "$lag_value" == "N/A" ]]; then
        echo "N/A"
        return
    fi

    if [[ "$lag_value" =~ \+([0-9]{2})\ ([0-9]{2}):([0-9]{2}):([0-9]{2}) ]]; then
        # Force base-10: zero-padded fields like "08"/"09" would otherwise
        # be parsed as invalid octal and abort the arithmetic expansion.
        local days=$((10#${BASH_REMATCH[1]}))
        local hours=$((10#${BASH_REMATCH[2]}))
        local minutes=$((10#${BASH_REMATCH[3]}))
        local seconds=$((10#${BASH_REMATCH[4]}))
        local total_seconds=$((days * 86400 + hours * 3600 + minutes * 60 + seconds))

        if [ $total_seconds -gt 0 ]; then
            if [ $total_seconds -ge 86400 ]; then
                echo "$days days"
            elif [ $total_seconds -ge 3600 ]; then
                echo "$((hours + days * 24)) hours"
            elif [ $total_seconds -ge 60 ]; then
                echo "$minutes minutes"
            else
                echo "$seconds seconds"
            fi
        else
            echo "0 seconds"
        fi
    else
        echo "$lag_value"
    fi
}

# Map a lag (in seconds) onto OK/WARNING/CRITICAL using the configured
# thresholds. The sentinel -1 — and, for robustness, any non-numeric
# input (the original unquoted test would abort on an empty string) —
# is reported as UNKNOWN.
check_lag_threshold() {
    local lag_seconds="$1"

    if ! [[ "$lag_seconds" =~ ^-?[0-9]+$ ]] || [ "$lag_seconds" -eq -1 ]; then
        echo "UNKNOWN"
    elif [ "$lag_seconds" -ge "$CRITICAL_LAG_THRESHOLD" ]; then
        echo "CRITICAL"
    elif [ "$lag_seconds" -ge "$WARNING_LAG_THRESHOLD" ]; then
        echo "WARNING"
    else
        echo "OK"
    fi
}

# =========================================================================
# STATUS DETERMINATION FUNCTIONS
# =========================================================================

# Collapse the individual health signals into one overall verdict.
# Precedence: infrastructure failure > configuration WARNING/ERROR >
# broker-reported error > lag criticality > OK.
# Args: $1 broker status/error, $2 transport lag status, $3 apply lag
# status, $4 configuration status.
get_overall_status() {
    local broker_err="$1"
    local transport="$2"
    local apply="$3"
    local config="$4"

    # Infrastructure failures trump everything else.
    case "$broker_err" in
        "Show command failed"|"Validate command failed"|"Command timeout")
            echo "INFRA_ERROR"
            return
            ;;
    esac

    # Configuration-level problems come next.
    case "$config" in
        WARNING) echo "WARNING"; return ;;
        ERROR)   echo "ERROR";   return ;;
    esac

    # Broker-reported errors, then lag severity.
    if [[ "$broker_err" != "None" && "$broker_err" != "SUCCESS" ]]; then
        echo "ERROR"
    elif [[ "$transport" == "CRITICAL" || "$apply" == "CRITICAL" ]]; then
        echo "CRITICAL"
    elif [[ "$transport" == "WARNING" || "$apply" == "WARNING" ]]; then
        echo "WARNING"
    else
        echo "OK"
    fi
}

# Decorate a status keyword with its console color and icon; unknown
# values are echoed back unchanged.
get_colored_status() {
    local verdict="$1"
    local decorated
    case "$verdict" in
        OK)          decorated="${GREEN}✓ OK${NC}" ;;
        WARNING)     decorated="${YELLOW}⚠ WARNING${NC}" ;;
        CRITICAL)    decorated="${RED}🔴 CRITICAL${NC}" ;;
        ERROR)       decorated="${RED}✗ ERROR${NC}" ;;
        INFRA_ERROR) decorated="${MAGENTA}⚙ INFRA_ERROR${NC}" ;;
        *)           decorated="$verdict" ;;
    esac
    echo "$decorated"
}

# Map an overall status onto the Prometheus gauge scale:
# 0 = error/critical/infra failure, 1 = warning, 2 = healthy/other.
get_numeric_status() {
    local s="$1"
    if [[ "$s" == "ERROR" || "$s" == "CRITICAL" || "$s" == "INFRA_ERROR" ]]; then
        echo 0
    elif [[ "$s" == "WARNING" ]]; then
        echo 1
    else
        echo 2
    fi
}

# Map a configuration status onto the Prometheus gauge scale:
# 0 = ERROR, 1 = WARNING, 2 = anything else (treated as healthy).
get_numeric_config_status() {
    local cfg="$1"
    if [[ "$cfg" == "ERROR" ]]; then
        echo 0
    elif [[ "$cfg" == "WARNING" ]]; then
        echo 1
    else
        echo 2
    fi
}

# =========================================================================
# TREND DETECTION
# =========================================================================

# Compare the current transport-lag seconds for a primary/standby pair
# against the previously recorded value and report the direction.
# Echoes INCREASING / DECREASING / STABLE, or UNKNOWN when history is
# disabled, the file is missing, or either value is non-numeric.
# Args: $1 primary name, $2 standby name, $3 current lag in seconds.
detect_trend() {
    [ "$ENABLE_HISTORICAL_DATA" != true ] && { echo "UNKNOWN"; return; }
    
    local primary_db="$1"
    local standby="$2"
    local current_seconds="$3"
    local prev_seconds=""
    
    if [ -f "$HISTORICAL_DATA_FILE" ]; then
        {
            flock -s 200
            # Second-to-last matching row = the previous run's sample.
            # NOTE(review): this assumes the current run's row has already
            # been appended — confirm call order relative to write_output.
            # Also, DB names containing regex metacharacters would
            # mis-match this grep pattern.
            prev_seconds=$(grep "^${primary_db}|${standby}|" "$HISTORICAL_DATA_FILE" | tail -2 | head -1 | awk -F'|' '{print $3}' | xargs)
        } 200>"$HISTORY_LOCKFILE"
        
        if [[ -n "$prev_seconds" ]] && [[ "$prev_seconds" =~ ^[0-9]+$ ]] && [[ "$current_seconds" =~ ^[0-9]+$ ]]; then
            if [ "$current_seconds" -gt "$prev_seconds" ]; then
                echo "INCREASING"
            elif [ "$current_seconds" -lt "$prev_seconds" ]; then
                echo "DECREASING"
            else
                echo "STABLE"
            fi
            return
        fi
    fi
    echo "UNKNOWN"
}

# Decorate a trend keyword with its console arrow and color; unknown
# values are echoed back unchanged.
get_colored_trend() {
    local direction="$1"
    local label
    case "$direction" in
        INCREASING) label="${RED}▲ INCREASING${NC}" ;;
        DECREASING) label="${GREEN}▼ DECREASING${NC}" ;;
        STABLE)     label="${BLUE}● STABLE${NC}" ;;
        *)          label="$direction" ;;
    esac
    echo "$label"
}

# =========================================================================
# HTML ESCAPE
# =========================================================================

# Minimal HTML entity escaping for embedding arbitrary strings in the
# report. '&' is replaced first so it does not re-escape the entities
# added afterwards. printf is used instead of echo so values like "-n"
# are emitted verbatim.
html_escape() {
    local str="$1"
    str="${str//&/&amp;}"
    str="${str//</&lt;}"
    str="${str//>/&gt;}"
    str="${str//\"/&quot;}"
    str="${str//\'/&#39;}"
    printf '%s\n' "$str"
}

# =========================================================================
# AUTO-HEAL FUNCTIONS
# =========================================================================

# Read the persisted auto-heal attempt counter for a primary/standby
# pair; echoes 0 when no counter file exists yet. A shared flock guards
# against reading a half-written counter.
get_heal_attempts() {
    local primary="$1"
    local standby="$2"
    local counter="$AUTO_HEAL_COUNT_DIR/${primary}_${standby}.count"

    {
        flock -s 200
        if [ -f "$counter" ]; then
            cat "$counter"
        else
            echo "0"
        fi
    } 200>"${counter}.lock"
}

# Atomically bump the auto-heal attempt counter for a pair. The exclusive
# flock spans read + write so two workers cannot both see the same old
# value; a missing file counts as 0.
increment_heal_attempts() {
    local primary="$1"
    local standby="$2"
    local counter="$AUTO_HEAL_COUNT_DIR/${primary}_${standby}.count"
    {
        flock -x 200
        local previous
        previous=$(cat "$counter" 2>/dev/null || echo 0)
        echo $((previous + 1)) > "$counter"
    } 200>"${counter}.lock"
}

# Clear the attempt counter (and its lock file) for a pair once it has
# returned to a healthy state.
reset_heal_attempts() {
    local counter="$AUTO_HEAL_COUNT_DIR/${1}_${2}.count"
    rm -f -- "$counter" "${counter}.lock"
}

# Return 0 (true) when the given standby appears in AUTO_HEAL_EXCLUDE,
# 1 otherwise (including when the exclusion list is empty).
is_excluded() {
    local candidate="$1"
    local entry
    for entry in "${AUTO_HEAL_EXCLUDE[@]}"; do
        [ "$entry" = "$candidate" ] && return 0
    done
    return 1
}

# Controlled auto-healing for one primary/standby pair. Applies a bounded
# set of broker fixes (restart apply, re-enable transport, enable
# configuration), guarded by: the global and auto-heal dry-run flags, an
# exclusion list, a per-pair cooldown window, and a per-pair attempt cap.
# Args: $1 primary alias, $2 standby name, $3 broker status/error text,
#       $4 apply rate, $5 apply lag seconds, $6 transport-disconnected
#       flag (0/1), $7 configuration status, $8 database role,
#       $9 overall status.
auto_heal() {
    [ "$ENABLE_AUTO_HEAL" != true ] && return
    [ "$GLOBAL_DRY_RUN" = true ] && { log_autoheal "[DRY-RUN] Global dry-run active, skipping auto-heal"; return; }
    
    local PRIMARY="$1"
    local STBY="$2"
    local STATUS_ERR="$3"
    local APPLY_RATE="$4"
    local APPLY_SECONDS="$5"
    local TRANSPORT_DISCONNECTED="$6"
    local CONFIG_STATUS="$7"
    local DB_ROLE="$8"
    local OVERALL_STATUS="$9"
    
    # Check if standby is excluded
    if is_excluded "$STBY"; then
        log_autoheal "[EXCLUDED] $PRIMARY -> $STBY is in exclusion list, skipping"
        return
    fi
    
    # Check role - only heal from PRIMARY
    if [[ "$DB_ROLE" != "PRIMARY" ]]; then
        log_autoheal "[SKIP] $PRIMARY is not PRIMARY (role=$DB_ROLE), skipping auto-heal"
        return
    fi
    
    # Check max attempts - only if not OK
    local attempts=$(get_heal_attempts "$PRIMARY" "$STBY")
    if [ "$OVERALL_STATUS" != "OK" ] && [ "$attempts" -ge "$MAX_AUTO_HEAL_ATTEMPTS" ]; then
        log_autoheal "[MAX_ATTEMPTS] $PRIMARY -> $STBY has reached max attempts ($MAX_AUTO_HEAL_ATTEMPTS), skipping"
        return
    fi
    
    # Per-standby cooldown lock. The subshell's exit status carries the
    # verdict: 1 = still inside the cooldown window, 0 = cleared to heal
    # (and the timestamp file was refreshed under the lock).
    local LOCK_FILE="/var/tmp/dataguard_autoheal_${PRIMARY}_${STBY}.lock"
    (
        flock -x 200
        local now=$(date +%s)
        if [ -f "$LOCK_FILE" ]; then
            local last=$(cat "$LOCK_FILE")
            if (( now - last < AUTO_HEAL_COOLDOWN )); then
                exit 1
            fi
        fi
        echo "$now" > "$LOCK_FILE"
        exit 0
    ) 200>"${LOCK_FILE}.lck"
    
    if [ $? -eq 1 ]; then
        log_autoheal "[COOLDOWN] Active for $PRIMARY -> $STBY, skipping"
        return
    fi

    log_autoheal "[EVALUATE] $PRIMARY -> $STBY | Status: $STATUS_ERR | Rate: $APPLY_RATE | Lag: ${APPLY_SECONDS}s | Config: $CONFIG_STATUS | Attempt: $((attempts+1))/$MAX_AUTO_HEAL_ATTEMPTS"

    # Helper (redefined on each call): run one broker fix through the
    # primary, honoring AUTO_HEAL_DRY_RUN, and log the outcome.
    run_fix() {
        local cmd="$1"
        local description="$2"

        if [ "$AUTO_HEAL_DRY_RUN" = true ]; then
            log_autoheal "[DRY-RUN] $description: $cmd"
            return 0
        fi

        log_autoheal "[EXEC] $description: $cmd"
        run_dgmgrl "$PRIMARY" "$cmd" >> "$AUTO_HEAL_LOG" 2>&1
        local exit_code=$?
        
        if [ $exit_code -eq 0 ]; then
            log_autoheal "[SUCCESS] $description completed"
            return 0
        else
            log_autoheal "[FAILURE] $description failed (exit: $exit_code)"
            return 1
        fi
    }

    local fix_applied=0

    # CASE 1: MRP NOT RUNNING
    if [[ "$STATUS_ERR" == "MRP not running" ]]; then
        log_autoheal "[ACTION] MRP not running - restarting apply"
        run_fix "EDIT DATABASE '$STBY' SET STATE='APPLY-ON';" "Restart MRP on $STBY"
        fix_applied=1
    fi

    # CASE 2: APPLY STALLED (fixed regex)
    if [[ "$APPLY_RATE" =~ ^0(\.0+)?[[:space:]] ]] && [ "$APPLY_SECONDS" -gt 300 ] && [ "$APPLY_SECONDS" -ne -1 ]; then
        log_autoheal "[ACTION] Apply stalled - restarting apply (stop then start)"
        run_fix "EDIT DATABASE '$STBY' SET STATE='APPLY-OFF';" "Stop apply on $STBY"
        sleep 5
        run_fix "EDIT DATABASE '$STBY' SET STATE='APPLY-ON';" "Start apply on $STBY"
        fix_applied=1
    fi

    # CASE 3: TRANSPORT DISCONNECTED (with config status check)
    if [ "$TRANSPORT_DISCONNECTED" -eq 1 ] && [[ "$CONFIG_STATUS" == "SUCCESS" ]]; then
        log_autoheal "[ACTION] Transport disconnected - re-enabling"
        run_fix "EDIT DATABASE '$PRIMARY' SET STATE='TRANSPORT-ON';" "Enable transport on $PRIMARY"
        fix_applied=1
    fi

    # CASE 4: CONFIG DISABLED
    if [[ "$CONFIG_STATUS" == "DISABLED" ]] || [[ "$CONFIG_STATUS" == "disabled" ]]; then
        log_autoheal "[ACTION] Configuration disabled - enabling"
        run_fix "ENABLE CONFIGURATION;" "Enable Data Guard configuration"
        fix_applied=1
    fi
    
    # Update attempt counter based on overall status: healthy resets the
    # counter, an applied fix counts toward MAX_AUTO_HEAL_ATTEMPTS.
    if [ "$OVERALL_STATUS" = "OK" ]; then
        reset_heal_attempts "$PRIMARY" "$STBY"
    elif [ $fix_applied -eq 1 ]; then
        increment_heal_attempts "$PRIMARY" "$STBY"
    fi
}

# =========================================================================
# EMAIL ALERT FUNCTION
# =========================================================================

# Send an alert email for one database, rate-limited by a per-database
# cooldown window (ALERT_COOLDOWN seconds). When no mail utility is
# installed, the alert text is written to REPORT_DIR instead.
# Args: $1 subject, $2 file containing the alert body, $3 database name.
send_alert_email() {
    [ "$ENABLE_EMAIL_ALERTS" != true ] && {
        log_message "Email alerts disabled, skipping"
        return
    }
    [ "$GLOBAL_DRY_RUN" = true ] && { log_message "[DRY-RUN] Global dry-run active, skipping email"; return; }
    
    local subject="$1"
    local body_file="$2"
    local db="$3"
    
    # Per-database cooldown check with per-database lock. The subshell's
    # exit status carries the verdict: 1 = within cooldown, 0 = cleared to
    # send (timestamp refreshed under the lock).
    local cooldown_file="${ALERT_COOLDOWN_BASE}_${db}.lock"
    local lock_file="${ALERT_LOCK_BASE}_${db}.lock"
    (
        flock -x 200
        if [ -f "$cooldown_file" ]; then
            local last=$(cat "$cooldown_file")
            local now=$(date +%s)
            if (( now - last < ALERT_COOLDOWN )); then
                exit 1
            fi
        fi
        date +%s > "$cooldown_file"
        exit 0
    ) 200>"$lock_file"
    
    [ $? -eq 1 ] && { log_message "Alert suppressed for $db (cooldown)"; return; }
    
    local email_body="Data Guard Alert Report\n"
    email_body+="Host: $(hostname)\n"
    email_body+="Database: $db\n"
    email_body+="Time: $(date)\n"
    email_body+="================================\n\n"
    email_body+="$(cat "$body_file")\n"
    
    # Prefer mailx (supports -r to set the envelope sender); plain mail(1)
    # lacks -r, so MAIL_FROM is only honored on the mailx path.
    if command -v mailx >/dev/null 2>&1; then
        echo -e "$email_body" | mailx -s "$subject" -r "$MAIL_FROM" "$MAIL_TO"
        log_message "Alert email sent via mailx for $db"
    elif command -v mail >/dev/null 2>&1; then
        echo -e "$email_body" | mail -s "$subject" "$MAIL_TO"
        log_message "Alert email sent via mail for $db"
    else
        log_message "WARNING: mail utility not found"
        echo "$email_body" > "$REPORT_DIR/alert_manual_${db}_$(date +%Y%m%d_%H%M%S).txt"
    fi
}

# =========================================================================
# WALLET VALIDATION
# =========================================================================

# Probe broker connectivity for every database in DB_LIST and prune the
# unreachable ones out of the list. Returns non-zero only when no
# database at all answered (callers should abort in that case).
validate_wallet() {
    [ "$ENABLE_MONITORING" != true ] && return 0

    log_message "Validating connectivity..."
    local unreachable=()
    local db
    for db in "${DB_LIST[@]}"; do
        # A working broker must answer "show configuration"
        if ! run_dgmgrl "$db" "show configuration;" >/dev/null 2>&1; then
            log_message "WARNING: Connection validation failed for $db (will skip)"
            unreachable+=("$db")
        fi
    done

    if [ ${#unreachable[@]} -eq ${#DB_LIST[@]} ]; then
        log_message "ERROR: All databases failed validation. Exiting."
        return 1
    fi

    # Rebuild DB_LIST keeping only the databases that answered
    local reachable=()
    local bad
    for db in "${DB_LIST[@]}"; do
        local keep=1
        for bad in "${unreachable[@]}"; do
            if [ "$db" = "$bad" ]; then
                keep=0
                break
            fi
        done
        [ $keep -eq 1 ] && reachable+=("$db")
    done
    DB_LIST=("${reachable[@]}")

    log_message "Connection validation successful for ${#DB_LIST[@]} databases"
    return 0
}

# =========================================================================
# ROTATE HISTORICAL FILE
# =========================================================================

# Trim the shared history file down to MAX_HISTORY_LINES once it grows
# past that size, keeping only the newest entries (rewrite-and-rename so
# readers never see a truncated file mid-write).
rotate_historical_file() {
    [ "$ENABLE_HISTORICAL_DATA" != true ] && return
    [ -f "$HISTORICAL_DATA_FILE" ] || return 0

    local line_count
    line_count=$(wc -l < "$HISTORICAL_DATA_FILE")
    if [ "$line_count" -gt "$MAX_HISTORY_LINES" ]; then
        log_message "Rotating historical file (${line_count} lines, max ${MAX_HISTORY_LINES})"
        tail -n "$MAX_HISTORY_LINES" "$HISTORICAL_DATA_FILE" > "${HISTORICAL_DATA_FILE}.tmp"
        mv "${HISTORICAL_DATA_FILE}.tmp" "$HISTORICAL_DATA_FILE"
    fi
}

# =========================================================================
# MAIN DATABASE PROCESSING
# =========================================================================

process_database() {
    local DB="$1"
    local TMPFILE_LOCAL=""
    
    local tmppath
    tmppath=$(mktemp)
    trap "rm -f '$tmppath'" RETURN
    TMPFILE_LOCAL="$tmppath"
    
    local PRIMARY_DB=""
    local DB_ROLE=""
    local STANDBY_ARRAY=()
    local CONNECTION_ERROR=0
    local CONFIG_STATUS=""
    local FSFO_ENABLED=0
    
    log_message "Processing database: $DB"
    
    # Get configuration
    run_dgmgrl "$DB" "show configuration;" > "$TMPFILE_LOCAL" 2>&1
    local dgmgrl_exit=$?
    
    if [ $dgmgrl_exit -eq 124 ] || [ $dgmgrl_exit -eq 137 ]; then
        log_message "ERROR: Timeout connecting to $DB"
        CONNECTION_ERROR=1
    fi
    
    CONFIG_STATUS=$(grep -i "Configuration Status" "$TMPFILE_LOCAL" | awk -F": " '{print $2}' | xargs)
    CONFIG_STATUS=${CONFIG_STATUS:-"Unknown"}
    write_prometheus "dataguard_configuration_status{primary=\"$DB\"} $(get_numeric_config_status "$CONFIG_STATUS")"
    
    # Extract database role
    DB_ROLE=$(grep -i "Database Role" "$TMPFILE_LOCAL" | awk -F": " '{print $2}' | xargs)
    DB_ROLE=${DB_ROLE:-"UNKNOWN"}
    
    # Detect FSFO (Fast-Start Failover) - once per database
    grep -qi "Fast-Start Failover: ENABLED" "$TMPFILE_LOCAL" && FSFO_ENABLED=1
    
    # Check for errors
    if [ $CONNECTION_ERROR -eq 0 ] && grep -E "ORA-|DGM-[0-9]{5}|Error:" "$TMPFILE_LOCAL" | grep -qv "ORA-16809" 2>/dev/null; then
        local error_msg=$(grep -E "ORA-|DGM-[0-9]{5}|Error:" "$TMPFILE_LOCAL" | grep -v "ORA-16809" | head -1)
        log_message "WARNING: DGMGRL error on $DB: $error_msg"
        CONNECTION_ERROR=1
    fi
    
    # Handle connection failure
    if [ $CONNECTION_ERROR -eq 1 ]; then
        local colored_status="${RED}INFRA_ERROR${NC}"
        local output_line=$(printf "%-15s %-20s %-15s %-15s %-20s %-20s %-10s %-15s ${RED}%s${NC} %-10s %-20s\n" \
            "$DB" "N/A" "ERROR" "ERROR" "DGMGRL connection failed" "Unknown" "Unknown" "$CONFIG_STATUS" "INFRA_ERROR" "UNKNOWN" "$(date '+%Y-%m-%d %H:%M:%S')")
        local raw_line="$DB|N/A|-1|-1|DGMGRL connection failed|Unknown|Unknown|$CONFIG_STATUS|INFRA_ERROR|UNKNOWN|$(date '+%Y-%m-%d %H:%M:%S')"
        local csv_line="\"$(escape_csv "$DB")\",\"N/A\",\"ERROR\",\"ERROR\",\"DGMGRL connection failed\",\"Unknown\",\"Unknown\",\"$(escape_csv "$CONFIG_STATUS")\",\"INFRA_ERROR\",\"UNKNOWN\",\"$(date '+%Y-%m-%d %H:%M:%S')\""
        write_output "$output_line" "$output_line" "$raw_line" "$csv_line"
        return 1
    fi
    
    # Extract primary database
    PRIMARY_DB=$(grep -i "primary database" "$TMPFILE_LOCAL" | awk -F'"' '{print $2}' | xargs)
    [ -z "$PRIMARY_DB" ] && PRIMARY_DB=$(grep -i "primary database" "$TMPFILE_LOCAL" | awk '{print $NF}' | tr -d '"')
    
    local PROM_PRIMARY="$DB"
    local DISPLAY_PRIMARY="${PRIMARY_DB:-$DB} ($DB)"
    local DISPLAY_PRIMARY_ESCAPED=$(html_escape "$DISPLAY_PRIMARY")
    
    # Extract standbys
    mapfile -t STANDBY_ARRAY < <(grep "Physical standby database" "$TMPFILE_LOCAL" | awk -F'"' '{print $2}')
    if [ ${#STANDBY_ARRAY[@]} -eq 0 ]; then
        mapfile -t STANDBY_ARRAY < <(grep "Physical standby database" "$TMPFILE_LOCAL" | awk '{print $NF}' | tr -d '"')
    fi
    
    if [ ${#STANDBY_ARRAY[@]} -eq 0 ]; then
        log_message "INFO: No standbys found for ${PRIMARY_DB:-$DB}"
        return 0
    fi
    
    # Process each standby
    for STBY in "${STANDBY_ARRAY[@]}"; do
        : > "$TMPFILE_LOCAL"
        
        # Combined commands - single connection per standby for efficiency
        run_dgmgrl "$DB" "
show database verbose '$STBY';
validate database '$STBY';
" > "$TMPFILE_LOCAL" 2>&1
        local combined_exit=$?
        
        # Timeout handling
        if [ $combined_exit -eq 124 ] || [ $combined_exit -eq 137 ]; then
            log_message "ERROR: Timeout processing standby $STBY on $PRIMARY_DB"
            local colored_status="${RED}INFRA_ERROR${NC}"
            local output_line=$(printf "%-15s %-20s %-15s %-15s %-20s %-20s %-10s %-15s ${RED}%s${NC} %-10s %-20s\n" \
                "$DISPLAY_PRIMARY" "$STBY" "TIMEOUT" "TIMEOUT" "Command timeout" "Unknown" "Unknown" "$CONFIG_STATUS" "INFRA_ERROR" "UNKNOWN" "$(date '+%Y-%m-%d %H:%M:%S')")
            local raw_line="${PRIMARY_DB:-$DB}|$STBY|-1|-1|Command timeout|Unknown|Unknown|$CONFIG_STATUS|INFRA_ERROR|UNKNOWN|$(date '+%Y-%m-%d %H:%M:%S')"
            local csv_line="\"$(escape_csv "${PRIMARY_DB:-$DB}")\",\"$(escape_csv "$STBY")\",\"TIMEOUT\",\"TIMEOUT\",\"Command timeout\",\"Unknown\",\"Unknown\",\"$(escape_csv "$CONFIG_STATUS")\",\"INFRA_ERROR\",\"UNKNOWN\",\"$(date '+%Y-%m-%d %H:%M:%S')\""
            write_output "$output_line" "$output_line" "$raw_line" "$csv_line"
            
            write_prometheus "dataguard_transport_lag_seconds{primary=\"$PROM_PRIMARY\",standby=\"$STBY\"} -1
dataguard_apply_lag_seconds{primary=\"$PROM_PRIMARY\",standby=\"$STBY\"} -1
dataguard_status{primary=\"$PROM_PRIMARY\",standby=\"$STBY\",status=\"INFRA_ERROR\",config=\"$CONFIG_STATUS\"} 0"
            continue
        fi
        
        # Check output presence
        local show_output_present=0
        local validate_output_present=0
        
        grep -q "Database Name:" "$TMPFILE_LOCAL" 2>/dev/null && show_output_present=1
        grep -q "Transport Lag:" "$TMPFILE_LOCAL" 2>/dev/null && show_output_present=1
        grep -q "Intended State:" "$TMPFILE_LOCAL" 2>/dev/null && show_output_present=1
        grep -q "Ready for Switchover:" "$TMPFILE_LOCAL" 2>/dev/null && validate_output_present=1
        grep -q "Ready for Failover:" "$TMPFILE_LOCAL" 2>/dev/null && validate_output_present=1
        
        # Parse values
        local TRANSPORT_LAG_RAW=""
        local APPLY_LAG_RAW=""
        local STATUS_ERR=""
        local INTENDED_STATE=""
        local ENABLED=""
        local SWITCHOVER=""
        local APPLY_RATE=""
        local TRANSPORT_DISCONNECTED=0
        
        if [ $show_output_present -eq 1 ]; then
            TRANSPORT_LAG_RAW=$(awk -F": " '/Transport Lag/ {print $2}' "$TMPFILE_LOCAL" | xargs)
            APPLY_LAG_RAW=$(awk -F": " '/Apply Lag/ {print $2}' "$TMPFILE_LOCAL" | xargs)
            STATUS_ERR=$(awk -F": " '/Status:|Error:/ {print $2}' "$TMPFILE_LOCAL" | grep -v "SUCCESS" | head -1 | xargs)
            INTENDED_STATE=$(awk -F": " '/Intended State/ {print $2}' "$TMPFILE_LOCAL" | xargs)
            ENABLED=$(awk -F": " '/^Enabled:/ {print $2}' "$TMPFILE_LOCAL" | head -1 | xargs)
            APPLY_RATE=$(awk -F": " '/Apply Rate/ {print $2}' "$TMPFILE_LOCAL" | xargs)
            
            # Check MRP
            if grep -qi "Apply Instance.*not running" "$TMPFILE_LOCAL"; then
                STATUS_ERR="MRP not running"
                log_message "WARNING: MRP not running for $STBY"
            fi
            
            # Check transport
            if grep -qi "DISCONNECTED" "$TMPFILE_LOCAL"; then
                TRANSPORT_DISCONNECTED=1
                log_message "WARNING: Transport disconnected for $STBY"
            fi
        else
            STATUS_ERR="Show command failed"
        fi
        
        if [ $validate_output_present -eq 1 ]; then
            SWITCHOVER=$(awk -F": " '/Ready for Switchover/ {print $2}' "$TMPFILE_LOCAL" | xargs)
        fi
        
        # Parse lag
        local TRANSPORT_LAG=$(parse_lag "$TRANSPORT_LAG_RAW")
        local APPLY_LAG=$(parse_lag "$APPLY_LAG_RAW")
        local transport_seconds=$(lag_to_seconds "$TRANSPORT_LAG")
        local apply_seconds=$(lag_to_seconds "$APPLY_LAG")
        
        # Set defaults
        TRANSPORT_LAG=${TRANSPORT_LAG:-"N/A"}
        APPLY_LAG=${APPLY_LAG:-"N/A"}
        STATUS_ERR=${STATUS_ERR:-"None"}
        ENABLED=${ENABLED:-"Unknown"}
        SWITCHOVER=${SWITCHOVER:-"Unknown"}
        
        # Validate failures
        if [ $validate_output_present -eq 0 ] && [ "$STATUS_ERR" = "None" ]; then
            STATUS_ERR="Validate command failed"
        fi
        
        # Check thresholds
        local transport_status=$(check_lag_threshold $transport_seconds)
        local apply_status=$(check_lag_threshold $apply_seconds)
        
        # Apply stall detection (fixed regex)
        if [[ "$APPLY_RATE" =~ ^0(\.0+)?[[:space:]] ]] && [ "$apply_seconds" -gt 60 ] && [ "$apply_seconds" -ne -1 ]; then
            STATUS_ERR="Apply stalled"
            apply_status="CRITICAL"
            log_message "WARNING: Apply stalled for $STBY (rate=$APPLY_RATE, lag=${apply_seconds}s)"
        fi
        
        # Override critical conditions
        [ "$STATUS_ERR" = "MRP not running" ] && apply_status="CRITICAL"
        [ $TRANSPORT_DISCONNECTED -eq 1 ] && transport_status="CRITICAL"
        
        # Detect trend
        local TREND=$(detect_trend "${PRIMARY_DB:-$DB}" "$STBY" "$transport_seconds")
        
        # Overall status
        local OVERALL_STATUS=$(get_overall_status "$STATUS_ERR" "$transport_status" "$apply_status" "$CONFIG_STATUS")
        local NUMERIC_STATUS=$(get_numeric_status "$OVERALL_STATUS")
        
        # Auto-heal hook (skip if FSFO is enabled)
        if [ $FSFO_ENABLED -eq 0 ]; then
            auto_heal "${PRIMARY_DB:-$DB}" "$STBY" "$STATUS_ERR" "$APPLY_RATE" "$apply_seconds" "$TRANSPORT_DISCONNECTED" "$CONFIG_STATUS" "$DB_ROLE" "$OVERALL_STATUS"
        elif [ $FSFO_ENABLED -eq 1 ]; then
            log_message "INFO: FSFO enabled for $PRIMARY_DB, auto-heal disabled for this configuration"
        fi
        
        # HTML classes
        local row_class="ok"
        case "$OVERALL_STATUS" in
            "CRITICAL"|"ERROR"|"INFRA_ERROR") row_class="crit" ;;
            "WARNING") row_class="warn" ;;
        esac
        
        local trend_class=""
        case "$TREND" in
            "INCREASING") trend_class="trend-up" ;;
            "DECREASING") trend_class="trend-down" ;;
            *) trend_class="trend-stable" ;;
        esac
        
        # Escape HTML
        local STBY_ESCAPED=$(html_escape "$STBY")
        local STATUS_ERR_ESCAPED=$(html_escape "$STATUS_ERR")
        local SWITCHOVER_ESCAPED=$(html_escape "$SWITCHOVER")
        local ENABLED_ESCAPED=$(html_escape "$ENABLED")
        local CONFIG_STATUS_ESCAPED=$(html_escape "$CONFIG_STATUS")
        
        # Colored output
        local colored_status=$(get_colored_status "$OVERALL_STATUS")
        local colored_trend=$(get_colored_trend "$TREND")
        
        # Output
        local output_line=$(printf "%-15s %-20s %-15s %-15s %-20s %-20s %-10s %-15s ${colored_status} ${colored_trend} %-20s\n" \
            "$DISPLAY_PRIMARY" "$STBY" "$TRANSPORT_LAG" "$APPLY_LAG" "$STATUS_ERR" "$SWITCHOVER" "$ENABLED" "$CONFIG_STATUS" "$(date '+%Y-%m-%d %H:%M:%S')")
        
        local raw_line="${PRIMARY_DB:-$DB}|$STBY|$transport_seconds|$apply_seconds|$STATUS_ERR|$SWITCHOVER|$ENABLED|$CONFIG_STATUS|$OVERALL_STATUS|$TREND|$(date '+%Y-%m-%d %H:%M:%S')"
        local csv_line="\"$(escape_csv "${PRIMARY_DB:-$DB}")\",\"$(escape_csv "$STBY")\",\"$(escape_csv "$TRANSPORT_LAG")\",\"$(escape_csv "$APPLY_LAG")\",\"$(escape_csv "$STATUS_ERR")\",\"$(escape_csv "$SWITCHOVER")\",\"$(escape_csv "$ENABLED")\",\"$(escape_csv "$CONFIG_STATUS")\",\"$(escape_csv "$OVERALL_STATUS")\",\"$(escape_csv "$TREND")\",\"$(date '+%Y-%m-%d %H:%M:%S')\""
        
        write_output "$output_line" "$output_line" "$raw_line" "$csv_line"
        
        write_prometheus "dataguard_transport_lag_seconds{primary=\"$PROM_PRIMARY\",standby=\"$STBY\"} $transport_seconds
dataguard_apply_lag_seconds{primary=\"$PROM_PRIMARY\",standby=\"$STBY\"} $apply_seconds
dataguard_status{primary=\"$PROM_PRIMARY\",standby=\"$STBY\",status=\"$OVERALL_STATUS\",config=\"$CONFIG_STATUS\"} $NUMERIC_STATUS"
        
        local html_row="<tr class='$row_class'>
<td>${DISPLAY_PRIMARY_ESCAPED}</td>
<td>${STBY_ESCAPED}</td>
<td>$TRANSPORT_LAG</td>
<td>$APPLY_LAG</td>
<td>${STATUS_ERR_ESCAPED}</td>
<td>${SWITCHOVER_ESCAPED}</td>
<td>${ENABLED_ESCAPED}</td>
<td>${CONFIG_STATUS_ESCAPED}</td>
<td>$OVERALL_STATUS</td>
<td class=\"$trend_class\">$TREND</td>
<td>$(date '+%Y-%m-%d %H:%M:%S')</td>
</tr>"
        write_html_row "$html_row"
        
        # Log issues
        if [ "$OVERALL_STATUS" = "CRITICAL" ] || [ "$OVERALL_STATUS" = "ERROR" ]; then
            log_message "CRITICAL: $OVERALL_STATUS for $PRIMARY_DB -> $STBY: $STATUS_ERR (transport: $TRANSPORT_LAG, apply: $APPLY_LAG)"
        elif [ "$OVERALL_STATUS" = "WARNING" ]; then
            log_message "WARNING: $OVERALL_STATUS for $PRIMARY_DB -> $STBY: transport=$TRANSPORT_LAG apply=$APPLY_LAG"
        elif [ "$OVERALL_STATUS" = "INFRA_ERROR" ]; then
            log_message "INFRA_ERROR: Command issue for $PRIMARY_DB -> $STBY: $STATUS_ERR"
        fi
    done
}

# =========================================================================
# HTML REPORT FUNCTIONS
# =========================================================================

#######################################
# Write the opening portion of the HTML dashboard (document header,
# CSS, summary panel, and the table <thead>) to $HTML_REPORT,
# truncating any previous report.  Data rows are appended later via
# write_html_row; finalize_html_report closes the document.
# Globals (read): ENABLE_HTML_REPORT, HTML_REPORT, CONNECTION_METHOD,
#   DB_LIST, MAX_PARALLEL_JOBS, ENABLE_EMAIL_ALERTS, ENABLE_AUTO_HEAL,
#   AUTO_HEAL_DRY_RUN, GLOBAL_DRY_RUN
# Returns: 0 (silently a no-op when HTML reporting is disabled)
#######################################
init_html_report() {
    # Honor the control-panel toggle: do nothing when HTML is disabled.
    [ "$ENABLE_HTML_REPORT" != true ] && return
    
    # Unquoted EOF delimiter so $(date), $(hostname) and the config
    # variables expand at generation time.  The "exec-time" span is a
    # placeholder patched by the <script> that finalize_html_report emits.
    cat <<EOF > "$HTML_REPORT"
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>Data Guard Report - $(date)</title>
    <style>
        body { font-family: 'Segoe UI', Arial, sans-serif; margin: 20px; background: #f5f5f5; }
        h1, h2 { color: #333; }
        table { border-collapse: collapse; width: 100%; background: white; box-shadow: 0 1px 3px rgba(0,0,0,0.2); }
        th, td { border: 1px solid #ddd; padding: 12px; text-align: left; }
        th { background: #4CAF50; color: white; }
        tr:hover { background: #f5f5f5; }
        .ok { background: #d4edda; }
        .warn { background: #fff3cd; }
        .crit { background: #f8d7da; }
        .trend-up { color: #dc3545; font-weight: bold; }
        .trend-down { color: #28a745; font-weight: bold; }
        .trend-stable { color: #6c757d; }
        .footer { margin-top: 20px; font-size: 12px; color: #666; text-align: center; }
        .summary { background: white; padding: 15px; margin-bottom: 20px; border-left: 4px solid #4CAF50; }
    </style>
</head>
<body>
    <h1>📊 Oracle Data Guard Status Report</h1>
    <div class="summary">
        <strong>Report Time:</strong> $(date)<br>
        <strong>Host:</strong> $(hostname)<br>
        <strong>Connection Method:</strong> $CONNECTION_METHOD<br>
        <strong>Databases Monitored:</strong> ${#DB_LIST[@]}<br>
        <strong>Parallel Jobs:</strong> $MAX_PARALLEL_JOBS<br>
        <strong>Email Alerts:</strong> $( [ "$ENABLE_EMAIL_ALERTS" = true ] && echo "ENABLED" || echo "DISABLED" )<br>
        <strong>Auto-Heal:</strong> $( [ "$ENABLE_AUTO_HEAL" = true ] && echo "ENABLED (Dry-run: $AUTO_HEAL_DRY_RUN)" || echo "DISABLED" )<br>
        <strong>Global Dry-Run:</strong> $GLOBAL_DRY_RUN<br>
        <strong>Execution Time:</strong> <span id="exec-time">Calculating...</span>
    </div>
    <h2>📋 Standby Database Status</h2>
     <table>
        <thead>
             <tr>
                <th>Primary</th><th>Standby</th><th>Transport Lag</th><th>Apply Lag</th>
                <th>Error</th><th>Switchover</th><th>Enabled</th><th>Config</th><th>Status</th><th>Trend</th><th>Timestamp</th>
             </tr>
        </thead>
        <tbody>
EOF
}

#######################################
# Close the HTML dashboard started by init_html_report: append the
# </tbody>, legend footer, and a small script that fills in the
# "exec-time" placeholder with the measured run duration.
# Globals (read):  ENABLE_HTML_REPORT, HTML_REPORT, START_TIME
# Globals (write): DURATION (seconds since START_TIME; main recomputes
#   its own copy afterwards, so this global is effectively private)
# Returns: 0 (no-op when HTML reporting is disabled)
#######################################
finalize_html_report() {
    # Honor the control-panel toggle: do nothing when HTML is disabled.
    [ "$ENABLE_HTML_REPORT" != true ] && return
    
    DURATION=$(( $(date +%s) - START_TIME ))
    # Unquoted EOF so $(date) and ${DURATION} expand; appended (>>) to
    # the report body built up by init_html_report/write_html_row.
    cat <<EOF >> "$HTML_REPORT"
        </tbody>
     </table>
    <div class="footer">
        <strong>Legend:</strong>
        <span style="background:#d4edda; padding:2px 8px;">✓ OK</span>
        <span style="background:#fff3cd; padding:2px 8px;">⚠ Warning</span>
        <span style="background:#f8d7da; padding:2px 8px;">🔴 Critical</span>
        Trend: <span class="trend-up">▲ Increasing</span> <span class="trend-down">▼ Decreasing</span> <span class="trend-stable">● Stable</span>
        <br><br>
        Generated by Data Guard Monitoring System | $(date) | Execution Time: ${DURATION}s
    </div>
    <script>document.getElementById('exec-time').innerText = '${DURATION}s';</script>
</body>
</html>
EOF
}

# =========================================================================
# SUMMARY GENERATION
# =========================================================================

#######################################
# Append the final summary section to $REPORT_FILE and, when any
# standby row carries a problem status, write an alert file and send
# an e-mail notification.
# Parses the pipe-delimited $RAW_DATA_FILE; its first line is a column
# header (written by main), so every count/aggregate below skips NR==1.
# Globals (read): REPORT_DIR, REPORT_FILE, RAW_DATA_FILE, DB_LIST,
#   OUTPUT_LOCKFILE, HTML_REPORT, CSV_FILE, HISTORICAL_DATA_FILE,
#   PROM_FILE, LOG_FILE, AUTO_HEAL_LOG, CONNECTION_METHOD,
#   MAX_PARALLEL_JOBS, ENABLE_* toggles, AUTO_HEAL_DRY_RUN,
#   GLOBAL_DRY_RUN, START_TIME
# Calls:   log_message, send_alert_email (defined earlier in the file)
# Returns: 0
#######################################
generate_summary() {
    local alert_file="$REPORT_DIR/alert_$(date +%s).txt"
    local alert_flag=0
    local issues_found=0
    
    # Serialize report writes against any stragglers via fd 200.
    # Braces (not parens) keep this body in the current shell so
    # alert_flag survives for the e-mail decision below.
    {
        flock -x 200
        echo -e "\n=== SUMMARY ===" >> "$REPORT_FILE"
        echo "Report generated: $(date)" >> "$REPORT_FILE"
        echo "Report file: $REPORT_FILE" >> "$REPORT_FILE"
        [ "$ENABLE_HTML_REPORT" = true ] && echo "HTML report: $HTML_REPORT" >> "$REPORT_FILE"
        [ "$ENABLE_CSV_EXPORT" = true ] && echo "CSV export: $CSV_FILE" >> "$REPORT_FILE"
        echo "Raw data file: $RAW_DATA_FILE" >> "$REPORT_FILE"
        echo "Historical data file: $HISTORICAL_DATA_FILE" >> "$REPORT_FILE"
        echo "Prometheus metrics: $PROM_FILE" >> "$REPORT_FILE"
        echo "Log file: $LOG_FILE" >> "$REPORT_FILE"
        echo "Auto-heal log: $AUTO_HEAL_LOG" >> "$REPORT_FILE"
        echo "" >> "$REPORT_FILE"
        echo "Configuration Summary:" >> "$REPORT_FILE"
        echo "  Connection Method: $CONNECTION_METHOD" >> "$REPORT_FILE"
        echo "  Parallel Jobs: $MAX_PARALLEL_JOBS" >> "$REPORT_FILE"
        echo "  Email Alerts: $( [ "$ENABLE_EMAIL_ALERTS" = true ] && echo "ENABLED" || echo "DISABLED" )" >> "$REPORT_FILE"
        echo "  Auto-Heal: $( [ "$ENABLE_AUTO_HEAL" = true ] && echo "ENABLED (Dry-run: $AUTO_HEAL_DRY_RUN)" || echo "DISABLED" )" >> "$REPORT_FILE"
        echo "  Historical Data: $( [ "$ENABLE_HISTORICAL_DATA" = true ] && echo "ENABLED" || echo "DISABLED" )" >> "$REPORT_FILE"
        echo "  Global Dry-Run: $GLOBAL_DRY_RUN" >> "$REPORT_FILE"
        
        if [ -f "$RAW_DATA_FILE" ]; then
            # FIX: the previous 'END{print NR}' counted the header line,
            # inflating the total by one. Count data rows only.
            local total_standbys
            total_standbys=$(awk 'NR>1{n++} END{print n+0}' "$RAW_DATA_FILE")
            echo "Total standbys processed: $total_standbys" >> "$REPORT_FILE"
            
            echo "" >> "$REPORT_FILE"
            echo "Breakdown by Primary Database:" >> "$REPORT_FILE"
            local count
            for DB in "${DB_LIST[@]}"; do
                # NR>1 keeps the header row out of the per-primary counts.
                count=$(awk -F'|' -v db="$DB" 'NR>1 && $1==db {count++} END{print count+0}' "$RAW_DATA_FILE")
                [ "$count" -gt 0 ] && echo "  $DB: $count standby(s)" >> "$REPORT_FILE"
            done
            
            # Health summary: worst status wins per primary
            # (CRITICAL > ERROR > WARNING > anything else).
            echo "" >> "$REPORT_FILE"
            echo "Primary Database Health Summary:" >> "$REPORT_FILE"
            awk -F'|' '
            NR == 1 { next }   # FIX: header row used to surface as "PrimaryDB: OverallStatus"
            {
                primary=$1
                status=$9

                if (!(primary in seen)) {
                    seen[primary]=status
                } else {
                    current=seen[primary]

                    if (status == "CRITICAL") {
                        seen[primary]="CRITICAL"
                    } else if (status == "ERROR" && current != "CRITICAL") {
                        seen[primary]="ERROR"
                    } else if (status == "WARNING" && current != "CRITICAL" && current != "ERROR") {
                        seen[primary]="WARNING"
                    }
                }
            }
            END {
                for (p in seen) {
                    printf("  %s: %s\n", p, seen[p])
                }
            }' "$RAW_DATA_FILE" >> "$REPORT_FILE"
            
            # Issue detection: any DATA row whose overall status signals
            # a problem triggers the alert path below.
            local issue_lines
            issue_lines=$(awk -F'|' 'NR>1 && $9 ~ /CRITICAL|ERROR|INFRA_ERROR/' "$RAW_DATA_FILE")
            
            if [ -n "$issue_lines" ]; then
                echo "" >> "$REPORT_FILE"
                echo "Issues Detected:" >> "$REPORT_FILE"
                echo "$issue_lines" >> "$REPORT_FILE"
                
                echo "$issue_lines" > "$alert_file"
                alert_flag=1
                issues_found=1
            else
                echo "" >> "$REPORT_FILE"
                echo "No issues detected." >> "$REPORT_FILE"
            fi
        fi
        
        # Execution time (split decl/assign so a date failure isn't masked)
        local end_time duration
        end_time=$(date +%s)
        duration=$((end_time - START_TIME))
        echo "" >> "$REPORT_FILE"
        echo "Execution Time: ${duration} seconds" >> "$REPORT_FILE"
        echo "=========================" >> "$REPORT_FILE"
        
    } 200>"$OUTPUT_LOCKFILE"
    
    # Alert outside the lock so a slow mailer cannot block other writers.
    if [ $alert_flag -eq 1 ]; then
        log_message "Issues detected - triggering alert"
        send_alert_email "Data Guard Alert - Issues Detected" "$alert_file" "ALL_DATABASES"
    else
        log_message "No issues detected"
    fi
}

# =========================================================================
# MAIN EXECUTION
# =========================================================================

#######################################
# Orchestrates one monitoring run: honors the master switch, arms an
# optional hard-kill watchdog, validates connectivity, initializes the
# report files, fans out one background worker per database (bounded
# parallelism), waits, then summarizes and finalizes all outputs.
# Globals (read): control-panel toggles, DB_LIST, MAX_PARALLEL_JOBS,
#   PARALLEL_SLEEP_INTERVAL, SCRIPT_TIMEOUT, START_TIME, path vars.
# Globals (write): WATCHDOG_PID, DURATION.
# Calls: log_message, rotate_historical_file, validate_wallet,
#   init_html_report, process_database, generate_summary,
#   finalize_html_report (all defined earlier in this file).
#######################################
main() {
    # Check master switch
    if [ "$ENABLE_MONITORING" != true ]; then
        echo "Monitoring is disabled. Set ENABLE_MONITORING=true to enable."
        exit 0
    fi
    
    # Global timeout watchdog: detached subshell that hard-kills the
    # script if it overruns SCRIPT_TIMEOUT.
    # NOTE(review): assumes MAIN_PID was captured earlier in the file
    # (e.g. MAIN_PID=$$) — confirm; if unset, the kill is a no-op and
    # the timeout only prints the error.
    if [ -n "$SCRIPT_TIMEOUT" ] && [ "$SCRIPT_TIMEOUT" -gt 0 ]; then
        (
            sleep "$SCRIPT_TIMEOUT"
            echo "ERROR: Script timeout after ${SCRIPT_TIMEOUT}s" >&2
            kill -9 $MAIN_PID 2>/dev/null
        ) &
        WATCHDOG_PID=$!
    fi
    
    # Record the effective configuration at the top of the log.
    log_message "===== Data Guard Monitoring Started ====="
    log_message "Connection Method: $CONNECTION_METHOD"
    log_message "Parallel jobs: $MAX_PARALLEL_JOBS"
    log_message "Email alerts: $([ "$ENABLE_EMAIL_ALERTS" = true ] && echo "ENABLED" || echo "DISABLED")"
    log_message "Auto-heal: $([ "$ENABLE_AUTO_HEAL" = true ] && echo "ENABLED (Dry-run: $AUTO_HEAL_DRY_RUN)" || echo "DISABLED")"
    log_message "Global dry-run: $GLOBAL_DRY_RUN"
    log_message "Prometheus metrics: $([ "$ENABLE_PROM_METRICS" = true ] && echo "ENABLED" || echo "DISABLED")"
    log_message "HTML report: $([ "$ENABLE_HTML_REPORT" = true ] && echo "ENABLED" || echo "DISABLED")"
    log_message "CSV export: $([ "$ENABLE_CSV_EXPORT" = true ] && echo "ENABLED" || echo "DISABLED")"
    log_message "Historical data: $([ "$ENABLE_HISTORICAL_DATA" = true ] && echo "ENABLED" || echo "DISABLED")"
    
    # Rotate historical file
    rotate_historical_file
    
    # Validate wallet connectivity; abort (and disarm the watchdog)
    # when no database in the inventory is reachable.
    if ! validate_wallet; then
        log_message "ERROR: No valid databases. Exiting."
        [ -n "$WATCHDOG_PID" ] && kill -9 "$WATCHDOG_PID" 2>/dev/null
        exit 1
    fi
    
    # Initialize HTML report
    init_html_report
    
    # Initialize report files under the output lock (fd 200) so worker
    # processes cannot interleave with the header writes.
    {
        flock -x 200
        echo "Data Guard Status Report - $(date)" > "$REPORT_FILE"
        echo "========================================" >> "$REPORT_FILE"
        echo "" >> "$REPORT_FILE"
        echo "Configuration:" >> "$REPORT_FILE"
        echo "  Connection Method: $CONNECTION_METHOD" >> "$REPORT_FILE"
        echo "  Parallel Jobs: $MAX_PARALLEL_JOBS" >> "$REPORT_FILE"
        echo "  Email Alerts: $( [ "$ENABLE_EMAIL_ALERTS" = true ] && echo "ENABLED" || echo "DISABLED" )" >> "$REPORT_FILE"
        echo "  Auto-Heal: $( [ "$ENABLE_AUTO_HEAL" = true ] && echo "ENABLED (Dry-run: $AUTO_HEAL_DRY_RUN)" || echo "DISABLED" )" >> "$REPORT_FILE"
        echo "  Global Dry-Run: $GLOBAL_DRY_RUN" >> "$REPORT_FILE"
        echo "  Historical Data: $( [ "$ENABLE_HISTORICAL_DATA" = true ] && echo "ENABLED" || echo "DISABLED" )" >> "$REPORT_FILE"
        echo "" >> "$REPORT_FILE"
        
        # Pipe-delimited header consumed by generate_summary ($9 = OverallStatus).
        echo "PrimaryDB|StandbyDB|TransportLagSec|ApplyLagSec|Error|SwitchoverReady|Enabled|ConfigStatus|OverallStatus|Trend|Timestamp" > "$RAW_DATA_FILE"
        
        # Print header to console
        printf "\n%-15s %-20s %-15s %-15s %-20s %-20s %-10s %-15s %-15s %-10s %-20s\n" \
            "PrimaryDB" "StandbyDB" "TransportLag" "ApplyLag" "Error" "SwitchoverReady" "Enabled" "ConfigStatus" "Status" "Trend" "Date"
        printf "%-15s %-20s %-15s %-15s %-20s %-20s %-10s %-15s %-15s %-10s %-20s\n" \
            "---------------" "--------------------" "---------------" "---------------" "--------------------" "--------------------" "---------" "---------------" "---------" "---------" "--------------------"
        
        # Write headers to report file
        printf "%-15s %-20s %-15s %-15s %-20s %-20s %-10s %-15s %-15s %-10s %-20s\n" \
            "PrimaryDB" "StandbyDB" "TransportLag" "ApplyLag" "Error" "SwitchoverReady" "Enabled" "ConfigStatus" "Status" "Trend" "Date" >> "$REPORT_FILE"
        printf "%-15s %-20s %-15s %-15s %-20s %-20s %-10s %-15s %-15s %-10s %-20s\n" \
            "---------------" "--------------------" "---------------" "---------------" "--------------------" "--------------------" "---------" "---------------" "---------" "---------" "--------------------" >> "$REPORT_FILE"
    } 200>"$OUTPUT_LOCKFILE"
    
    # Parallel execution: poll running job count, launch one worker per
    # database, and cap concurrency at MAX_PARALLEL_JOBS.
    # NOTE(review): `jobs -rp` also counts the watchdog subshell while
    # it is alive, so effective worker parallelism is one less than
    # MAX_PARALLEL_JOBS whenever SCRIPT_TIMEOUT is set — confirm intent.
    local pids=()
    for DB in "${DB_LIST[@]}"; do
        # Wait until we have fewer than MAX_PARALLEL_JOBS running
        while [ "$(jobs -rp | wc -l)" -ge "$MAX_PARALLEL_JOBS" ]; do
            sleep "$PARALLEL_SLEEP_INTERVAL"
        done
        process_database "$DB" &
        pids+=($!)
    done
    
    # Wait for all background jobs; a worker's non-zero exit is logged
    # but does not abort the run.
    for pid in "${pids[@]}"; do
        wait $pid 2>/dev/null || log_message "ERROR: Background job $pid failed"
    done
    
    # Generate summary
    generate_summary
    
    # Finalize HTML report
    finalize_html_report
    
    # Kill watchdog (all workers are done, timeout no longer needed)
    [ -n "$WATCHDOG_PID" ] && kill -9 "$WATCHDOG_PID" 2>/dev/null
    
    # Copy Prometheus metrics if node_exporter exists (best-effort:
    # cp failures are suppressed and simply skip the log line)
    if [ "$ENABLE_PROM_METRICS" = true ] && [ -d "/var/lib/node_exporter/textfile_collector" ]; then
        cp "$PROM_FILE" "/var/lib/node_exporter/textfile_collector/dataguard_metrics.prom" 2>/dev/null && \
        log_message "Prometheus metrics copied to node_exporter"
    fi
    
    # Console recap of every artifact produced by this run.
    DURATION=$(( $(date +%s) - START_TIME ))
    echo -e "\n${GREEN}Report Completed.${NC}"
    echo "Report: $REPORT_FILE"
    [ "$ENABLE_HTML_REPORT" = true ] && echo "HTML: $HTML_REPORT"
    [ "$ENABLE_CSV_EXPORT" = true ] && echo "CSV: $CSV_FILE"
    echo "Raw: $RAW_DATA_FILE"
    [ "$ENABLE_HISTORICAL_DATA" = true ] && echo "History: $HISTORICAL_DATA_FILE"
    [ "$ENABLE_PROM_METRICS" = true ] && echo "Prometheus: $PROM_FILE"
    echo "Log: $LOG_FILE"
    [ "$ENABLE_AUTO_HEAL" = true ] && echo "Auto-heal log: $AUTO_HEAL_LOG"
    echo "Connection Method: $CONNECTION_METHOD"
    echo "Parallel Jobs: $MAX_PARALLEL_JOBS"
    echo "Email: $([ "$ENABLE_EMAIL_ALERTS" = true ] && echo "ENABLED" || echo "DISABLED")"
    echo "Auto-heal: $([ "$ENABLE_AUTO_HEAL" = true ] && echo "ENABLED (Dry-run: $AUTO_HEAL_DRY_RUN)" || echo "DISABLED")"
    echo "Global Dry-Run: $GLOBAL_DRY_RUN"
    echo "Time: ${DURATION}s"
    
    log_message "===== Data Guard Monitoring Completed ====="
}

# Script entry point: forward all command-line arguments to main().
main "$@"