Popular Posts

Saturday, March 28, 2026

DGMGRL monitoring script for Oracle 19c Data Guard

#!/bin/bash
# ------------------------------------------------------------------
# Data Guard Status Report Script (Oracle 19c)
# Production-ready with centralized control panel, monitoring, alerts,
# visual dashboards, and controlled auto-healing capabilities
# ------------------------------------------------------------------

# =========================================================================
# CENTRALIZED CONTROL PANEL - TOGGLE FEATURES HERE
# =========================================================================

# 1. Global Monitoring Switches
ENABLE_MONITORING=true            # Master switch for the entire script
ENABLE_AUTO_HEAL=false            # Toggle Auto-Healing capabilities
ENABLE_EMAIL_ALERTS=true          # Toggle Email notifications
ENABLE_PROM_METRICS=true          # Toggle Prometheus .prom file generation
ENABLE_HTML_REPORT=true           # Toggle HTML dashboard generation
ENABLE_HISTORICAL_DATA=true       # Toggle historical data tracking
ENABLE_CSV_EXPORT=true            # Toggle CSV export for Excel
GLOBAL_DRY_RUN=false              # Global dry-run mode (overrides all actions)

# 2. Connection & Security Settings
# Options: "WALLET" (uses /@alias) or "USER" (requires credentials below)
CONNECTION_METHOD="WALLET"
DGMGRL_OPTIONS="-silent"          # Add -xml if you prefer parsing XML output

# For USER method - set credentials (use secure method like env vars in production)
# NOTE(review): in USER mode the password is interpolated into the dgmgrl
# connect string and is therefore visible in `ps` output — prefer WALLET.
DG_MONITOR_USER="dg_monitor"
DG_MONITOR_PASSWORD=""            # Set via environment variable or secure file
DG_MONITOR_PASSWORD_FILE="/etc/oracle/dg_monitor.pwd"  # Alternative secure file

# 3. Environment Context
# Location of the centralized TNS_ADMIN if not in default OH/network/admin
export TNS_ADMIN="/u01/app/oracle/network/admin"
export ORACLE_HOME="/u01/app/oracle/product/19.0.0/dbhome_1"
export PATH=$ORACLE_HOME/bin:$PATH
export LD_LIBRARY_PATH=$ORACLE_HOME/lib:$LD_LIBRARY_PATH

# 4. Target Databases (Central Inventory)
# Add all Primary TNS Aliases here. The script will discover Standbys automatically.
DB_LIST=("PRIM_PROD_KWT" "PRIM_DR_KWT" "PRIM_DEV_CORE")

# 5. Performance & Concurrency
MAX_PARALLEL_JOBS=5               # Increase for large fleets
PARALLEL_SLEEP_INTERVAL=0.5       # Seconds between parallel job checks
SCRIPT_TIMEOUT=600                # Maximum execution time in seconds (10 min)
DGMGRL_TIMEOUT=30                 # Seconds before killing a hung connection
DGMGRL_KILL_TIMEOUT=5             # Seconds after timeout before force kill (timeout -k)
DGMGRL_RETRIES=2                  # Number of retries for transient errors
MAX_HISTORY_LINES=50000           # Rotate history file after this many lines

# 6. Alert Thresholds (in seconds)
WARNING_LAG_THRESHOLD=300         # 5 minutes
CRITICAL_LAG_THRESHOLD=900        # 15 minutes
ALERT_COOLDOWN=900                # 15 minutes between alerts per database

# 7. Auto-Heal Configuration
AUTO_HEAL_DRY_RUN=true            # true = log only, no execution
AUTO_HEAL_COOLDOWN=600            # seconds (10 min) between auto-heal attempts
MAX_AUTO_HEAL_ATTEMPTS=3          # Maximum attempts per standby before giving up
AUTO_HEAL_EXCLUDE=()              # List of standbys to exclude from auto-heal (e.g., ("DR_TEST" "STBY_OLD"))

# 8. Directories
# NOTE(review): /var/log/dataguard usually needs pre-created ownership for
# the monitoring account — confirm mkdir below can succeed unprivileged.
LOG_DIR="/var/log/dataguard"
REPORT_DIR="/tmp/dataguard_reports"
TEMP_DIR="/tmp/dataguard_temp"
CSV_DIR="/tmp/dataguard_csv"

# 9. Email Configuration
MAIL_TO="dba_team@yourcompany.com"
MAIL_FROM="dataguard@$(hostname)"

# 10. Logging Format (json or plain)
LOG_FORMAT="plain"                # Options: "plain" or "json"

# 11. Console Output Colors
ENABLE_COLORS=true                # Toggle colored console output
# =========================================================================
# INTERNAL VARIABLES - DO NOT MODIFY BELOW THIS LINE
# =========================================================================

# Color definitions
# Color definitions.
# Use ANSI-C quoting ($'...') so each variable holds the real ESC byte.
# The previous form ("\e[31m") stored a literal backslash-e, which plain
# `echo "$RED"` (used by get_colored_status etc.) prints verbatim instead
# of rendering the color. Colors are only enabled when requested AND
# stdout is a terminal.
if [ "$ENABLE_COLORS" = true ] && [ -t 1 ]; then
    RED=$'\e[31m'
    GREEN=$'\e[32m'
    YELLOW=$'\e[33m'
    BLUE=$'\e[34m'
    MAGENTA=$'\e[35m'
    CYAN=$'\e[36m'
    BOLD=$'\e[1m'
    NC=$'\e[0m'
else
    RED=""; GREEN=""; YELLOW=""; BLUE=""; MAGENTA=""; CYAN=""; BOLD=""; NC=""
fi

# Derived file paths (timestamped per run so concurrent artifacts never collide)
LOG_FILE="$LOG_DIR/dataguard_report_$(date +%Y%m%d).log"
REPORT_FILE="$REPORT_DIR/dataguard_report_$(date +%Y%m%d_%H%M%S).txt"
RAW_DATA_FILE="$REPORT_DIR/dataguard_raw_$(date +%Y%m%d_%H%M%S).dat"
HISTORICAL_DATA_FILE="$REPORT_DIR/dataguard_history.dat"  # shared across runs
HTML_REPORT="$REPORT_DIR/dataguard_report_$(date +%Y%m%d_%H%M%S).html"
CSV_FILE="$CSV_DIR/dataguard_report_$(date +%Y%m%d_%H%M%S).csv"
PROM_FILE="/tmp/dataguard_metrics.prom"  # presumably a textfile-collector target — confirm scrape config

# Auto-heal state files (one counter file per primary/standby pair)
AUTO_HEAL_COUNT_DIR="/var/tmp/dataguard_heal_counts"
AUTO_HEAL_LOG="$LOG_DIR/dataguard_autoheal.log"

# Alert cooldown files (per database)
ALERT_COOLDOWN_BASE="/var/tmp/dataguard_last_alert"
ALERT_LOCK_BASE="/var/tmp/dataguard_alert_lock"

# Lock files for thread-safe operations. $$ scopes them to this run's PID:
# parallel workers of the same run share them, separate runs do not.
LOG_LOCKFILE="/tmp/dataguard_log_$$.lock"
AUTOHEAL_LOG_LOCKFILE="/tmp/dataguard_autoheallog_$$.lock"
OUTPUT_LOCKFILE="/tmp/dataguard_output_$$.lock"
HTML_LOCKFILE="/tmp/dataguard_html_$$.lock"
PROM_LOCKFILE="/tmp/dataguard_prom_$$.lock"
HISTORY_LOCKFILE="/tmp/dataguard_history_$$.lock"
SCRIPT_LOCK="/tmp/dataguard_main.lock"

# Watchdog PID (populated later; empty until a watchdog is spawned)
WATCHDOG_PID=""
MAIN_PID=$$

# Start timestamp (epoch seconds)
START_TIME=$(date +%s)

# =========================================================================
# INITIALIZATION
# =========================================================================

# Create working directories and shared lock files up front so later
# flock redirections never fail on a missing path.
mkdir -p "$LOG_DIR" "$REPORT_DIR" "$TEMP_DIR" "$AUTO_HEAL_COUNT_DIR" "$CSV_DIR"
touch "$LOG_LOCKFILE" "$AUTOHEAL_LOG_LOCKFILE" "$OUTPUT_LOCKFILE" "$HTML_LOCKFILE" "$PROM_LOCKFILE" "$HISTORY_LOCKFILE"

# Script lock to prevent duplicate runs. FD 200 stays open for the whole
# script lifetime, so the lock is held until exit.
# NOTE(review): helper functions temporarily redirect FD 200 for their own
# locks; bash restores the descriptor after each block, but a distinct FD
# here would be less fragile — confirm before changing.
exec 200>"$SCRIPT_LOCK"
flock -n 200 || {
    echo "ERROR: Another instance of this script is already running. Exiting."
    exit 1
}

# Initialize historical file with its pipe-delimited header on first use
if [ "$ENABLE_HISTORICAL_DATA" = true ] && [ ! -f "$HISTORICAL_DATA_FILE" ]; then
    echo "PrimaryDB|StandbyDB|TransportLagSec|ApplyLagSec|Error|SwitchoverReady|Enabled|ConfigStatus|OverallStatus|Trend|Timestamp" > "$HISTORICAL_DATA_FILE"
fi

# Initialize CSV file (fresh file each run, header row first)
if [ "$ENABLE_CSV_EXPORT" = true ]; then
    echo "\"PrimaryDB\",\"StandbyDB\",\"TransportLag\",\"ApplyLag\",\"Error\",\"SwitchoverReady\",\"Enabled\",\"ConfigStatus\",\"OverallStatus\",\"Trend\",\"Timestamp\"" > "$CSV_FILE"
fi

# Initialize Prometheus metrics file: truncate, then write HELP/TYPE headers
if [ "$ENABLE_PROM_METRICS" = true ]; then
    : > "$PROM_FILE"
    {
        echo "# HELP dataguard_transport_lag_seconds Data Guard transport lag in seconds"
        echo "# TYPE dataguard_transport_lag_seconds gauge"
        echo "# HELP dataguard_apply_lag_seconds Data Guard apply lag in seconds"
        echo "# TYPE dataguard_apply_lag_seconds gauge"
        echo "# HELP dataguard_status Data Guard status (0=error, 1=warning, 2=ok)"
        echo "# TYPE dataguard_status gauge"
        echo "# HELP dataguard_configuration_status Configuration status (0=error, 1=warning, 2=ok)"
        echo "# TYPE dataguard_configuration_status gauge"
    } >> "$PROM_FILE"
fi

# =========================================================================
# UTILITY FUNCTIONS
# =========================================================================

# Escape a value for CSV embedding by doubling embedded double-quotes
# (RFC 4180). printf is used instead of echo so values such as "-n" or
# strings containing backslashes are emitted verbatim.
escape_csv() {
    printf '%s\n' "$1" | sed 's/"/""/g'
}

# Structured logger: writes to the console and to $LOG_FILE under an
# exclusive flock so lines from parallel workers never interleave.
# Args: $1 = message, $2 = level (default INFO; only used in json format).
# No-op when ENABLE_MONITORING is not true.
log_message() {
    [ "$ENABLE_MONITORING" != true ] && return
    local msg="$1"
    local level="${2:-INFO}"
    local timestamp
    timestamp=$(date -Iseconds)

    if [ "$LOG_FORMAT" = "json" ]; then
        # Escape backslashes BEFORE quotes — otherwise the backslash added
        # by quote-escaping would itself get doubled, corrupting the JSON.
        local esc="${msg//\\/\\\\}"
        esc="${esc//\"/\\\"}"
        local json_msg
        json_msg=$(printf '{"timestamp":"%s","level":"%s","message":"%s","host":"%s"}\n' \
            "$timestamp" "$level" "$esc" "$(hostname)")
        {
            flock -x 200
            echo "$json_msg" >> "$LOG_FILE"
        } 200>"$LOG_LOCKFILE"
        # Also print the raw message to the console for readability
        echo "$msg"
    else
        {
            flock -x 200
            echo "[$(date '+%Y-%m-%d %H:%M:%S')] $msg" | tee -a "$LOG_FILE"
        } 200>"$LOG_LOCKFILE"
    fi
}

# Thread-safe auto-heal logger: appends to $AUTO_HEAL_LOG under an
# exclusive flock. No console echo (file-only).
# Args: $1 = message, $2 = level (default INFO; only used in json format).
# No-op when ENABLE_AUTO_HEAL is not true.
log_autoheal() {
    [ "$ENABLE_AUTO_HEAL" != true ] && return
    local msg="$1"
    local level="${2:-INFO}"
    local timestamp
    timestamp=$(date -Iseconds)

    if [ "$LOG_FORMAT" = "json" ]; then
        # Escape backslashes BEFORE quotes so the JSON stays well-formed
        # (quote-escaping introduces backslashes of its own).
        local esc="${msg//\\/\\\\}"
        esc="${esc//\"/\\\"}"
        local json_msg
        json_msg=$(printf '{"timestamp":"%s","level":"%s","message":"%s","component":"autoheal"}\n' \
            "$timestamp" "$level" "$esc")
        {
            flock -x 200
            echo "$json_msg" >> "$AUTO_HEAL_LOG"
        } 200>"$AUTOHEAL_LOG_LOCKFILE"
    else
        {
            flock -x 200
            echo "[$(date '+%Y-%m-%d %H:%M:%S')] $msg" >> "$AUTO_HEAL_LOG"
        } 200>"$AUTOHEAL_LOG_LOCKFILE"
    fi
}

# Thread-safe writer for one result row. Sends the row to the console and
# the text report (and, when enabled, the raw-data and CSV files) under a
# single exclusive lock so rows from parallel workers never interleave.
# Args: $1 = console line, $2 = report line,
#       $3 = raw pipe-delimited line (may be empty to skip),
#       $4 = CSV line (may be empty to skip).
write_output() {
    local console_line="$1"
    local report_line="$2"
    local raw_data_line="$3"
    local csv_line="$4"

    # Write to console and main report files
    {
        flock -x 200
        echo "$console_line"
        echo "$report_line" >> "$REPORT_FILE"
        if [ -n "$raw_data_line" ] && [ "$ENABLE_HISTORICAL_DATA" = true ]; then
            echo "$raw_data_line" >> "$RAW_DATA_FILE"
        fi
        if [ -n "$csv_line" ] && [ "$ENABLE_CSV_EXPORT" = true ]; then
            echo "$csv_line" >> "$CSV_FILE"
        fi
    } 200>"$OUTPUT_LOCKFILE"

    # Write to historical data file with separate lock — readers in
    # detect_trend take this same lock file in shared mode.
    if [ -n "$raw_data_line" ] && [ "$ENABLE_HISTORICAL_DATA" = true ]; then
        {
            flock -x 200
            echo "$raw_data_line" >> "$HISTORICAL_DATA_FILE"
        } 200>"$HISTORY_LOCKFILE"
    fi
}

# Append one (or several newline-joined) pre-formatted metric lines to the
# Prometheus textfile, serialized with an exclusive lock so parallel
# workers never interleave partial writes. No-op when metrics are disabled.
write_prometheus() {
    if [ "$ENABLE_PROM_METRICS" != true ]; then
        return
    fi
    local metric_block="$1"
    {
        flock -x 200
        echo "$metric_block" >> "$PROM_FILE"
    } 200>"$PROM_LOCKFILE"
}

# Append one pre-rendered HTML table row to the dashboard file under an
# exclusive lock (parallel-safe). No-op when the HTML report is disabled.
write_html_row() {
    if [ "$ENABLE_HTML_REPORT" != true ]; then
        return
    fi
    local row_markup="$1"
    {
        flock -x 200
        echo "$row_markup" >> "$HTML_REPORT"
    } 200>"$HTML_LOCKFILE"
}

# =========================================================================
# REMOTE DGMGRL EXECUTION FUNCTIONS
# =========================================================================

# Compose the DGMGRL connect string for a TNS alias.
# WALLET mode emits "/@alias" (external password store). Any other mode is
# treated as USER: the password comes from $DG_MONITOR_PASSWORD or, failing
# that, from $DG_MONITOR_PASSWORD_FILE (newlines stripped). Returns 1 with
# a message on stderr when no password can be found.
# NOTE(review): in USER mode the password ends up on the dgmgrl command
# line and is visible in `ps` — wallet mode is the safer choice.
get_connection_string() {
    local alias="$1"

    case "$CONNECTION_METHOD" in
        WALLET)
            echo "/@$alias"
            ;;
        *)
            local pwd=""
            if [ -n "$DG_MONITOR_PASSWORD" ]; then
                pwd="$DG_MONITOR_PASSWORD"
            elif [ -f "$DG_MONITOR_PASSWORD_FILE" ]; then
                # Strip newlines so a trailing \n in the file cannot break the string
                pwd=$(tr -d '\n' < "$DG_MONITOR_PASSWORD_FILE" 2>/dev/null)
            fi

            if [ -z "$pwd" ]; then
                echo "ERROR: No password available for USER connection method" >&2
                return 1
            fi
            echo "${DG_MONITOR_USER}/$pwd@$alias"
            ;;
    esac
}

# Run DGMGRL commands against a target alias with timeout and retry.
# Args: $1 = TNS alias, $2 = DGMGRL command text (may be multi-line).
# Output: dgmgrl's combined stdout/stderr on stdout.
# Returns: dgmgrl's exit status; 124/137 come from timeout(1) kills.
run_remote_dgmgrl() {
    local target_alias="$1"
    local sql_commands="$2"
    local connection_str=""

    # The command-substitution's exit status survives the plain assignment,
    # so $? below reflects get_connection_string's result.
    connection_str=$(get_connection_string "$target_alias")
    if [ $? -ne 0 ] || [ -z "$connection_str" ]; then
        echo "ERROR: Failed to build connection string for $target_alias"
        return 1
    fi

    local output=""
    local rc=0

    # One initial attempt plus up to DGMGRL_RETRIES retries; only transient
    # failures (timeout kills or listener/connection errors) are retried.
    for ((i=1; i<=DGMGRL_RETRIES+1; i++)); do
        output=$(timeout -k ${DGMGRL_KILL_TIMEOUT}s ${DGMGRL_TIMEOUT}s dgmgrl $DGMGRL_OPTIONS "$connection_str" <<EOF
$sql_commands
exit;
EOF
) 2>&1
        rc=$?

        if [ $rc -eq 0 ]; then
            echo "$output"
            return 0
        fi

        # Check for transient errors: 124 = timeout expiry, 137 = SIGKILL,
        # plus known lost-connection / listener ORA-/TNS- codes in the output.
        local is_transient=0
        if [ $rc -eq 124 ] || [ $rc -eq 137 ]; then
            is_transient=1
        elif grep -q "ORA-03113\|ORA-03114\|ORA-12541\|ORA-12514\|TNS-12541" <<< "$output"; then
            is_transient=1
        fi

        if [ $i -gt $DGMGRL_RETRIES ] || [ $is_transient -eq 0 ]; then
            echo "$output"
            return $rc
        fi

        log_message "Retry $i/$DGMGRL_RETRIES for $target_alias (exit code: $rc)"
        sleep 2
    done

    echo "$output"
    return 1
}

# Alias for backward compatibility with earlier revisions of this script.
# Forwards exactly two arguments: $1 = TNS alias, $2 = DGMGRL command text.
run_dgmgrl() {
    run_remote_dgmgrl "$1" "$2"
}

# =========================================================================
# LAG PROCESSING FUNCTIONS
# =========================================================================

# Convert a human-readable DGMGRL lag string ("2 days", "15 minutes", ...)
# into total seconds. Accepts singular and plural unit names ("1 hour" as
# well as "3 hours") and any combination of units; zero-padded numbers are
# forced to base 10 so "08"/"09" cannot trip bash's octal parsing.
# Echoes -1 for empty, "N/A", or unknown values so callers can tell
# "no data" apart from zero lag.
lag_to_seconds() {
    local lag_value="$1"
    local seconds=0

    if [[ -z "$lag_value" ]] || [[ "$lag_value" == "N/A" ]] || [[ "$lag_value" =~ unknown|UNKNOWN ]]; then
        echo -1
        return
    fi

    if [[ "$lag_value" =~ ([0-9]+)\ days? ]]; then
        seconds=$((seconds + 10#${BASH_REMATCH[1]} * 86400))
    fi
    if [[ "$lag_value" =~ ([0-9]+)\ hours? ]]; then
        seconds=$((seconds + 10#${BASH_REMATCH[1]} * 3600))
    fi
    if [[ "$lag_value" =~ ([0-9]+)\ minutes? ]]; then
        seconds=$((seconds + 10#${BASH_REMATCH[1]} * 60))
    fi
    if [[ "$lag_value" =~ ([0-9]+)\ seconds? ]]; then
        seconds=$((seconds + 10#${BASH_REMATCH[1]}))
    fi

    echo $seconds
}

# Normalize a DGMGRL interval lag ("+DD HH:MM:SS") into a single
# human-friendly value using the largest applicable unit
# (days > hours > minutes > seconds). Values not matching the interval
# format are passed through unchanged; empty/"N/A" yields "N/A".
parse_lag() {
    local lag_value="$1"

    if [[ -z "$lag_value" ]] || [[ "$lag_value" == "N/A" ]]; then
        echo "N/A"
        return
    fi

    if [[ "$lag_value" =~ \+([0-9]{2})\ ([0-9]{2}):([0-9]{2}):([0-9]{2}) ]]; then
        # Force base-10: zero-padded fields like "08"/"09" would otherwise
        # be parsed as invalid octal and abort the arithmetic expansion.
        local days=$((10#${BASH_REMATCH[1]}))
        local hours=$((10#${BASH_REMATCH[2]}))
        local minutes=$((10#${BASH_REMATCH[3]}))
        local seconds=$((10#${BASH_REMATCH[4]}))
        local total_seconds=$((days * 86400 + hours * 3600 + minutes * 60 + seconds))

        if [ $total_seconds -gt 0 ]; then
            if [ $total_seconds -ge 86400 ]; then
                echo "$days days"
            elif [ $total_seconds -ge 3600 ]; then
                echo "$((hours + days * 24)) hours"
            elif [ $total_seconds -ge 60 ]; then
                echo "$minutes minutes"
            else
                echo "$seconds seconds"
            fi
        else
            echo "0 seconds"
        fi
    else
        echo "$lag_value"
    fi
}

# Map a lag (in seconds) onto OK/WARNING/CRITICAL using the configured
# thresholds. The sentinel -1 — and, for robustness, any non-numeric
# input (the original unquoted test would abort on an empty string) —
# is reported as UNKNOWN.
check_lag_threshold() {
    local lag_seconds="$1"

    if ! [[ "$lag_seconds" =~ ^-?[0-9]+$ ]] || [ "$lag_seconds" -eq -1 ]; then
        echo "UNKNOWN"
    elif [ "$lag_seconds" -ge "$CRITICAL_LAG_THRESHOLD" ]; then
        echo "CRITICAL"
    elif [ "$lag_seconds" -ge "$WARNING_LAG_THRESHOLD" ]; then
        echo "WARNING"
    else
        echo "OK"
    fi
}

# =========================================================================
# STATUS DETERMINATION FUNCTIONS
# =========================================================================

# Collapse the individual health signals into one overall verdict.
# Precedence: infrastructure failure > configuration WARNING/ERROR >
# broker-reported error > lag criticality > OK.
# Args: $1 broker status/error, $2 transport lag status, $3 apply lag
# status, $4 configuration status.
get_overall_status() {
    local broker_err="$1"
    local transport="$2"
    local apply="$3"
    local config="$4"

    # Infrastructure failures trump everything else.
    case "$broker_err" in
        "Show command failed"|"Validate command failed"|"Command timeout")
            echo "INFRA_ERROR"
            return
            ;;
    esac

    # Configuration-level problems come next.
    case "$config" in
        WARNING) echo "WARNING"; return ;;
        ERROR)   echo "ERROR";   return ;;
    esac

    # Broker-reported errors, then lag severity.
    if [[ "$broker_err" != "None" && "$broker_err" != "SUCCESS" ]]; then
        echo "ERROR"
    elif [[ "$transport" == "CRITICAL" || "$apply" == "CRITICAL" ]]; then
        echo "CRITICAL"
    elif [[ "$transport" == "WARNING" || "$apply" == "WARNING" ]]; then
        echo "WARNING"
    else
        echo "OK"
    fi
}

# Decorate a status keyword with its console color and icon; unknown
# values are echoed back unchanged.
get_colored_status() {
    local verdict="$1"
    local decorated
    case "$verdict" in
        OK)          decorated="${GREEN}✓ OK${NC}" ;;
        WARNING)     decorated="${YELLOW}⚠ WARNING${NC}" ;;
        CRITICAL)    decorated="${RED}🔴 CRITICAL${NC}" ;;
        ERROR)       decorated="${RED}✗ ERROR${NC}" ;;
        INFRA_ERROR) decorated="${MAGENTA}⚙ INFRA_ERROR${NC}" ;;
        *)           decorated="$verdict" ;;
    esac
    echo "$decorated"
}

# Map an overall status onto the Prometheus gauge scale:
# 0 = error/critical/infra failure, 1 = warning, 2 = healthy/other.
get_numeric_status() {
    local s="$1"
    if [[ "$s" == "ERROR" || "$s" == "CRITICAL" || "$s" == "INFRA_ERROR" ]]; then
        echo 0
    elif [[ "$s" == "WARNING" ]]; then
        echo 1
    else
        echo 2
    fi
}

# Map a configuration status onto the Prometheus gauge scale:
# 0 = ERROR, 1 = WARNING, 2 = anything else (treated as healthy).
get_numeric_config_status() {
    local cfg="$1"
    if [[ "$cfg" == "ERROR" ]]; then
        echo 0
    elif [[ "$cfg" == "WARNING" ]]; then
        echo 1
    else
        echo 2
    fi
}

# =========================================================================
# TREND DETECTION
# =========================================================================

# Compare the current transport-lag seconds for a primary/standby pair
# against the previously recorded value and report the direction.
# Echoes INCREASING / DECREASING / STABLE, or UNKNOWN when history is
# disabled, the file is missing, or either value is non-numeric.
# Args: $1 primary name, $2 standby name, $3 current lag in seconds.
detect_trend() {
    [ "$ENABLE_HISTORICAL_DATA" != true ] && { echo "UNKNOWN"; return; }
    
    local primary_db="$1"
    local standby="$2"
    local current_seconds="$3"
    local prev_seconds=""
    
    if [ -f "$HISTORICAL_DATA_FILE" ]; then
        {
            flock -s 200
            # Second-to-last matching row = the previous run's sample.
            # NOTE(review): this assumes the current run's row has already
            # been appended — confirm call order relative to write_output.
            # Also, DB names containing regex metacharacters would
            # mis-match this grep pattern.
            prev_seconds=$(grep "^${primary_db}|${standby}|" "$HISTORICAL_DATA_FILE" | tail -2 | head -1 | awk -F'|' '{print $3}' | xargs)
        } 200>"$HISTORY_LOCKFILE"
        
        if [[ -n "$prev_seconds" ]] && [[ "$prev_seconds" =~ ^[0-9]+$ ]] && [[ "$current_seconds" =~ ^[0-9]+$ ]]; then
            if [ "$current_seconds" -gt "$prev_seconds" ]; then
                echo "INCREASING"
            elif [ "$current_seconds" -lt "$prev_seconds" ]; then
                echo "DECREASING"
            else
                echo "STABLE"
            fi
            return
        fi
    fi
    echo "UNKNOWN"
}

# Decorate a trend keyword with its console arrow and color; unknown
# values are echoed back unchanged.
get_colored_trend() {
    local direction="$1"
    local label
    case "$direction" in
        INCREASING) label="${RED}▲ INCREASING${NC}" ;;
        DECREASING) label="${GREEN}▼ DECREASING${NC}" ;;
        STABLE)     label="${BLUE}● STABLE${NC}" ;;
        *)          label="$direction" ;;
    esac
    echo "$label"
}

# =========================================================================
# HTML ESCAPE
# =========================================================================

# Minimal HTML entity escaping for embedding arbitrary strings in the
# report. '&' is replaced first so it does not re-escape the entities
# added afterwards. printf is used instead of echo so values like "-n"
# are emitted verbatim.
html_escape() {
    local str="$1"
    str="${str//&/&amp;}"
    str="${str//</&lt;}"
    str="${str//>/&gt;}"
    str="${str//\"/&quot;}"
    str="${str//\'/&#39;}"
    printf '%s\n' "$str"
}

# =========================================================================
# AUTO-HEAL FUNCTIONS
# =========================================================================

# Read the persisted auto-heal attempt counter for a primary/standby
# pair; echoes 0 when no counter file exists yet. A shared flock guards
# against reading a half-written counter.
get_heal_attempts() {
    local primary="$1"
    local standby="$2"
    local counter="$AUTO_HEAL_COUNT_DIR/${primary}_${standby}.count"

    {
        flock -s 200
        if [ -f "$counter" ]; then
            cat "$counter"
        else
            echo "0"
        fi
    } 200>"${counter}.lock"
}

# Atomically bump the auto-heal attempt counter for a pair. The exclusive
# flock spans read + write so two workers cannot both see the same old
# value; a missing file counts as 0.
increment_heal_attempts() {
    local primary="$1"
    local standby="$2"
    local counter="$AUTO_HEAL_COUNT_DIR/${primary}_${standby}.count"
    {
        flock -x 200
        local previous
        previous=$(cat "$counter" 2>/dev/null || echo 0)
        echo $((previous + 1)) > "$counter"
    } 200>"${counter}.lock"
}

# Clear the attempt counter (and its lock file) for a pair once it has
# returned to a healthy state.
reset_heal_attempts() {
    local counter="$AUTO_HEAL_COUNT_DIR/${1}_${2}.count"
    rm -f -- "$counter" "${counter}.lock"
}

# Return 0 (true) when the given standby appears in AUTO_HEAL_EXCLUDE,
# 1 otherwise (including when the exclusion list is empty).
is_excluded() {
    local candidate="$1"
    local entry
    for entry in "${AUTO_HEAL_EXCLUDE[@]}"; do
        [ "$entry" = "$candidate" ] && return 0
    done
    return 1
}

# Controlled auto-healing for one primary/standby pair. Applies a bounded
# set of broker fixes (restart apply, re-enable transport, enable
# configuration), guarded by: the global and auto-heal dry-run flags, an
# exclusion list, a per-pair cooldown window, and a per-pair attempt cap.
# Args: $1 primary alias, $2 standby name, $3 broker status/error text,
#       $4 apply rate, $5 apply lag seconds, $6 transport-disconnected
#       flag (0/1), $7 configuration status, $8 database role,
#       $9 overall status.
auto_heal() {
    [ "$ENABLE_AUTO_HEAL" != true ] && return
    [ "$GLOBAL_DRY_RUN" = true ] && { log_autoheal "[DRY-RUN] Global dry-run active, skipping auto-heal"; return; }
    
    local PRIMARY="$1"
    local STBY="$2"
    local STATUS_ERR="$3"
    local APPLY_RATE="$4"
    local APPLY_SECONDS="$5"
    local TRANSPORT_DISCONNECTED="$6"
    local CONFIG_STATUS="$7"
    local DB_ROLE="$8"
    local OVERALL_STATUS="$9"
    
    # Check if standby is excluded
    if is_excluded "$STBY"; then
        log_autoheal "[EXCLUDED] $PRIMARY -> $STBY is in exclusion list, skipping"
        return
    fi
    
    # Check role - only heal from PRIMARY
    if [[ "$DB_ROLE" != "PRIMARY" ]]; then
        log_autoheal "[SKIP] $PRIMARY is not PRIMARY (role=$DB_ROLE), skipping auto-heal"
        return
    fi
    
    # Check max attempts - only if not OK
    local attempts=$(get_heal_attempts "$PRIMARY" "$STBY")
    if [ "$OVERALL_STATUS" != "OK" ] && [ "$attempts" -ge "$MAX_AUTO_HEAL_ATTEMPTS" ]; then
        log_autoheal "[MAX_ATTEMPTS] $PRIMARY -> $STBY has reached max attempts ($MAX_AUTO_HEAL_ATTEMPTS), skipping"
        return
    fi
    
    # Per-standby cooldown lock. The subshell's exit status carries the
    # verdict: 1 = still inside the cooldown window, 0 = cleared to heal
    # (and the timestamp file was refreshed under the lock).
    local LOCK_FILE="/var/tmp/dataguard_autoheal_${PRIMARY}_${STBY}.lock"
    (
        flock -x 200
        local now=$(date +%s)
        if [ -f "$LOCK_FILE" ]; then
            local last=$(cat "$LOCK_FILE")
            if (( now - last < AUTO_HEAL_COOLDOWN )); then
                exit 1
            fi
        fi
        echo "$now" > "$LOCK_FILE"
        exit 0
    ) 200>"${LOCK_FILE}.lck"
    
    if [ $? -eq 1 ]; then
        log_autoheal "[COOLDOWN] Active for $PRIMARY -> $STBY, skipping"
        return
    fi

    log_autoheal "[EVALUATE] $PRIMARY -> $STBY | Status: $STATUS_ERR | Rate: $APPLY_RATE | Lag: ${APPLY_SECONDS}s | Config: $CONFIG_STATUS | Attempt: $((attempts+1))/$MAX_AUTO_HEAL_ATTEMPTS"

    # Helper (redefined on each call): run one broker fix through the
    # primary, honoring AUTO_HEAL_DRY_RUN, and log the outcome.
    run_fix() {
        local cmd="$1"
        local description="$2"

        if [ "$AUTO_HEAL_DRY_RUN" = true ]; then
            log_autoheal "[DRY-RUN] $description: $cmd"
            return 0
        fi

        log_autoheal "[EXEC] $description: $cmd"
        run_dgmgrl "$PRIMARY" "$cmd" >> "$AUTO_HEAL_LOG" 2>&1
        local exit_code=$?
        
        if [ $exit_code -eq 0 ]; then
            log_autoheal "[SUCCESS] $description completed"
            return 0
        else
            log_autoheal "[FAILURE] $description failed (exit: $exit_code)"
            return 1
        fi
    }

    local fix_applied=0

    # CASE 1: MRP NOT RUNNING
    if [[ "$STATUS_ERR" == "MRP not running" ]]; then
        log_autoheal "[ACTION] MRP not running - restarting apply"
        run_fix "EDIT DATABASE '$STBY' SET STATE='APPLY-ON';" "Restart MRP on $STBY"
        fix_applied=1
    fi

    # CASE 2: APPLY STALLED (fixed regex)
    if [[ "$APPLY_RATE" =~ ^0(\.0+)?[[:space:]] ]] && [ "$APPLY_SECONDS" -gt 300 ] && [ "$APPLY_SECONDS" -ne -1 ]; then
        log_autoheal "[ACTION] Apply stalled - restarting apply (stop then start)"
        run_fix "EDIT DATABASE '$STBY' SET STATE='APPLY-OFF';" "Stop apply on $STBY"
        sleep 5
        run_fix "EDIT DATABASE '$STBY' SET STATE='APPLY-ON';" "Start apply on $STBY"
        fix_applied=1
    fi

    # CASE 3: TRANSPORT DISCONNECTED (with config status check)
    if [ "$TRANSPORT_DISCONNECTED" -eq 1 ] && [[ "$CONFIG_STATUS" == "SUCCESS" ]]; then
        log_autoheal "[ACTION] Transport disconnected - re-enabling"
        run_fix "EDIT DATABASE '$PRIMARY' SET STATE='TRANSPORT-ON';" "Enable transport on $PRIMARY"
        fix_applied=1
    fi

    # CASE 4: CONFIG DISABLED
    if [[ "$CONFIG_STATUS" == "DISABLED" ]] || [[ "$CONFIG_STATUS" == "disabled" ]]; then
        log_autoheal "[ACTION] Configuration disabled - enabling"
        run_fix "ENABLE CONFIGURATION;" "Enable Data Guard configuration"
        fix_applied=1
    fi
    
    # Update attempt counter based on overall status: healthy resets the
    # counter, an applied fix counts toward MAX_AUTO_HEAL_ATTEMPTS.
    if [ "$OVERALL_STATUS" = "OK" ]; then
        reset_heal_attempts "$PRIMARY" "$STBY"
    elif [ $fix_applied -eq 1 ]; then
        increment_heal_attempts "$PRIMARY" "$STBY"
    fi
}

# =========================================================================
# EMAIL ALERT FUNCTION
# =========================================================================

# Send an alert email for one database, rate-limited by a per-database
# cooldown window (ALERT_COOLDOWN seconds). When no mail utility is
# installed, the alert text is written to REPORT_DIR instead.
# Args: $1 subject, $2 file containing the alert body, $3 database name.
send_alert_email() {
    [ "$ENABLE_EMAIL_ALERTS" != true ] && {
        log_message "Email alerts disabled, skipping"
        return
    }
    [ "$GLOBAL_DRY_RUN" = true ] && { log_message "[DRY-RUN] Global dry-run active, skipping email"; return; }
    
    local subject="$1"
    local body_file="$2"
    local db="$3"
    
    # Per-database cooldown check with per-database lock. The subshell's
    # exit status carries the verdict: 1 = within cooldown, 0 = cleared to
    # send (timestamp refreshed under the lock).
    local cooldown_file="${ALERT_COOLDOWN_BASE}_${db}.lock"
    local lock_file="${ALERT_LOCK_BASE}_${db}.lock"
    (
        flock -x 200
        if [ -f "$cooldown_file" ]; then
            local last=$(cat "$cooldown_file")
            local now=$(date +%s)
            if (( now - last < ALERT_COOLDOWN )); then
                exit 1
            fi
        fi
        date +%s > "$cooldown_file"
        exit 0
    ) 200>"$lock_file"
    
    [ $? -eq 1 ] && { log_message "Alert suppressed for $db (cooldown)"; return; }
    
    local email_body="Data Guard Alert Report\n"
    email_body+="Host: $(hostname)\n"
    email_body+="Database: $db\n"
    email_body+="Time: $(date)\n"
    email_body+="================================\n\n"
    email_body+="$(cat "$body_file")\n"
    
    # Prefer mailx (supports -r to set the envelope sender); plain mail(1)
    # lacks -r, so MAIL_FROM is only honored on the mailx path.
    if command -v mailx >/dev/null 2>&1; then
        echo -e "$email_body" | mailx -s "$subject" -r "$MAIL_FROM" "$MAIL_TO"
        log_message "Alert email sent via mailx for $db"
    elif command -v mail >/dev/null 2>&1; then
        echo -e "$email_body" | mail -s "$subject" "$MAIL_TO"
        log_message "Alert email sent via mail for $db"
    else
        log_message "WARNING: mail utility not found"
        echo "$email_body" > "$REPORT_DIR/alert_manual_${db}_$(date +%Y%m%d_%H%M%S).txt"
    fi
}

# =========================================================================
# WALLET VALIDATION
# =========================================================================

# Probe broker connectivity for every database in DB_LIST and prune the
# unreachable ones out of the list. Returns non-zero only when no
# database at all answered (callers should abort in that case).
validate_wallet() {
    [ "$ENABLE_MONITORING" != true ] && return 0

    log_message "Validating connectivity..."
    local unreachable=()
    local db
    for db in "${DB_LIST[@]}"; do
        # A working broker must answer "show configuration"
        if ! run_dgmgrl "$db" "show configuration;" >/dev/null 2>&1; then
            log_message "WARNING: Connection validation failed for $db (will skip)"
            unreachable+=("$db")
        fi
    done

    if [ ${#unreachable[@]} -eq ${#DB_LIST[@]} ]; then
        log_message "ERROR: All databases failed validation. Exiting."
        return 1
    fi

    # Rebuild DB_LIST keeping only the databases that answered
    local reachable=()
    local bad
    for db in "${DB_LIST[@]}"; do
        local keep=1
        for bad in "${unreachable[@]}"; do
            if [ "$db" = "$bad" ]; then
                keep=0
                break
            fi
        done
        [ $keep -eq 1 ] && reachable+=("$db")
    done
    DB_LIST=("${reachable[@]}")

    log_message "Connection validation successful for ${#DB_LIST[@]} databases"
    return 0
}

# =========================================================================
# ROTATE HISTORICAL FILE
# =========================================================================

# Trim the shared history file down to MAX_HISTORY_LINES once it grows
# past that size, keeping only the newest entries (rewrite-and-rename so
# readers never see a truncated file mid-write).
rotate_historical_file() {
    [ "$ENABLE_HISTORICAL_DATA" != true ] && return
    [ -f "$HISTORICAL_DATA_FILE" ] || return 0

    local line_count
    line_count=$(wc -l < "$HISTORICAL_DATA_FILE")
    if [ "$line_count" -gt "$MAX_HISTORY_LINES" ]; then
        log_message "Rotating historical file (${line_count} lines, max ${MAX_HISTORY_LINES})"
        tail -n "$MAX_HISTORY_LINES" "$HISTORICAL_DATA_FILE" > "${HISTORICAL_DATA_FILE}.tmp"
        mv "${HISTORICAL_DATA_FILE}.tmp" "$HISTORICAL_DATA_FILE"
    fi
}

# =========================================================================
# MAIN DATABASE PROCESSING
# =========================================================================

process_database() {
    local DB="$1"
    local TMPFILE_LOCAL=""
    
    local tmppath
    tmppath=$(mktemp)
    trap "rm -f '$tmppath'" RETURN
    TMPFILE_LOCAL="$tmppath"
    
    local PRIMARY_DB=""
    local DB_ROLE=""
    local STANDBY_ARRAY=()
    local CONNECTION_ERROR=0
    local CONFIG_STATUS=""
    local FSFO_ENABLED=0
    
    log_message "Processing database: $DB"
    
    # Get configuration
    run_dgmgrl "$DB" "show configuration;" > "$TMPFILE_LOCAL" 2>&1
    local dgmgrl_exit=$?
    
    if [ $dgmgrl_exit -eq 124 ] || [ $dgmgrl_exit -eq 137 ]; then
        log_message "ERROR: Timeout connecting to $DB"
        CONNECTION_ERROR=1
    fi
    
    CONFIG_STATUS=$(grep -i "Configuration Status" "$TMPFILE_LOCAL" | awk -F": " '{print $2}' | xargs)
    CONFIG_STATUS=${CONFIG_STATUS:-"Unknown"}
    write_prometheus "dataguard_configuration_status{primary=\"$DB\"} $(get_numeric_config_status "$CONFIG_STATUS")"
    
    # Extract database role
    DB_ROLE=$(grep -i "Database Role" "$TMPFILE_LOCAL" | awk -F": " '{print $2}' | xargs)
    DB_ROLE=${DB_ROLE:-"UNKNOWN"}
    
    # Detect FSFO (Fast-Start Failover) - once per database
    grep -qi "Fast-Start Failover: ENABLED" "$TMPFILE_LOCAL" && FSFO_ENABLED=1
    
    # Check for errors
    if [ $CONNECTION_ERROR -eq 0 ] && grep -E "ORA-|DGM-[0-9]{5}|Error:" "$TMPFILE_LOCAL" | grep -qv "ORA-16809" 2>/dev/null; then
        local error_msg=$(grep -E "ORA-|DGM-[0-9]{5}|Error:" "$TMPFILE_LOCAL" | grep -v "ORA-16809" | head -1)
        log_message "WARNING: DGMGRL error on $DB: $error_msg"
        CONNECTION_ERROR=1
    fi
    
    # Handle connection failure
    if [ $CONNECTION_ERROR -eq 1 ]; then
        local colored_status="${RED}INFRA_ERROR${NC}"
        local output_line=$(printf "%-15s %-20s %-15s %-15s %-20s %-20s %-10s %-15s ${RED}%s${NC} %-10s %-20s\n" \
            "$DB" "N/A" "ERROR" "ERROR" "DGMGRL connection failed" "Unknown" "Unknown" "$CONFIG_STATUS" "INFRA_ERROR" "UNKNOWN" "$(date '+%Y-%m-%d %H:%M:%S')")
        local raw_line="$DB|N/A|-1|-1|DGMGRL connection failed|Unknown|Unknown|$CONFIG_STATUS|INFRA_ERROR|UNKNOWN|$(date '+%Y-%m-%d %H:%M:%S')"
        local csv_line="\"$(escape_csv "$DB")\",\"N/A\",\"ERROR\",\"ERROR\",\"DGMGRL connection failed\",\"Unknown\",\"Unknown\",\"$(escape_csv "$CONFIG_STATUS")\",\"INFRA_ERROR\",\"UNKNOWN\",\"$(date '+%Y-%m-%d %H:%M:%S')\""
        write_output "$output_line" "$output_line" "$raw_line" "$csv_line"
        return 1
    fi
    
    # Extract primary database
    PRIMARY_DB=$(grep -i "primary database" "$TMPFILE_LOCAL" | awk -F'"' '{print $2}' | xargs)
    [ -z "$PRIMARY_DB" ] && PRIMARY_DB=$(grep -i "primary database" "$TMPFILE_LOCAL" | awk '{print $NF}' | tr -d '"')
    
    local PROM_PRIMARY="$DB"
    local DISPLAY_PRIMARY="${PRIMARY_DB:-$DB} ($DB)"
    local DISPLAY_PRIMARY_ESCAPED=$(html_escape "$DISPLAY_PRIMARY")
    
    # Extract standbys
    mapfile -t STANDBY_ARRAY < <(grep "Physical standby database" "$TMPFILE_LOCAL" | awk -F'"' '{print $2}')
    if [ ${#STANDBY_ARRAY[@]} -eq 0 ]; then
        mapfile -t STANDBY_ARRAY < <(grep "Physical standby database" "$TMPFILE_LOCAL" | awk '{print $NF}' | tr -d '"')
    fi
    
    if [ ${#STANDBY_ARRAY[@]} -eq 0 ]; then
        log_message "INFO: No standbys found for ${PRIMARY_DB:-$DB}"
        return 0
    fi
    
    # Process each standby
    for STBY in "${STANDBY_ARRAY[@]}"; do
        : > "$TMPFILE_LOCAL"
        
        # Combined commands - single connection per standby for efficiency
        run_dgmgrl "$DB" "
show database verbose '$STBY';
validate database '$STBY';
" > "$TMPFILE_LOCAL" 2>&1
        local combined_exit=$?
        
        # Timeout handling
        if [ $combined_exit -eq 124 ] || [ $combined_exit -eq 137 ]; then
            log_message "ERROR: Timeout processing standby $STBY on $PRIMARY_DB"
            local colored_status="${RED}INFRA_ERROR${NC}"
            local output_line=$(printf "%-15s %-20s %-15s %-15s %-20s %-20s %-10s %-15s ${RED}%s${NC} %-10s %-20s\n" \
                "$DISPLAY_PRIMARY" "$STBY" "TIMEOUT" "TIMEOUT" "Command timeout" "Unknown" "Unknown" "$CONFIG_STATUS" "INFRA_ERROR" "UNKNOWN" "$(date '+%Y-%m-%d %H:%M:%S')")
            local raw_line="${PRIMARY_DB:-$DB}|$STBY|-1|-1|Command timeout|Unknown|Unknown|$CONFIG_STATUS|INFRA_ERROR|UNKNOWN|$(date '+%Y-%m-%d %H:%M:%S')"
            local csv_line="\"$(escape_csv "${PRIMARY_DB:-$DB}")\",\"$(escape_csv "$STBY")\",\"TIMEOUT\",\"TIMEOUT\",\"Command timeout\",\"Unknown\",\"Unknown\",\"$(escape_csv "$CONFIG_STATUS")\",\"INFRA_ERROR\",\"UNKNOWN\",\"$(date '+%Y-%m-%d %H:%M:%S')\""
            write_output "$output_line" "$output_line" "$raw_line" "$csv_line"
            
            write_prometheus "dataguard_transport_lag_seconds{primary=\"$PROM_PRIMARY\",standby=\"$STBY\"} -1
dataguard_apply_lag_seconds{primary=\"$PROM_PRIMARY\",standby=\"$STBY\"} -1
dataguard_status{primary=\"$PROM_PRIMARY\",standby=\"$STBY\",status=\"INFRA_ERROR\",config=\"$CONFIG_STATUS\"} 0"
            continue
        fi
        
        # Check output presence
        local show_output_present=0
        local validate_output_present=0
        
        grep -q "Database Name:" "$TMPFILE_LOCAL" 2>/dev/null && show_output_present=1
        grep -q "Transport Lag:" "$TMPFILE_LOCAL" 2>/dev/null && show_output_present=1
        grep -q "Intended State:" "$TMPFILE_LOCAL" 2>/dev/null && show_output_present=1
        grep -q "Ready for Switchover:" "$TMPFILE_LOCAL" 2>/dev/null && validate_output_present=1
        grep -q "Ready for Failover:" "$TMPFILE_LOCAL" 2>/dev/null && validate_output_present=1
        
        # Parse values
        local TRANSPORT_LAG_RAW=""
        local APPLY_LAG_RAW=""
        local STATUS_ERR=""
        local INTENDED_STATE=""
        local ENABLED=""
        local SWITCHOVER=""
        local APPLY_RATE=""
        local TRANSPORT_DISCONNECTED=0
        
        if [ $show_output_present -eq 1 ]; then
            TRANSPORT_LAG_RAW=$(awk -F": " '/Transport Lag/ {print $2}' "$TMPFILE_LOCAL" | xargs)
            APPLY_LAG_RAW=$(awk -F": " '/Apply Lag/ {print $2}' "$TMPFILE_LOCAL" | xargs)
            STATUS_ERR=$(awk -F": " '/Status:|Error:/ {print $2}' "$TMPFILE_LOCAL" | grep -v "SUCCESS" | head -1 | xargs)
            INTENDED_STATE=$(awk -F": " '/Intended State/ {print $2}' "$TMPFILE_LOCAL" | xargs)
            ENABLED=$(awk -F": " '/^Enabled:/ {print $2}' "$TMPFILE_LOCAL" | head -1 | xargs)
            APPLY_RATE=$(awk -F": " '/Apply Rate/ {print $2}' "$TMPFILE_LOCAL" | xargs)
            
            # Check MRP
            if grep -qi "Apply Instance.*not running" "$TMPFILE_LOCAL"; then
                STATUS_ERR="MRP not running"
                log_message "WARNING: MRP not running for $STBY"
            fi
            
            # Check transport
            if grep -qi "DISCONNECTED" "$TMPFILE_LOCAL"; then
                TRANSPORT_DISCONNECTED=1
                log_message "WARNING: Transport disconnected for $STBY"
            fi
        else
            STATUS_ERR="Show command failed"
        fi
        
        if [ $validate_output_present -eq 1 ]; then
            SWITCHOVER=$(awk -F": " '/Ready for Switchover/ {print $2}' "$TMPFILE_LOCAL" | xargs)
        fi
        
        # Parse lag
        local TRANSPORT_LAG=$(parse_lag "$TRANSPORT_LAG_RAW")
        local APPLY_LAG=$(parse_lag "$APPLY_LAG_RAW")
        local transport_seconds=$(lag_to_seconds "$TRANSPORT_LAG")
        local apply_seconds=$(lag_to_seconds "$APPLY_LAG")
        
        # Set defaults
        TRANSPORT_LAG=${TRANSPORT_LAG:-"N/A"}
        APPLY_LAG=${APPLY_LAG:-"N/A"}
        STATUS_ERR=${STATUS_ERR:-"None"}
        ENABLED=${ENABLED:-"Unknown"}
        SWITCHOVER=${SWITCHOVER:-"Unknown"}
        
        # Validate failures
        if [ $validate_output_present -eq 0 ] && [ "$STATUS_ERR" = "None" ]; then
            STATUS_ERR="Validate command failed"
        fi
        
        # Check thresholds
        local transport_status=$(check_lag_threshold $transport_seconds)
        local apply_status=$(check_lag_threshold $apply_seconds)
        
        # Apply stall detection (fixed regex)
        if [[ "$APPLY_RATE" =~ ^0(\.0+)?[[:space:]] ]] && [ "$apply_seconds" -gt 60 ] && [ "$apply_seconds" -ne -1 ]; then
            STATUS_ERR="Apply stalled"
            apply_status="CRITICAL"
            log_message "WARNING: Apply stalled for $STBY (rate=$APPLY_RATE, lag=${apply_seconds}s)"
        fi
        
        # Override critical conditions
        [ "$STATUS_ERR" = "MRP not running" ] && apply_status="CRITICAL"
        [ $TRANSPORT_DISCONNECTED -eq 1 ] && transport_status="CRITICAL"
        
        # Detect trend
        local TREND=$(detect_trend "${PRIMARY_DB:-$DB}" "$STBY" "$transport_seconds")
        
        # Overall status
        local OVERALL_STATUS=$(get_overall_status "$STATUS_ERR" "$transport_status" "$apply_status" "$CONFIG_STATUS")
        local NUMERIC_STATUS=$(get_numeric_status "$OVERALL_STATUS")
        
        # Auto-heal hook (skip if FSFO is enabled)
        if [ $FSFO_ENABLED -eq 0 ]; then
            auto_heal "${PRIMARY_DB:-$DB}" "$STBY" "$STATUS_ERR" "$APPLY_RATE" "$apply_seconds" "$TRANSPORT_DISCONNECTED" "$CONFIG_STATUS" "$DB_ROLE" "$OVERALL_STATUS"
        elif [ $FSFO_ENABLED -eq 1 ]; then
            log_message "INFO: FSFO enabled for $PRIMARY_DB, auto-heal disabled for this configuration"
        fi
        
        # HTML classes
        local row_class="ok"
        case "$OVERALL_STATUS" in
            "CRITICAL"|"ERROR"|"INFRA_ERROR") row_class="crit" ;;
            "WARNING") row_class="warn" ;;
        esac
        
        local trend_class=""
        case "$TREND" in
            "INCREASING") trend_class="trend-up" ;;
            "DECREASING") trend_class="trend-down" ;;
            *) trend_class="trend-stable" ;;
        esac
        
        # Escape HTML
        local STBY_ESCAPED=$(html_escape "$STBY")
        local STATUS_ERR_ESCAPED=$(html_escape "$STATUS_ERR")
        local SWITCHOVER_ESCAPED=$(html_escape "$SWITCHOVER")
        local ENABLED_ESCAPED=$(html_escape "$ENABLED")
        local CONFIG_STATUS_ESCAPED=$(html_escape "$CONFIG_STATUS")
        
        # Colored output
        local colored_status=$(get_colored_status "$OVERALL_STATUS")
        local colored_trend=$(get_colored_trend "$TREND")
        
        # Output
        local output_line=$(printf "%-15s %-20s %-15s %-15s %-20s %-20s %-10s %-15s ${colored_status} ${colored_trend} %-20s\n" \
            "$DISPLAY_PRIMARY" "$STBY" "$TRANSPORT_LAG" "$APPLY_LAG" "$STATUS_ERR" "$SWITCHOVER" "$ENABLED" "$CONFIG_STATUS" "$(date '+%Y-%m-%d %H:%M:%S')")
        
        local raw_line="${PRIMARY_DB:-$DB}|$STBY|$transport_seconds|$apply_seconds|$STATUS_ERR|$SWITCHOVER|$ENABLED|$CONFIG_STATUS|$OVERALL_STATUS|$TREND|$(date '+%Y-%m-%d %H:%M:%S')"
        local csv_line="\"$(escape_csv "${PRIMARY_DB:-$DB}")\",\"$(escape_csv "$STBY")\",\"$(escape_csv "$TRANSPORT_LAG")\",\"$(escape_csv "$APPLY_LAG")\",\"$(escape_csv "$STATUS_ERR")\",\"$(escape_csv "$SWITCHOVER")\",\"$(escape_csv "$ENABLED")\",\"$(escape_csv "$CONFIG_STATUS")\",\"$(escape_csv "$OVERALL_STATUS")\",\"$(escape_csv "$TREND")\",\"$(date '+%Y-%m-%d %H:%M:%S')\""
        
        write_output "$output_line" "$output_line" "$raw_line" "$csv_line"
        
        write_prometheus "dataguard_transport_lag_seconds{primary=\"$PROM_PRIMARY\",standby=\"$STBY\"} $transport_seconds
dataguard_apply_lag_seconds{primary=\"$PROM_PRIMARY\",standby=\"$STBY\"} $apply_seconds
dataguard_status{primary=\"$PROM_PRIMARY\",standby=\"$STBY\",status=\"$OVERALL_STATUS\",config=\"$CONFIG_STATUS\"} $NUMERIC_STATUS"
        
        local html_row="<tr class='$row_class'>
<td>${DISPLAY_PRIMARY_ESCAPED}</td>
<td>${STBY_ESCAPED}</td>
<td>$TRANSPORT_LAG</td>
<td>$APPLY_LAG</td>
<td>${STATUS_ERR_ESCAPED}</td>
<td>${SWITCHOVER_ESCAPED}</td>
<td>${ENABLED_ESCAPED}</td>
<td>${CONFIG_STATUS_ESCAPED}</td>
<td>$OVERALL_STATUS</td>
<td class=\"$trend_class\">$TREND</td>
<td>$(date '+%Y-%m-%d %H:%M:%S')</td>
</tr>"
        write_html_row "$html_row"
        
        # Log issues
        if [ "$OVERALL_STATUS" = "CRITICAL" ] || [ "$OVERALL_STATUS" = "ERROR" ]; then
            log_message "CRITICAL: $OVERALL_STATUS for $PRIMARY_DB -> $STBY: $STATUS_ERR (transport: $TRANSPORT_LAG, apply: $APPLY_LAG)"
        elif [ "$OVERALL_STATUS" = "WARNING" ]; then
            log_message "WARNING: $OVERALL_STATUS for $PRIMARY_DB -> $STBY: transport=$TRANSPORT_LAG apply=$APPLY_LAG"
        elif [ "$OVERALL_STATUS" = "INFRA_ERROR" ]; then
            log_message "INFRA_ERROR: Command issue for $PRIMARY_DB -> $STBY: $STATUS_ERR"
        fi
    done
}

# =========================================================================
# HTML REPORT FUNCTIONS
# =========================================================================

#######################################
# Write the opening portion of the HTML dashboard (document header,
# CSS, summary panel, and the table <thead>) to $HTML_REPORT,
# truncating any previous report.  Data rows are appended later via
# write_html_row; finalize_html_report closes the document.
# Globals (read): ENABLE_HTML_REPORT, HTML_REPORT, CONNECTION_METHOD,
#   DB_LIST, MAX_PARALLEL_JOBS, ENABLE_EMAIL_ALERTS, ENABLE_AUTO_HEAL,
#   AUTO_HEAL_DRY_RUN, GLOBAL_DRY_RUN
# Returns: 0 (silently a no-op when HTML reporting is disabled)
#######################################
init_html_report() {
    # Honor the control-panel toggle: do nothing when HTML is disabled.
    [ "$ENABLE_HTML_REPORT" != true ] && return
    
    # Unquoted EOF delimiter so $(date), $(hostname) and the config
    # variables expand at generation time.  The "exec-time" span is a
    # placeholder patched by the <script> that finalize_html_report emits.
    cat <<EOF > "$HTML_REPORT"
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <title>Data Guard Report - $(date)</title>
    <style>
        body { font-family: 'Segoe UI', Arial, sans-serif; margin: 20px; background: #f5f5f5; }
        h1, h2 { color: #333; }
        table { border-collapse: collapse; width: 100%; background: white; box-shadow: 0 1px 3px rgba(0,0,0,0.2); }
        th, td { border: 1px solid #ddd; padding: 12px; text-align: left; }
        th { background: #4CAF50; color: white; }
        tr:hover { background: #f5f5f5; }
        .ok { background: #d4edda; }
        .warn { background: #fff3cd; }
        .crit { background: #f8d7da; }
        .trend-up { color: #dc3545; font-weight: bold; }
        .trend-down { color: #28a745; font-weight: bold; }
        .trend-stable { color: #6c757d; }
        .footer { margin-top: 20px; font-size: 12px; color: #666; text-align: center; }
        .summary { background: white; padding: 15px; margin-bottom: 20px; border-left: 4px solid #4CAF50; }
    </style>
</head>
<body>
    <h1>📊 Oracle Data Guard Status Report</h1>
    <div class="summary">
        <strong>Report Time:</strong> $(date)<br>
        <strong>Host:</strong> $(hostname)<br>
        <strong>Connection Method:</strong> $CONNECTION_METHOD<br>
        <strong>Databases Monitored:</strong> ${#DB_LIST[@]}<br>
        <strong>Parallel Jobs:</strong> $MAX_PARALLEL_JOBS<br>
        <strong>Email Alerts:</strong> $( [ "$ENABLE_EMAIL_ALERTS" = true ] && echo "ENABLED" || echo "DISABLED" )<br>
        <strong>Auto-Heal:</strong> $( [ "$ENABLE_AUTO_HEAL" = true ] && echo "ENABLED (Dry-run: $AUTO_HEAL_DRY_RUN)" || echo "DISABLED" )<br>
        <strong>Global Dry-Run:</strong> $GLOBAL_DRY_RUN<br>
        <strong>Execution Time:</strong> <span id="exec-time">Calculating...</span>
    </div>
    <h2>📋 Standby Database Status</h2>
     <table>
        <thead>
             <tr>
                <th>Primary</th><th>Standby</th><th>Transport Lag</th><th>Apply Lag</th>
                <th>Error</th><th>Switchover</th><th>Enabled</th><th>Config</th><th>Status</th><th>Trend</th><th>Timestamp</th>
             </tr>
        </thead>
        <tbody>
EOF
}

#######################################
# Close the HTML dashboard started by init_html_report: append the
# </tbody>, legend footer, and a small script that fills in the
# "exec-time" placeholder with the measured run duration.
# Globals (read):  ENABLE_HTML_REPORT, HTML_REPORT, START_TIME
# Globals (write): DURATION (seconds since START_TIME; main recomputes
#   its own copy afterwards, so this global is effectively private)
# Returns: 0 (no-op when HTML reporting is disabled)
#######################################
finalize_html_report() {
    # Honor the control-panel toggle: do nothing when HTML is disabled.
    [ "$ENABLE_HTML_REPORT" != true ] && return
    
    DURATION=$(( $(date +%s) - START_TIME ))
    # Unquoted EOF so $(date) and ${DURATION} expand; appended (>>) to
    # the report body built up by init_html_report/write_html_row.
    cat <<EOF >> "$HTML_REPORT"
        </tbody>
     </table>
    <div class="footer">
        <strong>Legend:</strong>
        <span style="background:#d4edda; padding:2px 8px;">✓ OK</span>
        <span style="background:#fff3cd; padding:2px 8px;">⚠ Warning</span>
        <span style="background:#f8d7da; padding:2px 8px;">🔴 Critical</span>
        Trend: <span class="trend-up">▲ Increasing</span> <span class="trend-down">▼ Decreasing</span> <span class="trend-stable">● Stable</span>
        <br><br>
        Generated by Data Guard Monitoring System | $(date) | Execution Time: ${DURATION}s
    </div>
    <script>document.getElementById('exec-time').innerText = '${DURATION}s';</script>
</body>
</html>
EOF
}

# =========================================================================
# SUMMARY GENERATION
# =========================================================================

#######################################
# Append the final summary section to $REPORT_FILE and, when any
# standby row carries a problem status, write an alert file and send
# an e-mail notification.
# Parses the pipe-delimited $RAW_DATA_FILE; its first line is a column
# header (written by main), so every count/aggregate below skips NR==1.
# Globals (read): REPORT_DIR, REPORT_FILE, RAW_DATA_FILE, DB_LIST,
#   OUTPUT_LOCKFILE, HTML_REPORT, CSV_FILE, HISTORICAL_DATA_FILE,
#   PROM_FILE, LOG_FILE, AUTO_HEAL_LOG, CONNECTION_METHOD,
#   MAX_PARALLEL_JOBS, ENABLE_* toggles, AUTO_HEAL_DRY_RUN,
#   GLOBAL_DRY_RUN, START_TIME
# Calls:   log_message, send_alert_email (defined earlier in the file)
# Returns: 0
#######################################
generate_summary() {
    local alert_file="$REPORT_DIR/alert_$(date +%s).txt"
    local alert_flag=0
    local issues_found=0
    
    # Serialize report writes against any stragglers via fd 200.
    # Braces (not parens) keep this body in the current shell so
    # alert_flag survives for the e-mail decision below.
    {
        flock -x 200
        echo -e "\n=== SUMMARY ===" >> "$REPORT_FILE"
        echo "Report generated: $(date)" >> "$REPORT_FILE"
        echo "Report file: $REPORT_FILE" >> "$REPORT_FILE"
        [ "$ENABLE_HTML_REPORT" = true ] && echo "HTML report: $HTML_REPORT" >> "$REPORT_FILE"
        [ "$ENABLE_CSV_EXPORT" = true ] && echo "CSV export: $CSV_FILE" >> "$REPORT_FILE"
        echo "Raw data file: $RAW_DATA_FILE" >> "$REPORT_FILE"
        echo "Historical data file: $HISTORICAL_DATA_FILE" >> "$REPORT_FILE"
        echo "Prometheus metrics: $PROM_FILE" >> "$REPORT_FILE"
        echo "Log file: $LOG_FILE" >> "$REPORT_FILE"
        echo "Auto-heal log: $AUTO_HEAL_LOG" >> "$REPORT_FILE"
        echo "" >> "$REPORT_FILE"
        echo "Configuration Summary:" >> "$REPORT_FILE"
        echo "  Connection Method: $CONNECTION_METHOD" >> "$REPORT_FILE"
        echo "  Parallel Jobs: $MAX_PARALLEL_JOBS" >> "$REPORT_FILE"
        echo "  Email Alerts: $( [ "$ENABLE_EMAIL_ALERTS" = true ] && echo "ENABLED" || echo "DISABLED" )" >> "$REPORT_FILE"
        echo "  Auto-Heal: $( [ "$ENABLE_AUTO_HEAL" = true ] && echo "ENABLED (Dry-run: $AUTO_HEAL_DRY_RUN)" || echo "DISABLED" )" >> "$REPORT_FILE"
        echo "  Historical Data: $( [ "$ENABLE_HISTORICAL_DATA" = true ] && echo "ENABLED" || echo "DISABLED" )" >> "$REPORT_FILE"
        echo "  Global Dry-Run: $GLOBAL_DRY_RUN" >> "$REPORT_FILE"
        
        if [ -f "$RAW_DATA_FILE" ]; then
            # FIX: the previous 'END{print NR}' counted the header line,
            # inflating the total by one. Count data rows only.
            local total_standbys
            total_standbys=$(awk 'NR>1{n++} END{print n+0}' "$RAW_DATA_FILE")
            echo "Total standbys processed: $total_standbys" >> "$REPORT_FILE"
            
            echo "" >> "$REPORT_FILE"
            echo "Breakdown by Primary Database:" >> "$REPORT_FILE"
            local count
            for DB in "${DB_LIST[@]}"; do
                # NR>1 keeps the header row out of the per-primary counts.
                count=$(awk -F'|' -v db="$DB" 'NR>1 && $1==db {count++} END{print count+0}' "$RAW_DATA_FILE")
                [ "$count" -gt 0 ] && echo "  $DB: $count standby(s)" >> "$REPORT_FILE"
            done
            
            # Health summary: worst status wins per primary
            # (CRITICAL > ERROR > WARNING > anything else).
            echo "" >> "$REPORT_FILE"
            echo "Primary Database Health Summary:" >> "$REPORT_FILE"
            awk -F'|' '
            NR == 1 { next }   # FIX: header row used to surface as "PrimaryDB: OverallStatus"
            {
                primary=$1
                status=$9

                if (!(primary in seen)) {
                    seen[primary]=status
                } else {
                    current=seen[primary]

                    if (status == "CRITICAL") {
                        seen[primary]="CRITICAL"
                    } else if (status == "ERROR" && current != "CRITICAL") {
                        seen[primary]="ERROR"
                    } else if (status == "WARNING" && current != "CRITICAL" && current != "ERROR") {
                        seen[primary]="WARNING"
                    }
                }
            }
            END {
                for (p in seen) {
                    printf("  %s: %s\n", p, seen[p])
                }
            }' "$RAW_DATA_FILE" >> "$REPORT_FILE"
            
            # Issue detection: any DATA row whose overall status signals
            # a problem triggers the alert path below.
            local issue_lines
            issue_lines=$(awk -F'|' 'NR>1 && $9 ~ /CRITICAL|ERROR|INFRA_ERROR/' "$RAW_DATA_FILE")
            
            if [ -n "$issue_lines" ]; then
                echo "" >> "$REPORT_FILE"
                echo "Issues Detected:" >> "$REPORT_FILE"
                echo "$issue_lines" >> "$REPORT_FILE"
                
                echo "$issue_lines" > "$alert_file"
                alert_flag=1
                issues_found=1
            else
                echo "" >> "$REPORT_FILE"
                echo "No issues detected." >> "$REPORT_FILE"
            fi
        fi
        
        # Execution time (split decl/assign so a date failure isn't masked)
        local end_time duration
        end_time=$(date +%s)
        duration=$((end_time - START_TIME))
        echo "" >> "$REPORT_FILE"
        echo "Execution Time: ${duration} seconds" >> "$REPORT_FILE"
        echo "=========================" >> "$REPORT_FILE"
        
    } 200>"$OUTPUT_LOCKFILE"
    
    # Alert outside the lock so a slow mailer cannot block other writers.
    if [ $alert_flag -eq 1 ]; then
        log_message "Issues detected - triggering alert"
        send_alert_email "Data Guard Alert - Issues Detected" "$alert_file" "ALL_DATABASES"
    else
        log_message "No issues detected"
    fi
}

# =========================================================================
# MAIN EXECUTION
# =========================================================================

#######################################
# Orchestrates one monitoring run: honors the master switch, arms an
# optional hard-kill watchdog, validates connectivity, initializes the
# report files, fans out one background worker per database (bounded
# parallelism), waits, then summarizes and finalizes all outputs.
# Globals (read): control-panel toggles, DB_LIST, MAX_PARALLEL_JOBS,
#   PARALLEL_SLEEP_INTERVAL, SCRIPT_TIMEOUT, START_TIME, path vars.
# Globals (write): WATCHDOG_PID, DURATION.
# Calls: log_message, rotate_historical_file, validate_wallet,
#   init_html_report, process_database, generate_summary,
#   finalize_html_report (all defined earlier in this file).
#######################################
main() {
    # Check master switch
    if [ "$ENABLE_MONITORING" != true ]; then
        echo "Monitoring is disabled. Set ENABLE_MONITORING=true to enable."
        exit 0
    fi
    
    # Global timeout watchdog: detached subshell that hard-kills the
    # script if it overruns SCRIPT_TIMEOUT.
    # NOTE(review): assumes MAIN_PID was captured earlier in the file
    # (e.g. MAIN_PID=$$) — confirm; if unset, the kill is a no-op and
    # the timeout only prints the error.
    if [ -n "$SCRIPT_TIMEOUT" ] && [ "$SCRIPT_TIMEOUT" -gt 0 ]; then
        (
            sleep "$SCRIPT_TIMEOUT"
            echo "ERROR: Script timeout after ${SCRIPT_TIMEOUT}s" >&2
            kill -9 $MAIN_PID 2>/dev/null
        ) &
        WATCHDOG_PID=$!
    fi
    
    # Record the effective configuration at the top of the log.
    log_message "===== Data Guard Monitoring Started ====="
    log_message "Connection Method: $CONNECTION_METHOD"
    log_message "Parallel jobs: $MAX_PARALLEL_JOBS"
    log_message "Email alerts: $([ "$ENABLE_EMAIL_ALERTS" = true ] && echo "ENABLED" || echo "DISABLED")"
    log_message "Auto-heal: $([ "$ENABLE_AUTO_HEAL" = true ] && echo "ENABLED (Dry-run: $AUTO_HEAL_DRY_RUN)" || echo "DISABLED")"
    log_message "Global dry-run: $GLOBAL_DRY_RUN"
    log_message "Prometheus metrics: $([ "$ENABLE_PROM_METRICS" = true ] && echo "ENABLED" || echo "DISABLED")"
    log_message "HTML report: $([ "$ENABLE_HTML_REPORT" = true ] && echo "ENABLED" || echo "DISABLED")"
    log_message "CSV export: $([ "$ENABLE_CSV_EXPORT" = true ] && echo "ENABLED" || echo "DISABLED")"
    log_message "Historical data: $([ "$ENABLE_HISTORICAL_DATA" = true ] && echo "ENABLED" || echo "DISABLED")"
    
    # Rotate historical file
    rotate_historical_file
    
    # Validate wallet connectivity; abort (and disarm the watchdog)
    # when no database in the inventory is reachable.
    if ! validate_wallet; then
        log_message "ERROR: No valid databases. Exiting."
        [ -n "$WATCHDOG_PID" ] && kill -9 "$WATCHDOG_PID" 2>/dev/null
        exit 1
    fi
    
    # Initialize HTML report
    init_html_report
    
    # Initialize report files under the output lock (fd 200) so worker
    # processes cannot interleave with the header writes.
    {
        flock -x 200
        echo "Data Guard Status Report - $(date)" > "$REPORT_FILE"
        echo "========================================" >> "$REPORT_FILE"
        echo "" >> "$REPORT_FILE"
        echo "Configuration:" >> "$REPORT_FILE"
        echo "  Connection Method: $CONNECTION_METHOD" >> "$REPORT_FILE"
        echo "  Parallel Jobs: $MAX_PARALLEL_JOBS" >> "$REPORT_FILE"
        echo "  Email Alerts: $( [ "$ENABLE_EMAIL_ALERTS" = true ] && echo "ENABLED" || echo "DISABLED" )" >> "$REPORT_FILE"
        echo "  Auto-Heal: $( [ "$ENABLE_AUTO_HEAL" = true ] && echo "ENABLED (Dry-run: $AUTO_HEAL_DRY_RUN)" || echo "DISABLED" )" >> "$REPORT_FILE"
        echo "  Global Dry-Run: $GLOBAL_DRY_RUN" >> "$REPORT_FILE"
        echo "  Historical Data: $( [ "$ENABLE_HISTORICAL_DATA" = true ] && echo "ENABLED" || echo "DISABLED" )" >> "$REPORT_FILE"
        echo "" >> "$REPORT_FILE"
        
        # Pipe-delimited header consumed by generate_summary ($9 = OverallStatus).
        echo "PrimaryDB|StandbyDB|TransportLagSec|ApplyLagSec|Error|SwitchoverReady|Enabled|ConfigStatus|OverallStatus|Trend|Timestamp" > "$RAW_DATA_FILE"
        
        # Print header to console
        printf "\n%-15s %-20s %-15s %-15s %-20s %-20s %-10s %-15s %-15s %-10s %-20s\n" \
            "PrimaryDB" "StandbyDB" "TransportLag" "ApplyLag" "Error" "SwitchoverReady" "Enabled" "ConfigStatus" "Status" "Trend" "Date"
        printf "%-15s %-20s %-15s %-15s %-20s %-20s %-10s %-15s %-15s %-10s %-20s\n" \
            "---------------" "--------------------" "---------------" "---------------" "--------------------" "--------------------" "---------" "---------------" "---------" "---------" "--------------------"
        
        # Write headers to report file
        printf "%-15s %-20s %-15s %-15s %-20s %-20s %-10s %-15s %-15s %-10s %-20s\n" \
            "PrimaryDB" "StandbyDB" "TransportLag" "ApplyLag" "Error" "SwitchoverReady" "Enabled" "ConfigStatus" "Status" "Trend" "Date" >> "$REPORT_FILE"
        printf "%-15s %-20s %-15s %-15s %-20s %-20s %-10s %-15s %-15s %-10s %-20s\n" \
            "---------------" "--------------------" "---------------" "---------------" "--------------------" "--------------------" "---------" "---------------" "---------" "---------" "--------------------" >> "$REPORT_FILE"
    } 200>"$OUTPUT_LOCKFILE"
    
    # Parallel execution: poll running job count, launch one worker per
    # database, and cap concurrency at MAX_PARALLEL_JOBS.
    # NOTE(review): `jobs -rp` also counts the watchdog subshell while
    # it is alive, so effective worker parallelism is one less than
    # MAX_PARALLEL_JOBS whenever SCRIPT_TIMEOUT is set — confirm intent.
    local pids=()
    for DB in "${DB_LIST[@]}"; do
        # Wait until we have fewer than MAX_PARALLEL_JOBS running
        while [ "$(jobs -rp | wc -l)" -ge "$MAX_PARALLEL_JOBS" ]; do
            sleep "$PARALLEL_SLEEP_INTERVAL"
        done
        process_database "$DB" &
        pids+=($!)
    done
    
    # Wait for all background jobs; a worker's non-zero exit is logged
    # but does not abort the run.
    for pid in "${pids[@]}"; do
        wait $pid 2>/dev/null || log_message "ERROR: Background job $pid failed"
    done
    
    # Generate summary
    generate_summary
    
    # Finalize HTML report
    finalize_html_report
    
    # Kill watchdog (all workers are done, timeout no longer needed)
    [ -n "$WATCHDOG_PID" ] && kill -9 "$WATCHDOG_PID" 2>/dev/null
    
    # Copy Prometheus metrics if node_exporter exists (best-effort:
    # cp failures are suppressed and simply skip the log line)
    if [ "$ENABLE_PROM_METRICS" = true ] && [ -d "/var/lib/node_exporter/textfile_collector" ]; then
        cp "$PROM_FILE" "/var/lib/node_exporter/textfile_collector/dataguard_metrics.prom" 2>/dev/null && \
        log_message "Prometheus metrics copied to node_exporter"
    fi
    
    # Console recap of every artifact produced by this run.
    DURATION=$(( $(date +%s) - START_TIME ))
    echo -e "\n${GREEN}Report Completed.${NC}"
    echo "Report: $REPORT_FILE"
    [ "$ENABLE_HTML_REPORT" = true ] && echo "HTML: $HTML_REPORT"
    [ "$ENABLE_CSV_EXPORT" = true ] && echo "CSV: $CSV_FILE"
    echo "Raw: $RAW_DATA_FILE"
    [ "$ENABLE_HISTORICAL_DATA" = true ] && echo "History: $HISTORICAL_DATA_FILE"
    [ "$ENABLE_PROM_METRICS" = true ] && echo "Prometheus: $PROM_FILE"
    echo "Log: $LOG_FILE"
    [ "$ENABLE_AUTO_HEAL" = true ] && echo "Auto-heal log: $AUTO_HEAL_LOG"
    echo "Connection Method: $CONNECTION_METHOD"
    echo "Parallel Jobs: $MAX_PARALLEL_JOBS"
    echo "Email: $([ "$ENABLE_EMAIL_ALERTS" = true ] && echo "ENABLED" || echo "DISABLED")"
    echo "Auto-heal: $([ "$ENABLE_AUTO_HEAL" = true ] && echo "ENABLED (Dry-run: $AUTO_HEAL_DRY_RUN)" || echo "DISABLED")"
    echo "Global Dry-Run: $GLOBAL_DRY_RUN"
    echo "Time: ${DURATION}s"
    
    log_message "===== Data Guard Monitoring Completed ====="
}

# Script entry point: forward all command-line arguments to main().
main "$@"