#!/bin/bash
# ------------------------------------------------------------------
# Data Guard Status Report Script (Oracle 19c)
# Production-ready with centralized control panel, monitoring, alerts,
# visual dashboards, and controlled auto-healing capabilities
# ------------------------------------------------------------------
# =========================================================================
# CENTRALIZED CONTROL PANEL - TOGGLE FEATURES HERE
# =========================================================================
# 1. Global Monitoring Switches
ENABLE_MONITORING=true # Master switch for the entire script
ENABLE_AUTO_HEAL=false # Toggle Auto-Healing capabilities
ENABLE_EMAIL_ALERTS=true # Toggle Email notifications
ENABLE_PROM_METRICS=true # Toggle Prometheus .prom file generation
ENABLE_HTML_REPORT=true # Toggle HTML dashboard generation
ENABLE_HISTORICAL_DATA=true # Toggle historical data tracking
ENABLE_CSV_EXPORT=true # Toggle CSV export for Excel
GLOBAL_DRY_RUN=false # Global dry-run mode (overrides all actions)
# NOTE: GLOBAL_DRY_RUN suppresses emails and auto-heal actions; reports and
# metrics are still produced. AUTO_HEAL_DRY_RUN below affects healing only.
# 2. Connection & Security Settings
# Options: "WALLET" (uses /@alias) or "USER" (requires credentials below)
# WALLET mode expects credentials for each TNS alias to be stored in the
# Oracle wallet referenced by sqlnet.ora under $TNS_ADMIN.
CONNECTION_METHOD="WALLET"
DGMGRL_OPTIONS="-silent" # Add -xml if you prefer parsing XML output
# For USER method - set credentials (use secure method like env vars in production)
DG_MONITOR_USER="dg_monitor"
DG_MONITOR_PASSWORD="" # Set via environment variable or secure file
DG_MONITOR_PASSWORD_FILE="/etc/oracle/dg_monitor.pwd" # Alternative secure file
# 3. Environment Context
# Location of the centralized TNS_ADMIN if not in default OH/network/admin
export TNS_ADMIN="/u01/app/oracle/network/admin"
export ORACLE_HOME="/u01/app/oracle/product/19.0.0/dbhome_1"
export PATH=$ORACLE_HOME/bin:$PATH
export LD_LIBRARY_PATH=$ORACLE_HOME/lib:$LD_LIBRARY_PATH
# 4. Target Databases (Central Inventory)
# Add all Primary TNS Aliases here. The script will discover Standbys automatically.
DB_LIST=("PRIM_PROD_KWT" "PRIM_DR_KWT" "PRIM_DEV_CORE")
# 5. Performance & Concurrency
MAX_PARALLEL_JOBS=5 # Increase for large fleets
PARALLEL_SLEEP_INTERVAL=0.5 # Seconds between parallel job checks
SCRIPT_TIMEOUT=600 # Maximum execution time in seconds (10 min)
DGMGRL_TIMEOUT=30 # Seconds before killing a hung connection
DGMGRL_KILL_TIMEOUT=5 # Seconds after timeout before force kill (timeout -k)
DGMGRL_RETRIES=2 # Number of retries for transient errors
MAX_HISTORY_LINES=50000 # Rotate history file after this many lines
# 6. Alert Thresholds (in seconds)
WARNING_LAG_THRESHOLD=300 # 5 minutes
CRITICAL_LAG_THRESHOLD=900 # 15 minutes
ALERT_COOLDOWN=900 # 15 minutes between alerts per database
# 7. Auto-Heal Configuration
AUTO_HEAL_DRY_RUN=true # true = log only, no execution
AUTO_HEAL_COOLDOWN=600 # seconds (10 min) between auto-heal attempts
MAX_AUTO_HEAL_ATTEMPTS=3 # Maximum attempts per standby before giving up
AUTO_HEAL_EXCLUDE=() # List of standbys to exclude from auto-heal (e.g., ("DR_TEST" "STBY_OLD"))
# 8. Directories
LOG_DIR="/var/log/dataguard"
REPORT_DIR="/tmp/dataguard_reports"
TEMP_DIR="/tmp/dataguard_temp"
CSV_DIR="/tmp/dataguard_csv"
# 9. Email Configuration
MAIL_TO="dba_team@yourcompany.com"
MAIL_FROM="dataguard@$(hostname)" # From header; only used by mailx (-r)
# 10. Logging Format (json or plain)
LOG_FORMAT="plain" # Options: "plain" or "json"
# 11. Console Output Colors
ENABLE_COLORS=true # Toggle colored console output (only when stdout is a TTY)
# =========================================================================
# INTERNAL VARIABLES - DO NOT MODIFY BELOW THIS LINE
# =========================================================================
# Color definitions (only when colors are enabled AND stdout is a terminal)
# NOTE(review): these are literal "\e[..]" sequences (double quotes, not
# $'...'), so a plain 'echo' prints them verbatim; they only render as color
# where they end up inside a printf FORMAT string later in the script —
# confirm that before reusing them elsewhere.
if [ "$ENABLE_COLORS" = true ] && [ -t 1 ]; then
RED="\e[31m"
GREEN="\e[32m"
YELLOW="\e[33m"
BLUE="\e[34m"
MAGENTA="\e[35m"
CYAN="\e[36m"
BOLD="\e[1m"
NC="\e[0m"
else
RED=""; GREEN=""; YELLOW=""; BLUE=""; MAGENTA=""; CYAN=""; BOLD=""; NC=""
fi
# Derived file paths — timestamps are evaluated once here, at startup, so
# all per-run artifacts of a single execution share the same suffix.
LOG_FILE="$LOG_DIR/dataguard_report_$(date +%Y%m%d).log"
REPORT_FILE="$REPORT_DIR/dataguard_report_$(date +%Y%m%d_%H%M%S).txt"
RAW_DATA_FILE="$REPORT_DIR/dataguard_raw_$(date +%Y%m%d_%H%M%S).dat"
HISTORICAL_DATA_FILE="$REPORT_DIR/dataguard_history.dat"
HTML_REPORT="$REPORT_DIR/dataguard_report_$(date +%Y%m%d_%H%M%S).html"
CSV_FILE="$CSV_DIR/dataguard_report_$(date +%Y%m%d_%H%M%S).csv"
PROM_FILE="/tmp/dataguard_metrics.prom"
# Auto-heal files (per standby)
AUTO_HEAL_COUNT_DIR="/var/tmp/dataguard_heal_counts"
AUTO_HEAL_LOG="$LOG_DIR/dataguard_autoheal.log"
# Alert cooldown files (per database)
ALERT_COOLDOWN_BASE="/var/tmp/dataguard_last_alert"
ALERT_LOCK_BASE="/var/tmp/dataguard_alert_lock"
# Lock files for thread-safe operations — named with $$ so concurrent runs
# (if the main lock were removed) would not contend on the same lock files.
LOG_LOCKFILE="/tmp/dataguard_log_$$.lock"
AUTOHEAL_LOG_LOCKFILE="/tmp/dataguard_autoheallog_$$.lock"
OUTPUT_LOCKFILE="/tmp/dataguard_output_$$.lock"
HTML_LOCKFILE="/tmp/dataguard_html_$$.lock"
PROM_LOCKFILE="/tmp/dataguard_prom_$$.lock"
HISTORY_LOCKFILE="/tmp/dataguard_history_$$.lock"
SCRIPT_LOCK="/tmp/dataguard_main.lock"
# Watchdog PID
WATCHDOG_PID=""
MAIN_PID=$$
# Start timing (epoch seconds; used for total-runtime reporting)
START_TIME=$(date +%s)
# =========================================================================
# INITIALIZATION
# =========================================================================
# Create directories
mkdir -p "$LOG_DIR" "$REPORT_DIR" "$TEMP_DIR" "$AUTO_HEAL_COUNT_DIR" "$CSV_DIR"
touch "$LOG_LOCKFILE" "$AUTOHEAL_LOG_LOCKFILE" "$OUTPUT_LOCKFILE" "$HTML_LOCKFILE" "$PROM_LOCKFILE" "$HISTORY_LOCKFILE"
# Script lock to prevent duplicate runs.
# fd 200 stays open — and the flock stays held — for the life of the
# process; the kernel releases it automatically on exit.
exec 200>"$SCRIPT_LOCK"
flock -n 200 || {
echo "ERROR: Another instance of this script is already running. Exiting."
exit 1
}
# Initialize historical file if it doesn't exist (pipe-delimited, header row)
if [ "$ENABLE_HISTORICAL_DATA" = true ] && [ ! -f "$HISTORICAL_DATA_FILE" ]; then
echo "PrimaryDB|StandbyDB|TransportLagSec|ApplyLagSec|Error|SwitchoverReady|Enabled|ConfigStatus|OverallStatus|Trend|Timestamp" > "$HISTORICAL_DATA_FILE"
fi
# Initialize CSV file if enabled (truncated each run, quoted header row)
if [ "$ENABLE_CSV_EXPORT" = true ]; then
echo "\"PrimaryDB\",\"StandbyDB\",\"TransportLag\",\"ApplyLag\",\"Error\",\"SwitchoverReady\",\"Enabled\",\"ConfigStatus\",\"OverallStatus\",\"Trend\",\"Timestamp\"" > "$CSV_FILE"
fi
# Initialize Prometheus metrics file if enabled: truncate, then write the
# HELP/TYPE metadata expected by the textfile collector.
if [ "$ENABLE_PROM_METRICS" = true ]; then
: > "$PROM_FILE"
{
echo "# HELP dataguard_transport_lag_seconds Data Guard transport lag in seconds"
echo "# TYPE dataguard_transport_lag_seconds gauge"
echo "# HELP dataguard_apply_lag_seconds Data Guard apply lag in seconds"
echo "# TYPE dataguard_apply_lag_seconds gauge"
echo "# HELP dataguard_status Data Guard status (0=error, 1=warning, 2=ok)"
echo "# TYPE dataguard_status gauge"
echo "# HELP dataguard_configuration_status Configuration status (0=error, 1=warning, 2=ok)"
echo "# TYPE dataguard_configuration_status gauge"
} >> "$PROM_FILE"
fi
# =========================================================================
# UTILITY FUNCTIONS
# =========================================================================
# CSV escape function
# Escape one value for CSV embedding (RFC 4180: double embedded quotes).
# printf is used instead of echo so values that look like echo options
# (e.g. "-n", "-e") or contain backslashes are emitted verbatim.
escape_csv() {
  printf '%s\n' "$1" | sed 's/"/""/g'
}
# Structured logging function
# Structured logging: appends to $LOG_FILE (JSON or plain, per LOG_FORMAT)
# under an exclusive flock so parallel workers do not interleave lines.
# Plain mode also echoes to the console via tee; JSON mode echoes the raw
# message. No-op when ENABLE_MONITORING is off.
# Args: $1 = message, $2 = level (default INFO; only emitted in JSON mode)
log_message() {
  [ "$ENABLE_MONITORING" != true ] && return
  local msg="$1"
  local level="${2:-INFO}"
  local timestamp
  # Declared separately from the assignment so a 'date' failure is not
  # masked by 'local' always succeeding (ShellCheck SC2155).
  timestamp=$(date -Iseconds)
  if [ "$LOG_FORMAT" = "json" ]; then
    local json_msg
    json_msg=$(printf '{"timestamp":"%s","level":"%s","message":"%s","host":"%s"}\n' \
      "$timestamp" "$level" "${msg//\"/\\\"}" "$(hostname)")
    {
      flock -x 200
      echo "$json_msg" >> "$LOG_FILE"
    } 200>"$LOG_LOCKFILE"
    # Also print to console for readability
    echo "$msg"
  else
    {
      flock -x 200
      echo "[$(date '+%Y-%m-%d %H:%M:%S')] $msg" | tee -a "$LOG_FILE"
    } 200>"$LOG_LOCKFILE"
  fi
}
# Thread-safe auto-heal logging
# Append a line to the dedicated auto-heal log ($AUTO_HEAL_LOG), JSON or
# plain per LOG_FORMAT, under an exclusive flock for parallel safety.
# Unlike log_message this never writes to the console. No-op unless
# auto-healing is enabled.
# Args: $1 = message, $2 = level (default INFO; only emitted in JSON mode)
log_autoheal() {
  [ "$ENABLE_AUTO_HEAL" != true ] && return
  local msg="$1"
  local level="${2:-INFO}"
  local timestamp
  # Split from 'local' so a 'date' failure is not masked (ShellCheck SC2155).
  timestamp=$(date -Iseconds)
  if [ "$LOG_FORMAT" = "json" ]; then
    local json_msg
    json_msg=$(printf '{"timestamp":"%s","level":"%s","message":"%s","component":"autoheal"}\n' \
      "$timestamp" "$level" "${msg//\"/\\\"}")
    {
      flock -x 200
      echo "$json_msg" >> "$AUTO_HEAL_LOG"
    } 200>"$AUTOHEAL_LOG_LOCKFILE"
  else
    {
      flock -x 200
      echo "[$(date '+%Y-%m-%d %H:%M:%S')] $msg" >> "$AUTO_HEAL_LOG"
    } 200>"$AUTOHEAL_LOG_LOCKFILE"
  fi
}
# Thread-safe output writer
# Emit one result row to every enabled sink: the console (stdout), the
# plain-text report, the per-run raw data file, the CSV export, and the
# long-term history file. File appends are serialized with flock so
# parallel workers cannot interleave partial lines; the history file has
# its own lock because readers (detect_trend) also lock it.
write_output() {
  local console_line="$1"
  local report_line="$2"
  local raw_data_line="$3"
  local csv_line="$4"
  # Console + per-run report/raw/CSV files, all under the output lock.
  {
    flock -x 200
    echo "$console_line"
    echo "$report_line" >> "$REPORT_FILE"
    if [ "$ENABLE_HISTORICAL_DATA" = true ] && [ -n "$raw_data_line" ]; then
      echo "$raw_data_line" >> "$RAW_DATA_FILE"
    fi
    if [ "$ENABLE_CSV_EXPORT" = true ] && [ -n "$csv_line" ]; then
      echo "$csv_line" >> "$CSV_FILE"
    fi
  } 200>"$OUTPUT_LOCKFILE"
  # History file gets its own (separate) lock.
  if [ "$ENABLE_HISTORICAL_DATA" = true ] && [ -n "$raw_data_line" ]; then
    {
      flock -x 200
      echo "$raw_data_line" >> "$HISTORICAL_DATA_FILE"
    } 200>"$HISTORY_LOCKFILE"
  fi
}
# Thread-safe Prometheus writer
# Append pre-formatted metric line(s) to the Prometheus textfile export,
# serialized via an exclusive flock. Silently does nothing when the
# Prometheus feature toggle is off.
write_prometheus() {
  if [ "$ENABLE_PROM_METRICS" != true ]; then
    return
  fi
  local metric_lines="$1"
  {
    flock -x 200
    echo "$metric_lines" >> "$PROM_FILE"
  } 200>"$PROM_LOCKFILE"
}
# Thread-safe HTML writer
# Append one pre-rendered HTML fragment (a table row) to the HTML report
# under an exclusive flock; no-op when HTML reporting is disabled.
write_html_row() {
  if [ "$ENABLE_HTML_REPORT" != true ]; then
    return
  fi
  local row_markup="$1"
  {
    flock -x 200
    echo "$row_markup" >> "$HTML_REPORT"
  } 200>"$HTML_LOCKFILE"
}
# =========================================================================
# REMOTE DGMGRL EXECUTION FUNCTIONS
# =========================================================================
# Build connection string based on control panel settings
# Print the dgmgrl connect string for a TNS alias.
# WALLET mode -> "/@alias" (credentials come from the Oracle wallet);
# USER mode -> "user/password@alias", where the password is taken from
# $DG_MONITOR_PASSWORD or, failing that, read from the secure password
# file with newlines stripped. Returns 1 (message on stderr) when USER
# mode has no password available.
get_connection_string() {
  local alias_name="$1"
  if [ "$CONNECTION_METHOD" == "WALLET" ]; then
    echo "/@$alias_name"
    return 0
  fi
  # USER mode: environment variable wins; file is the fallback.
  local password="$DG_MONITOR_PASSWORD"
  if [ -z "$password" ] && [ -f "$DG_MONITOR_PASSWORD_FILE" ]; then
    password=$(tr -d '\n' < "$DG_MONITOR_PASSWORD_FILE" 2>/dev/null)
  fi
  if [ -z "$password" ]; then
    echo "ERROR: No password available for USER connection method" >&2
    return 1
  fi
  echo "${DG_MONITOR_USER}/$password@$alias_name"
}
# Thread-safe Remote DGMGRL Execution with timeout and retry
# Run dgmgrl commands against a TNS alias with a hard timeout and retries
# on transient failures (timeout kills, listener/connection ORA-/TNS-
# errors). Echoes the broker output and returns dgmgrl's exit status
# (124/137 indicate timeout / force-kill by GNU timeout).
# Args: $1 = TNS alias, $2 = dgmgrl command text (may be multi-line)
run_remote_dgmgrl() {
  local target_alias="$1"
  local sql_commands="$2"
  local connection_str=""
  connection_str=$(get_connection_string "$target_alias")
  if [ $? -ne 0 ] || [ -z "$connection_str" ]; then
    echo "ERROR: Failed to build connection string for $target_alias"
    return 1
  fi
  local output=""
  local rc=0
  for ((i=1; i<=DGMGRL_RETRIES+1; i++)); do
    # BUGFIX: '2>&1' must live INSIDE the command substitution. The old
    # placement (after the closing paren) redirected the assignment's
    # stderr — i.e. nothing — so broker errors printed to stderr never
    # reached $output and the transient-error grep below missed them.
    output=$(timeout -k ${DGMGRL_KILL_TIMEOUT}s ${DGMGRL_TIMEOUT}s dgmgrl $DGMGRL_OPTIONS "$connection_str" 2>&1 <<EOF
$sql_commands
exit;
EOF
    )
    rc=$?
    if [ $rc -eq 0 ]; then
      echo "$output"
      return 0
    fi
    # Transient: timeout/kill exit codes, or recoverable network errors.
    local is_transient=0
    if [ $rc -eq 124 ] || [ $rc -eq 137 ]; then
      is_transient=1
    elif grep -q "ORA-03113\|ORA-03114\|ORA-12541\|ORA-12514\|TNS-12541" <<< "$output"; then
      is_transient=1
    fi
    # Give up on the last attempt or on non-transient errors.
    if [ $i -gt $DGMGRL_RETRIES ] || [ $is_transient -eq 0 ]; then
      echo "$output"
      return $rc
    fi
    log_message "Retry $i/$DGMGRL_RETRIES for $target_alias (exit code: $rc)"
    sleep 2
  done
  echo "$output"
  return 1
}
# Alias for backward compatibility
# Backward-compatible alias for run_remote_dgmgrl. Forwards ALL arguments
# ("$@", not just $1 $2) so any future extra options are not silently
# dropped by the wrapper.
run_dgmgrl() {
  run_remote_dgmgrl "$@"
}
# =========================================================================
# LAG PROCESSING FUNCTIONS
# =========================================================================
# Convert lag to seconds
# Convert a human-readable lag string ("2 days", "5 minutes", "3 hours
# 10 minutes", ...) into a total number of seconds on stdout.
# Prints -1 for empty, "N/A", or unknown/UNKNOWN values.
# Fixes over the original:
#  - unit regexes match the singular forms too ("1 day", "1 hour", ...)
#  - numbers are forced to base 10 with 10#, since zero-padded values
#    like "08"/"09" (as parse_lag can emit) are invalid octal literals
#    that abort bash arithmetic
lag_to_seconds() {
  local lag_value="$1"
  local seconds=0
  if [[ -z "$lag_value" ]] || [[ "$lag_value" == "N/A" ]] || [[ "$lag_value" =~ unknown|UNKNOWN ]]; then
    echo -1
    return
  fi
  # "day" also matches "days", etc.
  if [[ "$lag_value" =~ ([0-9]+)\ day ]]; then
    seconds=$((seconds + 10#${BASH_REMATCH[1]} * 86400))
  fi
  if [[ "$lag_value" =~ ([0-9]+)\ hour ]]; then
    seconds=$((seconds + 10#${BASH_REMATCH[1]} * 3600))
  fi
  if [[ "$lag_value" =~ ([0-9]+)\ minute ]]; then
    seconds=$((seconds + 10#${BASH_REMATCH[1]} * 60))
  fi
  if [[ "$lag_value" =~ ([0-9]+)\ second ]]; then
    seconds=$((seconds + 10#${BASH_REMATCH[1]}))
  fi
  echo $seconds
}
# Parse lag from DGMGRL format
# Normalize a DGMGRL lag value of the form "+DD HH:MM:SS" into its largest
# whole unit ("N days" / "N hours" / "N minutes" / "N seconds"); values
# that do not match that pattern pass through unchanged, and empty/"N/A"
# becomes "N/A".
# BUGFIX: DGMGRL zero-pads the fields, and bash arithmetic treats a
# leading 0 as octal — "08" or "09" previously aborted the calculation
# with "value too great for base". All fields are now forced to base 10.
parse_lag() {
  local lag_value=$1
  if [[ -z "$lag_value" ]] || [[ "$lag_value" == "N/A" ]]; then
    echo "N/A"
    return
  fi
  if [[ "$lag_value" =~ \+([0-9]{2})\ ([0-9]{2}):([0-9]{2}):([0-9]{2}) ]]; then
    local days=$((10#${BASH_REMATCH[1]}))
    local hours=$((10#${BASH_REMATCH[2]}))
    local minutes=$((10#${BASH_REMATCH[3]}))
    local seconds=$((10#${BASH_REMATCH[4]}))
    local total_seconds=$((days * 86400 + hours * 3600 + minutes * 60 + seconds))
    if [ $total_seconds -gt 0 ]; then
      if [ $total_seconds -ge 86400 ]; then
        echo "$days days"
      elif [ $total_seconds -ge 3600 ]; then
        # Fold whole days into hours when under the 1-day display cutoff.
        echo "$((hours + days * 24)) hours"
      elif [ $total_seconds -ge 60 ]; then
        echo "$minutes minutes"
      else
        echo "$seconds seconds"
      fi
    else
      echo "0 seconds"
    fi
  else
    echo "$lag_value"
  fi
}
# Check lag thresholds
# Map a lag in seconds to OK / WARNING / CRITICAL using the control-panel
# thresholds; -1 means the lag is UNKNOWN.
# Hardened: operands are quoted, and any non-numeric/empty input (which
# previously made '[' error out and fall through) is reported as UNKNOWN.
check_lag_threshold() {
  local lag_seconds="$1"
  if ! [[ "$lag_seconds" =~ ^-?[0-9]+$ ]] || [ "$lag_seconds" -eq -1 ]; then
    echo "UNKNOWN"
  elif [ "$lag_seconds" -ge "$CRITICAL_LAG_THRESHOLD" ]; then
    echo "CRITICAL"
  elif [ "$lag_seconds" -ge "$WARNING_LAG_THRESHOLD" ]; then
    echo "WARNING"
  else
    echo "OK"
  fi
}
# =========================================================================
# STATUS DETERMINATION FUNCTIONS
# =========================================================================
# Get overall status
# Collapse the per-standby signals into a single status. Priority order:
# INFRA_ERROR (monitoring itself failed) > config WARNING/ERROR >
# broker error text > lag CRITICAL > lag WARNING > OK.
get_overall_status() {
  local status_err="$1"
  local transport_status="$2"
  local apply_status="$3"
  local config_status="$4"
  # Infrastructure failures: we could not even ask the broker.
  case "$status_err" in
    "Show command failed"|"Validate command failed"|"Command timeout")
      echo "INFRA_ERROR"
      return
      ;;
  esac
  # Configuration-level problems trump per-standby lag.
  case "$config_status" in
    WARNING) echo "WARNING"; return ;;
    ERROR)   echo "ERROR"; return ;;
  esac
  # Broker error text, then lag severity.
  if [[ "$status_err" != "None" && "$status_err" != "SUCCESS" ]]; then
    echo "ERROR"
  elif [[ "$transport_status" == "CRITICAL" || "$apply_status" == "CRITICAL" ]]; then
    echo "CRITICAL"
  elif [[ "$transport_status" == "WARNING" || "$apply_status" == "WARNING" ]]; then
    echo "WARNING"
  else
    echo "OK"
  fi
}
# Get colored status for console
# Decorate a status keyword with its console color/icon; unknown values
# pass through untouched. The color variables are empty strings when
# colors are disabled, so the plain text survives either way.
get_colored_status() {
  local status_name="$1"
  local decorated
  case "$status_name" in
    "OK")          decorated="${GREEN}✓ OK${NC}" ;;
    "WARNING")     decorated="${YELLOW}⚠ WARNING${NC}" ;;
    "CRITICAL")    decorated="${RED}🔴 CRITICAL${NC}" ;;
    "ERROR")       decorated="${RED}✗ ERROR${NC}" ;;
    "INFRA_ERROR") decorated="${MAGENTA}⚙ INFRA_ERROR${NC}" ;;
    *)             decorated="$status_name" ;;
  esac
  echo "$decorated"
}
# Get numeric status for Prometheus
# Map an overall status keyword to the Prometheus gauge convention used
# by this script: 0 = down, 1 = degraded, 2 = healthy (default).
get_numeric_status() {
  local status_name="$1"
  if [[ "$status_name" == "ERROR" || "$status_name" == "CRITICAL" || "$status_name" == "INFRA_ERROR" ]]; then
    echo 0
  elif [[ "$status_name" == "WARNING" ]]; then
    echo 1
  else
    echo 2
  fi
}
# Map a broker configuration status to a gauge value:
# ERROR = 0, WARNING = 1, anything else (SUCCESS/Unknown/...) = 2.
get_numeric_config_status() {
  local cfg_status="$1"
  if [[ "$cfg_status" == "ERROR" ]]; then
    echo 0
  elif [[ "$cfg_status" == "WARNING" ]]; then
    echo 1
  else
    echo 2
  fi
}
# =========================================================================
# TREND DETECTION
# =========================================================================
# Compare the current transport lag against the previous recorded sample
# for this primary/standby pair in the history file and echo the direction:
# INCREASING / DECREASING / STABLE, or UNKNOWN when history is disabled,
# missing, or not numeric.
detect_trend() {
[ "$ENABLE_HISTORICAL_DATA" != true ] && { echo "UNKNOWN"; return; }
local primary_db="$1"
local standby="$2"
local current_seconds="$3"
local prev_seconds=""
if [ -f "$HISTORICAL_DATA_FILE" ]; then
# Shared (read) lock: appenders in write_output take the exclusive lock.
{
flock -s 200
# NOTE(review): 'tail -2 | head -1' selects the SECOND-to-last matching
# row, which assumes the current run's row is already in the file; the
# caller appears to invoke detect_trend BEFORE write_output appends it.
# Confirm which sample is intended — 'tail -1' would be the most recent
# prior run in that ordering.
prev_seconds=$(grep "^${primary_db}|${standby}|" "$HISTORICAL_DATA_FILE" | tail -2 | head -1 | awk -F'|' '{print $3}' | xargs)
} 200>"$HISTORY_LOCKFILE"
# Only compare when both samples are plain non-negative integers
# (a -1 "unknown" lag therefore falls through to UNKNOWN).
if [[ -n "$prev_seconds" ]] && [[ "$prev_seconds" =~ ^[0-9]+$ ]] && [[ "$current_seconds" =~ ^[0-9]+$ ]]; then
if [ "$current_seconds" -gt "$prev_seconds" ]; then
echo "INCREASING"
elif [ "$current_seconds" -lt "$prev_seconds" ]; then
echo "DECREASING"
else
echo "STABLE"
fi
return
fi
fi
echo "UNKNOWN"
}
# Get colored trend for console
# Decorate a trend keyword with a colored direction marker for the
# console; unrecognized values (e.g. UNKNOWN) pass through untouched.
get_colored_trend() {
  local trend_name="$1"
  local decorated
  case "$trend_name" in
    "INCREASING") decorated="${RED}▲ INCREASING${NC}" ;;
    "DECREASING") decorated="${GREEN}▼ DECREASING${NC}" ;;
    "STABLE")     decorated="${BLUE}● STABLE${NC}" ;;
    *)            decorated="$trend_name" ;;
  esac
  echo "$decorated"
}
# =========================================================================
# HTML ESCAPE
# =========================================================================
# Escape the five HTML-special characters so arbitrary broker output can
# be embedded in the report without breaking markup or enabling injection.
# BUGFIX: the previous version's replacement strings had been mangled into
# the characters themselves (e.g. ${str//&/&} — a no-op), so nothing was
# ever escaped. '&' must be handled first or it would re-escape the
# entities produced by the later substitutions.
html_escape() {
  local str="$1"
  str="${str//&/&amp;}"
  str="${str//</&lt;}"
  str="${str//>/&gt;}"
  str="${str//\"/&quot;}"
  str="${str//\'/&#39;}"
  printf '%s\n' "$str"
}
# =========================================================================
# AUTO-HEAL FUNCTIONS
# =========================================================================
# Get auto-heal attempt count (thread-safe with lock)
# Read the persisted auto-heal attempt counter for a primary/standby pair.
# A shared flock guarantees a concurrent increment cannot be half-read.
# Echoes "0" when no counter file exists yet.
get_heal_attempts() {
  local primary="$1"
  local standby="$2"
  local count_file="$AUTO_HEAL_COUNT_DIR/${primary}_${standby}.count"
  {
    flock -s 200
    if [ ! -f "$count_file" ]; then
      echo "0"
    else
      cat "$count_file"
    fi
  } 200>"${count_file}.lock"
}
# Increment auto-heal attempt count (atomic with lock)
# Atomically bump the auto-heal attempt counter for a primary/standby
# pair. The exclusive flock makes the read-modify-write safe across
# parallel jobs.
# Hardened: the read is split from 'local' (SC2155), and a corrupted /
# non-numeric counter file is reset to 0 instead of aborting the
# arithmetic expansion.
increment_heal_attempts() {
  local primary="$1"
  local standby="$2"
  local count_file="$AUTO_HEAL_COUNT_DIR/${primary}_${standby}.count"
  {
    flock -x 200
    local current
    current=$(cat "$count_file" 2>/dev/null || echo 0)
    [[ "$current" =~ ^[0-9]+$ ]] || current=0
    echo $((current + 1)) > "$count_file"
  } 200>"${count_file}.lock"
}
# Reset auto-heal attempt count
# Clear the auto-heal attempt counter (and its lock file) for a pair,
# typically after the standby has returned to a healthy state.
reset_heal_attempts() {
  local primary="$1"
  local standby="$2"
  local count_file="$AUTO_HEAL_COUNT_DIR/${primary}_${standby}.count"
  rm -f -- "$count_file" "${count_file}.lock"
}
# Check if standby is excluded from auto-heal
# Return 0 (true) when the given standby name appears in the
# AUTO_HEAL_EXCLUDE array, 1 otherwise.
is_excluded() {
  local candidate="$1"
  local entry
  for entry in "${AUTO_HEAL_EXCLUDE[@]}"; do
    [ "$candidate" = "$entry" ] && return 0
  done
  return 1
}
# Auto-heal function with per-standby cooldown, retry limits, and exclusions
# Attempt automated remediation for one standby based on the current probe
# results. Guards, in order: feature toggles, exclusion list, database
# role (only heals when the source is PRIMARY), per-pair retry budget, and
# a per-pair time cooldown. All actions go through the nested run_fix
# helper, which honors AUTO_HEAL_DRY_RUN.
# Args: 1=primary 2=standby 3=status error 4=apply rate 5=apply lag (sec)
#       6=transport-disconnected flag 7=config status 8=db role 9=overall status
auto_heal() {
[ "$ENABLE_AUTO_HEAL" != true ] && return
[ "$GLOBAL_DRY_RUN" = true ] && { log_autoheal "[DRY-RUN] Global dry-run active, skipping auto-heal"; return; }
local PRIMARY="$1"
local STBY="$2"
local STATUS_ERR="$3"
local APPLY_RATE="$4"
local APPLY_SECONDS="$5"
local TRANSPORT_DISCONNECTED="$6"
local CONFIG_STATUS="$7"
local DB_ROLE="$8"
local OVERALL_STATUS="$9"
# Check if standby is excluded
if is_excluded "$STBY"; then
log_autoheal "[EXCLUDED] $PRIMARY -> $STBY is in exclusion list, skipping"
return
fi
# Check role - only heal from PRIMARY
if [[ "$DB_ROLE" != "PRIMARY" ]]; then
log_autoheal "[SKIP] $PRIMARY is not PRIMARY (role=$DB_ROLE), skipping auto-heal"
return
fi
# Check max attempts - only if not OK (a healthy pair is allowed through
# so the counter-reset at the bottom can run)
local attempts=$(get_heal_attempts "$PRIMARY" "$STBY")
if [ "$OVERALL_STATUS" != "OK" ] && [ "$attempts" -ge "$MAX_AUTO_HEAL_ATTEMPTS" ]; then
log_autoheal "[MAX_ATTEMPTS] $PRIMARY -> $STBY has reached max attempts ($MAX_AUTO_HEAL_ATTEMPTS), skipping"
return
fi
# Per-standby cooldown lock. The subshell exits 1 while the cooldown is
# still active, 0 after stamping a fresh timestamp; the flock makes the
# check-and-set atomic across parallel workers.
local LOCK_FILE="/var/tmp/dataguard_autoheal_${PRIMARY}_${STBY}.lock"
(
flock -x 200
local now=$(date +%s)
if [ -f "$LOCK_FILE" ]; then
local last=$(cat "$LOCK_FILE")
if (( now - last < AUTO_HEAL_COOLDOWN )); then
exit 1
fi
fi
echo "$now" > "$LOCK_FILE"
exit 0
) 200>"${LOCK_FILE}.lck"
# $? here is the subshell's exit code from the block above.
if [ $? -eq 1 ]; then
log_autoheal "[COOLDOWN] Active for $PRIMARY -> $STBY, skipping"
return
fi
log_autoheal "[EVALUATE] $PRIMARY -> $STBY | Status: $STATUS_ERR | Rate: $APPLY_RATE | Lag: ${APPLY_SECONDS}s | Config: $CONFIG_STATUS | Attempt: $((attempts+1))/$MAX_AUTO_HEAL_ATTEMPTS"
# Nested helper: run (or dry-run) one dgmgrl fix command. Captures
# $PRIMARY and $AUTO_HEAL_LOG from the enclosing scope.
run_fix() {
local cmd="$1"
local description="$2"
if [ "$AUTO_HEAL_DRY_RUN" = true ]; then
log_autoheal "[DRY-RUN] $description: $cmd"
return 0
fi
log_autoheal "[EXEC] $description: $cmd"
run_dgmgrl "$PRIMARY" "$cmd" >> "$AUTO_HEAL_LOG" 2>&1
local exit_code=$?
if [ $exit_code -eq 0 ]; then
log_autoheal "[SUCCESS] $description completed"
return 0
else
log_autoheal "[FAILURE] $description failed (exit: $exit_code)"
return 1
fi
}
local fix_applied=0
# CASE 1: MRP NOT RUNNING
if [[ "$STATUS_ERR" == "MRP not running" ]]; then
log_autoheal "[ACTION] MRP not running - restarting apply"
run_fix "EDIT DATABASE '$STBY' SET STATE='APPLY-ON';" "Restart MRP on $STBY"
fix_applied=1
fi
# CASE 2: APPLY STALLED (fixed regex)
# Apply rate of "0"/"0.00 ..." combined with a real (known) lag > 5 min.
if [[ "$APPLY_RATE" =~ ^0(\.0+)?[[:space:]] ]] && [ "$APPLY_SECONDS" -gt 300 ] && [ "$APPLY_SECONDS" -ne -1 ]; then
log_autoheal "[ACTION] Apply stalled - restarting apply (stop then start)"
run_fix "EDIT DATABASE '$STBY' SET STATE='APPLY-OFF';" "Stop apply on $STBY"
sleep 5
run_fix "EDIT DATABASE '$STBY' SET STATE='APPLY-ON';" "Start apply on $STBY"
fix_applied=1
fi
# CASE 3: TRANSPORT DISCONNECTED (only when the configuration itself is
# healthy, so we don't fight a broker-level problem)
if [ "$TRANSPORT_DISCONNECTED" -eq 1 ] && [[ "$CONFIG_STATUS" == "SUCCESS" ]]; then
log_autoheal "[ACTION] Transport disconnected - re-enabling"
run_fix "EDIT DATABASE '$PRIMARY' SET STATE='TRANSPORT-ON';" "Enable transport on $PRIMARY"
fix_applied=1
fi
# CASE 4: CONFIG DISABLED
if [[ "$CONFIG_STATUS" == "DISABLED" ]] || [[ "$CONFIG_STATUS" == "disabled" ]]; then
log_autoheal "[ACTION] Configuration disabled - enabling"
run_fix "ENABLE CONFIGURATION;" "Enable Data Guard configuration"
fix_applied=1
fi
# Update attempt counter based on overall status: a healthy pair resets
# the retry budget; otherwise only count runs that actually did something.
if [ "$OVERALL_STATUS" = "OK" ]; then
reset_heal_attempts "$PRIMARY" "$STBY"
elif [ $fix_applied -eq 1 ]; then
increment_heal_attempts "$PRIMARY" "$STBY"
fi
}
# =========================================================================
# EMAIL ALERT FUNCTION
# =========================================================================
# Send (or queue) an alert email for one database.
# Enforces a per-database cooldown of ALERT_COOLDOWN seconds via an
# atomic flock'd check-and-set; honors GLOBAL_DRY_RUN and the email
# toggle. Falls back to writing the body under $REPORT_DIR when no
# mailer is installed.
# Args: $1 = subject, $2 = path to a file holding the body, $3 = db name
send_alert_email() {
[ "$ENABLE_EMAIL_ALERTS" != true ] && {
log_message "Email alerts disabled, skipping"
return
}
[ "$GLOBAL_DRY_RUN" = true ] && { log_message "[DRY-RUN] Global dry-run active, skipping email"; return; }
local subject="$1"
local body_file="$2"
local db="$3"
# Per-database cooldown check with per-database lock. The subshell exits
# 1 while within the cooldown window, 0 after stamping a new timestamp.
local cooldown_file="${ALERT_COOLDOWN_BASE}_${db}.lock"
local lock_file="${ALERT_LOCK_BASE}_${db}.lock"
(
flock -x 200
if [ -f "$cooldown_file" ]; then
local last=$(cat "$cooldown_file")
local now=$(date +%s)
if (( now - last < ALERT_COOLDOWN )); then
exit 1
fi
fi
date +%s > "$cooldown_file"
exit 0
) 200>"$lock_file"
[ $? -eq 1 ] && { log_message "Alert suppressed for $db (cooldown)"; return; }
# Assemble the body with literal "\n" sequences; they are rendered into
# newlines by the 'echo -e' below.
local email_body="Data Guard Alert Report\n"
email_body+="Host: $(hostname)\n"
email_body+="Database: $db\n"
email_body+="Time: $(date)\n"
email_body+="================================\n\n"
email_body+="$(cat "$body_file")\n"
# Prefer mailx (supports the -r From header); fall back to mail; as a
# last resort persist the alert to disk for manual follow-up.
if command -v mailx >/dev/null 2>&1; then
echo -e "$email_body" | mailx -s "$subject" -r "$MAIL_FROM" "$MAIL_TO"
log_message "Alert email sent via mailx for $db"
elif command -v mail >/dev/null 2>&1; then
echo -e "$email_body" | mail -s "$subject" "$MAIL_TO"
log_message "Alert email sent via mail for $db"
else
log_message "WARNING: mail utility not found"
echo "$email_body" > "$REPORT_DIR/alert_manual_${db}_$(date +%Y%m%d_%H%M%S).txt"
fi
}
# =========================================================================
# WALLET VALIDATION
# =========================================================================
# Pre-flight connectivity check: run "show configuration" against every
# configured primary and prune unreachable ones from DB_LIST so the main
# loop only processes databases the broker actually answers for.
# Returns 1 (fatal) only when no database at all is reachable.
validate_wallet() {
  [ "$ENABLE_MONITORING" != true ] && return 0
  log_message "Validating connectivity..."
  local unreachable=()
  local db
  for db in "${DB_LIST[@]}"; do
    if run_dgmgrl "$db" "show configuration;" >/dev/null 2>&1; then
      continue
    fi
    log_message "WARNING: Connection validation failed for $db (will skip)"
    unreachable+=("$db")
  done
  if [ ${#unreachable[@]} -eq ${#DB_LIST[@]} ]; then
    log_message "ERROR: All databases failed validation. Exiting."
    return 1
  fi
  # Rebuild DB_LIST keeping only the databases that answered.
  local reachable=()
  local bad
  for db in "${DB_LIST[@]}"; do
    local keep=1
    for bad in "${unreachable[@]}"; do
      if [ "$db" = "$bad" ]; then
        keep=0
        break
      fi
    done
    [ $keep -eq 1 ] && reachable+=("$db")
  done
  DB_LIST=("${reachable[@]}")
  log_message "Connection validation successful for ${#DB_LIST[@]} databases"
  return 0
}
# =========================================================================
# ROTATE HISTORICAL FILE
# =========================================================================
# Trim the shared history file to its newest MAX_HISTORY_LINES lines so it
# cannot grow without bound. Rewrites via a temp file + mv so readers
# never see a half-written file.
# Hardened: the line count is assigned separately from 'local' so a
# wc/redirection failure is not masked (ShellCheck SC2155), and all test
# operands are quoted.
rotate_historical_file() {
  [ "$ENABLE_HISTORICAL_DATA" != true ] && return
  if [ ! -f "$HISTORICAL_DATA_FILE" ]; then
    return 0
  fi
  local lines
  lines=$(wc -l < "$HISTORICAL_DATA_FILE")
  if [ "$lines" -gt "$MAX_HISTORY_LINES" ]; then
    log_message "Rotating historical file (${lines} lines, max ${MAX_HISTORY_LINES})"
    tail -n "$MAX_HISTORY_LINES" "$HISTORICAL_DATA_FILE" > "${HISTORICAL_DATA_FILE}.tmp"
    mv "${HISTORICAL_DATA_FILE}.tmp" "$HISTORICAL_DATA_FILE"
  fi
}
# =========================================================================
# MAIN DATABASE PROCESSING
# =========================================================================
process_database() {
local DB="$1"
local TMPFILE_LOCAL=""
local tmppath
tmppath=$(mktemp)
trap "rm -f '$tmppath'" RETURN
TMPFILE_LOCAL="$tmppath"
local PRIMARY_DB=""
local DB_ROLE=""
local STANDBY_ARRAY=()
local CONNECTION_ERROR=0
local CONFIG_STATUS=""
local FSFO_ENABLED=0
log_message "Processing database: $DB"
# Get configuration
run_dgmgrl "$DB" "show configuration;" > "$TMPFILE_LOCAL" 2>&1
local dgmgrl_exit=$?
if [ $dgmgrl_exit -eq 124 ] || [ $dgmgrl_exit -eq 137 ]; then
log_message "ERROR: Timeout connecting to $DB"
CONNECTION_ERROR=1
fi
CONFIG_STATUS=$(grep -i "Configuration Status" "$TMPFILE_LOCAL" | awk -F": " '{print $2}' | xargs)
CONFIG_STATUS=${CONFIG_STATUS:-"Unknown"}
write_prometheus "dataguard_configuration_status{primary=\"$DB\"} $(get_numeric_config_status "$CONFIG_STATUS")"
# Extract database role
DB_ROLE=$(grep -i "Database Role" "$TMPFILE_LOCAL" | awk -F": " '{print $2}' | xargs)
DB_ROLE=${DB_ROLE:-"UNKNOWN"}
# Detect FSFO (Fast-Start Failover) - once per database
grep -qi "Fast-Start Failover: ENABLED" "$TMPFILE_LOCAL" && FSFO_ENABLED=1
# Check for errors
if [ $CONNECTION_ERROR -eq 0 ] && grep -E "ORA-|DGM-[0-9]{5}|Error:" "$TMPFILE_LOCAL" | grep -qv "ORA-16809" 2>/dev/null; then
local error_msg=$(grep -E "ORA-|DGM-[0-9]{5}|Error:" "$TMPFILE_LOCAL" | grep -v "ORA-16809" | head -1)
log_message "WARNING: DGMGRL error on $DB: $error_msg"
CONNECTION_ERROR=1
fi
# Handle connection failure
if [ $CONNECTION_ERROR -eq 1 ]; then
local colored_status="${RED}INFRA_ERROR${NC}"
local output_line=$(printf "%-15s %-20s %-15s %-15s %-20s %-20s %-10s %-15s ${RED}%s${NC} %-10s %-20s\n" \
"$DB" "N/A" "ERROR" "ERROR" "DGMGRL connection failed" "Unknown" "Unknown" "$CONFIG_STATUS" "INFRA_ERROR" "UNKNOWN" "$(date '+%Y-%m-%d %H:%M:%S')")
local raw_line="$DB|N/A|-1|-1|DGMGRL connection failed|Unknown|Unknown|$CONFIG_STATUS|INFRA_ERROR|UNKNOWN|$(date '+%Y-%m-%d %H:%M:%S')"
local csv_line="\"$(escape_csv "$DB")\",\"N/A\",\"ERROR\",\"ERROR\",\"DGMGRL connection failed\",\"Unknown\",\"Unknown\",\"$(escape_csv "$CONFIG_STATUS")\",\"INFRA_ERROR\",\"UNKNOWN\",\"$(date '+%Y-%m-%d %H:%M:%S')\""
write_output "$output_line" "$output_line" "$raw_line" "$csv_line"
return 1
fi
# Extract primary database
PRIMARY_DB=$(grep -i "primary database" "$TMPFILE_LOCAL" | awk -F'"' '{print $2}' | xargs)
[ -z "$PRIMARY_DB" ] && PRIMARY_DB=$(grep -i "primary database" "$TMPFILE_LOCAL" | awk '{print $NF}' | tr -d '"')
local PROM_PRIMARY="$DB"
local DISPLAY_PRIMARY="${PRIMARY_DB:-$DB} ($DB)"
local DISPLAY_PRIMARY_ESCAPED=$(html_escape "$DISPLAY_PRIMARY")
# Extract standbys
mapfile -t STANDBY_ARRAY < <(grep "Physical standby database" "$TMPFILE_LOCAL" | awk -F'"' '{print $2}')
if [ ${#STANDBY_ARRAY[@]} -eq 0 ]; then
mapfile -t STANDBY_ARRAY < <(grep "Physical standby database" "$TMPFILE_LOCAL" | awk '{print $NF}' | tr -d '"')
fi
if [ ${#STANDBY_ARRAY[@]} -eq 0 ]; then
log_message "INFO: No standbys found for ${PRIMARY_DB:-$DB}"
return 0
fi
# Process each standby
for STBY in "${STANDBY_ARRAY[@]}"; do
: > "$TMPFILE_LOCAL"
# Combined commands - single connection per standby for efficiency
run_dgmgrl "$DB" "
show database verbose '$STBY';
validate database '$STBY';
" > "$TMPFILE_LOCAL" 2>&1
local combined_exit=$?
# Timeout handling
if [ $combined_exit -eq 124 ] || [ $combined_exit -eq 137 ]; then
log_message "ERROR: Timeout processing standby $STBY on $PRIMARY_DB"
local colored_status="${RED}INFRA_ERROR${NC}"
local output_line=$(printf "%-15s %-20s %-15s %-15s %-20s %-20s %-10s %-15s ${RED}%s${NC} %-10s %-20s\n" \
"$DISPLAY_PRIMARY" "$STBY" "TIMEOUT" "TIMEOUT" "Command timeout" "Unknown" "Unknown" "$CONFIG_STATUS" "INFRA_ERROR" "UNKNOWN" "$(date '+%Y-%m-%d %H:%M:%S')")
local raw_line="${PRIMARY_DB:-$DB}|$STBY|-1|-1|Command timeout|Unknown|Unknown|$CONFIG_STATUS|INFRA_ERROR|UNKNOWN|$(date '+%Y-%m-%d %H:%M:%S')"
local csv_line="\"$(escape_csv "${PRIMARY_DB:-$DB}")\",\"$(escape_csv "$STBY")\",\"TIMEOUT\",\"TIMEOUT\",\"Command timeout\",\"Unknown\",\"Unknown\",\"$(escape_csv "$CONFIG_STATUS")\",\"INFRA_ERROR\",\"UNKNOWN\",\"$(date '+%Y-%m-%d %H:%M:%S')\""
write_output "$output_line" "$output_line" "$raw_line" "$csv_line"
write_prometheus "dataguard_transport_lag_seconds{primary=\"$PROM_PRIMARY\",standby=\"$STBY\"} -1
dataguard_apply_lag_seconds{primary=\"$PROM_PRIMARY\",standby=\"$STBY\"} -1
dataguard_status{primary=\"$PROM_PRIMARY\",standby=\"$STBY\",status=\"INFRA_ERROR\",config=\"$CONFIG_STATUS\"} 0"
continue
fi
# Check output presence
local show_output_present=0
local validate_output_present=0
grep -q "Database Name:" "$TMPFILE_LOCAL" 2>/dev/null && show_output_present=1
grep -q "Transport Lag:" "$TMPFILE_LOCAL" 2>/dev/null && show_output_present=1
grep -q "Intended State:" "$TMPFILE_LOCAL" 2>/dev/null && show_output_present=1
grep -q "Ready for Switchover:" "$TMPFILE_LOCAL" 2>/dev/null && validate_output_present=1
grep -q "Ready for Failover:" "$TMPFILE_LOCAL" 2>/dev/null && validate_output_present=1
# Parse values
local TRANSPORT_LAG_RAW=""
local APPLY_LAG_RAW=""
local STATUS_ERR=""
local INTENDED_STATE=""
local ENABLED=""
local SWITCHOVER=""
local APPLY_RATE=""
local TRANSPORT_DISCONNECTED=0
if [ $show_output_present -eq 1 ]; then
TRANSPORT_LAG_RAW=$(awk -F": " '/Transport Lag/ {print $2}' "$TMPFILE_LOCAL" | xargs)
APPLY_LAG_RAW=$(awk -F": " '/Apply Lag/ {print $2}' "$TMPFILE_LOCAL" | xargs)
STATUS_ERR=$(awk -F": " '/Status:|Error:/ {print $2}' "$TMPFILE_LOCAL" | grep -v "SUCCESS" | head -1 | xargs)
INTENDED_STATE=$(awk -F": " '/Intended State/ {print $2}' "$TMPFILE_LOCAL" | xargs)
ENABLED=$(awk -F": " '/^Enabled:/ {print $2}' "$TMPFILE_LOCAL" | head -1 | xargs)
APPLY_RATE=$(awk -F": " '/Apply Rate/ {print $2}' "$TMPFILE_LOCAL" | xargs)
# Check MRP
if grep -qi "Apply Instance.*not running" "$TMPFILE_LOCAL"; then
STATUS_ERR="MRP not running"
log_message "WARNING: MRP not running for $STBY"
fi
# Check transport
if grep -qi "DISCONNECTED" "$TMPFILE_LOCAL"; then
TRANSPORT_DISCONNECTED=1
log_message "WARNING: Transport disconnected for $STBY"
fi
else
STATUS_ERR="Show command failed"
fi
if [ $validate_output_present -eq 1 ]; then
SWITCHOVER=$(awk -F": " '/Ready for Switchover/ {print $2}' "$TMPFILE_LOCAL" | xargs)
fi
# Parse lag
local TRANSPORT_LAG=$(parse_lag "$TRANSPORT_LAG_RAW")
local APPLY_LAG=$(parse_lag "$APPLY_LAG_RAW")
local transport_seconds=$(lag_to_seconds "$TRANSPORT_LAG")
local apply_seconds=$(lag_to_seconds "$APPLY_LAG")
# Set defaults
TRANSPORT_LAG=${TRANSPORT_LAG:-"N/A"}
APPLY_LAG=${APPLY_LAG:-"N/A"}
STATUS_ERR=${STATUS_ERR:-"None"}
ENABLED=${ENABLED:-"Unknown"}
SWITCHOVER=${SWITCHOVER:-"Unknown"}
# Validate failures
if [ $validate_output_present -eq 0 ] && [ "$STATUS_ERR" = "None" ]; then
STATUS_ERR="Validate command failed"
fi
# Check thresholds
local transport_status=$(check_lag_threshold $transport_seconds)
local apply_status=$(check_lag_threshold $apply_seconds)
# Apply stall detection (fixed regex)
if [[ "$APPLY_RATE" =~ ^0(\.0+)?[[:space:]] ]] && [ "$apply_seconds" -gt 60 ] && [ "$apply_seconds" -ne -1 ]; then
STATUS_ERR="Apply stalled"
apply_status="CRITICAL"
log_message "WARNING: Apply stalled for $STBY (rate=$APPLY_RATE, lag=${apply_seconds}s)"
fi
# Override critical conditions
[ "$STATUS_ERR" = "MRP not running" ] && apply_status="CRITICAL"
[ $TRANSPORT_DISCONNECTED -eq 1 ] && transport_status="CRITICAL"
# Detect trend
local TREND=$(detect_trend "${PRIMARY_DB:-$DB}" "$STBY" "$transport_seconds")
# Overall status
local OVERALL_STATUS=$(get_overall_status "$STATUS_ERR" "$transport_status" "$apply_status" "$CONFIG_STATUS")
local NUMERIC_STATUS=$(get_numeric_status "$OVERALL_STATUS")
# Auto-heal hook (skip if FSFO is enabled)
if [ $FSFO_ENABLED -eq 0 ]; then
auto_heal "${PRIMARY_DB:-$DB}" "$STBY" "$STATUS_ERR" "$APPLY_RATE" "$apply_seconds" "$TRANSPORT_DISCONNECTED" "$CONFIG_STATUS" "$DB_ROLE" "$OVERALL_STATUS"
elif [ $FSFO_ENABLED -eq 1 ]; then
log_message "INFO: FSFO enabled for $PRIMARY_DB, auto-heal disabled for this configuration"
fi
# HTML classes
local row_class="ok"
case "$OVERALL_STATUS" in
"CRITICAL"|"ERROR"|"INFRA_ERROR") row_class="crit" ;;
"WARNING") row_class="warn" ;;
esac
local trend_class=""
case "$TREND" in
"INCREASING") trend_class="trend-up" ;;
"DECREASING") trend_class="trend-down" ;;
*) trend_class="trend-stable" ;;
esac
# Escape HTML
local STBY_ESCAPED=$(html_escape "$STBY")
local STATUS_ERR_ESCAPED=$(html_escape "$STATUS_ERR")
local SWITCHOVER_ESCAPED=$(html_escape "$SWITCHOVER")
local ENABLED_ESCAPED=$(html_escape "$ENABLED")
local CONFIG_STATUS_ESCAPED=$(html_escape "$CONFIG_STATUS")
# Colored output
local colored_status=$(get_colored_status "$OVERALL_STATUS")
local colored_trend=$(get_colored_trend "$TREND")
# Output
local output_line=$(printf "%-15s %-20s %-15s %-15s %-20s %-20s %-10s %-15s ${colored_status} ${colored_trend} %-20s\n" \
"$DISPLAY_PRIMARY" "$STBY" "$TRANSPORT_LAG" "$APPLY_LAG" "$STATUS_ERR" "$SWITCHOVER" "$ENABLED" "$CONFIG_STATUS" "$(date '+%Y-%m-%d %H:%M:%S')")
local raw_line="${PRIMARY_DB:-$DB}|$STBY|$transport_seconds|$apply_seconds|$STATUS_ERR|$SWITCHOVER|$ENABLED|$CONFIG_STATUS|$OVERALL_STATUS|$TREND|$(date '+%Y-%m-%d %H:%M:%S')"
local csv_line="\"$(escape_csv "${PRIMARY_DB:-$DB}")\",\"$(escape_csv "$STBY")\",\"$(escape_csv "$TRANSPORT_LAG")\",\"$(escape_csv "$APPLY_LAG")\",\"$(escape_csv "$STATUS_ERR")\",\"$(escape_csv "$SWITCHOVER")\",\"$(escape_csv "$ENABLED")\",\"$(escape_csv "$CONFIG_STATUS")\",\"$(escape_csv "$OVERALL_STATUS")\",\"$(escape_csv "$TREND")\",\"$(date '+%Y-%m-%d %H:%M:%S')\""
write_output "$output_line" "$output_line" "$raw_line" "$csv_line"
write_prometheus "dataguard_transport_lag_seconds{primary=\"$PROM_PRIMARY\",standby=\"$STBY\"} $transport_seconds
dataguard_apply_lag_seconds{primary=\"$PROM_PRIMARY\",standby=\"$STBY\"} $apply_seconds
dataguard_status{primary=\"$PROM_PRIMARY\",standby=\"$STBY\",status=\"$OVERALL_STATUS\",config=\"$CONFIG_STATUS\"} $NUMERIC_STATUS"
local html_row="<tr class='$row_class'>
<td>${DISPLAY_PRIMARY_ESCAPED}</td>
<td>${STBY_ESCAPED}</td>
<td>$TRANSPORT_LAG</td>
<td>$APPLY_LAG</td>
<td>${STATUS_ERR_ESCAPED}</td>
<td>${SWITCHOVER_ESCAPED}</td>
<td>${ENABLED_ESCAPED}</td>
<td>${CONFIG_STATUS_ESCAPED}</td>
<td>$OVERALL_STATUS</td>
<td class=\"$trend_class\">$TREND</td>
<td>$(date '+%Y-%m-%d %H:%M:%S')</td>
</tr>"
write_html_row "$html_row"
# Log issues
if [ "$OVERALL_STATUS" = "CRITICAL" ] || [ "$OVERALL_STATUS" = "ERROR" ]; then
log_message "CRITICAL: $OVERALL_STATUS for $PRIMARY_DB -> $STBY: $STATUS_ERR (transport: $TRANSPORT_LAG, apply: $APPLY_LAG)"
elif [ "$OVERALL_STATUS" = "WARNING" ]; then
log_message "WARNING: $OVERALL_STATUS for $PRIMARY_DB -> $STBY: transport=$TRANSPORT_LAG apply=$APPLY_LAG"
elif [ "$OVERALL_STATUS" = "INFRA_ERROR" ]; then
log_message "INFRA_ERROR: Command issue for $PRIMARY_DB -> $STBY: $STATUS_ERR"
fi
done
}
# =========================================================================
# HTML REPORT FUNCTIONS
# =========================================================================
init_html_report() {
# Write the static opening of the HTML dashboard (doctype, CSS, summary
# panel, and the open <table>/<tbody>) to $HTML_REPORT, truncating any
# previous report. Rows are appended later via write_html_row and the
# document is closed by finalize_html_report.
# No-op unless ENABLE_HTML_REPORT=true.
[ "$ENABLE_HTML_REPORT" != true ] && return
# Unquoted heredoc delimiter: $(date), $(hostname) and the config
# variables below are expanded at generation time, not at view time.
# The "exec-time" span is a placeholder filled in by finalize_html_report.
cat <<EOF > "$HTML_REPORT"
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Data Guard Report - $(date)</title>
<style>
body { font-family: 'Segoe UI', Arial, sans-serif; margin: 20px; background: #f5f5f5; }
h1, h2 { color: #333; }
table { border-collapse: collapse; width: 100%; background: white; box-shadow: 0 1px 3px rgba(0,0,0,0.2); }
th, td { border: 1px solid #ddd; padding: 12px; text-align: left; }
th { background: #4CAF50; color: white; }
tr:hover { background: #f5f5f5; }
.ok { background: #d4edda; }
.warn { background: #fff3cd; }
.crit { background: #f8d7da; }
.trend-up { color: #dc3545; font-weight: bold; }
.trend-down { color: #28a745; font-weight: bold; }
.trend-stable { color: #6c757d; }
.footer { margin-top: 20px; font-size: 12px; color: #666; text-align: center; }
.summary { background: white; padding: 15px; margin-bottom: 20px; border-left: 4px solid #4CAF50; }
</style>
</head>
<body>
<h1>📊 Oracle Data Guard Status Report</h1>
<div class="summary">
<strong>Report Time:</strong> $(date)<br>
<strong>Host:</strong> $(hostname)<br>
<strong>Connection Method:</strong> $CONNECTION_METHOD<br>
<strong>Databases Monitored:</strong> ${#DB_LIST[@]}<br>
<strong>Parallel Jobs:</strong> $MAX_PARALLEL_JOBS<br>
<strong>Email Alerts:</strong> $( [ "$ENABLE_EMAIL_ALERTS" = true ] && echo "ENABLED" || echo "DISABLED" )<br>
<strong>Auto-Heal:</strong> $( [ "$ENABLE_AUTO_HEAL" = true ] && echo "ENABLED (Dry-run: $AUTO_HEAL_DRY_RUN)" || echo "DISABLED" )<br>
<strong>Global Dry-Run:</strong> $GLOBAL_DRY_RUN<br>
<strong>Execution Time:</strong> <span id="exec-time">Calculating...</span>
</div>
<h2>📋 Standby Database Status</h2>
<table>
<thead>
<tr>
<th>Primary</th><th>Standby</th><th>Transport Lag</th><th>Apply Lag</th>
<th>Error</th><th>Switchover</th><th>Enabled</th><th>Config</th><th>Status</th><th>Trend</th><th>Timestamp</th>
</tr>
</thead>
<tbody>
EOF
}
finalize_html_report() {
# Close the HTML dashboard opened by init_html_report: end the table,
# append the legend/footer, and fill the "exec-time" placeholder via a
# small inline script. No-op unless ENABLE_HTML_REPORT=true.
[ "$ENABLE_HTML_REPORT" != true ] && return
# Wall-clock runtime since the script-global START_TIME (epoch seconds).
# NOTE(review): DURATION is deliberately not 'local' here; main() computes
# its own DURATION later, so the global leak is harmless but untidy.
DURATION=$(( $(date +%s) - START_TIME ))
cat <<EOF >> "$HTML_REPORT"
</tbody>
</table>
<div class="footer">
<strong>Legend:</strong>
<span style="background:#d4edda; padding:2px 8px;">✓ OK</span>
<span style="background:#fff3cd; padding:2px 8px;">⚠ Warning</span>
<span style="background:#f8d7da; padding:2px 8px;">🔴 Critical</span>
Trend: <span class="trend-up">▲ Increasing</span> <span class="trend-down">▼ Decreasing</span> <span class="trend-stable">● Stable</span>
<br><br>
Generated by Data Guard Monitoring System | $(date) | Execution Time: ${DURATION}s
</div>
<script>document.getElementById('exec-time').innerText = '${DURATION}s';</script>
</body>
</html>
EOF
}
# =========================================================================
# SUMMARY GENERATION
# =========================================================================
generate_summary() {
  # Append a summary section to $REPORT_FILE (configuration recap, per-primary
  # standby counts, worst-status health rollup, and any detected issues), then
  # send an email alert if any standby is CRITICAL/ERROR/INFRA_ERROR.
  # Globals read: REPORT_DIR, REPORT_FILE, RAW_DATA_FILE, DB_LIST, START_TIME,
  #              OUTPUT_LOCKFILE, plus the ENABLE_* / config display variables.
  # Side effects: appends to $REPORT_FILE, may create an alert file and call
  #              send_alert_email / log_message (defined elsewhere in this file).
  local alert_file="$REPORT_DIR/alert_$(date +%s).txt"
  local alert_flag=0
  local issues_found=0
  {
    # Serialize report writes against the parallel worker jobs.
    flock -x 200
    echo -e "\n=== SUMMARY ===" >> "$REPORT_FILE"
    echo "Report generated: $(date)" >> "$REPORT_FILE"
    echo "Report file: $REPORT_FILE" >> "$REPORT_FILE"
    [ "$ENABLE_HTML_REPORT" = true ] && echo "HTML report: $HTML_REPORT" >> "$REPORT_FILE"
    [ "$ENABLE_CSV_EXPORT" = true ] && echo "CSV export: $CSV_FILE" >> "$REPORT_FILE"
    echo "Raw data file: $RAW_DATA_FILE" >> "$REPORT_FILE"
    echo "Historical data file: $HISTORICAL_DATA_FILE" >> "$REPORT_FILE"
    echo "Prometheus metrics: $PROM_FILE" >> "$REPORT_FILE"
    echo "Log file: $LOG_FILE" >> "$REPORT_FILE"
    echo "Auto-heal log: $AUTO_HEAL_LOG" >> "$REPORT_FILE"
    echo "" >> "$REPORT_FILE"
    echo "Configuration Summary:" >> "$REPORT_FILE"
    echo " Connection Method: $CONNECTION_METHOD" >> "$REPORT_FILE"
    echo " Parallel Jobs: $MAX_PARALLEL_JOBS" >> "$REPORT_FILE"
    echo " Email Alerts: $( [ "$ENABLE_EMAIL_ALERTS" = true ] && echo "ENABLED" || echo "DISABLED" )" >> "$REPORT_FILE"
    echo " Auto-Heal: $( [ "$ENABLE_AUTO_HEAL" = true ] && echo "ENABLED (Dry-run: $AUTO_HEAL_DRY_RUN)" || echo "DISABLED" )" >> "$REPORT_FILE"
    echo " Historical Data: $( [ "$ENABLE_HISTORICAL_DATA" = true ] && echo "ENABLED" || echo "DISABLED" )" >> "$REPORT_FILE"
    echo " Global Dry-Run: $GLOBAL_DRY_RUN" >> "$REPORT_FILE"
    if [ -f "$RAW_DATA_FILE" ]; then
      # BUGFIX: $RAW_DATA_FILE starts with a pipe-delimited header line
      # (written by main), so skip NR==1 everywhere below. The old code
      # counted the header as a standby and printed a bogus
      # "PrimaryDB: OverallStatus" entry in the health summary.
      local total_standbys
      total_standbys=$(awk 'NR>1{n++} END{print n+0}' "$RAW_DATA_FILE")
      echo "Total standbys processed: $total_standbys" >> "$REPORT_FILE"
      echo "" >> "$REPORT_FILE"
      echo "Breakdown by Primary Database:" >> "$REPORT_FILE"
      for DB in "${DB_LIST[@]}"; do
        local count
        count=$(awk -F'|' -v db="$DB" 'NR>1 && $1==db {count++} END{print count+0}' "$RAW_DATA_FILE")
        [ "$count" -gt 0 ] && echo " $DB: $count standby(s)" >> "$REPORT_FILE"
      done
      # Health summary: worst status observed per primary
      # (CRITICAL > ERROR > WARNING > anything else).
      echo "" >> "$REPORT_FILE"
      echo "Primary Database Health Summary:" >> "$REPORT_FILE"
      awk -F'|' '
      NR==1 { next }  # skip header row
      {
        primary=$1
        status=$9
        if (!(primary in seen)) {
          seen[primary]=status
        } else {
          current=seen[primary]
          if (status == "CRITICAL") {
            seen[primary]="CRITICAL"
          } else if (status == "ERROR" && current != "CRITICAL") {
            seen[primary]="ERROR"
          } else if (status == "WARNING" && current != "CRITICAL" && current != "ERROR") {
            seen[primary]="WARNING"
          }
        }
      }
      END {
        for (p in seen) {
          printf(" %s: %s\n", p, seen[p])
        }
      }' "$RAW_DATA_FILE" >> "$REPORT_FILE"
      # Issue detection: any non-header row whose overall status field
      # matches a failure state (ERROR also matches INFRA_ERROR; harmless).
      local issue_lines
      issue_lines=$(awk -F'|' 'NR>1 && $9 ~ /CRITICAL|ERROR|INFRA_ERROR/' "$RAW_DATA_FILE")
      if [ -n "$issue_lines" ]; then
        echo "" >> "$REPORT_FILE"
        echo "Issues Detected:" >> "$REPORT_FILE"
        echo "$issue_lines" >> "$REPORT_FILE"
        echo "$issue_lines" > "$alert_file"
        alert_flag=1
        issues_found=1
      else
        echo "" >> "$REPORT_FILE"
        echo "No issues detected." >> "$REPORT_FILE"
      fi
    fi
    # Execution time since script start (epoch seconds).
    local end_time duration
    end_time=$(date +%s)
    duration=$((end_time - START_TIME))
    echo "" >> "$REPORT_FILE"
    echo "Execution Time: ${duration} seconds" >> "$REPORT_FILE"
    echo "=========================" >> "$REPORT_FILE"
  } 200>"$OUTPUT_LOCKFILE"
  # Alert outside the lock so email latency doesn't block other writers.
  if [ $alert_flag -eq 1 ]; then
    log_message "Issues detected - triggering alert"
    send_alert_email "Data Guard Alert - Issues Detected" "$alert_file" "ALL_DATABASES"
  else
    log_message "No issues detected"
  fi
}
# =========================================================================
# MAIN EXECUTION
# =========================================================================
main() {
  # Orchestrate one full monitoring run: honor the master switch, arm a
  # hard-timeout watchdog, initialize report files, fan out one background
  # worker per primary DB (bounded by MAX_PARALLEL_JOBS), then summarize,
  # finalize outputs, and print a completion banner.
  # Check master switch
  if [ "$ENABLE_MONITORING" != true ]; then
    echo "Monitoring is disabled. Set ENABLE_MONITORING=true to enable."
    exit 0
  fi
  # Global timeout watchdog: force-kills the whole script (via MAIN_PID,
  # set elsewhere) if it runs past SCRIPT_TIMEOUT seconds.
  if [ -n "$SCRIPT_TIMEOUT" ] && [ "$SCRIPT_TIMEOUT" -gt 0 ]; then
    (
      sleep "$SCRIPT_TIMEOUT"
      echo "ERROR: Script timeout after ${SCRIPT_TIMEOUT}s" >&2
      kill -9 "$MAIN_PID" 2>/dev/null
    ) &
    WATCHDOG_PID=$!
    # BUGFIX: remove the watchdog from the job table so the
    # 'jobs -rp | wc -l' throttle below doesn't count it — previously it
    # permanently consumed one of the MAX_PARALLEL_JOBS worker slots.
    # 'kill $WATCHDOG_PID' still works after disown.
    disown "$WATCHDOG_PID" 2>/dev/null
  fi
  log_message "===== Data Guard Monitoring Started ====="
  log_message "Connection Method: $CONNECTION_METHOD"
  log_message "Parallel jobs: $MAX_PARALLEL_JOBS"
  log_message "Email alerts: $([ "$ENABLE_EMAIL_ALERTS" = true ] && echo "ENABLED" || echo "DISABLED")"
  log_message "Auto-heal: $([ "$ENABLE_AUTO_HEAL" = true ] && echo "ENABLED (Dry-run: $AUTO_HEAL_DRY_RUN)" || echo "DISABLED")"
  log_message "Global dry-run: $GLOBAL_DRY_RUN"
  log_message "Prometheus metrics: $([ "$ENABLE_PROM_METRICS" = true ] && echo "ENABLED" || echo "DISABLED")"
  log_message "HTML report: $([ "$ENABLE_HTML_REPORT" = true ] && echo "ENABLED" || echo "DISABLED")"
  log_message "CSV export: $([ "$ENABLE_CSV_EXPORT" = true ] && echo "ENABLED" || echo "DISABLED")"
  log_message "Historical data: $([ "$ENABLE_HISTORICAL_DATA" = true ] && echo "ENABLED" || echo "DISABLED")"
  # Rotate historical file
  rotate_historical_file
  # Validate wallet connectivity; abort (and reap the watchdog) on failure.
  if ! validate_wallet; then
    log_message "ERROR: No valid databases. Exiting."
    [ -n "$WATCHDOG_PID" ] && kill -9 "$WATCHDOG_PID" 2>/dev/null
    exit 1
  fi
  # Initialize HTML report
  init_html_report
  # Initialize report files under the shared output lock.
  {
    flock -x 200
    echo "Data Guard Status Report - $(date)" > "$REPORT_FILE"
    echo "========================================" >> "$REPORT_FILE"
    echo "" >> "$REPORT_FILE"
    echo "Configuration:" >> "$REPORT_FILE"
    echo " Connection Method: $CONNECTION_METHOD" >> "$REPORT_FILE"
    echo " Parallel Jobs: $MAX_PARALLEL_JOBS" >> "$REPORT_FILE"
    echo " Email Alerts: $( [ "$ENABLE_EMAIL_ALERTS" = true ] && echo "ENABLED" || echo "DISABLED" )" >> "$REPORT_FILE"
    echo " Auto-Heal: $( [ "$ENABLE_AUTO_HEAL" = true ] && echo "ENABLED (Dry-run: $AUTO_HEAL_DRY_RUN)" || echo "DISABLED" )" >> "$REPORT_FILE"
    echo " Global Dry-Run: $GLOBAL_DRY_RUN" >> "$REPORT_FILE"
    echo " Historical Data: $( [ "$ENABLE_HISTORICAL_DATA" = true ] && echo "ENABLED" || echo "DISABLED" )" >> "$REPORT_FILE"
    echo "" >> "$REPORT_FILE"
    # Header row for the pipe-delimited raw data file (consumers must skip it).
    echo "PrimaryDB|StandbyDB|TransportLagSec|ApplyLagSec|Error|SwitchoverReady|Enabled|ConfigStatus|OverallStatus|Trend|Timestamp" > "$RAW_DATA_FILE"
    # Print header to console
    printf "\n%-15s %-20s %-15s %-15s %-20s %-20s %-10s %-15s %-15s %-10s %-20s\n" \
    "PrimaryDB" "StandbyDB" "TransportLag" "ApplyLag" "Error" "SwitchoverReady" "Enabled" "ConfigStatus" "Status" "Trend" "Date"
    printf "%-15s %-20s %-15s %-15s %-20s %-20s %-10s %-15s %-15s %-10s %-20s\n" \
    "---------------" "--------------------" "---------------" "---------------" "--------------------" "--------------------" "---------" "---------------" "---------" "---------" "--------------------"
    # Write headers to report file
    printf "%-15s %-20s %-15s %-15s %-20s %-20s %-10s %-15s %-15s %-10s %-20s\n" \
    "PrimaryDB" "StandbyDB" "TransportLag" "ApplyLag" "Error" "SwitchoverReady" "Enabled" "ConfigStatus" "Status" "Trend" "Date" >> "$REPORT_FILE"
    printf "%-15s %-20s %-15s %-15s %-20s %-20s %-10s %-15s %-15s %-10s %-20s\n" \
    "---------------" "--------------------" "---------------" "---------------" "--------------------" "--------------------" "---------" "---------------" "---------" "---------" "--------------------" >> "$REPORT_FILE"
  } 200>"$OUTPUT_LOCKFILE"
  # Parallel execution: poll the running-job count so at most
  # MAX_PARALLEL_JOBS process_database workers run at once.
  local pids=()
  for DB in "${DB_LIST[@]}"; do
    while [ "$(jobs -rp | wc -l)" -ge "$MAX_PARALLEL_JOBS" ]; do
      sleep "$PARALLEL_SLEEP_INTERVAL"
    done
    process_database "$DB" &
    pids+=("$!")
  done
  # Barrier: wait for every worker; 'wait' propagates each job's exit status.
  for pid in "${pids[@]}"; do
    wait "$pid" 2>/dev/null || log_message "ERROR: Background job $pid failed"
  done
  # Generate summary
  generate_summary
  # Finalize HTML report
  finalize_html_report
  # Kill watchdog (no longer needed once the run completed normally).
  [ -n "$WATCHDOG_PID" ] && kill -9 "$WATCHDOG_PID" 2>/dev/null
  # Copy Prometheus metrics if node_exporter exists
  if [ "$ENABLE_PROM_METRICS" = true ] && [ -d "/var/lib/node_exporter/textfile_collector" ]; then
    cp "$PROM_FILE" "/var/lib/node_exporter/textfile_collector/dataguard_metrics.prom" 2>/dev/null && \
    log_message "Prometheus metrics copied to node_exporter"
  fi
  DURATION=$(( $(date +%s) - START_TIME ))
  echo -e "\n${GREEN}Report Completed.${NC}"
  echo "Report: $REPORT_FILE"
  [ "$ENABLE_HTML_REPORT" = true ] && echo "HTML: $HTML_REPORT"
  [ "$ENABLE_CSV_EXPORT" = true ] && echo "CSV: $CSV_FILE"
  echo "Raw: $RAW_DATA_FILE"
  [ "$ENABLE_HISTORICAL_DATA" = true ] && echo "History: $HISTORICAL_DATA_FILE"
  [ "$ENABLE_PROM_METRICS" = true ] && echo "Prometheus: $PROM_FILE"
  echo "Log: $LOG_FILE"
  [ "$ENABLE_AUTO_HEAL" = true ] && echo "Auto-heal log: $AUTO_HEAL_LOG"
  echo "Connection Method: $CONNECTION_METHOD"
  echo "Parallel Jobs: $MAX_PARALLEL_JOBS"
  echo "Email: $([ "$ENABLE_EMAIL_ALERTS" = true ] && echo "ENABLED" || echo "DISABLED")"
  echo "Auto-heal: $([ "$ENABLE_AUTO_HEAL" = true ] && echo "ENABLED (Dry-run: $AUTO_HEAL_DRY_RUN)" || echo "DISABLED")"
  echo "Global Dry-Run: $GLOBAL_DRY_RUN"
  echo "Time: ${DURATION}s"
  log_message "===== Data Guard Monitoring Completed ====="
}
# Entry point: run main, forwarding all command-line arguments.
main "$@"