lifecycle diagnostics
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
2026-02-09 14:27:41 +11:00
parent 59b16db04f
commit 6dcbb9caef
8 changed files with 498 additions and 134 deletions

View File

@@ -1672,6 +1672,33 @@ type VmLifecycle struct {
DeletionTime int64
}
// VmLifecycleSourceDiagnostics captures how one lifecycle source contributed to the final lifecycle decision.
type VmLifecycleSourceDiagnostics struct {
Source string
Used bool
Error string
MatchedRows int64
FirstSeen int64
LastSeen int64
CreationTime int64
CreationApprox bool
DeletionRows int64
DeletionMin int64
DeletionMax int64
SelectedDeletionTime int64
StaleDeletionIgnored bool
}
// VmLifecycleDiagnostics captures per-source lifecycle diagnostics for a VM lookup.
type VmLifecycleDiagnostics struct {
LookupField string
LookupValue string
HourlyCache VmLifecycleSourceDiagnostics
LifecycleCache VmLifecycleSourceDiagnostics
SnapshotFallback VmLifecycleSourceDiagnostics
FinalLifecycle VmLifecycle
}
func vmLookupPredicate(vmID, vmUUID, name string) (string, []interface{}, bool) {
vmID = strings.TrimSpace(vmID)
vmUUID = strings.TrimSpace(vmUUID)
@@ -1688,6 +1715,22 @@ func vmLookupPredicate(vmID, vmUUID, name string) (string, []interface{}, bool)
}
}
func vmLookupDescriptor(vmID, vmUUID, name string) (string, string) {
vmID = strings.TrimSpace(vmID)
vmUUID = strings.TrimSpace(vmUUID)
name = strings.TrimSpace(name)
switch {
case vmID != "":
return "vm_id", vmID
case vmUUID != "":
return "vm_uuid", vmUUID
case name != "":
return "name", name
default:
return "", ""
}
}
// FetchVmTrace returns combined hourly snapshot records for a VM (by id/uuid/name) ordered by snapshot time.
// It prefers the shared vm_hourly_stats history table and falls back to per-snapshot tables.
func FetchVmTrace(ctx context.Context, dbConn *sqlx.DB, vmID, vmUUID, name string) ([]VmTraceRow, error) {
@@ -1892,36 +1935,75 @@ WHERE %s
// FetchVmLifecycle walks VM history data to determine lifecycle bounds for a VM.
// It prefers vm_hourly_stats + vm_lifecycle_cache and falls back to per-snapshot table probes.
func FetchVmLifecycle(ctx context.Context, dbConn *sqlx.DB, vmID, vmUUID, name string) (VmLifecycle, error) {
lifecycle, _, err := FetchVmLifecycleWithDiagnostics(ctx, dbConn, vmID, vmUUID, name)
return lifecycle, err
}
// FetchVmLifecycleWithDiagnostics returns lifecycle details plus source-level diagnostics.
func FetchVmLifecycleWithDiagnostics(ctx context.Context, dbConn *sqlx.DB, vmID, vmUUID, name string) (VmLifecycle, VmLifecycleDiagnostics, error) {
lookupField, lookupValue := vmLookupDescriptor(vmID, vmUUID, name)
diagnostics := VmLifecycleDiagnostics{
LookupField: lookupField,
LookupValue: lookupValue,
HourlyCache: VmLifecycleSourceDiagnostics{
Source: "vm_hourly_stats",
},
LifecycleCache: VmLifecycleSourceDiagnostics{
Source: "vm_lifecycle_cache",
},
SnapshotFallback: VmLifecycleSourceDiagnostics{
Source: "snapshot_registry/hourly_tables",
},
}
if TableExists(ctx, dbConn, "vm_hourly_stats") {
lifecycle, found, err := fetchVmLifecycleFromHourlyCache(ctx, dbConn, vmID, vmUUID, name)
lifecycle, sourceDiag, found, err := fetchVmLifecycleFromHourlyCache(ctx, dbConn, vmID, vmUUID, name)
diagnostics.HourlyCache = sourceDiag
if err != nil {
diagnostics.HourlyCache.Error = err.Error()
slog.Warn("vm lifecycle cache query failed; falling back to hourly tables", "error", err)
} else if found {
diagnostics.HourlyCache.Used = true
if TableExists(ctx, dbConn, "vm_lifecycle_cache") {
cached, cachedFound, cacheErr := fetchVmLifecycleFromLifecycleCache(ctx, dbConn, vmID, vmUUID, name)
cached, cachedDiag, cachedFound, cacheErr := fetchVmLifecycleFromLifecycleCache(ctx, dbConn, vmID, vmUUID, name)
diagnostics.LifecycleCache = cachedDiag
if cacheErr != nil {
diagnostics.LifecycleCache.Error = cacheErr.Error()
slog.Warn("vm lifecycle cache lookup failed", "error", cacheErr)
} else if cachedFound {
diagnostics.LifecycleCache.Used = true
lifecycle = mergeVmLifecycle(lifecycle, cached)
}
}
return lifecycle, nil
diagnostics.FinalLifecycle = lifecycle
return lifecycle, diagnostics, nil
}
}
return fetchVmLifecycleFromSnapshotTables(ctx, dbConn, vmID, vmUUID, name)
lifecycle, fallbackDiag, err := fetchVmLifecycleFromSnapshotTables(ctx, dbConn, vmID, vmUUID, name)
diagnostics.SnapshotFallback = fallbackDiag
if err != nil {
diagnostics.SnapshotFallback.Error = err.Error()
return lifecycle, diagnostics, err
}
diagnostics.SnapshotFallback.Used = true
diagnostics.FinalLifecycle = lifecycle
return lifecycle, diagnostics, nil
}
func fetchVmLifecycleFromHourlyCache(ctx context.Context, dbConn *sqlx.DB, vmID, vmUUID, name string) (VmLifecycle, bool, error) {
func fetchVmLifecycleFromHourlyCache(ctx context.Context, dbConn *sqlx.DB, vmID, vmUUID, name string) (VmLifecycle, VmLifecycleSourceDiagnostics, bool, error) {
diag := VmLifecycleSourceDiagnostics{Source: "vm_hourly_stats"}
matchWhere, args, ok := vmLookupPredicate(vmID, vmUUID, name)
if !ok {
return VmLifecycle{}, false, nil
return VmLifecycle{}, diag, false, nil
}
var row struct {
Rows int64 `db:"rows"`
Creation sql.NullInt64 `db:"creation_time"`
FirstSeen sql.NullInt64 `db:"first_seen"`
LastSeen sql.NullInt64 `db:"last_seen"`
Deletion sql.NullInt64 `db:"deletion_time"`
Rows int64 `db:"rows"`
Creation sql.NullInt64 `db:"creation_time"`
FirstSeen sql.NullInt64 `db:"first_seen"`
LastSeen sql.NullInt64 `db:"last_seen"`
DeletionRows int64 `db:"deletion_rows"`
DeletionMin sql.NullInt64 `db:"deletion_min"`
DeletionMax sql.NullInt64 `db:"deletion_max"`
}
query := `
SELECT
@@ -1929,16 +2011,29 @@ SELECT
MIN(NULLIF("CreationTime",0)) AS creation_time,
MIN("SnapshotTime") AS first_seen,
MAX("SnapshotTime") AS last_seen,
MIN(NULLIF("DeletionTime",0)) AS deletion_time
SUM(CASE WHEN COALESCE("DeletionTime",0) > 0 THEN 1 ELSE 0 END) AS deletion_rows,
MIN(NULLIF("DeletionTime",0)) AS deletion_min,
MAX(NULLIF("DeletionTime",0)) AS deletion_max
FROM vm_hourly_stats
WHERE ` + matchWhere + `
`
query = dbConn.Rebind(query)
if err := getLog(ctx, dbConn, &row, query, args...); err != nil {
return VmLifecycle{}, false, err
diag.Error = err.Error()
return VmLifecycle{}, diag, false, err
}
diag.MatchedRows = row.Rows
diag.FirstSeen = row.FirstSeen.Int64
diag.LastSeen = row.LastSeen.Int64
diag.DeletionRows = row.DeletionRows
if row.DeletionMin.Valid {
diag.DeletionMin = row.DeletionMin.Int64
}
if row.DeletionMax.Valid {
diag.DeletionMax = row.DeletionMax.Int64
}
if row.Rows == 0 {
return VmLifecycle{}, false, nil
return VmLifecycle{}, diag, false, nil
}
lifecycle := VmLifecycle{
FirstSeen: row.FirstSeen.Int64,
@@ -1951,38 +2046,70 @@ WHERE ` + matchWhere + `
lifecycle.CreationTime = row.FirstSeen.Int64
lifecycle.CreationApprox = true
}
if row.Deletion.Valid && row.Deletion.Int64 > 0 {
lifecycle.DeletionTime = row.Deletion.Int64
diag.CreationTime = lifecycle.CreationTime
diag.CreationApprox = lifecycle.CreationApprox
if row.DeletionMax.Valid && row.DeletionMax.Int64 > 0 {
if row.LastSeen.Valid && row.LastSeen.Int64 > 0 && row.DeletionMax.Int64 <= row.LastSeen.Int64 {
diag.StaleDeletionIgnored = true
slog.Warn("ignoring stale VM deletion from hourly cache",
"vm_id", vmID,
"vm_uuid", vmUUID,
"name", name,
"deletion_rows", row.DeletionRows,
"deletion_min", row.DeletionMin.Int64,
"deletion_max", row.DeletionMax.Int64,
"last_seen", row.LastSeen.Int64,
)
} else {
lifecycle.DeletionTime = row.DeletionMax.Int64
}
}
return lifecycle, true, nil
diag.SelectedDeletionTime = lifecycle.DeletionTime
return lifecycle, diag, true, nil
}
func fetchVmLifecycleFromLifecycleCache(ctx context.Context, dbConn *sqlx.DB, vmID, vmUUID, name string) (VmLifecycle, bool, error) {
func fetchVmLifecycleFromLifecycleCache(ctx context.Context, dbConn *sqlx.DB, vmID, vmUUID, name string) (VmLifecycle, VmLifecycleSourceDiagnostics, bool, error) {
diag := VmLifecycleSourceDiagnostics{Source: "vm_lifecycle_cache"}
matchWhere, args, ok := vmLookupPredicate(vmID, vmUUID, name)
if !ok {
return VmLifecycle{}, false, nil
return VmLifecycle{}, diag, false, nil
}
var row struct {
Rows int64 `db:"rows"`
FirstSeen sql.NullInt64 `db:"first_seen"`
LastSeen sql.NullInt64 `db:"last_seen"`
Deletion sql.NullInt64 `db:"deletion_time"`
Rows int64 `db:"rows"`
FirstSeen sql.NullInt64 `db:"first_seen"`
LastSeen sql.NullInt64 `db:"last_seen"`
DeletionRows int64 `db:"deletion_rows"`
DeletionMin sql.NullInt64 `db:"deletion_min"`
DeletionMax sql.NullInt64 `db:"deletion_max"`
}
query := `
SELECT
COUNT(1) AS rows,
MIN(NULLIF("FirstSeen",0)) AS first_seen,
MAX(NULLIF("LastSeen",0)) AS last_seen,
MIN(NULLIF("DeletedAt",0)) AS deletion_time
SUM(CASE WHEN COALESCE("DeletedAt",0) > 0 THEN 1 ELSE 0 END) AS deletion_rows,
MIN(NULLIF("DeletedAt",0)) AS deletion_min,
MAX(NULLIF("DeletedAt",0)) AS deletion_max
FROM vm_lifecycle_cache
WHERE ` + matchWhere + `
`
query = dbConn.Rebind(query)
if err := getLog(ctx, dbConn, &row, query, args...); err != nil {
return VmLifecycle{}, false, err
diag.Error = err.Error()
return VmLifecycle{}, diag, false, err
}
diag.MatchedRows = row.Rows
diag.FirstSeen = row.FirstSeen.Int64
diag.LastSeen = row.LastSeen.Int64
diag.DeletionRows = row.DeletionRows
if row.DeletionMin.Valid {
diag.DeletionMin = row.DeletionMin.Int64
}
if row.DeletionMax.Valid {
diag.DeletionMax = row.DeletionMax.Int64
}
if row.Rows == 0 {
return VmLifecycle{}, false, nil
return VmLifecycle{}, diag, false, nil
}
lifecycle := VmLifecycle{
FirstSeen: row.FirstSeen.Int64,
@@ -1992,10 +2119,26 @@ WHERE ` + matchWhere + `
lifecycle.CreationTime = row.FirstSeen.Int64
lifecycle.CreationApprox = true
}
if row.Deletion.Valid && row.Deletion.Int64 > 0 {
lifecycle.DeletionTime = row.Deletion.Int64
diag.CreationTime = lifecycle.CreationTime
diag.CreationApprox = lifecycle.CreationApprox
if row.DeletionMax.Valid && row.DeletionMax.Int64 > 0 {
if row.LastSeen.Valid && row.LastSeen.Int64 > 0 && row.DeletionMax.Int64 <= row.LastSeen.Int64 {
diag.StaleDeletionIgnored = true
slog.Warn("ignoring stale VM deletion from lifecycle cache",
"vm_id", vmID,
"vm_uuid", vmUUID,
"name", name,
"deletion_rows", row.DeletionRows,
"deletion_min", row.DeletionMin.Int64,
"deletion_max", row.DeletionMax.Int64,
"last_seen", row.LastSeen.Int64,
)
} else {
lifecycle.DeletionTime = row.DeletionMax.Int64
}
}
return lifecycle, true, nil
diag.SelectedDeletionTime = lifecycle.DeletionTime
return lifecycle, diag, true, nil
}
func mergeVmLifecycle(base, overlay VmLifecycle) VmLifecycle {
@@ -2006,7 +2149,7 @@ func mergeVmLifecycle(base, overlay VmLifecycle) VmLifecycle {
if overlay.LastSeen > out.LastSeen {
out.LastSeen = overlay.LastSeen
}
if overlay.DeletionTime > 0 && (out.DeletionTime == 0 || overlay.DeletionTime < out.DeletionTime) {
if overlay.DeletionTime > 0 && (out.DeletionTime == 0 || overlay.DeletionTime > out.DeletionTime) {
out.DeletionTime = overlay.DeletionTime
}
if overlay.CreationTime > 0 {
@@ -2021,14 +2164,18 @@ func mergeVmLifecycle(base, overlay VmLifecycle) VmLifecycle {
out.CreationTime = out.FirstSeen
out.CreationApprox = true
}
if out.DeletionTime > 0 && out.LastSeen > 0 && out.DeletionTime <= out.LastSeen {
out.DeletionTime = 0
}
return out
}
func fetchVmLifecycleFromSnapshotTables(ctx context.Context, dbConn *sqlx.DB, vmID, vmUUID, name string) (VmLifecycle, error) {
func fetchVmLifecycleFromSnapshotTables(ctx context.Context, dbConn *sqlx.DB, vmID, vmUUID, name string) (VmLifecycle, VmLifecycleSourceDiagnostics, error) {
var lifecycle VmLifecycle
diag := VmLifecycleSourceDiagnostics{Source: "snapshot_registry/hourly_tables"}
matchWhere, args, ok := vmLookupPredicate(vmID, vmUUID, name)
if !ok {
return lifecycle, nil
return lifecycle, diag, nil
}
var tables []struct {
TableName string `db:"table_name"`
@@ -2040,11 +2187,14 @@ FROM snapshot_registry
WHERE snapshot_type = 'hourly'
ORDER BY snapshot_time
`); err != nil {
return lifecycle, err
diag.Error = err.Error()
return lifecycle, diag, err
}
minCreation := int64(0)
consecutiveMissing := 0
firstMissingAfterLastSeen := int64(0)
matchedRows := int64(0)
for _, t := range tables {
if err := ValidateTableName(t.TableName); err != nil {
continue
@@ -2063,11 +2213,18 @@ WHERE %s
continue
}
if probe.Cnt > 0 {
matchedRows++
if lifecycle.FirstSeen == 0 {
lifecycle.FirstSeen = t.SnapshotTime
}
if lifecycle.DeletionTime > 0 && t.SnapshotTime > lifecycle.DeletionTime {
diag.StaleDeletionIgnored = true
slog.Warn("ignoring stale VM deletion from snapshot-table fallback", "vm_id", vmID, "vm_uuid", vmUUID, "name", name, "deletion_time", lifecycle.DeletionTime, "seen_again_at", t.SnapshotTime)
lifecycle.DeletionTime = 0
}
lifecycle.LastSeen = t.SnapshotTime
consecutiveMissing = 0
firstMissingAfterLastSeen = 0
if probe.MinCreation.Valid {
if minCreation == 0 || probe.MinCreation.Int64 < minCreation {
minCreation = probe.MinCreation.Int64
@@ -2075,12 +2232,15 @@ WHERE %s
}
} else if lifecycle.LastSeen > 0 && lifecycle.DeletionTime == 0 && t.SnapshotTime > lifecycle.LastSeen {
consecutiveMissing++
if firstMissingAfterLastSeen == 0 {
firstMissingAfterLastSeen = t.SnapshotTime
}
if consecutiveMissing >= 2 {
lifecycle.DeletionTime = t.SnapshotTime
break
lifecycle.DeletionTime = firstMissingAfterLastSeen
}
} else {
consecutiveMissing = 0
firstMissingAfterLastSeen = 0
}
}
if minCreation > 0 {
@@ -2090,7 +2250,13 @@ WHERE %s
lifecycle.CreationTime = lifecycle.FirstSeen
lifecycle.CreationApprox = true
}
return lifecycle, nil
diag.MatchedRows = matchedRows
diag.FirstSeen = lifecycle.FirstSeen
diag.LastSeen = lifecycle.LastSeen
diag.CreationTime = lifecycle.CreationTime
diag.CreationApprox = lifecycle.CreationApprox
diag.SelectedDeletionTime = lifecycle.DeletionTime
return lifecycle, diag, nil
}
// SyncVcenterTotalsFromSnapshots backfills vcenter_totals using hourly snapshot tables in snapshot_registry.