code re-org and bugfix for hanging hourly snapshot
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
2026-01-21 09:12:25 +11:00
parent c7c7fd3dc9
commit fd9cc185ce
6 changed files with 113 additions and 56 deletions

View File

@@ -24,7 +24,8 @@ import (
)
// RunVcenterSnapshotHourly records hourly inventory snapshots into a daily table.
func (c *CronTask) RunVcenterSnapshotHourly(ctx context.Context, logger *slog.Logger) (err error) {
// If force is true, any in-progress marker will be cleared before starting (useful for manual recovery).
func (c *CronTask) RunVcenterSnapshotHourly(ctx context.Context, logger *slog.Logger, force bool) (err error) {
jobCtx := ctx
jobTimeout := durationFromSeconds(c.Settings.Values.Settings.HourlyJobTimeoutSeconds, 20*time.Minute)
if jobTimeout > 0 {
@@ -40,6 +41,13 @@ func (c *CronTask) RunVcenterSnapshotHourly(ctx context.Context, logger *slog.Lo
if err := tracker.ClearStale(staleCtx, "hourly_snapshot", jobTimeout); err != nil {
logger.Warn("failed to clear stale cron status", "error", err)
}
if force {
if err := tracker.ClearAllInProgress(staleCtx); err != nil {
logger.Warn("failed to clear in-progress flag (force run)", "error", err)
} else {
logger.Info("force run cleared in-progress marker before starting")
}
}
startedAt := time.Now()
defer func() {
@@ -51,8 +59,24 @@ func (c *CronTask) RunVcenterSnapshotHourly(ctx context.Context, logger *slog.Lo
return err
}
if skip {
logger.Warn("Hourly snapshot skipped because a previous run is still active")
return nil
if force {
logger.Info("Force run requested; clearing in-progress marker and retrying")
if err := tracker.ClearAllInProgress(jobCtx); err != nil {
logger.Warn("failed to clear in-progress flag for force run", "error", err)
return nil
}
done, skip, err = tracker.Start(jobCtx, "hourly_snapshot")
if err != nil {
return err
}
if skip {
logger.Warn("Hourly snapshot still marked active after force clear; skipping")
return nil
}
} else {
logger.Warn("Hourly snapshot skipped because a previous run is still active", "force", force)
return nil
}
}
defer func() { done(err) }()
@@ -824,6 +848,7 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
missingCount := 0
newCount := 0
prevTableName := ""
for _, inv := range inventoryRows {
c.Logger.Debug("checking inventory for deletions", "vm_id", inv.VmId.String, "vm_uuid", inv.VmUuid.String, "name", inv.Name)
@@ -957,49 +982,7 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
slog.Warn("failed to insert vcenter totals", "vcenter", url, "snapshot_time", startTime.Unix(), "error", err)
}
// Compare with previous snapshot for this vcenter to mark deletions at snapshot time.
prevTableName, prevTableErr := latestHourlySnapshotBefore(ctx, dbConn, startTime)
if prevTableErr != nil {
c.Logger.Warn("failed to locate previous hourly snapshot for deletion comparison", "error", prevTableErr, "url", url)
}
prevSnapshotTime := int64(0)
if prevTableName != "" {
if suffix := strings.TrimPrefix(prevTableName, "inventory_hourly_"); suffix != prevTableName {
if ts, err := strconv.ParseInt(suffix, 10, 64); err == nil {
prevSnapshotTime = ts
}
}
}
if prevTableName != "" {
moreMissing := c.markMissingFromPrevious(ctx, dbConn, prevTableName, url, startTime, presentSnapshots, presentByUuid, presentByName, inventoryByVmID, inventoryByUuid, inventoryByName)
missingCount += moreMissing
// Guard against gaps: if previous snapshot is much older than expected, skip "new" detection to avoid false positives when an hourly run was missed.
expectedSeconds := int64(durationFromSeconds(c.Settings.Values.Settings.VcenterInventorySnapshotSeconds, time.Hour).Seconds())
if HasSnapshotGap(prevSnapshotTime, startTime.Unix(), expectedSeconds) {
c.Logger.Info("skipping new-VM detection due to gap between snapshots", "prev_table", prevTableName, "prev_snapshot_unix", prevSnapshotTime, "current_snapshot_unix", startTime.Unix())
} else {
newCount = countNewFromPrevious(ctx, dbConn, prevTableName, url, presentSnapshots)
if newCount > 0 {
newRows := listNewFromPrevious(ctx, dbConn, prevTableName, url, presentSnapshots)
names := make([]string, 0, len(newRows))
for _, r := range newRows {
if r.Name != "" {
names = append(names, r.Name)
} else if r.VmId.Valid {
names = append(names, r.VmId.String)
}
}
c.Logger.Info("new VMs since previous snapshot", "prev_table", prevTableName, "count", newCount, "names", names)
}
}
c.Logger.Debug("compared with previous snapshot", "prev_table", prevTableName, "new_since_prev", newCount, "missing_since_prev", missingCount)
} else {
// No previous snapshot found (or lookup failed).
newCount = len(presentSnapshots)
}
prevTableName, newCount, missingCount = c.compareWithPreviousSnapshot(ctx, dbConn, url, startTime, presentSnapshots, presentByUuid, presentByName, inventoryByVmID, inventoryByUuid, inventoryByName, missingCount)
// If VM count dropped versus totals and we still haven't marked missing, try another comparison + wider event window.
if missingCount == 0 && prevVmCount.Valid && prevVmCount.Int64 > int64(totals.VmCount) {
@@ -1100,3 +1083,63 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
}
return nil
}
// compareWithPreviousSnapshot compares the current hourly snapshot with the most
// recent earlier snapshot table, as located by latestHourlySnapshotBefore.
//
// When a previous table name is available, the value returned by
// markMissingFromPrevious is added to missingCount. New-VM detection then runs
// via countNewFromPrevious unless HasSnapshotGap reports a gap between the two
// snapshots, in which case the new count stays 0. When no previous table name
// is available, every entry in presentSnapshots is counted as new and
// missingCount is passed through unchanged.
//
// It returns, in order: the previous table name (possibly empty), the new-VM
// count, and the updated missing count.
func (c *CronTask) compareWithPreviousSnapshot(
	ctx context.Context,
	dbConn *sqlx.DB,
	url string,
	startTime time.Time,
	presentSnapshots map[string]InventorySnapshotRow,
	presentByUuid map[string]struct{},
	presentByName map[string]struct{},
	inventoryByVmID map[string]queries.Inventory,
	inventoryByUuid map[string]queries.Inventory,
	inventoryByName map[string]queries.Inventory,
	missingCount int,
) (string, int, int) {
	const hourlyTablePrefix = "inventory_hourly_"

	prevTableName, lookupErr := latestHourlySnapshotBefore(ctx, dbConn, startTime)
	if lookupErr != nil {
		// A failed lookup is only logged; the empty-name guard below decides the outcome.
		c.Logger.Warn("failed to locate previous hourly snapshot for deletion comparison", "error", lookupErr, "url", url)
	}

	// Nothing to compare against: treat everything currently present as new.
	if prevTableName == "" {
		return prevTableName, len(presentSnapshots), missingCount
	}

	// The previous snapshot's unix time is taken from the table-name suffix; it
	// stays 0 when the prefix is absent or the suffix is not a base-10 integer.
	var prevSnapshotTime int64
	if strings.HasPrefix(prevTableName, hourlyTablePrefix) {
		parsed, parseErr := strconv.ParseInt(strings.TrimPrefix(prevTableName, hourlyTablePrefix), 10, 64)
		if parseErr == nil {
			prevSnapshotTime = parsed
		}
	}

	missingCount += c.markMissingFromPrevious(ctx, dbConn, prevTableName, url, startTime, presentSnapshots, presentByUuid, presentByName, inventoryByVmID, inventoryByUuid, inventoryByName)

	// HasSnapshotGap is handed half of the configured snapshot interval (default
	// one hour); when it reports a gap, new-VM detection is skipped.
	intervalSeconds := int64(durationFromSeconds(c.Settings.Values.Settings.VcenterInventorySnapshotSeconds, time.Hour).Seconds())
	currentUnix := startTime.Unix()

	newCount := 0
	if !HasSnapshotGap(prevSnapshotTime, currentUnix, intervalSeconds/2) {
		newCount = countNewFromPrevious(ctx, dbConn, prevTableName, url, presentSnapshots)
		if newCount > 0 {
			// Fetch the rows only when there is something to report, and label
			// each by name, falling back to its VM id when the name is empty.
			newRows := listNewFromPrevious(ctx, dbConn, prevTableName, url, presentSnapshots)
			names := make([]string, 0, len(newRows))
			for _, row := range newRows {
				switch {
				case row.Name != "":
					names = append(names, row.Name)
				case row.VmId.Valid:
					names = append(names, row.VmId.String)
				}
			}
			c.Logger.Info("new VMs since previous snapshot", "prev_table", prevTableName, "count", newCount, "names", names)
		}
	} else {
		c.Logger.Info("skipping new-VM detection due to gap between snapshots", "prev_table", prevTableName, "prev_snapshot_unix", prevSnapshotTime, "current_snapshot_unix", startTime.Unix())
	}

	c.Logger.Debug("compared with previous snapshot", "prev_table", prevTableName, "new_since_prev", newCount, "missing_since_prev", missingCount)
	return prevTableName, newCount, missingCount
}