improve logging and concurrent vcenter inventory
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
2026-01-21 10:25:04 +11:00
parent 00805513c9
commit 2483091861
5 changed files with 115 additions and 55 deletions

View File

@@ -23,6 +23,8 @@ import (
"github.com/vmware/govmomi/vim25/types"
)
type ctxLoggerKey struct{}
// RunVcenterSnapshotHourly records hourly inventory snapshots into a daily table.
// If force is true, any in-progress marker will be cleared before starting (useful for manual recovery).
func (c *CronTask) RunVcenterSnapshotHourly(ctx context.Context, logger *slog.Logger, force bool) (err error) {
@@ -701,13 +703,15 @@ func snapshotFromInventory(inv queries.Inventory, snapshotTime time.Time) Invent
}
func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTime time.Time, tableName string, url string) error {
log := c.Logger.With("vcenter", url)
ctx = context.WithValue(ctx, ctxLoggerKey{}, log)
started := time.Now()
c.Logger.Debug("connecting to vcenter for hourly snapshot", "url", url)
log.Debug("connecting to vcenter for hourly snapshot", "url", url)
vc := vcenter.New(c.Logger, c.VcCreds)
if err := vc.Login(url); err != nil {
metrics.RecordVcenterSnapshot(url, time.Since(started), 0, err)
if upErr := db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, false, err.Error()); upErr != nil {
c.Logger.Warn("failed to record snapshot run", "url", url, "error", upErr)
log.Warn("failed to record snapshot run", "url", url, "error", upErr)
}
return fmt.Errorf("unable to connect to vcenter: %w", err)
}
@@ -715,9 +719,9 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
logCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
defer cancel()
if err := vc.Logout(logCtx); err != nil {
c.Logger.Warn("vcenter logout failed", "url", url, "error", err)
log.Warn("vcenter logout failed", "url", url, "error", err)
} else {
c.Logger.Debug("vcenter logout succeeded", "url", url)
log.Debug("vcenter logout succeeded", "url", url)
}
}()
@@ -725,34 +729,34 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
if err != nil {
metrics.RecordVcenterSnapshot(url, time.Since(started), 0, err)
if upErr := db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, false, err.Error()); upErr != nil {
c.Logger.Warn("failed to record snapshot run", "url", url, "error", upErr)
log.Warn("failed to record snapshot run", "url", url, "error", upErr)
}
return fmt.Errorf("unable to get VMs from vcenter: %w", err)
}
c.Logger.Debug("retrieved VMs from vcenter", "url", url, "vm_count", len(vcVms))
log.Debug("retrieved VMs from vcenter", "url", url, "vm_count", len(vcVms))
if err := db.EnsureVmIdentityTables(ctx, c.Database.DB()); err != nil {
c.Logger.Warn("failed to ensure vm identity tables", "error", err)
log.Warn("failed to ensure vm identity tables", "error", err)
}
hostLookup, err := vc.BuildHostLookup()
if err != nil {
c.Logger.Warn("failed to build host lookup", "url", url, "error", err)
log.Warn("failed to build host lookup", "url", url, "error", err)
hostLookup = nil
} else {
c.Logger.Debug("built host lookup", "url", url, "hosts", len(hostLookup))
log.Debug("built host lookup", "url", url, "hosts", len(hostLookup))
}
folderLookup, err := vc.BuildFolderPathLookup()
if err != nil {
c.Logger.Warn("failed to build folder lookup", "url", url, "error", err)
log.Warn("failed to build folder lookup", "url", url, "error", err)
folderLookup = nil
} else {
c.Logger.Debug("built folder lookup", "url", url, "folders", len(folderLookup))
log.Debug("built folder lookup", "url", url, "folders", len(folderLookup))
}
rpLookup, err := vc.BuildResourcePoolLookup()
if err != nil {
c.Logger.Warn("failed to build resource pool lookup", "url", url, "error", err)
log.Warn("failed to build resource pool lookup", "url", url, "error", err)
rpLookup = nil
} else {
c.Logger.Debug("built resource pool lookup", "url", url, "pools", len(rpLookup))
log.Debug("built resource pool lookup", "url", url, "pools", len(rpLookup))
}
inventoryRows, err := c.Database.Queries().GetInventoryByVcenter(ctx, url)
@@ -844,7 +848,7 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
for _, row := range presentSnapshots {
batch = append(batch, row)
}
c.Logger.Debug("checking inventory for missing VMs", "vcenter", url)
log.Debug("checking inventory for missing VMs")
missingCount := 0
newCount := 0
@@ -891,9 +895,9 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
VmId: inv.VmId,
DatacenterName: inv.Datacenter,
}); err != nil {
c.Logger.Warn("failed to mark inventory record deleted", "error", err, "vm_id", row.VmId.String)
log.Warn("failed to mark inventory record deleted", "error", err, "vm_id", row.VmId.String)
}
c.Logger.Debug("Marked VM as deleted", "name", inv.Name, "vm_id", inv.VmId.String, "vm_uuid", inv.VmUuid.String, "vcenter", url, "snapshot_time", startTime)
log.Debug("Marked VM as deleted", "name", inv.Name, "vm_id", inv.VmId.String, "vm_uuid", inv.VmUuid.String, "vcenter", url, "snapshot_time", startTime)
deletionsMarked = true
}
clusterName := ""
@@ -909,10 +913,10 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
datacenter: inv.Datacenter,
})
if err := db.MarkVmDeletedWithDetails(ctx, dbConn, url, inv.VmId.String, inv.VmUuid.String, inv.Name, clusterName, startTime.Unix()); err != nil {
c.Logger.Warn("failed to mark vm deleted in lifecycle cache", "vcenter", url, "vm_id", inv.VmId, "vm_uuid", inv.VmUuid, "error", err)
log.Warn("failed to mark vm deleted in lifecycle cache", "vcenter", url, "vm_id", inv.VmId, "vm_uuid", inv.VmUuid, "error", err)
}
if err := db.UpsertVmLifecycleCache(ctx, dbConn, url, inv.VmId.String, inv.VmUuid.String, inv.Name, clusterName, startTime); err != nil {
c.Logger.Warn("failed to upsert vm lifecycle cache (deletion path)", "vcenter", url, "vm_id", inv.VmId, "vm_uuid", inv.VmUuid, "name", inv.Name, "error", err)
log.Warn("failed to upsert vm lifecycle cache (deletion path)", "vcenter", url, "vm_id", inv.VmId, "vm_uuid", inv.VmUuid, "name", inv.Name, "error", err)
}
missingCount++
}
@@ -927,9 +931,9 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
end := startTime
events, err := vc.FindVmDeletionEvents(ctx, begin, end)
if err != nil {
c.Logger.Warn("failed to fetch vcenter deletion events", "vcenter", url, "error", err)
log.Warn("failed to fetch vcenter deletion events", "vcenter", url, "error", err)
} else {
c.Logger.Debug("fetched vcenter deletion events", "vcenter", url, "count", len(events), "window_start_local", begin, "window_end_local", end, "window_minutes", end.Sub(begin).Minutes(), "window_start_utc", begin.UTC(), "window_end_utc", end.UTC())
log.Debug("fetched vcenter deletion events", "vcenter", url, "count", len(events), "window_start_local", begin, "window_end_local", end, "window_minutes", end.Sub(begin).Minutes(), "window_start_utc", begin.UTC(), "window_end_utc", end.UTC())
for _, cand := range candidates {
if t, ok := events[cand.vmID]; ok {
delTs := sql.NullInt64{Int64: t.Unix(), Valid: true}
@@ -938,12 +942,12 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
VmId: sql.NullString{String: cand.vmID, Valid: cand.vmID != ""},
DatacenterName: cand.datacenter,
}); err != nil {
c.Logger.Warn("failed to update inventory deletion time from event", "vm_id", cand.vmID, "vm_uuid", cand.vmUUID, "vcenter", url, "error", err)
log.Warn("failed to update inventory deletion time from event", "vm_id", cand.vmID, "vm_uuid", cand.vmUUID, "vcenter", url, "error", err)
}
if err := db.MarkVmDeletedWithDetails(ctx, dbConn, url, cand.vmID, cand.vmUUID, cand.name, cand.cluster, t.Unix()); err != nil {
c.Logger.Warn("failed to refine lifecycle cache deletion time", "vm_id", cand.vmID, "vm_uuid", cand.vmUUID, "vcenter", url, "error", err)
log.Warn("failed to refine lifecycle cache deletion time", "vm_id", cand.vmID, "vm_uuid", cand.vmUUID, "vcenter", url, "error", err)
}
c.Logger.Info("refined deletion time from vcenter event", "vm_id", cand.vmID, "vm_uuid", cand.vmUUID, "name", cand.name, "vcenter", url, "event_time", t)
log.Info("refined deletion time from vcenter event", "vm_id", cand.vmID, "vm_uuid", cand.vmUUID, "name", cand.name, "vcenter", url, "event_time", t)
}
}
}
@@ -958,22 +962,22 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
end := startTime
events, err := vc.FindVmDeletionEvents(ctx, begin, end)
if err != nil {
c.Logger.Warn("count-drop: failed to fetch vcenter deletion events", "vcenter", url, "error", err, "prev_vm_count", prevVmCount.Int64, "current_vm_count", totals.VmCount)
log.Warn("count-drop: failed to fetch vcenter deletion events", "vcenter", url, "error", err, "prev_vm_count", prevVmCount.Int64, "current_vm_count", totals.VmCount)
} else {
c.Logger.Info("count-drop: deletion events fetched", "vcenter", url, "events", len(events), "prev_vm_count", prevVmCount.Int64, "current_vm_count", totals.VmCount, "window_start", begin, "window_end", end)
log.Info("count-drop: deletion events fetched", "vcenter", url, "events", len(events), "prev_vm_count", prevVmCount.Int64, "current_vm_count", totals.VmCount, "window_start", begin, "window_end", end)
}
}
c.Logger.Debug("inserting hourly snapshot batch", "vcenter", url, "rows", len(batch))
log.Debug("inserting hourly snapshot batch", "vcenter", url, "rows", len(batch))
if err := insertHourlyCache(ctx, dbConn, batch); err != nil {
c.Logger.Warn("failed to insert hourly cache rows", "vcenter", url, "error", err)
log.Warn("failed to insert hourly cache rows", "vcenter", url, "error", err)
}
if err := insertHourlyBatch(ctx, dbConn, tableName, batch); err != nil {
metrics.RecordVcenterSnapshot(url, time.Since(started), totals.VmCount, err)
if upErr := db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, false, err.Error()); upErr != nil {
c.Logger.Warn("failed to record snapshot run", "url", url, "error", upErr)
log.Warn("failed to record snapshot run", "url", url, "error", upErr)
}
return err
}
@@ -982,6 +986,7 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
slog.Warn("failed to insert vcenter totals", "vcenter", url, "snapshot_time", startTime.Unix(), "error", err)
}
// Discover previous snapshots once per run (serial) to avoid concurrent probes across vCenters.
prevTableName, newCount, missingCount = c.compareWithPreviousSnapshot(ctx, dbConn, url, startTime, presentSnapshots, presentByUuid, presentByName, inventoryByVmID, inventoryByUuid, inventoryByName, missingCount)
// If VM count dropped versus totals and we still haven't marked missing, try another comparison + wider event window.
@@ -1056,11 +1061,11 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
}
// Backfill lifecycle deletions for VMs missing from inventory and without DeletedAt.
if err := backfillLifecycleDeletionsToday(ctx, c.Logger, dbConn, url, startTime, presentSnapshots); err != nil {
c.Logger.Warn("failed to backfill lifecycle deletions for today", "vcenter", url, "error", err)
if err := backfillLifecycleDeletionsToday(ctx, log, dbConn, url, startTime, presentSnapshots); err != nil {
log.Warn("failed to backfill lifecycle deletions for today", "vcenter", url, "error", err)
}
c.Logger.Info("Hourly snapshot summary",
log.Info("Hourly snapshot summary",
"vcenter", url,
"vm_count", totals.VmCount,
"vcpu_total", totals.VcpuTotal,
@@ -1072,13 +1077,13 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
)
metrics.RecordVcenterSnapshot(url, time.Since(started), totals.VmCount, nil)
if upErr := db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, true, ""); upErr != nil {
c.Logger.Warn("failed to record snapshot run", "url", url, "error", upErr)
log.Warn("failed to record snapshot run", "url", url, "error", upErr)
}
if deletionsMarked {
if err := c.generateReport(ctx, tableName); err != nil {
c.Logger.Warn("failed to regenerate hourly report after deletions", "error", err, "table", tableName)
log.Warn("failed to regenerate hourly report after deletions", "error", err, "table", tableName)
} else {
c.Logger.Debug("Regenerated hourly report after deletions", "table", tableName)
log.Debug("Regenerated hourly report after deletions", "table", tableName)
}
}
return nil