improve logging and concurrent vcenter inventory
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
This commit is contained in:
@@ -23,6 +23,8 @@ import (
|
||||
"github.com/vmware/govmomi/vim25/types"
|
||||
)
|
||||
|
||||
type ctxLoggerKey struct{}
|
||||
|
||||
// RunVcenterSnapshotHourly records hourly inventory snapshots into a daily table.
|
||||
// If force is true, any in-progress marker will be cleared before starting (useful for manual recovery).
|
||||
func (c *CronTask) RunVcenterSnapshotHourly(ctx context.Context, logger *slog.Logger, force bool) (err error) {
|
||||
@@ -701,13 +703,15 @@ func snapshotFromInventory(inv queries.Inventory, snapshotTime time.Time) Invent
|
||||
}
|
||||
|
||||
func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTime time.Time, tableName string, url string) error {
|
||||
log := c.Logger.With("vcenter", url)
|
||||
ctx = context.WithValue(ctx, ctxLoggerKey{}, log)
|
||||
started := time.Now()
|
||||
c.Logger.Debug("connecting to vcenter for hourly snapshot", "url", url)
|
||||
log.Debug("connecting to vcenter for hourly snapshot", "url", url)
|
||||
vc := vcenter.New(c.Logger, c.VcCreds)
|
||||
if err := vc.Login(url); err != nil {
|
||||
metrics.RecordVcenterSnapshot(url, time.Since(started), 0, err)
|
||||
if upErr := db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, false, err.Error()); upErr != nil {
|
||||
c.Logger.Warn("failed to record snapshot run", "url", url, "error", upErr)
|
||||
log.Warn("failed to record snapshot run", "url", url, "error", upErr)
|
||||
}
|
||||
return fmt.Errorf("unable to connect to vcenter: %w", err)
|
||||
}
|
||||
@@ -715,9 +719,9 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
logCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
if err := vc.Logout(logCtx); err != nil {
|
||||
c.Logger.Warn("vcenter logout failed", "url", url, "error", err)
|
||||
log.Warn("vcenter logout failed", "url", url, "error", err)
|
||||
} else {
|
||||
c.Logger.Debug("vcenter logout succeeded", "url", url)
|
||||
log.Debug("vcenter logout succeeded", "url", url)
|
||||
}
|
||||
}()
|
||||
|
||||
@@ -725,34 +729,34 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
if err != nil {
|
||||
metrics.RecordVcenterSnapshot(url, time.Since(started), 0, err)
|
||||
if upErr := db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, false, err.Error()); upErr != nil {
|
||||
c.Logger.Warn("failed to record snapshot run", "url", url, "error", upErr)
|
||||
log.Warn("failed to record snapshot run", "url", url, "error", upErr)
|
||||
}
|
||||
return fmt.Errorf("unable to get VMs from vcenter: %w", err)
|
||||
}
|
||||
c.Logger.Debug("retrieved VMs from vcenter", "url", url, "vm_count", len(vcVms))
|
||||
log.Debug("retrieved VMs from vcenter", "url", url, "vm_count", len(vcVms))
|
||||
if err := db.EnsureVmIdentityTables(ctx, c.Database.DB()); err != nil {
|
||||
c.Logger.Warn("failed to ensure vm identity tables", "error", err)
|
||||
log.Warn("failed to ensure vm identity tables", "error", err)
|
||||
}
|
||||
hostLookup, err := vc.BuildHostLookup()
|
||||
if err != nil {
|
||||
c.Logger.Warn("failed to build host lookup", "url", url, "error", err)
|
||||
log.Warn("failed to build host lookup", "url", url, "error", err)
|
||||
hostLookup = nil
|
||||
} else {
|
||||
c.Logger.Debug("built host lookup", "url", url, "hosts", len(hostLookup))
|
||||
log.Debug("built host lookup", "url", url, "hosts", len(hostLookup))
|
||||
}
|
||||
folderLookup, err := vc.BuildFolderPathLookup()
|
||||
if err != nil {
|
||||
c.Logger.Warn("failed to build folder lookup", "url", url, "error", err)
|
||||
log.Warn("failed to build folder lookup", "url", url, "error", err)
|
||||
folderLookup = nil
|
||||
} else {
|
||||
c.Logger.Debug("built folder lookup", "url", url, "folders", len(folderLookup))
|
||||
log.Debug("built folder lookup", "url", url, "folders", len(folderLookup))
|
||||
}
|
||||
rpLookup, err := vc.BuildResourcePoolLookup()
|
||||
if err != nil {
|
||||
c.Logger.Warn("failed to build resource pool lookup", "url", url, "error", err)
|
||||
log.Warn("failed to build resource pool lookup", "url", url, "error", err)
|
||||
rpLookup = nil
|
||||
} else {
|
||||
c.Logger.Debug("built resource pool lookup", "url", url, "pools", len(rpLookup))
|
||||
log.Debug("built resource pool lookup", "url", url, "pools", len(rpLookup))
|
||||
}
|
||||
|
||||
inventoryRows, err := c.Database.Queries().GetInventoryByVcenter(ctx, url)
|
||||
@@ -844,7 +848,7 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
for _, row := range presentSnapshots {
|
||||
batch = append(batch, row)
|
||||
}
|
||||
c.Logger.Debug("checking inventory for missing VMs", "vcenter", url)
|
||||
log.Debug("checking inventory for missing VMs")
|
||||
|
||||
missingCount := 0
|
||||
newCount := 0
|
||||
@@ -891,9 +895,9 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
VmId: inv.VmId,
|
||||
DatacenterName: inv.Datacenter,
|
||||
}); err != nil {
|
||||
c.Logger.Warn("failed to mark inventory record deleted", "error", err, "vm_id", row.VmId.String)
|
||||
log.Warn("failed to mark inventory record deleted", "error", err, "vm_id", row.VmId.String)
|
||||
}
|
||||
c.Logger.Debug("Marked VM as deleted", "name", inv.Name, "vm_id", inv.VmId.String, "vm_uuid", inv.VmUuid.String, "vcenter", url, "snapshot_time", startTime)
|
||||
log.Debug("Marked VM as deleted", "name", inv.Name, "vm_id", inv.VmId.String, "vm_uuid", inv.VmUuid.String, "vcenter", url, "snapshot_time", startTime)
|
||||
deletionsMarked = true
|
||||
}
|
||||
clusterName := ""
|
||||
@@ -909,10 +913,10 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
datacenter: inv.Datacenter,
|
||||
})
|
||||
if err := db.MarkVmDeletedWithDetails(ctx, dbConn, url, inv.VmId.String, inv.VmUuid.String, inv.Name, clusterName, startTime.Unix()); err != nil {
|
||||
c.Logger.Warn("failed to mark vm deleted in lifecycle cache", "vcenter", url, "vm_id", inv.VmId, "vm_uuid", inv.VmUuid, "error", err)
|
||||
log.Warn("failed to mark vm deleted in lifecycle cache", "vcenter", url, "vm_id", inv.VmId, "vm_uuid", inv.VmUuid, "error", err)
|
||||
}
|
||||
if err := db.UpsertVmLifecycleCache(ctx, dbConn, url, inv.VmId.String, inv.VmUuid.String, inv.Name, clusterName, startTime); err != nil {
|
||||
c.Logger.Warn("failed to upsert vm lifecycle cache (deletion path)", "vcenter", url, "vm_id", inv.VmId, "vm_uuid", inv.VmUuid, "name", inv.Name, "error", err)
|
||||
log.Warn("failed to upsert vm lifecycle cache (deletion path)", "vcenter", url, "vm_id", inv.VmId, "vm_uuid", inv.VmUuid, "name", inv.Name, "error", err)
|
||||
}
|
||||
missingCount++
|
||||
}
|
||||
@@ -927,9 +931,9 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
end := startTime
|
||||
events, err := vc.FindVmDeletionEvents(ctx, begin, end)
|
||||
if err != nil {
|
||||
c.Logger.Warn("failed to fetch vcenter deletion events", "vcenter", url, "error", err)
|
||||
log.Warn("failed to fetch vcenter deletion events", "vcenter", url, "error", err)
|
||||
} else {
|
||||
c.Logger.Debug("fetched vcenter deletion events", "vcenter", url, "count", len(events), "window_start_local", begin, "window_end_local", end, "window_minutes", end.Sub(begin).Minutes(), "window_start_utc", begin.UTC(), "window_end_utc", end.UTC())
|
||||
log.Debug("fetched vcenter deletion events", "vcenter", url, "count", len(events), "window_start_local", begin, "window_end_local", end, "window_minutes", end.Sub(begin).Minutes(), "window_start_utc", begin.UTC(), "window_end_utc", end.UTC())
|
||||
for _, cand := range candidates {
|
||||
if t, ok := events[cand.vmID]; ok {
|
||||
delTs := sql.NullInt64{Int64: t.Unix(), Valid: true}
|
||||
@@ -938,12 +942,12 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
VmId: sql.NullString{String: cand.vmID, Valid: cand.vmID != ""},
|
||||
DatacenterName: cand.datacenter,
|
||||
}); err != nil {
|
||||
c.Logger.Warn("failed to update inventory deletion time from event", "vm_id", cand.vmID, "vm_uuid", cand.vmUUID, "vcenter", url, "error", err)
|
||||
log.Warn("failed to update inventory deletion time from event", "vm_id", cand.vmID, "vm_uuid", cand.vmUUID, "vcenter", url, "error", err)
|
||||
}
|
||||
if err := db.MarkVmDeletedWithDetails(ctx, dbConn, url, cand.vmID, cand.vmUUID, cand.name, cand.cluster, t.Unix()); err != nil {
|
||||
c.Logger.Warn("failed to refine lifecycle cache deletion time", "vm_id", cand.vmID, "vm_uuid", cand.vmUUID, "vcenter", url, "error", err)
|
||||
log.Warn("failed to refine lifecycle cache deletion time", "vm_id", cand.vmID, "vm_uuid", cand.vmUUID, "vcenter", url, "error", err)
|
||||
}
|
||||
c.Logger.Info("refined deletion time from vcenter event", "vm_id", cand.vmID, "vm_uuid", cand.vmUUID, "name", cand.name, "vcenter", url, "event_time", t)
|
||||
log.Info("refined deletion time from vcenter event", "vm_id", cand.vmID, "vm_uuid", cand.vmUUID, "name", cand.name, "vcenter", url, "event_time", t)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -958,22 +962,22 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
end := startTime
|
||||
events, err := vc.FindVmDeletionEvents(ctx, begin, end)
|
||||
if err != nil {
|
||||
c.Logger.Warn("count-drop: failed to fetch vcenter deletion events", "vcenter", url, "error", err, "prev_vm_count", prevVmCount.Int64, "current_vm_count", totals.VmCount)
|
||||
log.Warn("count-drop: failed to fetch vcenter deletion events", "vcenter", url, "error", err, "prev_vm_count", prevVmCount.Int64, "current_vm_count", totals.VmCount)
|
||||
} else {
|
||||
c.Logger.Info("count-drop: deletion events fetched", "vcenter", url, "events", len(events), "prev_vm_count", prevVmCount.Int64, "current_vm_count", totals.VmCount, "window_start", begin, "window_end", end)
|
||||
log.Info("count-drop: deletion events fetched", "vcenter", url, "events", len(events), "prev_vm_count", prevVmCount.Int64, "current_vm_count", totals.VmCount, "window_start", begin, "window_end", end)
|
||||
}
|
||||
}
|
||||
|
||||
c.Logger.Debug("inserting hourly snapshot batch", "vcenter", url, "rows", len(batch))
|
||||
log.Debug("inserting hourly snapshot batch", "vcenter", url, "rows", len(batch))
|
||||
|
||||
if err := insertHourlyCache(ctx, dbConn, batch); err != nil {
|
||||
c.Logger.Warn("failed to insert hourly cache rows", "vcenter", url, "error", err)
|
||||
log.Warn("failed to insert hourly cache rows", "vcenter", url, "error", err)
|
||||
}
|
||||
|
||||
if err := insertHourlyBatch(ctx, dbConn, tableName, batch); err != nil {
|
||||
metrics.RecordVcenterSnapshot(url, time.Since(started), totals.VmCount, err)
|
||||
if upErr := db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, false, err.Error()); upErr != nil {
|
||||
c.Logger.Warn("failed to record snapshot run", "url", url, "error", upErr)
|
||||
log.Warn("failed to record snapshot run", "url", url, "error", upErr)
|
||||
}
|
||||
return err
|
||||
}
|
||||
@@ -982,6 +986,7 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
slog.Warn("failed to insert vcenter totals", "vcenter", url, "snapshot_time", startTime.Unix(), "error", err)
|
||||
}
|
||||
|
||||
// Discover previous snapshots once per run (serial) to avoid concurrent probes across vCenters.
|
||||
prevTableName, newCount, missingCount = c.compareWithPreviousSnapshot(ctx, dbConn, url, startTime, presentSnapshots, presentByUuid, presentByName, inventoryByVmID, inventoryByUuid, inventoryByName, missingCount)
|
||||
|
||||
// If VM count dropped versus totals and we still haven't marked missing, try another comparison + wider event window.
|
||||
@@ -1056,11 +1061,11 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
}
|
||||
|
||||
// Backfill lifecycle deletions for VMs missing from inventory and without DeletedAt.
|
||||
if err := backfillLifecycleDeletionsToday(ctx, c.Logger, dbConn, url, startTime, presentSnapshots); err != nil {
|
||||
c.Logger.Warn("failed to backfill lifecycle deletions for today", "vcenter", url, "error", err)
|
||||
if err := backfillLifecycleDeletionsToday(ctx, log, dbConn, url, startTime, presentSnapshots); err != nil {
|
||||
log.Warn("failed to backfill lifecycle deletions for today", "vcenter", url, "error", err)
|
||||
}
|
||||
|
||||
c.Logger.Info("Hourly snapshot summary",
|
||||
log.Info("Hourly snapshot summary",
|
||||
"vcenter", url,
|
||||
"vm_count", totals.VmCount,
|
||||
"vcpu_total", totals.VcpuTotal,
|
||||
@@ -1072,13 +1077,13 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
)
|
||||
metrics.RecordVcenterSnapshot(url, time.Since(started), totals.VmCount, nil)
|
||||
if upErr := db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, true, ""); upErr != nil {
|
||||
c.Logger.Warn("failed to record snapshot run", "url", url, "error", upErr)
|
||||
log.Warn("failed to record snapshot run", "url", url, "error", upErr)
|
||||
}
|
||||
if deletionsMarked {
|
||||
if err := c.generateReport(ctx, tableName); err != nil {
|
||||
c.Logger.Warn("failed to regenerate hourly report after deletions", "error", err, "table", tableName)
|
||||
log.Warn("failed to regenerate hourly report after deletions", "error", err, "table", tableName)
|
||||
} else {
|
||||
c.Logger.Debug("Regenerated hourly report after deletions", "table", tableName)
|
||||
log.Debug("Regenerated hourly report after deletions", "table", tableName)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
|
||||
Reference in New Issue
Block a user