improve tracking of VM deletions
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
This commit is contained in:
@@ -79,6 +79,9 @@ func (c *CronTask) RunVcenterSnapshotHourly(ctx context.Context, logger *slog.Lo
|
||||
if err := db.CheckMigrationState(ctx, c.Database.DB()); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := db.EnsureSnapshotRunTable(ctx, c.Database.DB()); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// reload settings in case vcenter list has changed
|
||||
c.Settings.ReadYMLSettings()
|
||||
@@ -178,6 +181,47 @@ func (c *CronTask) RunVcenterSnapshotHourly(ctx context.Context, logger *slog.Lo
|
||||
return nil
|
||||
}
|
||||
|
||||
// RunHourlySnapshotRetry retries failed vCenter hourly snapshots up to a maximum attempt count.
|
||||
func (c *CronTask) RunHourlySnapshotRetry(ctx context.Context, logger *slog.Logger) (err error) {
|
||||
jobStart := time.Now()
|
||||
defer func() {
|
||||
logger.Info("Hourly snapshot retry finished", "duration", time.Since(jobStart))
|
||||
}()
|
||||
|
||||
maxRetries := c.Settings.Values.Settings.HourlySnapshotMaxRetries
|
||||
if maxRetries <= 0 {
|
||||
maxRetries = 3
|
||||
}
|
||||
|
||||
dbConn := c.Database.DB()
|
||||
if err := db.EnsureSnapshotRunTable(ctx, dbConn); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
failed, err := db.ListFailedSnapshotRuns(ctx, dbConn, maxRetries)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if len(failed) == 0 {
|
||||
logger.Debug("No failed hourly snapshots to retry")
|
||||
return nil
|
||||
}
|
||||
|
||||
for _, f := range failed {
|
||||
startTime := time.Unix(f.SnapshotTime, 0)
|
||||
tableName, tnErr := hourlyInventoryTableName(startTime)
|
||||
if tnErr != nil {
|
||||
logger.Warn("unable to derive table name for retry", "error", tnErr, "snapshot_time", startTime, "vcenter", f.Vcenter)
|
||||
continue
|
||||
}
|
||||
logger.Info("Retrying hourly snapshot", "vcenter", f.Vcenter, "snapshot_time", startTime, "attempt", f.Attempts+1)
|
||||
if err := c.captureHourlySnapshotForVcenter(ctx, startTime, tableName, f.Vcenter); err != nil {
|
||||
logger.Warn("retry failed", "vcenter", f.Vcenter, "error", err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// RunSnapshotCleanup drops hourly and daily snapshot tables older than retention.
|
||||
func (c *CronTask) RunSnapshotCleanup(ctx context.Context, logger *slog.Logger) (err error) {
|
||||
jobCtx := ctx
|
||||
@@ -696,6 +740,7 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
vc := vcenter.New(c.Logger, c.VcCreds)
|
||||
if err := vc.Login(url); err != nil {
|
||||
metrics.RecordVcenterSnapshot(url, time.Since(started), 0, err)
|
||||
_ = db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, false, err.Error())
|
||||
return fmt.Errorf("unable to connect to vcenter: %w", err)
|
||||
}
|
||||
defer func() {
|
||||
@@ -707,12 +752,9 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
vcVms, err := vc.GetAllVMsWithProps()
|
||||
if err != nil {
|
||||
metrics.RecordVcenterSnapshot(url, time.Since(started), 0, err)
|
||||
_ = db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, false, err.Error())
|
||||
return fmt.Errorf("unable to get VMs from vcenter: %w", err)
|
||||
}
|
||||
canDetectMissing := len(vcVms) > 0
|
||||
if !canDetectMissing {
|
||||
c.Logger.Warn("no VMs returned from vcenter; skipping missing VM detection", "url", url)
|
||||
}
|
||||
hostLookup, err := vc.BuildHostLookup()
|
||||
if err != nil {
|
||||
c.Logger.Warn("failed to build host lookup", "url", url, "error", err)
|
||||
@@ -741,15 +783,26 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
}
|
||||
|
||||
inventoryByVmID := make(map[string]queries.Inventory, len(inventoryRows))
|
||||
inventoryByUuid := make(map[string]queries.Inventory, len(inventoryRows))
|
||||
inventoryByName := make(map[string]queries.Inventory, len(inventoryRows))
|
||||
for _, inv := range inventoryRows {
|
||||
if inv.VmId.Valid {
|
||||
inventoryByVmID[inv.VmId.String] = inv
|
||||
}
|
||||
if inv.VmUuid.Valid {
|
||||
inventoryByUuid[inv.VmUuid.String] = inv
|
||||
}
|
||||
if inv.Name != "" {
|
||||
inventoryByName[inv.Name] = inv
|
||||
}
|
||||
}
|
||||
|
||||
dbConn := c.Database.DB()
|
||||
presentSnapshots := make(map[string]inventorySnapshotRow, len(vcVms))
|
||||
presentByUuid := make(map[string]struct{}, len(vcVms))
|
||||
presentByName := make(map[string]struct{}, len(vcVms))
|
||||
totals := snapshotTotals{}
|
||||
deletionsMarked := false
|
||||
for _, vm := range vcVms {
|
||||
if strings.HasPrefix(vm.Name, "vCLS-") {
|
||||
continue
|
||||
@@ -772,6 +825,12 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
}
|
||||
row.IsPresent = "TRUE"
|
||||
presentSnapshots[vm.Reference().Value] = row
|
||||
if row.VmUuid.Valid {
|
||||
presentByUuid[row.VmUuid.String] = struct{}{}
|
||||
}
|
||||
if row.Name != "" {
|
||||
presentByName[row.Name] = struct{}{}
|
||||
}
|
||||
|
||||
totals.VmCount++
|
||||
totals.VcpuTotal += nullInt64ToInt(row.VcpuCount)
|
||||
@@ -784,30 +843,40 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
batch = append(batch, row)
|
||||
}
|
||||
|
||||
if !canDetectMissing {
|
||||
c.Logger.Info("Hourly snapshot summary",
|
||||
"vcenter", url,
|
||||
"vm_count", totals.VmCount,
|
||||
"vcpu_total", totals.VcpuTotal,
|
||||
"ram_total_gb", totals.RamTotal,
|
||||
"disk_total_gb", totals.DiskTotal,
|
||||
)
|
||||
return nil
|
||||
}
|
||||
missingCount := 0
|
||||
|
||||
for _, inv := range inventoryRows {
|
||||
if strings.HasPrefix(inv.Name, "vCLS-") {
|
||||
continue
|
||||
}
|
||||
vmID := inv.VmId.String
|
||||
uuid := ""
|
||||
if inv.VmUuid.Valid {
|
||||
uuid = inv.VmUuid.String
|
||||
}
|
||||
name := inv.Name
|
||||
|
||||
found := false
|
||||
if vmID != "" {
|
||||
if _, ok := presentSnapshots[vmID]; ok {
|
||||
continue
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found && uuid != "" {
|
||||
if _, ok := presentByUuid[uuid]; ok {
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found && name != "" {
|
||||
if _, ok := presentByName[name]; ok {
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if found {
|
||||
continue
|
||||
}
|
||||
|
||||
row := snapshotFromInventory(inv, startTime)
|
||||
row.IsPresent = "FALSE"
|
||||
if !row.DeletionTime.Valid {
|
||||
deletionTime := startTime.Unix()
|
||||
row.DeletionTime = sql.NullInt64{Int64: deletionTime, Valid: true}
|
||||
@@ -818,23 +887,43 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
}); err != nil {
|
||||
c.Logger.Warn("failed to mark inventory record deleted", "error", err, "vm_id", row.VmId.String)
|
||||
}
|
||||
c.Logger.Debug("Marked VM as deleted", "name", inv.Name, "vm_id", inv.VmId.String, "vm_uuid", inv.VmUuid.String, "vcenter", url, "snapshot_time", startTime)
|
||||
deletionsMarked = true
|
||||
}
|
||||
batch = append(batch, row)
|
||||
missingCount++
|
||||
}
|
||||
|
||||
if err := insertHourlyBatch(ctx, dbConn, tableName, batch); err != nil {
|
||||
metrics.RecordVcenterSnapshot(url, time.Since(started), totals.VmCount, err)
|
||||
_ = db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, false, err.Error())
|
||||
return err
|
||||
}
|
||||
|
||||
// Compare with previous snapshot for this vcenter to mark deletions at snapshot time.
|
||||
if prevTable, err := latestHourlySnapshotBefore(ctx, dbConn, startTime); err == nil && prevTable != "" {
|
||||
moreMissing := c.markMissingFromPrevious(ctx, dbConn, prevTable, url, startTime, presentSnapshots, presentByUuid, presentByName, inventoryByVmID, inventoryByUuid, inventoryByName)
|
||||
missingCount += moreMissing
|
||||
} else if err != nil {
|
||||
c.Logger.Warn("failed to locate previous hourly snapshot for deletion comparison", "error", err, "url", url)
|
||||
}
|
||||
|
||||
c.Logger.Info("Hourly snapshot summary",
|
||||
"vcenter", url,
|
||||
"vm_count", totals.VmCount,
|
||||
"vcpu_total", totals.VcpuTotal,
|
||||
"ram_total_gb", totals.RamTotal,
|
||||
"disk_total_gb", totals.DiskTotal,
|
||||
"missing_marked", missingCount,
|
||||
)
|
||||
metrics.RecordVcenterSnapshot(url, time.Since(started), totals.VmCount, nil)
|
||||
_ = db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, true, "")
|
||||
if deletionsMarked {
|
||||
if err := c.generateReport(ctx, tableName); err != nil {
|
||||
c.Logger.Warn("failed to regenerate hourly report after deletions", "error", err, "table", tableName)
|
||||
} else {
|
||||
c.Logger.Debug("Regenerated hourly report after deletions", "table", tableName)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -865,3 +954,141 @@ func boolStringFromInterface(value interface{}) string {
|
||||
return fmt.Sprint(v)
|
||||
}
|
||||
}
|
||||
|
||||
// latestHourlySnapshotBefore finds the most recent hourly snapshot table prior to the given time.
|
||||
func latestHourlySnapshotBefore(ctx context.Context, dbConn *sqlx.DB, cutoff time.Time) (string, error) {
|
||||
driver := strings.ToLower(dbConn.DriverName())
|
||||
var rows *sqlx.Rows
|
||||
var err error
|
||||
switch driver {
|
||||
case "sqlite":
|
||||
rows, err = dbConn.QueryxContext(ctx, `
|
||||
SELECT name FROM sqlite_master
|
||||
WHERE type = 'table' AND name LIKE 'inventory_hourly_%'
|
||||
`)
|
||||
case "pgx", "postgres":
|
||||
rows, err = dbConn.QueryxContext(ctx, `
|
||||
SELECT tablename FROM pg_catalog.pg_tables
|
||||
WHERE schemaname = 'public' AND tablename LIKE 'inventory_hourly_%'
|
||||
`)
|
||||
default:
|
||||
return "", fmt.Errorf("unsupported driver for snapshot lookup: %s", driver)
|
||||
}
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
var latest string
|
||||
var latestTime int64
|
||||
for rows.Next() {
|
||||
var name string
|
||||
if scanErr := rows.Scan(&name); scanErr != nil {
|
||||
continue
|
||||
}
|
||||
if !strings.HasPrefix(name, "inventory_hourly_") {
|
||||
continue
|
||||
}
|
||||
suffix := strings.TrimPrefix(name, "inventory_hourly_")
|
||||
epoch, parseErr := strconv.ParseInt(suffix, 10, 64)
|
||||
if parseErr != nil {
|
||||
continue
|
||||
}
|
||||
if epoch < cutoff.Unix() && epoch > latestTime {
|
||||
latestTime = epoch
|
||||
latest = name
|
||||
}
|
||||
}
|
||||
return latest, nil
|
||||
}
|
||||
|
||||
// markMissingFromPrevious marks VMs that were present in the previous snapshot but missing now.
|
||||
func (c *CronTask) markMissingFromPrevious(ctx context.Context, dbConn *sqlx.DB, prevTable string, vcenter string, snapshotTime time.Time,
|
||||
currentByID map[string]inventorySnapshotRow, currentByUuid map[string]struct{}, currentByName map[string]struct{},
|
||||
invByID map[string]queries.Inventory, invByUuid map[string]queries.Inventory, invByName map[string]queries.Inventory) int {
|
||||
|
||||
if err := db.ValidateTableName(prevTable); err != nil {
|
||||
return 0
|
||||
}
|
||||
|
||||
query := fmt.Sprintf(`SELECT "VmId","VmUuid","Name","Datacenter","DeletionTime" FROM %s WHERE "Vcenter" = ?`, prevTable)
|
||||
query = sqlx.Rebind(sqlx.BindType(dbConn.DriverName()), query)
|
||||
|
||||
type prevRow struct {
|
||||
VmId sql.NullString `db:"VmId"`
|
||||
VmUuid sql.NullString `db:"VmUuid"`
|
||||
Name string `db:"Name"`
|
||||
Datacenter sql.NullString `db:"Datacenter"`
|
||||
DeletionTime sql.NullInt64 `db:"DeletionTime"`
|
||||
}
|
||||
|
||||
rows, err := dbConn.QueryxContext(ctx, query, vcenter)
|
||||
if err != nil {
|
||||
c.Logger.Warn("failed to read previous snapshot for deletion detection", "error", err, "table", prevTable, "vcenter", vcenter)
|
||||
return 0
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
missing := 0
|
||||
for rows.Next() {
|
||||
var r prevRow
|
||||
if err := rows.StructScan(&r); err != nil {
|
||||
continue
|
||||
}
|
||||
vmID := r.VmId.String
|
||||
uuid := r.VmUuid.String
|
||||
name := r.Name
|
||||
|
||||
found := false
|
||||
if vmID != "" {
|
||||
if _, ok := currentByID[vmID]; ok {
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found && uuid != "" {
|
||||
if _, ok := currentByUuid[uuid]; ok {
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found && name != "" {
|
||||
if _, ok := currentByName[name]; ok {
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if found {
|
||||
continue
|
||||
}
|
||||
|
||||
var inv queries.Inventory
|
||||
var ok bool
|
||||
if vmID != "" {
|
||||
inv, ok = invByID[vmID]
|
||||
}
|
||||
if !ok && uuid != "" {
|
||||
inv, ok = invByUuid[uuid]
|
||||
}
|
||||
if !ok && name != "" {
|
||||
inv, ok = invByName[name]
|
||||
}
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if inv.DeletionTime.Valid {
|
||||
continue
|
||||
}
|
||||
|
||||
delTime := sql.NullInt64{Int64: snapshotTime.Unix(), Valid: true}
|
||||
if err := c.Database.Queries().InventoryMarkDeleted(ctx, queries.InventoryMarkDeletedParams{
|
||||
DeletionTime: delTime,
|
||||
VmId: inv.VmId,
|
||||
DatacenterName: inv.Datacenter,
|
||||
}); err != nil {
|
||||
c.Logger.Warn("failed to mark inventory record deleted from previous snapshot", "error", err, "vm_id", inv.VmId.String)
|
||||
continue
|
||||
}
|
||||
c.Logger.Debug("Detected VM missing compared to previous snapshot", "name", inv.Name, "vm_id", inv.VmId.String, "vm_uuid", inv.VmUuid.String, "vcenter", vcenter, "snapshot_time", snapshotTime, "prev_table", prevTable)
|
||||
missing++
|
||||
}
|
||||
|
||||
return missing
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user