fix hanging manual snapshot task
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
2026-01-15 17:49:42 +11:00
parent 1b91c73a18
commit 75a5f31a2f
6 changed files with 130 additions and 39 deletions

View File

@@ -43,7 +43,6 @@ type inventorySnapshotRow struct {
SrmPlaceholder string
VmUuid sql.NullString
SnapshotTime int64
IsPresent string
}
type snapshotTotals = db.SnapshotTotals
@@ -87,6 +86,8 @@ func (c *CronTask) RunVcenterSnapshotHourly(ctx context.Context, logger *slog.Lo
if err := db.EnsureSnapshotRunTable(ctx, c.Database.DB()); err != nil {
return err
}
// Best-effort cleanup of legacy IsPresent columns to simplify inserts.
c.dropLegacyIsPresentColumns(jobCtx)
// reload settings in case vcenter list has changed
c.Settings.ReadYMLSettings()
@@ -186,6 +187,30 @@ func (c *CronTask) RunVcenterSnapshotHourly(ctx context.Context, logger *slog.Lo
return nil
}
// dropLegacyIsPresentColumns attempts to remove the old IsPresent column from
// every hourly snapshot table returned by the snapshot registry.
//
// The cleanup is best-effort: the method returns nothing, and every failure
// (registry unavailable, listing failed, column check failed, ALTER TABLE
// failed) is only logged so the caller can carry on regardless. Dropping the
// column keeps inserts simple and avoids keeping unused data around.
//
// ctx bounds all database work; once it is done the remaining tables are
// skipped instead of issuing further queries that can only fail.
func (c *CronTask) dropLegacyIsPresentColumns(ctx context.Context) {
	dbConn := c.Database.DB()
	if err := report.EnsureSnapshotRegistry(ctx, c.Database); err != nil {
		c.Logger.Debug("skip IsPresent cleanup; registry unavailable", "error", err)
		return
	}
	records, err := report.ListSnapshots(ctx, c.Database, "hourly")
	if err != nil {
		c.Logger.Debug("skip IsPresent cleanup; unable to list hourly snapshots", "error", err)
		return
	}
	for _, r := range records {
		// A cancelled or expired context makes every following query fail, so
		// stop here rather than walking the rest of the table list.
		if err := ctx.Err(); err != nil {
			c.Logger.Debug("stop IsPresent cleanup; context done", "error", err)
			return
		}
		ok, err := db.ColumnExists(ctx, dbConn, r.TableName, "IsPresent")
		if err != nil {
			// Previously swallowed silently; surface it so a failing check is
			// distinguishable from "column already gone".
			c.Logger.Debug("unable to check for legacy IsPresent column", "table", r.TableName, "error", err)
			continue
		}
		if !ok {
			continue
		}
		if _, err := dbConn.ExecContext(ctx, fmt.Sprintf(`ALTER TABLE %s DROP COLUMN "IsPresent"`, r.TableName)); err != nil {
			c.Logger.Debug("unable to drop legacy IsPresent column", "table", r.TableName, "error", err)
			continue
		}
		c.Logger.Info("dropped legacy IsPresent column", "table", r.TableName)
	}
}
// RunHourlySnapshotRetry retries failed vCenter hourly snapshots up to a maximum attempt count.
func (c *CronTask) RunHourlySnapshotRetry(ctx context.Context, logger *slog.Logger) (err error) {
jobStart := time.Now()
@@ -440,7 +465,7 @@ var summaryUnionColumns = []string{
`"InventoryId"`, `"Name"`, `"Vcenter"`, `"VmId"`, `"EventKey"`, `"CloudId"`, `"CreationTime"`,
`"DeletionTime"`, `"ResourcePool"`, `"Datacenter"`, `"Cluster"`, `"Folder"`,
`"ProvisionedDisk"`, `"VcpuCount"`, `"RamGB"`, `"IsTemplate"`, `"PoweredOn"`,
`"SrmPlaceholder"`, `"VmUuid"`, `"SnapshotTime"`, `"IsPresent"`,
`"SrmPlaceholder"`, `"VmUuid"`, `"SnapshotTime"`,
}
func ensureSnapshotRowID(ctx context.Context, dbConn *sqlx.DB, tableName string) error {
@@ -621,18 +646,13 @@ func snapshotFromVM(vmObject *mo.VirtualMachine, vc *vcenter.Vcenter, snapshotTi
row.ResourcePool = sql.NullString{String: normalizeResourcePool(rpName), Valid: rpName != ""}
}
}
if !row.ResourcePool.Valid {
if rpName, err := vc.GetVmResourcePool(*vmObject); err == nil {
row.ResourcePool = sql.NullString{String: normalizeResourcePool(rpName), Valid: rpName != ""}
}
}
}
if row.Folder.String == "" {
if folderPath, ok := vc.GetVMFolderPathFromLookup(*vmObject, folderLookup); ok {
row.Folder = sql.NullString{String: folderPath, Valid: folderPath != ""}
} else if folderPath, err := vc.GetVMFolderPath(*vmObject); err == nil {
row.Folder = sql.NullString{String: folderPath, Valid: folderPath != ""}
} else {
// Unable to resolve folder path from lookup; leave empty.
}
}
@@ -695,21 +715,64 @@ func insertHourlyBatch(ctx context.Context, dbConn *sqlx.DB, tableName string, r
if err != nil {
return err
}
stmt, err := tx.PreparexContext(ctx, sqlx.Rebind(sqlx.BindType(dbConn.DriverName()), fmt.Sprintf(`
INSERT INTO %s (
"InventoryId", "Name", "Vcenter", "VmId", "EventKey", "CloudId", "CreationTime", "DeletionTime",
"ResourcePool", "Datacenter", "Cluster", "Folder", "ProvisionedDisk", "VcpuCount",
"RamGB", "IsTemplate", "PoweredOn", "SrmPlaceholder", "VmUuid", "SnapshotTime", "IsPresent"
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`, tableName)))
baseCols := []string{
"InventoryId", "Name", "Vcenter", "VmId", "EventKey", "CloudId", "CreationTime", "DeletionTime",
"ResourcePool", "Datacenter", "Cluster", "Folder", "ProvisionedDisk", "VcpuCount",
"RamGB", "IsTemplate", "PoweredOn", "SrmPlaceholder", "VmUuid", "SnapshotTime",
}
bind := sqlx.BindType(dbConn.DriverName())
buildStmt := func(cols []string) (*sqlx.Stmt, error) {
colList := `"` + strings.Join(cols, `", "`) + `"`
placeholders := strings.TrimRight(strings.Repeat("?, ", len(cols)), ", ")
return tx.PreparexContext(ctx, sqlx.Rebind(bind, fmt.Sprintf(`INSERT INTO %s (%s) VALUES (%s)`, tableName, colList, placeholders)))
}
stmt, err := buildStmt(baseCols)
if err != nil {
tx.Rollback()
return err
// Fallback for legacy tables that still have IsPresent.
withLegacy := append(append([]string{}, baseCols...), "IsPresent")
stmt, err = buildStmt(withLegacy)
if err != nil {
tx.Rollback()
return err
}
defer stmt.Close()
for _, row := range rows {
args := []interface{}{
row.InventoryId,
row.Name,
row.Vcenter,
row.VmId,
row.EventKey,
row.CloudId,
row.CreationTime,
row.DeletionTime,
row.ResourcePool,
row.Datacenter,
row.Cluster,
row.Folder,
row.ProvisionedDisk,
row.VcpuCount,
row.RamGB,
row.IsTemplate,
row.PoweredOn,
row.SrmPlaceholder,
row.VmUuid,
row.SnapshotTime,
"TRUE",
}
if _, err := stmt.ExecContext(ctx, args...); err != nil {
tx.Rollback()
return err
}
}
return tx.Commit()
}
defer stmt.Close()
for _, row := range rows {
if _, err := stmt.ExecContext(ctx,
args := []interface{}{
row.InventoryId,
row.Name,
row.Vcenter,
@@ -730,8 +793,8 @@ INSERT INTO %s (
row.SrmPlaceholder,
row.VmUuid,
row.SnapshotTime,
row.IsPresent,
); err != nil {
}
if _, err := stmt.ExecContext(ctx, args...); err != nil {
tx.Rollback()
return err
}
@@ -760,6 +823,7 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
_ = db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, false, err.Error())
return fmt.Errorf("unable to get VMs from vcenter: %w", err)
}
c.Logger.Debug("retrieved VMs from vcenter", "url", url, "vm_count", len(vcVms))
hostLookup, err := vc.BuildHostLookup()
if err != nil {
c.Logger.Warn("failed to build host lookup", "url", url, "error", err)
@@ -808,7 +872,9 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
presentByName := make(map[string]struct{}, len(vcVms))
totals := snapshotTotals{}
deletionsMarked := false
for _, vm := range vcVms {
progressEvery := 25
nextLog := progressEvery
for idx, vm := range vcVms {
if strings.HasPrefix(vm.Name, "vCLS-") {
continue
}
@@ -828,7 +894,6 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
c.Logger.Error("unable to build snapshot for VM", "vm_id", vm.Reference().Value, "error", err)
continue
}
row.IsPresent = "TRUE"
presentSnapshots[vm.Reference().Value] = row
if row.VmUuid.Valid {
presentByUuid[row.VmUuid.String] = struct{}{}
@@ -841,16 +906,24 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
totals.VcpuTotal += nullInt64ToInt(row.VcpuCount)
totals.RamTotal += nullInt64ToInt(row.RamGB)
totals.DiskTotal += nullFloat64ToFloat(row.ProvisionedDisk)
if idx+1 >= nextLog {
c.Logger.Debug("hourly snapshot progress", "processed", idx+1, "total", len(vcVms), "vcenter", url)
nextLog += progressEvery
}
}
c.Logger.Debug("hourly snapshot rows prepared", "vcenter", url, "rows", len(presentSnapshots))
batch := make([]inventorySnapshotRow, 0, len(presentSnapshots)+len(inventoryRows))
for _, row := range presentSnapshots {
batch = append(batch, row)
}
c.Logger.Debug("checking inventory for missing VMs", "vcenter", url)
missingCount := 0
for _, inv := range inventoryRows {
c.Logger.Debug("checking inventory for deletions", "vm_id", inv.VmId.String, "vm_uuid", inv.VmUuid.String, "name", inv.Name)
if strings.HasPrefix(inv.Name, "vCLS-") {
continue
}
@@ -898,6 +971,8 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
missingCount++
}
c.Logger.Debug("inserting hourly snapshot batch", "vcenter", url, "rows", len(batch))
if err := insertHourlyBatch(ctx, dbConn, tableName, batch); err != nil {
metrics.RecordVcenterSnapshot(url, time.Since(started), totals.VmCount, err)
_ = db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, false, err.Error())