fix hanging manual snapshot task
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
2026-01-15 17:49:42 +11:00
parent 1b91c73a18
commit 75a5f31a2f
6 changed files with 130 additions and 39 deletions

View File

@@ -185,7 +185,6 @@ SELECT
COALESCE(SUM(CASE WHEN "RamGB" IS NOT NULL THEN "RamGB" ELSE 0 END), 0) AS ram_total, COALESCE(SUM(CASE WHEN "RamGB" IS NOT NULL THEN "RamGB" ELSE 0 END), 0) AS ram_total,
COALESCE(SUM(CASE WHEN "ProvisionedDisk" IS NOT NULL THEN "ProvisionedDisk" ELSE 0 END), 0) AS disk_total COALESCE(SUM(CASE WHEN "ProvisionedDisk" IS NOT NULL THEN "ProvisionedDisk" ELSE 0 END), 0) AS disk_total
FROM %s FROM %s
WHERE "IsPresent" = 'TRUE'
`, table) `, table)
var totals SnapshotTotals var totals SnapshotTotals
@@ -206,7 +205,6 @@ SELECT
FROM ( FROM (
%s %s
) snapshots ) snapshots
WHERE "IsPresent" = 'TRUE'
`, unionQuery) `, unionQuery)
var totals SnapshotTotals var totals SnapshotTotals
@@ -247,8 +245,7 @@ func EnsureSnapshotTable(ctx context.Context, dbConn *sqlx.DB, tableName string)
"PoweredOn" TEXT, "PoweredOn" TEXT,
"SrmPlaceholder" TEXT, "SrmPlaceholder" TEXT,
"VmUuid" TEXT, "VmUuid" TEXT,
"SnapshotTime" BIGINT NOT NULL, "SnapshotTime" BIGINT NOT NULL
"IsPresent" TEXT NOT NULL
);`, tableName) );`, tableName)
default: default:
ddl = fmt.Sprintf(`CREATE TABLE IF NOT EXISTS %s ( ddl = fmt.Sprintf(`CREATE TABLE IF NOT EXISTS %s (
@@ -272,8 +269,7 @@ func EnsureSnapshotTable(ctx context.Context, dbConn *sqlx.DB, tableName string)
"PoweredOn" TEXT, "PoweredOn" TEXT,
"SrmPlaceholder" TEXT, "SrmPlaceholder" TEXT,
"VmUuid" TEXT, "VmUuid" TEXT,
"SnapshotTime" BIGINT NOT NULL, "SnapshotTime" BIGINT NOT NULL
"IsPresent" TEXT NOT NULL
);`, tableName) );`, tableName)
} }
@@ -449,7 +445,6 @@ SELECT
FROM snapshots s2 FROM snapshots s2
WHERE s2."VmId" = agg."VmId" WHERE s2."VmId" = agg."VmId"
AND s2."Vcenter" = agg."Vcenter" AND s2."Vcenter" = agg."Vcenter"
AND s2."IsPresent" = 'TRUE'
ORDER BY s2."SnapshotTime" DESC ORDER BY s2."SnapshotTime" DESC
LIMIT 1 LIMIT 1
) AS "ResourcePool", ) AS "ResourcePool",
@@ -818,6 +813,11 @@ func EnsureSummaryTable(ctx context.Context, dbConn *sqlx.DB, tableName string)
return err return err
} }
// Best-effort: drop legacy IsPresent column if it exists.
if hasIsPresent, err := ColumnExists(ctx, dbConn, tableName, "IsPresent"); err == nil && hasIsPresent {
_, _ = dbConn.ExecContext(ctx, fmt.Sprintf(`ALTER TABLE %s DROP COLUMN "IsPresent"`, tableName))
}
indexes := []string{ indexes := []string{
fmt.Sprintf(`CREATE INDEX IF NOT EXISTS %s_vm_vcenter_idx ON %s ("VmId","Vcenter")`, tableName, tableName), fmt.Sprintf(`CREATE INDEX IF NOT EXISTS %s_vm_vcenter_idx ON %s ("VmId","Vcenter")`, tableName, tableName),
fmt.Sprintf(`CREATE INDEX IF NOT EXISTS %s_resourcepool_idx ON %s ("ResourcePool")`, tableName, tableName), fmt.Sprintf(`CREATE INDEX IF NOT EXISTS %s_resourcepool_idx ON %s ("ResourcePool")`, tableName, tableName),

View File

@@ -1027,13 +1027,13 @@ func buildHourlyTotals(ctx context.Context, dbConn *sqlx.DB, records []SnapshotR
query := fmt.Sprintf(` query := fmt.Sprintf(`
SELECT SELECT
COUNT(DISTINCT "VmId") AS vm_count, COUNT(DISTINCT "VmId") AS vm_count,
COALESCE(SUM(CASE WHEN "IsPresent" = 'TRUE' AND "VcpuCount" IS NOT NULL THEN "VcpuCount" ELSE 0 END), 0) AS vcpu_total, COALESCE(SUM(CASE WHEN "VcpuCount" IS NOT NULL THEN "VcpuCount" ELSE 0 END), 0) AS vcpu_total,
COALESCE(SUM(CASE WHEN "IsPresent" = 'TRUE' AND "RamGB" IS NOT NULL THEN "RamGB" ELSE 0 END), 0) AS ram_total, COALESCE(SUM(CASE WHEN "RamGB" IS NOT NULL THEN "RamGB" ELSE 0 END), 0) AS ram_total,
COALESCE(AVG(CASE WHEN "IsPresent" = 'TRUE' THEN 1 ELSE 0 END), 0) AS presence_ratio, 1.0 AS presence_ratio,
COALESCE(SUM(CASE WHEN "IsPresent" = 'TRUE' AND LOWER("ResourcePool") = 'tin' THEN 1 ELSE 0 END), 0) AS tin_total, COALESCE(SUM(CASE WHEN LOWER("ResourcePool") = 'tin' THEN 1 ELSE 0 END), 0) AS tin_total,
COALESCE(SUM(CASE WHEN "IsPresent" = 'TRUE' AND LOWER("ResourcePool") = 'bronze' THEN 1 ELSE 0 END), 0) AS bronze_total, COALESCE(SUM(CASE WHEN LOWER("ResourcePool") = 'bronze' THEN 1 ELSE 0 END), 0) AS bronze_total,
COALESCE(SUM(CASE WHEN "IsPresent" = 'TRUE' AND LOWER("ResourcePool") = 'silver' THEN 1 ELSE 0 END), 0) AS silver_total, COALESCE(SUM(CASE WHEN LOWER("ResourcePool") = 'silver' THEN 1 ELSE 0 END), 0) AS silver_total,
COALESCE(SUM(CASE WHEN "IsPresent" = 'TRUE' AND LOWER("ResourcePool") = 'gold' THEN 1 ELSE 0 END), 0) AS gold_total COALESCE(SUM(CASE WHEN LOWER("ResourcePool") = 'gold' THEN 1 ELSE 0 END), 0) AS gold_total
FROM %s FROM %s
WHERE %s WHERE %s
`, record.TableName, templateExclusionFilter()) `, record.TableName, templateExclusionFilter())

View File

@@ -129,6 +129,13 @@ func (c *CronTask) aggregateDailySummary(ctx context.Context, targetTime time.Ti
c.Logger.Error("failed to aggregate daily inventory", "error", err, "date", dayStart.Format("2006-01-02")) c.Logger.Error("failed to aggregate daily inventory", "error", err, "date", dayStart.Format("2006-01-02"))
return err return err
} }
// Backfill missing creation times to the start of the day for rows where vCenter had no creation info.
if _, err := dbConn.ExecContext(ctx,
`UPDATE `+summaryTable+` SET "CreationTime" = $1 WHERE "CreationTime" IS NULL OR "CreationTime" = 0`,
dayStart.Unix(),
); err != nil {
c.Logger.Warn("failed to normalize creation times for daily summary", "error", err, "table", summaryTable)
}
if err := db.RefineCreationDeletionFromUnion(ctx, dbConn, summaryTable, unionQuery); err != nil { if err := db.RefineCreationDeletionFromUnion(ctx, dbConn, summaryTable, unionQuery); err != nil {
c.Logger.Warn("failed to refine creation/deletion times", "error", err, "table", summaryTable) c.Logger.Warn("failed to refine creation/deletion times", "error", err, "table", summaryTable)
} }

View File

@@ -43,7 +43,6 @@ type inventorySnapshotRow struct {
SrmPlaceholder string SrmPlaceholder string
VmUuid sql.NullString VmUuid sql.NullString
SnapshotTime int64 SnapshotTime int64
IsPresent string
} }
type snapshotTotals = db.SnapshotTotals type snapshotTotals = db.SnapshotTotals
@@ -87,6 +86,8 @@ func (c *CronTask) RunVcenterSnapshotHourly(ctx context.Context, logger *slog.Lo
if err := db.EnsureSnapshotRunTable(ctx, c.Database.DB()); err != nil { if err := db.EnsureSnapshotRunTable(ctx, c.Database.DB()); err != nil {
return err return err
} }
// Best-effort cleanup of legacy IsPresent columns to simplify inserts.
c.dropLegacyIsPresentColumns(jobCtx)
// reload settings in case vcenter list has changed // reload settings in case vcenter list has changed
c.Settings.ReadYMLSettings() c.Settings.ReadYMLSettings()
@@ -186,6 +187,30 @@ func (c *CronTask) RunVcenterSnapshotHourly(ctx context.Context, logger *slog.Lo
return nil return nil
} }
// dropLegacyIsPresentColumns attempts to remove the old IsPresent column from hourly tables.
// This keeps inserts simple and avoids keeping unused data around.
func (c *CronTask) dropLegacyIsPresentColumns(ctx context.Context) {
dbConn := c.Database.DB()
if err := report.EnsureSnapshotRegistry(ctx, c.Database); err != nil {
c.Logger.Debug("skip IsPresent cleanup; registry unavailable", "error", err)
return
}
records, err := report.ListSnapshots(ctx, c.Database, "hourly")
if err != nil {
c.Logger.Debug("skip IsPresent cleanup; unable to list hourly snapshots", "error", err)
return
}
for _, r := range records {
if ok, err := db.ColumnExists(ctx, dbConn, r.TableName, "IsPresent"); err == nil && ok {
if _, err := dbConn.ExecContext(ctx, fmt.Sprintf(`ALTER TABLE %s DROP COLUMN "IsPresent"`, r.TableName)); err != nil {
c.Logger.Debug("unable to drop legacy IsPresent column", "table", r.TableName, "error", err)
} else {
c.Logger.Info("dropped legacy IsPresent column", "table", r.TableName)
}
}
}
}
// RunHourlySnapshotRetry retries failed vCenter hourly snapshots up to a maximum attempt count. // RunHourlySnapshotRetry retries failed vCenter hourly snapshots up to a maximum attempt count.
func (c *CronTask) RunHourlySnapshotRetry(ctx context.Context, logger *slog.Logger) (err error) { func (c *CronTask) RunHourlySnapshotRetry(ctx context.Context, logger *slog.Logger) (err error) {
jobStart := time.Now() jobStart := time.Now()
@@ -440,7 +465,7 @@ var summaryUnionColumns = []string{
`"InventoryId"`, `"Name"`, `"Vcenter"`, `"VmId"`, `"EventKey"`, `"CloudId"`, `"CreationTime"`, `"InventoryId"`, `"Name"`, `"Vcenter"`, `"VmId"`, `"EventKey"`, `"CloudId"`, `"CreationTime"`,
`"DeletionTime"`, `"ResourcePool"`, `"Datacenter"`, `"Cluster"`, `"Folder"`, `"DeletionTime"`, `"ResourcePool"`, `"Datacenter"`, `"Cluster"`, `"Folder"`,
`"ProvisionedDisk"`, `"VcpuCount"`, `"RamGB"`, `"IsTemplate"`, `"PoweredOn"`, `"ProvisionedDisk"`, `"VcpuCount"`, `"RamGB"`, `"IsTemplate"`, `"PoweredOn"`,
`"SrmPlaceholder"`, `"VmUuid"`, `"SnapshotTime"`, `"IsPresent"`, `"SrmPlaceholder"`, `"VmUuid"`, `"SnapshotTime"`,
} }
func ensureSnapshotRowID(ctx context.Context, dbConn *sqlx.DB, tableName string) error { func ensureSnapshotRowID(ctx context.Context, dbConn *sqlx.DB, tableName string) error {
@@ -621,18 +646,13 @@ func snapshotFromVM(vmObject *mo.VirtualMachine, vc *vcenter.Vcenter, snapshotTi
row.ResourcePool = sql.NullString{String: normalizeResourcePool(rpName), Valid: rpName != ""} row.ResourcePool = sql.NullString{String: normalizeResourcePool(rpName), Valid: rpName != ""}
} }
} }
if !row.ResourcePool.Valid {
if rpName, err := vc.GetVmResourcePool(*vmObject); err == nil {
row.ResourcePool = sql.NullString{String: normalizeResourcePool(rpName), Valid: rpName != ""}
}
}
} }
if row.Folder.String == "" { if row.Folder.String == "" {
if folderPath, ok := vc.GetVMFolderPathFromLookup(*vmObject, folderLookup); ok { if folderPath, ok := vc.GetVMFolderPathFromLookup(*vmObject, folderLookup); ok {
row.Folder = sql.NullString{String: folderPath, Valid: folderPath != ""} row.Folder = sql.NullString{String: folderPath, Valid: folderPath != ""}
} else if folderPath, err := vc.GetVMFolderPath(*vmObject); err == nil { } else {
row.Folder = sql.NullString{String: folderPath, Valid: folderPath != ""} // Unable to resolve folder path from lookup; leave empty.
} }
} }
@@ -695,21 +715,64 @@ func insertHourlyBatch(ctx context.Context, dbConn *sqlx.DB, tableName string, r
if err != nil { if err != nil {
return err return err
} }
stmt, err := tx.PreparexContext(ctx, sqlx.Rebind(sqlx.BindType(dbConn.DriverName()), fmt.Sprintf(`
INSERT INTO %s ( baseCols := []string{
"InventoryId", "Name", "Vcenter", "VmId", "EventKey", "CloudId", "CreationTime", "DeletionTime", "InventoryId", "Name", "Vcenter", "VmId", "EventKey", "CloudId", "CreationTime", "DeletionTime",
"ResourcePool", "Datacenter", "Cluster", "Folder", "ProvisionedDisk", "VcpuCount", "ResourcePool", "Datacenter", "Cluster", "Folder", "ProvisionedDisk", "VcpuCount",
"RamGB", "IsTemplate", "PoweredOn", "SrmPlaceholder", "VmUuid", "SnapshotTime", "IsPresent" "RamGB", "IsTemplate", "PoweredOn", "SrmPlaceholder", "VmUuid", "SnapshotTime",
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) }
`, tableName))) bind := sqlx.BindType(dbConn.DriverName())
buildStmt := func(cols []string) (*sqlx.Stmt, error) {
colList := `"` + strings.Join(cols, `", "`) + `"`
placeholders := strings.TrimRight(strings.Repeat("?, ", len(cols)), ", ")
return tx.PreparexContext(ctx, sqlx.Rebind(bind, fmt.Sprintf(`INSERT INTO %s (%s) VALUES (%s)`, tableName, colList, placeholders)))
}
stmt, err := buildStmt(baseCols)
if err != nil { if err != nil {
tx.Rollback() // Fallback for legacy tables that still have IsPresent.
return err withLegacy := append(append([]string{}, baseCols...), "IsPresent")
stmt, err = buildStmt(withLegacy)
if err != nil {
tx.Rollback()
return err
}
defer stmt.Close()
for _, row := range rows {
args := []interface{}{
row.InventoryId,
row.Name,
row.Vcenter,
row.VmId,
row.EventKey,
row.CloudId,
row.CreationTime,
row.DeletionTime,
row.ResourcePool,
row.Datacenter,
row.Cluster,
row.Folder,
row.ProvisionedDisk,
row.VcpuCount,
row.RamGB,
row.IsTemplate,
row.PoweredOn,
row.SrmPlaceholder,
row.VmUuid,
row.SnapshotTime,
"TRUE",
}
if _, err := stmt.ExecContext(ctx, args...); err != nil {
tx.Rollback()
return err
}
}
return tx.Commit()
} }
defer stmt.Close() defer stmt.Close()
for _, row := range rows { for _, row := range rows {
if _, err := stmt.ExecContext(ctx, args := []interface{}{
row.InventoryId, row.InventoryId,
row.Name, row.Name,
row.Vcenter, row.Vcenter,
@@ -730,8 +793,8 @@ INSERT INTO %s (
row.SrmPlaceholder, row.SrmPlaceholder,
row.VmUuid, row.VmUuid,
row.SnapshotTime, row.SnapshotTime,
row.IsPresent, }
); err != nil { if _, err := stmt.ExecContext(ctx, args...); err != nil {
tx.Rollback() tx.Rollback()
return err return err
} }
@@ -760,6 +823,7 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
_ = db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, false, err.Error()) _ = db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, false, err.Error())
return fmt.Errorf("unable to get VMs from vcenter: %w", err) return fmt.Errorf("unable to get VMs from vcenter: %w", err)
} }
c.Logger.Debug("retrieved VMs from vcenter", "url", url, "vm_count", len(vcVms))
hostLookup, err := vc.BuildHostLookup() hostLookup, err := vc.BuildHostLookup()
if err != nil { if err != nil {
c.Logger.Warn("failed to build host lookup", "url", url, "error", err) c.Logger.Warn("failed to build host lookup", "url", url, "error", err)
@@ -808,7 +872,9 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
presentByName := make(map[string]struct{}, len(vcVms)) presentByName := make(map[string]struct{}, len(vcVms))
totals := snapshotTotals{} totals := snapshotTotals{}
deletionsMarked := false deletionsMarked := false
for _, vm := range vcVms { progressEvery := 25
nextLog := progressEvery
for idx, vm := range vcVms {
if strings.HasPrefix(vm.Name, "vCLS-") { if strings.HasPrefix(vm.Name, "vCLS-") {
continue continue
} }
@@ -828,7 +894,6 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
c.Logger.Error("unable to build snapshot for VM", "vm_id", vm.Reference().Value, "error", err) c.Logger.Error("unable to build snapshot for VM", "vm_id", vm.Reference().Value, "error", err)
continue continue
} }
row.IsPresent = "TRUE"
presentSnapshots[vm.Reference().Value] = row presentSnapshots[vm.Reference().Value] = row
if row.VmUuid.Valid { if row.VmUuid.Valid {
presentByUuid[row.VmUuid.String] = struct{}{} presentByUuid[row.VmUuid.String] = struct{}{}
@@ -841,16 +906,24 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
totals.VcpuTotal += nullInt64ToInt(row.VcpuCount) totals.VcpuTotal += nullInt64ToInt(row.VcpuCount)
totals.RamTotal += nullInt64ToInt(row.RamGB) totals.RamTotal += nullInt64ToInt(row.RamGB)
totals.DiskTotal += nullFloat64ToFloat(row.ProvisionedDisk) totals.DiskTotal += nullFloat64ToFloat(row.ProvisionedDisk)
if idx+1 >= nextLog {
c.Logger.Debug("hourly snapshot progress", "processed", idx+1, "total", len(vcVms), "vcenter", url)
nextLog += progressEvery
}
} }
c.Logger.Debug("hourly snapshot rows prepared", "vcenter", url, "rows", len(presentSnapshots))
batch := make([]inventorySnapshotRow, 0, len(presentSnapshots)+len(inventoryRows)) batch := make([]inventorySnapshotRow, 0, len(presentSnapshots)+len(inventoryRows))
for _, row := range presentSnapshots { for _, row := range presentSnapshots {
batch = append(batch, row) batch = append(batch, row)
} }
c.Logger.Debug("checking inventory for missing VMs", "vcenter", url)
missingCount := 0 missingCount := 0
for _, inv := range inventoryRows { for _, inv := range inventoryRows {
c.Logger.Debug("checking inventory for deletions", "vm_id", inv.VmId.String, "vm_uuid", inv.VmUuid.String, "name", inv.Name)
if strings.HasPrefix(inv.Name, "vCLS-") { if strings.HasPrefix(inv.Name, "vCLS-") {
continue continue
} }
@@ -898,6 +971,8 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
missingCount++ missingCount++
} }
c.Logger.Debug("inserting hourly snapshot batch", "vcenter", url, "rows", len(batch))
if err := insertHourlyBatch(ctx, dbConn, tableName, batch); err != nil { if err := insertHourlyBatch(ctx, dbConn, tableName, batch); err != nil {
metrics.RecordVcenterSnapshot(url, time.Since(started), totals.VmCount, err) metrics.RecordVcenterSnapshot(url, time.Since(started), totals.VmCount, err)
_ = db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, false, err.Error()) _ = db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, false, err.Error())

View File

@@ -99,6 +99,13 @@ func (c *CronTask) aggregateMonthlySummary(ctx context.Context, targetMonth time
c.Logger.Error("failed to aggregate monthly inventory", "error", err, "month", targetMonth.Format("2006-01")) c.Logger.Error("failed to aggregate monthly inventory", "error", err, "month", targetMonth.Format("2006-01"))
return err return err
} }
// Backfill missing creation times to the start of the month for rows lacking creation info.
if _, err := dbConn.ExecContext(ctx,
`UPDATE `+monthlyTable+` SET "CreationTime" = $1 WHERE "CreationTime" IS NULL OR "CreationTime" = 0`,
monthStart.Unix(),
); err != nil {
c.Logger.Warn("failed to normalize creation times for monthly summary", "error", err, "table", monthlyTable)
}
rowCount, err := db.TableRowCount(ctx, dbConn, monthlyTable) rowCount, err := db.TableRowCount(ctx, dbConn, monthlyTable)
if err != nil { if err != nil {
c.Logger.Warn("unable to count monthly summary rows", "error", err, "table", monthlyTable) c.Logger.Warn("unable to count monthly summary rows", "error", err, "table", monthlyTable)

View File

@@ -663,7 +663,8 @@ func (v *Vcenter) GetVMFolderPath(vm mo.VirtualMachine) (string, error) {
folderPath := "" folderPath := ""
//v.Logger.Debug("parent is", "parent", parentRef) //v.Logger.Debug("parent is", "parent", parentRef)
for parentRef != nil && parentRef.Type != "Datacenter" { maxHops := 128
for parentRef != nil && parentRef.Type != "Datacenter" && maxHops > 0 {
// Retrieve the parent object // Retrieve the parent object
//parentObj, err := finder.ObjectReference(v.ctx, *parentRef) //parentObj, err := finder.ObjectReference(v.ctx, *parentRef)
//if err != nil { //if err != nil {
@@ -691,9 +692,10 @@ func (v *Vcenter) GetVMFolderPath(vm mo.VirtualMachine) (string, error) {
return "", fmt.Errorf("unexpected parent type: %s", parentObj.Reference().Type) return "", fmt.Errorf("unexpected parent type: %s", parentObj.Reference().Type)
} }
//break //break
maxHops--
} }
if parentRef == nil { if parentRef == nil || maxHops == 0 {
return "", fmt.Errorf("folder traversal terminated early for VM %s", vm.Name) return "", fmt.Errorf("folder traversal terminated early for VM %s", vm.Name)
} }