fix hanging manual snapshot task
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
This commit is contained in:
@@ -185,7 +185,6 @@ SELECT
|
||||
COALESCE(SUM(CASE WHEN "RamGB" IS NOT NULL THEN "RamGB" ELSE 0 END), 0) AS ram_total,
|
||||
COALESCE(SUM(CASE WHEN "ProvisionedDisk" IS NOT NULL THEN "ProvisionedDisk" ELSE 0 END), 0) AS disk_total
|
||||
FROM %s
|
||||
WHERE "IsPresent" = 'TRUE'
|
||||
`, table)
|
||||
|
||||
var totals SnapshotTotals
|
||||
@@ -206,7 +205,6 @@ SELECT
|
||||
FROM (
|
||||
%s
|
||||
) snapshots
|
||||
WHERE "IsPresent" = 'TRUE'
|
||||
`, unionQuery)
|
||||
|
||||
var totals SnapshotTotals
|
||||
@@ -247,8 +245,7 @@ func EnsureSnapshotTable(ctx context.Context, dbConn *sqlx.DB, tableName string)
|
||||
"PoweredOn" TEXT,
|
||||
"SrmPlaceholder" TEXT,
|
||||
"VmUuid" TEXT,
|
||||
"SnapshotTime" BIGINT NOT NULL,
|
||||
"IsPresent" TEXT NOT NULL
|
||||
"SnapshotTime" BIGINT NOT NULL
|
||||
);`, tableName)
|
||||
default:
|
||||
ddl = fmt.Sprintf(`CREATE TABLE IF NOT EXISTS %s (
|
||||
@@ -272,8 +269,7 @@ func EnsureSnapshotTable(ctx context.Context, dbConn *sqlx.DB, tableName string)
|
||||
"PoweredOn" TEXT,
|
||||
"SrmPlaceholder" TEXT,
|
||||
"VmUuid" TEXT,
|
||||
"SnapshotTime" BIGINT NOT NULL,
|
||||
"IsPresent" TEXT NOT NULL
|
||||
"SnapshotTime" BIGINT NOT NULL
|
||||
);`, tableName)
|
||||
}
|
||||
|
||||
@@ -449,7 +445,6 @@ SELECT
|
||||
FROM snapshots s2
|
||||
WHERE s2."VmId" = agg."VmId"
|
||||
AND s2."Vcenter" = agg."Vcenter"
|
||||
AND s2."IsPresent" = 'TRUE'
|
||||
ORDER BY s2."SnapshotTime" DESC
|
||||
LIMIT 1
|
||||
) AS "ResourcePool",
|
||||
@@ -818,6 +813,11 @@ func EnsureSummaryTable(ctx context.Context, dbConn *sqlx.DB, tableName string)
|
||||
return err
|
||||
}
|
||||
|
||||
// Best-effort: drop legacy IsPresent column if it exists.
|
||||
if hasIsPresent, err := ColumnExists(ctx, dbConn, tableName, "IsPresent"); err == nil && hasIsPresent {
|
||||
_, _ = dbConn.ExecContext(ctx, fmt.Sprintf(`ALTER TABLE %s DROP COLUMN "IsPresent"`, tableName))
|
||||
}
|
||||
|
||||
indexes := []string{
|
||||
fmt.Sprintf(`CREATE INDEX IF NOT EXISTS %s_vm_vcenter_idx ON %s ("VmId","Vcenter")`, tableName, tableName),
|
||||
fmt.Sprintf(`CREATE INDEX IF NOT EXISTS %s_resourcepool_idx ON %s ("ResourcePool")`, tableName, tableName),
|
||||
|
||||
@@ -1027,13 +1027,13 @@ func buildHourlyTotals(ctx context.Context, dbConn *sqlx.DB, records []SnapshotR
|
||||
query := fmt.Sprintf(`
|
||||
SELECT
|
||||
COUNT(DISTINCT "VmId") AS vm_count,
|
||||
COALESCE(SUM(CASE WHEN "IsPresent" = 'TRUE' AND "VcpuCount" IS NOT NULL THEN "VcpuCount" ELSE 0 END), 0) AS vcpu_total,
|
||||
COALESCE(SUM(CASE WHEN "IsPresent" = 'TRUE' AND "RamGB" IS NOT NULL THEN "RamGB" ELSE 0 END), 0) AS ram_total,
|
||||
COALESCE(AVG(CASE WHEN "IsPresent" = 'TRUE' THEN 1 ELSE 0 END), 0) AS presence_ratio,
|
||||
COALESCE(SUM(CASE WHEN "IsPresent" = 'TRUE' AND LOWER("ResourcePool") = 'tin' THEN 1 ELSE 0 END), 0) AS tin_total,
|
||||
COALESCE(SUM(CASE WHEN "IsPresent" = 'TRUE' AND LOWER("ResourcePool") = 'bronze' THEN 1 ELSE 0 END), 0) AS bronze_total,
|
||||
COALESCE(SUM(CASE WHEN "IsPresent" = 'TRUE' AND LOWER("ResourcePool") = 'silver' THEN 1 ELSE 0 END), 0) AS silver_total,
|
||||
COALESCE(SUM(CASE WHEN "IsPresent" = 'TRUE' AND LOWER("ResourcePool") = 'gold' THEN 1 ELSE 0 END), 0) AS gold_total
|
||||
COALESCE(SUM(CASE WHEN "VcpuCount" IS NOT NULL THEN "VcpuCount" ELSE 0 END), 0) AS vcpu_total,
|
||||
COALESCE(SUM(CASE WHEN "RamGB" IS NOT NULL THEN "RamGB" ELSE 0 END), 0) AS ram_total,
|
||||
1.0 AS presence_ratio,
|
||||
COALESCE(SUM(CASE WHEN LOWER("ResourcePool") = 'tin' THEN 1 ELSE 0 END), 0) AS tin_total,
|
||||
COALESCE(SUM(CASE WHEN LOWER("ResourcePool") = 'bronze' THEN 1 ELSE 0 END), 0) AS bronze_total,
|
||||
COALESCE(SUM(CASE WHEN LOWER("ResourcePool") = 'silver' THEN 1 ELSE 0 END), 0) AS silver_total,
|
||||
COALESCE(SUM(CASE WHEN LOWER("ResourcePool") = 'gold' THEN 1 ELSE 0 END), 0) AS gold_total
|
||||
FROM %s
|
||||
WHERE %s
|
||||
`, record.TableName, templateExclusionFilter())
|
||||
|
||||
@@ -129,6 +129,13 @@ func (c *CronTask) aggregateDailySummary(ctx context.Context, targetTime time.Ti
|
||||
c.Logger.Error("failed to aggregate daily inventory", "error", err, "date", dayStart.Format("2006-01-02"))
|
||||
return err
|
||||
}
|
||||
// Backfill missing creation times to the start of the day for rows where vCenter had no creation info.
|
||||
if _, err := dbConn.ExecContext(ctx,
|
||||
`UPDATE `+summaryTable+` SET "CreationTime" = $1 WHERE "CreationTime" IS NULL OR "CreationTime" = 0`,
|
||||
dayStart.Unix(),
|
||||
); err != nil {
|
||||
c.Logger.Warn("failed to normalize creation times for daily summary", "error", err, "table", summaryTable)
|
||||
}
|
||||
if err := db.RefineCreationDeletionFromUnion(ctx, dbConn, summaryTable, unionQuery); err != nil {
|
||||
c.Logger.Warn("failed to refine creation/deletion times", "error", err, "table", summaryTable)
|
||||
}
|
||||
|
||||
@@ -43,7 +43,6 @@ type inventorySnapshotRow struct {
|
||||
SrmPlaceholder string
|
||||
VmUuid sql.NullString
|
||||
SnapshotTime int64
|
||||
IsPresent string
|
||||
}
|
||||
|
||||
type snapshotTotals = db.SnapshotTotals
|
||||
@@ -87,6 +86,8 @@ func (c *CronTask) RunVcenterSnapshotHourly(ctx context.Context, logger *slog.Lo
|
||||
if err := db.EnsureSnapshotRunTable(ctx, c.Database.DB()); err != nil {
|
||||
return err
|
||||
}
|
||||
// Best-effort cleanup of legacy IsPresent columns to simplify inserts.
|
||||
c.dropLegacyIsPresentColumns(jobCtx)
|
||||
|
||||
// reload settings in case vcenter list has changed
|
||||
c.Settings.ReadYMLSettings()
|
||||
@@ -186,6 +187,30 @@ func (c *CronTask) RunVcenterSnapshotHourly(ctx context.Context, logger *slog.Lo
|
||||
return nil
|
||||
}
|
||||
|
||||
// dropLegacyIsPresentColumns attempts to remove the old IsPresent column from hourly tables.
|
||||
// This keeps inserts simple and avoids keeping unused data around.
|
||||
func (c *CronTask) dropLegacyIsPresentColumns(ctx context.Context) {
|
||||
dbConn := c.Database.DB()
|
||||
if err := report.EnsureSnapshotRegistry(ctx, c.Database); err != nil {
|
||||
c.Logger.Debug("skip IsPresent cleanup; registry unavailable", "error", err)
|
||||
return
|
||||
}
|
||||
records, err := report.ListSnapshots(ctx, c.Database, "hourly")
|
||||
if err != nil {
|
||||
c.Logger.Debug("skip IsPresent cleanup; unable to list hourly snapshots", "error", err)
|
||||
return
|
||||
}
|
||||
for _, r := range records {
|
||||
if ok, err := db.ColumnExists(ctx, dbConn, r.TableName, "IsPresent"); err == nil && ok {
|
||||
if _, err := dbConn.ExecContext(ctx, fmt.Sprintf(`ALTER TABLE %s DROP COLUMN "IsPresent"`, r.TableName)); err != nil {
|
||||
c.Logger.Debug("unable to drop legacy IsPresent column", "table", r.TableName, "error", err)
|
||||
} else {
|
||||
c.Logger.Info("dropped legacy IsPresent column", "table", r.TableName)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// RunHourlySnapshotRetry retries failed vCenter hourly snapshots up to a maximum attempt count.
|
||||
func (c *CronTask) RunHourlySnapshotRetry(ctx context.Context, logger *slog.Logger) (err error) {
|
||||
jobStart := time.Now()
|
||||
@@ -440,7 +465,7 @@ var summaryUnionColumns = []string{
|
||||
`"InventoryId"`, `"Name"`, `"Vcenter"`, `"VmId"`, `"EventKey"`, `"CloudId"`, `"CreationTime"`,
|
||||
`"DeletionTime"`, `"ResourcePool"`, `"Datacenter"`, `"Cluster"`, `"Folder"`,
|
||||
`"ProvisionedDisk"`, `"VcpuCount"`, `"RamGB"`, `"IsTemplate"`, `"PoweredOn"`,
|
||||
`"SrmPlaceholder"`, `"VmUuid"`, `"SnapshotTime"`, `"IsPresent"`,
|
||||
`"SrmPlaceholder"`, `"VmUuid"`, `"SnapshotTime"`,
|
||||
}
|
||||
|
||||
func ensureSnapshotRowID(ctx context.Context, dbConn *sqlx.DB, tableName string) error {
|
||||
@@ -621,18 +646,13 @@ func snapshotFromVM(vmObject *mo.VirtualMachine, vc *vcenter.Vcenter, snapshotTi
|
||||
row.ResourcePool = sql.NullString{String: normalizeResourcePool(rpName), Valid: rpName != ""}
|
||||
}
|
||||
}
|
||||
if !row.ResourcePool.Valid {
|
||||
if rpName, err := vc.GetVmResourcePool(*vmObject); err == nil {
|
||||
row.ResourcePool = sql.NullString{String: normalizeResourcePool(rpName), Valid: rpName != ""}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if row.Folder.String == "" {
|
||||
if folderPath, ok := vc.GetVMFolderPathFromLookup(*vmObject, folderLookup); ok {
|
||||
row.Folder = sql.NullString{String: folderPath, Valid: folderPath != ""}
|
||||
} else if folderPath, err := vc.GetVMFolderPath(*vmObject); err == nil {
|
||||
row.Folder = sql.NullString{String: folderPath, Valid: folderPath != ""}
|
||||
} else {
|
||||
// Unable to resolve folder path from lookup; leave empty.
|
||||
}
|
||||
}
|
||||
|
||||
@@ -695,21 +715,31 @@ func insertHourlyBatch(ctx context.Context, dbConn *sqlx.DB, tableName string, r
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
stmt, err := tx.PreparexContext(ctx, sqlx.Rebind(sqlx.BindType(dbConn.DriverName()), fmt.Sprintf(`
|
||||
INSERT INTO %s (
|
||||
|
||||
baseCols := []string{
|
||||
"InventoryId", "Name", "Vcenter", "VmId", "EventKey", "CloudId", "CreationTime", "DeletionTime",
|
||||
"ResourcePool", "Datacenter", "Cluster", "Folder", "ProvisionedDisk", "VcpuCount",
|
||||
"RamGB", "IsTemplate", "PoweredOn", "SrmPlaceholder", "VmUuid", "SnapshotTime", "IsPresent"
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
`, tableName)))
|
||||
"RamGB", "IsTemplate", "PoweredOn", "SrmPlaceholder", "VmUuid", "SnapshotTime",
|
||||
}
|
||||
bind := sqlx.BindType(dbConn.DriverName())
|
||||
buildStmt := func(cols []string) (*sqlx.Stmt, error) {
|
||||
colList := `"` + strings.Join(cols, `", "`) + `"`
|
||||
placeholders := strings.TrimRight(strings.Repeat("?, ", len(cols)), ", ")
|
||||
return tx.PreparexContext(ctx, sqlx.Rebind(bind, fmt.Sprintf(`INSERT INTO %s (%s) VALUES (%s)`, tableName, colList, placeholders)))
|
||||
}
|
||||
|
||||
stmt, err := buildStmt(baseCols)
|
||||
if err != nil {
|
||||
// Fallback for legacy tables that still have IsPresent.
|
||||
withLegacy := append(append([]string{}, baseCols...), "IsPresent")
|
||||
stmt, err = buildStmt(withLegacy)
|
||||
if err != nil {
|
||||
tx.Rollback()
|
||||
return err
|
||||
}
|
||||
defer stmt.Close()
|
||||
|
||||
for _, row := range rows {
|
||||
if _, err := stmt.ExecContext(ctx,
|
||||
args := []interface{}{
|
||||
row.InventoryId,
|
||||
row.Name,
|
||||
row.Vcenter,
|
||||
@@ -730,8 +760,41 @@ INSERT INTO %s (
|
||||
row.SrmPlaceholder,
|
||||
row.VmUuid,
|
||||
row.SnapshotTime,
|
||||
row.IsPresent,
|
||||
); err != nil {
|
||||
"TRUE",
|
||||
}
|
||||
if _, err := stmt.ExecContext(ctx, args...); err != nil {
|
||||
tx.Rollback()
|
||||
return err
|
||||
}
|
||||
}
|
||||
return tx.Commit()
|
||||
}
|
||||
defer stmt.Close()
|
||||
|
||||
for _, row := range rows {
|
||||
args := []interface{}{
|
||||
row.InventoryId,
|
||||
row.Name,
|
||||
row.Vcenter,
|
||||
row.VmId,
|
||||
row.EventKey,
|
||||
row.CloudId,
|
||||
row.CreationTime,
|
||||
row.DeletionTime,
|
||||
row.ResourcePool,
|
||||
row.Datacenter,
|
||||
row.Cluster,
|
||||
row.Folder,
|
||||
row.ProvisionedDisk,
|
||||
row.VcpuCount,
|
||||
row.RamGB,
|
||||
row.IsTemplate,
|
||||
row.PoweredOn,
|
||||
row.SrmPlaceholder,
|
||||
row.VmUuid,
|
||||
row.SnapshotTime,
|
||||
}
|
||||
if _, err := stmt.ExecContext(ctx, args...); err != nil {
|
||||
tx.Rollback()
|
||||
return err
|
||||
}
|
||||
@@ -760,6 +823,7 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
_ = db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, false, err.Error())
|
||||
return fmt.Errorf("unable to get VMs from vcenter: %w", err)
|
||||
}
|
||||
c.Logger.Debug("retrieved VMs from vcenter", "url", url, "vm_count", len(vcVms))
|
||||
hostLookup, err := vc.BuildHostLookup()
|
||||
if err != nil {
|
||||
c.Logger.Warn("failed to build host lookup", "url", url, "error", err)
|
||||
@@ -808,7 +872,9 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
presentByName := make(map[string]struct{}, len(vcVms))
|
||||
totals := snapshotTotals{}
|
||||
deletionsMarked := false
|
||||
for _, vm := range vcVms {
|
||||
progressEvery := 25
|
||||
nextLog := progressEvery
|
||||
for idx, vm := range vcVms {
|
||||
if strings.HasPrefix(vm.Name, "vCLS-") {
|
||||
continue
|
||||
}
|
||||
@@ -828,7 +894,6 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
c.Logger.Error("unable to build snapshot for VM", "vm_id", vm.Reference().Value, "error", err)
|
||||
continue
|
||||
}
|
||||
row.IsPresent = "TRUE"
|
||||
presentSnapshots[vm.Reference().Value] = row
|
||||
if row.VmUuid.Valid {
|
||||
presentByUuid[row.VmUuid.String] = struct{}{}
|
||||
@@ -841,16 +906,24 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
totals.VcpuTotal += nullInt64ToInt(row.VcpuCount)
|
||||
totals.RamTotal += nullInt64ToInt(row.RamGB)
|
||||
totals.DiskTotal += nullFloat64ToFloat(row.ProvisionedDisk)
|
||||
|
||||
if idx+1 >= nextLog {
|
||||
c.Logger.Debug("hourly snapshot progress", "processed", idx+1, "total", len(vcVms), "vcenter", url)
|
||||
nextLog += progressEvery
|
||||
}
|
||||
}
|
||||
c.Logger.Debug("hourly snapshot rows prepared", "vcenter", url, "rows", len(presentSnapshots))
|
||||
|
||||
batch := make([]inventorySnapshotRow, 0, len(presentSnapshots)+len(inventoryRows))
|
||||
for _, row := range presentSnapshots {
|
||||
batch = append(batch, row)
|
||||
}
|
||||
c.Logger.Debug("checking inventory for missing VMs", "vcenter", url)
|
||||
|
||||
missingCount := 0
|
||||
|
||||
for _, inv := range inventoryRows {
|
||||
c.Logger.Debug("checking inventory for deletions", "vm_id", inv.VmId.String, "vm_uuid", inv.VmUuid.String, "name", inv.Name)
|
||||
if strings.HasPrefix(inv.Name, "vCLS-") {
|
||||
continue
|
||||
}
|
||||
@@ -898,6 +971,8 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
missingCount++
|
||||
}
|
||||
|
||||
c.Logger.Debug("inserting hourly snapshot batch", "vcenter", url, "rows", len(batch))
|
||||
|
||||
if err := insertHourlyBatch(ctx, dbConn, tableName, batch); err != nil {
|
||||
metrics.RecordVcenterSnapshot(url, time.Since(started), totals.VmCount, err)
|
||||
_ = db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, false, err.Error())
|
||||
|
||||
@@ -99,6 +99,13 @@ func (c *CronTask) aggregateMonthlySummary(ctx context.Context, targetMonth time
|
||||
c.Logger.Error("failed to aggregate monthly inventory", "error", err, "month", targetMonth.Format("2006-01"))
|
||||
return err
|
||||
}
|
||||
// Backfill missing creation times to the start of the month for rows lacking creation info.
|
||||
if _, err := dbConn.ExecContext(ctx,
|
||||
`UPDATE `+monthlyTable+` SET "CreationTime" = $1 WHERE "CreationTime" IS NULL OR "CreationTime" = 0`,
|
||||
monthStart.Unix(),
|
||||
); err != nil {
|
||||
c.Logger.Warn("failed to normalize creation times for monthly summary", "error", err, "table", monthlyTable)
|
||||
}
|
||||
rowCount, err := db.TableRowCount(ctx, dbConn, monthlyTable)
|
||||
if err != nil {
|
||||
c.Logger.Warn("unable to count monthly summary rows", "error", err, "table", monthlyTable)
|
||||
|
||||
@@ -663,7 +663,8 @@ func (v *Vcenter) GetVMFolderPath(vm mo.VirtualMachine) (string, error) {
|
||||
folderPath := ""
|
||||
//v.Logger.Debug("parent is", "parent", parentRef)
|
||||
|
||||
for parentRef != nil && parentRef.Type != "Datacenter" {
|
||||
maxHops := 128
|
||||
for parentRef != nil && parentRef.Type != "Datacenter" && maxHops > 0 {
|
||||
// Retrieve the parent object
|
||||
//parentObj, err := finder.ObjectReference(v.ctx, *parentRef)
|
||||
//if err != nil {
|
||||
@@ -691,9 +692,10 @@ func (v *Vcenter) GetVMFolderPath(vm mo.VirtualMachine) (string, error) {
|
||||
return "", fmt.Errorf("unexpected parent type: %s", parentObj.Reference().Type)
|
||||
}
|
||||
//break
|
||||
maxHops--
|
||||
}
|
||||
|
||||
if parentRef == nil {
|
||||
if parentRef == nil || maxHops == 0 {
|
||||
return "", fmt.Errorf("folder traversal terminated early for VM %s", vm.Name)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user