enhance utilisation of postgres features
continuous-integration/drone/push Build is passing

This commit is contained in:
2026-04-20 10:19:27 +10:00
parent 98e92a8264
commit 8ccf5a7009
28 changed files with 2836 additions and 422 deletions
+245 -80
View File
@@ -121,6 +121,7 @@ func (c *CronTask) RunVcenterSnapshotHourly(ctx context.Context, logger *slog.Lo
if err := c.Settings.ReadYMLSettings(); err != nil {
return err
}
db.SetVmHourlyStatsPostgresPartitioningEnabled(c.postgresVmHourlyPartitioningEnabled())
ctx = settings.MarkReloadedInContext(ctx, c.Settings)
if c.FirstHourlySnapshotCheck {
@@ -143,15 +144,20 @@ func (c *CronTask) RunVcenterSnapshotHourly(ctx context.Context, logger *slog.Lo
c.FirstHourlySnapshotCheck = false
}
tableName, err := hourlyInventoryTableName(startTime)
if err != nil {
return err
}
dbConn := c.Database.DB()
db.ApplySQLiteTuning(ctx, dbConn)
if err := ensureDailyInventoryTable(ctx, dbConn, tableName); err != nil {
return err
compatMode := c.snapshotTableCompatModeEnabled()
tableName := ""
if compatMode {
tableName, err = hourlyInventoryTableName(startTime)
if err != nil {
return err
}
if err := ensureDailyInventoryTable(ctx, dbConn, tableName); err != nil {
return err
}
} else {
c.Logger.Info("Snapshot table compatibility mode disabled; writing canonical hourly cache only")
}
var wg sync.WaitGroup
@@ -202,17 +208,21 @@ func (c *CronTask) RunVcenterSnapshotHourly(ctx context.Context, logger *slog.Lo
return err
}
rowCount, err := db.TableRowCount(ctx, dbConn, tableName)
if err != nil {
c.Logger.Warn("unable to count hourly snapshot rows", "error", err, "table", tableName)
rowCount = -1
}
if err := report.RegisterSnapshot(ctx, c.Database, "hourly", tableName, startTime, rowCount); err != nil {
c.Logger.Warn("failed to register hourly snapshot", "error", err, "table", tableName)
rowCount := int64(-1)
if tableName != "" {
var countErr error
rowCount, countErr = db.TableRowCount(ctx, dbConn, tableName)
if countErr != nil {
c.Logger.Warn("unable to count hourly snapshot rows", "error", countErr, "table", tableName)
rowCount = -1
}
if err := report.RegisterSnapshot(ctx, c.Database, "hourly", tableName, startTime, rowCount); err != nil {
c.Logger.Warn("failed to register hourly snapshot", "error", err, "table", tableName)
}
}
metrics.RecordHourlySnapshot(startTime, rowCount, err)
var deferredTables []string
deferredTables := make([]string, 0, 8)
deferredReportTables.Range(func(key, _ any) bool {
name, ok := key.(string)
if ok && strings.TrimSpace(name) != "" && name != tableName {
@@ -220,17 +230,31 @@ func (c *CronTask) RunVcenterSnapshotHourly(ctx context.Context, logger *slog.Lo
}
return true
})
sort.Strings(deferredTables)
for _, reportTable := range deferredTables {
if err := c.generateReport(ctx, reportTable); err != nil {
c.Logger.Warn("failed to regenerate deferred hourly report after deletions", "error", err, "table", reportTable)
} else {
c.Logger.Debug("Regenerated deferred hourly report after deletions", "table", reportTable)
if tableName != "" {
deferredTables = append(deferredTables, tableName)
}
deferredTables = normalizeReportTables(deferredTables)
reportStageStart := time.Now()
reportMode := "sync"
if c.asyncReportGenerationEnabled() {
reportMode = "async"
c.queueReportGeneration(deferredTables)
} else {
for _, reportTable := range deferredTables {
if err := c.generateReport(ctx, reportTable); err != nil {
c.Logger.Warn("failed to regenerate deferred hourly report after deletions", "error", err, "table", reportTable)
} else {
c.Logger.Debug("Regenerated deferred hourly report after deletions", "table", reportTable)
}
}
}
if err := c.generateReport(ctx, tableName); err != nil {
c.Logger.Warn("failed to generate hourly report", "error", err, "table", tableName)
}
c.Logger.Info(
"Hourly snapshot stage complete",
"stage", "report_generation",
"mode", reportMode,
"tables", len(deferredTables),
"duration", time.Since(reportStageStart),
)
c.Logger.Debug("Finished hourly vcenter snapshot", "vcenter_count", len(c.Settings.Values.Settings.VcenterAddresses), "table", tableName, "row_count", rowCount)
return nil
@@ -631,6 +655,13 @@ func intWithDefault(value int, fallback int) int {
return value
}
// boolWithDefault dereferences an optional boolean setting.
//
// It returns *value when value is non-nil, and fallback when the pointer is
// nil (i.e. the setting was not provided).
func boolWithDefault(value *bool, fallback bool) bool {
	if value != nil {
		return *value
	}
	return fallback
}
func durationFromSeconds(seconds int, fallback time.Duration) time.Duration {
if seconds > 0 {
return time.Duration(seconds) * time.Second
@@ -665,6 +696,96 @@ func (c *CronTask) reportsDir() string {
return "/var/lib/vctp/reports"
}
// captureWriteBatchSize returns the configured CaptureWriteBatchSize setting,
// passed through intWithDefault with a fallback of 1000. When no settings are
// loaded it returns 1000 directly.
func (c *CronTask) captureWriteBatchSize() int {
	const defaultBatchSize = 1000
	if c.Settings == nil || c.Settings.Values == nil {
		return defaultBatchSize
	}
	return intWithDefault(c.Settings.Values.Settings.CaptureWriteBatchSize, defaultBatchSize)
}
// snapshotTableCompatModeEnabled reports the SnapshotTableCompatMode setting.
//
// The mode defaults to enabled: true is returned when settings are not loaded
// or the setting itself is unset (nil).
func (c *CronTask) snapshotTableCompatModeEnabled() bool {
	if c.Settings == nil || c.Settings.Values == nil {
		return true
	}
	return boolWithDefault(c.Settings.Values.Settings.SnapshotTableCompatMode, true)
}
// asyncReportGenerationEnabled reports the AsyncReportGeneration setting.
//
// Asynchronous generation defaults to enabled: true is returned when settings
// are not loaded or the setting itself is unset (nil).
func (c *CronTask) asyncReportGenerationEnabled() bool {
	if c.Settings == nil || c.Settings.Values == nil {
		return true
	}
	return boolWithDefault(c.Settings.Values.Settings.AsyncReportGeneration, true)
}
// postgresVmHourlyPartitioningEnabled reports the PostgresVmHourlyPartitioning
// setting.
//
// Partitioning defaults to disabled: false is returned when settings are not
// loaded or the setting itself is unset (nil).
func (c *CronTask) postgresVmHourlyPartitioningEnabled() bool {
	if c.Settings == nil || c.Settings.Values == nil {
		return false
	}
	return boolWithDefault(c.Settings.Values.Settings.PostgresVmHourlyPartitioning, false)
}
// scheduledAggregationEngine returns the normalised ScheduledAggregationEngine
// setting.
//
// The configured value is trimmed and lower-cased; only "go" and "sql" are
// accepted. Missing settings, an empty value, or any other value yield "go".
func (c *CronTask) scheduledAggregationEngine() string {
	const defaultEngine = "go"
	if c.Settings == nil || c.Settings.Values == nil {
		return defaultEngine
	}
	configured := strings.TrimSpace(c.Settings.Values.Settings.ScheduledAggregationEngine)
	switch normalised := strings.ToLower(configured); normalised {
	case "go", "sql":
		return normalised
	}
	// Empty and unrecognised values both fall back to the default engine.
	return defaultEngine
}
// normalizeReportTables returns a cleaned copy of tables: each name is
// whitespace-trimmed, blank names are dropped, duplicates are removed and the
// result is sorted ascending.
//
// An empty or nil input yields nil. The input slice is never modified.
func normalizeReportTables(tables []string) []string {
	if len(tables) == 0 {
		return nil
	}

	// Collect the non-blank, trimmed names into a fresh slice.
	cleaned := make([]string, 0, len(tables))
	for _, raw := range tables {
		if name := strings.TrimSpace(raw); name != "" {
			cleaned = append(cleaned, name)
		}
	}
	sort.Strings(cleaned)

	// After sorting, duplicates are adjacent; compact them in place. The write
	// index never overtakes the read index, so reusing the backing array is safe.
	unique := cleaned[:0]
	for _, name := range cleaned {
		if last := len(unique) - 1; last >= 0 && unique[last] == name {
			continue
		}
		unique = append(unique, name)
	}
	return unique
}
// queueReportGeneration generates reports for tables on a background
// goroutine and returns immediately.
//
// The table list is normalised first (trimmed, de-duplicated, sorted); nothing
// is started when the resulting list is empty. The goroutine runs on its own
// context with a 60-minute timeout, detached from the caller's context, and
// generates the reports one after another. Individual failures are logged and
// do not stop the remaining tables. Once the timeout has expired the loop
// stops with a single warning instead of attempting (and logging a failure
// for) every remaining table.
//
// The goroutine is fire-and-forget: callers get no completion signal and no
// error.
func (c *CronTask) queueReportGeneration(tables []string) {
	tables = normalizeReportTables(tables)
	if len(tables) == 0 {
		return
	}
	c.Logger.Info("Queueing async report generation", "tables", len(tables))
	// Hand the goroutine its own copy so it never shares a backing array with
	// the caller.
	go func(reportTables []string) {
		ctx, cancel := context.WithTimeout(context.Background(), 60*time.Minute)
		defer cancel()
		for i, reportTable := range reportTables {
			// Stop as soon as the deadline has passed; any further attempt
			// would run against an already-expired context.
			if err := ctx.Err(); err != nil {
				c.Logger.Warn("async report generation stopped before completion", "error", err, "remaining_tables", len(reportTables)-i)
				return
			}
			if err := c.generateReport(ctx, reportTable); err != nil {
				c.Logger.Warn("failed to generate async report", "table", reportTable, "error", err)
			}
		}
	}(append([]string(nil), tables...))
}
// generateReportWithPolicy generates the report for table according to the
// async report generation setting.
//
// When async generation is disabled the report is generated inline and the
// error from generateReport is returned. When it is enabled the table is
// handed to queueReportGeneration and nil is returned immediately, so a nil
// result then means "queued", not "generated".
func (c *CronTask) generateReportWithPolicy(ctx context.Context, table string) error {
	if !c.asyncReportGenerationEnabled() {
		return c.generateReport(ctx, table)
	}
	c.queueReportGeneration([]string{table})
	return nil
}
func (c *CronTask) generateReport(ctx context.Context, tableName string) error {
dest := c.reportsDir()
start := time.Now()
@@ -1332,6 +1453,7 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
log := c.Logger.With("vcenter", url)
ctx = db.WithLoggerContext(ctx, log)
started := time.Now()
captureStageStart := time.Now()
log.Debug("connecting to vcenter for hourly snapshot", "url", url)
vc, resources, cleanup, err := c.initVcenterResources(ctx, log, url, startTime, started)
if err != nil {
@@ -1365,12 +1487,54 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
for _, row := range presentSnapshots {
batch = append(batch, row)
}
log.Info(
"Hourly snapshot stage complete",
"stage", "capture",
"duration", time.Since(captureStageStart),
"present_rows", len(presentSnapshots),
"inventory_rows", len(inventoryRows),
"batch_rows", len(batch),
)
log.Debug("inserting hourly snapshot batch", "vcenter", url, "rows", len(batch))
writeBatchSize := c.captureWriteBatchSize()
for start := 0; start < len(batch); start += writeBatchSize {
end := min(start+writeBatchSize, len(batch))
chunk := batch[start:end]
if err := insertHourlyCache(ctx, dbConn, chunk); err != nil {
log.Warn("failed to insert hourly cache rows", "vcenter", url, "error", err, "chunk_start", start, "chunk_size", len(chunk))
}
if tableName != "" {
if err := insertHourlyBatch(ctx, dbConn, tableName, chunk); err != nil {
metrics.RecordVcenterSnapshot(url, time.Since(started), totals.VmCount, err)
if upErr := db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, false, err.Error()); upErr != nil {
log.Warn("failed to record snapshot run", "url", url, "error", upErr)
}
return err
}
}
}
// Record per-vCenter totals snapshot.
totalsStageStart := time.Now()
if err := db.InsertVcenterTotals(ctx, dbConn, url, startTime, totals.VmCount, totals.VcpuTotal, totals.RamTotal); err != nil {
slog.Warn("failed to insert vcenter totals", "vcenter", url, "snapshot_time", startTime.Unix(), "error", err)
}
log.Info(
"Hourly snapshot stage complete",
"stage", "totals_refresh",
"duration", time.Since(totalsStageStart),
"vm_count", totals.VmCount,
)
log.Debug("checking inventory for missing VMs")
reconcileStageStart := time.Now()
missingCount, deletionsMarked, candidates := prepareDeletionCandidates(ctx, log, dbConn, q, url, inventoryRows, presentSnapshots, presentByUuid, presentByName, startTime)
newCount := 0
prevTableName := ""
reportTables := make(map[string]struct{})
compatSnapshotUpdates := strings.TrimSpace(tableName) != ""
// If deletions detected, refine deletion time using vCenter events in a small window.
if missingCount > 0 {
@@ -1461,18 +1625,20 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
if name == "" {
name = snapRow.Name
}
if rowsAffected, err := updateDeletionTimeInSnapshot(ctx, dbConn, snapTable, url, cand.vmID, vmUUID, name, delTs.Int64); err != nil {
log.Warn("failed to update hourly snapshot deletion time from event", "table", snapTable, "vm_id", cand.vmID, "vm_uuid", vmUUID, "vcenter", url, "error", err)
} else if rowsAffected > 0 {
reportTables[snapTable] = struct{}{}
deletionsMarked = true
log.Debug("updated hourly snapshot deletion time from event", "table", snapTable, "vm_id", cand.vmID, "vm_uuid", vmUUID, "vcenter", url, "event_time", t)
if snapUnix, ok := parseSnapshotTime(snapTable); ok {
if cacheRows, err := updateDeletionTimeInHourlyCache(ctx, dbConn, url, cand.vmID, vmUUID, name, snapUnix, delTs.Int64); err != nil {
log.Warn("failed to update hourly cache deletion time from event", "snapshot_time", snapUnix, "vm_id", cand.vmID, "vm_uuid", vmUUID, "vcenter", url, "error", err)
} else if cacheRows > 0 {
log.Debug("updated hourly cache deletion time from event", "snapshot_time", snapUnix, "vm_id", cand.vmID, "vm_uuid", vmUUID, "vcenter", url, "event_time", t)
}
if snapUnix, ok := parseSnapshotTime(snapTable); ok {
if cacheRows, err := updateDeletionTimeInHourlyCache(ctx, dbConn, url, cand.vmID, vmUUID, name, snapUnix, delTs.Int64); err != nil {
log.Warn("failed to update hourly cache deletion time from event", "snapshot_time", snapUnix, "vm_id", cand.vmID, "vm_uuid", vmUUID, "vcenter", url, "error", err)
} else if cacheRows > 0 {
log.Debug("updated hourly cache deletion time from event", "snapshot_time", snapUnix, "vm_id", cand.vmID, "vm_uuid", vmUUID, "vcenter", url, "event_time", t)
}
}
if compatSnapshotUpdates {
if rowsAffected, err := updateDeletionTimeInSnapshot(ctx, dbConn, snapTable, url, cand.vmID, vmUUID, name, delTs.Int64); err != nil {
log.Warn("failed to update hourly snapshot deletion time from event", "table", snapTable, "vm_id", cand.vmID, "vm_uuid", vmUUID, "vcenter", url, "error", err)
} else if rowsAffected > 0 {
reportTables[snapTable] = struct{}{}
deletionsMarked = true
log.Debug("updated hourly snapshot deletion time from event", "table", snapTable, "vm_id", cand.vmID, "vm_uuid", vmUUID, "vcenter", url, "event_time", t)
}
}
}
@@ -1496,27 +1662,9 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
}
}
log.Debug("inserting hourly snapshot batch", "vcenter", url, "rows", len(batch))
if err := insertHourlyCache(ctx, dbConn, batch); err != nil {
log.Warn("failed to insert hourly cache rows", "vcenter", url, "error", err)
}
if err := insertHourlyBatch(ctx, dbConn, tableName, batch); err != nil {
metrics.RecordVcenterSnapshot(url, time.Since(started), totals.VmCount, err)
if upErr := db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, false, err.Error()); upErr != nil {
log.Warn("failed to record snapshot run", "url", url, "error", upErr)
}
return err
}
// Record per-vCenter totals snapshot.
if err := db.InsertVcenterTotals(ctx, dbConn, url, startTime, totals.VmCount, totals.VcpuTotal, totals.RamTotal); err != nil {
slog.Warn("failed to insert vcenter totals", "vcenter", url, "snapshot_time", startTime.Unix(), "error", err)
}
// Discover previous snapshots once per run (serial) to avoid concurrent probes across vCenters.
var prevTableTouched bool
prevTableName, newCount, missingCount, prevTableTouched = c.compareWithPreviousSnapshot(ctx, dbConn, url, startTime, presentSnapshots, presentByUuid, presentByName, inventoryByVmID, inventoryByUuid, inventoryByName, missingCount)
prevTableName, newCount, missingCount, prevTableTouched = c.compareWithPreviousSnapshot(ctx, dbConn, url, startTime, presentSnapshots, presentByUuid, presentByName, inventoryByVmID, inventoryByUuid, inventoryByName, missingCount, compatSnapshotUpdates)
if prevTableTouched && prevTableName != "" {
reportTables[prevTableName] = struct{}{}
deletionsMarked = true
@@ -1527,15 +1675,6 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
// Fallback: locate a previous table only if we didn't already find one.
if prevTableName == "" {
if prevTable, err := latestHourlySnapshotBefore(ctx, dbConn, startTime, loggerFromCtx(ctx, c.Logger)); err == nil && prevTable != "" {
moreMissing, tableUpdated := c.markMissingFromPrevious(ctx, dbConn, prevTable, url, startTime, presentSnapshots, presentByUuid, presentByName, inventoryByVmID, inventoryByUuid, inventoryByName)
if moreMissing > 0 {
missingCount += moreMissing
}
if tableUpdated {
reportTables[prevTable] = struct{}{}
deletionsMarked = true
}
// Reuse this table name for later snapshot lookups when correlating deletion events.
prevTableName = prevTable
}
}
@@ -1599,18 +1738,20 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
tableToUpdate = prevTableName
}
if tableToUpdate != "" {
if rowsAffected, err := updateDeletionTimeInSnapshot(ctx, dbConn, tableToUpdate, url, vmID, inv.VmUuid.String, inv.Name, delTs.Int64); err != nil {
c.Logger.Warn("count-drop: failed to update hourly snapshot deletion time from event", "table", tableToUpdate, "vm_id", vmID, "vcenter", url, "error", err)
} else if rowsAffected > 0 {
reportTables[tableToUpdate] = struct{}{}
deletionsMarked = true
c.Logger.Debug("count-drop: updated hourly snapshot deletion time from event", "table", tableToUpdate, "vm_id", vmID, "vm_uuid", inv.VmUuid.String, "vcenter", url, "event_time", t)
if snapUnix, ok := parseSnapshotTime(tableToUpdate); ok {
if cacheRows, err := updateDeletionTimeInHourlyCache(ctx, dbConn, url, vmID, inv.VmUuid.String, inv.Name, snapUnix, delTs.Int64); err != nil {
c.Logger.Warn("count-drop: failed to update hourly cache deletion time", "snapshot_time", snapUnix, "vm_id", vmID, "vm_uuid", inv.VmUuid.String, "vcenter", url, "error", err)
} else if cacheRows > 0 {
c.Logger.Debug("count-drop: updated hourly cache deletion time", "snapshot_time", snapUnix, "vm_id", vmID, "vm_uuid", inv.VmUuid.String, "vcenter", url, "event_time", t)
}
if snapUnix, ok := parseSnapshotTime(tableToUpdate); ok {
if cacheRows, err := updateDeletionTimeInHourlyCache(ctx, dbConn, url, vmID, inv.VmUuid.String, inv.Name, snapUnix, delTs.Int64); err != nil {
c.Logger.Warn("count-drop: failed to update hourly cache deletion time", "snapshot_time", snapUnix, "vm_id", vmID, "vm_uuid", inv.VmUuid.String, "vcenter", url, "error", err)
} else if cacheRows > 0 {
c.Logger.Debug("count-drop: updated hourly cache deletion time", "snapshot_time", snapUnix, "vm_id", vmID, "vm_uuid", inv.VmUuid.String, "vcenter", url, "event_time", t)
}
}
if compatSnapshotUpdates {
if rowsAffected, err := updateDeletionTimeInSnapshot(ctx, dbConn, tableToUpdate, url, vmID, inv.VmUuid.String, inv.Name, delTs.Int64); err != nil {
c.Logger.Warn("count-drop: failed to update hourly snapshot deletion time from event", "table", tableToUpdate, "vm_id", vmID, "vcenter", url, "error", err)
} else if rowsAffected > 0 {
reportTables[tableToUpdate] = struct{}{}
deletionsMarked = true
c.Logger.Debug("count-drop: updated hourly snapshot deletion time from event", "table", tableToUpdate, "vm_id", vmID, "vm_uuid", inv.VmUuid.String, "vcenter", url, "event_time", t)
}
}
}
@@ -1621,7 +1762,7 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
}
// Backfill lifecycle deletions for VMs missing from inventory and without DeletedAt.
if backfillTables, err := backfillLifecycleDeletionsToday(ctx, log, dbConn, url, startTime, presentSnapshots); err != nil {
if backfillTables, err := backfillLifecycleDeletionsToday(ctx, log, dbConn, url, startTime, presentSnapshots, compatSnapshotUpdates); err != nil {
log.Warn("failed to backfill lifecycle deletions for today", "vcenter", url, "error", err)
} else if len(backfillTables) > 0 {
for _, table := range backfillTables {
@@ -1629,6 +1770,14 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
}
deletionsMarked = true
}
log.Info(
"Hourly snapshot stage complete",
"stage", "reconcile",
"duration", time.Since(reconcileStageStart),
"missing_marked", missingCount,
"created_since_prev", newCount,
"tables_touched", len(reportTables),
)
log.Info("Hourly snapshot summary",
"vcenter", url,
@@ -1644,25 +1793,40 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
if upErr := db.UpsertSnapshotRun(ctx, c.Database.DB(), url, startTime, true, ""); upErr != nil {
log.Warn("failed to record snapshot run", "url", url, "error", upErr)
}
reportStageStart := time.Now()
queuedReports := 0
generatedReports := 0
if deletionsMarked {
if len(reportTables) == 0 {
if len(reportTables) == 0 && strings.TrimSpace(tableName) != "" {
reportTables[tableName] = struct{}{}
}
if deferredReportTables != nil {
for reportTable := range reportTables {
deferredReportTables.Store(reportTable, struct{}{})
queuedReports++
}
log.Debug("Queued hourly report regeneration after deletions", "tables", len(reportTables))
} else {
for reportTable := range reportTables {
if err := c.generateReport(ctx, reportTable); err != nil {
if err := c.generateReportWithPolicy(ctx, reportTable); err != nil {
log.Warn("failed to regenerate hourly report after deletions", "error", err, "table", reportTable)
} else {
generatedReports++
log.Debug("Regenerated hourly report after deletions", "table", reportTable)
}
}
}
}
log.Info(
"Hourly snapshot stage complete",
"stage", "report_generation",
"duration", time.Since(reportStageStart),
"deletions_marked", deletionsMarked,
"tables", len(reportTables),
"queued_tables", queuedReports,
"generated_tables", generatedReports,
"deferred", deferredReportTables != nil,
)
return nil
}
@@ -1680,6 +1844,7 @@ func (c *CronTask) compareWithPreviousSnapshot(
inventoryByUuid map[string]queries.Inventory,
inventoryByName map[string]queries.Inventory,
missingCount int,
updateCompatSnapshot bool,
) (string, int, int, bool) {
prevTableName, prevTableErr := latestHourlySnapshotBefore(ctx, dbConn, startTime, loggerFromCtx(ctx, c.Logger))
if prevTableErr != nil {
@@ -1691,7 +1856,7 @@ func (c *CronTask) compareWithPreviousSnapshot(
newCount := 0
prevTableTouched := false
if prevTableName != "" {
moreMissing, tableUpdated := c.markMissingFromPrevious(ctx, dbConn, prevTableName, url, startTime, presentSnapshots, presentByUuid, presentByName, inventoryByVmID, inventoryByUuid, inventoryByName)
moreMissing, tableUpdated := c.markMissingFromPrevious(ctx, dbConn, prevTableName, url, startTime, presentSnapshots, presentByUuid, presentByName, inventoryByVmID, inventoryByUuid, inventoryByName, updateCompatSnapshot)
missingCount += moreMissing
if tableUpdated {
prevTableTouched = true