add prometheus instrumentation
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
2026-01-15 11:43:29 +11:00
parent 4d754ee263
commit ea68331208
9 changed files with 193 additions and 42 deletions

125
internal/metrics/metrics.go Normal file
View File

@@ -0,0 +1,125 @@
package metrics
import (
"net/http"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
// Package-level collectors for vctp job instrumentation. All of them are
// registered on the private registry below (see init) and served by Handler.
var (
	// registry is a dedicated Prometheus registry; the collectors in this
	// block are registered on it rather than on the library's default registry.
	registry = prometheus.NewRegistry()

	// HourlySnapshotTotal counts hourly snapshot jobs recorded without an error.
	HourlySnapshotTotal = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "vctp_hourly_snapshots_total",
		Help: "Total number of hourly snapshot jobs completed.",
	})
	// HourlySnapshotFailures counts hourly snapshot jobs recorded with an error.
	HourlySnapshotFailures = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "vctp_hourly_snapshots_failed_total",
		Help: "Hourly snapshot jobs that failed.",
	})
	// HourlySnapshotLast holds the start time, in Unix seconds, of the most
	// recently recorded hourly snapshot.
	HourlySnapshotLast = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "vctp_hourly_snapshot_last_unix",
		Help: "Unix timestamp of the last hourly snapshot start time.",
	})
	// HourlySnapshotRows holds the row count reported for the most recently
	// recorded hourly snapshot.
	HourlySnapshotRows = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "vctp_hourly_snapshot_last_rows",
		Help: "Row count of the last hourly snapshot table.",
	})

	// DailyAggregationsTotal counts daily aggregation jobs recorded without an error.
	DailyAggregationsTotal = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "vctp_daily_aggregations_total",
		Help: "Total number of daily aggregation jobs completed.",
	})
	// DailyAggregationFailures counts daily aggregation jobs recorded with an error.
	DailyAggregationFailures = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "vctp_daily_aggregations_failed_total",
		Help: "Daily aggregation jobs that failed.",
	})
	// DailyAggregationDuration observes daily aggregation run time in seconds.
	// Buckets double from 1s upward, ten buckets in total.
	DailyAggregationDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name:    "vctp_daily_aggregation_duration_seconds",
		Help:    "Duration of daily aggregation jobs.",
		Buckets: prometheus.ExponentialBuckets(1, 2, 10),
	})

	// MonthlyAggregationsTotal counts monthly aggregation jobs recorded without an error.
	MonthlyAggregationsTotal = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "vctp_monthly_aggregations_total",
		Help: "Total number of monthly aggregation jobs completed.",
	})
	// MonthlyAggregationFailures counts monthly aggregation jobs recorded with an error.
	MonthlyAggregationFailures = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "vctp_monthly_aggregations_failed_total",
		Help: "Monthly aggregation jobs that failed.",
	})
	// MonthlyAggregationDuration observes monthly aggregation run time in
	// seconds. Buckets double from 1s upward, ten buckets in total.
	MonthlyAggregationDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
		Name:    "vctp_monthly_aggregation_duration_seconds",
		Help:    "Duration of monthly aggregation jobs.",
		Buckets: prometheus.ExponentialBuckets(1, 2, 10),
	})

	// ReportsAvailable holds the number of report files last reported via
	// SetReportsAvailable.
	ReportsAvailable = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: "vctp_reports_available",
		Help: "Number of downloadable reports present on disk.",
	})

	// VcenterConnectFailures counts failed per-vCenter snapshot runs, labelled
	// by the "vcenter" value supplied to RecordVcenterSnapshot.
	VcenterConnectFailures = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Name: "vctp_vcenter_connect_failures_total",
			Help: "Failed connections to vCenter during snapshot runs.",
		},
		[]string{"vcenter"},
	)
	// VcenterSnapshotDuration observes per-vCenter snapshot run time in
	// seconds. Buckets double from 0.5s upward, ten buckets in total.
	VcenterSnapshotDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Name:    "vctp_vcenter_snapshot_duration_seconds",
			Help:    "Duration of per-vCenter hourly snapshot jobs.",
			Buckets: prometheus.ExponentialBuckets(0.5, 2, 10),
		},
		[]string{"vcenter"},
	)
	// VcenterInventorySize holds the VM count of the last per-vCenter snapshot
	// that was recorded without an error.
	VcenterInventorySize = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Name: "vctp_vcenter_inventory_size",
			Help: "Number of VMs seen in the last successful snapshot per vCenter.",
		},
		[]string{"vcenter"},
	)
)
func init() {
registry.MustRegister(
HourlySnapshotTotal,
HourlySnapshotFailures,
HourlySnapshotLast,
HourlySnapshotRows,
DailyAggregationsTotal,
DailyAggregationFailures,
DailyAggregationDuration,
MonthlyAggregationsTotal,
MonthlyAggregationFailures,
MonthlyAggregationDuration,
ReportsAvailable,
VcenterConnectFailures,
VcenterSnapshotDuration,
VcenterInventorySize,
)
}
// Handler returns an http.Handler that serves the metrics registered on this
// package's private registry, using default promhttp handler options.
func Handler() http.Handler {
	opts := promhttp.HandlerOpts{}
	return promhttp.HandlerFor(registry, opts)
}
// RecordVcenterSnapshot records the outcome of one per-vCenter snapshot run.
//
// The run's duration is always observed under the given vcenter label. When
// err is nil the inventory-size gauge for that label is set to vmCount.
// Otherwise the failure counter for that label is incremented and the gauge is
// left untouched. Note that any non-nil err is counted in
// VcenterConnectFailures, whichever stage of the run produced it.
func RecordVcenterSnapshot(vcenter string, duration time.Duration, vmCount int64, err error) {
	elapsed := duration.Seconds()
	VcenterSnapshotDuration.WithLabelValues(vcenter).Observe(elapsed)

	if err == nil {
		VcenterInventorySize.WithLabelValues(vcenter).Set(float64(vmCount))
		return
	}
	VcenterConnectFailures.WithLabelValues(vcenter).Inc()
}
// RecordHourlySnapshot records the result of an aggregate hourly snapshot job.
//
// The last-start timestamp (Unix seconds of start) and the last row count are
// updated on every call, including failed runs. Exactly one outcome counter is
// then incremented: the failure counter when err is non-nil, the completed
// counter otherwise.
func RecordHourlySnapshot(start time.Time, rows int64, err error) {
	startUnix := float64(start.Unix())
	HourlySnapshotLast.Set(startUnix)
	HourlySnapshotRows.Set(float64(rows))

	outcome := HourlySnapshotTotal
	if err != nil {
		outcome = HourlySnapshotFailures
	}
	outcome.Inc()
}
// RecordDailyAggregation records one daily aggregation run: its duration is
// always observed, then either the failure counter (err non-nil) or the
// completed counter (err nil) is incremented.
func RecordDailyAggregation(duration time.Duration, err error) {
	DailyAggregationDuration.Observe(duration.Seconds())

	switch {
	case err != nil:
		DailyAggregationFailures.Inc()
	default:
		DailyAggregationsTotal.Inc()
	}
}
// RecordMonthlyAggregation records one monthly aggregation run: its duration
// is always observed, then either the completed counter (err nil) or the
// failure counter (err non-nil) is incremented.
func RecordMonthlyAggregation(duration time.Duration, err error) {
	seconds := duration.Seconds()
	MonthlyAggregationDuration.Observe(seconds)

	if err == nil {
		MonthlyAggregationsTotal.Inc()
		return
	}
	MonthlyAggregationFailures.Inc()
}
// SetReportsAvailable sets the reports-available gauge to count, the number of
// report files the caller found on disk.
func SetReportsAvailable(count int) {
	available := float64(count)
	ReportsAvailable.Set(available)
}

View File

@@ -6,6 +6,7 @@ import (
"log/slog"
"time"
"vctp/db"
"vctp/internal/metrics"
"vctp/internal/report"
)
@@ -27,6 +28,7 @@ func (c *CronTask) AggregateDailySummary(ctx context.Context, date time.Time, fo
}
func (c *CronTask) aggregateDailySummary(ctx context.Context, targetTime time.Time, force bool) error {
jobStart := time.Now()
dayStart := time.Date(targetTime.Year(), targetTime.Month(), targetTime.Day(), 0, 0, 0, 0, targetTime.Location())
dayEnd := dayStart.AddDate(0, 0, 1)
summaryTable, err := dailySummaryTableName(targetTime)
@@ -133,9 +135,12 @@ func (c *CronTask) aggregateDailySummary(ctx context.Context, targetTime time.Ti
if err := c.generateReport(ctx, summaryTable); err != nil {
c.Logger.Warn("failed to generate daily report", "error", err, "table", summaryTable)
metrics.RecordDailyAggregation(time.Since(jobStart), err)
return err
}
c.Logger.Debug("Finished daily inventory aggregation", "summary_table", summaryTable)
metrics.RecordDailyAggregation(time.Since(jobStart), nil)
return nil
}

View File

@@ -12,6 +12,7 @@ import (
"time"
"vctp/db"
"vctp/db/queries"
"vctp/internal/metrics"
"vctp/internal/report"
"vctp/internal/utils"
"vctp/internal/vcenter"
@@ -168,6 +169,7 @@ func (c *CronTask) RunVcenterSnapshotHourly(ctx context.Context, logger *slog.Lo
c.Logger.Warn("failed to register hourly snapshot", "error", err, "table", tableName)
}
metrics.RecordHourlySnapshot(startTime, rowCount, err)
if err := c.generateReport(ctx, tableName); err != nil {
c.Logger.Warn("failed to generate hourly report", "error", err, "table", tableName)
}
@@ -636,44 +638,6 @@ func snapshotFromInventory(inv queries.Inventory, snapshotTime time.Time) invent
}
}
// insertDailyInventoryRow inserts a single inventory snapshot row into
// tableName and returns any error from executing the statement.
//
// tableName is interpolated directly into the SQL text rather than bound as a
// parameter, so it must be a trusted identifier. The row values are bound as
// placeholders, which are rebound to the style of dbConn's driver.
func insertDailyInventoryRow(ctx context.Context, dbConn *sqlx.DB, tableName string, row inventorySnapshotRow) error {
	const insertTemplate = `
INSERT INTO %s (
"InventoryId", "Name", "Vcenter", "VmId", "EventKey", "CloudId", "CreationTime", "DeletionTime",
"ResourcePool", "Datacenter", "Cluster", "Folder", "ProvisionedDisk", "VcpuCount",
"RamGB", "IsTemplate", "PoweredOn", "SrmPlaceholder", "VmUuid", "SnapshotTime", "IsPresent"
)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
`
	bindType := sqlx.BindType(dbConn.DriverName())
	stmt := sqlx.Rebind(bindType, fmt.Sprintf(insertTemplate, tableName))

	// Values are listed in the same order as the column list above.
	values := []interface{}{
		row.InventoryId,
		row.Name,
		row.Vcenter,
		row.VmId,
		row.EventKey,
		row.CloudId,
		row.CreationTime,
		row.DeletionTime,
		row.ResourcePool,
		row.Datacenter,
		row.Cluster,
		row.Folder,
		row.ProvisionedDisk,
		row.VcpuCount,
		row.RamGB,
		row.IsTemplate,
		row.PoweredOn,
		row.SrmPlaceholder,
		row.VmUuid,
		row.SnapshotTime,
		row.IsPresent,
	}

	if _, err := dbConn.ExecContext(ctx, stmt, values...); err != nil {
		return err
	}
	return nil
}
func insertHourlyBatch(ctx context.Context, dbConn *sqlx.DB, tableName string, rows []inventorySnapshotRow) error {
if len(rows) == 0 {
return nil
@@ -727,9 +691,11 @@ INSERT INTO %s (
}
func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTime time.Time, tableName string, url string) error {
started := time.Now()
c.Logger.Debug("connecting to vcenter for hourly snapshot", "url", url)
vc := vcenter.New(c.Logger, c.VcCreds)
if err := vc.Login(url); err != nil {
metrics.RecordVcenterSnapshot(url, time.Since(started), 0, err)
return fmt.Errorf("unable to connect to vcenter: %w", err)
}
defer func() {
@@ -740,6 +706,7 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
vcVms, err := vc.GetAllVMsWithProps()
if err != nil {
metrics.RecordVcenterSnapshot(url, time.Since(started), 0, err)
return fmt.Errorf("unable to get VMs from vcenter: %w", err)
}
canDetectMissing := len(vcVms) > 0
@@ -856,6 +823,7 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
}
if err := insertHourlyBatch(ctx, dbConn, tableName, batch); err != nil {
metrics.RecordVcenterSnapshot(url, time.Since(started), totals.VmCount, err)
return err
}
@@ -866,6 +834,7 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
"ram_total_gb", totals.RamTotal,
"disk_total_gb", totals.DiskTotal,
)
metrics.RecordVcenterSnapshot(url, time.Since(started), totals.VmCount, nil)
return nil
}

View File

@@ -6,6 +6,7 @@ import (
"log/slog"
"time"
"vctp/db"
"vctp/internal/metrics"
"vctp/internal/report"
)
@@ -29,6 +30,7 @@ func (c *CronTask) AggregateMonthlySummary(ctx context.Context, month time.Time,
}
func (c *CronTask) aggregateMonthlySummary(ctx context.Context, targetMonth time.Time, force bool) error {
jobStart := time.Now()
if err := report.EnsureSnapshotRegistry(ctx, c.Database); err != nil {
return err
}
@@ -107,9 +109,12 @@ func (c *CronTask) aggregateMonthlySummary(ctx context.Context, targetMonth time
if err := c.generateReport(ctx, monthlyTable); err != nil {
c.Logger.Warn("failed to generate monthly report", "error", err, "table", monthlyTable)
metrics.RecordMonthlyAggregation(time.Since(jobStart), err)
return err
}
c.Logger.Debug("Finished monthly inventory aggregation", "summary_table", monthlyTable)
metrics.RecordMonthlyAggregation(time.Since(jobStart), nil)
return nil
}