add prometheus instrumentation
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
This commit is contained in:
125
internal/metrics/metrics.go
Normal file
125
internal/metrics/metrics.go
Normal file
@@ -0,0 +1,125 @@
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
)
|
||||
|
||||
var (
|
||||
registry = prometheus.NewRegistry()
|
||||
|
||||
HourlySnapshotTotal = prometheus.NewCounter(prometheus.CounterOpts{Name: "vctp_hourly_snapshots_total", Help: "Total number of hourly snapshot jobs completed."})
|
||||
HourlySnapshotFailures = prometheus.NewCounter(prometheus.CounterOpts{Name: "vctp_hourly_snapshots_failed_total", Help: "Hourly snapshot jobs that failed."})
|
||||
HourlySnapshotLast = prometheus.NewGauge(prometheus.GaugeOpts{Name: "vctp_hourly_snapshot_last_unix", Help: "Unix timestamp of the last hourly snapshot start time."})
|
||||
HourlySnapshotRows = prometheus.NewGauge(prometheus.GaugeOpts{Name: "vctp_hourly_snapshot_last_rows", Help: "Row count of the last hourly snapshot table."})
|
||||
|
||||
DailyAggregationsTotal = prometheus.NewCounter(prometheus.CounterOpts{Name: "vctp_daily_aggregations_total", Help: "Total number of daily aggregation jobs completed."})
|
||||
DailyAggregationFailures = prometheus.NewCounter(prometheus.CounterOpts{Name: "vctp_daily_aggregations_failed_total", Help: "Daily aggregation jobs that failed."})
|
||||
DailyAggregationDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
|
||||
Name: "vctp_daily_aggregation_duration_seconds",
|
||||
Help: "Duration of daily aggregation jobs.",
|
||||
Buckets: prometheus.ExponentialBuckets(1, 2, 10),
|
||||
})
|
||||
|
||||
MonthlyAggregationsTotal = prometheus.NewCounter(prometheus.CounterOpts{Name: "vctp_monthly_aggregations_total", Help: "Total number of monthly aggregation jobs completed."})
|
||||
MonthlyAggregationFailures = prometheus.NewCounter(prometheus.CounterOpts{Name: "vctp_monthly_aggregations_failed_total", Help: "Monthly aggregation jobs that failed."})
|
||||
MonthlyAggregationDuration = prometheus.NewHistogram(prometheus.HistogramOpts{
|
||||
Name: "vctp_monthly_aggregation_duration_seconds",
|
||||
Help: "Duration of monthly aggregation jobs.",
|
||||
Buckets: prometheus.ExponentialBuckets(1, 2, 10),
|
||||
})
|
||||
|
||||
ReportsAvailable = prometheus.NewGauge(prometheus.GaugeOpts{
|
||||
Name: "vctp_reports_available",
|
||||
Help: "Number of downloadable reports present on disk.",
|
||||
})
|
||||
|
||||
VcenterConnectFailures = prometheus.NewCounterVec(prometheus.CounterOpts{
|
||||
Name: "vctp_vcenter_connect_failures_total",
|
||||
Help: "Failed connections to vCenter during snapshot runs.",
|
||||
}, []string{"vcenter"})
|
||||
|
||||
VcenterSnapshotDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
|
||||
Name: "vctp_vcenter_snapshot_duration_seconds",
|
||||
Help: "Duration of per-vCenter hourly snapshot jobs.",
|
||||
Buckets: prometheus.ExponentialBuckets(0.5, 2, 10),
|
||||
}, []string{"vcenter"})
|
||||
|
||||
VcenterInventorySize = prometheus.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Name: "vctp_vcenter_inventory_size",
|
||||
Help: "Number of VMs seen in the last successful snapshot per vCenter.",
|
||||
}, []string{"vcenter"})
|
||||
)
|
||||
|
||||
func init() {
|
||||
registry.MustRegister(
|
||||
HourlySnapshotTotal,
|
||||
HourlySnapshotFailures,
|
||||
HourlySnapshotLast,
|
||||
HourlySnapshotRows,
|
||||
DailyAggregationsTotal,
|
||||
DailyAggregationFailures,
|
||||
DailyAggregationDuration,
|
||||
MonthlyAggregationsTotal,
|
||||
MonthlyAggregationFailures,
|
||||
MonthlyAggregationDuration,
|
||||
ReportsAvailable,
|
||||
VcenterConnectFailures,
|
||||
VcenterSnapshotDuration,
|
||||
VcenterInventorySize,
|
||||
)
|
||||
}
|
||||
|
||||
// Handler returns an http.Handler that serves Prometheus metrics.
|
||||
func Handler() http.Handler {
|
||||
return promhttp.HandlerFor(registry, promhttp.HandlerOpts{})
|
||||
}
|
||||
|
||||
// RecordVcenterSnapshot logs per-vCenter snapshot metrics.
|
||||
func RecordVcenterSnapshot(vcenter string, duration time.Duration, vmCount int64, err error) {
|
||||
VcenterSnapshotDuration.WithLabelValues(vcenter).Observe(duration.Seconds())
|
||||
if err != nil {
|
||||
VcenterConnectFailures.WithLabelValues(vcenter).Inc()
|
||||
return
|
||||
}
|
||||
VcenterInventorySize.WithLabelValues(vcenter).Set(float64(vmCount))
|
||||
}
|
||||
|
||||
// RecordHourlySnapshot logs aggregate hourly snapshot results.
|
||||
func RecordHourlySnapshot(start time.Time, rows int64, err error) {
|
||||
HourlySnapshotLast.Set(float64(start.Unix()))
|
||||
HourlySnapshotRows.Set(float64(rows))
|
||||
if err != nil {
|
||||
HourlySnapshotFailures.Inc()
|
||||
return
|
||||
}
|
||||
HourlySnapshotTotal.Inc()
|
||||
}
|
||||
|
||||
// RecordDailyAggregation logs daily aggregation metrics.
|
||||
func RecordDailyAggregation(duration time.Duration, err error) {
|
||||
DailyAggregationDuration.Observe(duration.Seconds())
|
||||
if err != nil {
|
||||
DailyAggregationFailures.Inc()
|
||||
return
|
||||
}
|
||||
DailyAggregationsTotal.Inc()
|
||||
}
|
||||
|
||||
// RecordMonthlyAggregation logs monthly aggregation metrics.
|
||||
func RecordMonthlyAggregation(duration time.Duration, err error) {
|
||||
MonthlyAggregationDuration.Observe(duration.Seconds())
|
||||
if err != nil {
|
||||
MonthlyAggregationFailures.Inc()
|
||||
return
|
||||
}
|
||||
MonthlyAggregationsTotal.Inc()
|
||||
}
|
||||
|
||||
// SetReportsAvailable updates the gauge for report files found on disk.
|
||||
func SetReportsAvailable(count int) {
|
||||
ReportsAvailable.Set(float64(count))
|
||||
}
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"log/slog"
|
||||
"time"
|
||||
"vctp/db"
|
||||
"vctp/internal/metrics"
|
||||
"vctp/internal/report"
|
||||
)
|
||||
|
||||
@@ -27,6 +28,7 @@ func (c *CronTask) AggregateDailySummary(ctx context.Context, date time.Time, fo
|
||||
}
|
||||
|
||||
func (c *CronTask) aggregateDailySummary(ctx context.Context, targetTime time.Time, force bool) error {
|
||||
jobStart := time.Now()
|
||||
dayStart := time.Date(targetTime.Year(), targetTime.Month(), targetTime.Day(), 0, 0, 0, 0, targetTime.Location())
|
||||
dayEnd := dayStart.AddDate(0, 0, 1)
|
||||
summaryTable, err := dailySummaryTableName(targetTime)
|
||||
@@ -133,9 +135,12 @@ func (c *CronTask) aggregateDailySummary(ctx context.Context, targetTime time.Ti
|
||||
|
||||
if err := c.generateReport(ctx, summaryTable); err != nil {
|
||||
c.Logger.Warn("failed to generate daily report", "error", err, "table", summaryTable)
|
||||
metrics.RecordDailyAggregation(time.Since(jobStart), err)
|
||||
return err
|
||||
}
|
||||
|
||||
c.Logger.Debug("Finished daily inventory aggregation", "summary_table", summaryTable)
|
||||
metrics.RecordDailyAggregation(time.Since(jobStart), nil)
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@@ -12,6 +12,7 @@ import (
|
||||
"time"
|
||||
"vctp/db"
|
||||
"vctp/db/queries"
|
||||
"vctp/internal/metrics"
|
||||
"vctp/internal/report"
|
||||
"vctp/internal/utils"
|
||||
"vctp/internal/vcenter"
|
||||
@@ -168,6 +169,7 @@ func (c *CronTask) RunVcenterSnapshotHourly(ctx context.Context, logger *slog.Lo
|
||||
c.Logger.Warn("failed to register hourly snapshot", "error", err, "table", tableName)
|
||||
}
|
||||
|
||||
metrics.RecordHourlySnapshot(startTime, rowCount, err)
|
||||
if err := c.generateReport(ctx, tableName); err != nil {
|
||||
c.Logger.Warn("failed to generate hourly report", "error", err, "table", tableName)
|
||||
}
|
||||
@@ -636,44 +638,6 @@ func snapshotFromInventory(inv queries.Inventory, snapshotTime time.Time) invent
|
||||
}
|
||||
}
|
||||
|
||||
func insertDailyInventoryRow(ctx context.Context, dbConn *sqlx.DB, tableName string, row inventorySnapshotRow) error {
|
||||
query := fmt.Sprintf(`
|
||||
INSERT INTO %s (
|
||||
"InventoryId", "Name", "Vcenter", "VmId", "EventKey", "CloudId", "CreationTime", "DeletionTime",
|
||||
"ResourcePool", "Datacenter", "Cluster", "Folder", "ProvisionedDisk", "VcpuCount",
|
||||
"RamGB", "IsTemplate", "PoweredOn", "SrmPlaceholder", "VmUuid", "SnapshotTime", "IsPresent"
|
||||
)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
|
||||
`, tableName)
|
||||
|
||||
query = sqlx.Rebind(sqlx.BindType(dbConn.DriverName()), query)
|
||||
|
||||
_, err := dbConn.ExecContext(ctx, query,
|
||||
row.InventoryId,
|
||||
row.Name,
|
||||
row.Vcenter,
|
||||
row.VmId,
|
||||
row.EventKey,
|
||||
row.CloudId,
|
||||
row.CreationTime,
|
||||
row.DeletionTime,
|
||||
row.ResourcePool,
|
||||
row.Datacenter,
|
||||
row.Cluster,
|
||||
row.Folder,
|
||||
row.ProvisionedDisk,
|
||||
row.VcpuCount,
|
||||
row.RamGB,
|
||||
row.IsTemplate,
|
||||
row.PoweredOn,
|
||||
row.SrmPlaceholder,
|
||||
row.VmUuid,
|
||||
row.SnapshotTime,
|
||||
row.IsPresent,
|
||||
)
|
||||
return err
|
||||
}
|
||||
|
||||
func insertHourlyBatch(ctx context.Context, dbConn *sqlx.DB, tableName string, rows []inventorySnapshotRow) error {
|
||||
if len(rows) == 0 {
|
||||
return nil
|
||||
@@ -727,9 +691,11 @@ INSERT INTO %s (
|
||||
}
|
||||
|
||||
func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTime time.Time, tableName string, url string) error {
|
||||
started := time.Now()
|
||||
c.Logger.Debug("connecting to vcenter for hourly snapshot", "url", url)
|
||||
vc := vcenter.New(c.Logger, c.VcCreds)
|
||||
if err := vc.Login(url); err != nil {
|
||||
metrics.RecordVcenterSnapshot(url, time.Since(started), 0, err)
|
||||
return fmt.Errorf("unable to connect to vcenter: %w", err)
|
||||
}
|
||||
defer func() {
|
||||
@@ -740,6 +706,7 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
|
||||
vcVms, err := vc.GetAllVMsWithProps()
|
||||
if err != nil {
|
||||
metrics.RecordVcenterSnapshot(url, time.Since(started), 0, err)
|
||||
return fmt.Errorf("unable to get VMs from vcenter: %w", err)
|
||||
}
|
||||
canDetectMissing := len(vcVms) > 0
|
||||
@@ -856,6 +823,7 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
}
|
||||
|
||||
if err := insertHourlyBatch(ctx, dbConn, tableName, batch); err != nil {
|
||||
metrics.RecordVcenterSnapshot(url, time.Since(started), totals.VmCount, err)
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -866,6 +834,7 @@ func (c *CronTask) captureHourlySnapshotForVcenter(ctx context.Context, startTim
|
||||
"ram_total_gb", totals.RamTotal,
|
||||
"disk_total_gb", totals.DiskTotal,
|
||||
)
|
||||
metrics.RecordVcenterSnapshot(url, time.Since(started), totals.VmCount, nil)
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"log/slog"
|
||||
"time"
|
||||
"vctp/db"
|
||||
"vctp/internal/metrics"
|
||||
"vctp/internal/report"
|
||||
)
|
||||
|
||||
@@ -29,6 +30,7 @@ func (c *CronTask) AggregateMonthlySummary(ctx context.Context, month time.Time,
|
||||
}
|
||||
|
||||
func (c *CronTask) aggregateMonthlySummary(ctx context.Context, targetMonth time.Time, force bool) error {
|
||||
jobStart := time.Now()
|
||||
if err := report.EnsureSnapshotRegistry(ctx, c.Database); err != nil {
|
||||
return err
|
||||
}
|
||||
@@ -107,9 +109,12 @@ func (c *CronTask) aggregateMonthlySummary(ctx context.Context, targetMonth time
|
||||
|
||||
if err := c.generateReport(ctx, monthlyTable); err != nil {
|
||||
c.Logger.Warn("failed to generate monthly report", "error", err, "table", monthlyTable)
|
||||
metrics.RecordMonthlyAggregation(time.Since(jobStart), err)
|
||||
return err
|
||||
}
|
||||
|
||||
c.Logger.Debug("Finished monthly inventory aggregation", "summary_table", monthlyTable)
|
||||
metrics.RecordMonthlyAggregation(time.Since(jobStart), nil)
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user