Files
go-events/main.go
Nathan Coad 334b0b8ab6
All checks were successful
continuous-integration/drone/push Build is passing
improve logging
2024-03-18 11:17:47 +11:00

517 lines
19 KiB
Go

package main
import (
"context"
"encoding/json"
"flag"
"fmt"
"log"
"net/url"
"os"
"sort"
"strings"
"time"
_ "time/tzdata"
"github.com/vmware/govmomi"
"github.com/vmware/govmomi/event"
"github.com/vmware/govmomi/view"
"github.com/vmware/govmomi/vim25/methods"
"github.com/vmware/govmomi/vim25/mo"
"github.com/vmware/govmomi/vim25/types"
)
type OutageResults struct {
VM string
OutageDuration string
OutageStart time.Time
RestartTime time.Time
Cluster string
FailedHost string
NewHost string
GuestOS string
CurrentPowerState string
Description string
}
type HostFailureResults struct {
HostName string
FailureTime time.Time
Cluster string
Vcenter string
}
var (
c *govmomi.Client
ctx context.Context
cancel context.CancelFunc
location *time.Location
sha1ver string // sha1 revision used to build the program
buildTime string // when the executable was built
results []OutageResults
hostResults []HostFailureResults
haUnreachableEvents []types.Event
)
// This function optionally filters events by a single MoRef, any additonal MoRefs are ignored
// See https://golang.hotexamples.com/site/file?hash=0x7414a58898f22f77b81f05c04e2e2a2280e018ba9a71a2f4efb2bda7c55c35ac&fullName=govc/command.go&project=kristinn/govmomi
// for an example of how to deal with multiple entities
func getEvents(eventTypes []string, entities []types.ManagedObjectReference, begin time.Duration, end time.Duration) []types.Event {
var returnEvents []types.Event
// Refer https://github.com/vmware/govmomi/blob/main/examples/events/main.go
now, err := methods.GetCurrentTime(ctx, c) // vCenter server time (UTC)
if err != nil {
fmt.Fprintf(os.Stderr, "Error getting vCenter time: %s\n", err)
os.Exit(1)
} //else {
//log.Printf("vCenter time is '%v'\n", now)
//}
m := event.NewManager(c.Client)
root := c.ServiceContent.RootFolder
// configure the event stream filter (begin of stream)
filter := types.EventFilterSpec{
EventTypeId: eventTypes, // only stream specific types, e.g. VmEvent
Entity: &types.EventFilterSpecByEntity{
Entity: root,
Recursion: types.EventFilterSpecRecursionOptionAll,
},
Time: &types.EventFilterSpecByTime{
BeginTime: types.NewTime(now.Add(begin * -1)),
},
}
if end != 0 {
filter.Time.EndTime = types.NewTime(now.Add(end * -1))
}
if len(entities) == 0 {
entities = append(entities, root)
}
for _, e := range entities {
// Only log the entity we're filtering if it isn't the vcenter root
if e != root {
log.Printf("getEvents setting entity '%v' to filter\n", e)
}
filter.Entity = &types.EventFilterSpecByEntity{
Entity: e,
Recursion: types.EventFilterSpecRecursionOptionAll,
}
collector, err := m.CreateCollectorForEvents(ctx, filter)
if err != nil {
fmt.Fprintf(os.Stderr, "Error creating event collector: %s\n", err)
os.Exit(1)
}
defer collector.Destroy(ctx)
for {
events, err := collector.ReadNextEvents(ctx, 100)
if err != nil {
fmt.Fprintf(os.Stderr, "Error reading events: %s\n", err)
}
if len(events) == 0 {
break
}
for i := range events {
event := events[i].GetEvent()
returnEvents = append(returnEvents, *event)
//kind := reflect.TypeOf(events[i]).Elem().Name()
//log.Printf("%d [%s] [%s] %s\n", event.Key, event.CreatedTime.In(location).Format(time.ANSIC), kind, event.FullFormattedMessage)
log.Printf("%d [%s] %s\n", event.Key, event.CreatedTime.In(location).Format(time.ANSIC), event.FullFormattedMessage)
}
}
}
return returnEvents
}
func getCluster(name string) mo.ClusterComputeResource {
// Create a container view so that we can search vCenter
m := view.NewManager(c.Client)
cv, _ := m.CreateContainerView(ctx, c.ServiceContent.RootFolder, []string{"ClusterComputeResource"}, true)
var clusters []mo.ClusterComputeResource
log.Printf("Searching for Cluster '%s'\n", name)
err := cv.Retrieve(ctx, []string{"ClusterComputeResource"}, []string{"summary", "name"}, &clusters)
if err != nil {
log.Printf("Failed searching for Cluster %s : %s\n", name, err)
return mo.ClusterComputeResource{}
} else {
for _, cluster := range clusters {
if cluster.Name == name {
log.Printf("Found corresponding Cluster with MoRef '%s'\n", cluster.Reference())
return cluster
}
}
}
// If we reached here then we didn't find the Cluster
return mo.ClusterComputeResource{}
}
/*
func getVM(name string) mo.VirtualMachine {
// Create a container view so that we can search vCenter for a VM if we found any failure events
m := view.NewManager(c.Client)
cv, _ := m.CreateContainerView(ctx, c.ServiceContent.RootFolder, []string{"VirtualMachine"}, true)
var vms []mo.VirtualMachine
log.Printf("Searching for VM '%s'\n", name)
err := cv.Retrieve(ctx, []string{"VirtualMachine"}, []string{"summary", "name"}, &vms)
if err != nil {
log.Printf("Failed searching for VM %s : %s\n", name, err)
return mo.VirtualMachine{}
} else {
for _, vm := range vms {
if vm.Name == name {
log.Printf("Found corresponding VM with MoRef '%s'\n", vm.Reference())
return vm
}
}
}
// If we reached here then we didn't find a VM
return mo.VirtualMachine{}
}
*/
func getVmInCluster(name string, cluster types.ManagedObjectReference) mo.VirtualMachine {
// Create a container view so that we can search vCenter for a VM if we found any failure events
m := view.NewManager(c.Client)
cv, _ := m.CreateContainerView(ctx, cluster, []string{"VirtualMachine"}, true)
var vms []mo.VirtualMachine
log.Printf("Searching for VM '%s'\n", name)
err := cv.Retrieve(ctx, []string{"VirtualMachine"}, []string{"summary", "name"}, &vms)
if err != nil {
log.Printf("Failed searching for VM %s : %s\n", name, err)
return mo.VirtualMachine{}
} else {
for _, vm := range vms {
if vm.Name == name {
log.Printf("Found corresponding VM with MoRef '%s'\n", vm.Reference())
return vm
}
}
}
// If we reached here then we didn't find a VM
return mo.VirtualMachine{}
}
func main() {
// Command line flags for the vCenter connection
vURL := flag.String("url", "", "The URL of a vCenter server, eg https://server.domain.example/sdk")
vUser := flag.String("user", "", "The username to use when connecting to vCenter")
vPass := flag.String("password", "", "The password to use when connecting to vCenter")
vTZ := flag.String("tz", "Australia/Sydney", "The timezone to use when converting vCenter UTC times")
vInsecure := flag.Bool("insecure", true, "Allow insecure connections to vCenter")
begin := flag.Duration("b", time.Hour, "Begin time") // default BeginTime is 1h ago
end := flag.Duration("e", 0, "End time")
fuzzyMinutes := flag.Int("fuzziness", 5, "Number of minutes to offset VM restart time when searching for related Host failure event")
flag.Parse()
// Print logs to file
f, err := os.OpenFile("log.txt", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666)
if err != nil {
log.Fatalf("error opening file: %v", err)
}
defer f.Close()
log.SetOutput(f)
log.Printf("Starting execution. Built on %s from sha1 %s\n", buildTime, sha1ver)
// So we can convert vCenter UTC to our local timezone
log.Printf("Setting timezone to '%s'\n", *vTZ)
location, err = time.LoadLocation(*vTZ)
if err != nil {
fmt.Fprintf(os.Stderr, "Error setting timezone to %s : %s\n", *vTZ, err)
os.Exit(1)
}
u, err := url.Parse(*vURL)
if err != nil {
fmt.Fprintf(os.Stderr, "Error parsing url %s : %s\n", *vURL, err)
os.Exit(1)
} else {
if !strings.HasSuffix(u.Path, "/sdk") {
u.Path, _ = url.JoinPath(u.Path, "/sdk")
log.Printf("Updated vCenter URL to '%v'\n", u)
}
}
log.Printf("Connecting to vCenter %s\n", u)
u.User = url.UserPassword(*vUser, *vPass)
ctx, cancel = context.WithCancel(context.Background())
defer cancel()
// Login to vcenter
c, err = govmomi.NewClient(ctx, u, *vInsecure)
if err != nil {
fmt.Fprintf(os.Stderr, "Logging in error: %s\n", err)
os.Exit(1)
}
defer c.Logout(ctx)
//finder := find.NewFinder(c.Client)
log.Printf("Searching for hostfailure events\n")
hostFailures := getEvents([]string{"com.vmware.vc.HA.DasHostFailedEvent"}, []types.ManagedObjectReference{}, *begin, *end)
if len(hostFailures) > 0 {
log.Printf("Found at least one host failure, proceeding with VM restart search\n")
vmFailures := getEvents([]string{"com.vmware.vc.ha.VmRestartedByHAEvent"}, []types.ManagedObjectReference{}, *begin, *end)
log.Printf("Searching for ha status change events\n")
haStatusChanges := getEvents([]string{"com.vmware.vc.HA.HostStateChangedEvent"}, []types.ManagedObjectReference{}, *begin, *end)
// filter ha status changed messages for unreachable ones
for _, h := range haStatusChanges {
unreachableMessage := strings.Contains(strings.ToLower(h.FullFormattedMessage), "changed to unreachable")
if unreachableMessage {
// make sure this host was in the host failures list
hostFound := false
var haFailedTime time.Time
for _, f := range hostFailures {
if h.Host.Name == f.Host.Name {
// got it
hostFound = true
haFailedTime = f.CreatedTime.In(location)
break
}
}
if hostFound {
// make sure that this event is within 10 minutes of the corresponding host failed event
unreachableStartComparison := h.CreatedTime.In(location).Add(time.Duration(int64(time.Minute) * -10))
unreachableEndComparison := h.CreatedTime.In(location).Add(time.Duration(int64(time.Minute) * 10))
if haFailedTime.Before(unreachableEndComparison) && haFailedTime.After(unreachableStartComparison) {
haUnreachableEvents = append(haUnreachableEvents, h)
log.Printf("Recording host %s unreachable HA status event at %s\n", h.Host.Name, h.CreatedTime.In(location).Format(time.ANSIC))
} else {
log.Printf("Host %s HA failure time at %s was before %s or after %s\n", h.Host.Name, haFailedTime.Format(time.ANSIC),
unreachableStartComparison, unreachableEndComparison)
}
} else {
log.Printf("Host %s was not found in the list of hostfailure events, skipping this host\n", h.Host.Name)
}
}
}
// Sort the host failure events by time
sort.Slice(hostFailures[:], func(i, j int) bool {
return hostFailures[i].CreatedTime.Before(hostFailures[j].CreatedTime)
})
for i := range vmFailures {
var outageStart, restartTime time.Time
var failedHost string
var possibleHosts []types.Event
event := vmFailures[i]
vmRestartTime := event.CreatedTime.In(location)
// Sometimes host HA events can come through a few minutes after a VM restart event, so create a "fuzzy" starting time to search for host HA events
fuzzyTime := vmRestartTime.Add(time.Duration(int64(time.Minute) * int64(*fuzzyMinutes)))
log.Printf("Failure event for VM '%s' restarted in cluster '%s' at %s\n", event.Vm.Name, event.ComputeResource.Name, event.CreatedTime.In(location).Format(time.ANSIC))
// Get a reference to the cluster mentioned
cluster := getCluster((event.ComputeResource.Name))
vm := getVmInCluster(event.Vm.Name, cluster.Reference())
//vm := getVM(event.Vm.Name)
// Use VmDisconnectedEvent to see which host this VM was on
vmDisconnectedEvents := getEvents([]string{"VmDisconnectedEvent"}, []types.ManagedObjectReference{vm.Reference()}, *begin, *end)
log.Printf("Retrieved '%d' VmDisconnectedEvent events.\n", len(vmDisconnectedEvents))
// Determine which host the VM was previoulsy running on
if len(vmDisconnectedEvents) > 0 {
// Sort the disconnected events by time
sort.Slice(vmDisconnectedEvents[:], func(i, j int) bool {
return vmDisconnectedEvents[i].CreatedTime.Before(vmDisconnectedEvents[j].CreatedTime)
})
log.Printf("After sorting, VmDisconnectedEvent list looks like this:\n")
for _, h := range vmDisconnectedEvents {
log.Printf("%d [%s] VM: %s, Host: %s, Message: %s\n", h.Key, h.CreatedTime.In(location).Format(time.ANSIC), h.Vm.Name, h.Host.Name, h.FullFormattedMessage)
}
// What if there are multiple host HA event messages but only one VM disconnected message?
// Search for any disconnected messages prior to restart time
log.Printf("Filtering VmDisconnectedEvent list based on fuzzy VM restart time %s\n", fuzzyTime)
for _, e := range vmDisconnectedEvents {
if e.CreatedTime.In(location).Before(fuzzyTime) || e.CreatedTime.In(location).Equal(fuzzyTime) {
log.Printf("VM disconnected event on host %s at time %s is applicable\n", e.Host.Name, e.CreatedTime.In(location))
possibleHosts = append(possibleHosts, e)
}
}
log.Printf("After filtering VmDisconnectedEvent there are %d events\n", len(possibleHosts))
// Its possible that the VM disconnected messages dont' relate to the host HA events that we found
// If that is the case then we fall back to the most recent host failure message in our list
if len(possibleHosts) == 0 {
log.Printf("No corresponding VM disconnected messages, falling back to any applicable host that experienced a HA event.\n")
// Search for host failures
for _, hostEvent := range haUnreachableEvents {
if hostEvent.CreatedTime.In(location).Before(fuzzyTime) || hostEvent.CreatedTime.In(location).Equal(fuzzyTime) {
possibleHosts = append(possibleHosts, hostEvent)
}
}
log.Printf("Based on event times there were %d possible hosts this VM was running on\n", len(possibleHosts))
if len(possibleHosts) == 0 {
log.Printf("No ESXi outage events happened before VM %s fuzzy restart event at %s, skipping this event.\n", event.Vm.Name, fuzzyTime)
continue
}
}
} else { // Didn't find any VM disconnected events
log.Printf("could not determine previous host for this VM. Filtering all ha unreachable events prior to fuzzy VM restart time '%s'\n", fuzzyTime)
// TODO Use HA unreachable events to find the host
for _, hostEvent := range haUnreachableEvents {
if hostEvent.CreatedTime.In(location).Before(fuzzyTime) || hostEvent.CreatedTime.In(location).Equal(fuzzyTime) {
possibleHosts = append(possibleHosts, hostEvent)
}
}
/*
// Search for host failures
for _, hostEvent := range hostFailures {
if hostEvent.CreatedTime.In(location).Before(fuzzyTime) || hostEvent.CreatedTime.In(location).Equal(fuzzyTime) {
possibleHosts = append(possibleHosts, hostEvent)
}
}
*/
log.Printf("Based on event times there were %d possible hosts this VM was running on\n", len(possibleHosts))
for _, hostEvent := range possibleHosts {
log.Printf("Host %s (%s)\n", hostEvent.Host.Name, hostEvent.CreatedTime.In(location).Format(time.ANSIC))
}
if len(possibleHosts) == 0 {
log.Printf("No ESXi outage events happened before VM %s fuzzy restart event at %s, skipping this event.\n", event.Vm.Name, fuzzyTime)
continue
}
}
if len(possibleHosts) == 1 {
log.Printf("Found a single host that failed relating to VM %s\n", event.Vm.Name)
var checkActualTime []types.Event
// Search the list of host failures to get the last host HA event before this VM was disconnected
for _, hostEvent := range hostFailures {
if hostEvent.Host.Name == possibleHosts[0].Host.Name {
if hostEvent.CreatedTime.In(location).Before(vmRestartTime) || hostEvent.CreatedTime.In(location).Equal(vmRestartTime) {
checkActualTime = append(checkActualTime, hostEvent)
}
}
}
if len(checkActualTime) == 1 {
log.Printf("Found a single host failure event for our failed host that occurred before vm restart time\n")
failedHost = checkActualTime[0].Host.Name
outageStart = checkActualTime[0].CreatedTime.In(location)
restartTime = vmRestartTime
} else {
log.Printf("Found %d failure event(s) for our failed host that occurred before vm restart time\n", len(checkActualTime))
log.Printf("Assuming failed host was '%s', with outage start time of '%s'\n", possibleHosts[0].Host.Name, possibleHosts[0].CreatedTime.In(location))
failedHost = possibleHosts[0].Host.Name
outageStart = possibleHosts[0].CreatedTime.In(location)
restartTime = vmRestartTime
}
} else if len(possibleHosts) > 1 {
log.Printf("Found multiple host failure events relating to VM %s\n", event.Vm.Name)
var checkActualTime []types.Event
// Search for any disconnected messages prior to actual restart time rather than fuzzy time
log.Printf("Checking possible hosts list based on actual VM restart time %s\n", vmRestartTime)
for _, hostEvent := range possibleHosts {
if hostEvent.CreatedTime.In(location).Before(vmRestartTime) || hostEvent.CreatedTime.In(location).Equal(vmRestartTime) {
checkActualTime = append(checkActualTime, hostEvent)
}
}
// if that search gives us exactly one result then use that
if len(checkActualTime) == 1 {
log.Printf("Found exactly one host corresponding to actual VM restart time. Failed host was '%s', using outage start time of '%s'\n", checkActualTime[0].Host.Name, checkActualTime[0].CreatedTime.In(location))
failedHost = checkActualTime[0].Host.Name
outageStart = checkActualTime[0].CreatedTime.In(location)
restartTime = vmRestartTime
} else {
// if using the actual VM restart time doesn't narrow things down then go back to using the last host failure time before the fuzzy VM restart time
lastIndex := len(possibleHosts) - 1
log.Printf("Last failed host before restart time was '%s'. Using outage start time of '%s'\n", possibleHosts[lastIndex].Host.Name, possibleHosts[lastIndex].CreatedTime.In(location))
failedHost = possibleHosts[lastIndex].Host.Name
outageStart = possibleHosts[lastIndex].CreatedTime.In(location)
restartTime = vmRestartTime
}
} else {
log.Printf("Didn't find any data to suggest which host this VM was running on before!\n")
restartTime = vmRestartTime
outageStart = vmRestartTime
failedHost = "Unknown"
}
duration := restartTime.Sub(outageStart)
out := time.Time{}.Add(duration)
// Create a new result
result := OutageResults{
VM: event.Vm.Name,
OutageDuration: out.Format("15:04:05"),
OutageStart: outageStart,
RestartTime: restartTime,
Cluster: event.ComputeResource.Name,
FailedHost: failedHost,
NewHost: event.Host.Name,
GuestOS: vm.Summary.Guest.GuestFullName,
CurrentPowerState: string(vm.Summary.Runtime.PowerState),
Description: event.FullFormattedMessage,
}
// Append to list of all results
results = append(results, result)
}
for _, hostEvent := range hostFailures {
hostResults = append(hostResults, HostFailureResults{
HostName: hostEvent.Host.Name,
FailureTime: hostEvent.CreatedTime.In(location),
Cluster: hostEvent.ComputeResource.Name,
Vcenter: u.Host,
})
}
} else {
log.Printf("Found %d hostfailure messages.", len(hostFailures))
}
// Combine details of host outages and VM outages into one interface
var combined []interface{}
for _, h := range hostResults {
combined = append(combined, h)
}
for _, v := range results {
combined = append(combined, v)
}
// Output final results in JSON
if len(combined) > 0 {
j, _ := json.Marshal(combined)
fmt.Println(string(j))
} else {
fmt.Println("{}")
}
}