use HA unreachable events when trying to find a failure

This commit is contained in:
2024-03-18 10:05:44 +11:00
parent f0c9789819
commit bdce428803

42
main.go
View File

@@ -42,14 +42,15 @@ type HostFailureResults struct {
} }
var ( var (
c *govmomi.Client c *govmomi.Client
ctx context.Context ctx context.Context
cancel context.CancelFunc cancel context.CancelFunc
location *time.Location location *time.Location
sha1ver string // sha1 revision used to build the program sha1ver string // sha1 revision used to build the program
buildTime string // when the executable was built buildTime string // when the executable was built
results []OutageResults results []OutageResults
hostResults []HostFailureResults hostResults []HostFailureResults
haUnreachableEvents []types.Event
) )
// This function optionally filters events by a single MoRef, any additional MoRefs are ignored // This function optionally filters events by a single MoRef, any additional MoRefs are ignored
@@ -269,6 +270,18 @@ func main() {
log.Printf("Found at least one host failure, proceeding with VM restart search\n") log.Printf("Found at least one host failure, proceeding with VM restart search\n")
vmFailures := getEvents([]string{"com.vmware.vc.ha.VmRestartedByHAEvent"}, []types.ManagedObjectReference{}, *begin, *end) vmFailures := getEvents([]string{"com.vmware.vc.ha.VmRestartedByHAEvent"}, []types.ManagedObjectReference{}, *begin, *end)
log.Printf("Searching for ha status change events\n")
haStatusChanges := getEvents([]string{"com.vmware.vc.HA.HostStateChangedEvent"}, []types.ManagedObjectReference{}, *begin, *end)
// filter ha status changed messages for unreachable ones
for _, h := range haStatusChanges {
unreachableMessage := strings.Contains(strings.ToLower(h.FullFormattedMessage), "changed to unreachable")
if unreachableMessage {
haUnreachableEvents = append(haUnreachableEvents, h)
log.Printf("Host %s unreachable HA status event at %s : '%s'\n", h.Host.Name, h.CreatedTime.In(location).Format(time.ANSIC), h.FullFormattedMessage)
}
}
// Sort the host failure events by time // Sort the host failure events by time
sort.Slice(hostFailures[:], func(i, j int) bool { sort.Slice(hostFailures[:], func(i, j int) bool {
return hostFailures[i].CreatedTime.Before(hostFailures[j].CreatedTime) return hostFailures[i].CreatedTime.Before(hostFailures[j].CreatedTime)
@@ -338,13 +351,20 @@ func main() {
} else { // Didn't find any VM disconnected events } else { // Didn't find any VM disconnected events
log.Printf("could not determine previous host for this VM. Filtering all host failures for events prior to fuzzy VM restart time '%s'\n", fuzzyTime) log.Printf("could not determine previous host for this VM. Filtering all host failures for events prior to fuzzy VM restart time '%s'\n", fuzzyTime)
// Search for host failures // TODO Use HA unreachable events to find the host
for _, hostEvent := range hostFailures { for _, hostEvent := range haUnreachableEvents {
if hostEvent.CreatedTime.In(location).Before(fuzzyTime) || hostEvent.CreatedTime.In(location).Equal(fuzzyTime) { if hostEvent.CreatedTime.In(location).Before(fuzzyTime) || hostEvent.CreatedTime.In(location).Equal(fuzzyTime) {
possibleHosts = append(possibleHosts, hostEvent) possibleHosts = append(possibleHosts, hostEvent)
} }
} }
/*
// Search for host failures
for _, hostEvent := range hostFailures {
if hostEvent.CreatedTime.In(location).Before(fuzzyTime) || hostEvent.CreatedTime.In(location).Equal(fuzzyTime) {
possibleHosts = append(possibleHosts, hostEvent)
}
}
*/
log.Printf("Based on event times there were %d possible hosts this VM was running on\n", len(possibleHosts)) log.Printf("Based on event times there were %d possible hosts this VM was running on\n", len(possibleHosts))
if len(possibleHosts) == 0 { if len(possibleHosts) == 0 {