From 6e6ed55bd385e21c55a3453414cdebb18ff1d129 Mon Sep 17 00:00:00 2001 From: Nathan Coad Date: Tue, 18 Jul 2023 10:23:49 +1000 Subject: [PATCH] better searching for correct host failure time --- main.go | 49 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/main.go b/main.go index ee86abc..65e9e05 100644 --- a/main.go +++ b/main.go @@ -291,31 +291,32 @@ func main() { //vm := getVM(event.Vm.Name) // Use VmDisconnectedEvent to see which host this VM was on - disconnectedEvents := getEvents([]string{"VmDisconnectedEvent"}, []types.ManagedObjectReference{vm.Reference()}, *begin, *end) - log.Printf("Retrieved '%d' VmDisconnectedEvent events.\n", len(disconnectedEvents)) + vmDisconnectedEvents := getEvents([]string{"VmDisconnectedEvent"}, []types.ManagedObjectReference{vm.Reference()}, *begin, *end) + log.Printf("Retrieved '%d' VmDisconnectedEvent events.\n", len(vmDisconnectedEvents)) // Determine which host the VM was previoulsy running on - if len(disconnectedEvents) > 0 { + if len(vmDisconnectedEvents) > 0 { // Sort the disconnected events by time - sort.Slice(disconnectedEvents[:], func(i, j int) bool { - return disconnectedEvents[i].CreatedTime.Before(disconnectedEvents[j].CreatedTime) + sort.Slice(vmDisconnectedEvents[:], func(i, j int) bool { + return vmDisconnectedEvents[i].CreatedTime.Before(vmDisconnectedEvents[j].CreatedTime) }) log.Printf("After sorting, VmDisconnectedEvent list looks like this:\n") - for _, h := range disconnectedEvents { + for _, h := range vmDisconnectedEvents { log.Printf("%d [%s] VM: %s, Host: %s, Message: %s\n", h.Key, h.CreatedTime.In(location).Format(time.ANSIC), h.Vm.Name, h.Host.Name, h.FullFormattedMessage) } - // Search for any disconnected messages prior to restart time + // What if there are multiple host HA event messages but only one VM disconnected message? + // Search for any disconnected messages prior to restart time log.Printf("Filtering VmDisconnectedEvent list based on fuzzy VM restart time %s\n", fuzzyTime) - for _, e := range disconnectedEvents { + for _, e := range vmDisconnectedEvents { if e.CreatedTime.In(location).Before(fuzzyTime) || e.CreatedTime.In(location).Equal(fuzzyTime) { log.Printf("VM disconnected event on host %s at time %s is applicable\n", e.Host.Name, e.CreatedTime.In(location)) possibleHosts = append(possibleHosts, e) } } - log.Printf("After filtering there are %d events\n", len(possibleHosts)) + log.Printf("After filtering VmDisconnectedEvent there are %d events\n", len(possibleHosts)) // Its possible that the VM disconnected messages dont' relate to the host HA events that we found // If that is the case then we fall back to the most recent host failure message in our list @@ -354,11 +355,31 @@ func main() { } if len(possibleHosts) == 1 { - log.Printf("Found a single host failure event relating to VM %s\n", event.Vm.Name) - log.Printf("Failed host was '%s', using outage start time of '%s'\n", possibleHosts[0].Host.Name, possibleHosts[0].CreatedTime.In(location)) - failedHost = possibleHosts[0].Host.Name - outageStart = possibleHosts[0].CreatedTime.In(location) - restartTime = vmRestartTime + log.Printf("Found a single host that failed relating to VM %s\n", event.Vm.Name) + + var checkActualTime []types.Event + + // Search the list of host failures to get the last host HA event before this VM was disconnected + for _, hostEvent := range hostFailures { + if hostEvent.Host.Name == possibleHosts[0].Host.Name { + if hostEvent.CreatedTime.In(location).Before(vmRestartTime) || hostEvent.CreatedTime.In(location).Equal(vmRestartTime) { + checkActualTime = append(checkActualTime, hostEvent) + } + } + } + + if len(checkActualTime) == 1 { + log.Printf("Found a single host failure event for our failed host that occurred before vm restart time\n") + failedHost = checkActualTime[0].Host.Name + outageStart = checkActualTime[0].CreatedTime.In(location) + restartTime = vmRestartTime + } else { + log.Printf("Found %d failure event(s) for our failed host that occurred before vm restart time\n", len(checkActualTime)) + log.Printf("Assuming failed host was '%s', with outage start time of '%s'\n", possibleHosts[0].Host.Name, possibleHosts[0].CreatedTime.In(location)) + failedHost = possibleHosts[0].Host.Name + outageStart = possibleHosts[0].CreatedTime.In(location) + restartTime = vmRestartTime + } } else if len(possibleHosts) > 1 { log.Printf("Found multiple host failure events relating to VM %s\n", event.Vm.Name)