From 348dc474b724c335ad102284276da378481bd904 Mon Sep 17 00:00:00 2001 From: Nathan Coad Date: Tue, 18 Jul 2023 09:56:38 +1000 Subject: [PATCH 1/2] try narrowing down possible hosts by also checking actual VM restart time --- main.go | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/main.go b/main.go index 13a02be..ee86abc 100644 --- a/main.go +++ b/main.go @@ -362,13 +362,30 @@ func main() { } else if len(possibleHosts) > 1 { log.Printf("Found multiple host failure events relating to VM %s\n", event.Vm.Name) - // possible hosts is sorted by time, so use the last value if there are multiple - lastIndex := len(possibleHosts) - 1 + var checkActualTime []types.Event - log.Printf("Failed host was '%s', using outage start time of '%s'\n", possibleHosts[lastIndex].Host.Name, possibleHosts[lastIndex].CreatedTime.In(location)) - failedHost = possibleHosts[lastIndex].Host.Name - outageStart = possibleHosts[lastIndex].CreatedTime.In(location) - restartTime = vmRestartTime + // Search for any disconnected messages prior to actual restart time rather than fuzzy time + log.Printf("Checking host failure list based on actual VM restart time %s\n", vmRestartTime) + for _, hostEvent := range hostFailures { + if hostEvent.CreatedTime.In(location).Before(vmRestartTime) || hostEvent.CreatedTime.In(location).Equal(vmRestartTime) { + checkActualTime = append(checkActualTime, hostEvent) + } + } + + // if that search gives us exactly one result then use that + if len(checkActualTime) == 1 { + log.Printf("Found exactly one host corresponding to actual VM restart time. Failed host was '%s', using outage start time of '%s'\n", checkActualTime[0].Host.Name, checkActualTime[0].CreatedTime.In(location)) + failedHost = checkActualTime[0].Host.Name + outageStart = checkActualTime[0].CreatedTime.In(location) + restartTime = vmRestartTime + } else { + // if using the actual VM restart time doesn't narrow things down then go back to using the last host failure time before the fuzzy VM restart time + lastIndex := len(possibleHosts) - 1 + log.Printf("Failed host was '%s', using outage start time of '%s'\n", possibleHosts[lastIndex].Host.Name, possibleHosts[lastIndex].CreatedTime.In(location)) + failedHost = possibleHosts[lastIndex].Host.Name + outageStart = possibleHosts[lastIndex].CreatedTime.In(location) + restartTime = vmRestartTime + } } else { log.Printf("Didn't find any data to suggest which host this VM was running on before!\n") restartTime = vmRestartTime From 6e6ed55bd385e21c55a3453414cdebb18ff1d129 Mon Sep 17 00:00:00 2001 From: Nathan Coad Date: Tue, 18 Jul 2023 10:23:49 +1000 Subject: [PATCH 2/2] better searching for correct host failure time --- main.go | 49 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/main.go b/main.go index ee86abc..65e9e05 100644 --- a/main.go +++ b/main.go @@ -291,31 +291,32 @@ func main() { //vm := getVM(event.Vm.Name) // Use VmDisconnectedEvent to see which host this VM was on - disconnectedEvents := getEvents([]string{"VmDisconnectedEvent"}, []types.ManagedObjectReference{vm.Reference()}, *begin, *end) - log.Printf("Retrieved '%d' VmDisconnectedEvent events.\n", len(disconnectedEvents)) + vmDisconnectedEvents := getEvents([]string{"VmDisconnectedEvent"}, []types.ManagedObjectReference{vm.Reference()}, *begin, *end) + log.Printf("Retrieved '%d' VmDisconnectedEvent events.\n", len(vmDisconnectedEvents)) // Determine which host the VM was previoulsy running on - if len(disconnectedEvents) > 0 { + if len(vmDisconnectedEvents) > 0 { // Sort the disconnected events by time - sort.Slice(disconnectedEvents[:], func(i, j int) bool { - return disconnectedEvents[i].CreatedTime.Before(disconnectedEvents[j].CreatedTime) + sort.Slice(vmDisconnectedEvents[:], func(i, j int) bool { + return vmDisconnectedEvents[i].CreatedTime.Before(vmDisconnectedEvents[j].CreatedTime) }) log.Printf("After sorting, VmDisconnectedEvent list looks like this:\n") - for _, h := range disconnectedEvents { + for _, h := range vmDisconnectedEvents { log.Printf("%d [%s] VM: %s, Host: %s, Message: %s\n", h.Key, h.CreatedTime.In(location).Format(time.ANSIC), h.Vm.Name, h.Host.Name, h.FullFormattedMessage) } - // Search for any disconnected messages prior to restart time + // What if there are multiple host HA event messages but only one VM disconnected message? + // Search for any disconnected messages prior to restart time log.Printf("Filtering VmDisconnectedEvent list based on fuzzy VM restart time %s\n", fuzzyTime) - for _, e := range disconnectedEvents { + for _, e := range vmDisconnectedEvents { if e.CreatedTime.In(location).Before(fuzzyTime) || e.CreatedTime.In(location).Equal(fuzzyTime) { log.Printf("VM disconnected event on host %s at time %s is applicable\n", e.Host.Name, e.CreatedTime.In(location)) possibleHosts = append(possibleHosts, e) } } - log.Printf("After filtering there are %d events\n", len(possibleHosts)) + log.Printf("After filtering VmDisconnectedEvent there are %d events\n", len(possibleHosts)) // Its possible that the VM disconnected messages dont' relate to the host HA events that we found // If that is the case then we fall back to the most recent host failure message in our list @@ -354,11 +355,31 @@ func main() { } if len(possibleHosts) == 1 { - log.Printf("Found a single host failure event relating to VM %s\n", event.Vm.Name) - log.Printf("Failed host was '%s', using outage start time of '%s'\n", possibleHosts[0].Host.Name, possibleHosts[0].CreatedTime.In(location)) - failedHost = possibleHosts[0].Host.Name - outageStart = possibleHosts[0].CreatedTime.In(location) - restartTime = vmRestartTime + log.Printf("Found a single host that failed relating to VM %s\n", event.Vm.Name) + + var checkActualTime []types.Event + + // Search the list of host failures to get the last host HA event before this VM was disconnected + for _, hostEvent := range hostFailures { + if hostEvent.Host.Name == possibleHosts[0].Host.Name { + if hostEvent.CreatedTime.In(location).Before(vmRestartTime) || hostEvent.CreatedTime.In(location).Equal(vmRestartTime) { + checkActualTime = append(checkActualTime, hostEvent) + } + } + } + + if len(checkActualTime) == 1 { + log.Printf("Found a single host failure event for our failed host that occurred before vm restart time\n") + failedHost = checkActualTime[0].Host.Name + outageStart = checkActualTime[0].CreatedTime.In(location) + restartTime = vmRestartTime + } else { + log.Printf("Found %d failure event(s) for our failed host that occurred before vm restart time\n", len(checkActualTime)) + log.Printf("Assuming failed host was '%s', with outage start time of '%s'\n", possibleHosts[0].Host.Name, possibleHosts[0].CreatedTime.In(location)) + failedHost = possibleHosts[0].Host.Name + outageStart = possibleHosts[0].CreatedTime.In(location) + restartTime = vmRestartTime + } } else if len(possibleHosts) > 1 { log.Printf("Found multiple host failure events relating to VM %s\n", event.Vm.Name)