package main import ( "context" "encoding/json" "flag" "fmt" "log" "net/url" "os" "sort" "strings" "time" _ "time/tzdata" "github.com/vmware/govmomi" "github.com/vmware/govmomi/event" "github.com/vmware/govmomi/view" "github.com/vmware/govmomi/vim25/methods" "github.com/vmware/govmomi/vim25/mo" "github.com/vmware/govmomi/vim25/types" ) type OutageResults struct { VM string OutageDuration string OutageStart time.Time RestartTime time.Time Cluster string FailedHost string NewHost string GuestOS string CurrentPowerState string Description string } type HostFailureResults struct { HostName string FailureTime time.Time Cluster string Vcenter string } var ( c *govmomi.Client ctx context.Context cancel context.CancelFunc location *time.Location sha1ver string // sha1 revision used to build the program buildTime string // when the executable was built results []OutageResults hostResults []HostFailureResults haUnreachableEvents []types.Event ) // This function optionally filters events by a single MoRef, any additonal MoRefs are ignored // See https://golang.hotexamples.com/site/file?hash=0x7414a58898f22f77b81f05c04e2e2a2280e018ba9a71a2f4efb2bda7c55c35ac&fullName=govc/command.go&project=kristinn/govmomi // for an example of how to deal with multiple entities func getEvents(eventTypes []string, entities []types.ManagedObjectReference, begin time.Duration, end time.Duration) []types.Event { var returnEvents []types.Event // Refer https://github.com/vmware/govmomi/blob/main/examples/events/main.go now, err := methods.GetCurrentTime(ctx, c) // vCenter server time (UTC) if err != nil { fmt.Fprintf(os.Stderr, "Error getting vCenter time: %s\n", err) os.Exit(1) } //else { //log.Printf("vCenter time is '%v'\n", now) //} m := event.NewManager(c.Client) root := c.ServiceContent.RootFolder // configure the event stream filter (begin of stream) filter := types.EventFilterSpec{ EventTypeId: eventTypes, // only stream specific types, e.g. VmEvent Entity: &types.EventFilterSpecByEntity{ Entity: root, Recursion: types.EventFilterSpecRecursionOptionAll, }, Time: &types.EventFilterSpecByTime{ BeginTime: types.NewTime(now.Add(begin * -1)), }, } if end != 0 { filter.Time.EndTime = types.NewTime(now.Add(end * -1)) } if len(entities) == 0 { entities = append(entities, root) } for _, e := range entities { // Only log the entity we're filtering if it isn't the vcenter root if e != root { log.Printf("getEvents setting entity '%v' to filter\n", e) } filter.Entity = &types.EventFilterSpecByEntity{ Entity: e, Recursion: types.EventFilterSpecRecursionOptionAll, } collector, err := m.CreateCollectorForEvents(ctx, filter) if err != nil { fmt.Fprintf(os.Stderr, "Error creating event collector: %s\n", err) os.Exit(1) } defer collector.Destroy(ctx) for { events, err := collector.ReadNextEvents(ctx, 100) if err != nil { fmt.Fprintf(os.Stderr, "Error reading events: %s\n", err) } if len(events) == 0 { break } for i := range events { event := events[i].GetEvent() returnEvents = append(returnEvents, *event) //kind := reflect.TypeOf(events[i]).Elem().Name() //log.Printf("%d [%s] [%s] %s\n", event.Key, event.CreatedTime.In(location).Format(time.ANSIC), kind, event.FullFormattedMessage) log.Printf("%d [%s] %s\n", event.Key, event.CreatedTime.In(location).Format(time.ANSIC), event.FullFormattedMessage) } } } return returnEvents } func getCluster(name string) mo.ClusterComputeResource { // Create a container view so that we can search vCenter m := view.NewManager(c.Client) cv, _ := m.CreateContainerView(ctx, c.ServiceContent.RootFolder, []string{"ClusterComputeResource"}, true) var clusters []mo.ClusterComputeResource log.Printf("Searching for Cluster '%s'\n", name) err := cv.Retrieve(ctx, []string{"ClusterComputeResource"}, []string{"summary", "name"}, &clusters) if err != nil { log.Printf("Failed searching for Cluster %s : %s\n", name, err) return mo.ClusterComputeResource{} } else { for _, cluster := range clusters { if cluster.Name == name { log.Printf("Found corresponding Cluster with MoRef '%s'\n", cluster.Reference()) return cluster } } } // If we reached here then we didn't find the Cluster return mo.ClusterComputeResource{} } /* func getVM(name string) mo.VirtualMachine { // Create a container view so that we can search vCenter for a VM if we found any failure events m := view.NewManager(c.Client) cv, _ := m.CreateContainerView(ctx, c.ServiceContent.RootFolder, []string{"VirtualMachine"}, true) var vms []mo.VirtualMachine log.Printf("Searching for VM '%s'\n", name) err := cv.Retrieve(ctx, []string{"VirtualMachine"}, []string{"summary", "name"}, &vms) if err != nil { log.Printf("Failed searching for VM %s : %s\n", name, err) return mo.VirtualMachine{} } else { for _, vm := range vms { if vm.Name == name { log.Printf("Found corresponding VM with MoRef '%s'\n", vm.Reference()) return vm } } } // If we reached here then we didn't find a VM return mo.VirtualMachine{} } */ func getVmInCluster(name string, cluster types.ManagedObjectReference) mo.VirtualMachine { // Create a container view so that we can search vCenter for a VM if we found any failure events m := view.NewManager(c.Client) cv, _ := m.CreateContainerView(ctx, cluster, []string{"VirtualMachine"}, true) var vms []mo.VirtualMachine log.Printf("Searching for VM '%s'\n", name) err := cv.Retrieve(ctx, []string{"VirtualMachine"}, []string{"summary", "name"}, &vms) if err != nil { log.Printf("Failed searching for VM %s : %s\n", name, err) return mo.VirtualMachine{} } else { for _, vm := range vms { if vm.Name == name { log.Printf("Found corresponding VM with MoRef '%s'\n", vm.Reference()) return vm } } } // If we reached here then we didn't find a VM return mo.VirtualMachine{} } func main() { // Command line flags for the vCenter connection vURL := flag.String("url", "", "The URL of a vCenter server, eg https://server.domain.example/sdk") vUser := flag.String("user", "", "The username to use when connecting to vCenter") vPass := flag.String("password", "", "The password to use when connecting to vCenter") vTZ := flag.String("tz", "Australia/Sydney", "The timezone to use when converting vCenter UTC times") vInsecure := flag.Bool("insecure", true, "Allow insecure connections to vCenter") begin := flag.Duration("b", time.Hour, "Begin time") // default BeginTime is 1h ago end := flag.Duration("e", 0, "End time") fuzzyMinutes := flag.Int("fuzziness", 5, "Number of minutes to offset VM restart time when searching for related Host failure event") flag.Parse() // Print logs to file f, err := os.OpenFile("log.txt", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0666) if err != nil { log.Fatalf("error opening file: %v", err) } defer f.Close() log.SetOutput(f) log.Printf("Starting execution. Built on %s from sha1 %s\n", buildTime, sha1ver) // So we can convert vCenter UTC to our local timezone log.Printf("Setting timezone to '%s'\n", *vTZ) location, err = time.LoadLocation(*vTZ) if err != nil { fmt.Fprintf(os.Stderr, "Error setting timezone to %s : %s\n", *vTZ, err) os.Exit(1) } u, err := url.Parse(*vURL) if err != nil { fmt.Fprintf(os.Stderr, "Error parsing url %s : %s\n", *vURL, err) os.Exit(1) } else { if !strings.HasSuffix(u.Path, "/sdk") { u.Path, _ = url.JoinPath(u.Path, "/sdk") log.Printf("Updated vCenter URL to '%v'\n", u) } } log.Printf("Connecting to vCenter %s\n", u) u.User = url.UserPassword(*vUser, *vPass) ctx, cancel = context.WithCancel(context.Background()) defer cancel() // Login to vcenter c, err = govmomi.NewClient(ctx, u, *vInsecure) if err != nil { fmt.Fprintf(os.Stderr, "Logging in error: %s\n", err) os.Exit(1) } defer c.Logout(ctx) //finder := find.NewFinder(c.Client) log.Printf("Searching for hostfailure events\n") hostFailures := getEvents([]string{"com.vmware.vc.HA.DasHostFailedEvent"}, []types.ManagedObjectReference{}, *begin, *end) if len(hostFailures) > 0 { log.Printf("Found at least one host failure, proceeding with VM restart search\n") vmFailures := getEvents([]string{"com.vmware.vc.ha.VmRestartedByHAEvent"}, []types.ManagedObjectReference{}, *begin, *end) log.Printf("Searching for ha status change events\n") haStatusChanges := getEvents([]string{"com.vmware.vc.HA.HostStateChangedEvent"}, []types.ManagedObjectReference{}, *begin, *end) // filter ha status changed messages for unreachable ones for _, h := range haStatusChanges { unreachableMessage := strings.Contains(strings.ToLower(h.FullFormattedMessage), "changed to unreachable") if unreachableMessage { // make sure this host was in the host failures list hostFound := false for _, f := range hostFailures { if h.Host.Name == f.Host.Name { // got it hostFound = true break } } if hostFound { haUnreachableEvents = append(haUnreachableEvents, h) log.Printf("Host %s unreachable HA status event at %s : '%s'\n", h.Host.Name, h.CreatedTime.In(location).Format(time.ANSIC), h.FullFormattedMessage) } else { log.Printf("Host %s was not found in the list of hostfailure events, skipping this host\n", h.Host.Name) } } } // Sort the host failure events by time sort.Slice(hostFailures[:], func(i, j int) bool { return hostFailures[i].CreatedTime.Before(hostFailures[j].CreatedTime) }) for i := range vmFailures { var outageStart, restartTime time.Time var failedHost string var possibleHosts []types.Event event := vmFailures[i] vmRestartTime := event.CreatedTime.In(location) // Sometimes host HA events can come through a few minutes after a VM restart event, so create a "fuzzy" starting time to search for host HA events fuzzyTime := vmRestartTime.Add(time.Duration(int64(time.Minute) * int64(*fuzzyMinutes))) log.Printf("Failure event for VM '%s' restarted in cluster '%s'\n", event.Vm.Name, event.ComputeResource.Name) // Get a reference to the cluster mentioned cluster := getCluster((event.ComputeResource.Name)) vm := getVmInCluster(event.Vm.Name, cluster.Reference()) //vm := getVM(event.Vm.Name) // Use VmDisconnectedEvent to see which host this VM was on vmDisconnectedEvents := getEvents([]string{"VmDisconnectedEvent"}, []types.ManagedObjectReference{vm.Reference()}, *begin, *end) log.Printf("Retrieved '%d' VmDisconnectedEvent events.\n", len(vmDisconnectedEvents)) // Determine which host the VM was previoulsy running on if len(vmDisconnectedEvents) > 0 { // Sort the disconnected events by time sort.Slice(vmDisconnectedEvents[:], func(i, j int) bool { return vmDisconnectedEvents[i].CreatedTime.Before(vmDisconnectedEvents[j].CreatedTime) }) log.Printf("After sorting, VmDisconnectedEvent list looks like this:\n") for _, h := range vmDisconnectedEvents { log.Printf("%d [%s] VM: %s, Host: %s, Message: %s\n", h.Key, h.CreatedTime.In(location).Format(time.ANSIC), h.Vm.Name, h.Host.Name, h.FullFormattedMessage) } // What if there are multiple host HA event messages but only one VM disconnected message? // Search for any disconnected messages prior to restart time log.Printf("Filtering VmDisconnectedEvent list based on fuzzy VM restart time %s\n", fuzzyTime) for _, e := range vmDisconnectedEvents { if e.CreatedTime.In(location).Before(fuzzyTime) || e.CreatedTime.In(location).Equal(fuzzyTime) { log.Printf("VM disconnected event on host %s at time %s is applicable\n", e.Host.Name, e.CreatedTime.In(location)) possibleHosts = append(possibleHosts, e) } } log.Printf("After filtering VmDisconnectedEvent there are %d events\n", len(possibleHosts)) // Its possible that the VM disconnected messages dont' relate to the host HA events that we found // If that is the case then we fall back to the most recent host failure message in our list if len(possibleHosts) == 0 { log.Printf("No corresponding VM disconnected messages, falling back to any applicable host that experienced a HA event.\n") // Search for host failures for _, hostEvent := range hostFailures { if hostEvent.CreatedTime.In(location).Before(fuzzyTime) || hostEvent.CreatedTime.In(location).Equal(fuzzyTime) { possibleHosts = append(possibleHosts, hostEvent) } } log.Printf("Based on event times there were %d possible hosts this VM was running on\n", len(possibleHosts)) if len(possibleHosts) == 0 { log.Printf("No ESXi outage events happened before VM %s fuzzy restart event at %s, skipping this event.\n", event.Vm.Name, fuzzyTime) continue } } } else { // Didn't find any VM disconnected events log.Printf("could not determine previous host for this VM. Filtering all host failures for events prior to fuzzy VM restart time '%s'\n", fuzzyTime) // TODO Use HA unreachable events to find the host for _, hostEvent := range haUnreachableEvents { if hostEvent.CreatedTime.In(location).Before(fuzzyTime) || hostEvent.CreatedTime.In(location).Equal(fuzzyTime) { possibleHosts = append(possibleHosts, hostEvent) } } /* // Search for host failures for _, hostEvent := range hostFailures { if hostEvent.CreatedTime.In(location).Before(fuzzyTime) || hostEvent.CreatedTime.In(location).Equal(fuzzyTime) { possibleHosts = append(possibleHosts, hostEvent) } } */ log.Printf("Based on event times there were %d possible hosts this VM was running on\n", len(possibleHosts)) for _, hostEvent := range possibleHosts { log.Printf("Host %s\n", hostEvent.Host.Name) } if len(possibleHosts) == 0 { log.Printf("No ESXi outage events happened before VM %s fuzzy restart event at %s, skipping this event.\n", event.Vm.Name, fuzzyTime) continue } } if len(possibleHosts) == 1 { log.Printf("Found a single host that failed relating to VM %s\n", event.Vm.Name) var checkActualTime []types.Event // Search the list of host failures to get the last host HA event before this VM was disconnected for _, hostEvent := range hostFailures { if hostEvent.Host.Name == possibleHosts[0].Host.Name { if hostEvent.CreatedTime.In(location).Before(vmRestartTime) || hostEvent.CreatedTime.In(location).Equal(vmRestartTime) { checkActualTime = append(checkActualTime, hostEvent) } } } if len(checkActualTime) == 1 { log.Printf("Found a single host failure event for our failed host that occurred before vm restart time\n") failedHost = checkActualTime[0].Host.Name outageStart = checkActualTime[0].CreatedTime.In(location) restartTime = vmRestartTime } else { log.Printf("Found %d failure event(s) for our failed host that occurred before vm restart time\n", len(checkActualTime)) log.Printf("Assuming failed host was '%s', with outage start time of '%s'\n", possibleHosts[0].Host.Name, possibleHosts[0].CreatedTime.In(location)) failedHost = possibleHosts[0].Host.Name outageStart = possibleHosts[0].CreatedTime.In(location) restartTime = vmRestartTime } } else if len(possibleHosts) > 1 { log.Printf("Found multiple host failure events relating to VM %s\n", event.Vm.Name) var checkActualTime []types.Event // Search for any disconnected messages prior to actual restart time rather than fuzzy time log.Printf("Checking possible hosts list based on actual VM restart time %s\n", vmRestartTime) for _, hostEvent := range possibleHosts { if hostEvent.CreatedTime.In(location).Before(vmRestartTime) || hostEvent.CreatedTime.In(location).Equal(vmRestartTime) { checkActualTime = append(checkActualTime, hostEvent) } } // if that search gives us exactly one result then use that if len(checkActualTime) == 1 { log.Printf("Found exactly one host corresponding to actual VM restart time. Failed host was '%s', using outage start time of '%s'\n", checkActualTime[0].Host.Name, checkActualTime[0].CreatedTime.In(location)) failedHost = checkActualTime[0].Host.Name outageStart = checkActualTime[0].CreatedTime.In(location) restartTime = vmRestartTime } else { // if using the actual VM restart time doesn't narrow things down then go back to using the last host failure time before the fuzzy VM restart time lastIndex := len(possibleHosts) - 1 log.Printf("Failed host was '%s', using outage start time of '%s'\n", possibleHosts[lastIndex].Host.Name, possibleHosts[lastIndex].CreatedTime.In(location)) failedHost = possibleHosts[lastIndex].Host.Name outageStart = possibleHosts[lastIndex].CreatedTime.In(location) restartTime = vmRestartTime } } else { log.Printf("Didn't find any data to suggest which host this VM was running on before!\n") restartTime = vmRestartTime outageStart = vmRestartTime failedHost = "Unknown" } duration := restartTime.Sub(outageStart) out := time.Time{}.Add(duration) // Create a new result result := OutageResults{ VM: event.Vm.Name, OutageDuration: out.Format("15:04:05"), OutageStart: outageStart, RestartTime: restartTime, Cluster: event.ComputeResource.Name, FailedHost: failedHost, NewHost: event.Host.Name, GuestOS: vm.Summary.Guest.GuestFullName, CurrentPowerState: string(vm.Summary.Runtime.PowerState), Description: event.FullFormattedMessage, } // Append to list of all results results = append(results, result) } for _, hostEvent := range hostFailures { hostResults = append(hostResults, HostFailureResults{ HostName: hostEvent.Host.Name, FailureTime: hostEvent.CreatedTime.In(location), Cluster: hostEvent.ComputeResource.Name, Vcenter: u.Host, }) } } else { log.Printf("Found %d hostfailure messages.", len(hostFailures)) } // Combine details of host outages and VM outages into one interface var combined []interface{} for _, h := range hostResults { combined = append(combined, h) } for _, v := range results { combined = append(combined, v) } // Output final results in JSON if len(combined) > 0 { j, _ := json.Marshal(combined) fmt.Println(string(j)) } else { fmt.Println("{}") } }