Compare commits
31 Commits
29eab3df05
...
main
Author | SHA1 | Date | |
---|---|---|---|
ed0e5947ea | |||
a9aad6643d | |||
f86aefdca6 | |||
a1e4649455 | |||
a615602bf8 | |||
8dd6146818 | |||
66543b15b6 | |||
508ddd73f7 | |||
fb433b9ef2 | |||
71c397e5fe | |||
4ea27bf071 | |||
ae5ce907c4 | |||
9e20f8b9a2 | |||
e47d660419 | |||
47c61f0417 | |||
1098910135 | |||
f059efc49f | |||
b9a53b240a | |||
9fbe579f43 | |||
919ffd52cf | |||
106eb7d1bb | |||
da8742ea64 | |||
2f80601a40 | |||
c2df4ea3af | |||
e355f5f6bc | |||
334b0b8ab6 | |||
f20712beb0 | |||
328b027fdc | |||
82fb21c710 | |||
fd23121a7c | |||
6aa7627c96 |
47
.drone.yml
47
.drone.yml
@@ -11,18 +11,35 @@ steps:
|
||||
commands:
|
||||
- sh ./.drone.sh
|
||||
|
||||
- name: dell-deploy
|
||||
# # https://github.com/cschlosser/drone-ftps/blob/master/README.md
|
||||
image: cschlosser/drone-ftps
|
||||
environment:
|
||||
FTP_USERNAME:
|
||||
from_secret: FTP_USERNAME
|
||||
FTP_PASSWORD:
|
||||
from_secret: FTP_PASSWORD
|
||||
PLUGIN_HOSTNAME: ftp.emc.com:21
|
||||
PLUGIN_SECURE: false
|
||||
PLUGIN_VERIFY: false
|
||||
PLUGIN_CHMOD: false
|
||||
#PLUGIN_DEBUG: false
|
||||
PLUGIN_INCLUDE: ^events$,^events_checksum.txt$
|
||||
PLUGIN_EXCLUDE: ^\.git/$
|
||||
#- name: dell-deploy
|
||||
## # https://github.com/cschlosser/drone-ftps/blob/master/README.md
|
||||
# image: cschlosser/drone-ftps
|
||||
# environment:
|
||||
# FTP_USERNAME:
|
||||
# from_secret: FTP_USERNAME
|
||||
# FTP_PASSWORD:
|
||||
# from_secret: FTP_PASSWORD
|
||||
# PLUGIN_HOSTNAME: ftp.emc.com:21
|
||||
# PLUGIN_SECURE: false
|
||||
# PLUGIN_VERIFY: false
|
||||
# PLUGIN_CHMOD: false
|
||||
# #PLUGIN_DEBUG: false
|
||||
# PLUGIN_INCLUDE: ^events$,^events_checksum.txt$
|
||||
# PLUGIN_EXCLUDE: ^\.git/$
|
||||
|
||||
# https://github.com/hypervtechnics/drone-sftp
|
||||
- name: dell-sftp-deploy
|
||||
image: hypervtechnics/drone-sftp
|
||||
settings:
|
||||
host: deft.dell.com
|
||||
username:
|
||||
from_secret: DELLFTP_USER
|
||||
password:
|
||||
from_secret: DELLFTP_PASS
|
||||
port: 22
|
||||
source: ./
|
||||
filter: events*
|
||||
clean: false
|
||||
target: /
|
||||
overwrite: true
|
||||
verbose: true
|
4
go.mod
4
go.mod
@@ -1,5 +1,5 @@
|
||||
module nathan/go-events
|
||||
|
||||
go 1.19
|
||||
go 1.24.2
|
||||
|
||||
require github.com/vmware/govmomi v0.30.4
|
||||
require github.com/vmware/govmomi v0.50.0
|
||||
|
5
go.sum
5
go.sum
@@ -1,3 +1,8 @@
|
||||
github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I=
|
||||
github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/vmware/govmomi v0.30.4 h1:BCKLoTmiBYRuplv3GxKEMBLtBaJm8PA56vo9bddIpYQ=
|
||||
github.com/vmware/govmomi v0.30.4/go.mod h1:F7adsVewLNHsW/IIm7ziFURaXDaHEwcc+ym4r3INMdY=
|
||||
github.com/vmware/govmomi v0.43.0 h1:7Kg3Bkdly+TrE67BYXzRq7ZrDnn7xqpKX95uEh2f9Go=
|
||||
github.com/vmware/govmomi v0.43.0/go.mod h1:IOv5nTXCPqH9qVJAlRuAGffogaLsNs8aF+e7vLgsHJU=
|
||||
github.com/vmware/govmomi v0.50.0 h1:vFOnUCBCX3m3MgTKfBp68Pz5gsHvKkO07Y2wCGYYQOM=
|
||||
github.com/vmware/govmomi v0.50.0/go.mod h1:Z5uo7z0kRhVV00E4gfbUGwUaXIKTgqngsT+t/mIDpcI=
|
||||
|
145
main.go
145
main.go
@@ -91,14 +91,14 @@ func getEvents(eventTypes []string, entities []types.ManagedObjectReference, beg
|
||||
}
|
||||
|
||||
for _, e := range entities {
|
||||
// Only log the entity we're filtering if it isn't the vcenter root
|
||||
if e != root {
|
||||
if e == root {
|
||||
log.Printf("getEvents leaving event filter spec at root\n")
|
||||
} else { // Only log the entity we're filtering if it isn't the vcenter root
|
||||
log.Printf("getEvents setting entity '%v' to filter\n", e)
|
||||
}
|
||||
|
||||
filter.Entity = &types.EventFilterSpecByEntity{
|
||||
Entity: e,
|
||||
Recursion: types.EventFilterSpecRecursionOptionAll,
|
||||
filter.Entity = &types.EventFilterSpecByEntity{
|
||||
Entity: e,
|
||||
Recursion: types.EventFilterSpecRecursionOptionAll,
|
||||
}
|
||||
}
|
||||
|
||||
collector, err := m.CreateCollectorForEvents(ctx, filter)
|
||||
@@ -182,28 +182,28 @@ func getCluster(name string) mo.ClusterComputeResource {
|
||||
return mo.VirtualMachine{}
|
||||
}
|
||||
*/
|
||||
func getVmInCluster(name string, cluster types.ManagedObjectReference) mo.VirtualMachine {
|
||||
func getVmInCluster(name string, cluster types.ManagedObjectReference) (mo.VirtualMachine, error) {
|
||||
// Create a container view so that we can search vCenter for a VM if we found any failure events
|
||||
m := view.NewManager(c.Client)
|
||||
cv, _ := m.CreateContainerView(ctx, cluster, []string{"VirtualMachine"}, true)
|
||||
|
||||
var vms []mo.VirtualMachine
|
||||
log.Printf("Searching for VM '%s'\n", name)
|
||||
log.Printf("Searching for VM '%s' in cluster '%v'\n", name, cluster.Reference().Value)
|
||||
err := cv.Retrieve(ctx, []string{"VirtualMachine"}, []string{"summary", "name"}, &vms)
|
||||
if err != nil {
|
||||
log.Printf("Failed searching for VM %s : %s\n", name, err)
|
||||
return mo.VirtualMachine{}
|
||||
return mo.VirtualMachine{}, fmt.Errorf("error searching for VM %s : %s", name, err)
|
||||
} else {
|
||||
for _, vm := range vms {
|
||||
if vm.Name == name {
|
||||
log.Printf("Found corresponding VM with MoRef '%s'\n", vm.Reference())
|
||||
return vm
|
||||
log.Printf("Found corresponding VM with MoRef '%s'", vm.Reference())
|
||||
return vm, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we reached here then we didn't find a VM
|
||||
return mo.VirtualMachine{}
|
||||
return mo.VirtualMachine{}, fmt.Errorf("no VM found with name %s", name)
|
||||
}
|
||||
|
||||
func main() {
|
||||
@@ -216,6 +216,7 @@ func main() {
|
||||
begin := flag.Duration("b", time.Hour, "Begin time") // default BeginTime is 1h ago
|
||||
end := flag.Duration("e", 0, "End time")
|
||||
fuzzyMinutes := flag.Int("fuzziness", 5, "Number of minutes to offset VM restart time when searching for related Host failure event")
|
||||
unreachableMinutes := flag.Int("unreachable", 20, "Number of minutes to search for host HA events either side of a VM failure")
|
||||
flag.Parse()
|
||||
|
||||
// Print logs to file
|
||||
@@ -272,13 +273,41 @@ func main() {
|
||||
|
||||
log.Printf("Searching for ha status change events\n")
|
||||
haStatusChanges := getEvents([]string{"com.vmware.vc.HA.HostStateChangedEvent"}, []types.ManagedObjectReference{}, *begin, *end)
|
||||
log.Printf("Found %d ha status change events\n", len(haStatusChanges))
|
||||
|
||||
// filter ha status changed messages for unreachable ones
|
||||
for _, h := range haStatusChanges {
|
||||
unreachableMessage := strings.Contains(strings.ToLower(h.FullFormattedMessage), "changed to unreachable")
|
||||
if unreachableMessage {
|
||||
haUnreachableEvents = append(haUnreachableEvents, h)
|
||||
log.Printf("Host %s unreachable HA status event at %s : '%s'\n", h.Host.Name, h.CreatedTime.In(location).Format(time.ANSIC), h.FullFormattedMessage)
|
||||
hostFailedMessage := strings.Contains(strings.ToLower(h.FullFormattedMessage), "changed to host failed")
|
||||
if unreachableMessage || hostFailedMessage {
|
||||
|
||||
// make sure this host was in the host failures list
|
||||
hostFound := false
|
||||
var haFailedTime time.Time
|
||||
for _, f := range hostFailures {
|
||||
if h.Host.Name == f.Host.Name {
|
||||
// got it
|
||||
hostFound = true
|
||||
haFailedTime = f.CreatedTime.In(location)
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// make sure that this event is within 10 minutes either side of the corresponding host failed event
|
||||
if hostFound {
|
||||
unreachableStartComparison := h.CreatedTime.In(location).Add(time.Duration(int64(time.Minute) * -1 * int64(*unreachableMinutes)))
|
||||
unreachableEndComparison := h.CreatedTime.In(location).Add(time.Duration(int64(time.Minute) * int64(*unreachableMinutes)))
|
||||
|
||||
if haFailedTime.Before(unreachableEndComparison) && haFailedTime.After(unreachableStartComparison) {
|
||||
haUnreachableEvents = append(haUnreachableEvents, h)
|
||||
log.Printf("Keeping host %s unreachable HA status event at %s\n", h.Host.Name, h.CreatedTime.In(location).Format(time.ANSIC))
|
||||
} else {
|
||||
log.Printf("Excluding HA Unreachable for Host %s at time %s since it was before %s or after %s\n", h.Host.Name, haFailedTime.Format(time.ANSIC),
|
||||
unreachableEndComparison, unreachableStartComparison)
|
||||
}
|
||||
} else {
|
||||
log.Printf("Host %s was not found in the list of hostfailure events, skipping this host\n", h.Host.Name)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -287,25 +316,64 @@ func main() {
|
||||
return hostFailures[i].CreatedTime.Before(hostFailures[j].CreatedTime)
|
||||
})
|
||||
|
||||
log.Printf("Searching for all vm disconnected events\n")
|
||||
allVmDisconnectedEvents := getEvents([]string{"VmDisconnectedEvent"}, []types.ManagedObjectReference{}, *begin, *end)
|
||||
log.Printf("Retrieved '%d' VmDisconnectedEvent events from '%s' to '%s'.\n", len(allVmDisconnectedEvents), begin.String(), end.String())
|
||||
if len(allVmDisconnectedEvents) > 0 {
|
||||
// Sort the disconnected events by time
|
||||
sort.Slice(allVmDisconnectedEvents[:], func(i, j int) bool {
|
||||
return allVmDisconnectedEvents[i].CreatedTime.Before(allVmDisconnectedEvents[j].CreatedTime)
|
||||
})
|
||||
}
|
||||
|
||||
for i := range vmFailures {
|
||||
var vm mo.VirtualMachine
|
||||
var outageStart, restartTime time.Time
|
||||
var failedHost string
|
||||
var possibleHosts []types.Event
|
||||
var vmDisconnectedEvents []types.Event
|
||||
var vmFound bool
|
||||
var vmOS string
|
||||
var vmPowerState string
|
||||
event := vmFailures[i]
|
||||
vmRestartTime := event.CreatedTime.In(location)
|
||||
|
||||
// Sometimes host HA events can come through a few minutes after a VM restart event, so create a "fuzzy" starting time to search for host HA events
|
||||
fuzzyTime := vmRestartTime.Add(time.Duration(int64(time.Minute) * int64(*fuzzyMinutes)))
|
||||
log.Printf("Failure event for VM '%s' restarted in cluster '%s'\n", event.Vm.Name, event.ComputeResource.Name)
|
||||
|
||||
// Get a reference to the cluster mentioned
|
||||
if event.Vm == nil {
|
||||
log.Printf("Can't read this event properly, skipping\n%v\n", event)
|
||||
continue
|
||||
}
|
||||
|
||||
log.Printf("Failure event for VM '%s' restarted in cluster '%s' at %s\n", event.Vm.Name, event.ComputeResource.Name, event.CreatedTime.In(location).Format(time.ANSIC))
|
||||
|
||||
// filter all the disconnected events to the ones belonging to this VM
|
||||
for _, e := range allVmDisconnectedEvents {
|
||||
if e.Vm.Name == event.Vm.Name {
|
||||
log.Printf("Adding VM disconnected event on host %s at time %s\n", e.Host.Name, e.CreatedTime.In(location))
|
||||
vmDisconnectedEvents = append(vmDisconnectedEvents, e)
|
||||
}
|
||||
}
|
||||
log.Printf("Filtered '%d' VmDisconnectedEvent events belonging to VM '%s'\n", len(vmDisconnectedEvents), event.Vm.Name)
|
||||
|
||||
// Get a reference to the cluster mentioned in the event
|
||||
cluster := getCluster((event.ComputeResource.Name))
|
||||
vm := getVmInCluster(event.Vm.Name, cluster.Reference())
|
||||
vm, err = getVmInCluster(event.Vm.Name, cluster.Reference())
|
||||
//log.Printf("VM: '%+v'\n", vm)
|
||||
//vm := getVM(event.Vm.Name)
|
||||
|
||||
// Use VmDisconnectedEvent to see which host this VM was on
|
||||
vmDisconnectedEvents := getEvents([]string{"VmDisconnectedEvent"}, []types.ManagedObjectReference{vm.Reference()}, *begin, *end)
|
||||
log.Printf("Retrieved '%d' VmDisconnectedEvent events.\n", len(vmDisconnectedEvents))
|
||||
// If we couldn't find the vm then try using a list of all the disconnected events found
|
||||
//if len(vm.ExtensibleManagedObject.Self.Value) == 0 {
|
||||
if err != nil {
|
||||
log.Printf("No VM matching string '%s' found in cluster '%s'\n", event.Vm.Name, cluster.Reference())
|
||||
vmFound = false
|
||||
} else {
|
||||
// Use VmDisconnectedEvent to see which host this VM was on
|
||||
vmFound = true
|
||||
//vmDisconnectedEvents = getEvents([]string{"VmDisconnectedEvent"}, []types.ManagedObjectReference{vm.Reference()}, *begin, *end)
|
||||
//log.Printf("Retrieved '%d' VmDisconnectedEvent events.\n", len(vmDisconnectedEvents))
|
||||
}
|
||||
|
||||
// Determine which host the VM was previously running on
|
||||
if len(vmDisconnectedEvents) > 0 {
|
||||
@@ -336,7 +404,7 @@ func main() {
|
||||
if len(possibleHosts) == 0 {
|
||||
log.Printf("No corresponding VM disconnected messages, falling back to any applicable host that experienced a HA event.\n")
|
||||
// Search for host failures
|
||||
for _, hostEvent := range hostFailures {
|
||||
for _, hostEvent := range haUnreachableEvents {
|
||||
if hostEvent.CreatedTime.In(location).Before(fuzzyTime) || hostEvent.CreatedTime.In(location).Equal(fuzzyTime) {
|
||||
possibleHosts = append(possibleHosts, hostEvent)
|
||||
}
|
||||
@@ -350,7 +418,7 @@ func main() {
|
||||
}
|
||||
}
|
||||
} else { // Didn't find any VM disconnected events
|
||||
log.Printf("could not determine previous host for this VM. Filtering all host failures for events prior to fuzzy VM restart time '%s'\n", fuzzyTime)
|
||||
log.Printf("could not determine previous host for this VM. Filtering all ha unreachable events prior to fuzzy VM restart time '%s'\n", fuzzyTime)
|
||||
|
||||
// TODO Use HA unreachable events to find the host
|
||||
for _, hostEvent := range haUnreachableEvents {
|
||||
@@ -367,6 +435,9 @@ func main() {
|
||||
}
|
||||
*/
|
||||
log.Printf("Based on event times there were %d possible hosts this VM was running on\n", len(possibleHosts))
|
||||
for _, hostEvent := range possibleHosts {
|
||||
log.Printf("Host %s (%s)\n", hostEvent.Host.Name, hostEvent.CreatedTime.In(location).Format(time.ANSIC))
|
||||
}
|
||||
|
||||
if len(possibleHosts) == 0 {
|
||||
log.Printf("No ESXi outage events happened before VM %s fuzzy restart event at %s, skipping this event.\n", event.Vm.Name, fuzzyTime)
|
||||
@@ -406,8 +477,8 @@ func main() {
|
||||
var checkActualTime []types.Event
|
||||
|
||||
// Search for any disconnected messages prior to actual restart time rather than fuzzy time
|
||||
log.Printf("Checking host failure list based on actual VM restart time %s\n", vmRestartTime)
|
||||
for _, hostEvent := range hostFailures {
|
||||
log.Printf("Checking possible hosts list based on actual VM restart time %s\n", vmRestartTime)
|
||||
for _, hostEvent := range possibleHosts {
|
||||
if hostEvent.CreatedTime.In(location).Before(vmRestartTime) || hostEvent.CreatedTime.In(location).Equal(vmRestartTime) {
|
||||
checkActualTime = append(checkActualTime, hostEvent)
|
||||
}
|
||||
@@ -419,10 +490,16 @@ func main() {
|
||||
failedHost = checkActualTime[0].Host.Name
|
||||
outageStart = checkActualTime[0].CreatedTime.In(location)
|
||||
restartTime = vmRestartTime
|
||||
} else if len(checkActualTime) > 1 {
|
||||
lastIndex := len(checkActualTime) - 1
|
||||
log.Printf("Found multiple hosts corresponding to actual VM restart time. Failed host was '%s', using outage start time of '%s'\n", checkActualTime[lastIndex].Host.Name, checkActualTime[lastIndex].CreatedTime.In(location))
|
||||
failedHost = checkActualTime[lastIndex].Host.Name
|
||||
outageStart = checkActualTime[lastIndex].CreatedTime.In(location)
|
||||
restartTime = vmRestartTime
|
||||
} else {
|
||||
// if using the actual VM restart time doesn't narrow things down then go back to using the last host failure time before the fuzzy VM restart time
|
||||
lastIndex := len(possibleHosts) - 1
|
||||
log.Printf("Failed host was '%s', using outage start time of '%s'\n", possibleHosts[lastIndex].Host.Name, possibleHosts[lastIndex].CreatedTime.In(location))
|
||||
log.Printf("Last failed host before restart time was '%s'. Using outage start time of '%s'\n", possibleHosts[lastIndex].Host.Name, possibleHosts[lastIndex].CreatedTime.In(location))
|
||||
failedHost = possibleHosts[lastIndex].Host.Name
|
||||
outageStart = possibleHosts[lastIndex].CreatedTime.In(location)
|
||||
restartTime = vmRestartTime
|
||||
@@ -437,6 +514,14 @@ func main() {
|
||||
duration := restartTime.Sub(outageStart)
|
||||
out := time.Time{}.Add(duration)
|
||||
|
||||
if vmFound {
|
||||
vmOS = vm.Summary.Guest.GuestFullName
|
||||
vmPowerState = string(vm.Summary.Runtime.PowerState)
|
||||
} else {
|
||||
vmOS = ""
|
||||
vmPowerState = ""
|
||||
}
|
||||
|
||||
// Create a new result
|
||||
result := OutageResults{
|
||||
VM: event.Vm.Name,
|
||||
@@ -446,8 +531,8 @@ func main() {
|
||||
Cluster: event.ComputeResource.Name,
|
||||
FailedHost: failedHost,
|
||||
NewHost: event.Host.Name,
|
||||
GuestOS: vm.Summary.Guest.GuestFullName,
|
||||
CurrentPowerState: string(vm.Summary.Runtime.PowerState),
|
||||
GuestOS: vmOS,
|
||||
CurrentPowerState: vmPowerState,
|
||||
Description: event.FullFormattedMessage,
|
||||
}
|
||||
// Append to list of all results
|
||||
@@ -463,7 +548,7 @@ func main() {
|
||||
})
|
||||
}
|
||||
} else {
|
||||
log.Printf("Found %d hostfailure messages.", len(hostFailures))
|
||||
log.Printf("Found %d hostfailure messages in last %.1f hour(s)", len(hostFailures), begin.Abs().Hours())
|
||||
}
|
||||
|
||||
// Combine details of host outages and VM outages into one interface
|
||||
|
Reference in New Issue
Block a user