Skip to content

Commit a21377a

Browse files
authored
Merge pull request #5 from phylsix/alerting
email alerts for exception and bad workflow failures
2 parents b3007eb + d3d8447 commit a21377a

2 files changed

Lines changed: 141 additions & 19 deletions

File tree

workflowmonit/alertingDefs.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
#!/usr/bin/env python
2+
from __future__ import print_function
3+
import time
4+
import json
5+
import smtplib
6+
from email.mime.text import MIMEText
7+
8+
9+
def onFailureRate(doc, thres=0.5):
    """
    Decide whether a workflow deserves a failure-rate alert.

    An alert is raised only when ALL of the following hold:

    - ``doc['status']`` is ``'running-open'``;
    - ``doc['failureRate']`` is not below ``thres``;
    - ``doc['transitions']`` holds at least one entry whose ``'Status'`` is
      ``'running-open'``, and the ``'UpdateTime'`` of the first such entry
      lies at least two days before the current time.

    :param doc dict: information describing a workflow
    :param thres float: failure-rate threshold
    :returns: ``(True, short message)`` when alerting, ``(False, '')`` otherwise
    """

    noAlert = (False, '')
    twoDaysInSec = 2 * 24 * 60 * 60

    if doc['status'] != 'running-open':
        return noAlert

    if doc['failureRate'] < thres:
        return noAlert

    openTransitions = [
        transition for transition in doc['transitions']
        if transition['Status'] == 'running-open'
    ]
    if not openTransitions:
        return noAlert

    # Only the first running-open transition is used as the start time.
    startedAt = openTransitions[0]['UpdateTime']
    elapsed = time.time() - startedAt
    if elapsed < twoDaysInSec:
        return noAlert

    template = ('FailureRate ({}) larger than threshold({}), '
                'while running time over 2 days (started at {})')
    return (True, template.format(
        doc['failureRate'], thres, time.ctime(startedAt)))
34+
35+
36+
# Registry of alert checks. Each entry is called with a single workflow
# document and returns a (triggered, short message) tuple.
AlertDefs = [onFailureRate]
39+
40+
41+
def _buildAlertMessage(doc, shortAlertMsgs, sender, recipients):
    """Compose the alert email (a MIMEText instance) for one workflow document."""
    summary = '\n'.join(['- {}'.format(s) for s in shortAlertMsgs])
    body = '\n\n'.join([
        '*** THIS IS A GENERATED MESSAGE, PLEASE DO NOT REPLY ***',
        'Workflow: {}'.format(doc['name']),
        'Short Summary:\n{}'.format(summary),
        '-' * 79,
        'Full document:\n{}'.format(
            json.dumps(
                doc, sort_keys=True, indent=4, separators=(',', ': '))),
    ])

    contentMsg = MIMEText(body)
    contentMsg['Subject'] = '[workflowmonit] Alert on * {} *'.format(
        doc['name'])
    contentMsg['From'] = sender
    contentMsg['To'] = ', '.join(recipients)
    return contentMsg


def alertWithEmail(docs, recipients):
    """
    Run every check in `AlertDefs` on each document and email the positives.

    One email is sent per document for which at least one check triggered.
    Nothing is sent, and no SMTP connection is opened, when `recipients` is
    empty or when no document triggers a check.

    :param docs list: list of documents
    :param recipients list: list of recipients email addresses
    """

    # smtplib refuses an empty recipient list (SMTPRecipientsRefused), so an
    # empty list is treated as "alerting disabled" rather than as an error.
    if not recipients:
        return

    sender = 'toolsandint-workflowmonitalert@cern.ch'

    server = None
    try:
        for doc in docs:
            alertResults = [alertDef(doc) for alertDef in AlertDefs]
            # Build a real list: on Python 3 `filter` returns an iterator,
            # which is truthy even when it yields nothing.
            shortAlertMsgs = [res[1] for res in alertResults if res[0]]
            if not shortAlertMsgs:
                continue

            contentMsg = _buildAlertMessage(
                doc, shortAlertMsgs, sender, recipients)

            # Connect lazily and reuse the connection for later alerts.
            if server is None:
                server = smtplib.SMTP('localhost')
            server.sendmail(sender, recipients, contentMsg.as_string())
    finally:
        # Always release the connection, also when sending fails.
        if server is not None:
            server.quit()
77+
78+
79+
def errorEmailShooter(msg, recipients):
    """
    Forward an error message to the recipients by email.

    Nothing is sent, and no SMTP connection is opened, when `recipients` is
    empty.

    :param msg str: error message
    :param recipients list: list of recipients email addresses
    """

    # smtplib refuses an empty recipient list (SMTPRecipientsRefused); with
    # nobody to notify there is nothing useful to do.
    if not recipients:
        return

    sender = 'toolsandint-workflowmonitalert@cern.ch'

    contentMsg = MIMEText(msg)
    contentMsg['Subject'] = 'Exception caught for workflowmonit'
    contentMsg['From'] = sender
    contentMsg['To'] = ', '.join(recipients)

    s = smtplib.SMTP('localhost')
    try:
        s.sendmail(sender, recipients, contentMsg.as_string())
    finally:
        # Release the connection even when sending fails.
        s.quit()
96+
97+
98+
def main():
    """
    Manual check: load a saved document dump and run the email alerting on it.

    Reads 'Logs/toSendDoc_190317-033802.json' next to this file, prints the
    (name, failureRate) pair of every document, then calls `alertWithEmail`
    with a fixed recipient.
    """

    import os

    testdoc = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        'Logs/toSendDoc_190317-033802.json')
    # Context manager so the file handle is closed even if parsing fails.
    with open(testdoc) as f:
        docs = json.load(f)
    print([(d['name'], d['failureRate']) for d in docs])

    alertWithEmail(docs, ['weinan.si@cern.ch', ])
108+
109+
110+
111+
# Run the manual check only when executed as a script, not on import.
if __name__ == '__main__':
    main()

workflowmonit/sendToMonit.py

Lines changed: 29 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import yaml
1414
from workflowmonit.stompAMQ import stompAMQ
1515
import workflowmonit.workflowCollector as wc
16+
import workflowmonit.alertingDefs as ad
1617

1718
CRED_FILE_PATH = os.path.join(os.path.dirname(
1819
os.path.abspath(__file__)), 'credential.yml')
@@ -268,31 +269,40 @@ def sendDoc(cred, docs):
268269

269270
def main():
    """
    Build the workflow documents, send alerts, back them up and send them out.

    Steps, in order: configure logging from LOGGING_CONFIG, build the
    documents with `buildDoc`, run `ad.alertWithEmail` on them, save a
    timestamped copy under LOGDIR, pass them to `sendDoc`, and save whatever
    `sendDoc` returns as failures.

    Any exception raised during these steps is caught and its traceback is
    emailed to the 'alert_recipients' listed in the config file; it is not
    re-raised.
    """
    import traceback

    global logger

    recipients = wc.get_yamlconfig(CONFIG_FILE_PATH).get('alert_recipients', [])

    try:
        with open(LOGGING_CONFIG, 'r') as f:
            config = yaml.safe_load(f.read())
            logging.config.dictConfig(config)

        logger = logging.getLogger('workflowmonitLogger')

        cred = wc.get_yamlconfig(CRED_FILE_PATH)
        docs = buildDoc(CONFIG_FILE_PATH)

        # handling alerts
        ad.alertWithEmail(docs, recipients)

        # backup documents
        if not os.path.isdir(LOGDIR):
            os.makedirs(LOGDIR)

        doc_bkp = os.path.join(LOGDIR, 'toSendDoc_{}'.format(
            time.strftime('%y%m%d-%H%M%S')))
        wc.save_json(docs, doc_bkp)
        logger.info('Document saved at: {}.json'.format(doc_bkp))

        failures = sendDoc(cred=cred, docs=docs)

        if failures:
            failedDocs_bkp = os.path.join(
                LOGDIR, 'amqFailedMsg_{}'.format(time.strftime('%y%m%d-%H%M%S')))
            wc.save_json(failures, failedDocs_bkp)
            logger.info('Failed message saved at: {}.json'.format(failedDocs_bkp))
    except Exception:
        # str(e) alone drops the exception type and where it was raised (and
        # is empty for some exceptions); mail the full traceback instead.
        ad.errorEmailShooter(traceback.format_exc(), recipients)
296306

297307

298308
if __name__ == "__main__":

0 commit comments

Comments
 (0)