Skip to content

Commit

Permalink
Set up cron job for mailing the app admin if any MR jobs have failed …
Browse files Browse the repository at this point in the history
…recently.

WARNING: this is not ready for merge into develop. We still need to patch
gae-mapreduce-1.9.0.0/mapreduce/model.py, line 908, to use indexed=True.
  • Loading branch information
seanlip committed Sep 5, 2014
1 parent 630ef55 commit 45bfd0c
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 18 deletions.
42 changes: 41 additions & 1 deletion core/controllers/cron.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,44 @@

"""Controllers for the cron jobs."""

pass
from core import jobs
from core.controllers import base
from core.platform import models
email_services = models.Registry.import_email_services()
import feconf


class JobFailureMailerHandler(base.BaseHandler):
    """Cron handler that emails the admin a report about failed jobs."""

    def get(self):
        """Handles GET requests.

        Asks core.jobs for the jobs that got stuck within the last 90
        minutes and, if there are any, sends a single summary email to
        feconf.ADMIN_EMAIL_ADDRESS. Nothing is sent when no such jobs are
        found.
        """
        window_msecs = 90 * 60 * 1000

        stuck_jobs = jobs.get_stuck_jobs(window_msecs)
        if not stuck_jobs:
            # Nothing to report, so do not send an email.
            return

        # Divider placed in front of each per-job section of the email.
        divider = (
            '\n'
            '-----------------------------------'
            '\n')
        job_section_template = (
            'Job with mapreduce ID %s (key name %s) failed. '
            'More info:\n\n'
            ' counters_map: %s\n'
            ' shard_retries: %s\n'
            ' slice_retries: %s\n'
            ' last_update_time: %s\n'
            ' last_work_item: %s\n')

        # Collect the pieces of the message and join them once at the end.
        message_parts = [
            'Some jobs have failed in the past 90 minutes. '
            'More information:']
        for stuck_job in stuck_jobs:
            message_parts.append(divider)
            message_parts.append(job_section_template % (
                stuck_job.mapreduce_id, stuck_job.key().name(),
                stuck_job.counters_map, stuck_job.retries,
                stuck_job.slice_retries, stuck_job.update_time,
                stuck_job.last_work_item))

        email_services.send_mail_to_admin(
            feconf.ADMIN_EMAIL_ADDRESS, 'MapReduce failure alert',
            ''.join(message_parts))
19 changes: 19 additions & 0 deletions core/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
from mapreduce import context
from mapreduce import input_readers
from mapreduce import mapreduce_pipeline
from mapreduce import model as mapreduce_model
from mapreduce.lib.pipeline import pipeline
from mapreduce import util as mapreduce_util

Expand Down Expand Up @@ -1087,6 +1088,24 @@ def get_continuous_computations_info(cc_classes):
return result


def get_stuck_jobs(recency_msecs):
    """Returns the recently-updated mapreduce shard states that have been
    retried at least once.

    Args:
        recency_msecs: Size of the look-back window, in milliseconds. Only
            shard states whose update_time is later than
            (current UTC time - recency_msecs) are considered.

    Returns:
        A list of mapreduce ShardState models, each of which was updated
        within the look-back window and has a retries count greater than
        zero.
    """
    threshold_time = (
        datetime.datetime.utcnow() -
        datetime.timedelta(milliseconds=recency_msecs))

    # NOTE: this query filters on ShardState.update_time. Per the note
    # accompanying this change, the bundled mapreduce library's model.py
    # needs that property declared with indexed=True.
    recent_shard_states = mapreduce_model.ShardState.all().filter(
        'update_time >', threshold_time)

    # A shard that has never been retried is not treated as stuck.
    return [
        shard_state for shard_state in recent_shard_states
        if shard_state.retries > 0]


ABSTRACT_BASE_CLASSES = frozenset([
BaseJobManager, BaseDeferredJobManager, BaseMapReduceJobManager,
BaseMapReduceJobManagerForContinuousComputations])
3 changes: 3 additions & 0 deletions cron.yaml
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
cron:
# The url must match the route registered for JobFailureMailerHandler in
# main_cron.py, which is /cron/mail/admin/job_failure.
- description: email admin about job failures
  url: /cron/mail/admin/job_failure
  schedule: every 1 hours
1 change: 1 addition & 0 deletions feconf.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@
# Committer id for system actions.
ADMIN_COMMITTER_ID = 'admin'
ADMIN_EMAIL_ADDRESS = '[email protected]'
# Ensure that ADMIN_EMAIL_ADDRESS is valid before setting this to True.
CAN_SEND_EMAILS_TO_ADMIN = False

# The maximum size of an uploaded file, in bytes.
Expand Down
2 changes: 1 addition & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def ui_access_wrapper(self, *args, **kwargs):
mapreduce_parameters.config.BASE_PATH = '/mapreduce/worker'


# Register the URL with the responsible classes
# Register the URLs with the classes responsible for handling them.
urls = [
get_redirect_route(r'/_ah/warmup', WarmupHandler, 'warmup_handler'),

Expand Down
22 changes: 6 additions & 16 deletions main_cron.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,26 +21,16 @@
from core.controllers import cron
from core.platform import models
transaction_services = models.Registry.import_transaction_services()
import feconf
import main

import webapp2
from webapp2_extras.routes import RedirectRoute


def get_redirect_route(regex_route, handler, name, defaults=None):
    """Builds a strict-slash RedirectRoute, so that /foo/ redirects to /foo.

    Warning: this method strips off parameters after the trailing slash. URLs
    with parameters should be formulated without the trailing slash.

    Args:
        regex_route: The URL pattern for the route.
        handler: The handler class responsible for the route.
        name: The name given to the route.
        defaults: Optional dict passed to the route as its defaults. An
            empty dict is used when this is None.

    Returns:
        The constructed RedirectRoute.
    """
    route_defaults = {} if defaults is None else defaults
    return RedirectRoute(
        regex_route, handler, name, strict_slash=True,
        defaults=route_defaults)


# Register the URL with the responsible classes
# Register the URLs with the classes responsible for handling them.
urls = [

main.get_redirect_route(
r'/cron/mail/admin/job_failure', cron.JobFailureMailerHandler,
'job_failure_mailer'),
]


Expand Down

0 comments on commit 45bfd0c

Please sign in to comment.