/** * This transition executes in the event-dispatcher thread, though it's * triggered in MRAppMaster's startJobs() method. */ @Override public void transition(JobImpl job, JobEvent event) { JobStartEvent jse = (JobStartEvent) event; if (jse.getRecoveredJobStartTime() != 0) { job.startTime = jse.getRecoveredJobStartTime(); } else { job.startTime = job.clock.getTime(); } JobInitedEvent jie = new JobInitedEvent(job.oldJobId, job.startTime, job.numMapTasks, job.numReduceTasks, job.getState().toString(), job.isUber()); job.eventHandler.handle(new JobHistoryEvent(job.jobId, jie)); JobInfoChangeEvent jice = new JobInfoChangeEvent(job.oldJobId, job.appSubmitTime, job.startTime); job.eventHandler.handle(new JobHistoryEvent(job.jobId, jice)); job.metrics.runningJob(job); job.eventHandler.handle(new CommitterJobSetupEvent( job.jobId, job.jobContext)); }
@Override public JobStateInternal transition(JobImpl job, JobEvent event) { job.completedTaskCount++; LOG.info("Num completed Tasks: " + job.completedTaskCount); JobTaskEvent taskEvent = (JobTaskEvent) event; Task task = job.tasks.get(taskEvent.getTaskID()); if (taskEvent.getState() == TaskState.SUCCEEDED) { taskSucceeded(job, task); } else if (taskEvent.getState() == TaskState.FAILED) { taskFailed(job, task); } else if (taskEvent.getState() == TaskState.KILLED) { taskKilled(job, task); } return checkJobAfterTaskCompletion(job); }
@SuppressWarnings("unchecked") @Override public KillJobResponse killJob(KillJobRequest request) throws IOException { JobId jobId = request.getJobId(); UserGroupInformation callerUGI = UserGroupInformation.getCurrentUser(); String message = "Kill job " + jobId + " received from " + callerUGI + " at " + Server.getRemoteAddress(); LOG.info(message); verifyAndGetJob(jobId, JobACL.MODIFY_JOB, false); appContext.getEventHandler().handle( new JobDiagnosticsUpdateEvent(jobId, message)); appContext.getEventHandler().handle( new JobEvent(jobId, JobEventType.JOB_KILL)); KillJobResponse response = recordFactory.newRecordInstance(KillJobResponse.class); return response; }
@Test public void testJobRebootNotLastRetryOnUnregistrationFailure() throws Exception { MRApp app = new MRApp(1, 0, false, this.getClass().getName(), true); Job job = app.submit(new Configuration()); app.waitForState(job, JobState.RUNNING); Assert.assertEquals("Num tasks not correct", 1, job.getTasks().size()); Iterator<Task> it = job.getTasks().values().iterator(); Task task = it.next(); app.waitForState(task, TaskState.RUNNING); //send an reboot event app.getContext().getEventHandler().handle(new JobEvent(job.getID(), JobEventType.JOB_AM_REBOOT)); // return exteranl state as RUNNING since otherwise the JobClient will // prematurely exit. app.waitForState(job, JobState.RUNNING); }
@Test public void testJobRebootOnLastRetryOnUnregistrationFailure() throws Exception { // make startCount as 2 since this is last retry which equals to // DEFAULT_MAX_AM_RETRY // The last param mocks the unregistration failure MRApp app = new MRApp(1, 0, false, this.getClass().getName(), true, 2, false); Configuration conf = new Configuration(); Job job = app.submit(conf); app.waitForState(job, JobState.RUNNING); Assert.assertEquals("Num tasks not correct", 1, job.getTasks().size()); Iterator<Task> it = job.getTasks().values().iterator(); Task task = it.next(); app.waitForState(task, TaskState.RUNNING); //send an reboot event app.getContext().getEventHandler().handle(new JobEvent(job.getID(), JobEventType.JOB_AM_REBOOT)); app.waitForInternalState((JobImpl) job, JobStateInternal.REBOOT); // return exteranl state as RUNNING if this is the last retry while // unregistration fails app.waitForState(job, JobState.RUNNING); }
@Test(timeout=20000) public void testKilledDuringCommit() throws Exception { Configuration conf = new Configuration(); conf.set(MRJobConfig.MR_AM_STAGING_DIR, stagingDir); AsyncDispatcher dispatcher = new AsyncDispatcher(); dispatcher.init(conf); dispatcher.start(); CyclicBarrier syncBarrier = new CyclicBarrier(2); OutputCommitter committer = new WaitingOutputCommitter(syncBarrier, true); CommitterEventHandler commitHandler = createCommitterEventHandler(dispatcher, committer); commitHandler.init(conf); commitHandler.start(); JobImpl job = createRunningStubbedJob(conf, dispatcher, 2, null); completeJobTasks(job); assertJobState(job, JobStateInternal.COMMITTING); syncBarrier.await(); job.handle(new JobEvent(job.getID(), JobEventType.JOB_KILL)); assertJobState(job, JobStateInternal.KILLED); dispatcher.stop(); commitHandler.stop(); }
@Test public void testReportDiagnostics() throws Exception { JobID jobID = JobID.forName("job_1234567890000_0001"); JobId jobId = TypeConverter.toYarn(jobID); final String diagMsg = "some diagnostic message"; final JobDiagnosticsUpdateEvent diagUpdateEvent = new JobDiagnosticsUpdateEvent(jobId, diagMsg); MRAppMetrics mrAppMetrics = MRAppMetrics.create(); AppContext mockContext = mock(AppContext.class); when(mockContext.hasSuccessfullyUnregistered()).thenReturn(true); JobImpl job = new JobImpl(jobId, Records .newRecord(ApplicationAttemptId.class), new Configuration(), mock(EventHandler.class), null, mock(JobTokenSecretManager.class), null, new SystemClock(), null, mrAppMetrics, null, true, null, 0, null, mockContext, null, null); job.handle(diagUpdateEvent); String diagnostics = job.getReport().getDiagnostics(); Assert.assertNotNull(diagnostics); Assert.assertTrue(diagnostics.contains(diagMsg)); job = new JobImpl(jobId, Records .newRecord(ApplicationAttemptId.class), new Configuration(), mock(EventHandler.class), null, mock(JobTokenSecretManager.class), null, new SystemClock(), null, mrAppMetrics, null, true, null, 0, null, mockContext, null, null); job.handle(new JobEvent(jobId, JobEventType.JOB_KILL)); job.handle(diagUpdateEvent); diagnostics = job.getReport().getDiagnostics(); Assert.assertNotNull(diagnostics); Assert.assertTrue(diagnostics.contains(diagMsg)); }
private boolean testUberDecision(Configuration conf) { JobID jobID = JobID.forName("job_1234567890000_0001"); JobId jobId = TypeConverter.toYarn(jobID); MRAppMetrics mrAppMetrics = MRAppMetrics.create(); JobImpl job = new JobImpl(jobId, ApplicationAttemptId.newInstance( ApplicationId.newInstance(0, 0), 0), conf, mock(EventHandler.class), null, new JobTokenSecretManager(), new Credentials(), null, null, mrAppMetrics, null, true, null, 0, null, null, null, null); InitTransition initTransition = getInitTransition(2); JobEvent mockJobEvent = mock(JobEvent.class); initTransition.transition(job, mockJobEvent); boolean isUber = job.isUber(); return isUber; }
@Test public void testTransitionsAtFailed() throws IOException { Configuration conf = new Configuration(); AsyncDispatcher dispatcher = new AsyncDispatcher(); dispatcher.init(conf); dispatcher.start(); OutputCommitter committer = mock(OutputCommitter.class); doThrow(new IOException("forcefail")) .when(committer).setupJob(any(JobContext.class)); CommitterEventHandler commitHandler = createCommitterEventHandler(dispatcher, committer); commitHandler.init(conf); commitHandler.start(); AppContext mockContext = mock(AppContext.class); when(mockContext.hasSuccessfullyUnregistered()).thenReturn(false); JobImpl job = createStubbedJob(conf, dispatcher, 2, mockContext); JobId jobId = job.getID(); job.handle(new JobEvent(jobId, JobEventType.JOB_INIT)); assertJobState(job, JobStateInternal.INITED); job.handle(new JobStartEvent(jobId)); assertJobState(job, JobStateInternal.FAILED); job.handle(new JobEvent(jobId, JobEventType.JOB_TASK_COMPLETED)); assertJobState(job, JobStateInternal.FAILED); job.handle(new JobEvent(jobId, JobEventType.JOB_TASK_ATTEMPT_COMPLETED)); assertJobState(job, JobStateInternal.FAILED); job.handle(new JobEvent(jobId, JobEventType.JOB_MAP_TASK_RESCHEDULED)); assertJobState(job, JobStateInternal.FAILED); job.handle(new JobEvent(jobId, JobEventType.JOB_TASK_ATTEMPT_FETCH_FAILURE)); assertJobState(job, JobStateInternal.FAILED); Assert.assertEquals(JobState.RUNNING, job.getState()); when(mockContext.hasSuccessfullyUnregistered()).thenReturn(true); Assert.assertEquals(JobState.FAILED, job.getState()); dispatcher.stop(); commitHandler.stop(); }
@Test public void testAbsentNotificationOnNotLastRetryUnregistrationFailure() throws Exception { HttpServer2 server = startHttpServer(); MRApp app = spy(new MRAppWithCustomContainerAllocator(2, 2, false, this.getClass().getName(), true, 1, false)); doNothing().when(app).sysexit(); JobConf conf = new JobConf(); conf.set(JobContext.MR_JOB_END_NOTIFICATION_URL, JobEndServlet.baseUrl + "jobend?jobid=$jobId&status=$jobStatus"); JobImpl job = (JobImpl)app.submit(conf); app.waitForState(job, JobState.RUNNING); app.getContext().getEventHandler() .handle(new JobEvent(app.getJobId(), JobEventType.JOB_AM_REBOOT)); app.waitForInternalState(job, JobStateInternal.REBOOT); // Now shutdown. // Unregistration fails: isLastAMRetry is recalculated, this is not app.shutDownJob(); // Not the last AM attempt. So user should that the job is still running. app.waitForState(job, JobState.RUNNING); Assert.assertFalse(app.isLastAMRetry()); Assert.assertEquals(0, JobEndServlet.calledTimes); Assert.assertNull(JobEndServlet.requestUri); Assert.assertNull(JobEndServlet.foundJobState); server.stop(); }
private static AppContext createAppContext( ApplicationAttemptId appAttemptId, Job job) { AppContext context = mock(AppContext.class); ApplicationId appId = appAttemptId.getApplicationId(); when(context.getApplicationID()).thenReturn(appId); when(context.getApplicationAttemptId()).thenReturn(appAttemptId); when(context.getJob(isA(JobId.class))).thenReturn(job); when(context.getClusterInfo()).thenReturn( new ClusterInfo(Resource.newInstance(10240, 1))); when(context.getEventHandler()).thenReturn(new EventHandler() { @Override public void handle(Event event) { // Only capture interesting events. if (event instanceof TaskAttemptContainerAssignedEvent) { events.add((TaskAttemptContainerAssignedEvent) event); } else if (event instanceof TaskAttemptKillEvent) { taskAttemptKillEvents.add((TaskAttemptKillEvent)event); } else if (event instanceof JobUpdatedNodesEvent) { jobUpdatedNodeEvents.add((JobUpdatedNodesEvent)event); } else if (event instanceof JobEvent) { jobEvents.add((JobEvent)event); } } }); return context; }
/** * This transition executes in the event-dispatcher thread, though it's * triggered in MRAppMaster's startJobs() method. */ @Override public void transition(JobImpl job, JobEvent event) { JobStartEvent jse = (JobStartEvent) event; if (jse.getRecoveredJobStartTime() != -1L) { job.startTime = jse.getRecoveredJobStartTime(); } else { job.startTime = job.clock.getTime(); } JobInitedEvent jie = new JobInitedEvent(job.oldJobId, job.startTime, job.numMapTasks, job.numReduceTasks, job.getState().toString(), job.isUber()); job.eventHandler.handle(new JobHistoryEvent(job.jobId, jie)); JobInfoChangeEvent jice = new JobInfoChangeEvent(job.oldJobId, job.appSubmitTime, job.startTime); job.eventHandler.handle(new JobHistoryEvent(job.jobId, jice)); job.metrics.runningJob(job); job.eventHandler.handle(new CommitterJobSetupEvent( job.jobId, job.jobContext)); }
@SuppressWarnings("unchecked") @Override public void handle(TaskAttemptEvent event) { if (LOG.isDebugEnabled()) { LOG.debug("Processing " + event.getTaskAttemptID() + " of type " + event.getType()); } writeLock.lock(); try { final TaskAttemptStateInternal oldState = getInternalState() ; try { stateMachine.doTransition(event.getType(), event); } catch (InvalidStateTransitonException e) { LOG.error("Can't handle this event at current state for " + this.attemptId, e); eventHandler.handle(new JobDiagnosticsUpdateEvent( this.attemptId.getTaskId().getJobId(), "Invalid event " + event.getType() + " on TaskAttempt " + this.attemptId)); eventHandler.handle(new JobEvent(this.attemptId.getTaskId().getJobId(), JobEventType.INTERNAL_ERROR)); } if (oldState != getInternalState()) { LOG.info(attemptId + " TaskAttempt Transitioned from " + oldState + " to " + getInternalState()); } } finally { writeLock.unlock(); } }
@Override /** * The only entry point to change the Job. */ public void handle(JobEvent event) { if (LOG.isDebugEnabled()) { LOG.debug("Processing " + event.getJobId() + " of type " + event.getType()); } try { writeLock.lock(); JobStateInternal oldState = getInternalState(); try { getStateMachine().doTransition(event.getType(), event); } catch (InvalidStateTransitonException e) { LOG.error("Can't handle this event at current state", e); addDiagnostic("Invalid event " + event.getType() + " on Job " + this.jobId); eventHandler.handle(new JobEvent(this.jobId, JobEventType.INTERNAL_ERROR)); } //notify the eventhandler of state change if (oldState != getInternalState()) { LOG.info(jobId + "Job Transitioned from " + oldState + " to " + getInternalState()); rememberLastNonFinalState(oldState); } } finally { writeLock.unlock(); } }
@Override public void transition(JobImpl job, JobEvent event) { job.setupProgress = 1.0f; job.scheduleTasks(job.mapTasks, job.numReduceTasks == 0); job.scheduleTasks(job.reduceTasks, true); // If we have no tasks, just transition to job completed if (job.numReduceTasks == 0 && job.numMapTasks == 0) { job.eventHandler.handle(new JobEvent(job.jobId, JobEventType.JOB_COMPLETED)); } }
@Override public void transition(JobImpl job, JobEvent event) { job.metrics.endRunningJob(job); job.addDiagnostic("Job setup failed : " + ((JobSetupFailedEvent) event).getMessage()); job.eventHandler.handle(new CommitterJobAbortEvent(job.jobId, job.jobContext, org.apache.hadoop.mapreduce.JobStatus.State.FAILED)); }
@Override public JobStateInternal transition(JobImpl job, JobEvent event) { if(!job.failWaitTriggerScheduledFuture.isCancelled()) { for(Task task: job.tasks.values()) { if(!task.isFinished()) { return JobStateInternal.FAIL_WAIT; } } } //Finished waiting. All tasks finished / were killed job.failWaitTriggerScheduledFuture.cancel(false); job.eventHandler.handle(new CommitterJobAbortEvent(job.jobId, job.jobContext, org.apache.hadoop.mapreduce.JobStatus.State.FAILED)); return JobStateInternal.FAIL_ABORT; }
@Override public void transition(JobImpl job, JobEvent event) { LOG.info("Timeout expired in FAIL_WAIT waiting for tasks to get killed." + " Going to fail job anyway"); job.failWaitTriggerScheduledFuture.cancel(false); job.eventHandler.handle(new CommitterJobAbortEvent(job.jobId, job.jobContext, org.apache.hadoop.mapreduce.JobStatus.State.FAILED)); }
@Override public void transition(JobImpl job, JobEvent event) { job.setFinishTime(); JobUnsuccessfulCompletionEvent failedEvent = new JobUnsuccessfulCompletionEvent(job.oldJobId, job.finishTime, 0, 0, JobStateInternal.KILLED.toString(), job.diagnostics); job.eventHandler.handle(new JobHistoryEvent(job.jobId, failedEvent)); job.finished(JobStateInternal.KILLED); }
@Override public void transition(JobImpl job, JobEvent event) { job.addDiagnostic("Job received Kill in INITED state."); job.eventHandler.handle(new CommitterJobAbortEvent(job.jobId, job.jobContext, org.apache.hadoop.mapreduce.JobStatus.State.KILLED)); }
@Override public void transition(JobImpl job, JobEvent event) { job.metrics.endRunningJob(job); job.addDiagnostic("Job received kill in SETUP state."); job.eventHandler.handle(new CommitterJobAbortEvent(job.jobId, job.jobContext, org.apache.hadoop.mapreduce.JobStatus.State.KILLED)); }
@Override public void transition(JobImpl job, JobEvent event) { job.addDiagnostic(JOB_KILLED_DIAG); for (Task task : job.tasks.values()) { job.eventHandler.handle( new TaskEvent(task.getID(), TaskEventType.T_KILL)); } job.metrics.endRunningJob(job); }
@Override public void transition(JobImpl job, JobEvent event) { //get number of shuffling reduces int shufflingReduceTasks = 0; for (TaskId taskId : job.reduceTasks) { Task task = job.tasks.get(taskId); if (TaskState.RUNNING.equals(task.getState())) { for(TaskAttempt attempt : task.getAttempts().values()) { if(attempt.getPhase() == Phase.SHUFFLE) { shufflingReduceTasks++; break; } } } } JobTaskAttemptFetchFailureEvent fetchfailureEvent = (JobTaskAttemptFetchFailureEvent) event; for (org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId mapId : fetchfailureEvent.getMaps()) { Integer fetchFailures = job.fetchFailuresMapping.get(mapId); fetchFailures = (fetchFailures == null) ? 1 : (fetchFailures+1); job.fetchFailuresMapping.put(mapId, fetchFailures); float failureRate = shufflingReduceTasks == 0 ? 1.0f : (float) fetchFailures / shufflingReduceTasks; // declare faulty if fetch-failures >= max-allowed-failures if (fetchFailures >= job.getMaxFetchFailuresNotifications() && failureRate >= job.getMaxAllowedFetchFailuresFraction()) { LOG.info("Too many fetch-failures for output of task attempt: " + mapId + " ... raising fetch failure to map"); job.eventHandler.handle(new TaskAttemptEvent(mapId, TaskAttemptEventType.TA_TOO_MANY_FETCH_FAILURE)); job.fetchFailuresMapping.remove(mapId); } } }
@Override public void transition(JobImpl job, JobEvent event) { JobCommitFailedEvent jcfe = (JobCommitFailedEvent)event; job.addDiagnostic("Job commit failed: " + jcfe.getMessage()); job.eventHandler.handle(new CommitterJobAbortEvent(job.jobId, job.jobContext, org.apache.hadoop.mapreduce.JobStatus.State.FAILED)); }
@Override public void transition(JobImpl job, JobEvent event) { job.setFinishTime(); job.eventHandler.handle(new CommitterJobAbortEvent(job.jobId, job.jobContext, org.apache.hadoop.mapreduce.JobStatus.State.KILLED)); }
@Override public void transition(JobImpl job, JobEvent event) { JobCounterUpdateEvent jce = (JobCounterUpdateEvent) event; for (JobCounterUpdateEvent.CounterIncrementalUpdate ci : jce .getCounterUpdates()) { job.jobCounters.findCounter(ci.getCounterKey()).increment( ci.getIncrementValue()); } }
@Override public void transition(JobImpl job, JobEvent event) { JobUpdatedNodesEvent updateEvent = (JobUpdatedNodesEvent) event; for(NodeReport nr: updateEvent.getUpdatedNodes()) { NodeState nodeState = nr.getNodeState(); if(nodeState.isUnusable()) { // act on the updates job.actOnUnusableNode(nr.getNodeId(), nodeState); } } }