/**
 * The execute() method invokes doExecute() until either:
 *  1. doExecute() succeeds, or
 *  2. the command may no longer be retried (e.g. runs out of retry-attempts).
 * @param arguments The list of arguments for the command.
 * @return Generic "Object" from doExecute(), on success.
 * @throws Exception IOException, on complete failure.
 */
public T execute(Object... arguments) throws Exception {
  Exception latestException;
  int counter = 0;
  while (true) {
    try {
      return doExecute(arguments);
    } catch (Exception exception) {
      LOG.error("Failure in Retriable command: {}", description, exception);
      latestException = exception;
    }
    counter++;
    RetryAction action = retryPolicy.shouldRetry(latestException, counter, 0, true);
    if (action.action == RetryPolicy.RetryAction.RetryDecision.RETRY) {
      ThreadUtil.sleepAtLeastIgnoreInterrupts(action.delayMillis);
    } else {
      break;
    }
  }
  throw new IOException("Couldn't run retriable-command: " + description, latestException);
}
/**
 * The execute() method invokes doExecute() until either:
 *  1. doExecute() succeeds, or
 *  2. the command may no longer be retried (e.g. runs out of retry-attempts).
 * @param arguments The list of arguments for the command.
 * @return Generic "Object" from doExecute(), on success.
 * @throws Exception
 */
public Object execute(Object... arguments) throws Exception {
  Exception latestException;
  int counter = 0;
  while (true) {
    try {
      return doExecute(arguments);
    } catch (Exception exception) {
      LOG.error("Failure in Retriable command: " + description, exception);
      latestException = exception;
    }
    counter++;
    RetryAction action = retryPolicy.shouldRetry(latestException, counter, 0, true);
    if (action.action == RetryPolicy.RetryAction.RetryDecision.RETRY) {
      ThreadUtil.sleepAtLeastIgnoreInterrupts(action.delayMillis);
    } else {
      break;
    }
  }
  throw new IOException("Couldn't run retriable-command: " + description, latestException);
}
/**
 * The execute() method invokes doExecute() until either:
 *  1. doExecute() succeeds, or
 *  2. the command may no longer be retried (e.g. runs out of retry-attempts).
 * @param arguments The list of arguments for the command.
 * @return Generic "Object" from doExecute(), on success.
 * @throws Exception IOException, on complete failure.
 */
public Object execute(Object... arguments) throws Exception {
  Exception latestException;
  int counter = 0;
  while (true) {
    try {
      return doExecute(arguments);
    } catch (Exception exception) {
      LOG.error("Failure in Retriable command: " + description, exception);
      latestException = exception;
    }
    counter++;
    RetryAction action = retryPolicy.shouldRetry(latestException, counter, 0, true);
    if (action.action == RetryPolicy.RetryAction.RetryDecision.RETRY) {
      ThreadUtil.sleepAtLeastIgnoreInterrupts(action.delayMillis);
    } else {
      break;
    }
  }
  throw new IOException("Couldn't run retriable-command: " + description, latestException);
}
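Every snippet in this listing backs off with ThreadUtil.sleepAtLeastIgnoreInterrupts(). As a rough, non-authoritative sketch of what such a helper can be assumed to do (the class name ThreadUtilSketch below is hypothetical; the real org.apache.hadoop.util.ThreadUtil may differ in its logging and time source), it simply keeps sleeping until the requested minimum delay has elapsed, even if the thread is interrupted part-way through:

// Hypothetical helper for illustration only; the real Hadoop utility is
// org.apache.hadoop.util.ThreadUtil and may differ in details.
public final class ThreadUtilSketch {

  /**
   * Sleep for at least the given number of milliseconds, going back to sleep
   * if the thread is interrupted before the minimum delay has elapsed.
   */
  public static void sleepAtLeastIgnoreInterrupts(long millis) {
    long start = System.currentTimeMillis();
    long remaining = millis;
    while (remaining > 0) {
      try {
        Thread.sleep(remaining);
      } catch (InterruptedException ie) {
        // Deliberately swallowed: callers want a guaranteed minimum back-off.
      }
      remaining = millis - (System.currentTimeMillis() - start);
    }
  }

  private ThreadUtilSketch() {
  }
}

A retry loop like the execute() methods above depends on this guarantee so that a stray interrupt cannot shorten the delay dictated by the RetryPolicy.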
/**
 * Ensure that when all configured services are throwing StandbyException
 * that we fail over back and forth between them until one is no longer
 * throwing StandbyException.
 */
@Test
public void testFailoverBetweenMultipleStandbys()
    throws UnreliableException, StandbyException, IOException {
  final long millisToSleep = 10000;

  final UnreliableImplementation impl1 = new UnreliableImplementation("impl1",
      TypeOfExceptionToFailWith.STANDBY_EXCEPTION);
  FlipFlopProxyProvider<UnreliableInterface> proxyProvider =
      new FlipFlopProxyProvider<UnreliableInterface>(
          UnreliableInterface.class,
          impl1,
          new UnreliableImplementation("impl2",
              TypeOfExceptionToFailWith.STANDBY_EXCEPTION));
  final UnreliableInterface unreliable = (UnreliableInterface) RetryProxy
      .create(UnreliableInterface.class, proxyProvider,
          RetryPolicies.failoverOnNetworkException(
              RetryPolicies.TRY_ONCE_THEN_FAIL, 10, 1000, 10000));

  new Thread() {
    @Override
    public void run() {
      ThreadUtil.sleepAtLeastIgnoreInterrupts(millisToSleep);
      impl1.setIdentifier("renamed-impl1");
    }
  }.start();

  String result = unreliable.failsIfIdentifierDoesntMatch("renamed-impl1");
  assertEquals("renamed-impl1", result);
}
@Before
public void setUp() throws IOException {
  originalOut = System.err;
  memOut = new ByteArrayOutputStream();
  filterOut = new FilterOut(memOut);
  capturedOut = new PrintStream(filterOut);
  System.setErr(capturedOut);
  InputStream is = ThreadUtil.getResourceAsStream("log4j-kmsaudit.properties");
  PropertyConfigurator.configure(is);
  IOUtils.closeStream(is);
  this.kmsAudit = new KMSAudit(1000);
}
/**
 * Regression test for HDFS-3864 - NN does not update internal file mtime for
 * OP_CLOSE when reading from the edit log.
 */
@Test
public void testModTimePersistsAfterRestart() throws IOException {
  final long sleepTime = 10; // 10 milliseconds
  MiniDFSCluster cluster = null;
  FileSystem fs = null;
  Configuration conf = new HdfsConfiguration();
  try {
    cluster = new MiniDFSCluster.Builder(conf).build();
    fs = cluster.getFileSystem();
    Path testPath = new Path("/test");

    // Open a file, and get its initial modification time.
    OutputStream out = fs.create(testPath);
    long initialModTime = fs.getFileStatus(testPath).getModificationTime();
    assertTrue(initialModTime > 0);

    // Wait and then close the file. Ensure that the mod time goes up.
    ThreadUtil.sleepAtLeastIgnoreInterrupts(sleepTime);
    out.close();
    long modTimeAfterClose = fs.getFileStatus(testPath).getModificationTime();
    assertTrue(modTimeAfterClose >= initialModTime + sleepTime);

    // Restart the NN, and make sure that the later mod time is still used.
    cluster.restartNameNode();
    long modTimeAfterRestart = fs.getFileStatus(testPath).getModificationTime();
    assertEquals(modTimeAfterClose, modTimeAfterRestart);
  } finally {
    if (fs != null) {
      fs.close();
    }
    if (cluster != null) {
      cluster.shutdown();
    }
  }
}
private void copyResource(String inputResourceName, File outputFile)
    throws IOException {
  InputStream is = null;
  OutputStream os = null;
  try {
    is = ThreadUtil.getResourceAsStream(inputResourceName);
    os = new FileOutputStream(outputFile);
    IOUtils.copy(is, os);
  } finally {
    IOUtils.closeQuietly(is);
    IOUtils.closeQuietly(os);
  }
}
/**
 * Make sure that clients will receive StandbyExceptions even when a
 * checkpoint is in progress on the SBN, and therefore the StandbyCheckpointer
 * thread will have FSNS lock. Regression test for HDFS-4591.
 */
@Test(timeout=300000)
public void testStandbyExceptionThrownDuringCheckpoint() throws Exception {
  // Set it up so that we know when the SBN checkpoint starts and ends.
  FSImage spyImage1 = NameNodeAdapter.spyOnFsImage(nn1);
  DelayAnswer answerer = new DelayAnswer(LOG);
  Mockito.doAnswer(answerer).when(spyImage1)
      .saveNamespace(Mockito.any(FSNamesystem.class),
          Mockito.any(Canceler.class));

  // Perform some edits and wait for a checkpoint to start on the SBN.
  doEdits(0, 1000);
  nn0.getRpcServer().rollEditLog();
  answerer.waitForCall();
  answerer.proceed();
  assertTrue("SBN is not performing checkpoint but it should be.",
      answerer.getFireCount() == 1 && answerer.getResultCount() == 0);

  // Make sure that the lock has actually been taken by the checkpointing
  // thread.
  ThreadUtil.sleepAtLeastIgnoreInterrupts(1000);
  try {
    // Perform an RPC to the SBN and make sure it throws a StandbyException.
    nn1.getRpcServer().getFileInfo("/");
    fail("Should have thrown StandbyException, but instead succeeded.");
  } catch (StandbyException se) {
    GenericTestUtils.assertExceptionContains("is not supported", se);
  }

  // Make sure that the checkpoint is still going on, implying that the client
  // RPC to the SBN happened during the checkpoint.
  assertTrue("SBN should have still been checkpointing.",
      answerer.getFireCount() == 1 && answerer.getResultCount() == 0);
  answerer.waitForResult();
  assertTrue("SBN should have finished checkpointing.",
      answerer.getFireCount() == 1 && answerer.getResultCount() == 1);
}
/**
 * Ensure that when all configured services are throwing StandbyException
 * that we fail over back and forth between them until one is no longer
 * throwing StandbyException.
 */
@Test
public void testFailoverBetweenMultipleStandbys()
    throws UnreliableException, StandbyException, IOException {
  final long millisToSleep = 10000;

  final UnreliableImplementation impl1 = new UnreliableImplementation("impl1",
      TypeOfExceptionToFailWith.STANDBY_EXCEPTION);
  FlipFlopProxyProvider proxyProvider = new FlipFlopProxyProvider(
      UnreliableInterface.class,
      impl1,
      new UnreliableImplementation("impl2",
          TypeOfExceptionToFailWith.STANDBY_EXCEPTION));
  final UnreliableInterface unreliable = (UnreliableInterface) RetryProxy
      .create(UnreliableInterface.class, proxyProvider,
          RetryPolicies.failoverOnNetworkException(
              RetryPolicies.TRY_ONCE_THEN_FAIL, 10, 1000, 10000));

  new Thread() {
    @Override
    public void run() {
      ThreadUtil.sleepAtLeastIgnoreInterrupts(millisToSleep);
      impl1.setIdentifier("renamed-impl1");
    }
  }.start();

  String result = unreliable.failsIfIdentifierDoesntMatch("renamed-impl1");
  assertEquals("renamed-impl1", result);
}
/**
 * Make sure that clients will receive StandbyExceptions even when a
 * checkpoint is in progress on the SBN, and therefore the StandbyCheckpointer
 * thread will have FSNS lock. Regression test for HDFS-4591.
 */
@Test(timeout=300000)
public void testStandbyExceptionThrownDuringCheckpoint() throws Exception {
  // Set it up so that we know when the SBN checkpoint starts and ends.
  FSImage spyImage1 = NameNodeAdapter.spyOnFsImage(nn1);
  DelayAnswer answerer = new DelayAnswer(LOG);
  Mockito.doAnswer(answerer).when(spyImage1)
      .saveNamespace(Mockito.any(FSNamesystem.class),
          Mockito.eq(NameNodeFile.IMAGE), Mockito.any(Canceler.class));

  // Perform some edits and wait for a checkpoint to start on the SBN.
  doEdits(0, 1000);
  nn0.getRpcServer().rollEditLog();
  answerer.waitForCall();
  assertTrue("SBN is not performing checkpoint but it should be.",
      answerer.getFireCount() == 1 && answerer.getResultCount() == 0);

  // Make sure that the lock has actually been taken by the checkpointing
  // thread.
  ThreadUtil.sleepAtLeastIgnoreInterrupts(1000);
  try {
    // Perform an RPC to the SBN and make sure it throws a StandbyException.
    nn1.getRpcServer().getFileInfo("/");
    fail("Should have thrown StandbyException, but instead succeeded.");
  } catch (StandbyException se) {
    GenericTestUtils.assertExceptionContains("is not supported", se);
  }

  // Make sure that the checkpoint is still going on, implying that the client
  // RPC to the SBN happened during the checkpoint.
  assertTrue("SBN should have still been checkpointing.",
      answerer.getFireCount() == 1 && answerer.getResultCount() == 0);
  answerer.proceed();
  answerer.waitForResult();
  assertTrue("SBN should have finished checkpointing.",
      answerer.getFireCount() == 1 && answerer.getResultCount() == 1);
}
@Test(timeout=60000)
public void testExcludedNodesForgiveness() throws IOException {
  // Forgive nodes in under 2.5s for this test case.
  conf.setLong(
      DFSConfigKeys.DFS_CLIENT_WRITE_EXCLUDE_NODES_CACHE_EXPIRY_INTERVAL,
      2500);
  // We'll be using a 512 bytes block size just for tests
  // so making sure the checksum bytes match it too.
  conf.setInt("io.bytes.per.checksum", 512);
  cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
  List<DataNodeProperties> props = cluster.dataNodes;
  FileSystem fs = cluster.getFileSystem();
  Path filePath = new Path("/testForgivingExcludedNodes");

  // 256 bytes data chunk for writes
  byte[] bytes = new byte[256];
  for (int index = 0; index < bytes.length; index++) {
    bytes[index] = '0';
  }

  // File with a 512 bytes block size
  FSDataOutputStream out = fs.create(filePath, true, 4096, (short) 3, 512);

  // Write a block to all 3 DNs (2x256bytes).
  out.write(bytes);
  out.write(bytes);
  out.hflush();

  // Remove two DNs, to put them into the exclude list.
  DataNodeProperties two = cluster.stopDataNode(2);
  DataNodeProperties one = cluster.stopDataNode(1);

  // Write another block.
  // At this point, we have two nodes already in the excluded list.
  out.write(bytes);
  out.write(bytes);
  out.hflush();

  // Bring back the stopped DNs, since they are going to be forgiven only
  // after this previous block write.
  Assert.assertEquals(true, cluster.restartDataNode(one, true));
  Assert.assertEquals(true, cluster.restartDataNode(two, true));
  cluster.waitActive();

  // Sleep for 5s, to let the excluded nodes be expired
  // from the excludes list (i.e. forgiven after the configured wait period).
  // [Sleeping just in case the restart of the DNs completed in < 5s, because
  // otherwise we'd end up quickly excluding those again.]
  ThreadUtil.sleepAtLeastIgnoreInterrupts(5000);

  // Terminate the last good DN, to assert that there's no
  // single-DN-available scenario, caused by not forgiving the other
  // two by now.
  cluster.stopDataNode(0);

  try {
    // Attempt writing another block, which should still pass
    // because the previous two should have been forgiven by now,
    // while the last good DN is added to excludes this time.
    out.write(bytes);
    out.hflush();
    out.close();
  } catch (Exception e) {
    fail("Excluded DataNodes should be forgiven after a while and "
        + "not cause file writing exception of: '" + e.getMessage() + "'");
  }
}
/**
 * Make sure that clients will receive StandbyExceptions even when a
 * checkpoint is in progress on the SBN, and therefore the StandbyCheckpointer
 * thread will have FSNS lock. Regression test for HDFS-4591.
 */
@Test(timeout=300000)
public void testStandbyExceptionThrownDuringCheckpoint() throws Exception {
  // Set it up so that we know when the SBN checkpoint starts and ends.
  FSImage spyImage1 = NameNodeAdapter.spyOnFsImage(nn1);
  DelayAnswer answerer = new DelayAnswer(LOG);
  Mockito.doAnswer(answerer).when(spyImage1)
      .saveNamespace(Mockito.any(FSNamesystem.class),
          Mockito.eq(NameNodeFile.IMAGE), Mockito.any(Canceler.class));

  // Perform some edits and wait for a checkpoint to start on the SBN.
  doEdits(0, 1000);
  nn0.getRpcServer().rollEditLog();
  answerer.waitForCall();
  assertTrue("SBN is not performing checkpoint but it should be.",
      answerer.getFireCount() == 1 && answerer.getResultCount() == 0);

  // Make sure that the lock has actually been taken by the checkpointing
  // thread.
  ThreadUtil.sleepAtLeastIgnoreInterrupts(1000);
  try {
    // Perform an RPC to the SBN and make sure it throws a StandbyException.
    nn1.getRpcServer().getFileInfo("/");
    fail("Should have thrown StandbyException, but instead succeeded.");
  } catch (StandbyException se) {
    GenericTestUtils.assertExceptionContains("is not supported", se);
  }

  // Make sure new incremental block reports are processed during
  // checkpointing on the SBN.
  assertEquals(0, cluster.getNamesystem(1).getPendingDataNodeMessageCount());
  doCreate();
  Thread.sleep(1000);
  assertTrue(cluster.getNamesystem(1).getPendingDataNodeMessageCount() > 0);

  // Make sure that the checkpoint is still going on, implying that the client
  // RPC to the SBN happened during the checkpoint.
  assertTrue("SBN should have still been checkpointing.",
      answerer.getFireCount() == 1 && answerer.getResultCount() == 0);
  answerer.proceed();
  answerer.waitForResult();
  assertTrue("SBN should have finished checkpointing.",
      answerer.getFireCount() == 1 && answerer.getResultCount() == 1);
}