private void initNodeIds(int nodeNums, int containerNums, Resource resource) { Random portRandom = new Random(); Random ipRandom = new Random(); for (int i = 0; i < nodeNums; i++) { NodeReport nodeReport = Records.newRecord(NodeReport.class); nodeReport.setNumContainers(containerNums); nodeReport.setNodeLabels(new HashSet<>()); nodeReport.setNodeState(NodeState.RUNNING); nodeReport.setCapability(resource); nodeReport.setUsed(Resource.newInstance(0, 0)); int port = 1024 + portRandom.nextInt(65535 - 1024 + 1); StringBuilder hostStr = new StringBuilder(); for (int j = 0; j < 4; j++) { hostStr.append(".").append(ipRandom.nextInt(256)); } NodeId nodeId = NodeId.newInstance(hostStr.substring(1), port); nodeReport.setNodeId(nodeId); nodeReport.setHttpAddress(nodeId.getHost()); nodeReportList.add(nodeReport); } }
public static NodeReport newNodeReport(NodeId nodeId, NodeState nodeState, String httpAddress, String rackName, Resource used, Resource capability, int numContainers, String healthReport, long lastHealthReportTime, Set<String> nodeLabels) { NodeReport nodeReport = recordFactory.newRecordInstance(NodeReport.class); nodeReport.setNodeId(nodeId); nodeReport.setNodeState(nodeState); nodeReport.setHttpAddress(httpAddress); nodeReport.setRackName(rackName); nodeReport.setUsed(used); nodeReport.setCapability(capability); nodeReport.setNumContainers(numContainers); nodeReport.setHealthReport(healthReport); nodeReport.setLastHealthReportTime(lastHealthReportTime); nodeReport.setNodeLabels(nodeLabels); return nodeReport; }
public void handle(RMNodeEvent event) { LOG.debug("Processing " + event.getNodeId() + " of type " + event.getType()); try { writeLock.lock(); NodeState oldState = getState(); try { stateMachine.doTransition(event.getType(), event); } catch (InvalidStateTransitonException e) { LOG.error("Can't handle this event at current state", e); LOG.error("Invalid event " + event.getType() + " on Node " + this.nodeId); } if (oldState != getState()) { LOG.info(nodeId + " Node Transitioned from " + oldState + " to " + getState()); } } finally { writeLock.unlock(); } }
private void updateMetricsForRejoinedNode(NodeState previousNodeState) { ClusterMetrics metrics = ClusterMetrics.getMetrics(); metrics.incrNumActiveNodes(); switch (previousNodeState) { case LOST: metrics.decrNumLostNMs(); break; case REBOOTED: metrics.decrNumRebootedNMs(); break; case DECOMMISSIONED: metrics.decrDecommisionedNMs(); break; case UNHEALTHY: metrics.decrNumUnhealthyNMs(); break; default: LOG.debug("Unexpected previous node state"); } }
@Override public void transition(RMNodeImpl rmNode, RMNodeEvent event) { // Inform the scheduler rmNode.nodeUpdateQueue.clear(); // If the current state is NodeState.UNHEALTHY // Then node is already been removed from the // Scheduler NodeState initialState = rmNode.getState(); if (!initialState.equals(NodeState.UNHEALTHY)) { rmNode.context.getDispatcher().getEventHandler() .handle(new NodeRemovedSchedulerEvent(rmNode)); } rmNode.context.getDispatcher().getEventHandler().handle( new NodesListManagerEvent( NodesListManagerEventType.NODE_UNUSABLE, rmNode)); // Deactivate the node rmNode.context.getRMNodes().remove(rmNode.nodeId); LOG.info("Deactivating Node " + rmNode.nodeId + " as it is now " + finalState); rmNode.context.getInactiveRMNodes().put(rmNode.nodeId.getHost(), rmNode); //Update the metrics rmNode.updateMetricsForDeactivatedNode(initialState, finalState); }
@Override public NodeState transition(RMNodeImpl rmNode, RMNodeEvent event) { RMNodeStatusEvent statusEvent = (RMNodeStatusEvent) event; // Switch the last heartbeatresponse. rmNode.latestNodeHeartBeatResponse = statusEvent.getLatestResponse(); NodeHealthStatus remoteNodeHealthStatus = statusEvent.getNodeHealthStatus(); rmNode.setHealthReport(remoteNodeHealthStatus.getHealthReport()); rmNode.setLastHealthReportTime( remoteNodeHealthStatus.getLastHealthReportTime()); if (remoteNodeHealthStatus.getIsNodeHealthy()) { rmNode.context.getDispatcher().getEventHandler().handle( new NodeAddedSchedulerEvent(rmNode)); rmNode.context.getDispatcher().getEventHandler().handle( new NodesListManagerEvent( NodesListManagerEventType.NODE_USABLE, rmNode)); // ??? how about updating metrics before notifying to ensure that // notifiers get update metadata because they will very likely query it // upon notification // Update metrics rmNode.updateMetricsForRejoinedNode(NodeState.UNHEALTHY); return NodeState.RUNNING; } return NodeState.UNHEALTHY; }
@Override public GetClusterNodesResponse getClusterNodes(GetClusterNodesRequest request) throws YarnException { GetClusterNodesResponse response = recordFactory.newRecordInstance(GetClusterNodesResponse.class); EnumSet<NodeState> nodeStates = request.getNodeStates(); if (nodeStates == null || nodeStates.isEmpty()) { nodeStates = EnumSet.allOf(NodeState.class); } Collection<RMNode> nodes = RMServerUtils.queryRMNodes(rmContext, nodeStates); List<NodeReport> nodeReports = new ArrayList<NodeReport>(nodes.size()); for (RMNode nodeInfo : nodes) { nodeReports.add(createNodeReports(nodeInfo)); } response.setNodeReports(nodeReports); return response; }
@Test public void testNodesQueryNew() throws JSONException, Exception { WebResource r = resource(); MockNM nm1 = rm.registerNode("h1:1234", 5120); MockNM nm2 = rm.registerNode("h2:1235", 5121); rm.sendNodeStarted(nm1); rm.NMwaitForState(nm1.getNodeId(), NodeState.RUNNING); rm.NMwaitForState(nm2.getNodeId(), NodeState.NEW); ClientResponse response = r.path("ws").path("v1").path("cluster") .path("nodes").queryParam("states", NodeState.NEW.toString()) .accept(MediaType.APPLICATION_JSON).get(ClientResponse.class); assertEquals(MediaType.APPLICATION_JSON_TYPE, response.getType()); JSONObject json = response.getEntity(JSONObject.class); assertEquals("incorrect number of elements", 1, json.length()); JSONObject nodes = json.getJSONObject("nodes"); assertEquals("incorrect number of elements", 1, nodes.length()); JSONArray nodeArray = nodes.getJSONArray("node"); assertEquals("incorrect number of elements", 1, nodeArray.length()); JSONObject info = nodeArray.getJSONObject(0); verifyNodeInfo(info, nm2); }
@Test public void testNodesQueryRunning() throws JSONException, Exception { WebResource r = resource(); MockNM nm1 = rm.registerNode("h1:1234", 5120); MockNM nm2 = rm.registerNode("h2:1235", 5121); rm.sendNodeStarted(nm1); rm.NMwaitForState(nm1.getNodeId(), NodeState.RUNNING); rm.NMwaitForState(nm2.getNodeId(), NodeState.NEW); ClientResponse response = r.path("ws").path("v1").path("cluster") .path("nodes").queryParam("states", "running") .accept(MediaType.APPLICATION_JSON).get(ClientResponse.class); assertEquals(MediaType.APPLICATION_JSON_TYPE, response.getType()); JSONObject json = response.getEntity(JSONObject.class); assertEquals("incorrect number of elements", 1, json.length()); JSONObject nodes = json.getJSONObject("nodes"); assertEquals("incorrect number of elements", 1, nodes.length()); JSONArray nodeArray = nodes.getJSONArray("node"); assertEquals("incorrect number of elements", 1, nodeArray.length()); }
@Test public void testNodesQueryHealthyFalse() throws JSONException, Exception { WebResource r = resource(); MockNM nm1 = rm.registerNode("h1:1234", 5120); MockNM nm2 = rm.registerNode("h2:1235", 5121); rm.sendNodeStarted(nm1); rm.NMwaitForState(nm1.getNodeId(), NodeState.RUNNING); rm.NMwaitForState(nm2.getNodeId(), NodeState.NEW); ClientResponse response = r.path("ws").path("v1").path("cluster") .path("nodes").queryParam("states", "UNHEALTHY") .accept(MediaType.APPLICATION_JSON).get(ClientResponse.class); assertEquals(MediaType.APPLICATION_JSON_TYPE, response.getType()); JSONObject json = response.getEntity(JSONObject.class); assertEquals("incorrect number of elements", 1, json.length()); assertEquals("nodes is not null", JSONObject.NULL, json.get("nodes")); }
@Test public void testQueryAll() throws Exception { WebResource r = resource(); MockNM nm1 = rm.registerNode("h1:1234", 5120); MockNM nm2 = rm.registerNode("h2:1235", 5121); MockNM nm3 = rm.registerNode("h3:1236", 5122); rm.sendNodeStarted(nm1); rm.sendNodeStarted(nm3); rm.NMwaitForState(nm1.getNodeId(), NodeState.RUNNING); rm.NMwaitForState(nm2.getNodeId(), NodeState.NEW); rm.sendNodeLost(nm3); ClientResponse response = r.path("ws").path("v1").path("cluster") .path("nodes") .queryParam("states", Joiner.on(',').join(EnumSet.allOf(NodeState.class))) .accept(MediaType.APPLICATION_JSON).get(ClientResponse.class); assertEquals(MediaType.APPLICATION_JSON_TYPE, response.getType()); JSONObject json = response.getEntity(JSONObject.class); JSONObject nodes = json.getJSONObject("nodes"); assertEquals("incorrect number of elements", 1, nodes.length()); JSONArray nodeArray = nodes.getJSONArray("node"); assertEquals("incorrect number of elements", 3, nodeArray.length()); }
public static List<RMNode> newNodes(int racks, int nodesPerRack, Resource perNode) { List<RMNode> list = Lists.newArrayList(); for (int i = 0; i < racks; ++i) { for (int j = 0; j < nodesPerRack; ++j) { if (j == (nodesPerRack - 1)) { // One unhealthy node per rack. list.add(nodeInfo(i, perNode, NodeState.UNHEALTHY)); } if (j == 0) { // One node with label list.add(nodeInfo(i, perNode, NodeState.RUNNING, ImmutableSet.of("x"))); } else { list.add(newNodeInfo(i, perNode)); } } } return list; }
public MockRMNodeImpl(NodeId nodeId, String nodeAddr, String httpAddress, Resource perNode, String rackName, String healthReport, long lastHealthReportTime, int cmdPort, String hostName, NodeState state, Set<String> labels) { this.nodeId = nodeId; this.nodeAddr = nodeAddr; this.httpAddress = httpAddress; this.perNode = perNode; this.rackName = rackName; this.healthReport = healthReport; this.lastHealthReportTime = lastHealthReportTime; this.cmdPort = cmdPort; this.hostName = hostName; this.state = state; this.labels = labels; }
private static RMNode buildRMNode(int rack, final Resource perNode, NodeState state, String httpAddr, int hostnum, String hostName, int port, Set<String> labels) { final String rackName = "rack"+ rack; final int nid = hostnum; final String nodeAddr = hostName + ":" + nid; if (hostName == null) { hostName = "host"+ nid; } final NodeId nodeID = NodeId.newInstance(hostName, port); final String httpAddress = httpAddr; String healthReport = (state == NodeState.UNHEALTHY) ? null : "HealthyMe"; return new MockRMNodeImpl(nodeID, nodeAddr, httpAddress, perNode, rackName, healthReport, 0, nid, hostName, state, labels); }
@Test public void testRunningExpire() { RMNodeImpl node = getRunningNode(); ClusterMetrics cm = ClusterMetrics.getMetrics(); int initialActive = cm.getNumActiveNMs(); int initialLost = cm.getNumLostNMs(); int initialUnhealthy = cm.getUnhealthyNMs(); int initialDecommissioned = cm.getNumDecommisionedNMs(); int initialRebooted = cm.getNumRebootedNMs(); node.handle(new RMNodeEvent(node.getNodeID(), RMNodeEventType.EXPIRE)); Assert.assertEquals("Active Nodes", initialActive - 1, cm.getNumActiveNMs()); Assert.assertEquals("Lost Nodes", initialLost + 1, cm.getNumLostNMs()); Assert.assertEquals("Unhealthy Nodes", initialUnhealthy, cm.getUnhealthyNMs()); Assert.assertEquals("Decommissioned Nodes", initialDecommissioned, cm.getNumDecommisionedNMs()); Assert.assertEquals("Rebooted Nodes", initialRebooted, cm.getNumRebootedNMs()); Assert.assertEquals(NodeState.LOST, node.getState()); }
@Test public void testUnhealthyExpire() { RMNodeImpl node = getUnhealthyNode(); ClusterMetrics cm = ClusterMetrics.getMetrics(); int initialActive = cm.getNumActiveNMs(); int initialLost = cm.getNumLostNMs(); int initialUnhealthy = cm.getUnhealthyNMs(); int initialDecommissioned = cm.getNumDecommisionedNMs(); int initialRebooted = cm.getNumRebootedNMs(); node.handle(new RMNodeEvent(node.getNodeID(), RMNodeEventType.EXPIRE)); Assert.assertEquals("Active Nodes", initialActive, cm.getNumActiveNMs()); Assert.assertEquals("Lost Nodes", initialLost + 1, cm.getNumLostNMs()); Assert.assertEquals("Unhealthy Nodes", initialUnhealthy - 1, cm.getUnhealthyNMs()); Assert.assertEquals("Decommissioned Nodes", initialDecommissioned, cm.getNumDecommisionedNMs()); Assert.assertEquals("Rebooted Nodes", initialRebooted, cm.getNumRebootedNMs()); Assert.assertEquals(NodeState.LOST, node.getState()); }
@Test public void testRunningDecommission() { RMNodeImpl node = getRunningNode(); ClusterMetrics cm = ClusterMetrics.getMetrics(); int initialActive = cm.getNumActiveNMs(); int initialLost = cm.getNumLostNMs(); int initialUnhealthy = cm.getUnhealthyNMs(); int initialDecommissioned = cm.getNumDecommisionedNMs(); int initialRebooted = cm.getNumRebootedNMs(); node.handle(new RMNodeEvent(node.getNodeID(), RMNodeEventType.DECOMMISSION)); Assert.assertEquals("Active Nodes", initialActive - 1, cm.getNumActiveNMs()); Assert.assertEquals("Lost Nodes", initialLost, cm.getNumLostNMs()); Assert.assertEquals("Unhealthy Nodes", initialUnhealthy, cm.getUnhealthyNMs()); Assert.assertEquals("Decommissioned Nodes", initialDecommissioned + 1, cm.getNumDecommisionedNMs()); Assert.assertEquals("Rebooted Nodes", initialRebooted, cm.getNumRebootedNMs()); Assert.assertEquals(NodeState.DECOMMISSIONED, node.getState()); }
@Test public void testUnhealthyDecommission() { RMNodeImpl node = getUnhealthyNode(); ClusterMetrics cm = ClusterMetrics.getMetrics(); int initialActive = cm.getNumActiveNMs(); int initialLost = cm.getNumLostNMs(); int initialUnhealthy = cm.getUnhealthyNMs(); int initialDecommissioned = cm.getNumDecommisionedNMs(); int initialRebooted = cm.getNumRebootedNMs(); node.handle(new RMNodeEvent(node.getNodeID(), RMNodeEventType.DECOMMISSION)); Assert.assertEquals("Active Nodes", initialActive, cm.getNumActiveNMs()); Assert.assertEquals("Lost Nodes", initialLost, cm.getNumLostNMs()); Assert.assertEquals("Unhealthy Nodes", initialUnhealthy - 1, cm.getUnhealthyNMs()); Assert.assertEquals("Decommissioned Nodes", initialDecommissioned + 1, cm.getNumDecommisionedNMs()); Assert.assertEquals("Rebooted Nodes", initialRebooted, cm.getNumRebootedNMs()); Assert.assertEquals(NodeState.DECOMMISSIONED, node.getState()); }
@Test public void testRunningRebooting() { RMNodeImpl node = getRunningNode(); ClusterMetrics cm = ClusterMetrics.getMetrics(); int initialActive = cm.getNumActiveNMs(); int initialLost = cm.getNumLostNMs(); int initialUnhealthy = cm.getUnhealthyNMs(); int initialDecommissioned = cm.getNumDecommisionedNMs(); int initialRebooted = cm.getNumRebootedNMs(); node.handle(new RMNodeEvent(node.getNodeID(), RMNodeEventType.REBOOTING)); Assert.assertEquals("Active Nodes", initialActive - 1, cm.getNumActiveNMs()); Assert.assertEquals("Lost Nodes", initialLost, cm.getNumLostNMs()); Assert.assertEquals("Unhealthy Nodes", initialUnhealthy, cm.getUnhealthyNMs()); Assert.assertEquals("Decommissioned Nodes", initialDecommissioned, cm.getNumDecommisionedNMs()); Assert.assertEquals("Rebooted Nodes", initialRebooted + 1, cm.getNumRebootedNMs()); Assert.assertEquals(NodeState.REBOOTED, node.getState()); }
@Test public void testUnhealthyRebooting() { RMNodeImpl node = getUnhealthyNode(); ClusterMetrics cm = ClusterMetrics.getMetrics(); int initialActive = cm.getNumActiveNMs(); int initialLost = cm.getNumLostNMs(); int initialUnhealthy = cm.getUnhealthyNMs(); int initialDecommissioned = cm.getNumDecommisionedNMs(); int initialRebooted = cm.getNumRebootedNMs(); node.handle(new RMNodeEvent(node.getNodeID(), RMNodeEventType.REBOOTING)); Assert.assertEquals("Active Nodes", initialActive, cm.getNumActiveNMs()); Assert.assertEquals("Lost Nodes", initialLost, cm.getNumLostNMs()); Assert.assertEquals("Unhealthy Nodes", initialUnhealthy - 1, cm.getUnhealthyNMs()); Assert.assertEquals("Decommissioned Nodes", initialDecommissioned, cm.getNumDecommisionedNMs()); Assert.assertEquals("Rebooted Nodes", initialRebooted + 1, cm.getNumRebootedNMs()); Assert.assertEquals(NodeState.REBOOTED, node.getState()); }
@Test public void testAdd() { RMNodeImpl node = getNewNode(); ClusterMetrics cm = ClusterMetrics.getMetrics(); int initialActive = cm.getNumActiveNMs(); int initialLost = cm.getNumLostNMs(); int initialUnhealthy = cm.getUnhealthyNMs(); int initialDecommissioned = cm.getNumDecommisionedNMs(); int initialRebooted = cm.getNumRebootedNMs(); node.handle(new RMNodeStartedEvent(node.getNodeID(), null, null)); Assert.assertEquals("Active Nodes", initialActive + 1, cm.getNumActiveNMs()); Assert.assertEquals("Lost Nodes", initialLost, cm.getNumLostNMs()); Assert.assertEquals("Unhealthy Nodes", initialUnhealthy, cm.getUnhealthyNMs()); Assert.assertEquals("Decommissioned Nodes", initialDecommissioned, cm.getNumDecommisionedNMs()); Assert.assertEquals("Rebooted Nodes", initialRebooted, cm.getNumRebootedNMs()); Assert.assertEquals(NodeState.RUNNING, node.getState()); Assert.assertNotNull(nodesListManagerEvent); Assert.assertEquals(NodesListManagerEventType.NODE_USABLE, nodesListManagerEvent.getType()); }
@Test public void testReconnect() { RMNodeImpl node = getRunningNode(); ClusterMetrics cm = ClusterMetrics.getMetrics(); int initialActive = cm.getNumActiveNMs(); int initialLost = cm.getNumLostNMs(); int initialUnhealthy = cm.getUnhealthyNMs(); int initialDecommissioned = cm.getNumDecommisionedNMs(); int initialRebooted = cm.getNumRebootedNMs(); node.handle(new RMNodeReconnectEvent(node.getNodeID(), node, null, null)); Assert.assertEquals("Active Nodes", initialActive, cm.getNumActiveNMs()); Assert.assertEquals("Lost Nodes", initialLost, cm.getNumLostNMs()); Assert.assertEquals("Unhealthy Nodes", initialUnhealthy, cm.getUnhealthyNMs()); Assert.assertEquals("Decommissioned Nodes", initialDecommissioned, cm.getNumDecommisionedNMs()); Assert.assertEquals("Rebooted Nodes", initialRebooted, cm.getNumRebootedNMs()); Assert.assertEquals(NodeState.RUNNING, node.getState()); Assert.assertNotNull(nodesListManagerEvent); Assert.assertEquals(NodesListManagerEventType.NODE_USABLE, nodesListManagerEvent.getType()); }
@Test public void testResourceUpdateOnRunningNode() { RMNodeImpl node = getRunningNode(); Resource oldCapacity = node.getTotalCapability(); assertEquals("Memory resource is not match.", oldCapacity.getMemory(), 4096); assertEquals("CPU resource is not match.", oldCapacity.getVirtualCores(), 4); node.handle(new RMNodeResourceUpdateEvent(node.getNodeID(), ResourceOption.newInstance(Resource.newInstance(2048, 2, 2), RMNode.OVER_COMMIT_TIMEOUT_MILLIS_DEFAULT))); Resource newCapacity = node.getTotalCapability(); assertEquals("Memory resource is not match.", newCapacity.getMemory(), 2048); assertEquals("CPU resource is not match.", newCapacity.getVirtualCores(), 2); assertEquals("GPU resource is not match.", newCapacity.getGpuCores(), 2); Assert.assertEquals(NodeState.RUNNING, node.getState()); Assert.assertNotNull(nodesListManagerEvent); Assert.assertEquals(NodesListManagerEventType.NODE_USABLE, nodesListManagerEvent.getType()); }
@Test public void testResourceUpdateOnNewNode() { RMNodeImpl node = getNewNode(Resource.newInstance(4096, 4, 4)); Resource oldCapacity = node.getTotalCapability(); assertEquals("Memory resource is not match.", oldCapacity.getMemory(), 4096); assertEquals("CPU resource is not match.", oldCapacity.getVirtualCores(), 4); node.handle(new RMNodeResourceUpdateEvent(node.getNodeID(), ResourceOption.newInstance(Resource.newInstance(2048, 2, 2), RMNode.OVER_COMMIT_TIMEOUT_MILLIS_DEFAULT))); Resource newCapacity = node.getTotalCapability(); assertEquals("Memory resource is not match.", newCapacity.getMemory(), 2048); assertEquals("CPU resource is not match.", newCapacity.getVirtualCores(), 2); assertEquals("GPU resource is not match.", newCapacity.getGpuCores(), 2); Assert.assertEquals(NodeState.NEW, node.getState()); }
@Test public void testResourceUpdateOnRebootedNode() { RMNodeImpl node = getRebootedNode(); Resource oldCapacity = node.getTotalCapability(); assertEquals("Memory resource is not match.", oldCapacity.getMemory(), 4096); assertEquals("CPU resource is not match.", oldCapacity.getVirtualCores(), 4); node.handle(new RMNodeResourceUpdateEvent(node.getNodeID(), ResourceOption.newInstance(Resource.newInstance(2048, 2, 2), RMNode.OVER_COMMIT_TIMEOUT_MILLIS_DEFAULT))); Resource newCapacity = node.getTotalCapability(); assertEquals("Memory resource is not match.", newCapacity.getMemory(), 2048); assertEquals("CPU resource is not match.", newCapacity.getVirtualCores(), 2); assertEquals("GPU resource is not match.", newCapacity.getGpuCores(), 2); Assert.assertEquals(NodeState.REBOOTED, node.getState()); }
private void checkUnealthyNMCount(MockRM rm, MockNM nm1, boolean health, int count) throws Exception { int waitCount = 0; while((rm.getRMContext().getRMNodes().get(nm1.getNodeId()) .getState() != NodeState.UNHEALTHY) == health && waitCount++ < 20) { synchronized (this) { wait(100); } } Assert.assertFalse((rm.getRMContext().getRMNodes().get(nm1.getNodeId()) .getState() != NodeState.UNHEALTHY) == health); Assert.assertEquals("Unhealthy metrics not incremented", count, ClusterMetrics.getMetrics().getUnhealthyNMs()); }
/** * Lists the nodes matching the given node states * * @param nodeStates * @throws YarnException * @throws IOException */ private void listClusterNodes(Set<NodeState> nodeStates) throws YarnException, IOException { PrintWriter writer = new PrintWriter( new OutputStreamWriter(sysout, Charset.forName("UTF-8"))); List<NodeReport> nodesReport = client.getNodeReports( nodeStates.toArray(new NodeState[0])); writer.println("Total Nodes:" + nodesReport.size()); writer.printf(NODES_PATTERN, "Node-Id", "Node-State", "Node-Http-Address", "Number-of-Running-Containers"); for (NodeReport nodeReport : nodesReport) { writer.printf(NODES_PATTERN, nodeReport.getNodeId(), nodeReport .getNodeState(), nodeReport.getHttpAddress(), nodeReport .getNumContainers()); } writer.flush(); }
@Test public void testNodeStatus() throws Exception { NodeId nodeId = NodeId.newInstance("host0", 0); NodeCLI cli = new NodeCLI(); when(client.getNodeReports()).thenReturn( getNodeReports(3, NodeState.RUNNING, false)); cli.setClient(client); cli.setSysOutPrintStream(sysOut); cli.setSysErrPrintStream(sysErr); int result = cli.run(new String[] { "-status", nodeId.toString() }); assertEquals(0, result); verify(client).getNodeReports(); ByteArrayOutputStream baos = new ByteArrayOutputStream(); PrintWriter pw = new PrintWriter(baos); pw.println("Node Report : "); pw.println("\tNode-Id : host0:0"); pw.println("\tRack : rack1"); pw.println("\tNode-State : RUNNING"); pw.println("\tNode-Http-Address : host1:8888"); pw.println("\tLast-Health-Update : " + DateFormatUtils.format(new Date(0), "E dd/MMM/yy hh:mm:ss:SSzz")); pw.println("\tHealth-Report : "); pw.println("\tContainers : 0"); pw.println("\tMemory-Used : 0MB"); pw.println("\tMemory-Capacity : 0MB"); pw.println("\tCPU-Used : 0 vcores"); pw.println("\tCPU-Capacity : 0 vcores"); pw.println("\tNode-Labels : a,b,c,x,y,z"); pw.close(); String nodeStatusStr = baos.toString("UTF-8"); verify(sysOut, times(1)).println(isA(String.class)); verify(sysOut).println(nodeStatusStr); }
@Test public void testAbsentNodeStatus() throws Exception { NodeId nodeId = NodeId.newInstance("Absenthost0", 0); NodeCLI cli = new NodeCLI(); when(client.getNodeReports()).thenReturn( getNodeReports(0, NodeState.RUNNING)); cli.setClient(client); cli.setSysOutPrintStream(sysOut); cli.setSysErrPrintStream(sysErr); int result = cli.run(new String[] { "-status", nodeId.toString() }); assertEquals(0, result); verify(client).getNodeReports(); verify(sysOut, times(1)).println(isA(String.class)); verify(sysOut).println( "Could not find the node report for node id : " + nodeId.toString()); }
private List<NodeReport> getNodeReports(int noOfNodes, NodeState state, boolean emptyNodeLabel) { List<NodeReport> nodeReports = new ArrayList<NodeReport>(); for (int i = 0; i < noOfNodes; i++) { Set<String> nodeLabels = null; if (!emptyNodeLabel) { // node labels is not ordered, but when we output it, it should be // ordered nodeLabels = ImmutableSet.of("c", "b", "a", "x", "z", "y"); } NodeReport nodeReport = NodeReport.newInstance(NodeId .newInstance("host" + i, 0), state, "host" + 1 + ":8888", "rack1", Records.newRecord(Resource.class), Records .newRecord(Resource.class), 0, "", 0, nodeLabels); nodeReports.add(nodeReport); } return nodeReports; }
private void actOnUnusableNode(NodeId nodeId, NodeState nodeState) { // rerun previously successful map tasks List<TaskAttemptId> taskAttemptIdList = nodesToSucceededTaskAttempts.get(nodeId); if(taskAttemptIdList != null) { String mesg = "TaskAttempt killed because it ran on unusable node " + nodeId; for(TaskAttemptId id : taskAttemptIdList) { if(TaskType.MAP == id.getTaskId().getTaskType()) { // reschedule only map tasks because their outputs maybe unusable LOG.info(mesg + ". AttemptId:" + id); eventHandler.handle(new TaskAttemptKillEvent(id, mesg)); } } } // currently running task attempts on unusable nodes are handled in // RMContainerAllocator }
private void waitForNodesRunning() throws InterruptedException { long startTimeMS = System.currentTimeMillis(); while (true) { int numRunningNodes = 0; for (RMNode node : rm.getRMContext().getRMNodes().values()) { if (node.getState() == NodeState.RUNNING) { numRunningNodes ++; } } if (numRunningNodes == numNMs) { break; } LOG.info(MessageFormat.format("SLSRunner is waiting for all " + "nodes RUNNING. {0} of {1} NMs initialized.", numRunningNodes, numNMs)); Thread.sleep(1000); } LOG.info(MessageFormat.format("SLSRunner takes {0} ms to launch all nodes.", (System.currentTimeMillis() - startTimeMS))); }
@Override public List<NodeReport> getClusterNodes(EnumSet<NodeState> states) throws IOException { String sinfoCmd = conf.get( HPCConfiguration.YARN_APPLICATION_HPC_COMMAND_SLURM_SINFO, HPCConfiguration.DEFAULT_YARN_APPLICATION_HPC_COMMAND_SLURM_SINFO); String result = Shell.execCommand(sinfoCmd, "-h", "-o %N"); List<NodeReport> reports = new ArrayList<NodeReport>(); for (String node : HPCUtils.parseHostList(result)) { NodeReport nodeReport = NodeReport.newInstance( NodeId.newInstance(node, 0), NodeState.RUNNING, "", "", Resource.newInstance(0, 0), Resource.newInstance(1024, 2), 0, "", 0); reports.add(nodeReport); } return reports; }
private ClusterResourceDescription getCurrentFreeClusterResources(YarnClient yarnClient) throws YarnException, IOException { List<NodeReport> nodes = yarnClient.getNodeReports(NodeState.RUNNING); int totalFreeMemory = 0; int containerLimit = 0; int[] nodeManagersFree = new int[nodes.size()]; for (int i = 0; i < nodes.size(); i++) { NodeReport rep = nodes.get(i); int free = rep.getCapability().getMemory() - (rep.getUsed() != null ? rep.getUsed().getMemory() : 0); nodeManagersFree[i] = free; totalFreeMemory += free; if (free > containerLimit) { containerLimit = free; } } return new ClusterResourceDescription(totalFreeMemory, containerLimit, nodeManagersFree); }
public void handle(RMNodeEvent event) { LOG.debug("Processing " + event.getNodeId() + " of type " + event.getType()); try { writeLock.lock(); NodeState oldState = getState(); try { stateMachine.doTransition(event.getType(), event); } catch (InvalidStateTransitionException e) { LOG.error("Can't handle this event at current state", e); LOG.error("Invalid event " + event.getType() + " on Node " + this.nodeId); } if (oldState != getState()) { LOG.info(nodeId + " Node Transitioned from " + oldState + " to " + getState()); } } finally { writeLock.unlock(); } }
public FakeRMNodeImpl(NodeId nodeId, String nodeAddr, String httpAddress, Resource perNode, String rackName, String healthReport, int cmdPort, String hostName, NodeState state) { this.nodeId = nodeId; this.nodeAddr = nodeAddr; this.httpAddress = httpAddress; this.perNode = perNode; this.rackName = rackName; this.healthReport = healthReport; this.cmdPort = cmdPort; this.hostName = hostName; this.state = state; toCleanUpApplications = new ArrayList<ApplicationId>(); toCleanUpContainers = new ArrayList<ContainerId>(); runningApplications = new ArrayList<ApplicationId>(); }
/** * Report node is UNUSABLE and update metrics. * @param rmNode * @param finalState */ public static void reportNodeUnusable(RMNodeImpl rmNode, NodeState finalState) { // Inform the scheduler rmNode.nodeUpdateQueue.clear(); // If the current state is NodeState.UNHEALTHY // Then node is already been removed from the // Scheduler NodeState initialState = rmNode.getState(); if (!initialState.equals(NodeState.UNHEALTHY)) { rmNode.context.getDispatcher().getEventHandler() .handle(new NodeRemovedSchedulerEvent(rmNode)); } rmNode.context.getDispatcher().getEventHandler().handle( new NodesListManagerEvent( NodesListManagerEventType.NODE_UNUSABLE, rmNode)); //Update the metrics rmNode.updateMetricsForDeactivatedNode(initialState, finalState); }