@Override public void head(Node node, int depth) { String name = node.nodeName(); if (node instanceof TextNode) { append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM. } else if (name.equals("ul")) { listNesting++; } else if (name.equals("li")) { append("\n "); for (int i = 1; i < listNesting; i++) { append(" "); } if (listNesting == 1) { append("* "); } else { append("- "); } } else if (name.equals("dt")) { append(" "); } else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr")) { append("\n"); } }
/**
 * Records, as the traversal leaves a node, whether that node produced a line break.
 * <p>
 * Once a signature has been found nothing is tracked any more. Otherwise a closing
 * {@code BR} or {@code P} element sets {@code lastElementCausedLineBreak}; a {@code BR}
 * element is additionally remembered in {@code brElementPrecedingDashes}, while a
 * {@code P} element clears that reference. Any other node resets the line-break flag.
 *
 * @param node  the node being left
 * @param depth the depth of the node in the tree; not used by this method
 * @return always {@link TailFilterDecision#CONTINUE}
 */
@NonNull
@Override
public TailFilterDecision tail(Node node, int depth) {
    if (signatureFound) {
        return TailFilterDecision.CONTINUE;
    }

    if (!(node instanceof Element)) {
        lastElementCausedLineBreak = false;
        return TailFilterDecision.CONTINUE;
    }

    Element element = (Element) node;
    boolean isBr = element.tag().equals(BR);
    boolean isParagraph = !isBr && element.tag().equals(P);

    if (isBr) {
        lastElementCausedLineBreak = true;
        brElementPrecedingDashes = element;
    } else if (isParagraph) {
        lastElementCausedLineBreak = true;
        brElementPrecedingDashes = null;
    } else {
        lastElementCausedLineBreak = false;
    }

    return TailFilterDecision.CONTINUE;
}
private String gatherWuBi(Element tagContentEL) { Elements spans = tagContentEL.select("span.diczx7"); for (Element span : spans) { if (span.text().equals("五笔:")) { // 后一个兄弟文本节点 Node textNode = span.nextSibling(); if (textNode instanceof TextNode) { String wubi=((TextNode) textNode).text(); //去掉特殊字符 wubi=wubi.replaceAll("\\W", ""); return wubi; } } } return null; }
/**
 * Recursively appends the text of {@code e}'s children to {@code accum}, skipping every
 * child for which {@code unlikely(child)} is true.
 * <p>
 * Before descending into a child element a single space is appended when either the
 * accumulator is non-empty, the element is a block and the accumulator does not already
 * end in whitespace, or the element is a {@code br}.
 *
 * @param e      the element whose children are processed
 * @param accum  the buffer receiving the text
 * @param indent the recursion depth, incremented on each nested call
 */
private void appendTextSkipHidden(Element e, StringBuilder accum, int indent) {
    for (Node child : e.childNodes()) {
        if (unlikely(child)) {
            continue;
        }

        if (child instanceof TextNode) {
            accum.append(((TextNode) child).text());
            continue;
        }

        if (!(child instanceof Element)) {
            continue;
        }

        Element nested = (Element) child;
        boolean blockNeedsSeparator =
                accum.length() > 0 && nested.isBlock() && !lastCharIsWhitespace(accum);
        if (blockNeedsSeparator || nested.tagName().equals("br")) {
            accum.append(' ');
        }
        appendTextSkipHidden(nested, accum, indent + 1);
    }
}
/**
 * Walks {@code root} and all of its descendants depth-first without recursion.
 * <p>
 * {@code visitor.head} is called when a node is first reached and {@code visitor.tail}
 * once all of its descendants have been visited. The walk stops after the tail call for
 * {@code root}; siblings of {@code root} are not visited.
 *
 * @param root the node at which the traversal starts; nothing happens when it is null
 */
public void traverse(Node root) {
    Node current = root;
    int level = 0;

    while (current != null) {
        visitor.head(current, level);

        if (current.childNodeSize() > 0) {
            // Descend to the first child before doing anything else.
            current = current.childNode(0);
            level++;
            continue;
        }

        // Leaf reached: close every node that has no further sibling, climbing upwards.
        while (current.nextSibling() == null && level > 0) {
            visitor.tail(current, level);
            current = current.parentNode();
            level--;
        }

        visitor.tail(current, level);
        if (current == root) {
            break;
        }
        current = current.nextSibling();
    }
}
/**
 * Returns the first child of {@code parent} that is considered non-empty.
 * <p>
 * The first child is returned directly when it is an element, or when it is a text node
 * whose text is not blank after non-breaking spaces are replaced by regular spaces. In
 * every other case the lookup is delegated to {@code getNextNonEmptyNode} starting from
 * the first child.
 *
 * @param parent the element to inspect, may be null
 * @return the first non-empty child, or {@code null} when {@code parent} is null or has
 *         no child nodes
 */
public Node getFirstNonEmptyNodeChild(Element parent) {
    if (parent == null || parent.childNodeSize() == 0) {
        return null;
    }

    Node first = parent.childNode(0);
    if (first instanceof Element) {
        return first;
    }

    if (first instanceof TextNode) {
        String visibleText = ((TextNode) first).text().replaceAll("\u00A0", " ").trim();
        if (visibleText.length() > 0) {
            return first;
        }
    }

    return getNextNonEmptyNode(first);
}
/**
 * Collects the direct children of {@code node} that match {@code withName}.
 * <p>
 * When {@code withName} equals {@code Selector.UNIVERSAL_TAG} the complete child list is
 * passed through {@code filter}; otherwise only the children accepted by
 * {@code nameMatches} are returned.
 *
 * @param node     the parent node, may be null
 * @param withName the name to match, or the universal tag
 * @return the matching children; an empty list when {@code node} is null
 */
@SuppressWarnings("unchecked")
protected List<? extends Node> getChildNodes(final Node node, final String withName) {
    if (node == null) {
        return Collections.<Node>emptyList();
    }

    final List<Node> children = node.childNodes();
    if (Selector.UNIVERSAL_TAG.equals(withName)) {
        return filter(children);
    }

    final ArrayList<Node> matches = new ArrayList<Node>();
    for (final Node candidate : children) {
        if (nameMatches(candidate, withName)) {
            matches.add(candidate);
        }
    }
    return matches;
}
/**
 * Builds a dot-separated path of node names from the outermost visited ancestor down to
 * {@code n}.
 * <p>
 * On every step a text node is replaced by its parent, and a node for which
 * {@code NodeHelper.isInnerText} is true is replaced by its parent as well, before the
 * node name is prepended. Each name is followed by a {@code "."}, so a non-empty result
 * always ends with a dot. The walk stops after a node named {@code html} (case-insensitive)
 * or when there is no further parent.
 *
 * @param n the node to start from, may be null
 * @return the path, or an empty string when no node name could be collected
 */
public String getPath(Node n) {
    String nodePath = "";
    while (n != null) {
        if (n instanceof TextNode) {
            n = n.parent();
            if (n == null) {
                // Detached text node: there is no element left to name.
                break;
            }
        }
        if (NodeHelper.isInnerText(n)) {
            n = n.parent();
            if (n == null) {
                // The skipped node was the top of a detached tree.
                break;
            }
        }

        String nodeName = n.nodeName();
        nodePath = nodeName + "." + nodePath;

        if (nodeName.equalsIgnoreCase("html")) {
            break;
        }
        n = n.parent();
    }
    return nodePath;
}
private void mergeToResult(Node node) { Node lastAddedNode = getLastAddedNode(); //the <br><br> is a paragraph separator if (lastAddedNode != null && node.nodeName().equalsIgnoreCase("br") && lastAddedNode .nodeName().equalsIgnoreCase("br")) { insertAsNewParagraph(node); return; } if (lastAddedNode == null) { insertAsNewParagraph(node); return; } AncestorState ancestorState = getAncestorState(lastAddedNode, node); switch (ancestorState) { case BLOCKLEVEL: insertAsNewParagraph(node); return; case INNERTEXT_ONLY: appendToLastParagraph(node); return; } }
/**
 * Classifies the ancestors that lie between two nodes and their nearest common ancestor.
 * <ul>
 * <li>{@link ParagraphsExplorer.AncestorState#BLOCKLEVEL} when the branch of either node
 * is reported as block level; the branch of {@code currentNode} is only examined when
 * the branch of {@code lastNode} is not block level;</li>
 * <li>{@link ParagraphsExplorer.AncestorState#INNERTEXT_ONLY} when both branches are
 * reported as inner text only;</li>
 * <li>{@link ParagraphsExplorer.AncestorState#UNKNOW} otherwise.</li>
 * </ul>
 *
 * @param lastNode    the previously handled node
 * @param currentNode the node being handled now
 * @return the combined state of both branches
 * @throws InvalidParameterException when either argument is null
 */
public static AncestorState getAncestorState(Node lastNode, Node currentNode) {
    if (lastNode == null || currentNode == null) {
        throw new InvalidParameterException();
    }

    Node commonAncestor = NodeHelper.nearestCommonAncestor(lastNode, currentNode);

    AncestorState lastBranch = getAncestorStateOfBranch(commonAncestor, lastNode);
    if (lastBranch == AncestorState.BLOCKLEVEL) {
        return AncestorState.BLOCKLEVEL;
    }

    AncestorState currentBranch = getAncestorStateOfBranch(commonAncestor, currentNode);
    if (currentBranch == AncestorState.BLOCKLEVEL) {
        return AncestorState.BLOCKLEVEL;
    }

    boolean bothInnerText = lastBranch == AncestorState.INNERTEXT_ONLY
            && currentBranch == AncestorState.INNERTEXT_ONLY;
    return bothInnerText ? AncestorState.INNERTEXT_ONLY : AncestorState.UNKNOW;
}
/**
 * Tells whether {@code node1} is {@code node2} itself or one of its ancestors.
 * <p>
 * Identity comparison is used throughout; two null arguments are therefore reported as
 * related.
 *
 * @param node1 the candidate ancestor
 * @param node2 the node whose parent chain is walked
 * @return {@code true} when {@code node1 == node2} or {@code node1} is found on the
 *         parent chain of {@code node2}
 */
public static boolean isAncestor(Node node1, Node node2) {
    if (node1 == node2) {
        return true;
    }
    for (Node cursor = node2; cursor != null; cursor = cursor.parent()) {
        if (cursor == node1) {
            return true;
        }
    }
    return false;
}
public void initRawInfo() { StringBuilder sb = new StringBuilder(); for (Node n : this) { // NodeHelper.cleanEmptyElements(n); if (n instanceof TextNode) { this.setTagName(getPath(n)); String nodeRawText = ((TextNode) n).text(); sb.append(Utils.normalizeBreaks(nodeRawText).trim()); if (NodeHelper.isLink(n)) { charsCountInLinks += nodeRawText.length(); } } } rawText = sb.toString(); }
private void mergeToResult(Node node) { Node lastAddedNode = getLastAddedNode(); //the <br><br> is a paragraph separator if (lastAddedNode != null && node.nodeName().equalsIgnoreCase("br") && lastAddedNode .nodeName().equalsIgnoreCase("br")) { insertAsNewParagraph(node); return; } if (lastAddedNode == null) { insertAsNewParagraph(node); return; } AncestorState ancestorState = getAncestorState(lastAddedNode, node); switch (ancestorState) { case BLOCKLEVEL: insertAsNewParagraph(node); return; case INNERTEXT_ONLY: appendToLastParagraph(node); } }
/**
 * Checks that the three spans marked as not translatable (by {@code translate="no"},
 * by the {@code notranslate} class and by the {@code private-notes-testing} id) are moved
 * into the placeholder map and removed from the HTML, while the remaining nodes stay in
 * the HTML.
 */
@Test
public void replaceNonTranslatableNode() {
    String translateNoSpan = "<span translate=\"no\">do not translate</span>";
    String emphasis = "<em>translate</em>";
    String notranslateClassSpan = "<span class=\"notranslate\">do not translate</span>";
    String privateNotesSpan = "<span id=\"private-notes-testing\">do not translate</span>";
    String plainSpan = "<span>translate this</span>";
    String paragraph = "<p>translate this</p>";

    String html = "<div>"
            + translateNoSpan
            + emphasis
            + notranslateClassSpan
            + privateNotesSpan
            + plainSpan
            + paragraph
            + "</div>";

    TranslatableHTMLNode result = ArticleUtil.replaceNonTranslatableNode(1, html);

    // Exactly the three protected spans end up behind placeholders.
    assertThat(result.getPlaceholderIdMap()).hasSize(3);
    assertThat(result.getPlaceholderIdMap().values())
            .extracting(Node::toString)
            .contains(translateNoSpan, notranslateClassSpan, privateNotesSpan)
            .doesNotContain(plainSpan, paragraph);

    // The remaining HTML keeps only the translatable nodes.
    assertThat(result.getHtml())
            .doesNotContain(translateNoSpan)
            .doesNotContain(notranslateClassSpan)
            .doesNotContain(privateNotesSpan)
            .contains(emphasis, plainSpan, paragraph);
}
/**
 * Checks that every placeholder in the HTML is replaced by the original node registered
 * under the same id.
 */
@Test
public void replacePlaceholderWithNode() {
    Map<String, Node> originalsById = new HashMap<>();
    StringBuilder html = new StringBuilder("<div>");

    for (int index = 0; index < 5; index++) {
        String id = "id" + index;

        Attributes attributes = new Attributes();
        attributes.put("id", id);
        Element original = new Element(Tag.valueOf("span"), "", attributes);
        original.append("The original node");
        originalsById.put(id, original);

        html.append(ArticleUtil.generatePlaceholderNode(id).outerHtml());
    }
    html.append("</div>");

    String results = ArticleUtil.replacePlaceholderWithNode(originalsById, html.toString());

    for (Node original : originalsById.values()) {
        assertThat(results).contains(original.outerHtml());
    }
}
/**
 * Creates an {@code HTMLAnnotation} for an element as the traversal leaves it.
 * <p>
 * Only nodes whose class is exactly {@code Element} are handled. The annotation receives
 * the element's tag name, id and CSS selector; its class value is the element's class
 * name or, when that is empty, the value of the {@code type} attribute. The annotation is
 * stored in {@code annotationMap} under the CSS selector. A line break is added to the
 * builder after block elements and after tags listed in {@code blockElements}.
 *
 * @param node  the node being left
 * @param depth the depth of the node in the tree; not used by this method
 */
public void tail(Node node, int depth) {
    if (!node.getClass().equals(Element.class)) {
        return;
    }

    Element element = (Element) node;
    // NOTE(review): beginMap presumably holds the position recorded when the node was
    // entered - confirm against the matching head() implementation.
    HTMLAnnotation annotation = builder.add(beginMap.get(node), HTMLAnnotation.class);

    annotation.setTag(element.tagName());
    annotation.setId(element.id());

    String selector = element.cssSelector();
    annotation.setSelector(selector);

    String classNames = element.className();
    annotation.setCls(classNames.isEmpty() ? element.attr("type") : classNames);

    annotationMap.put(selector, annotation);

    boolean endsLine = element.isBlock() || ArrayUtils.contains(blockElements, element.tagName());
    if (endsLine) {
        builder.add("\n");
    }
}
/**
 * Creates an {@code HTMLAnnotation} for an element as the traversal leaves it.
 * <p>
 * Only nodes whose class is exactly {@code Element} are handled. The annotation receives
 * the element's tag name and id; its class value is the element's class name or, when
 * that is empty, the value of the {@code type} attribute. The annotation is stored in
 * {@code annotationMap} under the element's CSS selector. A line break is added to the
 * builder after block elements and after tags listed in {@code blockElements}.
 *
 * @param node  the node being left
 * @param depth the depth of the node in the tree; not used by this method
 */
@Override
public void tail(Node node, int depth) {
    if (!node.getClass().equals(Element.class)) {
        return;
    }

    Element element = (Element) node;
    // NOTE(review): beginMap presumably holds the position recorded when the node was
    // entered - confirm against the matching head() implementation.
    HTMLAnnotation annotation = builder.add(beginMap.get(node), HTMLAnnotation.class);

    annotation.setTag(element.tagName());
    annotation.setId(element.id());

    String classNames = element.className();
    annotation.setCls(classNames.isEmpty() ? element.attr("type") : classNames);

    annotationMap.put(element.cssSelector(), annotation);

    boolean endsLine = element.isBlock() || ArrayUtils.contains(blockElements, element.tagName());
    if (endsLine) {
        builder.add("\n");
    }
}
public void head(Node node, int depth) { String name = node.nodeName(); if (node instanceof TextNode) append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM. else if (name.equals("li")) append("\n * "); else if (name.equals("dt")) append(" "); else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr")) append("\n"); }
/**
 * Emits plain-text output for a node as the traversal leaves it.
 * <p>
 * A line break is written after {@code br}, {@code dd}, {@code dt}, {@code p} and
 * {@code h1}-{@code h5} elements. After an {@code a} element the absolute form of its
 * {@code href} is written in angle brackets, preceded by a space.
 *
 * @param node  the node being left
 * @param depth the depth of the node in the tree; not used by this method
 */
public void tail(Node node, int depth) {
    String name = node.nodeName();

    boolean endsLine = StringUtil.in(name, "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5");
    if (endsLine) {
        append("\n");
        return;
    }

    if (name.equals("a")) {
        append(" <" + node.absUrl("href") + ">");
    }
}
/**
 * Decides, as the traversal enters a node, whether the node starts a dash signature.
 * <p>
 * After a signature has been found every node is removed. Entering an element clears the
 * line-break flag, and {@code BLOCKQUOTE} elements are skipped entirely. A text node is
 * treated as the start of a signature when the previous element caused a line break, the
 * node's whole text matches {@code DASH_SIGNATURE_HTML} and its next sibling is a
 * {@code BR} element; in that case the remembered {@code BR} preceding the dashes (if
 * any) is removed from the document as well.
 *
 * @param node  the node being entered
 * @param depth the depth of the node in the tree; not used by this method
 * @return {@code REMOVE} for signature content, {@code SKIP_ENTIRELY} for block quotes,
 *         {@code CONTINUE} otherwise
 */
@NonNull
@Override
public HeadFilterDecision head(Node node, int depth) {
    if (signatureFound) {
        return HeadFilterDecision.REMOVE;
    }

    if (node instanceof Element) {
        lastElementCausedLineBreak = false;
        boolean isBlockquote = ((Element) node).tag().equals(BLOCKQUOTE);
        return isBlockquote ? HeadFilterDecision.SKIP_ENTIRELY : HeadFilterDecision.CONTINUE;
    }

    if (!(node instanceof TextNode) || !lastElementCausedLineBreak) {
        return HeadFilterDecision.CONTINUE;
    }

    String wholeText = ((TextNode) node).getWholeText();
    if (!DASH_SIGNATURE_HTML.matcher(wholeText).matches()) {
        return HeadFilterDecision.CONTINUE;
    }

    Node following = node.nextSibling();
    boolean followedByBr = following instanceof Element && ((Element) following).tag().equals(BR);
    if (!followedByBr) {
        return HeadFilterDecision.CONTINUE;
    }

    signatureFound = true;
    if (brElementPrecedingDashes != null) {
        brElementPrecedingDashes.remove();
        brElementPrecedingDashes = null;
    }
    return HeadFilterDecision.REMOVE;
}
/**
 * Copies a node into the destination tree as the traversal enters it, keeping only
 * content below safe tags.
 * <p>
 * Nothing is copied while {@code skipChildren} is set. A safe element is re-created with
 * a clone of its attributes, appended to the current destination and becomes the new
 * destination. An unsafe element other than the root sets {@code skipChildren}. Text
 * nodes are copied with their whole text; data nodes are copied only when their parent is
 * a safe tag.
 *
 * @param source the node being entered in the source tree
 * @param depth  the depth of the node in the tree; not used by this method
 */
public void head(Node source, int depth) {
    if (skipChildren) {
        return;
    }

    if (source instanceof Element) {
        Element sourceElement = (Element) source;
        if (!isSafeTag(sourceElement)) {
            if (source != root) {
                skipChildren = true;
            }
            return;
        }

        Element copy = new Element(
                Tag.valueOf(sourceElement.tagName()),
                sourceElement.baseUri(),
                sourceElement.attributes().clone());
        destination.appendChild(copy);
        destination = copy;
        return;
    }

    if (source instanceof TextNode) {
        String wholeText = ((TextNode) source).getWholeText();
        destination.appendChild(new TextNode(wholeText, source.baseUri()));
        return;
    }

    if (source instanceof DataNode && isSafeTag(source.parent())) {
        String wholeData = ((DataNode) source).getWholeData();
        destination.appendChild(new DataNode(wholeData, source.baseUri()));
    }
}
/**
 * Tells whether a node may be kept.
 * <p>
 * A meta-refresh node is never safe; any other node is safe when its lower-cased node
 * name is contained in {@code ALLOWED_TAGS}.
 *
 * @param node the node to check
 * @return {@code true} when the node is allowed
 */
private boolean isSafeTag(Node node) {
    return !isMetaRefresh(node)
            && ALLOWED_TAGS.contains(node.nodeName().toLowerCase(Locale.ROOT));
}
/**
 * Tells whether a node is a {@code meta} element whose {@code http-equiv} attribute,
 * trimmed and compared case-insensitively, equals {@code refresh}.
 *
 * @param node the node to check
 * @return {@code true} for a meta-refresh node
 */
private boolean isMetaRefresh(Node node) {
    boolean isMeta = "meta".equalsIgnoreCase(node.nodeName());
    if (!isMeta) {
        return false;
    }
    String httpEquiv = node.attributes().getIgnoreCase("http-equiv").trim();
    return "refresh".equalsIgnoreCase(httpEquiv);
}
private String gatherBiHua(Element tagContentEL) { Elements spans = tagContentEL.select("span.diczx6"); for (Element span : spans) { if (span.text().equals("笔顺编号:")) { // 后一个兄弟文本节点 Node textNode = span.nextSibling(); if (textNode instanceof TextNode) { return ((TextNode)textNode).getWholeText(); } } } return null; }
/**
 * Groups the paragraphs of {@code contentEL} into {@code DuYinDM} entries.
 * <p>
 * A paragraph whose first child is a {@code span} with class {@code dicpy} starts a new
 * entry whose duyin is the span's text. A paragraph whose first child is an {@code em}
 * adds one ziyi to the current entry, built from the text of every sibling that follows
 * the {@code em}. Paragraphs without child elements are ignored.
 *
 * @param contentEL the element whose {@code p} descendants are processed
 * @return the collected entries in document order
 * @throws Exception declared by the signature; nothing in this method throws it explicitly
 */
private List<DuYinDM> gatherDuyins(Element contentEL) throws Exception {
    List<DuYinDM> results = new ArrayList<DuYinDM>(3);
    DuYinDM current = null;

    for (Element paragraph : contentEL.select("p")) {
        if (paragraph.children().isEmpty()) {
            continue;
        }

        Element lead = paragraph.child(0);
        String leadTag = lead.tagName();

        if ("span".equals(leadTag)) {
            if (!lead.hasClass("dicpy")) {
                continue;
            }
            // A new entry begins: store the one collected so far.
            if (current != null) {
                results.add(current);
            }
            current = new DuYinDM();
            current.setDuyin(lead.text());
        } else if ("em".equals(leadTag)) {
            StringBuilder meaning = new StringBuilder();
            for (Node sibling = lead.nextSibling(); sibling != null; sibling = sibling.nextSibling()) {
                if (sibling instanceof TextNode) {
                    meaning.append(((TextNode) sibling).text());
                } else if (sibling instanceof Element) {
                    meaning.append(((Element) sibling).text());
                }
            }
            // NOTE(review): current is still null when an <em> paragraph appears before
            // any span.dicpy paragraph, which makes this call throw - confirm whether
            // that input can occur.
            current.addZiyi(meaning.toString());
        }
    }

    if (current != null) {
        results.add(current);
    }
    return results;
}
/**
 * Downloads the category overview page and converts every entry of the first
 * {@code catelist} element into a {@code FlowerCategory}.
 * <p>
 * For each child of the list that has child nodes of its own, the {@code href} of an
 * {@code a} child becomes the category URL, the {@code src} of that link's second child
 * node becomes the image path, and the {@code title} of an {@code h2} child becomes the
 * name.
 *
 * @return the categories found; an empty list when the page has no {@code catelist}
 *         element or cannot be fetched (the I/O error is printed, not rethrown)
 */
private static List<FlowerCategory> getCategoryList() {
    List<FlowerCategory> categories = new ArrayList<FlowerCategory>();
    try {
        Document doc = Jsoup.connect("http://www.aihuhua.com/baike/").get();
        Element cates = doc.getElementsByClass("catelist").first();
        if (cates == null) {
            // No category list on the page: first() yields null for an empty selection.
            return categories;
        }

        for (Node node : cates.childNodes()) {
            List<Node> childs = node.childNodes();
            if (childs == null || childs.isEmpty()) {
                continue;
            }

            FlowerCategory category = new FlowerCategory();
            for (Node child : childs) {
                if ("a".equals(child.nodeName())) {
                    category.setUrl(child.attr("href"));
                    // The image is read from the link's second child node; skip it when
                    // the link has fewer children instead of failing on the index.
                    if (child.childNodeSize() > 1) {
                        category.setImgPath(child.childNode(1).attr("src"));
                    }
                } else if ("h2".equals(child.nodeName())) {
                    category.setName(child.attr("title"));
                }
            }
            categories.add(category);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return categories;
}
private void print2(String baseLocation) throws IOException, TransformerException, ParserConfigurationException { Document document = Jsoup.connect(baseLocation).get(); Elements content = document.getElementsByAttributeValue("class", "entry-content"); String title = null; ArrayList<String> list = new ArrayList<>(); for (Element div : content) { List<Node> nodes = div.childNodes(); for (Node node : nodes) { if (node instanceof Element) { if (((Element) node).tagName().equals("h3")) { writeFile(title, list); list.clear(); System.out.println("Title: " + node.childNode(0)); title = node.childNode(0).toString(); } else if (((Element) node).tagName().equals("table")) { //print table Elements tr = ((Element) node).getElementsByTag("tr"); for (Element element : tr) { Elements td = element.getElementsByTag("td"); for (Element value : td) { if (value.childNodeSize() > 0) { if (!(value.childNode(0) instanceof Comment)) { // System.out.println("Emoticon: " + value.childNode(0) + " " + value.childNode(0).getClass().getSimpleName()); list.add(value.childNode(0).toString()); } } } } } } } } }
public void head(Node node, int i) { String name = node.nodeName(); if (node instanceof TextNode) accum.append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM. else if (name.equals("li")) accum.append("\n * "); else if (name.equals("dt")) accum.append(" "); else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr")) accum.append("\n"); }
/**
 * Appends a line break to {@code accum} when the traversal leaves a {@code br},
 * {@code dd}, {@code dt}, {@code p} or {@code h1}-{@code h5} element.
 *
 * @param node  the node being left
 * @param depth the depth of the node in the tree; not used by this method
 */
public void tail(Node node, int depth) {
    boolean endsLine = StringUtil.in(
            node.nodeName(), "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5");
    if (endsLine) {
        accum.append("\n");
    }
}
/**
 * Returns the concatenated text of the text nodes that are direct children of
 * {@code element}; text inside nested elements is not included.
 *
 * @param element the element to read
 * @return the element's own text, possibly empty
 */
protected String getText(Element element) {
    StringBuilder ownText = new StringBuilder();
    for (Node child : element.childNodes()) {
        if (!(child instanceof TextNode)) {
            continue;
        }
        ownText.append(((TextNode) child).text());
    }
    return ownText.toString();
}
/**
 * Records a text node in {@code matchedNodes} when {@code isApplicable} accepts it; all
 * other nodes are ignored.
 *
 * @param node  the node being left
 * @param depth the depth of the node in the tree; not used by this method
 */
@Override
public void tail(Node node, int depth) {
    if (!(node instanceof TextNode)) {
        return;
    }
    TextNode textNode = (TextNode) node;
    if (isApplicable(textNode)) {
        matchedNodes.add(textNode);
    }
}
/**
 * Tells whether any ancestor element of {@code node} has a tag name contained in
 * {@code tags}.
 * <p>
 * Tag names are lower-cased with {@link java.util.Locale#ROOT} before the lookup so the
 * result does not depend on the default locale of the JVM; {@code tags} is therefore
 * expected to hold lower-case names. The node itself is not checked.
 *
 * @param node the node whose ancestors are inspected
 * @param tags the tag names to look for
 * @return {@code true} when a matching ancestor element exists
 */
public static boolean hasAncestor(Node node, Collection<String> tags) {
    for (Node ancestor = node.parentNode(); ancestor != null; ancestor = ancestor.parentNode()) {
        if (!(ancestor instanceof Element)) {
            continue;
        }
        String tagName = ((Element) ancestor).tagName().toLowerCase(java.util.Locale.ROOT);
        if (tags.contains(tagName)) {
            return true;
        }
    }
    return false;
}
/**
 * Inserts, before {@code node}, the input text that precedes the matcher's current match
 * followed by the replacement.
 * <p>
 * The text skipped since the previous append position is inserted as a text node (only
 * when it is non-empty); the replacement is inserted as a data node. Both new nodes use
 * the base URI of {@code node}.
 *
 * @param matcher     a matcher positioned on a successful match
 * @param node        the node before which the new nodes are inserted
 * @param replacement the content of the data node that stands in for the match
 */
public static void appendReplacement(Matcher matcher, Node node, String replacement) {
    // An empty replacement makes the matcher hand over only the text before the match.
    StringBuffer leadingText = new StringBuffer();
    matcher.appendReplacement(leadingText, "");

    String baseUri = node.baseUri();
    if (leadingText.length() != 0) {
        node.before(new TextNode(leadingText.toString(), baseUri));
    }
    node.before(new DataNode(replacement, node.baseUri()));
}
/**
 * Inserts the input text that remains after the matcher's last append position before
 * {@code node} (only when it is non-empty) and then removes {@code node} from its parent.
 *
 * @param matcher the matcher whose remaining input is appended
 * @param node    the node that is replaced by the remaining text
 */
public static void appendTail(Matcher matcher, Node node) {
    StringBuffer remainder = new StringBuffer();
    matcher.appendTail(remainder);

    boolean hasRemainder = remainder.length() != 0;
    if (hasRemainder) {
        node.before(new TextNode(remainder.toString(), node.baseUri()));
    }
    node.remove();
}
/** * Extracts elements from the html comments (paragraph breaks, links) * * @param pElement paragraph element * @return plain text */ public static String paragraphElementToString(Element pElement) { StringBuilder sb = new StringBuilder(); for (Node child : pElement.childNodes()) { if (child instanceof TextNode) { TextNode textNode = (TextNode) child; sb.append(textNode.text()); } else if (child instanceof Element) { Element element = (Element) child; // append new line for break if ("br".equals(element.tag().getName())) { sb.append("\n"); } else if ("a".equals(element.tag().getName())) { // extract link from a.href sb.append(" ").append(element.attr("href")).append(" "); } else { // or just add the text sb.append(" ").append(element.text()).append(" "); } } } return sb.toString(); }
/**
 * Tells whether a node should be skipped when text is collected.
 * <p>
 * A node is skipped when its {@code class} attribute contains {@code caption}
 * (case-insensitively, folded with {@link java.util.Locale#ROOT} so the check does not
 * depend on the default locale of the JVM), or when {@code unlikelyPattern} finds a match
 * in its {@code style} or {@code class} attribute.
 *
 * @param e the node to check
 * @return {@code true} when the node should be skipped
 */
private boolean unlikely(Node e) {
    String clazz = e.attr("class");
    if (clazz != null && clazz.toLowerCase(java.util.Locale.ROOT).contains("caption")) {
        return true;
    }

    String style = e.attr("style");
    return unlikelyPattern.matcher(style).find() || unlikelyPattern.matcher(clazz).find();
}