Java 类org.jsoup.nodes.Node 实例源码

项目:eclipse.jdt.ls    文件:HtmlToPlainText.java   
/**
 * Emits the plain-text prefix for a node when the traversal first reaches it.
 * <p>
 * Text nodes are copied through; {@code ul} increases the list nesting counter;
 * {@code li} starts a new line, indents by two spaces per nesting level above the
 * first and adds a bullet ({@code "* "} at nesting level 1, {@code "- "} otherwise);
 * {@code dt} is indented; paragraph, heading and table-row tags start a new line.
 *
 * @param node  the node being entered
 * @param depth the depth of the node in the tree (not used here)
 */
@Override
public void head(Node node, int depth) {
    if (node instanceof TextNode) {
        // Text nodes hold the user-readable content of the document.
        append(((TextNode) node).text());
        return;
    }

    String tag = node.nodeName();
    if (tag.equals("ul")) {
        listNesting++;
        return;
    }
    if (tag.equals("li")) {
        append("\n ");
        int extraLevels = listNesting - 1;
        while (extraLevels-- > 0) {
            append("  ");
        }
        append(listNesting == 1 ? "* " : "- ");
        return;
    }
    if (tag.equals("dt")) {
        append("  ");
        return;
    }
    if (StringUtil.in(tag, "p", "h1", "h2", "h3", "h4", "h5", "tr")) {
        append("\n");
    }
}
项目:q-mail    文件:HtmlSignatureRemover.java   
/**
 * Records, when leaving a node, whether that node was a line-breaking element.
 * <p>
 * Once a signature has been found nothing is tracked any more. Otherwise a
 * {@code BR} or {@code P} element sets {@code lastElementCausedLineBreak}; for a
 * {@code BR} the element itself is remembered in {@code brElementPrecedingDashes},
 * for a {@code P} that reference is cleared. Any other node resets the flag.
 *
 * @param node  the node being left
 * @param depth the depth of the node in the tree (not used here)
 * @return always {@link TailFilterDecision#CONTINUE}
 */
@NonNull
@Override
public TailFilterDecision tail(Node node, int depth) {
    if (signatureFound) {
        return TailFilterDecision.CONTINUE;
    }

    boolean causedLineBreak = false;
    if (node instanceof Element) {
        Element candidate = (Element) node;
        if (candidate.tag().equals(BR)) {
            brElementPrecedingDashes = candidate;
            causedLineBreak = true;
        } else if (candidate.tag().equals(P)) {
            brElementPrecedingDashes = null;
            causedLineBreak = true;
        }
    }

    lastElementCausedLineBreak = causedLineBreak;
    return TailFilterDecision.CONTINUE;
}
项目:case-html-data-gather    文件:HTMLDataGather.java   
/**
 * Extracts the text that follows the first {@code span.diczx7} label whose text is
 * exactly "五笔:" and whose next sibling is a text node.
 *
 * @param tagContentEL the element to search in
 * @return the sibling text with every non-word character removed, or {@code null}
 *         when no matching label followed by a text node exists
 */
private String gatherWuBi(Element tagContentEL) {
    for (Element label : tagContentEL.select("span.diczx7")) {
        if (!label.text().equals("五笔:")) {
            continue;
        }
        // The value lives in the text node directly after the label span.
        Node following = label.nextSibling();
        if (!(following instanceof TextNode)) {
            continue;
        }
        // Strip special (non-word) characters from the value.
        return ((TextNode) following).text().replaceAll("\\W", "");
    }
    return null;
}
项目:Xndroid    文件:OutputFormatter.java   
/**
 * Recursively appends the text of {@code e}'s children to {@code accum}, skipping
 * every child for which {@code unlikely(child)} is true.
 * <p>
 * Before descending into a child element a single space is added when the element
 * is a block and the buffer is non-empty and does not end in whitespace, or when
 * the element is a {@code br}.
 *
 * @param e      element whose children are processed
 * @param accum  buffer receiving the text
 * @param indent recursion depth; only passed along to the recursive call
 */
private void appendTextSkipHidden(Element e, StringBuilder accum, int indent) {
    for (Node child : e.childNodes()) {
        if (unlikely(child)) {
            continue;
        }
        if (child instanceof TextNode) {
            accum.append(((TextNode) child).text());
            continue;
        }
        if (!(child instanceof Element)) {
            continue;
        }
        Element nested = (Element) child;
        boolean blockNeedsGap = accum.length() > 0 && nested.isBlock()
                && !lastCharIsWhitespace(accum);
        if (blockNeedsGap || nested.tagName().equals("br")) {
            accum.append(' ');
        }
        appendTextSkipHidden(nested, accum, indent + 1);
    }
}
项目:common    文件:NodeTraversor.java   
/**
 * Start a depth-first traverse of the root and all of its descendants.
 * <p>
 * The walk is iterative: {@code visitor.head} is called when a node is first
 * reached and {@code visitor.tail} once the node and all of its children have
 * been processed. The loop stops after the tail call for {@code root}.
 *
 * @param root the root node point to traverse.
 */
public void traverse(Node root) {
    Node node = root;
    int depth = 0;

    while (node != null) {
        visitor.head(node, depth);
        if (node.childNodeSize() > 0) {
            // Descend into the first child before visiting any sibling.
            node = node.childNode(0);
            depth++;
        } else {
            // No children: climb while the current node is the last sibling,
            // closing each finished node on the way up (never above depth 0).
            while (node.nextSibling() == null && depth > 0) {
                visitor.tail(node, depth);
                node = node.parentNode();
                depth--;
            }
            // Close the node we stopped on; it either has a next sibling or is at depth 0.
            visitor.tail(node, depth);
            if (node == root)
                break;
            node = node.nextSibling();
        }
    }
}
项目:awplab-core    文件:JsoupSession.java   
/**
 * Returns the first child of {@code parent} when it is an element or a text node
 * with visible content; otherwise delegates to {@code getNextNonEmptyNode} starting
 * from that first child.
 * <p>
 * A text node counts as non-empty when its text, with non-breaking spaces replaced
 * by ordinary spaces, is not blank after trimming.
 *
 * @param parent the element to inspect; may be {@code null}
 * @return the node found, or {@code null} when {@code parent} is {@code null} or has
 *         no child nodes
 */
public Node getFirstNonEmptyNodeChild(Element parent) {
    if (parent == null || parent.childNodeSize() == 0) {
        return null;
    }

    Node first = parent.childNode(0);
    if (first instanceof Element) {
        return first;
    }

    boolean hasVisibleText = first instanceof TextNode
            && ((TextNode) first).text().replaceAll("\u00A0", " ").trim().length() > 0;
    if (hasVisibleText) {
        return first;
    }
    return getNextNonEmptyNode(first);
}
项目:RenewPass    文件:JsoupNodeHelper.java   
/**
 * Collects the child nodes of {@code node} that match {@code withName}.
 * <p>
 * When {@code withName} equals {@code Selector.UNIVERSAL_TAG} the result is
 * {@code filter(node.childNodes())}; otherwise every child accepted by
 * {@code nameMatches(child, withName)} is returned in document order.
 *
 * @param node     the parent node; may be {@code null}
 * @param withName the name to match, or the universal tag
 * @return the matching children; an empty list when {@code node} is {@code null}
 */
@SuppressWarnings("unchecked")
protected List<? extends Node> getChildNodes(final Node node, final String withName) {
    if (node == null) {
        return Collections.EMPTY_LIST;
    }
    if (Selector.UNIVERSAL_TAG.equals(withName)) {
        return filter(node.childNodes());
    }

    ArrayList<Node> matches = new ArrayList<Node>();
    for (Node candidate : node.childNodes()) {
        if (nameMatches(candidate, withName)) {
            matches.add(candidate);
        }
    }
    return matches;
}
项目:sigir2016-collection-for-focused-retrieval    文件:Paragraph.java   
/**
 * Builds a dot-separated path of node names from the {@code html} ancestor down to
 * the given node, e.g. {@code "html.body.div."} (every name is followed by a dot).
 * <p>
 * A text node is replaced by its parent, and a node for which
 * {@code NodeHelper.isInnerText} is true is skipped in favour of its parent. The
 * walk stops at the first node named {@code html} (case-insensitive) or when the
 * top of the tree is reached.
 *
 * @param n the node to start from; may be {@code null}
 * @return the path collected so far; empty when {@code n} is {@code null}
 */
public String getPath(Node n)
{
    String nodePath = "";
    while (n != null) {
        if (n instanceof TextNode) {
            n = n.parent();
        }
        // parent() returns null for a detached node, so re-check before every use.
        if (n != null && NodeHelper.isInnerText(n)) {
            n = n.parent();
        }
        if (n == null) {
            // Ran off the top of the tree without meeting <html>: keep what we have.
            break;
        }

        String parentNodeName = n.nodeName();
        nodePath = parentNodeName + "." + nodePath;

        if (parentNodeName.equalsIgnoreCase("html")) {
            break;
        }
        n = n.parent();
    }

    return nodePath;
}
项目:sigir2016-collection-for-focused-retrieval    文件:ParagraphsExplorer.java   
/**
 * Adds {@code node} to the result, either as a new paragraph or as part of the
 * last one.
 * <p>
 * A new paragraph is started when nothing has been added yet, when both the last
 * added node and {@code node} are {@code br} (a double break separates paragraphs),
 * or when the ancestor state between the two nodes is {@code BLOCKLEVEL}. The node
 * is appended to the last paragraph for {@code INNERTEXT_ONLY}. Any other state
 * leaves the result untouched.
 *
 * @param node the node to merge
 */
private void mergeToResult(Node node)
{
    Node previous = getLastAddedNode();
    if (previous == null) {
        insertAsNewParagraph(node);
        return;
    }

    // <br><br> acts as a paragraph separator
    boolean doubleBreak = node.nodeName().equalsIgnoreCase("br")
            && previous.nodeName().equalsIgnoreCase("br");
    if (doubleBreak) {
        insertAsNewParagraph(node);
        return;
    }

    switch (getAncestorState(previous, node)) {
    case BLOCKLEVEL:
        insertAsNewParagraph(node);
        break;
    case INNERTEXT_ONLY:
        appendToLastParagraph(node);
        break;
    default:
        break;
    }
}
项目:sigir2016-collection-for-focused-retrieval    文件:ParagraphsExplorer.java   
/**
 * Classifies the two branches that lead from the nearest common ancestor of
 * {@code lastNode} and {@code currentNode} down to each of them.
 * <ul>
 * <li>{@link ParagraphsExplorer.AncestorState#BLOCKLEVEL} when either branch is
 * reported as block level (the second branch is not examined if the first
 * already is);</li>
 * <li>{@link ParagraphsExplorer.AncestorState#INNERTEXT_ONLY} when both branches
 * are inner text only;</li>
 * <li>{@link ParagraphsExplorer.AncestorState#UNKNOW} otherwise.</li>
 * </ul>
 *
 * @param lastNode    the previously handled node
 * @param currentNode the node being handled now
 * @return the combined state of both branches
 * @throws InvalidParameterException if either argument is {@code null}
 */
public static AncestorState getAncestorState(Node lastNode, Node currentNode)
{
    if (lastNode == null || currentNode == null) {
        throw new InvalidParameterException();
    }

    Node commonAncestor = NodeHelper.nearestCommonAncestor(lastNode, currentNode);

    AncestorState lastBranch = getAncestorStateOfBranch(commonAncestor, lastNode);
    if (lastBranch == AncestorState.BLOCKLEVEL) {
        return AncestorState.BLOCKLEVEL;
    }

    AncestorState currentBranch = getAncestorStateOfBranch(commonAncestor, currentNode);
    if (currentBranch == AncestorState.BLOCKLEVEL) {
        return AncestorState.BLOCKLEVEL;
    }

    boolean bothInnerText = lastBranch == AncestorState.INNERTEXT_ONLY
            && currentBranch == AncestorState.INNERTEXT_ONLY;
    return bothInnerText ? AncestorState.INNERTEXT_ONLY : AncestorState.UNKNOW;
}
项目:dkpro-c4corpus    文件:NodeHelper.java   
/**
 * Tells whether {@code node1} is {@code node2} itself or one of its ancestors.
 * Two {@code null} arguments count as the same node.
 *
 * @param node1 the candidate ancestor
 * @param node2 the node whose parent chain is searched
 * @return {@code true} if {@code node1 == node2} or {@code node1} is found on the
 *         parent chain of {@code node2}
 */
public static boolean isAncestor(Node node1, Node node2)
{
    if (node1 == node2) {
        return true;
    }

    for (Node cursor = node2; cursor != null; cursor = cursor.parent()) {
        if (cursor == node1) {
            return true;
        }
    }
    return false;
}
项目:dkpro-c4corpus    文件:Paragraph.java   
/**
 * Rebuilds {@code rawText} from the text nodes of this paragraph.
 * <p>
 * For every text node the tag name is set to the node's path, the node's text is
 * normalised with {@code Utils.normalizeBreaks}, trimmed and concatenated, and the
 * untrimmed text length is added to {@code charsCountInLinks} when
 * {@code NodeHelper.isLink} reports the node as a link.
 */
public void initRawInfo()
{
    StringBuilder collected = new StringBuilder();
    for (Node n : this) {
        if (!(n instanceof TextNode)) {
            continue;
        }

        this.setTagName(getPath(n));
        String nodeRawText = ((TextNode) n).text();
        collected.append(Utils.normalizeBreaks(nodeRawText).trim());

        if (NodeHelper.isLink(n)) {
            charsCountInLinks += nodeRawText.length();
        }
    }

    rawText = collected.toString();
}
项目:dkpro-c4corpus    文件:Paragraph.java   
/**
 * Builds a dot-separated path of node names from the {@code html} ancestor down to
 * the given node, e.g. {@code "html.body.div."} (every name is followed by a dot).
 * <p>
 * A text node is replaced by its parent, and a node for which
 * {@code NodeHelper.isInnerText} is true is skipped in favour of its parent. The
 * walk stops at the first node named {@code html} (case-insensitive) or when the
 * top of the tree is reached.
 *
 * @param n the node to start from; may be {@code null}
 * @return the path collected so far; empty when {@code n} is {@code null}
 */
public String getPath(Node n)
{
    String nodePath = "";
    while (n != null) {
        if (n instanceof TextNode) {
            n = n.parent();
        }
        // parent() returns null for a detached node, so re-check before every use.
        if (n != null && NodeHelper.isInnerText(n)) {
            n = n.parent();
        }
        if (n == null) {
            // Ran off the top of the tree without meeting <html>: keep what we have.
            break;
        }

        String parentNodeName = n.nodeName();
        nodePath = parentNodeName + "." + nodePath;

        if (parentNodeName.equalsIgnoreCase("html")) {
            break;
        }
        n = n.parent();
    }

    return nodePath;
}
项目:dkpro-c4corpus    文件:ParagraphsExplorer.java   
/**
 * Adds {@code node} to the result, either as a new paragraph or as part of the
 * last one.
 * <p>
 * A new paragraph is started when nothing has been added yet, when both the last
 * added node and {@code node} are {@code br} (a double break separates paragraphs),
 * or when the ancestor state between the two nodes is {@code BLOCKLEVEL}. The node
 * is appended to the last paragraph for {@code INNERTEXT_ONLY}. Any other state
 * leaves the result untouched.
 *
 * @param node the node to merge
 */
private void mergeToResult(Node node)
{
    Node previous = getLastAddedNode();
    if (previous == null) {
        insertAsNewParagraph(node);
        return;
    }

    // <br><br> acts as a paragraph separator
    boolean doubleBreak = node.nodeName().equalsIgnoreCase("br")
            && previous.nodeName().equalsIgnoreCase("br");
    if (doubleBreak) {
        insertAsNewParagraph(node);
        return;
    }

    switch (getAncestorState(previous, node)) {
    case BLOCKLEVEL:
        insertAsNewParagraph(node);
        break;
    case INNERTEXT_ONLY:
        appendToLastParagraph(node);
        break;
    default:
        break;
    }
}
项目:zanata-mt    文件:ArticleUtilTest.java   
/**
 * Verifies that {@code ArticleUtil.replaceNonTranslatableNode} moves the three
 * non-translatable spans into the placeholder map and leaves the translatable
 * fragments in the resulting HTML.
 */
@Test
public void replaceNonTranslatableNode() {
    // Fragments expected to be replaced by placeholders.
    String translateNoSpan = "<span translate=\"no\">do not translate</span>";
    String noTranslateClassSpan = "<span class=\"notranslate\">do not translate</span>";
    String privateNotesSpan = "<span id=\"private-notes-testing\">do not translate</span>";
    // Fragments expected to stay in the HTML.
    String emphasis = "<em>translate</em>";
    String plainSpan = "<span>translate this</span>";
    String paragraph = "<p>translate this</p>";

    String html = "<div>" + translateNoSpan + emphasis + noTranslateClassSpan
            + privateNotesSpan + plainSpan + paragraph + "</div>";

    TranslatableHTMLNode node = ArticleUtil.replaceNonTranslatableNode(1, html);

    assertThat(node.getPlaceholderIdMap()).hasSize(3);
    assertThat(node.getPlaceholderIdMap().values())
            .extracting(Node::toString)
            .contains(translateNoSpan, noTranslateClassSpan, privateNotesSpan)
            .doesNotContain(plainSpan, paragraph);
    assertThat(node.getHtml())
            .doesNotContain(translateNoSpan)
            .doesNotContain(noTranslateClassSpan)
            .doesNotContain(privateNotesSpan)
            .contains(emphasis, plainSpan, paragraph);
}
项目:zanata-mt    文件:ArticleUtilTest.java   
/**
 * Verifies that {@code ArticleUtil.replacePlaceholderWithNode} puts the outer HTML
 * of every mapped node back into a document made of five placeholders.
 */
@Test
public void replacePlaceholderWithNode() {
    Map<String, Node> nodeIdMap = new HashMap<>();
    StringBuilder markup = new StringBuilder("<div>");
    for (int i = 0; i < 5; i++) {
        String id = "id" + i;

        Attributes attrs = new Attributes();
        attrs.put("id", id);
        Element original = new Element(Tag.valueOf("span"), "", attrs);
        original.append("The original node");
        nodeIdMap.put(id, original);

        markup.append(ArticleUtil.generatePlaceholderNode(id).outerHtml());
    }
    markup.append("</div>");

    String results = ArticleUtil.replacePlaceholderWithNode(nodeIdMap, markup.toString());

    for (Node originalNode : nodeIdMap.values()) {
        assertThat(results).contains(originalNode.outerHtml());
    }
}
项目:DramaNLP    文件:Visitor.java   
/**
 * Closes the annotation for an element once the traversal leaves it.
 * <p>
 * Only nodes whose class is exactly {@code Element} are handled. The annotation is
 * created from the begin position stored in {@code beginMap}, filled with tag, id,
 * CSS selector and class (falling back to the {@code type} attribute when the class
 * name is empty) and registered in {@code annotationMap} under the selector. A line
 * break is added to the builder after block elements and after tags listed in
 * {@code blockElements}.
 *
 * @param node  the node being left
 * @param depth the depth of the node in the tree (not used here)
 */
public void tail(Node node, int depth) {
    if (!node.getClass().equals(Element.class)) {
        return;
    }

    Element elm = (Element) node;
    HTMLAnnotation anno = builder.add(beginMap.get(node), HTMLAnnotation.class);
    anno.setTag(elm.tagName());
    anno.setId(elm.id());
    String selector = elm.cssSelector();
    anno.setSelector(selector);

    String cssClass = elm.className();
    anno.setCls(cssClass.isEmpty() ? elm.attr("type") : cssClass);
    annotationMap.put(selector, anno);

    boolean endsLine = elm.isBlock() || ArrayUtils.contains(blockElements, elm.tagName());
    if (endsLine) {
        builder.add("\n");
    }
}
项目:DramaNLP    文件:FolgerReader.java   
/**
 * Closes the annotation for an element once the traversal leaves it.
 * <p>
 * Only nodes whose class is exactly {@code Element} are handled. The annotation is
 * created from the begin position stored in {@code beginMap}, filled with tag, id
 * and class (falling back to the {@code type} attribute when the class name is
 * empty) and registered in {@code annotationMap} under the element's CSS selector.
 * A line break is added to the builder after block elements and after tags listed
 * in {@code blockElements}.
 *
 * @param node  the node being left
 * @param depth the depth of the node in the tree (not used here)
 */
@Override
public void tail(Node node, int depth) {
    if (!node.getClass().equals(Element.class)) {
        return;
    }

    Element elm = (Element) node;
    HTMLAnnotation anno = builder.add(beginMap.get(node), HTMLAnnotation.class);
    anno.setTag(elm.tagName());
    anno.setId(elm.id());

    String cssClass = elm.className();
    anno.setCls(cssClass.isEmpty() ? elm.attr("type") : cssClass);
    annotationMap.put(elm.cssSelector(), anno);

    boolean endsLine = elm.isBlock() || ArrayUtils.contains(blockElements, elm.tagName());
    if (endsLine) {
        builder.add("\n");
    }
}
项目:DeeBrowser    文件:OutputFormatter.java   
/**
 * Recursively appends the text of {@code e}'s children to {@code accum}, skipping
 * every child for which {@code unlikely(child)} is true.
 * <p>
 * Before descending into a child element a single space is added when the element
 * is a block and the buffer is non-empty and does not end in whitespace, or when
 * the element is a {@code br}.
 *
 * @param e      element whose children are processed
 * @param accum  buffer receiving the text
 * @param indent recursion depth; only passed along to the recursive call
 */
private void appendTextSkipHidden(Element e, StringBuilder accum, int indent) {
    for (Node child : e.childNodes()) {
        if (unlikely(child)) {
            continue;
        }
        if (child instanceof TextNode) {
            accum.append(((TextNode) child).text());
            continue;
        }
        if (!(child instanceof Element)) {
            continue;
        }
        Element nested = (Element) child;
        boolean blockNeedsGap = accum.length() > 0 && nested.isBlock()
                && !lastCharIsWhitespace(accum);
        if (blockNeedsGap || nested.tagName().equals("br")) {
            accum.append(' ');
        }
        appendTextSkipHidden(nested, accum, indent + 1);
    }
}
项目:generator-thundr-gae-react    文件:HtmlFormattingUtil.java   
/**
 * Emits the plain-text prefix for a node when the traversal first reaches it:
 * the text of a text node, a bullet on a new line for {@code li}, two spaces for
 * {@code dt}, and a line break for paragraph, heading and table-row tags.
 *
 * @param node  the node being entered
 * @param depth the depth of the node in the tree (not used here)
 */
public void head(Node node, int depth) {
    if (node instanceof TextNode) {
        // Text nodes hold the user-readable content of the document.
        append(((TextNode) node).text());
        return;
    }

    String tag = node.nodeName();
    if (tag.equals("li")) {
        append("\n * ");
    } else if (tag.equals("dt")) {
        append("  ");
    } else if (StringUtil.in(tag, "p", "h1", "h2", "h3", "h4", "h5", "tr")) {
        append("\n");
    }
}
项目:generator-thundr-gae-react    文件:HtmlFormattingUtil.java   
/**
 * Emits the plain-text suffix for a node when the traversal leaves it: a line
 * break after line-ending tags, or the absolute {@code href} in angle brackets
 * after an anchor.
 *
 * @param node  the node being left
 * @param depth the depth of the node in the tree (not used here)
 */
public void tail(Node node, int depth) {
    String tag = node.nodeName();
    boolean endsLine = StringUtil.in(tag, "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5");
    if (endsLine) {
        append("\n");
        return;
    }
    if (tag.equals("a")) {
        String link = node.absUrl("href");
        append(String.format(" <%s>", link));
    }
}
项目:q-mail    文件:HtmlSignatureRemover.java   
/**
 * Decides, when a node is first reached, whether it belongs to a signature.
 * <p>
 * After a signature has been found every node is removed. Elements reset the
 * line-break flag and {@code BLOCKQUOTE} elements are skipped entirely. A text node
 * starts the signature when the previous element caused a line break, its whole
 * text matches {@code DASH_SIGNATURE_HTML} and its next sibling is a {@code BR}
 * element; in that case the remembered preceding {@code BR}, if any, is removed too.
 *
 * @param node  the node being entered
 * @param depth the depth of the node in the tree (not used here)
 * @return {@code REMOVE} for signature content, {@code SKIP_ENTIRELY} for
 *         blockquotes, {@code CONTINUE} otherwise
 */
@NonNull
@Override
public HeadFilterDecision head(Node node, int depth) {
    if (signatureFound) {
        return HeadFilterDecision.REMOVE;
    }

    if (node instanceof Element) {
        lastElementCausedLineBreak = false;
        boolean isBlockquote = ((Element) node).tag().equals(BLOCKQUOTE);
        return isBlockquote ? HeadFilterDecision.SKIP_ENTIRELY : HeadFilterDecision.CONTINUE;
    }

    if (!(node instanceof TextNode) || !lastElementCausedLineBreak) {
        return HeadFilterDecision.CONTINUE;
    }

    TextNode text = (TextNode) node;
    if (!DASH_SIGNATURE_HTML.matcher(text.getWholeText()).matches()) {
        return HeadFilterDecision.CONTINUE;
    }

    Node following = node.nextSibling();
    boolean followedByBr = following instanceof Element && ((Element) following).tag().equals(BR);
    if (!followedByBr) {
        return HeadFilterDecision.CONTINUE;
    }

    signatureFound = true;
    if (brElementPrecedingDashes != null) {
        // Drop the line break that introduced the dashes as well.
        brElementPrecedingDashes.remove();
        brElementPrecedingDashes = null;
    }
    return HeadFilterDecision.REMOVE;
}
项目:q-mail    文件:HeadCleaner.java   
/**
 * Copies a source node into the destination tree when the traversal first
 * reaches it.
 * <p>
 * Nothing is copied while {@code skipChildren} is set. A safe element is cloned
 * (tag, base URI, attributes) and becomes the new {@code destination}; an unsafe
 * element other than {@code root} switches {@code skipChildren} on. Text nodes are
 * always copied; data nodes only when their parent is a safe tag.
 *
 * @param source the node being entered
 * @param depth  the depth of the node in the tree (not used here)
 */
public void head(Node source, int depth) {
    if (skipChildren) {
        return;
    }

    if (source instanceof Element) {
        Element sourceElement = (Element) source;
        if (!isSafeTag(sourceElement)) {
            if (source != root) {
                skipChildren = true;
            }
            return;
        }

        Element copy = new Element(
                Tag.valueOf(sourceElement.tagName()),
                sourceElement.baseUri(),
                sourceElement.attributes().clone());
        destination.appendChild(copy);
        destination = copy;
        return;
    }

    if (source instanceof TextNode) {
        String text = ((TextNode) source).getWholeText();
        destination.appendChild(new TextNode(text, source.baseUri()));
        return;
    }

    if (source instanceof DataNode && isSafeTag(source.parent())) {
        String data = ((DataNode) source).getWholeData();
        destination.appendChild(new DataNode(data, source.baseUri()));
    }
}
项目:q-mail    文件:HeadCleaner.java   
/**
 * Tells whether a node may be copied: it must not be a meta-refresh tag and its
 * lower-cased node name must be listed in {@code ALLOWED_TAGS}.
 *
 * @param node the node to check
 * @return {@code true} if the node is allowed
 */
private boolean isSafeTag(Node node) {
    boolean metaRefresh = isMetaRefresh(node);
    return !metaRefresh && ALLOWED_TAGS.contains(node.nodeName().toLowerCase(Locale.ROOT));
}
项目:q-mail    文件:HeadCleaner.java   
/**
 * Tells whether a node is a {@code meta} tag whose {@code http-equiv} attribute,
 * trimmed and compared case-insensitively, equals {@code refresh}.
 *
 * @param node the node to check
 * @return {@code true} for a meta-refresh tag
 */
private boolean isMetaRefresh(Node node) {
    if ("meta".equalsIgnoreCase(node.nodeName())) {
        String httpEquiv = node.attributes().getIgnoreCase("http-equiv").trim();
        return "refresh".equalsIgnoreCase(httpEquiv);
    }
    return false;
}
项目:case-html-data-gather    文件:HTMLDataGather.java   
/**
 * Extracts the text that follows the first {@code span.diczx6} label whose text is
 * exactly "笔顺编号:" and whose next sibling is a text node.
 *
 * @param tagContentEL the element to search in
 * @return the unmodified whole text of that sibling, or {@code null} when no
 *         matching label followed by a text node exists
 */
private String gatherBiHua(Element tagContentEL) {
    for (Element label : tagContentEL.select("span.diczx6")) {
        if (!label.text().equals("笔顺编号:")) {
            continue;
        }
        // The value lives in the text node directly after the label span.
        Node following = label.nextSibling();
        if (following instanceof TextNode) {
            return ((TextNode) following).getWholeText();
        }
    }
    return null;
}
项目:case-html-data-gather    文件:HTMLDataGather.java   
/**
 * Walks the {@code p} elements of {@code contentEL} and groups them into
 * {@code DuYinDM} entries.
 * <p>
 * A paragraph whose first child is a {@code span} with class {@code dicpy} starts a
 * new entry (the previous one, if any, is added to the result) and supplies its
 * {@code duyin}. A paragraph whose first child is an {@code em} contributes one
 * {@code ziyi}: the concatenated text of all siblings following that {@code em}.
 * Paragraphs without child elements are ignored.
 *
 * @param contentEL the element containing the paragraphs
 * @return the collected entries, in document order
 * @throws Exception declared by the signature; nothing in this method throws a
 *         checked exception itself
 */
private List<DuYinDM> gatherDuyins(Element contentEL)throws Exception{
    List<DuYinDM> results = new ArrayList<DuYinDM>(3);
    DuYinDM current = null;

    for (Element paragraph : contentEL.select("p")) {
        if (paragraph.children().isEmpty()) {
            continue;
        }

        Element lead = paragraph.child(0);
        if ("span".equals(lead.tagName())) {
            if (lead.hasClass("dicpy")) {
                // A new pronunciation begins: flush the entry built so far.
                if (current != null) {
                    results.add(current);
                }
                current = new DuYinDM();
                current.setDuyin(lead.text());
            }
        } else if ("em".equals(lead.tagName())) {
            StringBuilder meaning = new StringBuilder();
            for (Node sibling = lead.nextSibling(); sibling != null; sibling = sibling.nextSibling()) {
                if (sibling instanceof TextNode) {
                    meaning.append(((TextNode) sibling).text());
                } else if (sibling instanceof Element) {
                    meaning.append(((Element) sibling).text());
                }
            }
            // NOTE(review): throws NullPointerException if an <em> paragraph appears
            // before any span.dicpy paragraph — confirm the input guarantees the order.
            current.addZiyi(meaning.toString());
        }
    }

    if (current != null) {
        results.add(current);
    }
    return results;
}
项目:frameworkAggregate    文件:MyJsoup.java   
/**
 * Downloads the flower encyclopedia index page and extracts one
 * {@code FlowerCategory} per child of the first element with class
 * {@code catelist} that has child nodes of its own.
 * <p>
 * Within such a child, an {@code a} node supplies the URL ({@code href}) and, when
 * it has at least two child nodes, the image path ({@code src} of its second child
 * node); an {@code h2} node supplies the name ({@code title}).
 *
 * @return the categories found; empty when the page cannot be fetched or contains
 *         no {@code catelist} element
 */
private static List<FlowerCategory> getCategoryList() {

    List<FlowerCategory> categories = new ArrayList<FlowerCategory>();

    try {
        Document doc = Jsoup.connect("http://www.aihuhua.com/baike/").get();
        Element cates = doc.getElementsByClass("catelist").first();
        if (cates == null) {
            // first() yields null when nothing matched (e.g. the page layout changed).
            return categories;
        }

        for (Node node : cates.childNodes()) {
            List<Node> childs = node.childNodes();
            if (childs == null || childs.isEmpty()) {
                continue;
            }

            FlowerCategory category = new FlowerCategory();
            for (Node child : childs) {
                if ("a".equals(child.nodeName())) {
                    category.setUrl(child.attr("href"));
                    // The image is read from the link's second child node; skip it when absent
                    // instead of failing with an IndexOutOfBoundsException.
                    if (child.childNodeSize() > 1) {
                        category.setImgPath(child.childNode(1).attr("src"));
                    }
                } else if ("h2".equals(child.nodeName())) {
                    category.setName(child.attr("title"));
                }
            }
            categories.add(category);
        }
    } catch (IOException e) {
        // Best effort: report the failure and return whatever was collected.
        e.printStackTrace();
    }

    return categories;
}
项目:frameworkAggregate    文件:TestJsoup.java   
/**
 * Downloads the flower encyclopedia index page and extracts one
 * {@code FlowerCategory} per child of the first element with class
 * {@code catelist} that has child nodes of its own.
 * <p>
 * Within such a child, an {@code a} node supplies the URL ({@code href}) and, when
 * it has at least two child nodes, the image path ({@code src} of its second child
 * node); an {@code h2} node supplies the name ({@code title}).
 *
 * @return the categories found; empty when the page cannot be fetched or contains
 *         no {@code catelist} element
 */
private static List<FlowerCategory> getCategoryList() {

    List<FlowerCategory> categories = new ArrayList<FlowerCategory>();

    try {
        Document doc = Jsoup.connect("http://www.aihuhua.com/baike/").get();
        Element cates = doc.getElementsByClass("catelist").first();
        if (cates == null) {
            // first() yields null when nothing matched (e.g. the page layout changed).
            return categories;
        }

        for (Node node : cates.childNodes()) {
            List<Node> childs = node.childNodes();
            if (childs == null || childs.isEmpty()) {
                continue;
            }

            FlowerCategory category = new FlowerCategory();
            for (Node child : childs) {
                if ("a".equals(child.nodeName())) {
                    category.setUrl(child.attr("href"));
                    // The image is read from the link's second child node; skip it when absent
                    // instead of failing with an IndexOutOfBoundsException.
                    if (child.childNodeSize() > 1) {
                        category.setImgPath(child.childNode(1).attr("src"));
                    }
                } else if ("h2".equals(child.nodeName())) {
                    category.setName(child.attr("title"));
                }
            }
            categories.add(category);
        }
    } catch (IOException e) {
        // Best effort: report the failure and return whatever was collected.
        e.printStackTrace();
    }

    return categories;
}
项目:ascii_generate    文件:GetEmoticons.java   
/**
 * Downloads the page at {@code baseLocation} and writes one file per {@code h3}
 * section found directly inside elements whose {@code class} is
 * {@code entry-content}.
 * <p>
 * Each {@code h3} starts a new section titled with the string form of its first
 * child node. For every {@code table} in a section, the first child node of each
 * non-empty {@code td} is collected unless it is a comment. The collected entries
 * are handed to {@code writeFile} when the next {@code h3} is met and once more
 * after the last section, so the final group is not lost.
 *
 * @param baseLocation URL of the page to read
 * @throws IOException                  if the page cannot be fetched
 * @throws TransformerException         declared for {@code writeFile}
 * @throws ParserConfigurationException declared for {@code writeFile}
 */
private void print2(String baseLocation) throws IOException, TransformerException, ParserConfigurationException {
    Document document = Jsoup.connect(baseLocation).get();
    Elements content = document.getElementsByAttributeValue("class", "entry-content");
    String title = null;
    ArrayList<String> list = new ArrayList<>();
    for (Element div : content) {
        for (Node node : div.childNodes()) {
            if (!(node instanceof Element)) {
                continue;
            }
            Element element = (Element) node;
            String tagName = element.tagName();
            if (tagName.equals("h3")) {
                // A new heading closes the previous section.
                writeFile(title, list);
                list.clear();
                System.out.println("Title: " + node.childNode(0));
                title = node.childNode(0).toString();
            } else if (tagName.equals("table")) {
                for (Element row : element.getElementsByTag("tr")) {
                    for (Element cell : row.getElementsByTag("td")) {
                        if (cell.childNodeSize() > 0 && !(cell.childNode(0) instanceof Comment)) {
                            list.add(cell.childNode(0).toString());
                        }
                    }
                }
            }
        }
    }
    // The last section has no following <h3> to trigger its write, so flush it here.
    if (title != null) {
        writeFile(title, list);
    }
}
项目:Saber-Bot    文件:HTMLStripper.java   
/**
 * Emits the plain-text prefix for a node when the traversal first reaches it:
 * the text of a text node, a bullet on a new line for {@code li}, two spaces for
 * {@code dt}, and a line break for paragraph, heading and table-row tags.
 *
 * @param node the node being entered
 * @param i    the depth of the node in the tree (not used here)
 */
public void head(Node node, int i)
{
    if (node instanceof TextNode) {
        // Text nodes hold the user-readable content of the document.
        accum.append(((TextNode) node).text());
        return;
    }

    String tag = node.nodeName();
    if (tag.equals("li")) {
        accum.append("\n * ");
        return;
    }
    if (tag.equals("dt")) {
        accum.append("  ");
        return;
    }
    if (StringUtil.in(tag, "p", "h1", "h2", "h3", "h4", "h5", "tr")) {
        accum.append("\n");
    }
}
项目:Saber-Bot    文件:HTMLStripper.java   
/**
 * Appends a line break to {@code accum} when the traversal leaves a line-ending
 * tag ({@code br}, {@code dd}, {@code dt}, {@code p}, {@code h1}–{@code h5}).
 *
 * @param node  the node being left
 * @param depth the depth of the node in the tree (not used here)
 */
public void tail(Node node, int depth)
{
    boolean endsLine = StringUtil.in(node.nodeName(),
            "br", "dd", "dt", "p", "h1", "h2", "h3", "h4", "h5");
    if (endsLine) {
        accum.append("\n");
    }
}
项目:NetDiscovery    文件:CssSelector.java   
/**
 * Concatenates the text of the direct text-node children of {@code element};
 * text inside nested elements is not included.
 *
 * @param element the element to read
 * @return the combined text of its own text nodes
 */
protected String getText(Element element) {
    StringBuilder ownText = new StringBuilder();
    for (Node child : element.childNodes()) {
        if (!(child instanceof TextNode)) {
            continue;
        }
        ownText.append(((TextNode) child).text());
    }
    return ownText.toString();
}
项目:gitplex-mit    文件:TextNodeVisitor.java   
/**
 * Collects a text node into {@code matchedNodes} when {@code isApplicable}
 * accepts it; all other nodes are ignored.
 *
 * @param node  the node being left
 * @param depth the depth of the node in the tree (not used here)
 */
@Override
public void tail(Node node, int depth) {
    if (!(node instanceof TextNode)) {
        return;
    }
    TextNode candidate = (TextNode) node;
    if (isApplicable(candidate)) {
        matchedNodes.add(candidate);
    }
}
项目:gitplex-mit    文件:JsoupUtils.java   
/**
 * Tells whether any element ancestor of {@code node} has a tag name contained in
 * {@code tags}. The node itself is not examined.
 * <p>
 * Tag names are lower-cased with {@link java.util.Locale#ROOT} before the lookup so
 * the result does not depend on the JVM's default locale (under a Turkish locale
 * the default lower-casing of {@code "I"} is not {@code "i"}).
 *
 * @param node the node whose ancestors are searched
 * @param tags the tag names to look for, expected in lower case
 * @return {@code true} if a matching ancestor element exists
 */
public static boolean hasAncestor(Node node, Collection<String> tags) {
    for (Node parent = node.parentNode(); parent != null; parent = parent.parentNode()) {
        if (parent instanceof Element) {
            String tagName = ((Element) parent).tagName().toLowerCase(java.util.Locale.ROOT);
            if (tags.contains(tagName)) {
                return true;
            }
        }
    }

    return false;
}
项目:gitplex-mit    文件:JsoupUtils.java   
/**
 * Inserts, before {@code node}, the input text the matcher skipped since its last
 * append position (as a text node, when non-empty) followed by {@code replacement}
 * as a data node.
 *
 * @param matcher     matcher positioned on a successful match
 * @param node        the node the new nodes are inserted before
 * @param replacement the content of the data node standing in for the match
 */
public static void appendReplacement(Matcher matcher, Node node, String replacement) {
    StringBuffer skipped = new StringBuffer();
    // An empty replacement leaves only the text between the previous and the current match.
    matcher.appendReplacement(skipped, "");
    if (skipped.length() > 0) {
        TextNode leading = new TextNode(skipped.toString(), node.baseUri());
        node.before(leading);
    }
    DataNode substituted = new DataNode(replacement, node.baseUri());
    node.before(substituted);
}
项目:gitplex-mit    文件:JsoupUtils.java   
/**
 * Inserts the input text remaining after the matcher's last append position
 * before {@code node} (as a text node, when non-empty) and then removes
 * {@code node} from the tree.
 *
 * @param matcher the matcher whose tail is appended
 * @param node    the node to replace
 */
public static void appendTail(Matcher matcher, Node node) {
    StringBuffer remainder = new StringBuffer();
    matcher.appendTail(remainder);
    if (remainder.length() > 0) {
        TextNode trailing = new TextNode(remainder.toString(), node.baseUri());
        node.before(trailing);
    }
    node.remove();
}
项目:argument-reasoning-comprehension-task    文件:DebateHTMLParser.java   
/**
 * Flattens the direct children of a paragraph element into plain text.
 * <p>
 * Text nodes contribute their text, {@code br} becomes a line break, {@code a}
 * contributes its {@code href} surrounded by spaces, and any other element
 * contributes its text surrounded by spaces.
 *
 * @param pElement paragraph element
 * @return plain text
 */
public static String paragraphElementToString(Element pElement)
{
    StringBuilder plain = new StringBuilder();
    for (Node child : pElement.childNodes()) {
        if (child instanceof TextNode) {
            plain.append(((TextNode) child).text());
            continue;
        }
        if (!(child instanceof Element)) {
            continue;
        }

        Element element = (Element) child;
        String tagName = element.tag().getName();
        if ("br".equals(tagName)) {
            // a break turns into a new line
            plain.append("\n");
            continue;
        }

        // links contribute their target, everything else its text
        String piece = "a".equals(tagName) ? element.attr("href") : element.text();
        plain.append(" ").append(piece).append(" ");
    }

    return plain.toString();
}
项目:Xndroid    文件:OutputFormatter.java   
/**
 * Tells whether a node should be skipped: its {@code class} attribute contains
 * "caption" (case-insensitively), or {@code unlikelyPattern} is found in its
 * {@code style} or {@code class} attribute.
 * <p>
 * The case-insensitive check lower-cases with {@link java.util.Locale#ROOT} so it
 * does not depend on the JVM's default locale (under a Turkish locale the default
 * lower-casing of "CAPTION" would not contain "caption").
 *
 * @param e the node to check
 * @return {@code true} if the node is considered unlikely content
 */
private boolean unlikely(Node e) {
    // No null check: a null value would already fail in matcher(clazz) below.
    String clazz = e.attr("class");
    if (clazz.toLowerCase(java.util.Locale.ROOT).contains("caption")) {
        return true;
    }

    String style = e.attr("style");
    return unlikelyPattern.matcher(style).find() || unlikelyPattern.matcher(clazz).find();
}
项目:crawler    文件:JsoupParser.java   
/**
 * Concatenates the text of the direct text-node children of {@code element};
 * text inside nested elements is not included.
 *
 * @param element the element to read
 * @return the combined text of its own text nodes
 */
protected String getText(Element element) {
    StringBuilder ownText = new StringBuilder();
    for (Node child : element.childNodes()) {
        if (!(child instanceof TextNode)) {
            continue;
        }
        ownText.append(((TextNode) child).text());
    }
    return ownText.toString();
}