@Override public void head(Node node, int depth) { String name = node.nodeName(); if (node instanceof TextNode) { append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM. } else if (name.equals("ul")) { listNesting++; } else if (name.equals("li")) { append("\n "); for (int i = 1; i < listNesting; i++) { append(" "); } if (listNesting == 1) { append("* "); } else { append("- "); } } else if (name.equals("dt")) { append(" "); } else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr")) { append("\n"); } }
private String gatherWuBi(Element tagContentEL) { Elements spans = tagContentEL.select("span.diczx7"); for (Element span : spans) { if (span.text().equals("五笔:")) { // 后一个兄弟文本节点 Node textNode = span.nextSibling(); if (textNode instanceof TextNode) { String wubi=((TextNode) textNode).text(); //去掉特殊字符 wubi=wubi.replaceAll("\\W", ""); return wubi; } } } return null; }
/**
 * Builds a map from resource name to the list of item texts for every element with the
 * given tag in the values XML document.
 * <p>
 * For each matching element, the text of the first child node of every nested
 * {@code item} element is collected, in document order, under the element's
 * {@code name} attribute.
 * <p>
 * An {@code item} that has no children, or whose first child is not a text node,
 * contributes an empty string so that the positions of the remaining items are kept.
 * (Previously such an item raised an {@code IndexOutOfBoundsException} or
 * {@code ClassCastException}.)
 *
 * @param tag tag name of the array elements to collect
 * @return map of {@code name} attribute to item texts; empty when nothing matches
 */
protected Map<String, List<String>> getResourceArrayMap(String tag) {
    Map<String, List<String>> map = new HashMap<>();
    Document document = getValuesXmlDocument();
    Elements elements = document.getElementsByTag(tag);
    for (Element element : elements) {
        Elements items = element.getElementsByTag("item");
        List<String> itemsText = new ArrayList<>();
        for (Element item : items) {
            // Same guard as used when reading <string> resources: only read a leading text node.
            boolean startsWithText = item.childNodeSize() > 0
                    && item.childNode(0) instanceof TextNode;
            itemsText.add(startsWithText ? ((TextNode) item.childNode(0)).text() : "");
        }
        String name = element.attr("name");
        map.put(name, itemsText);
    }
    return map;
}
/**
 * Builds the resource name to value mapping for the {@code string} elements of the
 * values XML document.
 * <p>
 * Only elements whose first child node is a text node are included; the value is that
 * text node's text.
 *
 * @return map from each element's {@code name} attribute to its text value
 */
protected Map<String, String> getStringResNameAndValueMap() {
    Map<String, String> nameToValue = new HashMap<>();
    for (Element entry : getValuesXmlDocument().getElementsByTag("string")) {
        String name = entry.attr("name");
        if (entry.childNodeSize() == 0) {
            continue;
        }
        if (entry.childNode(0) instanceof TextNode) {
            TextNode leading = (TextNode) entry.childNode(0);
            nameToValue.put(name, leading.text());
        }
    }
    return nameToValue;
}
/**
 * Recursively appends the text below {@code e} to {@code accum}, skipping every child
 * for which {@code unlikely(child)} returns true.
 * <p>
 * Before descending into a child element a single space is appended when either the
 * child is a block element and the accumulated text is non-empty and does not already
 * end in whitespace, or (otherwise) the child is a {@code br}.
 *
 * @param e      element whose children are visited
 * @param accum  buffer receiving the text
 * @param indent recursion depth; only passed on (incremented) to the recursive call
 */
private void appendTextSkipHidden(Element e, StringBuilder accum, int indent) {
    for (Node child : e.childNodes()) {
        if (unlikely(child)) {
            continue;
        }

        if (child instanceof TextNode) {
            accum.append(((TextNode) child).text());
            continue;
        }

        if (!(child instanceof Element)) {
            continue;
        }

        Element nested = (Element) child;
        boolean needsBlockGap = accum.length() > 0
                && nested.isBlock()
                && !lastCharIsWhitespace(accum);
        // The br check is only consulted when no block gap was needed.
        boolean isLineBreak = !needsBlockGap && nested.tagName().equals("br");
        if (needsBlockGap || isLineBreak) {
            accum.append(' ');
        }
        appendTextSkipHidden(nested, accum, indent + 1);
    }
}
/**
 * Adds one field value per element matching {@code selector}.
 * <p>
 * The value is built from the text nodes of the matched element and all of its
 * descendant elements, each followed by a single space, with the final result trimmed.
 *
 * @param selector  CSS selector evaluated against {@code jsoupDocument}
 * @param fieldName name of the field each value is added to on {@code document}
 */
public void mapAllElements(String selector, String fieldName) {
    for (Element match : jsoupDocument.select(selector)) {
        StringBuilder joined = new StringBuilder();
        // getAllElements() yields the matched element itself plus its descendants.
        for (Element descendant : match.getAllElements()) {
            for (TextNode piece : descendant.textNodes()) {
                joined.append(piece.text()).append(" ");
            }
        }
        document.addField(fieldName, joined.toString().trim());
    }
}
/**
 * Returns the first child of {@code parent} that counts as non-empty.
 * <p>
 * The first child is returned directly when it is an element, or when it is a text node
 * whose text still has content after non-breaking spaces are turned into spaces and the
 * result is trimmed. Otherwise the search is delegated to
 * {@code getNextNonEmptyNode}, starting from the first child.
 *
 * @param parent element to inspect; may be {@code null}
 * @return the node found, or {@code null} when {@code parent} is {@code null} or has no
 *         child nodes
 */
public Node getFirstNonEmptyNodeChild(Element parent) {
    if (parent == null || parent.childNodeSize() == 0) {
        return null;
    }

    Node first = parent.childNode(0);
    if (first instanceof Element) {
        return first;
    }

    if (first instanceof TextNode) {
        String visible = ((TextNode) first).text().replaceAll("\u00A0", " ").trim();
        if (visible.length() > 0) {
            return first;
        }
    }
    return getNextNonEmptyNode(first);
}
/**
 * Returns the wrapper for the given jsoup node, creating and caching it on first use.
 * <p>
 * Elements are wrapped as {@code HtmlElement}, text nodes as {@code HtmlTextNode}, and
 * every other node as a plain {@code HtmlNode}.
 *
 * @param node the jsoup node to wrap
 * @return the cached or newly created wrapper
 */
public HtmlNode getHtmlNode(org.jsoup.nodes.Node node) {
    if (elementCache.containsKey(node)) {
        return elementCache.get(node);
    }

    final HtmlNode wrapper;
    if (node instanceof Element) {
        wrapper = new HtmlElement(page, (Element) node);
    } else if (node instanceof TextNode) {
        wrapper = new HtmlTextNode(page, (TextNode) node);
    } else {
        wrapper = new HtmlNode(page, node);
    }

    elementCache.put(node, wrapper);
    return wrapper;
}
public void initRawInfo() { StringBuilder sb = new StringBuilder(); for (Node n : this) { // NodeHelper.cleanEmptyElements(n); if (n instanceof TextNode) { this.setTagName(getPath(n)); String nodeRawText = ((TextNode) n).text(); sb.append(Utils.normalizeWhitespace(nodeRawText).trim()); if (NodeHelper.isLink(n)) { charsCountInLinks += nodeRawText.length(); } } } rawText = sb.toString(); }
/**
 * Builds a dot-terminated path of node names from the document root side down to the
 * given node, e.g. {@code "html.body.div."}.
 * <p>
 * Text nodes and nodes accepted by {@code NodeHelper.isInnerText} are skipped in favour
 * of their parent. Climbing stops at the first node named {@code html} (case-insensitive)
 * or when there is no further parent.
 * <p>
 * A node without a parent (for example a detached text node) no longer causes a
 * {@code NullPointerException}; the path collected so far is returned instead.
 *
 * @param n node to start from; may be {@code null}
 * @return the path, or an empty string when no named ancestor could be visited
 */
public String getPath(Node n) {
    String nodePath = "";
    while (n != null) {
        if (n instanceof TextNode) {
            n = n.parent();
        }
        if (n != null && NodeHelper.isInnerText(n)) {
            n = n.parent();
        }
        if (n == null) {
            // Ran off the top of the tree while skipping text/inner-text nodes.
            break;
        }

        String parentNodeName = n.nodeName();
        nodePath = parentNodeName + "." + nodePath;

        if (parentNodeName.equalsIgnoreCase("html")) {
            break;
        }
        n = n.parent();
    }
    return nodePath;
}
public void initRawInfo() { StringBuilder sb = new StringBuilder(); for (Node n : this) { // NodeHelper.cleanEmptyElements(n); if (n instanceof TextNode) { this.setTagName(getPath(n)); String nodeRawText = ((TextNode) n).text(); sb.append(Utils.normalizeBreaks(nodeRawText).trim()); if (NodeHelper.isLink(n)) { charsCountInLinks += nodeRawText.length(); } } } rawText = sb.toString(); }
public void head(Node node, int depth) { String name = node.nodeName(); if (node instanceof TextNode) append(((TextNode) node).text()); // TextNodes carry all user-readable text in the DOM. else if (name.equals("li")) append("\n * "); else if (name.equals("dt")) append(" "); else if (StringUtil.in(name, "p", "h1", "h2", "h3", "h4", "h5", "tr")) append("\n"); }
/**
 * Extracts text from the direct text-node children of {@code element}.
 * <p>
 * When {@code group} is 0 the texts of all direct text nodes are concatenated.
 * Otherwise the text of the {@code group}-th direct text node (1-based) is returned,
 * or an empty string when there are fewer text nodes than that.
 *
 * @param element element whose direct children are inspected
 * @return the selected or concatenated text
 */
@Override
public String operate(Element element) {
    StringBuilder accum = new StringBuilder();
    int seen = 0;

    for (Node node : element.childNodes()) {
        if (!(node instanceof TextNode)) {
            continue;
        }
        String text = ((TextNode) node).text();

        if (group == 0) {
            accum.append(text);
            continue;
        }

        seen++;
        if (seen == group) {
            return text;
        }
    }
    return accum.toString();
}
/** * Extract Date + ID + No * Ex: " 15/02/14(六)07:14:32 ID:F.OqpZFA No.6135732" * @return Post */ private Post extractIDString(Post post, TextNode node) { Pattern r = Pattern.compile("(\\d{2})/(\\d{2})/(\\d{2}).+?(\\d{2}):(\\d{2}):(\\d{2}) ID:([\\./0-9A-Za-z]+?) No\\.(\\d+)"); Matcher m = r.matcher(node.text()); if (m.find()) { Integer Y = Integer.parseInt(m.group(1)) + 2000, //year M = Integer.parseInt(m.group(2)) - 1, //month D = Integer.parseInt(m.group(3)), //day H = Integer.parseInt(m.group(4)), //hours I = Integer.parseInt(m.group(5)), //minutes S = Integer.parseInt(m.group(6)); //seconds Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("Asia/Taipei")); cal.set(Y, M, D, H, I, S); post.date = cal; post.tripId = m.group(7); post.no = m.group(8); } return post; }
private void addView(StringBuilder sb, Node node) { int preSBLen = sb.length(); for (Node subNode : node.childNodes()) { String subNodeName = subNode.nodeName(); if ("img".equals(subNodeName)) { if (sb.length() > 0) { removeLastUselessChars(sb);// 移除最后两个回车符 if (sb.length() > 0) { mView.addTextToContent(sb.toString()); sb.delete(0, sb.length()); } preSBLen = 0; } String link = subNode.attributes().get("src"); mView.addImageToContent(link); mImageUrls.add(link); } else if ("#text".equals(subNodeName)) { sb.append(((TextNode) subNode).text()); } else { addView(sb, subNode); } } if (sb.length() - preSBLen > 0 && "p".equals(node.nodeName())) { sb.append("\n\n"); } }
/**
 * Converts a jsoup element and its subtree into a list of output elements.
 * <p>
 * A {@code br} becomes a single break element. Otherwise child elements are converted
 * recursively and child text nodes are converted with {@code createTextElement}; the
 * collected results are then wrapped as list elements for {@code li}, as one paragraph
 * element for {@code p}, or returned as-is for any other tag.
 *
 * @param element jsoup element to convert
 * @param arPr    passed through to the text-element factory and recursive calls
 * @param apPr    passed through to the list/paragraph factories and recursive calls
 * @param slide   passed through to the text-element factory and recursive calls
 * @return the converted elements
 * @throws IOException declared by this method; propagated from the recursive calls
 */
private List<Element> process(final org.jsoup.nodes.Element element, final Element arPr, final Element apPr, final Slide slide) throws IOException {
    if (BR_TAG.equals(element.tagName())) {
        final Element lineBreak = new Element(PPTXDocument.BR_ELEMENT, getDrawingmlNamespace());
        return Arrays.asList(lineBreak);
    }

    final List<org.jsoup.nodes.Element> tags = getAllTags(element);
    final List<Element> converted = new ArrayList<>();

    for (Node child : element.childNodes()) {
        if (child instanceof org.jsoup.nodes.Element) {
            final org.jsoup.nodes.Element childElement = (org.jsoup.nodes.Element) child;
            converted.addAll(process(childElement, arPr, apPr, slide));
        } else if (child instanceof TextNode) {
            converted.add(createTextElement(tags, arPr, (TextNode) child, slide));
        }
    }

    if (LI_TAG.equals(element.tagName())) {
        return createListElements(tags, converted, apPr, element);
    }
    if (P_TAG.equals(element.tagName())) {
        return Arrays.asList(createParagraphElement(converted, apPr));
    }
    return converted;
}
/** * A recursive function that converts an element and its children, creating spans as * required. * * @param element The element to convert */ public void convert(Element element) { // Begin the span handleStartTag(element); // Process the intermediate nodes List<Node> nodes = element.childNodes(); for (Node node : nodes) { if (node instanceof Element) { // Recursively convert element nodes convert((Element) node); } else if (node instanceof TextNode) { // Add the text to the span characters(((TextNode) node).getWholeText()); } } // End the span handleEndTag(element); }
private String getTableDataValue( Element tdNode ) { //return tdNode.html(); StringBuffer buf = new StringBuffer(); List<Node> childNodes = tdNode.childNodes(); for ( Node tdChild : childNodes ) { if ( tdChild instanceof TextNode ) { buf.append( ( (TextNode) tdChild ).text() ); } else if ( tdChild instanceof Element ) { Element tdChildElement = (Element) tdChild; if ( "br".equals( tdChildElement.tagName() ) ) { buf.append( "<br />" ); } } } return buf.toString(); }
/**
 * Registers a text node unless its trimmed text is empty.
 * <p>
 * The node is appended to {@code tNodeList}, its xpath is stored in {@code xpathMap},
 * and a new {@code CountInfo} for it is added to the list kept in {@code countMap}
 * under that xpath (the list is created on first use).
 *
 * @param tNode the text node to register
 */
private void addTextNode(TextNode tNode) {
    boolean blank = tNode.text().trim().isEmpty();
    if (blank) {
        return;
    }

    String xpath = JsoupHelper.getXpath(tNode);
    tNodeList.add(tNode);
    xpathMap.put(tNode, xpath);

    CountInfo countInfo = new CountInfo(tNode);
    ArrayList<CountInfo> bucket = countMap.get(xpath);
    if (bucket == null) {
        bucket = new ArrayList<CountInfo>();
        countMap.put(xpath, bucket);
    }
    bucket.add(countInfo);
}
public void head(Node source, int depth) { if (source instanceof Element) { Element sourceEl = (Element) source; if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs ElementMeta meta = createSafeElement(sourceEl); Element destChild = meta.el; destination.appendChild(destChild); numDiscarded += meta.numAttribsDiscarded; destination = destChild; } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded. numDiscarded++; } } else if (source instanceof TextNode) { TextNode sourceText = (TextNode) source; TextNode destText = new TextNode(sourceText.getWholeText()); destination.appendChild(destText); } else if (source instanceof DataNode && whitelist.isSafeTag(source.parent().nodeName())) { DataNode sourceData = (DataNode) source; DataNode destData = new DataNode(sourceData.getWholeData()); destination.appendChild(destData); } else { // else, we don't care about comments, xml proc instructions, etc numDiscarded++; } }
/**
 * Serializes an element into {@code sb} as predictable HTML: attributes are written in
 * alphabetical order with single-quoted values, and a closing tag is always emitted.
 * <p>
 * Child elements are serialized recursively, child text nodes are written trimmed, and
 * other node types are skipped. Attribute values and text are written as returned by
 * jsoup, without additional escaping.
 *
 * @param producedElem element to serialize
 * @param sb           buffer to append to
 * @return the full content of {@code sb} after this element has been appended
 */
private String elementToHtml(Element producedElem, StringBuilder sb) {
    ArrayList<String> sortedKeys = new ArrayList<String>();
    for (Attribute attribute : producedElem.attributes().asList()) {
        sortedKeys.add(attribute.getKey());
    }
    Collections.sort(sortedKeys);

    String tag = producedElem.tagName();

    // Opening tag with sorted attributes.
    sb.append("<").append(tag);
    for (String key : sortedKeys) {
        sb.append(" ").append(key).append("=");
        sb.append("\'").append(producedElem.attr(key)).append("\'");
    }
    sb.append(">");

    // Content.
    for (Node child : producedElem.childNodes()) {
        if (child instanceof Element) {
            elementToHtml((Element) child, sb);
        } else if (child instanceof TextNode) {
            sb.append(((TextNode) child).text().trim());
        }
    }

    // Closing tag, even for elements without content.
    sb.append("</").append(tag).append(">");
    return sb.toString();
}
/** * Parst eine "Nachrichten zum Tag"-Tabelle aus Untis-Vertretungsplänen * * @param table * das <code>table</code>-Element des HTML-Dokuments, das geparst * werden soll * @param data * Daten von der Schule (aus <code>Schule.getData()</code>) * @param tag * der {@link VertretungsplanTag} in dem die Nachrichten * gespeichert werden sollen */ protected void parseNachrichten(Element table, JSONObject data, VertretungsplanTag tag) { Elements zeilen = table .select("tr:not(:contains(Nachrichten zum Tag))"); for (Element i : zeilen) { Elements spalten = i.select("td"); String info = ""; for (Element b : spalten) { info += "\n" + TextNode.createFromEncoded(b.html(), null) .getWholeText(); } info = info.substring(1); // remove first \n tag.getNachrichten().add(info); } }
/**
 * Truncates text nodes so that the total text length seen so far does not exceed
 * {@code maxTextLen}.
 * <ul>
 * <li>Once the limit has been reached, further text nodes are emptied.</li>
 * <li>The text node that crosses the limit is cut and {@code ending} is appended; unless
 *     {@code killwords} is set, the cut position is moved using
 *     {@code Functions.movePointerToJustBeforeLastWord}.</li>
 * <li>Text nodes that fit entirely only increase the running length.</li>
 * </ul>
 * Non-text nodes are ignored.
 *
 * @param node  the node being entered
 * @param depth depth of the node in the tree (not used by this method)
 */
@Override
public void head(Node node, int depth) {
    if (!(node instanceof TextNode)) {
        return;
    }
    TextNode text = (TextNode) node;
    String content = text.text();

    if (textLen >= maxTextLen) {
        // Limit already reached earlier: nothing more may be shown.
        text.text("");
        return;
    }

    if (textLen + content.length() <= maxTextLen) {
        // Fits completely.
        textLen += content.length();
        return;
    }

    // This node crosses the limit: cut it and mark the limit as reached.
    int cut = maxTextLen - textLen;
    if (!killwords) {
        cut = Functions.movePointerToJustBeforeLastWord(cut, content) - 1;
    }
    text.text(content.substring(0, cut) + ending);
    textLen = maxTextLen;
}
/**
 * Extracts text from the {@code span} elements of a document.
 * <p>
 * After {@code baseCleaner.intitialCleanse(doc)}, each span has its single-space text
 * nodes removed. For spans whose children contain consecutive {@code br} elements (as
 * judged by the base cleaner), every {@code br} inside the span is replaced by a
 * "LINEBREAK" marker text node; the span's text is then collected if the span does not
 * consist only of a link and has a promising identifier. In the returned string the
 * markers are replaced by CRLF.
 *
 * @param doc document to clean; it is modified in place
 * @return the collected text
 */
public String cleanHtml(Document doc) {
    StringBuilder cleanText = new StringBuilder();
    baseCleaner.intitialCleanse(doc);

    for (Element span : doc.getElementsByTag("span")) {
        baseCleaner.removeSingleSpaceTextNodes(span);
        if (!baseCleaner.nodesContainConsecutiveBR(span.childNodes())) {
            continue;
        }

        // Turn breaks into markers so they survive Element.text().
        for (Element br : span.getElementsByTag("br")) {
            br.replaceWith(new TextNode("LINEBREAK", null));
        }

        boolean linkOnly = baseCleaner.elementOnlyContainLink(span);
        if (!linkOnly && baseCleaner.elementHasPromisingIdentifier(span)) {
            cleanText.append(span.text());
        }
    }

    return cleanText.toString().replaceAll("LINEBREAK", "\r\n");
}
/**
 * Extracts text from the {@code div} elements of a document.
 * <p>
 * The document is first passed to {@code baseCleaner.intitialCleanse}. Each div then has
 * its single-space text nodes removed; divs whose children contain consecutive
 * {@code br} elements (as judged by the base cleaner) get every contained {@code br}
 * replaced by a "LINEBREAK" marker text node, and their text is collected when the div
 * is not link-only and has a promising identifier. The markers become CRLF in the
 * result.
 *
 * @param doc document to clean; it is modified in place
 * @return the collected text
 */
public String cleanHtml(Document doc) {
    baseCleaner.intitialCleanse(doc);

    StringBuilder collected = new StringBuilder();
    for (Element div : doc.getElementsByTag("div")) {
        baseCleaner.removeSingleSpaceTextNodes(div);

        boolean hasBreakRun = baseCleaner.nodesContainConsecutiveBR(div.childNodes());
        if (hasBreakRun) {
            // Markers keep the break positions visible in Element.text().
            for (Element br : div.getElementsByTag("br")) {
                br.replaceWith(new TextNode("LINEBREAK", null));
            }
            if (!baseCleaner.elementOnlyContainLink(div)
                    && baseCleaner.elementHasPromisingIdentifier(div)) {
                collected.append(div.text());
            }
        }
    }

    String withMarkers = collected.toString();
    return withMarkers.replaceAll("LINEBREAK", "\r\n");
}
/** * Creates new Epublines and detects HTML-Linebreaks in the text. * * @param chapter * @param chapterElement * @param mode */ private void addEpubline(List<Epubline> chapter, Element chapterElement, String mode) { String writeNext = ""; List<TextNode> textNodes = chapterElement.textNodes(); int textIndex = 0; for (Node node : chapterElement.childNodes()) { if ("#text".equals(node.nodeName().trim())) { // text node -> add test writeNext = writeNext.concat(textNodes.get(textIndex).text()); textIndex++; } else if ("br".equals(node.nodeName().trim())) { // break -> make a new line chapter.add(new Epubline(mode, writeNext, "")); writeNext = ""; } } if (!"".equals(writeNext)) { chapter.add(new Epubline(mode, writeNext, "")); } }
/** * maps the given elemen e to its rtf type. when the rtf element has * subelement they are given in the childs array * * @param e * the current html node for which a rtf element should be * created * @param childs * the rtf child elements if any * @return an rtf child element */ private Object getRtfTNode(Node node, ElementContainer childs) { final String name = node.nodeName().toLowerCase(); Object ret = null; if (node instanceof TextNode) { ret = ((TextNode) node).text(); } else if (node instanceof Element) { if (name.equals("p")) return childs; NodeHandler<ElementContainer, Object> handler = handlers.get(name); //TODO better use a NoOpHandler that ignores the tag and log it? if (handler == null) throw new RuntimeException("WTF? Don't know this tag: " + name); ret = handler.handle(childs); } return ret; }
/**
 * Creates the converter: initializes the text-node-to-element map (insertion ordered),
 * parses an empty HTML document and keeps its body, and registers the element parsers
 * for {@code font}, {@code b}, {@code i} and {@code u}, in that order.
 */
public Rtf2Html() {
    super();

    entries = new LinkedHashMap<TextNode, Element>();
    parserItems = new ArrayList<RtfElementParser>();

    // Start from an empty document; converted nodes are attached to this body.
    body = Jsoup.parse("<body></body>").body();

    // Registration order is kept as-is.
    parserItems.add(new FontElementParser("font"));
    parserItems.add(new BooleanElementParser("b", StyleConstants.Bold));
    parserItems.add(new BooleanElementParser("i", StyleConstants.Italic));
    parserItems.add(new BooleanElementParser("u", StyleConstants.Underline));
}
private void addToMap(Element element, Document document) { int i = element.getStartOffset(); int j = element.getEndOffset(); String s; try { s = document.getText(i, j - i); // if (s.trim().isEmpty()) // return; org.jsoup.nodes.TextNode n = new org.jsoup.nodes.TextNode(s, ""); body.appendChild(n); entries.put(n, element); } catch (BadLocationException e) { // TODO Auto-generated catch block e.printStackTrace(); } }