public static void main(String[] args) { String d = "<span><div>test</div></span>"; Document doc = Jsoup.parse(d); Element div = doc.select("div").first(); // <div></div> div.html("<p>lorem ipsum</p>"); // <div><p>lorem ipsum</p></div> div.prepend("<p>First</p>"); div.append("<p>Last</p>"); // now: <div><p>First</p><p>lorem ipsum</p><p>Last</p></div> div.appendElement(d); Element span = doc.select("span").first(); // <span>One</span> span.wrap("<li><a href='http://example.com/'></a></li>"); // now: <li><a href="http://example.com"><span>One</span></a></li> System.out.println(doc.html()); String s = Jsoup.clean(doc.html(), "", Whitelist.relaxed(), new OutputSettings().prettyPrint(false)); System.out.println(s); }
/**
 * Cleans one sample anchor tag with each of the built-in whitelist presets
 * (none, simpleText, basic, basicWithImages, relaxed) and prints every result,
 * separated by a line of asterisks.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String baseUri = "http://www.baidu.com";
    String html = "<a href=\"http://www.baidu.com/gaoji/preferences.html\"name=\"tj_setting\">搜索设置</a>";

    Whitelist[] presets = {
        Whitelist.none(),
        Whitelist.simpleText(),
        Whitelist.basic(),
        Whitelist.basicWithImages(),
        Whitelist.relaxed()
    };

    for (int i = 0; i < presets.length; i++) {
        // The separator goes between results only, never after the last one.
        if (i > 0) {
            System.out.println("*******");
        }
        System.out.println(Jsoup.clean(html, baseUri, presets[i]));
    }
}
/**
 * Builds a reply from the DTO on behalf of {@code user}, sanitizes its content
 * with the basic-with-images whitelist, persists it and runs the
 * post-creation hook.
 *
 * @param replyDTO incoming reply data; its user id is overwritten with {@code user}'s id
 * @param user     the author of the reply
 * @return the value returned by {@code replyRepository.save}
 */
public Reply createReply(ReplyDTO replyDTO, User user) {
    replyDTO.setUserId(user.getId());

    Reply reply = replyDTO.toReply();
    String sanitized = Jsoup.clean(reply.getContent(), Whitelist.basicWithImages());
    reply.setContent(updateAtUser(sanitized));
    reply.setStatus(ReplyStatus.ACTIVE);

    Reply saved = replyRepository.save(reply);

    // The user is attached only after saving, and to the pre-save instance,
    // which is also the one handed to the hook.
    reply.setUser(user);
    afterCreatingReply(reply);
    return saved;
}
/**
 * Reduces {@code input} to a short single-line value of at most 100 characters.
 * <p>
 * For the {@code "html"} format the markup is stripped (each {@code <br>}
 * becomes a space), entities are unescaped and all CR/LF characters are
 * removed. For any other format the input is trimmed and cut at the first
 * line terminator.
 *
 * @param dataFormat format of {@code input}; only {@code "html"} is treated specially
 * @param input      the raw value (must not be {@code null})
 * @return the cleaned value, truncated to 100 characters
 */
private String trimValue(String dataFormat, String input) {
    String cleaned = null;
    if ("html".equals(dataFormat)) {
        Document document = Jsoup.parse(input);
        if (document != null) {
            document.outputSettings(new Document.OutputSettings().prettyPrint(false));
            // Mark every <br> with a literal backslash-n so the break survives
            // tag stripping; the marker is turned into a space below.
            document.select("br").append("\\n");
            String marked = document.html().replaceAll("\\\\n", " ");
            String stripped = Jsoup.clean(marked, "", Whitelist.none(),
                    new Document.OutputSettings().prettyPrint(false));
            String s = org.jsoup.parser.Parser.unescapeEntities(stripped, false);
            if (s != null) {
                cleaned = s.trim().replace("\r", "").replace("\n", "");
            }
        }
    } else {
        cleaned = input.trim();
        if (this.eolPattern == null) {
            // Inside a character class '|' is a literal, so the former
            // "[\r|\n]" also cut the value at the first pipe character.
            this.eolPattern = Pattern.compile("[\r\n]");
        }
        Matcher m = this.eolPattern.matcher(cleaned);
        if (m.find()) {
            cleaned = cleaned.subSequence(0, m.start()).toString();
        }
    }
    String result = cleaned != null ? cleaned : input;
    return result.length() > 100 ? result.substring(0, 100) : result;
}
/**
 * Checks whether {@code content}, with all HTML stripped, contains any word
 * from the sensitive-word file at {@code source}.
 *
 * @param userId  not used by this method
 * @param content the (possibly HTML) text to check
 * @return {@code 0} if a sensitive word is found among the analysed terms, {@code 1} otherwise
 * @throws IOException if the word file cannot be opened or read
 */
public int checkTextContent(int userId, String content) throws IOException {
    HashSet<String> sensitiveWords = new HashSet<String>();

    // try-with-resources closes the whole reader chain even when reading fails.
    try (BufferedReader br = new BufferedReader(
            new InputStreamReader(new FileInputStream(source), Charset.forName("UTF-8")))) {
        String line;
        while ((line = br.readLine()) != null) {
            // An empty line would make substring(0, -1) throw.
            if (line.isEmpty()) {
                continue;
            }
            // Each entry is stored without its last character
            // (presumably a trailing delimiter in the file; confirm against the data).
            sensitiveWords.add(line.substring(0, line.length() - 1));
        }
    }

    Result result = ToAnalysis.parse(Jsoup.clean(content, Whitelist.none()));
    List<Term> termList = result.getTerms();
    for (Term term : termList) {
        if (sensitiveWords.contains(term.getName())) {
            return 0;
        }
    }
    return 1;
}
/** * Cleans the html content leaving only the following tags: b, em, i, strong, u, br, cite, em, i, p, strong, img, li, ul, ol, sup, sub, s * @param content html content * @param extraTags any other tags that you may want to keep, e. g. "a" * @return */ public String cleanContent(String content, String ... extraTags) { Whitelist allowedTags = Whitelist.simpleText(); // This whitelist allows only simple text formatting: b, em, i, strong, u. All other HTML (tags and attributes) will be removed. allowedTags.addTags("br", "cite", "em", "i", "p", "strong", "img", "li", "ul", "ol", "sup", "sub", "s"); allowedTags.addTags(extraTags); allowedTags.addAttributes("p", "style"); // Serve per l'allineamento a destra e sinistra allowedTags.addAttributes("img", "src", "style", "class"); if (Arrays.asList(extraTags).contains("a")) { allowedTags.addAttributes("a", "href", "target"); } Document dirty = Jsoup.parseBodyFragment(content, ""); Cleaner cleaner = new Cleaner(allowedTags); Document clean = cleaner.clean(dirty); clean.outputSettings().escapeMode(EscapeMode.xhtml); // Non fa l'escape dei caratteri utf-8 String safe = clean.body().html(); return safe; }
/**
 * Produces a chat-bot answer for a message that matched no known command.
 *
 * @param message  the user's message
 * @param userName stored in {@code MagicStrings.default_Customer_id} before responding
 * @return a fixed apology when the chat session is missing or the answer is
 *         null or mentions "google" / {@code <search>}; otherwise the answer
 *         with every {@code <br/>} replaced by a newline
 */
public String getUnkownCommandResponse(String message,String userName) {
    if (chatSession == null) {
        return "Sorry I did not recognize your command and the AI functions are disabled";
    }

    MagicStrings.default_Customer_id = userName;
    String reply = chatSession.multisentenceRespond(message);

    boolean unusable = reply == null
            || reply.toLowerCase().contains("google")
            || reply.contains("<search>");
    if (unusable) {
        return "Sorry, I do not know";
    }

    // Long answers without a line break are sanitized and prefixed.
    boolean longSingleLine = reply.length() > 250 && !reply.contains("\n");
    if (longSingleLine) {
        reply = "Well\n" + Jsoup.clean(reply, Whitelist.basic());
    }
    return reply.replace("<br/>", "\n");
}
/**
 * Extracts the quote of the day and its author from the JSON response
 * ({@code parse.text.*} holds the HTML), shows a "refreshing" snackbar when
 * either differs from the stored quote, stores the new values and updates
 * the display.
 *
 * @param response JSON containing {@code parse -> text -> "*"}
 * @throws JSONException if an expected key is missing
 */
private void parseQodResponse(JSONObject response) throws JSONException {
    String content = response.getJSONObject("parse").getJSONObject("text").getString("*");

    Document doc = Jsoup.parse(content);
    Elements rows = doc.select("table[style=\"text-align:center; width:100%\"]").select("tr");
    // Row 0 carries the quote, row 1 the author.
    Elements quoteCells = rows.get(0).select("td");
    Elements authorCells = rows.get(1).select("td");

    Whitelist noTags = Whitelist.none();
    String newQuote = Html.fromHtml(Jsoup.clean(quoteCells.toString(), noTags)).toString();
    String authorText = Jsoup.clean(authorCells.toString(), noTags).replace("~", "");
    String newAuthor = Html.fromHtml(authorText).toString();

    Quote qod = sharedPrefStorage.getQod();
    boolean unchanged = qod.getQuoteText().equals(newQuote) && qod.getQuoteAuthor().equals(newAuthor);
    if (!unchanged) {
        Snackbar.make(binding.coordinatorLayout, getString(R.string.str_Refreshing), Snackbar.LENGTH_SHORT).show();
    }

    sharedPrefStorage.setQodText(newQuote);
    sharedPrefStorage.setQodAuthor(newAuthor);
    // NOTE(review): qod was fetched before the setters above ran; confirm it
    // reflects the new text/author when it is displayed here.
    setQuoteOfTheDayTextAndAuthor(qod);
}
public TRECAquaintDocumentIndexer(String indexPath, String tokenFilterFile, boolean positional){ super(indexPath, tokenFilterFile, positional); try { whiteList = Whitelist.relaxed(); whiteList.addTags("docno"); whiteList.addTags("doc"); whiteList.addTags("headline"); whiteList.addTags("text"); whiteList.addTags("date_time"); whiteList.addTags("slug"); } catch (Exception e){ System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } doc = new Document(); initFields(); initAQUAINTDoc(); }
private String getJavadocCommentAsText(IMember member) { try (Reader reader = JavadocContentAccess.getHTMLContentReader(member, true, true)) { if (reader == null) { return null; } String javadocAsHtml = CharStreams.toString(reader); String javadocAsString = Jsoup.clean(javadocAsHtml, "", Whitelist.none(), new OutputSettings().prettyPrint(false)); // trim lines try (BufferedReader bufferedReader = new BufferedReader(new StringReader(javadocAsString))) { return bufferedReader.lines().map(line->line.trim()).collect(Collectors.joining("\n")); } } catch (JavaModelException | IOException e) { return null; } }
/** * Writes the contents of the create or edit form into the persistent object. * Assumes that the form has already been validated. * Also processes rich-text (HTML) fields by cleaning the submitted HTML according * to the {@link #getWhitelist() whitelist}. */ protected void writeFormToObject() { form.writeToObject(object); for(TextField textField : FormUtil.collectEditableRichTextFields(form)) { //TODO in bulk edit mode, the field should be skipped altogether if the checkbox is not checked. PropertyAccessor propertyAccessor = textField.getPropertyAccessor(); String stringValue = (String) propertyAccessor.get(object); String cleanText; try { Whitelist whitelist = getWhitelist(); cleanText = Jsoup.clean(stringValue, whitelist); } catch (Throwable t) { logger.error("Could not clean HTML, falling back to escaped text", t); cleanText = StringEscapeUtils.escapeHtml(stringValue); } propertyAccessor.set(object, cleanText); } }
protected String cleaner(String rs) { rs = rs.replace(" 廣告",""); rs = rs.replace("data-original=","src="); //rs = rs.replace("<span>","<p>"); //rs = rs.replace("</span>","</p>"); rs = rs.replace("相關新聞", "<!--"); Whitelist wlist = new Whitelist(); wlist.addTags("p", "span"); wlist.addTags("table","tbody","tr","td"); wlist.addTags("img").addAttributes("img", "src"); return Jsoup.clean(rs, wlist); }
protected String cleaner(String rs) { rs = rs.replace("src=\"/cnt", "src=\"http://orientaldaily.on.cc/cnt"); rs = rs.replace("<h3>","<p><b>"); rs = rs.replace("</h3>","</b></p>"); rs = rs.replace("<!--AD-->","<!--"); rs = rs.replace("<div id=\"articleNav\">","<!--"); Whitelist wlist = new Whitelist(); wlist.addTags("p","b"); //wlist.addTags("table","tbody","tr","td"); wlist.addTags("img").addAttributes("img", "src"); return Jsoup.clean(rs, wlist); }
/**
 * Removes the article-end image URL from the raw HTML and then cleans it,
 * keeping only p, table, tbody, tr, td and img (with src).
 *
 * @param rs raw article HTML
 * @return the cleaned HTML
 */
protected String cleaner(String rs) {
    String prepared = rs.replace("https://staticlayout.appledaily.hk/web_images/layout/art_end.gif", "");

    Whitelist allowed = new Whitelist()
            .addTags("p")
            .addTags("table", "tbody", "tr", "td")
            .addTags("img")
            .addAttributes("img", "src");

    return Jsoup.clean(prepared, allowed);
}
public String stripXSS( String value ) { if( value != null ) { System.out.println("STRIP XSS -> ["+value+"]"); // Use the ESAPI library to avoid encoded attacks. value = ESAPI.encoder().canonicalize( value ); // Avoid null characters value = value.replaceAll("\0", ""); // Clean out HTML value = Jsoup.clean(value, Whitelist.none()); System.out.println("STRIPED XSS -> ["+value+"]"); } return value; }
/** * This method removes all html markup from the supplied string. * @param value The string containing possible html tags. * @return The string without html tags. */ private String stripXSS( String value ) { if( value != null ) { // System.out.println("STRIP XSS -> ["+value+"]"); // Use the ESAPI library to avoid encoded attacks. value = ESAPI.encoder().canonicalize( value ); //ESAPI.encoder().encodeForHTML() // Avoid null characters value = value.replaceAll("\0", ""); // Clean out HTML //This clean, removes all html tags. so instead of <script>, it simple removes the <script> tag. value = Jsoup.clean(value, Whitelist.none()); //System.out.println("STRIPED XSS -> ["+value+"]"); } return value; }
/**
 * Copies the create/edit form into the persistent object; the form is
 * assumed to be validated already. The submitted value of each editable
 * rich-text (HTML) field is then cleaned according to the
 * {@link #getWhitelist() whitelist} and written to the object. If cleaning
 * fails, the HTML-escaped text is stored instead.
 */
protected void writeFormToObject() {
    form.writeToObject(object);

    for (TextField richTextField : getEditableRichTextFields()) {
        PropertyAccessor accessor = richTextField.getPropertyAccessor();
        String rawHtml = richTextField.getStringValue();

        String safeHtml;
        try {
            safeHtml = Jsoup.clean(rawHtml, getWhitelist());
        } catch (Throwable t) {
            logger.error("Could not clean HTML, falling back to escaped text", t);
            safeHtml = StringEscapeUtils.escapeHtml(rawHtml);
        }
        accessor.set(object, safeHtml);
    }
}
/**
 * Sets up the sanitize demo screen: pre-fills the input with a sample link
 * carrying an {@code onclick} handler and, when the button is pressed, shows
 * the input cleaned with the basic whitelist in the output field.
 *
 * @param savedInstanceState forwarded to the superclass
 */
@Override
protected void onCreate(Bundle savedInstanceState) {
    super.onCreate(savedInstanceState);
    setContentView(R.layout.activity_sanitize);

    final EditText source = (EditText) findViewById(R.id.input_text);
    source.setText("<p><a href='http://example.com/' onclick='doAttack()'>Link</a></p>");

    final EditText target = (EditText) findViewById(R.id.sanitized_text);

    OnClickListener sanitizeOnClick = new OnClickListener() {
        @Override
        public void onClick(View v) {
            String dirty = source.getText().toString();
            target.setText(Jsoup.clean(dirty, Whitelist.basic()));
        }
    };
    findViewById(R.id.sanitize_button).setOnClickListener(sanitizeOnClick);
}
public static void main(String[] args) { Document test = Jsoup.parse("test"); System.out.println(test); //没用 boolean test1 = Jsoup.isValid("test", Whitelist.none()); System.out.println(test1); Document document = Jsoup.parse(null); System.out.println(document); }
/**
 * Builds the sanitizer: a cleaner based on the relaxed whitelist, extended
 * with the {@code font} tag, table layout attributes, {@code class}/
 * {@code style}/{@code id} on every tag and the http, https, cid and data
 * protocols for image sources, plus a head cleaner.
 */
HtmlSanitizer() {
    Whitelist allowed = Whitelist.relaxed();
    allowed.addTags("font");
    allowed.addAttributes("table", "align", "bgcolor", "border", "cellpadding", "cellspacing", "width");
    allowed.addAttributes(":all", "class", "style", "id");
    allowed.addProtocols("img", "src", "http", "https", "cid", "data");

    cleaner = new Cleaner(allowed);
    headCleaner = new HeadCleaner();
}
public String getDescription(String page) { try { // Fetch the image page Response resp = Http.url(page) .referrer(this.url) .response(); cookies.putAll(resp.cookies()); // Try to find the description Elements els = resp.parse().select("td[class=alt1][width=\"70%\"]"); if (els.size() == 0) { logger.debug("No description at " + page); throw new IOException("No description found"); } logger.debug("Description found!"); Document documentz = resp.parse(); Element ele = documentz.select("td[class=alt1][width=\"70%\"]").get(0); // This is where the description is. // Would break completely if FurAffinity changed site layout. documentz.outputSettings(new Document.OutputSettings().prettyPrint(false)); ele.select("br").append("\\n"); ele.select("p").prepend("\\n\\n"); logger.debug("Returning description at " + page); String tempPage = Jsoup.clean(ele.html().replaceAll("\\\\n", System.getProperty("line.separator")), "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false)); return documentz.select("meta[property=og:title]").attr("content") + "\n" + tempPage; // Overridden saveText takes first line and makes it the file name. } catch (IOException ioe) { logger.info("Failed to get description " + page + " : '" + ioe.getMessage() + "'"); return null; } }
public String stripHtmlFrom(String original) { // If there's no content, return to sender unopened if( Check.isEmpty(original) ) { return original; } String cleaned = Jsoup.clean(original, Whitelist.simpleText()); return cleaned; }
@Transactional @Timed @Caching(evict = { @CacheEvict(value = TagService.CACHE_COUNT_USER, key = "#postDTO.userId.toString().concat('_posts_count')"), @CacheEvict(value = CACHE_COUNT_USER_TAG_POSTS, key = "#postDTO.userId.toString().concat('_tags_posts_count')", allEntries = true), }) public Optional<Post> createPost(PostDTO postDTO) throws JSONException { Post post = postMapper.postDTOToPost(postDTO); String result = getWebPost(String.format(SERVER_URL, post.getUrl())); if (result == null) return Optional.empty(); JSONObject json = new JSONObject(result); String content = json.getString("content"); // Filter html tags content = Jsoup.clean(content, Whitelist.relaxed()); post.setTitle(json.getString("title")); post.setTitle(post.getTitle().substring(0, Math.min(255, post.getTitle().length()))); post.setContent(content); post.setDomain(json.getString("host")); updateTags(post, null); saveNewPost(post); return Optional.of(post); }
/**
 * Fetches the full text of a post from its source site, sanitizes it with the
 * relaxed whitelist and saves it on the stored copy of the post. Any failure
 * is logged and swallowed.
 *
 * @param post the post whose article is crawled
 */
public void getArticle(Post post) {
    log.debug("Handle crawling article full text from source site, id=" + post.getId() + " , url=" + post.getUrl());
    try {
        String fetched = postService.getWebPost(String.format(PostService.SERVER_URL, post.getUrl()));
        if (fetched == null) {
            log.error("Failed to get article full text, id=" + post.getId());
            return;
        }

        // Re-read the post: it may have been removed while the article was fetched.
        Post stored = postRepository.findOne(post.getId());
        if (stored == null) {
            log.warn("Cancel crawling article full text of post id=" + post.getId() + ", because the post does not exist.");
            return;
        }

        String rawContent = new JSONObject(fetched).getString("content");
        stored.setContent(Jsoup.clean(rawContent, Whitelist.relaxed()));
        postService.saveNewPost(stored);
    } catch (Exception e) {
        log.error("Failed to resolve article full text, id=" + post.getId() + ", url=" + post.getUrl() + ", exception: " + e.getMessage());
    }
}
/**
 * Stores the contributed extensions and HTML transformers and builds the
 * whitelist used for rendered markdown. The whitelist accepts every
 * {@code data-*} attribute in addition to the explicitly listed tags,
 * attributes and URL protocols, and preserves relative links.
 *
 * @param contributedExtensions markdown extensions to keep
 * @param htmlTransformers      HTML transformers to keep
 */
@Inject
public DefaultMarkdownManager(Set<Extension> contributedExtensions, Set<HtmlTransformer> htmlTransformers) {
    this.contributedExtensions = contributedExtensions;
    this.htmlTransformers = htmlTransformers;

    whiteList = new Whitelist() {

        @Override
        protected boolean isSafeAttribute(String tagName, Element el, Attribute attr) {
            // data-* attributes are always accepted; everything else follows the normal rules.
            return attr.getKey().startsWith("data-") || super.isSafeAttribute(tagName, el, attr);
        }

    };

    whiteList.addTags(SAFE_TAGS);

    whiteList.addAttributes("a", "href", "title");
    whiteList.addAttributes("img", "align", "alt", "height", "src", "title", "width");
    whiteList.addAttributes("div", "itemscope", "itemtype");
    whiteList.addAttributes(":all", SAFE_ATTRIBUTES);

    whiteList.addProtocols("a", "href", SAFE_ANCHOR_SCHEMES);
    whiteList.addProtocols("blockquote", "cite", "http", "https");
    whiteList.addProtocols("cite", "cite", "http", "https");
    whiteList.addProtocols("img", "src", "http", "https");
    whiteList.addProtocols("q", "cite", "http", "https");

    whiteList.preserveRelativeLinks(true);
}
/**
 * Builds the class info for {@code cls} from its canonical name and its
 * runtime Javadoc comment, sanitized with the basic whitelist. A class
 * without Javadoc gets an empty comment.
 *
 * @param cls the class to describe
 * @return the class info
 */
public static SimClassInfo forClass(Class<?> cls) {
    String rawComment = RuntimeJavadoc.getJavadoc(cls)
            .map(ClassJavadoc::getComment)
            .map(Comment::toString)
            .orElse("");
    String safeComment = Jsoup.clean(rawComment, Whitelist.basic());
    return new SimClassInfo(cls.getCanonicalName(), safeComment);
}
/**
 * Extracts the first {@code div[dir='ltr']} of the e-mail HTML, drops its
 * trailing {@code <br>} children and cleans the remainder with the basic
 * whitelist.
 *
 * @param content the e-mail HTML
 * @return the cleaned content, or an empty string when no such div exists
 */
@Override
protected String formatHtml(String content) {
    Element contentDiv = Jsoup.parse(content).body().select("div[dir='ltr']").first();
    if (contentDiv == null) {
        log.warn("Found no valid content in e-mail from Gmail, returning empty");
        return "";
    }

    // Strip trailing line breaks; last() yields null once no children remain.
    Element trailing = contentDiv.children().last();
    while (trailing != null && trailing.is("br")) {
        trailing.remove();
        trailing = contentDiv.children().last();
    }

    return Jsoup.clean(contentDiv.html(), Whitelist.basic());
}
/**
 * Removes quoted material ({@code blockquote[cite]} and
 * {@code div.moz-cite-prefix}) and trailing {@code <br>} children from the
 * e-mail body, then cleans the remainder with the basic whitelist.
 *
 * @param content the e-mail HTML
 * @return the cleaned body HTML
 */
@Override
protected String formatHtml(String content) {
    Element body = Jsoup.parse(content).body();

    // Drop the quoted original message and its prefix line.
    body.select("blockquote[cite]").remove();
    body.select("div.moz-cite-prefix").remove();

    // Strip trailing line breaks; last() yields null once no children remain.
    Element trailing = body.children().last();
    while (trailing != null && trailing.is("br")) {
        trailing.remove();
        trailing = body.children().last();
    }

    return Jsoup.clean(body.html(), Whitelist.basic());
}
/**
 * Requests the console log view for the given subscription and returns the
 * response body with all HTML removed (no pretty-printing). A rate-limiter
 * permit is acquired before the request.
 *
 * @param subId value sent as the {@code SUBID} request parameter
 * @return the console output as plain text
 * @throws IOException if the request fails
 */
@Retryable(backoff = @Backoff(2000L))
public String getServerConsole(String subId) throws IOException {
    rateLimiter.acquire();

    String bodyHtml = validateSessionAndGet(
            Jsoup.connect(SUB_URL)
                    .userAgent(Constants.USER_AGENT)
                    .data("view", "console_log")
                    .data("SUBID", subId)
                    .timeout(TIMEOUT))
            .body()
            .toString();

    Document.OutputSettings compact = new Document.OutputSettings().prettyPrint(false);
    return Jsoup.clean(bodyHtml, "", Whitelist.none(), compact);
}
/** * Wrapper around Jsoup clean method with the basic White list * http://jsoup.org/cookbook/cleaning-html/whitelist-sanitizer * @param unsafe * @return */ public static String sanitizeBasicHTML(String unsafe){ if (unsafe == null){ return null; } // basic includes: a, b, blockquote, br, cite, code, dd, dl, dt, em, i, li, ol, p, pre, q, small, span, strike, strong, sub, sup, u, ul //Whitelist wl = Whitelist.basic().addTags("img", "h1", "h2", "h3", "kbd", "hr", "s", "del"); Whitelist wl = Whitelist.basicWithImages().addTags( "h1", "h2", "h3", "kbd", "hr", "s", "del","map","area").addAttributes("img", "usemap") .addAttributes("map", "name").addAttributes("area", "shape","coords","href","title","alt"); return Jsoup.clean(unsafe, wl); }
/**
 * Sanitizes {@code str} with the basic whitelist and, when the result lacks a
 * line break and has more than {@code MAX_WORDS_PER_LINE} words, re-flows it
 * into lines separated by the platform line separator.
 *
 * @param str the text to sanitize, may be {@code null}
 * @return the sanitized (and possibly re-wrapped) text; an empty string for {@code null}
 */
private String sanitise(String str) {
    if (str == null) {
        return "";
    }

    String _str = Jsoup.clean(str, Whitelist.basic());
    String separator = System.getProperty("line.separator");

    // NOTE(review): this holds when EITHER kind of line break is missing;
    // confirm that "||" rather than "&&" is intended on platforms where the
    // separator is not "\n".
    boolean lacksLineBreak = !_str.contains("\n") || !_str.contains(separator);
    if (!lacksLineBreak) {
        return _str;
    }

    // Split once and reuse the result for both the length check and the re-flow.
    String[] words = StringUtils.split(_str);
    if (words.length <= MAX_WORDS_PER_LINE) {
        return _str;
    }

    // StringBuilder avoids the quadratic cost of String += inside the loop.
    StringBuilder wrapped = new StringBuilder(_str.length() + separator.length());
    int counterPerLine = 0;
    for (String word : words) {
        // The counter is tested with ">" so a line holds MAX_WORDS_PER_LINE + 1 words.
        if (counterPerLine > MAX_WORDS_PER_LINE) {
            wrapped.append(separator);
            counterPerLine = 0;
        }
        wrapped.append(word).append(' ');
        counterPerLine++;
    }
    // The text always ends with a separator after the last word.
    if (words.length > 0) {
        wrapped.append(separator);
    }
    return wrapped.toString();
}
public static void main( String[] args ) { // load html from file Document doc = loadHtmlFromFile("index.html", "utf-8"); // just leave if doc is null if(doc == null) { LogUtils.d(CLS_NAME, "main", "document is null"); return; } /* the dirty html */ System.out.println("===BEFORE==="); System.out.println(doc.html()); /* create and config whitelist */ Whitelist allowList = Whitelist.relaxed(); allowList .addTags("meta", "title", "script", "iframe") .addAttributes("meta", "charset") .addAttributes("iframe", "src") .addProtocols("iframe", "src", "http", "https"); /* clean the dirty doc */ Cleaner cleaner = new Cleaner(allowList); Document newDoc = cleaner.clean(doc); /* the clean one */ System.out.println("===AFTER==="); System.out.println(newDoc.html()); }
/** * Strips any potential XSS threats out of the value * @param value * @return */ public String stripXSS( String value ) { if( value == null ) return null; // Avoid null characters value = value.replaceAll("\0", ""); // Clean out HTML value = Jsoup.clean( value, Whitelist.none() ); return value; }
/**
 * Formats every post (board, id, subject, message cleaned with the
 * simple-text whitelist, and thread URL) into one text and sends it to the
 * chat as a single message.
 *
 * @param chatId     the chat to send to
 * @param postsAfter the posts to report
 */
private void sendPostsInChat(Long chatId, List<Post> postsAfter) {
    StringBuilder text = new StringBuilder();

    for (Post post : postsAfter) {
        Board board = boardService.getBoard(post.getBoardid());

        text.append("\nBoard is /").append(board.getName()).append("/ ").append(board.getDesc());
        text.append("\nPOST № ").append(post.getId());
        text.append("\nSubject ").append(post.getSubject());
        text.append("\nMessage ").append(Jsoup.clean(post.getMessage(), Whitelist.simpleText()));

        // A parent id of 0 means the post links to its own thread page.
        int threadId = post.getParentid();
        if (threadId == 0) {
            threadId = post.getId();
        }
        text.append("\nURL : http://belchan.org/").append(board.getName()).append("/res/");
        text.append(threadId);
        text.append(".html\n\n");
    }

    sendMessage(chatId, text.toString());
}
/**
 * Converts HTML to plain text while keeping line structure: every
 * {@code <br>} yields one newline and every {@code <p>} is preceded by two,
 * after which all tags are stripped.
 *
 * @param html the HTML to convert, may be {@code null}
 * @return the plain text, or {@code null} when {@code html} is {@code null}
 */
protected static String br2nl(String html) {
    if (html == null) {
        return null;
    }

    Document document = Jsoup.parse(html);
    document.outputSettings(new Document.OutputSettings().prettyPrint(false));

    // Insert literal backslash-n markers so the breaks survive serialization,
    // then swap them for real newlines before the tags are removed.
    document.select("br").append("\\n");
    document.select("p").prepend("\\n\\n");
    String withNewlines = document.html().replace("\\n", "\n");

    Document.OutputSettings compact = new Document.OutputSettings().prettyPrint(false);
    return Jsoup.clean(withNewlines, "", Whitelist.none(), compact);
}
/**
 * Checks whether the input contains any HTML markup, i.e. whether it fails
 * validation against the "none" whitelist.
 *
 * @param str input String
 *
 * @return {@code true} if the input is not valid under the "none" whitelist;
 *         {@code false} for a null or empty input or when it is valid
 */
public static boolean containsHtml( String str ) {
    if (Strings.isNullOrEmpty(str)) {
        return false;
    }
    return !Jsoup.isValid(str, Whitelist.none());
}