/**
 * Extracts the text of every page of a PDF and returns it as a single string.
 *
 * <p>Pages are processed in order, from page 1 to the reader's page count,
 * each with a fresh {@link SimpleTextExtractionStrategy}. The page texts are
 * concatenated without any separator.
 *
 * @param pdfFile the value handed to the {@link PdfReader} constructor
 * @return the concatenated text of all pages
 * @throws IllegalArgumentException if an {@link IOException} occurs while
 *         opening or parsing the file; the original exception is the cause
 */
private String getPdfContent(String pdfFile) {
    try {
        PdfReader reader = new PdfReader(pdfFile);
        try {
            StringBuilder text = new StringBuilder();
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            int pageCount = reader.getNumberOfPages();
            for (int page = 1; page <= pageCount; page++) {
                TextExtractionStrategy strategy =
                        parser.processContent(page, new SimpleTextExtractionStrategy());
                text.append(strategy.getResultantText());
            }
            return text.toString();
        } finally {
            // Close the reader even when a page fails to parse; previously it
            // was only closed on the success path.
            reader.close();
        }
    } catch (IOException e) {
        throw new IllegalArgumentException("Not able to read file " + pdfFile, e);
    }
}
/**
 * Extracts the text of each page of {@code reader}, writes every page's text
 * to its own file and returns all page texts joined by a blank line.
 *
 * <p>For each page a new strategy is created through the no-argument
 * constructor of {@code strategyClass}. When at least one filter is given,
 * the strategy is wrapped in a {@link FilteredTextRenderListener} using those
 * filters. The page text is written, encoded as UTF-8, to the path obtained
 * from {@code String.format(format, page)}.
 *
 * @param reader        the PDF to read
 * @param format        format string producing the output path for a page number
 * @param strategyClass strategy type to instantiate once per page
 * @param filters       optional render filters; may be {@code null} or empty
 * @return the text of all pages, separated by {@code "\n\n"}
 * @throws Exception if instantiating the strategy, extracting text or writing
 *                   a file fails
 */
<E extends TextExtractionStrategy> String extractAndStore(PdfReader reader, String format, Class<E> strategyClass, RenderFilter... filters) throws Exception {
    final boolean hasFilters = filters != null && filters.length > 0;
    final StringBuilder allPages = new StringBuilder();
    // Empty before the first page, a blank line before every later one.
    String separator = "";

    int pageNo = 1;
    while (pageNo <= reader.getNumberOfPages()) {
        TextExtractionStrategy pageStrategy = strategyClass.getConstructor().newInstance();
        if (hasFilters) {
            pageStrategy = new FilteredTextRenderListener(pageStrategy, filters);
        }

        final String text = extract(reader, pageNo, pageStrategy);
        final String targetName = String.format(format, pageNo);
        Files.write(Paths.get(targetName), text.getBytes("UTF8"));

        allPages.append(separator).append(text);
        separator = "\n\n";
        pageNo++;
    }
    return allPages.toString();
}
/**
 * Extracts the text of all pages of a PDF and splits it into lines.
 *
 * <p>Each page is read with a {@link SimpleTextExtractionStrategy}. The page
 * texts are joined with {@code "\n"} and the result is split on {@code "\n"},
 * so trailing empty lines are dropped by {@link String#split(String)}.
 *
 * @param PdfFile the value handed to the {@link PdfReader} constructor
 * @return the extracted lines, or {@code null} if any exception occurs while
 *         reading the file (callers must check for {@code null})
 * @throws IOException declared for compatibility; every exception raised
 *         inside the method is caught and reported as a {@code null} result
 */
public static String[] extractsPdfLines(String PdfFile) throws IOException {
    try {
        PdfReader reader = new PdfReader(PdfFile);
        try {
            StringBuilder text = new StringBuilder();
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            int pageCount = reader.getNumberOfPages();
            for (int page = 1; page <= pageCount; page++) {
                TextExtractionStrategy strategy =
                        parser.processContent(page, new SimpleTextExtractionStrategy());
                text.append(strategy.getResultantText()).append("\n");
            }
            return text.toString().split("\n");
        } finally {
            // Release the reader on failure as well; it used to leak whenever
            // a page could not be parsed.
            reader.close();
        }
    } catch (Exception e) {
        // Existing contract: failures are signalled by a null result.
        return null;
    }
}
/**
 * Reads {@code KLEE.pdf} from {@code RESOURCES_DIR} and writes the text of
 * every page, extracted with a {@link LocationTextExtractionStrategy}, to the
 * file {@code "extracted text"} in the same directory, one {@code println}
 * per page.
 *
 * <p>NOTE(review): despite its name and return type this method never
 * collects any text chunks; it always returns {@code null}. That behaviour is
 * kept unchanged here.
 *
 * @return always {@code null}
 * @throws IOException if the PDF cannot be opened or parsed, or the output
 *         file cannot be created
 */
private static List<LocationTextExtractionStrategy.TextChunk> getTextChunks() throws IOException {
    PdfReader reader = new PdfReader(RESOURCES_DIR + "KLEE.pdf");
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        // try-with-resources closes (and thereby flushes) the writer even if
        // extraction of a page throws.
        try (PrintWriter out = new PrintWriter(new FileOutputStream(RESOURCES_DIR + "extracted text"))) {
            int pageCount = reader.getNumberOfPages();
            for (int page = 1; page <= pageCount; page++) {
                TextExtractionStrategy strategy =
                        parser.processContent(page, new LocationTextExtractionStrategy());
                out.println(strategy.getResultantText());
            }
        }
    } finally {
        // The reader was previously never closed.
        reader.close();
    }
    return null;
}
@Override public BookReadingResult open(@NonNull File file, @NonNull PercentSender percentSender, @NonNull Runnable readingEndSender) { try { PdfReader pdfReader = new PdfReader(file.getAbsolutePath()); PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader); int numberOfPages = pdfReader.getNumberOfPages(); int oldPercent = 0, newPercent; StringBuffer stringBuffer = new StringBuffer(); for (int i = 1; i <= pdfReader.getNumberOfPages(); i++) { TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy()); String pageText = strategy.getResultantText(); stringBuffer.append(pageText); if (pageText.endsWith("\\.") || pageText.endsWith("!") || pageText.endsWith("?") || pageText.endsWith(":")) { stringBuffer.append(" "); } else { stringBuffer.append(". "); } newPercent = 100 * i / numberOfPages; if (newPercent != oldPercent) { percentSender.refreshPercents(oldPercent, newPercent); oldPercent = newPercent; } } pdfReader.close(); String resultText = new String(stringBuffer); resultText = resultText.trim(); // delete first and last space (if exist) resultText = resultText.replaceAll("\\s+", " "); // delete all duplicate white spaces resultText = resultText.replaceAll("(\\.)+", "\\."); // delete all duplicate dots if (resultText.length() < 1) { return null; } readingEndSender.run(); return new BookReadingResult(resultText, InternalStorageFileHelper.fileNameWithoutExtension(file), ""); } catch (IOException e) { e.printStackTrace(); return null; } }
/**
 * Creates a filter around the given extraction strategy.
 *
 * <p>The constructor looks up the declared field named {@code "text"} of
 * {@link TextRenderInfo} via reflection and makes it accessible, keeping the
 * handle in {@code stringField}.
 *
 * @param strategy the strategy stored in this filter
 * @throws NoSuchFieldException if {@code TextRenderInfo} declares no field named {@code "text"}
 * @throws SecurityException    if reflective access to the field is denied
 */
public RemappingExtractionFilter(TextExtractionStrategy strategy) throws NoSuchFieldException, SecurityException {
    // Resolve and unlock the reflective handle first, then store the delegate.
    this.stringField = TextRenderInfo.class.getDeclaredField("text");
    this.stringField.setAccessible(true);
    this.strategy = strategy;
}
/**
 * Extracts the text of a single page using the supplied strategy.
 *
 * <p>Delegates to {@link PdfTextExtractor#getTextFromPage}.
 *
 * @param reader   the PDF to read from
 * @param pageNo   the page number passed on to the extractor
 * @param strategy the extraction strategy to apply
 * @return the text produced for that page
 * @throws IOException if the extractor fails to read the page
 */
String extract(PdfReader reader, int pageNo, TextExtractionStrategy strategy) throws IOException {
    final String pageText = PdfTextExtractor.getTextFromPage(reader, pageNo, strategy);
    return pageText;
}
/**
 * Extracts the text of a single page through a {@link RemappingExtractionFilter}
 * that wraps a new {@link LocationTextExtractionStrategy}.
 *
 * @param reader the PDF to read from
 * @param pageNo the page number passed on to the extractor
 * @return the text produced for that page
 * @throws IOException          if the extractor fails to read the page
 * @throws NoSuchFieldException if constructing the remapping filter fails to find its field
 * @throws SecurityException    if constructing the remapping filter is denied reflective access
 */
String extractRemapped(PdfReader reader, int pageNo) throws IOException, NoSuchFieldException, SecurityException {
    final LocationTextExtractionStrategy locationStrategy = new LocationTextExtractionStrategy();
    final TextExtractionStrategy remapping = new RemappingExtractionFilter(locationStrategy);
    final String pageText = PdfTextExtractor.getTextFromPage(reader, pageNo, remapping);
    return pageText;
}
/**
 * Searches every page of a PDF for {@code searchedString} (case-insensitive)
 * and writes the hits as XML-like tags to {@code outputfile}.
 *
 * <p>The file name is always written first. If
 * {@code PdfAnalysis.testPdfOk(file)} returns {@code false}, nothing else
 * happens. Otherwise each page is extracted with a
 * {@link SimpleTextExtractionStrategy} and split into lines; for every line
 * containing the search term the page number and the full line are written.
 * Hits are only recorded while the shared counter {@code stringfound} is
 * below {@code MAXIMAL_HITS}. After all pages, the number of hits in this
 * file is written. Any exception is reported as a {@code <Fehlermeldung>}
 * entry instead of being propagated.
 *
 * @param file the PDF file to search
 * @throws IOException declared for compatibility; exceptions raised while
 *         reading the PDF are caught and written to {@code outputfile}
 */
public static void searchforStringinPdfFiles(File file) throws IOException {
    outputfile.println("<Dateiname>" + (file.getName()) + "</Dateiname>");
    if (!filetools.pdf.PdfAnalysis.testPdfOk(file)) {
        return;
    }
    try {
        PdfReader reader = new PdfReader(file.toString());
        try {
            int pagesPdf = reader.getNumberOfPages();
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            // Loop-invariant: lower-case the search term once.
            String searchStringlowerCase = searchedString.toLowerCase();
            int trefferinDatei = 0;
            for (int i = 1; i <= pagesPdf; i++) {
                TextExtractionStrategy strategy =
                        parser.processContent(i, new SimpleTextExtractionStrategy());
                // Search only the current page. The previous version kept
                // appending to one buffer and re-scanned all earlier pages on
                // every iteration, reporting their hits again under the wrong
                // page number. The trailing "\n" keeps an empty page from
                // yielding a single empty line.
                String[] linesArray = (strategy.getResultantText() + "\n").split("\n");
                for (int j = 0; j < linesArray.length && stringfound < MAXIMAL_HITS; j++) {
                    if (linesArray[j].toLowerCase().contains(searchStringlowerCase)) {
                        trefferinDatei++;
                        stringfound++;
                        outputfile.println("<Seitenzahl>" + i + "</Seitenzahl>");
                        outputfile.println("<GanzeZeile>" + (linesArray[j]) + "</GanzeZeile>");
                    }
                }
            }
            outputfile.println("<TextinDatei>" + trefferinDatei + "</TextinDatei>");
            outputfile.println("<Suchergebnis>" + trefferinDatei + " x " + "</Suchergebnis>");
        } finally {
            // Release the reader even when a page fails to parse.
            reader.close();
        }
    } catch (Exception e) {
        outputfile.println("<Fehlermeldung>" + e + "</Fehlermeldung>");
    }
}