我们从Python开源项目中，提取了以下50个代码示例，用于说明如何使用PyPDF2.PdfFileReader()。
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-info entries and record its author/producer/creator.

    Every entry of the document information dictionary is printed. Values of
    the "/Author", "/Producer" and "/Creator" entries are appended to the
    module-level lists ``meta_author_array``, ``meta_producer_array`` and
    ``meta_creator_array`` when not already present, and those three lists
    are then appended to the module-level ``metadata_files`` list.

    :param filename: path of the PDF file to inspect.
    """
    # Maps the metadata keys of interest to the module-level list collecting them.
    collectors = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # Keep the file open while the metadata is consumed, and close it afterwards.
    with open(filename, 'rb') as pdf_stream:
        pdfFile = PdfFileReader(pdf_stream)
        # Fall back to an empty mapping when the reader reports no document info.
        metadata = pdfFile.getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            # %-formatting also copes with values that are not plain strings.
            print(' - %s:%s' % (meta, value))
            seen = collectors.get(meta)
            if seen is not None and value not in seen:
                seen.append(value)
    # Group the different arrays in one with all metadata.
    # NOTE(review): the same three lists are appended again on every call - confirm intended.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
def render_pdf(self):
    """Combine all pages of this object into one PDF and return its bytes.

    A page whose ``extension`` is "pdf" contributes its first PDF page
    directly. Any other page is opened as an image, converted to RGB and
    saved as a PDF in memory before its first page is appended.
    """
    writer = PdfFileWriter()
    for entry in self.pages:
        source = BytesIO(entry.binary)
        if entry.extension != "pdf":
            # Images cannot be appended as-is: render them into an in-memory PDF first.
            as_pdf = BytesIO()
            Image.open(source).convert("RGB").save(as_pdf, format="pdf")
            source = as_pdf
        writer.addPage(PdfFileReader(source).getPage(0))
    rendered = BytesIO()
    writer.write(rendered)
    return rendered.getvalue()
def pdf_page_to_png(src_pdf, pagenum=0, resolution=72,):
    '''
    Returns specified PDF page as wand.image.Image png.

    :param PyPDF2.PdfFileReader src_pdf: PDF from which to take pages.
    :param int pagenum: Page number to take.
    :param int resolution: Resolution for resulting png in DPI.
    :returns: a new wand image whose format is png.
    '''
    # Copy the requested page into a one-page PDF held in memory.
    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.addPage(src_pdf.getPage(pagenum))

    pdf_bytes = io.BytesIO()
    dst_pdf.write(pdf_bytes)
    pdf_bytes.seek(0)

    with Image(file=pdf_bytes, resolution=resolution) as img:
        # Image.convert() returns a NEW image and leaves the receiver untouched,
        # so the converted copy must be returned; the source image is released
        # when the ``with`` block exits.
        return img.convert("png")

# Example of converting exam.pdf located at the same directory
# convert('exam')
# NOTE : default resolution is 72 dpi
def outputpapertemplate(self, dest, listchar, output=None):
    """Render template pages for ``listchar`` and write or return the PDF.

    One page is produced per call to ``self.outputtemplateonepage`` for as
    long as ``listchar`` is non-empty, and its first page is added to the
    writer.

    :param dest: ``None``, a file path (``str``), or a writable binary stream.
    :param listchar: the characters still to be laid out.
        NOTE(review): the loop only ends if ``outputtemplateonepage``
        removes entries from this list - confirm.
    :param output: an existing writer to append to; a new
        ``PyPDF2.PdfFileWriter`` is created when omitted.
    :returns: the writer when ``dest`` is ``None``; otherwise ``None`` after
        the PDF has been written to ``dest``.
    """
    if output is None:
        output = PyPDF2.PdfFileWriter()

    while listchar:
        iopage = self.outputtemplateonepage(listchar)
        output.addPage(PyPDF2.PdfFileReader(iopage).getPage(0))

    if dest is None:
        return output

    if isinstance(dest, str):
        # dest is a file path: create its parent directory when missing.
        destdir = os.path.dirname(dest)
        if destdir != '' and not os.path.isdir(destdir):
            os.makedirs(destdir)
        with open(dest, "wb") as w:
            output.write(w)
    else:
        # dest is an already-open binary stream.
        output.write(dest)
    return None
def apply(self):
    """
    Draws the document's overlays and this page's redactions, then merges them into the page

    Bates overlays receive this page's number as their text before being
    drawn; generic text overlays are drawn as they are; any other overlay
    value is skipped. The canvas is then finalised and its first page is
    merged into ``self.page``.

    Returns:
        bool: always True
    """
    canvas = self.canvas
    for overlay in self.document.overlays.values():
        is_bates = isinstance(overlay, BatesOverlay)
        if not (is_bates or isinstance(overlay, GenericTextOverlay)):
            continue
        if is_bates:
            overlay.text = self.number
        overlay.apply(canvas)

    for redaction in self.redactions:
        redaction.apply(canvas)

    canvas.showPage()
    canvas.save()

    # Rewind the finished canvas so it can be read back as a PDF.
    self.canvas_file.seek(0)
    self.page.mergePage(PdfFileReader(self.canvas_file).getPage(0))
    return True
def pdf_parser(s):
    """Return the author recorded in the metadata of a PDF held in memory.

    :param s: the raw PDF content; surrounding whitespace is stripped first.
    :returns: ``{'author': ...}``, or an empty dict when the document cannot
        be decrypted (``NotImplementedError``) or carries no document info.
    """
    s = s.strip()
    # Warnings are redirected to os.devnull to suppress them. The reader is
    # used entirely inside this block so the sink is still open whenever the
    # reader emits a warning.
    with open(os.devnull, 'w') as fp:
        pdf = PdfFileReader(StringIO(s), strict=False, warndest=fp)
        if pdf.isEncrypted:
            try:
                pdf.decrypt('')
            except NotImplementedError:
                return {}
        meta = pdf.getDocumentInfo()
        if meta is None:
            # No document information dictionary: nothing to report.
            return {}
        return {
            'author': meta.author,
        }
def test_report_pdf(self):
    """The generated report PDF shows the reporter and the submitted answers on page one."""
    self.client_post_report_creation()
    rendered = report_delivery.report_as_pdf(
        report=self.report,
        data=mock_report_data,
        recipient=None,
    )
    pdf_reader = PyPDF2.PdfFileReader(BytesIO(rendered))
    expected_fragments = (
        "Reported by: testing_12",
        'food options',
        'vegetables',
        'apples: red',
        'eat it now???',
    )
    for fragment in expected_fragments:
        self.assertIn(fragment, pdf_reader.getPage(0).extractText())
def create(self, cr, uid, ids, data, context=None):
    """Render the selected records through Scribus and return the report.

    For each record a Scribus document is rendered from the template. When
    ``self.report_type`` is ``'scribus_sla'`` the rendered document itself is
    returned; otherwise Scribus is run under ``xvfb-run`` to produce one PDF
    per record and the PDFs are merged.

    :returns: ``(content, 'sla')`` or ``(content, 'pdf')``.
    :raises MissingError: when Scribus produced no (or an empty) PDF.
    """
    pool = registry(cr.dbname)
    merger = PdfFileMerger()
    outfiles = []
    # Handles of the intermediate PDFs; they must stay open until the merged
    # document has been written, and are always closed afterwards.
    pdf_handles = []
    try:
        for p in pool.get(self.model).read(cr, uid, ids):
            outfiles.append(self.newfilename())
            sla = self.render(cr, uid, p, data.get('template') or self.template)
            if self.report_type == 'scribus_sla':
                os.unlink(outfiles[-1])
                content = sla.read()
                sla.close()
                return (content, 'sla')
            command = "xvfb-run -a scribus-ng -ns -g %s -py %s -pa -o %s" % (sla.name, os.path.join(get_module_path('report_scribus'), 'scribus.py'), outfiles[-1])
            _logger.info(command)
            # The exit status is not inspected; success is judged by the output file below.
            os.system(command)
            sla.close()
            if not os.path.exists(outfiles[-1]) or os.stat(outfiles[-1]).st_size == 0:
                raise MissingError('There are something wrong with the template or scribus installation')
            handle = open(outfiles[-1], 'rb')
            pdf_handles.append(handle)
            merger.append(PdfFileReader(handle))
        outfile = tempfile.NamedTemporaryFile(mode='w+b', suffix='.pdf')
        merger.write(outfile.name)
        # Release the intermediate files before deleting them.
        for handle in pdf_handles:
            handle.close()
        for filename in outfiles:
            os.unlink(filename)
        outfile.seek(0)
        pdf = outfile.read()
        outfile.close()
        return (pdf, 'pdf')
    finally:
        # close() is idempotent, so this is safe on the success path too.
        for handle in pdf_handles:
            handle.close()
def run(self):
    """Extract the text of every page of the PDF and hand it to the next pipe.

    The page texts are joined with single spaces, encoded as UTF-8 and
    passed, together with the file path, to a new ``GravarConvertido``
    instance, which is stored in ``self.next_pipe`` and run.
    """
    # The file is closed as soon as all pages have been extracted.
    with open(self.caminho_arquivo, 'rb') as objeto_pdf:
        reader = PyPDF2.PdfFileReader(objeto_pdf)
        textos = [reader.getPage(num).extractText()
                  for num in range(reader.numPages)]
    # Join first, encode once: yields the same UTF-8 bytes as encoding each
    # page and joining, but also works where bytes and text cannot be mixed.
    conteudo = " ".join(textos).encode("utf-8")
    instancia = object.__new__(GravarConvertido)
    instancia.__init__(self.caminho_arquivo, conteudo)
    self.next_pipe = instancia
    self.next_pipe.run()
def _destinations_in_two_columns(pdf, destinations, cutoff=3): """ Check if the named destinations are organized along two columns (heuristic) @param pdf: a PdfFileReader object @param destinations: 'cutoff' is used to tune the heuristic: if 'cutoff' destinations in the would-be second column start at the same position, return True """ # iterator for the x coordinates of refs in the would-be second column xpositions = (_destination_position(pdf, dest)[3] for (_, dest) in destinations if _destination_position(pdf, dest)[1] == 1) xpos_count = {} for xpos in xpositions: xpos_count[xpos] = xpos_count.get(xpos, 0) + 1 if xpos_count[xpos] >= cutoff: return True return False
def extract_links(self, response):
    """Yield ``(uri, uri)`` for every URI link annotation in a PDF response.

    :param response: object whose ``body`` attribute holds the PDF bytes.
    """
    pdf = pyPdf.PdfFileReader(BytesIO(response.body))
    for page_num in range(pdf.getNumPages()):
        page = pdf.getPage(page_num)
        # Subscript access is used instead of dict.get() so that an entry
        # stored as an indirect reference is resolved to the real object.
        if '/Annots' not in page:
            continue
        for annotation in page['/Annots']:
            annot_object = annotation.getObject()
            if '/A' not in annot_object:
                continue
            a_tag = annot_object['/A']
            if a_tag and '/URI' in a_tag:
                uri = a_tag['/URI']
                if isinstance(uri, pyPdf.generic.ByteStringObject):
                    # Byte strings are decoded and stripped of NUL characters.
                    uri = uri.decode("utf-8").replace("\x00", "")
                yield (uri, uri)
def extract_metadata(self, file_path):
    """Copy document-info and XMP metadata of a PDF into this object's result.

    Document info supplies title, author, generator (creator, then producer)
    and, when set, the subject as a keyword. XMP metadata supplies the
    document id, per-language titles, producer, creation and modification
    dates and the declared languages.
    """
    info_fields = (
        ('title', 'title'),
        ('author', 'author'),
        ('generator', 'creator'),
        ('generator', 'producer'),
    )
    with open(file_path, 'rb') as fh:
        pdf = PdfFileReader(fh, strict=False)

        meta = pdf.getDocumentInfo()
        if meta is not None:
            for field, attribute in info_fields:
                self.update(field, getattr(meta, attribute))
            subject = meta.subject
            if subject:
                self.result.keywords.append(subject)

        xmp = pdf.getXmpMetadata()
        if xmp is None:
            return
        self.update('id', xmp.xmpmm_documentId)
        for lang, title in xmp.dc_title.items():
            self.update('title', title)
            self.result.languages.append(lang)
        self.update('generator', xmp.pdf_producer)
        self.update('created_at', xmp.xmp_createDate)
        self.update('modified_at', xmp.xmp_modifyDate)
        self.result.languages.extend(xmp.dc_language)
def __loadSATPDF(self, filename):
    """
    Loads the SAT PDF file and collects the table contents of every page.

    The text of each page is passed to ``__getTableContent`` together with
    the expected table header and footer. When the collected contents
    include "Number", "Mean" and "S.D.", everything from two entries before
    "S.D." onwards is dropped.

    Return
    ------
    All numbers from the SAT table in a string array
    """
    print("loading SAT score pdf")
    # Loop-invariant markers delimiting the table inside the page text.
    tableHeader = "Total \nMale Female \nScore \nNumber Percentile Number Percentile Number Percentile "
    tableFooter = "De˜nitions of statistical terms are provided online at research."
    tableContents = []
    with open(filename, "rb") as pdf_file:
        pdf = pypdf.PdfFileReader(pdf_file)
        for page in pdf.pages:
            content = page.extractText()
            tableContents += self.__getTableContent(content, tableHeader, tableFooter)
    # `"Number" and "Mean" and "S.D." in x` only tested "S.D."; check all three.
    if all(marker in tableContents for marker in ("Number", "Mean", "S.D.")):
        tableContents = tableContents[:tableContents.index("S.D.") - 2]
    return tableContents
def main():
    """Merge the PDFs found under ``~lmx/`` into ``merge-pdfs.pdf``.

    The first PDF is appended whole; every following PDF is appended from
    its second page (page index 1) to its last page.

    Raises SystemExit when no PDF file is found.
    """
    pdf_paths = get_all_pdf_files(os.path.expanduser('~lmx/'))
    if not pdf_paths:
        raise SystemExit('No pdf file found!')

    merger = PyPDF2.PdfFileMerger()
    leading_path, remaining_paths = pdf_paths[0], pdf_paths[1:]

    with open(leading_path, 'rb') as first_obj:
        merger.append(first_obj)

    for path in remaining_paths:
        with open(path, 'rb') as obj:
            page_count = PyPDF2.PdfFileReader(obj).getNumPages()
            merger.append(fileobj=obj, pages=(1, page_count))

    with open('merge-pdfs.pdf', 'wb') as f:
        merger.write(f)
def Analyze_Metadata_pdf(filename):
    """Print the document-info entries of a PDF and collect selected values.

    Each entry is printed as `` - <key>:<value>``. The values stored under
    "/Author", "/Producer" and "/Creator" are added (without duplicates) to
    the module-level ``meta_author_array``, ``meta_producer_array`` and
    ``meta_creator_array`` lists, which are finally appended to the
    module-level ``metadata_files`` list.

    :param filename: path of the PDF file to analyse.
    """
    with open(filename, 'rb') as pdf_stream:
        # The stream stays open while the metadata is read and is closed on exit.
        pdfFile = PdfFileReader(pdf_stream)
        metadata = pdfFile.getDocumentInfo()
        print(' - Document: ' + str(filename))
        # A missing document-info dictionary is treated as "no entries".
        for meta in (metadata or {}):
            value = metadata[meta]
            # %-formatting avoids a TypeError when the value is not a string.
            print(' - %s:%s' % (meta, value))
            if meta == "/Author":
                target = meta_author_array
            elif meta == "/Producer":
                target = meta_producer_array
            elif meta == "/Creator":
                target = meta_creator_array
            else:
                continue
            if value not in target:
                target.append(value)
    # Group the different arrays in one with all metadata.
    # NOTE(review): these lists are re-appended on every call - confirm intended.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
def pdf_splitter(self):
    """Split ``self.pdf_file`` into one PDF per page and convert each to an image.

    Page ``n`` (1-based) is written next to the source as
    ``<source-without-extension>_<n>.pdf`` and then passed to
    ``self.pdf_to_image``. ``self.total_pages`` is set to the page count.
    """
    import os

    self.log.info('Called pdf_splitter')
    # Strip only the final extension. str.replace('.pdf', ...) would rewrite
    # every occurrence in the path and, for a name without a lowercase
    # ".pdf", would leave the path unchanged and overwrite the source file.
    base_path = os.path.splitext(self.pdf_file)[0]
    with open(self.pdf_file, 'rb') as source:
        input_pdf = PdfFileReader(source)
        self.total_pages = input_pdf.numPages
        for page_number in range(self.total_pages):
            output = PdfFileWriter()
            output.addPage(input_pdf.getPage(page_number))
            # new filename
            new_pdf = '%s_%s.pdf' % (base_path, page_number + 1)
            with open(new_pdf, 'wb') as file_stream:
                output.write(file_stream)
            # calling pdf to image conversion
            self.pdf_to_image(new_pdf)
def pdf_parser(s):
    """Return the document-info entries of a PDF held in memory.

    :param s: the raw PDF content; surrounding whitespace is stripped first.
    :returns: a dict mapping each document-info key, minus its first
        character, to its value; an empty dict when the document cannot be
        decrypted (``NotImplementedError``) or has no document info.
    """
    s = s.strip()
    # Warnings are redirected to os.devnull to suppress them; the reader is
    # only used while that sink is open.
    with open(os.devnull, 'w') as fp:
        pdf = PdfFileReader(StringIO(s), strict=False, warndest=fp)
        if pdf.isEncrypted:
            try:
                pdf.decrypt('')
            except NotImplementedError:
                return {}
        meta = pdf.getDocumentInfo()
        if meta is None:
            # No document information dictionary: nothing to report.
            return {}
        result = {}
        for key in meta.keys():
            # key[1:] drops the first character of the key (e.g. "/Author" -> "Author").
            result[key[1:]] = meta.get(key)
        return result
def pdf_page_to_png(src_pdf, pagenum=0, resolution=154):
    """
    Returns specified PDF page as wand.image.Image png.

    :param PyPDF2.PdfFileReader src_pdf: PDF from which to take pages.
    :param int pagenum: Page number to take.
    :param int resolution: Resolution for resulting png in DPI.
    :returns: a new wand image whose format is png.
    """
    check_dependencies(__optional_dependencies__['pdf'])
    # Import libraries within this function so as to avoid import-time dependence
    import PyPDF2
    from wand.image import Image  # TODO: When we start using this again, document which system-level libraries are required.

    # Copy the requested page into a one-page PDF held in memory.
    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.addPage(src_pdf.getPage(pagenum))

    pdf_bytes = io.BytesIO()
    dst_pdf.write(pdf_bytes)
    pdf_bytes.seek(0)

    with Image(file=pdf_bytes, resolution=resolution) as img:
        # Image.convert() returns a NEW image and does not modify the
        # receiver, so the converted copy is what must be returned.
        return img.convert("png")
def count_pages(input_file):
    """Count the number of pages in a file

    Files matching ``is_pdf`` are counted by scanning their bytes with
    ``rxcountpages``, falling back to PyPDF2 when that finds nothing; any
    other file is counted by calling ``identify``.

    Keyword arguments
    input_file -- the full path to the input file
    """
    if not is_pdf.match(input_file):
        ops = ['identify', '-ping', '-format', "%n\\n", input_file]
        results = do_system_call(ops, return_result=True)
        # The count is taken from the last line of the command output.
        return int(results.rstrip().split('\n')[-1])

    with open(input_file, 'rb') as fp:
        count = len(rxcountpages.findall(fp.read()))
    if count == 0:
        count = PyPDF2.PdfFileReader(input_file).getNumPages()
    return count
def pdf_date(path):
    """
    Extract a date from PDF file metadata.

    Args:
        path (str): The file path.

    Returns:
        datetime: The created date.

    Raises:
        ValueError: If the creation date contains no digits or its first run
            of digits does not match ``%Y%m%d%H%M%S``.
    """
    reader = PdfFileReader(path)

    # Get rid of `D:` prefix and timezone.
    stamp = reader.documentInfo['/CreationDate']
    match = re.search(r'\d+', stamp)
    if match is None:
        raise ValueError('No digits found in PDF creation date: %r' % (stamp,))

    return datetime.strptime(match.group(), '%Y%m%d%H%M%S')
def extract(text, paper=None, logger=logger):
    """Extract the author from the PDF metadata of ``paper``.

    :param text: unused by this extractor.
    :param paper: object whose ``_read_document()`` returns the PDF to read.
    :param logger: unused by this extractor.
    :returns: ``(author, author, "extracted", "pdf metadata")`` when the
        metadata holds a non-empty author, otherwise ``None`` (also when the
        document cannot be read).
    """
    # try using pypdf2/pdfminer
    try:
        pdfReader = PdfFileReader(paper._read_document())
        author = pdfReader.getDocumentInfo().author
    except Exception:
        # Best effort: any read/parse failure means "no author found".
        # ``Exception`` (rather than a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        # There is currently no fallback search for the author.
        return None
    if not author:
        return None
    return (author, author, "extracted", "pdf metadata")
def extract(text, paper=None, logger=logger):
    """Extract the title from the PDF metadata of ``paper``.

    :param text: unused by this extractor.
    :param paper: object whose ``_read_document()`` returns the PDF to read.
    :param logger: unused by this extractor.
    :returns: ``(title, title, "extracted", "pdf metadata")`` when the
        metadata holds a non-empty title, otherwise ``None`` (also when the
        document cannot be read).
    """
    # try using pypdf2/pdfminer
    try:
        pdfReader = PdfFileReader(paper._read_document())
        title = pdfReader.getDocumentInfo().title
    except Exception:
        # Best effort: any read/parse failure means "no title found".
        # ``Exception`` (rather than a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        # There is currently no fallback search for the title.
        return None
    if not title:
        return None
    return (title, title, "extracted", "pdf metadata")
def test_split():
    """Splitting test.pdf by the ranges 1-2 and 5-8 yields a 2-page and a 4-page PDF."""
    i = ILovePdf(config.PUBLIC_KEY, config.SECRET_KEY)
    i.new_task("split")
    i.add_file("test.pdf")
    i.execute(ranges="1-2,5-8")
    i.download()
    try:
        with zipfile.ZipFile("out.zip", "r") as zip_ref:
            zip_ref.extractall("test_split")
        assert len(glob.glob("test_split/*.pdf")) == 2
        with open("test_split/test-1-2.pdf", "rb") as first_part, \
                open("test_split/test-5-8.pdf", "rb") as second_part:
            output_file1 = PdfFileReader(first_part)
            output_file2 = PdfFileReader(second_part)
            assert output_file1.getNumPages() == 2
            assert output_file2.getNumPages() == 4
    finally:
        # Remove the downloaded artefacts even when an assertion fails.
        if os.path.exists("out.zip"):
            os.remove("out.zip")
        shutil.rmtree("test_split", ignore_errors=True)
def get_pages(pdf_filename, from_, to):
    """Return a new PDF, as bytes, holding pages ``from_``..``to`` of a file.

    :param pdf_filename: path of the source PDF.
    :param from_: first page to copy (1-based).
    :param to: last page to copy (1-based, inclusive); raised to ``from_``
        when it is smaller.
    :returns: the bytes of the resulting PDF.
    """
    if to < from_:
        to = from_
    output = PdfFileWriter()
    # The source stays open until the new document is written and is closed
    # even if reading or writing fails.
    with open(pdf_filename, 'rb') as source:
        pdf = PdfFileReader(source)
        for i in range(from_ - 1, to):
            output.addPage(pdf.getPage(i))
        stream = BytesIO()
        output.write(stream)
        return stream.getvalue()
def pdfMetaData(file_path, save=True):
    '''Get PDF document metadata, takes 2 arguments, file_path and save (boolean, default is True)

    Prints a report made of the document-info entries and the file system
    timestamps/owner of ``file_path``; when ``save`` is true the report is
    also passed to ``saveResult`` under ``<file name>.txt``.

    Exits the program (``sys.exit``) when the document is encrypted and
    cannot be opened with an empty password, or when its document info
    cannot be iterated.
    '''
    with open(file_path, "rb") as pdf_stream:
        pdf_doc = PdfFileReader(pdf_stream)
        if pdf_doc.isEncrypted:
            # Only the decrypt() call is guarded. A bare ``except`` around the
            # sys.exit() below would catch its SystemExit and replace the
            # message with the "unsupported algorithm" one.
            try:
                decrypt_status = pdf_doc.decrypt("")
            except Exception:
                sys.exit("target pdf document is encrypted with an unsupported algorithm... exiting...")
            if decrypt_status != 1:
                sys.exit("target pdf document is encrypted... exiting...")
        doc_info = pdf_doc.getDocumentInfo()
        stats = os.stat(file_path)
        now = dt.now()
        file_name = getFileName(file_path)
        metadata = "Time: %d/%d/%d %d : %d : %d. Found the following metadata for file %s:\n\n" % (now.year, now.month, now.day, now.hour, now.minute, now.second, file_name[:-4])
        try:
            for md in doc_info:
                # md[1:] drops the first character of the metadata key.
                metadata += str(md[1:]) + " : " + pretifyPyPDF2Time(str(md[1:]), str(doc_info[md])) + "\n"
        except TypeError:
            sys.exit("Couldn't read document info! Make sure target is a valid pdf document...")
    metadata += "Last metadata mod Date: %s\nLast Mod Date: %s\nLast Access Date: %s\nOwner User ID: %s" % (dt.fromtimestamp(stats.st_ctime), dt.fromtimestamp(stats.st_mtime), dt.fromtimestamp(stats.st_atime), stats.st_uid)
    try:
        print(metadata)
    except UnicodeEncodeError:
        print("Console encoding can't decode the result. Enter chcp 65001 in the console and rerun the script.")
    if save:
        tgt = file_name + ".txt"
        saveResult(tgt, metadata)
def validate_pdf(value):
    """
    Validates the uploading file if it is a PDF. Raises an error if
    validation not passed.

    :param value: The file object.
    :raises ValidationError: when the content cannot be read as a PDF.
    """
    try:
        PyPDF2.PdfFileReader(io.BytesIO(value.read()))
    except PyPDF2.utils.PdfReadError:
        raise ValidationError('Tried to upload not PDF as a book!')
    finally:
        # read() leaves the stream at EOF; rewind it so whoever consumes the
        # upload after validation sees the whole file.
        if hasattr(value, 'seek'):
            value.seek(0)
def slice(self, pdf_file_path, page_actions, final_pdf_path):
    """
    Create new pdf from a slice of pages of a PDF

    :param pdf_file_path: path of the source PDF document, from which a new
        PDF file will be created.
    :param page_actions: list of tuples, each tuple containing the page
        number and the rotation to be applied. The page number is non-zero
        indexed (first is page 1, and so on). A falsy rotation leaves the
        page unrotated.
        NOTE(review): the rotation is applied with
        ``rotateCounterClockwise`` although this parameter was previously
        documented as clockwise - confirm the intended direction.
    :param final_pdf_path: path the resulting PDF is written to.
    :return: None. Writes the resulting PDF file into the provided path.
    :raises Exception: when no action is given or a page number is outside
        ``1..number of pages``.
    """
    output = PdfFileWriter()
    with open(pdf_file_path, 'rb') as file_input:
        source_pdf = PdfFileReader(file_input, strict=False)

        # Check page actions correspond to valid input PDF pages
        input_num_pages = source_pdf.getNumPages()
        actions_page_numbers = [num_page for num_page, _ in page_actions]
        if not actions_page_numbers:
            raise Exception(u"Invalid page actions: at least one page action is required.")
        if min(actions_page_numbers) < 1:
            raise Exception(u"Invalid page numbers range in actions: page numbers cannot be lower than 1.")
        # Page numbers are 1-based, so the largest valid one equals the page count.
        if max(actions_page_numbers) > input_num_pages:
            raise Exception(u"Invalid page numbers range in actions: page numbers cannot exceed the maximum numbers "
                            u"of pages of the source PDF document.")

        # Perform actual slicing + rotation
        for num_page, rotation in page_actions:
            page = source_pdf.getPage(num_page - 1)
            if rotation:
                page = page.rotateCounterClockwise(rotation)
            output.addPage(page)

        # Written while the source file is still open.
        self._write_to_pdf(output, final_pdf_path)
def parse_pdf_doc(self):
    """
    Open a pdf document filetype and parse contents to string variable
    for matching comparison.

    :returns: the concatenated text of all pages with surrounding
        whitespace stripped, or ``None`` when that text is empty.
    """
    # open the file with read/binary privileges; it is closed even when
    # text extraction raises.
    with open(self.file, 'rb') as f:
        pdf = PyPDF2.PdfFileReader(f)
        docText = ''.join(page.extractText() for page in pdf.pages)
    return docText.strip() or None
def __init__(self, file, prefix, fill, start, area):
    """
    Represents a document to be numbered.

    The PDF content is loaded into memory, a reader is created for it and
    one ``Page`` is built per PDF page, numbered from ``start`` upwards.

    Args:
        file (): PDF file associated with this document; either an object
            with a ``read()`` method or a path that can be opened.
        prefix (str): Bates number prefix.
        fill (int): Length to zero-pad number to.
        start (int): Number to start with.
        area (Area): Area on the document where the number should be drawn
    """
    try:
        content = file.read()
    except AttributeError:
        # Not a file-like object: treat it as a path.
        with open(file, "rb") as handle:
            content = handle.read()
    self.file = io.BytesIO(content)
    self.reader = PdfFileReader(self.file)

    self.prefix = prefix
    self.fill = fill
    self.start = copy.copy(start)
    self.area = area

    # Every area starts without an overlay; the requested one gets the Bates overlay.
    overlays = dict((member, None) for member in Area)
    overlays[area] = BatesOverlay(None, self.area)
    self.overlays = overlays

    self.index = 0
    self.pages = []
    for offset, pdf_page in enumerate(self.reader.pages):
        self.pages.append(
            Page(self, pdf_page, self.prefix, self.fill, self.start + offset))
def __init__(self, path):
    """Load the PDF at ``path`` into a writer that keeps its pages and text metadata.

    All pages of the source are appended to ``self.writer``; document-info
    entries whose value is a string or bytes are copied as metadata.

    :param path: path of the source PDF.
    """
    import io

    self.path = path
    # Read the file into memory so no file handle is left open while the
    # writer still refers to the reader's pages.
    with open(path, "rb") as source:
        reader = PdfFileReader(io.BytesIO(source.read()))
    self.writer = PdfFileWriter()
    self.writer.appendPagesFromReader(reader)
    # Treat missing document info as "no metadata to copy".
    info = reader.getDocumentInfo() or {}
    self.writer.addMetadata({
        k: v for k, v in info.items()
        if isinstance(v, (utils.string_type, utils.bytes_type))
    })
def reader(title):
    """Read the PDF and convert it to text.

    :param title: path of the PDF file.
    :returns: the text of the pages, concatenated in order. Extraction stops
        at the first page that fails, returning the text gathered so far.
    """
    chunks = []
    with open(title, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        # Iterate exactly over the existing pages instead of running past
        # the end and relying on an exception to stop.
        for x in range(int(pdfReader.numPages)):
            try:
                pageObj = pdfReader.getPage(x)
                chunks.append(str(pageObj.extractText()))
            except Exception:
                # Best effort, as before: give up on the first unreadable page.
                break
    return "".join(chunks)
def generate_document(self, data):
    """Render ``data`` through this document's fields and save the PDF.

    Each field class in ``self.fields`` is instantiated with the value at
    the same position in ``data`` and rendered on a canvas. The result is
    merged onto the first page of ``self.template_file`` when a template is
    set, and written to ``<output_dir>/<generated filename>.pdf``.

    :param data: indexable values, one per field.
    :raises IndexError: when ``data`` has fewer entries than there are fields.
    """
    packet = StringIO()
    template_stream = None
    try:
        if self.template_file is not None:
            # Kept open until the output is written, then closed in ``finally``.
            template_stream = open(self.template_file, 'rb')
            template = PdfFileReader(template_stream)
        c = canvas.Canvas(packet, pagesize=(self.width, self.height))
        for i, field_cls in enumerate(self.fields):
            # TODO: Catch exception if there is less columns than fields
            field = field_cls(self, c, data[i])
            field.render()
        # Save canvas
        c.save()
        packet.seek(0)
        text = PdfFileReader(packet)
        output = PdfFileWriter()
        if self.template_file is not None:
            # Merge text with base
            page = template.getPage(0)
            page.mergePage(text.getPage(0))
        else:
            page = text.getPage(0)
        output.addPage(page)
        # Save file
        filename = "%s/%s.pdf" % (self.output_dir, self.generate_filename(data))
        with open(filename, 'wb') as outputStream:
            output.write(outputStream)
    finally:
        if template_stream is not None:
            template_stream.close()
def getPDFContent(path):
    """Return the text of all pages of a PDF with whitespace normalised.

    :param path: the PDF to read, as accepted by ``PyPDF2.PdfFileReader``.
    :returns: the page texts joined together, with every run of whitespace
        collapsed to a single space and no leading/trailing whitespace.
    """
    # The second positional parameter of PdfFileReader is ``strict``, not a
    # file mode; the former "rb" argument was merely truthy, so strict=True
    # keeps the same behaviour explicitly.
    pdf = PyPDF2.PdfFileReader(path, strict=True)
    pages = [pdf.getPage(i).extractText() + "\n"
             for i in range(pdf.getNumPages())]
    return " ".join("".join(pages).strip().split())
def add_outlines(toc, filename, output):
    """Copy the pages of ``filename`` into a new PDF with an outline built from ``toc``.

    :param toc: list of dicts; each entry is read for "title", "page" and
        "node". ``build_outlines_btree`` is called on it first.
        NOTE(review): presumably that call is what attaches the "node"
        objects - confirm.
    :param filename: path of the source PDF.
    :param output: path the resulting PDF is written to.
    """
    build_outlines_btree(toc)
    pdf_out = PdfFileWriter()
    pdf_in = PdfFileReader(open(filename, 'rb'))
    for p in pdf_in.pages:
        pdf_out.addPage(p)
    toc_num = len(toc)
    # Pre-compute references for objects that are only added further down:
    # idorefs[0] is used for the outline root and idorefs[1:] for the items.
    # NOTE(review): assumes _addObject numbers objects sequentially starting
    # at len(_objects) + 1 and that nothing else is added to pdf_out in
    # between (e.g. by make_dest) - confirm.
    idoix = len(pdf_out._objects) + 1
    idorefs = [PDF.IndirectObject(x + idoix, 0, pdf_out) for x in range(toc_num + 1)]
    # Outline root dictionary.
    ol = PDF.DictionaryObject()
    ol.update({
        PDF.NameObject("/Type"): PDF.NameObject("/Outlines"),
        PDF.NameObject("/First"): idorefs[1],
        PDF.NameObject("/Last"): idorefs[-1],
        PDF.NameObject("/Count"): PDF.NumberObject(toc_num)
    })
    olitems = []
    for t in toc:
        # One outline item per toc entry.
        oli = PDF.DictionaryObject()
        oli.update({
            PDF.NameObject("/Title"): PDF.TextStringObject(t["title"].decode("utf-8")),
            PDF.NameObject("/Dest"): make_dest(pdf_out, t["page"])
        })
        # Optional links: each key names a method of the entry's node; when it
        # returns a node, the matching PDF key points at that node's reference.
        opt_keys = {"real_parent": "/Parent", "prev": "/Prev", "next": "/Next", "first": "/First", "last": "/Last"}
        for k, v in opt_keys.items():
            n = getattr(t["node"], k)()
            if n is not None:
                oli.update({
                    PDF.NameObject(v): idorefs[n.index]
                })
        olitems.append(oli)
    # Insertion order matters: the root first, then the items in toc order,
    # so that they line up with the references computed above.
    pdf_out._addObject(ol)
    for i in olitems:
        pdf_out._addObject(i)
    pdf_out._root_object.update({
        PDF.NameObject("/Outlines"): idorefs[0]
    })
    outputFile = open(output, "wb")
    pdf_out.write(outputFile)
    outputFile.close()
def iter_pdf_page(self, f):
    """Yield each page of the PDF ``f``, in page order."""
    pdf = PdfFileReader(f)
    page_index = 0
    page_total = pdf.getNumPages()
    while page_index < page_total:
        yield pdf.getPage(page_index)
        page_index += 1
def iter_pdf_page_text(self, filename):
    """Yield the ASCII text of each page of a PDF, one page at a time.

    Stores ``filename`` on the instance, logs the page count, then yields
    each page's extracted text with newlines replaced by spaces and
    non-ASCII characters dropped.

    :param filename: the PDF to scan.
    """
    self.filename = filename
    reader = PdfFileReader(filename)
    page_count = reader.getNumPages()
    # Lazy %-style arguments: the message is only formatted if it is emitted.
    logging.info("pdf scanner found %d pages in %s", page_count, filename)
    for pgnum in range(page_count):
        text = reader.getPage(pgnum).extractText()
        # Replace newlines before encoding: once encoded, the value may be a
        # byte string that cannot take text arguments to replace().
        yield text.replace('\n', ' ').encode('ascii', 'ignore')
def check_nb_pages(self, data):
    """
    Does this PDF contain enough pages?

    Prints the page count and returns True when the PDF in ``data`` has
    more than two pages; returns False when reading raises PyPdfError.
    """
    try:
        num_pages = PyPDF2.PdfFileReader(StringIO(data)).getNumPages()
        print("num pages: %d" % num_pages)
    except PyPdfError:
        return False
    return num_pages > 2
def loadFromStream(self, stream):
    """Create a non-strict PdfFileReader for ``stream`` and keep it as ``self.reader``."""
    pdf_reader = PdfFileReader(stream, strict=False)
    self.reader = pdf_reader