Python PyPDF2 模块,PdfFileReader() 实例源码

我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用PyPDF2.PdfFileReader()

项目:RastLeak    作者:n4xh4ck5    | 项目源码 | 文件源码
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-info entries and collect selected values.

    Every entry of the document-info dictionary is printed. Values stored
    under "/Author", "/Producer" and "/Creator" are appended (without
    duplicates) to the module-level lists ``meta_author_array``,
    ``meta_producer_array`` and ``meta_creator_array``. Finally those three
    lists are appended to the module-level ``metadata_files``.

    :param filename: path of the PDF file to inspect.
    """
    # Maps each tracked metadata key to the list collecting its values.
    collectors = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # Keep the file open only while the metadata is being read; the handle
    # was previously never closed.
    with open(filename, 'rb') as pdf_stream:
        pdfFile = PdfFileReader(pdf_stream)
        # Guard against a missing info dictionary (a None result).
        metadata = pdfFile.getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            # %-formatting avoids a TypeError when the value is not a string.
            print(' - %s:%s' % (meta, value))
            bucket = collectors.get(meta)
            if bucket is not None and value not in bucket:
                bucket.append(value)
    # Group the different arrays in one with all metadata
    # NOTE(review): this appends the same three list objects on every call -
    # confirm that callers expect the repeated entries.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)

####### FUNCTION AnalyzeMetadata doc ######
项目:gthnk    作者:iandennismiller    | 项目源码 | 文件源码
def render_pdf(self):
    """Combine all pages of this object into one PDF and return its bytes.

    A page whose ``extension`` is "pdf" contributes its first PDF page as is.
    Any other page is opened as an image, converted to RGB and saved as an
    in-memory PDF whose first page is then added.
    """
    writer = PdfFileWriter()
    for entry in self.pages:
        source = BytesIO(entry.binary)
        if entry.extension != "pdf":
            # Not a PDF yet: render the image into an in-memory PDF first.
            converted = BytesIO()
            Image.open(source).convert("RGB").save(converted, format="pdf")
            source = converted
        writer.addPage(PdfFileReader(source).getPage(0))

    result = BytesIO()
    writer.write(result)
    return result.getvalue()
项目:CSE371Project    作者:muhakh    | 项目源码 | 文件源码
def pdf_page_to_png(src_pdf, pagenum=0, resolution=72):
    '''
    Returns specified PDF page as wand.image.Image png.

    The page is first written to an in-memory single-page PDF, which is then
    loaded by wand at the requested resolution.

    :param PyPDF2.PdfFileReader src_pdf: PDF from which to take pages.
    :param int pagenum: Page number to take.
    :param int resolution: Resolution for resulting png in DPI.
    :return: the page as a PNG-format wand image.
    '''
    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.addPage(src_pdf.getPage(pagenum))

    pdf_bytes = io.BytesIO()
    dst_pdf.write(pdf_bytes)
    pdf_bytes.seek(0)

    # Image.convert() returns a NEW image and leaves the source untouched, so
    # its result must be returned; previously it was discarded and the
    # PDF-format image was handed back.  The source image is released on exit.
    with Image(file=pdf_bytes, resolution=resolution) as img:
        return img.convert("png")

# Example of converting exam.pdf located in the same directory
# convert('exam')   # NOTE : default resolution is 72 dpi
项目:handfontgen    作者:nixeneko    | 项目源码 | 文件源码
def outputpapertemplate(self, dest, listchar, output=None):
    """Render template pages for ``listchar`` and write or return the PDF.

    :param dest: destination of the PDF: a file path (``str``), an object
        accepted by ``output.write()``, or ``None`` to get the writer back.
    :param listchar: list passed to ``self.outputtemplateonepage()`` until it
        is empty.
    :param output: writer to add the pages to; a new
        ``PyPDF2.PdfFileWriter`` is created when omitted.
    :return: the writer when ``dest`` is ``None``, otherwise ``None``.
    """
    if output is None:
        output = PyPDF2.PdfFileWriter()

    # NOTE(review): the loop only ends if outputtemplateonepage() removes
    # entries from listchar - confirm against that method.
    while listchar:
        iopage = self.outputtemplateonepage(listchar)
        page = PyPDF2.PdfFileReader(iopage)
        output.addPage(page.getPage(0))

    if dest is None:
        return output

    if isinstance(dest, str):  # when dest is a file path
        destdir = os.path.dirname(dest)
        if destdir != '' and not os.path.isdir(destdir):
            os.makedirs(destdir)
        with open(dest, "wb") as w:
            output.write(w)
    else:  # when dest is io.IOBase
        output.write(dest)
项目:Marisol    作者:wikkiewikkie    | 项目源码 | 文件源码
def apply(self):
    """
    Draw every overlay and redaction on the canvas and merge it into the page.

    Bates overlays get ``self.number`` as their text before being drawn;
    generic text overlays are drawn unchanged.  Other overlay values are
    skipped.

    Returns:
        bool: always True.
    """
    for layer in self.document.overlays.values():
        is_bates = isinstance(layer, BatesOverlay)
        if is_bates:
            layer.text = self.number
        if is_bates or isinstance(layer, GenericTextOverlay):
            layer.apply(self.canvas)

    for item in self.redactions:
        item.apply(self.canvas)

    # Finish the canvas page and flush it to the backing file.
    self.canvas.showPage()
    self.canvas.save()

    # Read the rendered overlay back and merge it onto the real page.
    self.canvas_file.seek(0)
    rendered = PdfFileReader(self.canvas_file).getPage(0)
    self.page.mergePage(rendered)
    return True
项目:pentestly    作者:praetorian-inc    | 项目源码 | 文件源码
def pdf_parser(s):
    """Parse PDF data held in a string and return its author metadata.

    :param s: raw content of a PDF document.
    :return: ``{'author': ...}``, or an empty dict when decryption is not
        implemented for the document or it has no document info.
    """
    s = s.strip()
    # required to suppress warning messages
    # The sink is kept open for every PyPDF2 call below, not just for the
    # constructor.  NOTE(review): presumably PyPDF2 can still write warnings
    # to ``warndest`` while decrypting or reading the info dictionary - a
    # closed sink would then raise; confirm against the PyPDF2 version used.
    with open(os.devnull, 'w') as fp:
        pdf = PdfFileReader(StringIO(s), strict=False, warndest=fp)
        if pdf.isEncrypted:
            try:
                pdf.decrypt('')
            except NotImplementedError:
                return {}
        meta = pdf.getDocumentInfo()
        # No info dictionary: there is nothing to report.
        if meta is None:
            return {}
        result = {
            'author': meta.author,
        }
    return result
项目:callisto-core    作者:project-callisto    | 项目源码 | 文件源码
def test_report_pdf(self):
    """The generated report PDF contains the reporter and the mock answers."""
    self.client_post_report_creation()
    pdf = report_delivery.report_as_pdf(
        report=self.report,
        data=mock_report_data,
        recipient=None,
    )
    pdf_reader = PyPDF2.PdfFileReader(BytesIO(pdf))

    # Extract the first page's text once instead of once per assertion.
    page_text = pdf_reader.getPage(0).extractText()
    expected_fragments = (
        "Reported by: testing_12",
        'food options',
        'vegetables',
        'apples: red',
        'eat it now???',
    )
    for fragment in expected_fragments:
        self.assertIn(fragment, page_text)
项目:odoo-report    作者:vertelab    | 项目源码 | 文件源码
def create(self, cr, uid, ids, data, context=None):
    """Render the records with Scribus and return the merged result.

    For each record a Scribus document is rendered and converted to PDF by
    an external ``scribus-ng`` run; the PDFs are merged into one file.

    :return: ``(content, 'sla')`` with the rendered Scribus document when
        ``self.report_type`` is ``'scribus_sla'``, otherwise
        ``(pdf_bytes, 'pdf')``.
    :raises MissingError: when Scribus produced no (or an empty) output file.
    """
    pool = registry(cr.dbname)
    merger = PdfFileMerger()
    outfiles = []
    handles = []  # input PDFs must stay open until the merger has written
    try:
        for p in pool.get(self.model).read(cr, uid, ids):
            outfiles.append(self.newfilename())
            sla = self.render(cr, uid, p, data.get('template') or self.template)
            if self.report_type == 'scribus_sla':
                # The output file is removed by the ``finally`` block below.
                return (sla.read(), 'sla')
            command = "xvfb-run -a scribus-ng -ns -g %s -py %s -pa -o %s" % (sla.name,os.path.join(get_module_path('report_scribus'), 'scribus.py'),outfiles[-1])
            _logger.info(command)
            # NOTE(review): the command is built by string formatting; paths
            # containing spaces or shell metacharacters would break it.
            os.system(command)
            sla.close()
            if not os.path.exists(outfiles[-1]) or os.stat(outfiles[-1]).st_size == 0:
                raise MissingError('There are something wrong with the template or scribus installation')
            handle = open(outfiles[-1], 'rb')
            handles.append(handle)
            merger.append(PdfFileReader(handle))
        outfile = tempfile.NamedTemporaryFile(mode='w+b', suffix='.pdf')
        try:
            merger.write(outfile.name)
            outfile.seek(0)
            pdf = outfile.read()
        finally:
            outfile.close()
        return (pdf, 'pdf')
    finally:
        # Release input handles and temporary files on every exit path.
        for handle in handles:
            handle.close()
        for filename in outfiles:
            if os.path.exists(filename):
                os.unlink(filename)
项目:HackathonOAB    作者:Marlysson    | 项目源码 | 文件源码
def run(self):
    """Extract the text of every page and hand it to the next pipeline step.

    The text of each page of ``self.caminho_arquivo`` is UTF-8 encoded and
    the pages are joined with single spaces.  A ``GravarConvertido`` instance
    is created with the file path and the joined text, stored in
    ``self.next_pipe`` and run.
    """
    # Close the PDF as soon as all pages are extracted (it was never closed).
    with open(self.caminho_arquivo, 'rb') as objeto_pdf:
        reader = PyPDF2.PdfFileReader(objeto_pdf)
        conteudos = [
            reader.getPage(num).extractText().encode("utf-8")
            for num in range(reader.numPages)
        ]

    # Instance creation is kept as the original two-step __new__/__init__.
    instancia = object.__new__(GravarConvertido)
    instancia.__init__(self.caminho_arquivo, " ".join(conteudos))

    self.next_pipe = instancia
    self.next_pipe.run()
项目:refextract    作者:inspirehep    | 项目源码 | 文件源码
def _destinations_in_two_columns(pdf, destinations, cutoff=3):
    """
    Check if the named destinations are organized along two columns (heuristic)

    @param pdf: a PdfFileReader object
    @param destinations:

    'cutoff' is used to tune the heuristic: if 'cutoff' destinations in the
    would-be second column start at the same position, return True
    """
    # iterator for the x coordinates of refs in the would-be second column
    xpositions = (_destination_position(pdf, dest)[3] for (_, dest)
                  in destinations
                  if _destination_position(pdf, dest)[1] == 1)
    xpos_count = {}
    for xpos in xpositions:
        xpos_count[xpos] = xpos_count.get(xpos, 0) + 1
        if xpos_count[xpos] >= cutoff:
            return True
    return False
项目:osp-scraper    作者:opensyllabus    | 项目源码 | 文件源码
def extract_links(self, response):
    """Yield ``(uri, uri)`` for each URI action found in a PDF's annotations.

    :param response: object whose ``body`` holds the raw PDF bytes.
    """
    pdf = pyPdf.PdfFileReader(BytesIO(response.body))
    pgs = pdf.getNumPages()

    for page_num in range(pgs):
        page = pdf.getPage(page_num)
        if '/Annots' not in page:
            continue

        # Subscript access is used instead of dict.get().
        # NOTE(review): in PyPDF2, subscripting a dictionary object resolves
        # indirect references while .get() returns the raw reference, which
        # can be neither iterated nor searched - confirm for the library
        # version in use.
        for annotation in page['/Annots']:
            annot_object = annotation.getObject()
            if '/A' not in annot_object:
                continue

            a_tag = annot_object['/A']
            if not isinstance(a_tag, dict) or '/URI' not in a_tag:
                continue

            uri = a_tag['/URI']
            if isinstance(uri, pyPdf.generic.ByteStringObject):
                uri = uri.decode("utf-8").replace("\x00", "")
            yield (uri, uri)
项目:ingestors    作者:alephdata    | 项目源码 | 文件源码
def extract_metadata(self, file_path):
    """Copy document-info and XMP metadata of a PDF into this object.

    Values are passed to ``self.update()``; the subject is appended to
    ``self.result.keywords`` and languages to ``self.result.languages``.

    :param file_path: path of the PDF file to read.
    """
    with open(file_path, 'rb') as fh:
        pdf = PdfFileReader(fh, strict=False)

        # Classic document-information dictionary.
        info = pdf.getDocumentInfo()
        if info is not None:
            self.update('title', info.title)
            self.update('author', info.author)
            self.update('generator', info.creator)
            self.update('generator', info.producer)
            subject = info.subject
            if subject:
                self.result.keywords.append(subject)

        # XMP metadata, when the document carries any.
        xmp = pdf.getXmpMetadata()
        if xmp is None:
            return
        self.update('id', xmp.xmpmm_documentId)
        for lang, title in xmp.dc_title.items():
            self.update('title', title)
            self.result.languages.append(lang)
        self.update('generator', xmp.pdf_producer)
        self.update('created_at', xmp.xmp_createDate)
        self.update('modified_at', xmp.xmp_modifyDate)
        self.result.languages.extend(xmp.dc_language)

        # from pprint import pprint
        # pprint(self.result.to_dict())
项目:FA-IR_Ranking    作者:MilkaLichtblau    | 项目源码 | 文件源码
def __loadSATPDF(self, filename):
    """
    loads the SAT PDF file, deletes all nonsense and creates an array containing only the numbers
    from the table

    Return
    ------
    All numbers from the SAT table in a string array
    """
    print("loading SAT score pdf")
    tableHeader = "Total \nMale Female \nScore \nNumber Percentile Number Percentile Number Percentile "
    tableFooter = "De˜nitions of statistical terms are provided online at research."
    summaryMarkers = ("Number", "Mean", "S.D.")
    tableContents = []

    # Close the PDF once all pages are processed (it was never closed).
    with open(filename, "rb") as pdfFile:
        pdf = pypdf.PdfFileReader(pdfFile)
        for page in pdf.pages:
            content = page.extractText()
            tableContents += self.__getTableContent(content, tableHeader, tableFooter)
            # The original `"Number" and "Mean" and "S.D." in tableContents`
            # only tested the last marker; all three are required here.
            if all(marker in tableContents for marker in summaryMarkers):
                tableContents = tableContents[:tableContents.index("S.D.") - 2]

    return tableContents
项目:python_for_linux_system_administration    作者:lalor    | 项目源码 | 文件源码
def main():
    """Merge the PDF files found under ``~lmx/`` into ``merge-pdfs.pdf``.

    The first file is added completely; every following file is added from
    page index 1 up to its page count.

    :raises SystemExit: when no PDF file is found.
    """
    all_pdfs = get_all_pdf_files(os.path.expanduser('~lmx/'))
    if not all_pdfs:
        raise SystemExit('No pdf file found!')

    merger = PyPDF2.PdfFileMerger()
    # Source files stay open until the merged file is written.
    # NOTE(review): presumably the merger can still read from the appended
    # file objects during write(); closing them earlier would then fail -
    # confirm for the PyPDF2 version in use.
    handles = []
    try:
        for index, pdf in enumerate(all_pdfs):
            obj = open(pdf, 'rb')
            handles.append(obj)
            if index == 0:
                merger.append(obj)
            else:
                reader = PyPDF2.PdfFileReader(obj)
                merger.append(fileobj=obj, pages=(1, reader.getNumPages()))

        with open('merge-pdfs.pdf', 'wb') as f:
            merger.write(f)
    finally:
        for obj in handles:
            obj.close()
项目:RastLeak    作者:n4xh4ck5    | 项目源码 | 文件源码
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-info entries and collect selected values.

    Every entry of the document-info dictionary is printed. Values stored
    under "/Author", "/Producer" and "/Creator" are appended (without
    duplicates) to the module-level lists ``meta_author_array``,
    ``meta_producer_array`` and ``meta_creator_array``. Finally those three
    lists are appended to the module-level ``metadata_files``.

    :param filename: path of the PDF file to inspect.
    """
    # Maps each tracked metadata key to the list collecting its values.
    collectors = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # Keep the file open only while the metadata is being read; the handle
    # was previously never closed.
    with open(filename, 'rb') as pdf_stream:
        pdfFile = PdfFileReader(pdf_stream)
        # Guard against a missing info dictionary (a None result).
        metadata = pdfFile.getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            # %-formatting avoids a TypeError when the value is not a string.
            print(' - %s:%s' % (meta, value))
            bucket = collectors.get(meta)
            if bucket is not None and value not in bucket:
                bucket.append(value)
    # Group the different arrays in one with all metadata
    # NOTE(review): this appends the same three list objects on every call -
    # confirm that callers expect the repeated entries.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
####### FUNCTION AnalyzeMetadata doc ######
项目:RastLeak    作者:n4xh4ck5    | 项目源码 | 文件源码
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-info entries and collect selected values.

    Every entry of the document-info dictionary is printed. Values stored
    under "/Author", "/Producer" and "/Creator" are appended (without
    duplicates) to the module-level lists ``meta_author_array``,
    ``meta_producer_array`` and ``meta_creator_array``. Finally those three
    lists are appended to the module-level ``metadata_files``.

    :param filename: path of the PDF file to inspect.
    """
    # Maps each tracked metadata key to the list collecting its values.
    collectors = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # Keep the file open only while the metadata is being read; the handle
    # was previously never closed.
    with open(filename, 'rb') as pdf_stream:
        pdfFile = PdfFileReader(pdf_stream)
        # Guard against a missing info dictionary (a None result).
        metadata = pdfFile.getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            # %-formatting avoids a TypeError when the value is not a string.
            print(' - %s:%s' % (meta, value))
            bucket = collectors.get(meta)
            if bucket is not None and value not in bucket:
                bucket.append(value)
    # Group the different arrays in one with all metadata
    # NOTE(review): this appends the same three list objects on every call -
    # confirm that callers expect the repeated entries.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
####### FUNCTION AnalyzeMetadata doc ######
项目:RastLeak    作者:n4xh4ck5    | 项目源码 | 文件源码
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-info entries and collect selected values.

    Every entry of the document-info dictionary is printed. Values stored
    under "/Author", "/Producer" and "/Creator" are appended (without
    duplicates) to the module-level lists ``meta_author_array``,
    ``meta_producer_array`` and ``meta_creator_array``. Finally those three
    lists are appended to the module-level ``metadata_files``.

    :param filename: path of the PDF file to inspect.
    """
    # Maps each tracked metadata key to the list collecting its values.
    collectors = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # Keep the file open only while the metadata is being read; the handle
    # was previously never closed.
    with open(filename, 'rb') as pdf_stream:
        pdfFile = PdfFileReader(pdf_stream)
        # Guard against a missing info dictionary (a None result).
        metadata = pdfFile.getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            # %-formatting avoids a TypeError when the value is not a string.
            print(' - %s:%s' % (meta, value))
            bucket = collectors.get(meta)
            if bucket is not None and value not in bucket:
                bucket.append(value)
    # Group the different arrays in one with all metadata
    # NOTE(review): this appends the same three list objects on every call -
    # confirm that callers expect the repeated entries.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)

####### FUNCTION AnalyzeMetadata doc ######
项目:RastLeak    作者:n4xh4ck5    | 项目源码 | 文件源码
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-info entries and collect selected values.

    Every entry of the document-info dictionary is printed. Values stored
    under "/Author", "/Producer" and "/Creator" are appended (without
    duplicates) to the module-level lists ``meta_author_array``,
    ``meta_producer_array`` and ``meta_creator_array``. Finally those three
    lists are appended to the module-level ``metadata_files``.

    :param filename: path of the PDF file to inspect.
    """
    # Maps each tracked metadata key to the list collecting its values.
    collectors = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # Keep the file open only while the metadata is being read; the handle
    # was previously never closed.
    with open(filename, 'rb') as pdf_stream:
        pdfFile = PdfFileReader(pdf_stream)
        # Guard against a missing info dictionary (a None result).
        metadata = pdfFile.getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            # %-formatting avoids a TypeError when the value is not a string.
            print(' - %s:%s' % (meta, value))
            bucket = collectors.get(meta)
            if bucket is not None and value not in bucket:
                bucket.append(value)
    # Group the different arrays in one with all metadata
    # NOTE(review): this appends the same three list objects on every call -
    # confirm that callers expect the repeated entries.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)

####### FUNCTION AnalyzeMetadata doc ######
项目:RastLeak    作者:n4xh4ck5    | 项目源码 | 文件源码
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-info entries and collect selected values.

    Every entry of the document-info dictionary is printed. Values stored
    under "/Author", "/Producer" and "/Creator" are appended (without
    duplicates) to the module-level lists ``meta_author_array``,
    ``meta_producer_array`` and ``meta_creator_array``. Finally those three
    lists are appended to the module-level ``metadata_files``.

    :param filename: path of the PDF file to inspect.
    """
    # Maps each tracked metadata key to the list collecting its values.
    collectors = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # Keep the file open only while the metadata is being read; the handle
    # was previously never closed.
    with open(filename, 'rb') as pdf_stream:
        pdfFile = PdfFileReader(pdf_stream)
        # Guard against a missing info dictionary (a None result).
        metadata = pdfFile.getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            # %-formatting avoids a TypeError when the value is not a string.
            print(' - %s:%s' % (meta, value))
            bucket = collectors.get(meta)
            if bucket is not None and value not in bucket:
                bucket.append(value)
    # Group the different arrays in one with all metadata
    # NOTE(review): this appends the same three list objects on every call -
    # confirm that callers expect the repeated entries.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
####### FUNCTION AnalyzeMetadata doc ######
项目:RastLeak    作者:n4xh4ck5    | 项目源码 | 文件源码
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-info entries and collect selected values.

    Every entry of the document-info dictionary is printed. Values stored
    under "/Author", "/Producer" and "/Creator" are appended (without
    duplicates) to the module-level lists ``meta_author_array``,
    ``meta_producer_array`` and ``meta_creator_array``. Finally those three
    lists are appended to the module-level ``metadata_files``.

    :param filename: path of the PDF file to inspect.
    """
    # Maps each tracked metadata key to the list collecting its values.
    collectors = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # Keep the file open only while the metadata is being read; the handle
    # was previously never closed.
    with open(filename, 'rb') as pdf_stream:
        pdfFile = PdfFileReader(pdf_stream)
        # Guard against a missing info dictionary (a None result).
        metadata = pdfFile.getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            # %-formatting avoids a TypeError when the value is not a string.
            print(' - %s:%s' % (meta, value))
            bucket = collectors.get(meta)
            if bucket is not None and value not in bucket:
                bucket.append(value)
    # Group the different arrays in one with all metadata
    # NOTE(review): this appends the same three list objects on every call -
    # confirm that callers expect the repeated entries.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)

####### FUNCTION AnalyzeMetadata doc ######
项目:RastLeak    作者:n4xh4ck5    | 项目源码 | 文件源码
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-info entries and collect selected values.

    Every entry of the document-info dictionary is printed. Values stored
    under "/Author", "/Producer" and "/Creator" are appended (without
    duplicates) to the module-level lists ``meta_author_array``,
    ``meta_producer_array`` and ``meta_creator_array``. Finally those three
    lists are appended to the module-level ``metadata_files``.

    :param filename: path of the PDF file to inspect.
    """
    # Maps each tracked metadata key to the list collecting its values.
    collectors = {
        "/Author": meta_author_array,
        "/Producer": meta_producer_array,
        "/Creator": meta_creator_array,
    }
    # Keep the file open only while the metadata is being read; the handle
    # was previously never closed.
    with open(filename, 'rb') as pdf_stream:
        pdfFile = PdfFileReader(pdf_stream)
        # Guard against a missing info dictionary (a None result).
        metadata = pdfFile.getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            # %-formatting avoids a TypeError when the value is not a string.
            print(' - %s:%s' % (meta, value))
            bucket = collectors.get(meta)
            if bucket is not None and value not in bucket:
                bucket.append(value)
    # Group the different arrays in one with all metadata
    # NOTE(review): this appends the same three list objects on every call -
    # confirm that callers expect the repeated entries.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)

####### FUNCTION AnalyzeMetadata doc ######
项目:PDF-to-PPT    作者:vijayanandrp    | 项目源码 | 文件源码
def pdf_splitter(self):
    """Write every page of ``self.pdf_file`` to its own PDF and convert it.

    Page ``n`` (1-based) is written to ``<name>_<n>.pdf`` next to the source
    file and then passed to ``self.pdf_to_image()``.  The page count is
    stored in ``self.total_pages``.
    """
    import os  # local import so the block does not rely on file-level imports

    self.log.info('Called pdf_splitter')
    # Derive the output names from the path without its extension.  The
    # previous str.replace('.pdf', ...) returned the input path unchanged for
    # names without a lowercase ".pdf" (e.g. "x.PDF"), so the source file
    # itself was overwritten.
    base_name = os.path.splitext(self.pdf_file)[0]

    with open(self.pdf_file, 'rb') as source:
        input_pdf = PdfFileReader(source)
        self.total_pages = input_pdf.numPages

        for page_number in range(self.total_pages):
            output = PdfFileWriter()
            output.addPage(input_pdf.getPage(page_number))
            # new filename
            new_pdf = '%s_%s.pdf' % (base_name, page_number + 1)
            with open(new_pdf, 'wb') as file_stream:
                output.write(file_stream)

            # calling pdf to image conversion
            self.pdf_to_image(new_pdf)
项目:recon-ng    作者:Hehe-Zhc    | 项目源码 | 文件源码
def pdf_parser(s):
    """Parse PDF data held in a string and return its document info.

    :param s: raw content of a PDF document.
    :return: dict of the document-info entries keyed by the entry name
        without its first character (the leading "/"); an empty dict when
        decryption is not implemented for the document or it has no
        document info.
    """
    s = s.strip()
    # required to suppress warning messages
    # The sink is kept open for every PyPDF2 call below, not just for the
    # constructor.  NOTE(review): presumably PyPDF2 can still write warnings
    # to ``warndest`` while decrypting or reading the info dictionary - a
    # closed sink would then raise; confirm against the PyPDF2 version used.
    with open(os.devnull, 'w') as fp:
        pdf = PdfFileReader(StringIO(s), strict=False, warndest=fp)
        if pdf.isEncrypted:
            try:
                pdf.decrypt('')
            except NotImplementedError:
                return {}
        meta = pdf.getDocumentInfo()
        # No info dictionary: there is nothing to report.
        if meta is None:
            return {}
        result = {}
        for key in meta.keys():
            result[key[1:]] = meta.get(key)
    return result
项目:knowledge-repo    作者:airbnb    | 项目源码 | 文件源码
def pdf_page_to_png(src_pdf, pagenum=0, resolution=154):
    """
    Returns specified PDF page as wand.image.Image png.

    The page is first written to an in-memory single-page PDF, which is then
    loaded by wand at the requested resolution.

    :param PyPDF2.PdfFileReader src_pdf: PDF from which to take pages.
    :param int pagenum: Page number to take.
    :param int resolution: Resolution for resulting png in DPI.
    :return: the page as a PNG-format wand image.
    """

    check_dependencies(__optional_dependencies__['pdf'])
    # Import libraries within this function so as to avoid import-time dependence
    import PyPDF2
    from wand.image import Image  # TODO: When we start using this again, document which system-level libraries are required.

    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.addPage(src_pdf.getPage(pagenum))

    pdf_bytes = io.BytesIO()
    dst_pdf.write(pdf_bytes)
    pdf_bytes.seek(0)

    # Image.convert() returns a NEW image and leaves the source untouched, so
    # its result must be returned; previously it was discarded and the
    # PDF-format image was handed back.  The source image is released on exit.
    with Image(file=pdf_bytes, resolution=resolution) as img:
        return img.convert("png")
项目:multipage_to_book_batch_converter    作者:uml-digitalinitiatives    | 项目源码 | 文件源码
def count_pages(input_file):
    """Return the number of pages in a file.

    PDF files are counted by regex matches on the raw bytes, falling back to
    PyPDF2 when nothing matches.  Any other file is counted with an external
    ``identify`` call.

    Keyword arguments
    input_file -- the full path to the input file
    """
    if not is_pdf.match(input_file):
        # Non-PDF: the last line of the command output carries the count.
        command = ['identify', '-ping', '-format', "%n\\n", input_file]
        output = do_system_call(command, return_result=True)
        return int(output.rstrip().split('\n')[-1])

    with open(input_file, 'rb') as fp:
        total = len(rxcountpages.findall(fp.read()))
    if total == 0:
        # The regex found nothing; ask PyPDF2 for the page count instead.
        total = PyPDF2.PdfFileReader(input_file).getNumPages()
    return total
项目:recon-ng    作者:captainhooligan    | 项目源码 | 文件源码
def pdf_parser(s):
    """Parse PDF data held in a string and return its document info.

    :param s: raw content of a PDF document.
    :return: dict of the document-info entries keyed by the entry name
        without its first character (the leading "/"); an empty dict when
        decryption is not implemented for the document or it has no
        document info.
    """
    s = s.strip()
    # required to suppress warning messages
    # The sink is kept open for every PyPDF2 call below, not just for the
    # constructor.  NOTE(review): presumably PyPDF2 can still write warnings
    # to ``warndest`` while decrypting or reading the info dictionary - a
    # closed sink would then raise; confirm against the PyPDF2 version used.
    with open(os.devnull, 'w') as fp:
        pdf = PdfFileReader(StringIO(s), strict=False, warndest=fp)
        if pdf.isEncrypted:
            try:
                pdf.decrypt('')
            except NotImplementedError:
                return {}
        meta = pdf.getDocumentInfo()
        # No info dictionary: there is nothing to report.
        if meta is None:
            return {}
        result = {}
        for key in meta.keys():
            result[key[1:]] = meta.get(key)
    return result
项目:open-syllabus-project    作者:davidmcclure    | 项目源码 | 文件源码
def pdf_date(path):

    """
    Extract a date from PDF file metadata.

    The first run of digits in the '/CreationDate' entry is parsed with the
    format '%Y%m%d%H%M%S'.

    Args:
        path (str): The file path.

    Returns:
        datetime: The created date.
    """

    reader = PdfFileReader(path)

    # Get rid of `D:` prefix and timezone.
    stamp = reader.documentInfo['/CreationDate']
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.search(r'\d+', stamp)

    return datetime.strptime(
        match.group(),
        '%Y%m%d%H%M%S'
    )
项目:repeat-aft    作者:ripeta    | 项目源码 | 文件源码
def extract(text, paper=None, logger=logger):
    """Return the author stored in the paper's PDF metadata.

    :param text: not used by this extractor.
    :param paper: object providing ``_read_document()``.
    :param logger: not used by this extractor.
    :return: ``(author, author, "extracted", "pdf metadata")``, or ``None``
        when no author is found or the PDF cannot be read.
    """
    # try using pypdf2/pdfminer
    try:
        pdf = paper._read_document()
        author = PdfFileReader(pdf).getDocumentInfo().author
    except Exception:
        # Best effort: any read failure means "no result".  ``Exception``
        # (instead of a bare except) lets KeyboardInterrupt/SystemExit through.
        # search for author or return None
        # Though currently there is no search function
        return None

    if not author:
        return None
    return (author, author, "extracted", "pdf metadata")
项目:repeat-aft    作者:ripeta    | 项目源码 | 文件源码
def extract(text, paper=None, logger=logger):
    """Return the title stored in the paper's PDF metadata.

    :param text: not used by this extractor.
    :param paper: object providing ``_read_document()``.
    :param logger: not used by this extractor.
    :return: ``(title, title, "extracted", "pdf metadata")``, or ``None``
        when no title is found or the PDF cannot be read.
    """
    # try using pypdf2/pdfminer
    try:
        pdf = paper._read_document()
        title = PdfFileReader(pdf).getDocumentInfo().title
    except Exception:
        # Best effort: any read failure means "no result".  ``Exception``
        # (instead of a bare except) lets KeyboardInterrupt/SystemExit through.
        # search for title or return None
        # Though currently there is no search function
        return None

    if not title:
        return None
    return (title, title, "extracted", "pdf metadata")
项目:ilovepdf    作者:sdelquin    | 项目源码 | 文件源码
def test_split():
    """Splitting test.pdf by "1-2,5-8" yields a 2-page and a 4-page PDF."""
    i = ILovePdf(config.PUBLIC_KEY, config.SECRET_KEY)
    i.new_task("split")
    i.add_file("test.pdf")
    i.execute(ranges="1-2,5-8")
    i.download()
    with zipfile.ZipFile("out.zip", "r") as zip_ref:
        zip_ref.extractall("test_split")
    assert len(glob.glob("test_split/*.pdf")) == 2
    # Close both PDFs before the directory is removed below; the handles
    # were previously left open.
    with open("test_split/test-1-2.pdf", "rb") as first, \
            open("test_split/test-5-8.pdf", "rb") as second:
        output_file1 = PdfFileReader(first)
        output_file2 = PdfFileReader(second)
        assert output_file1.getNumPages() == 2
        assert output_file2.getNumPages() == 4
    os.remove("out.zip")
    shutil.rmtree("test_split")
项目:pdf-server    作者:nathanielove    | 项目源码 | 文件源码
def get_pages(pdf_filename, from_, to):
    """Return a PDF (as bytes) containing pages ``from_``..``to`` of a file.

    Page numbers are 1-based and inclusive.  When ``to`` is smaller than
    ``from_`` only page ``from_`` is returned.

    :param pdf_filename: path of the source PDF.
    :param from_: first page to include.
    :param to: last page to include.
    """
    if to < from_:
        to = from_

    output = PdfFileWriter()
    # The source must stay open until the output is written; the context
    # manager also closes it when a page lookup fails.
    with open(pdf_filename, 'rb') as pdf_file:
        pdf = PdfFileReader(pdf_file)
        for i in range(from_ - 1, to):
            output.addPage(pdf.getPage(i))

        stream = BytesIO()
        output.write(stream)
    return stream.getvalue()
项目:Forensic-Tools    作者:MonroCoury    | 项目源码 | 文件源码
def pdfMetaData(file_path, save=True):
    '''Get PDF document metadata, takes 2 arguments, file_path and save (boolean, default is True)

    The document-info entries and the file's timestamps and owner id are
    printed.  When ``save`` is true the same text is passed to
    ``saveResult()`` with the target name ``<file name>.txt``.

    Exits the program (``sys.exit``) when the document cannot be decrypted
    with an empty password or its document info cannot be read.
    '''
    with open(file_path, "rb") as pdf_file:
        pdf_doc = PdfFileReader(pdf_file)

        if pdf_doc.isEncrypted:
            # ``except Exception`` instead of a bare except: the bare form
            # also caught the SystemExit raised for a wrong password and
            # replaced its message with the "unsupported algorithm" one.
            try:
                decrypt_status = pdf_doc.decrypt("")
            except Exception:
                sys.exit("target pdf document is encrypted with an unsupported algorithm... exiting...")
            if decrypt_status != 1:
                sys.exit("target pdf document is encrypted... exiting...")

        doc_info = pdf_doc.getDocumentInfo()
        stats = os.stat(file_path)
        now = dt.now()
        file_name = getFileName(file_path)
        metadata = "Time: %d/%d/%d %d : %d : %d. Found the following metadata for file %s:\n\n" % (now.year, now.month,
                                                                                                   now.day, now.hour, now.minute,
                                                                                                   now.second, file_name[:-4])
        # Read the entries while the file is still open.
        try:
            for md in doc_info:
                metadata += str(md[1:]) + " : " + pretifyPyPDF2Time(str(md[1:]) ,str(doc_info[md])) + "\n"
        except TypeError:
            sys.exit("Couldn't read document info! Make sure target is a valid pdf document...")

    metadata += "Last metadata mod Date: %s\nLast Mod Date: %s\nLast Access Date: %s\nOwner User ID: %s" %(dt.fromtimestamp(stats.st_ctime),
                                                                                                           dt.fromtimestamp(stats.st_mtime),
                                                                                                           dt.fromtimestamp(stats.st_atime),
                                                                                                           stats.st_uid)
    try:
        print(metadata)
    except UnicodeEncodeError:
        print("Console encoding can't decode the result. Enter chcp 65001 in the console and rerun the script.")

    if save:
        tgt = file_name + ".txt"

        saveResult(tgt, metadata)
项目:Plamber    作者:OlegKlimenko    | 项目源码 | 文件源码
def validate_pdf(value):
    """
    Check that an uploaded file can be parsed as a PDF.

    The whole file is read into memory and handed to PyPDF2.

    :param value: The file object.
    :raises ValidationError: when PyPDF2 reports a PDF read error.
    """
    payload = io.BytesIO(value.read())
    try:
        PyPDF2.PdfFileReader(payload)
    except PyPDF2.utils.PdfReadError:
        raise ValidationError('Tried to upload not PDF as a book!')
项目:document_clipper    作者:reclamador    | 项目源码 | 文件源码
def slice(self, pdf_file_path, page_actions, final_pdf_path):
    """
    Create new pdf from a slice of pages of a PDF
    :param pdf_file_path: path of the source PDF document, from which a new PDF file will be created.
    :param page_actions: list of tuples, each tuple containing the page number and the rotation to
    be applied. The page number is non-zero indexed (first is page 1, and so on).
    :param final_pdf_path: path the resulting PDF is written to.
    :return: None. Writes the resulting PDF file into the provided path.
    :raises Exception: when a page number is lower than 1 or larger than the number of pages of
    the source document.
    """
    output = PdfFileWriter()
    with open(pdf_file_path, 'rb') as file_input:
        source_pdf = PdfFileReader(file_input, strict=False)

        # Check page actions correspond to valid input PDF pages
        input_num_pages = source_pdf.getNumPages()
        # A list comprehension works on Python 2 and 3 (zip(...)[0] does not)
        # and tolerates an empty page_actions.
        actions_page_numbers = [num_page for num_page, _ in page_actions]

        if actions_page_numbers:
            if min(actions_page_numbers) < 1:
                raise Exception(u"Invalid page numbers range in actions: page numbers cannot be lower than 1.")

            # Pages are 1-based, so the largest valid number equals the page
            # count (the previous check accepted page count + 1).
            if max(actions_page_numbers) > input_num_pages:
                raise Exception(u"Invalid page numbers range in actions: page numbers cannot exceed the maximum numbers "
                                u"of pages of the source PDF document.")

        # Perform actual slicing + rotation
        # NOTE(review): the rotation is applied counter-clockwise here while
        # the original docstring described it as clockwise - confirm intent.
        for num_page, rotation in page_actions:
            page = source_pdf.getPage(num_page - 1)
            if rotation:
                page = page.rotateCounterClockwise(rotation)
            output.addPage(page)
        # Written while the source is still open.
        self._write_to_pdf(output, final_pdf_path)
项目:Resume-Ranker    作者:sadmicrowave    | 项目源码 | 文件源码
def parse_pdf_doc(self):
    """
    Open a pdf document filetype and parse contents to string variable
    for matching comparison.

    :return: the stripped text of all pages, or None when it is empty.
    """
    # open the file, with read/binary privileges; the context manager closes
    # it even when text extraction fails
    with open(self.file, 'rb') as f:
        pdf = PyPDF2.PdfFileReader(f)
        docText = ''.join(page.extractText() for page in pdf.pages)

    return docText.strip() or None
项目:Marisol    作者:wikkiewikkie    | 项目源码 | 文件源码
def __init__(self, file, prefix, fill, start, area):
    """
    Set up a document whose pages are to be numbered.

    The PDF content is copied into memory, and one ``Page`` is created per
    page of the PDF, numbered consecutively from ``start``.

    Args:
        file (): PDF file associated with this document: an object with a
            ``read()`` method, or a path that can be opened.
        prefix (str): Bates number prefix.
        fill (int): Length to zero-pad number to.
        start (int): Number to start with.
        area (Area): Area on the document where the number should be drawn
    """
    try:
        raw = file.read()
    except AttributeError:
        # No read(): treat the argument as a path.
        with open(file, "rb") as handle:
            raw = handle.read()
    self.file = io.BytesIO(raw)
    self.reader = PdfFileReader(self.file)

    self.prefix = prefix
    self.fill = fill
    self.start = copy.copy(start)
    self.area = area

    # One overlay slot per area; only the requested area gets a Bates overlay.
    self.overlays = dict((slot, None) for slot in Area)
    self.overlays[area] = BatesOverlay(None, self.area)

    self.index = 0

    self.pages = []
    for offset, pdf_page in enumerate(self.reader.pages):
        self.pages.append(
            Page(self, pdf_page, self.prefix, self.fill, self.start + offset)
        )
项目:pdfdir    作者:chroming    | 项目源码 | 文件源码
def __init__(self, path):
    """Load the PDF at ``path`` into a new writer.

    All pages are appended to ``self.writer``, together with those
    document-info entries whose value is a string or bytes object.

    :param path: path of the source PDF.
    """
    self.path = path
    # NOTE(review): the source file is left open here - presumably the writer
    # still needs it later; confirm before closing it.
    source = PdfFileReader(open(path, "rb"))
    self.writer = PdfFileWriter()
    self.writer.appendPagesFromReader(source)

    text_types = (utils.string_type, utils.bytes_type)
    info = {}
    for key, val in source.getDocumentInfo().items():
        if isinstance(val, text_types):
            info[key] = val
    self.writer.addMetadata(info)
项目:THESIS_LIFEBOAT    作者:Jasper-Koops    | 项目源码 | 文件源码
def reader(title):
    """Read the PDF and convert it to text.

    Pages are extracted in order.  Extraction stops at the first page that
    raises an error, and the text gathered so far is returned.

    :param title: path of the PDF file.
    :return: the concatenated text of the extracted pages.
    """
    chunks = []
    # The context manager closes the file, which was previously left open.
    with open(title, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        # Iterate over the real page range instead of overshooting it and
        # relying on the resulting error to end the loop.
        for x in range(int(pdfReader.numPages)):
            try:
                pageObj = pdfReader.getPage(x)
                chunks.append(str(pageObj.extractText()))
            except Exception:
                # Best effort, as before: keep what was read so far.
                break
    return "".join(chunks)
项目:stereo    作者:suda    | 项目源码 | 文件源码
def generate_document(self, data):
        """
        Render one document from ``data`` and write it to disk as a PDF.

        Each class in ``self.fields`` is instantiated with the value at the
        same position in ``data`` and rendered onto a canvas. When
        ``self.template_file`` is set, the rendered page is merged onto the
        first page of that template; otherwise the rendered page is used on
        its own. The result is written to
        ``<self.output_dir>/<self.generate_filename(data)>.pdf``.

        Args:
            data: Indexable values, one per entry in ``self.fields``.
                NOTE(review): fewer values than fields presumably raises
                from ``data[index]`` -- see the TODO below.
        """
        template_stream = None
        if self.template_file is not None:
            template_stream = open(self.template_file, 'rb')
        try:
            template = None
            if template_stream is not None:
                template = PdfFileReader(template_stream)

            packet = StringIO()
            c = canvas.Canvas(packet, pagesize=(self.width, self.height))

            for index, field_cls in enumerate(self.fields):
                # TODO: Catch exception if there is less columns than fields
                field = field_cls(self, c, data[index])
                field.render()

            # Save canvas
            c.save()
            packet.seek(0)
            text = PdfFileReader(packet)
            output = PdfFileWriter()
            if template is not None:
                # Merge text with base
                page = template.getPage(0)
                page.mergePage(text.getPage(0))
            else:
                page = text.getPage(0)
            output.addPage(page)

            # Save file
            filename = "%s/%s.pdf" % (self.output_dir, self.generate_filename(data))
            with open(filename, 'wb') as output_stream:
                output.write(output_stream)
        finally:
            # The template stream has to stay open until the output has been
            # written; close it afterwards so the handle is not leaked.
            if template_stream is not None:
                template_stream.close()
项目:Copernicus    作者:Soroboruo    | 项目源码 | 文件源码
def getPDFContent(path):
    """Return the text of every page of a PDF as a single line.

    The page texts are joined and every run of whitespace (including line
    breaks) is collapsed to one space; leading and trailing whitespace is
    removed.

    Args:
        path: Passed unchanged to ``PyPDF2.PdfFileReader`` as its stream
            argument.

    Returns:
        The whitespace-normalised text of all pages.
    """
    # PdfFileReader's second positional parameter is ``strict``, not a file
    # mode. The old call passed "rb" there, which is truthy, so spell out the
    # flag that was effectively in use.
    pdf = PyPDF2.PdfFileReader(path, strict=True)
    page_texts = [pdf.getPage(i).extractText() for i in range(pdf.getNumPages())]
    return " ".join("\n".join(page_texts).split())
项目:caj2pdf    作者:JeziL    | 项目源码 | 文件源码
def add_outlines(toc, filename, output):
    """Copy the PDF ``filename`` to ``output`` with an outline built from ``toc``.

    Args:
        toc: Sequence of dicts. Each entry is read for ``"title"`` (text, or
            UTF-8 encoded bytes), ``"page"`` (handed to ``make_dest``) and
            ``"node"`` (an object whose ``real_parent``/``prev``/``next``/
            ``first``/``last`` methods return a related node or None).
        filename: Path of the source PDF.
        output: Path the resulting PDF is written to.
    """
    build_outlines_btree(toc)
    pdf_out = PdfFileWriter()
    # Outline-item key for each node relation that may be present.
    opt_keys = {"real_parent": "/Parent", "prev": "/Prev", "next": "/Next", "first": "/First", "last": "/Last"}
    # The source stays open until the output has been written, then both
    # handles are closed by the context managers.
    with open(filename, 'rb') as input_file:
        pdf_in = PdfFileReader(input_file)
        for p in pdf_in.pages:
            pdf_out.addPage(p)
        toc_num = len(toc)
        # References are created before the objects exist: idorefs[0] is the
        # outline root and idorefs[1:] the items, which are added to the
        # writer in that same order further down. Presumably idoix is the
        # object number the next added object will receive.
        idoix = len(pdf_out._objects) + 1
        idorefs = [PDF.IndirectObject(x + idoix, 0, pdf_out) for x in range(toc_num + 1)]
        ol = PDF.DictionaryObject()
        ol.update({
            PDF.NameObject("/Type"): PDF.NameObject("/Outlines"),
            PDF.NameObject("/First"): idorefs[1],
            PDF.NameObject("/Last"): idorefs[-1],
            PDF.NameObject("/Count"): PDF.NumberObject(toc_num)
        })
        olitems = []
        for t in toc:
            title = t["title"]
            # Only byte strings need decoding; text titles are used as-is so
            # this also works where str has no decode() method.
            if isinstance(title, bytes):
                title = title.decode("utf-8")
            oli = PDF.DictionaryObject()
            oli.update({
                PDF.NameObject("/Title"): PDF.TextStringObject(title),
                PDF.NameObject("/Dest"): make_dest(pdf_out, t["page"])
            })
            for k, v in opt_keys.items():
                n = getattr(t["node"], k)()
                if n is not None:
                    oli.update({
                        PDF.NameObject(v): idorefs[n.index]
                    })
            olitems.append(oli)
        pdf_out._addObject(ol)
        for i in olitems:
            pdf_out._addObject(i)
        pdf_out._root_object.update({
            PDF.NameObject("/Outlines"): idorefs[0]
        })
        with open(output, "wb") as output_file:
            pdf_out.write(output_file)
项目:pdf_liberty    作者:mplitnikas    | 项目源码 | 文件源码
def iter_pdf_page(self, f):
        """
        Lazily yield each page object of the PDF ``f``, in page order.

        ``f`` is handed to ``PdfFileReader`` unchanged; nothing is read
        until the first item is requested.
        """
        pdf = PdfFileReader(f)
        page_total = pdf.getNumPages()
        for index in range(page_total):
            yield pdf.getPage(index)
项目:pdf_liberty    作者:mplitnikas    | 项目源码 | 文件源码
def iter_pdf_page_text(self, filename):
        """
        Lazily yield the ASCII-only text of each page of the PDF ``filename``.

        Non-ASCII characters are dropped and newlines are replaced by
        spaces; each yielded item is the encoded (byte string) result.

        When iteration starts, ``filename`` is stored on ``self.filename``
        and the page count is logged at INFO level.
        """
        self.filename = filename
        reader = PdfFileReader(filename)
        page_count = reader.getNumPages()
        logging.info("pdf scanner found %d pages in %s", page_count, filename)
        for pgnum in range(page_count):
            text = reader.getPage(pgnum).extractText()
            # encode() produces a byte string, so the replacement arguments
            # must be bytes as well; str arguments raise TypeError on Python 3.
            yield text.encode('ascii', 'ignore').replace(b'\n', b' ')
项目:pdf_liberty    作者:mplitnikas    | 项目源码 | 文件源码
def iter_pdf_page(self, f):
        """
        Lazily yield each page object of the PDF ``f``, in page order.

        ``f`` is handed to ``PdfFileReader`` unchanged; nothing is read
        until the first item is requested.
        """
        pdf = PdfFileReader(f)
        page_total = pdf.getNumPages()
        for index in range(page_total):
            yield pdf.getPage(index)
项目:pdf_liberty    作者:mplitnikas    | 项目源码 | 文件源码
def iter_pdf_page_text(self, filename):
        """
        Lazily yield the ASCII-only text of each page of the PDF ``filename``.

        Non-ASCII characters are dropped and newlines are replaced by
        spaces; each yielded item is the encoded (byte string) result.

        When iteration starts, ``filename`` is stored on ``self.filename``
        and the page count is logged at INFO level.
        """
        self.filename = filename
        reader = PdfFileReader(filename)
        page_count = reader.getNumPages()
        logging.info("pdf scanner found %d pages in %s", page_count, filename)
        for pgnum in range(page_count):
            text = reader.getPage(pgnum).extractText()
            # encode() produces a byte string, so the replacement arguments
            # must be bytes as well; str arguments raise TypeError on Python 3.
            yield text.encode('ascii', 'ignore').replace(b'\n', b' ')
项目:pdf_liberty    作者:mplitnikas    | 项目源码 | 文件源码
def iter_pdf_page_text(self, filename):
        """
        Lazily yield the ASCII-only text of each page of the PDF ``filename``.

        Non-ASCII characters are dropped and newlines are replaced by
        spaces; each yielded item is the encoded (byte string) result.

        When iteration starts, ``filename`` is stored on ``self.filename``
        and the page count is logged at INFO level.
        """
        self.filename = filename
        reader = PdfFileReader(filename)
        page_count = reader.getNumPages()
        logging.info("pdf scanner found %d pages in %s", page_count, filename)
        for pgnum in range(page_count):
            text = reader.getPage(pgnum).extractText()
            # encode() produces a byte string, so the replacement arguments
            # must be bytes as well; str arguments raise TypeError on Python 3.
            yield text.encode('ascii', 'ignore').replace(b'\n', b' ')
项目:pdf_liberty    作者:mplitnikas    | 项目源码 | 文件源码
def iter_pdf_page(self, f):
        """
        Lazily yield each page object of the PDF ``f``, in page order.

        ``f`` is handed to ``PdfFileReader`` unchanged; nothing is read
        until the first item is requested.
        """
        pdf = PdfFileReader(f)
        page_total = pdf.getNumPages()
        for index in range(page_total):
            yield pdf.getPage(index)
项目:pdf_liberty    作者:mplitnikas    | 项目源码 | 文件源码
def iter_pdf_page_text(self, filename):
        """
        Lazily yield the ASCII-only text of each page of the PDF ``filename``.

        Non-ASCII characters are dropped and newlines are replaced by
        spaces; each yielded item is the encoded (byte string) result.

        When iteration starts, ``filename`` is stored on ``self.filename``
        and the page count is logged at INFO level.
        """
        self.filename = filename
        reader = PdfFileReader(filename)
        page_count = reader.getNumPages()
        logging.info("pdf scanner found %d pages in %s", page_count, filename)
        for pgnum in range(page_count):
            text = reader.getPage(pgnum).extractText()
            # encode() produces a byte string, so the replacement arguments
            # must be bytes as well; str arguments raise TypeError on Python 3.
            yield text.encode('ascii', 'ignore').replace(b'\n', b' ')
项目:oabot    作者:dissemin    | 项目源码 | 文件源码
def check_nb_pages(self, data):
        """
        Tell whether the PDF held in ``data`` has more than two pages.

        The page count is printed when it can be determined. If PyPDF2
        reports a ``PyPdfError`` while reading, the answer is False.
        """
        try:
            page_count = PyPDF2.PdfFileReader(StringIO(data)).getNumPages()
        except PyPdfError:
            return False
        print("num pages: %d" % page_count)
        return page_count > 2
项目:krop    作者:gocarlos    | 项目源码 | 文件源码
def loadFromStream(self, stream):
        self.reader = PdfFileReader(stream, strict=False)