python解析pdf文件

加载pdf文件，获取pdf的每一页对象：

import pdfplumber
# Placeholder: set this to the location of the PDF file to parse.
path = ""
with pdfplumber.open(path) as pdf_obj:
    # One page object per page of the document.
    pages = pdf_obj.pages
    # The original snippet read `page` without ever binding it (NameError).
    # Work on the first page; this raises IndexError if the document has no
    # pages at all.
    page = pages[0]
    # Width and height of the page.
    p_width = page.width
    p_height = page.height

获取当页的所有文本：

# Extract the text of the current page.
# NOTE(review): assumes `page` is a pdfplumber page object taken from
# `pdf_obj.pages` — confirm against the surrounding script.
text = page.extract_text()

如果没有抽取到任何文本，text 为 None。该方法也会把表格中的文本一并抽取出来，但文本是按“行”读取的，因此表格内的文字可能出现“错行”的情况。

如果页面中既有文字也有表格，可以考虑抽取出本页中所有的文本对象及对应的坐标，然后根据坐标将文本按原位置还原。

# `decimal` is needed for the type check below; the original snippet used it
# without importing it, which raised NameError on the first word.
import decimal

# Each word is a dict; the accompanying article describes these as the text
# objects together with their coordinates.
words = page.extract_words()
new_words = []
for word in words:
    # Convert Decimal values to plain floats in place. Only existing keys are
    # reassigned, so the dict's size does not change while it is iterated.
    for key, value in word.items():
        if isinstance(value, decimal.Decimal):
            word[key] = float(value)
    new_words.append(word)

也可以单独解析表格对象：

def get_new_cells(cells):
    """Return a copy of *cells* with every ``None`` entry replaced.

    Each ``None`` cell becomes the 4-tuple ``(None, None, None, None)``;
    all other entries are kept as they are, in their original order.
    """
    empty_cell = (None, None, None, None)
    return [empty_cell if cell is None else cell for cell in cells]

# Imported once up front; the original re-ran this import on every row.
from pdfplumber.table import Table as ptb

ts = page.find_tables()
# Only the first table is processed. Guard against a page without tables,
# where `ts[0]` would raise IndexError.
if ts:
    ts1 = ts[0]
    rows = ts1.rows
    for row in rows:
        # Replace missing (None) cells with placeholder 4-tuples, then build a
        # single-row table from them so the row can be extracted on its own.
        new_cells = get_new_cells(row.cells)
        nt = ptb(page, new_cells)
        print(nt.extract())

获取本页中的所有图片并保存：

# Stream filters whose data is written to disk as-is, mapped to the file
# extension used for the saved file.
_RAW_FILTER_EXTENSIONS = {
    'DCTDecode': '.jpg',
    'JPXDecode': '.jp2',
    'CCITTFaxDecode': '.tiff',
}


def get_images(page_obj, pdf_path, output_dir='E:/temp/imgs'):
    """Save the images of one page to disk and describe the saved files.

    Args:
        page_obj: Page object exposing an ``images`` list. Each entry is a
            dict with an optional ``'name'`` and a ``'stream'`` whose
            ``attrs`` mapping holds ``Filter``, ``ColorSpace``, ``Width`` and
            ``Height`` and whose ``get_data()`` returns the image bytes.
        pdf_path: Path of the source PDF; its base name (without a ``.pdf``
            suffix) prefixes every saved file name.
        output_dir: Existing directory the images are written to. Defaults to
            the location that used to be hard-coded.

    Returns:
        A list of ``{'name': ..., 'path': ...}`` dicts, one per image that was
        saved. Images that are skipped or fail are logged and left out; a
        failure on one image never aborts the others.
    """
    import logging
    import os

    # os.path honours the platform's separators, unlike split('/'), and the
    # suffix is only stripped when it really is the file extension.
    base_name = os.path.basename(pdf_path)
    stem, suffix = os.path.splitext(base_name)
    pdf_name = stem if suffix.lower() == '.pdf' else base_name
    main_path = '%s/%s' % (output_dir, pdf_name)

    image_list = []
    for img in page_obj.images:
        try:
            name = img.get('name', 'abc')
            stream = img.get('stream')
            attrs = stream.attrs
            # A missing or non-name Filter yields None and is reported by the
            # final branch below.
            img_filter_name = getattr(attrs.get('Filter'), 'name', None)

            if img_filter_name == 'FlateDecode':
                width, height = attrs.get('Width'), attrs.get('Height')
                if not width or not height:
                    continue
                # The colour space only matters for raw pixel data, so it is
                # resolved here; resolving it up front made images of every
                # other filter fail when ColorSpace was absent or unnamed.
                color_space = attrs.get('ColorSpace')
                if getattr(color_space, 'name', None) == 'DeviceRGB':
                    mode = "RGB"
                else:
                    mode = "P"
                new_img_path = '%s_%s.png' % (main_path, name)
                # NOTE(review): `Image` is expected to be PIL's Image module,
                # imported at module level — it is not imported in this block.
                new_img = Image.frombytes(mode, (width, height), stream.get_data())
                new_img.save(new_img_path)
            elif img_filter_name in _RAW_FILTER_EXTENSIONS:
                new_img_path = '%s_%s%s' % (
                    main_path, name, _RAW_FILTER_EXTENSIONS[img_filter_name]
                )
                # `with` closes the file even if the write fails.
                with open(new_img_path, 'wb') as out_file:
                    out_file.write(stream.get_data())
            else:
                logging.error('wrong img_filter_name: %s', img_filter_name)
                continue

            image_list.append(
                {'name': name, 'path': new_img_path}
            )
        except Exception as e:
            # Deliberately broad: extraction is best-effort per image.
            logging.error('get_images failed, pdf_path: %s, error: %s', pdf_path, e)
    return image_list

也尝试过使用pdfminer解析pdf文档：

def pdfminer_test1(path='', target_page=5):
    """Walk the layout of one page with pdfminer and print its images.

    Text boxes and images are collected from the page's top-level layout
    objects and from arbitrarily nested figures. The collected lists are
    local to the function; only the images are printed.

    Args:
        path: Location of the PDF file to open. Defaults to the empty
            placeholder used by the original snippet.
        target_page: 1-based number of the page to analyse (default 5).

    Returns:
        None, both when the document is not extractable and after the page
        has been processed.
    """
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams, LTFigure, LTImage, LTTextBox
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfparser import PDFParser

    # `with` closes the file on every exit path; the original leaked the
    # handle on the early return below and whenever parsing raised.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        parser.set_document(doc)
        if not doc.is_extractable:
            return None

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for count, page in enumerate(PDFPage.create_pages(doc), start=1):
            if count != target_page:
                continue

            texts = []
            images = []
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBox):
                    # The text itself is available through x.get_text().
                    texts.append(x)
                elif isinstance(x, LTImage):
                    print(x)
                    images.append(x)
                elif isinstance(x, LTFigure):
                    # Figures can nest; walk them with an explicit stack.
                    figurestack = [x]
                    while figurestack:
                        figure = figurestack.pop()
                        for f in figure:
                            if isinstance(f, LTTextBox):
                                texts.append(f)
                            elif isinstance(f, LTImage):
                                # Print the nested image itself; the original
                                # printed the enclosing top-level figure `x`.
                                print(f)
                                images.append(f)
                            elif isinstance(f, LTFigure):
                                figurestack.append(f)
            # Nothing is done with later pages, so stop iterating.
            break
    return None

最初的目标是：拿到一个pdf文件后，能够按照原文件的版式，把其中的所有文本（包括表格中的文本）和图片完整地抽取出来，并还原它们的位置。但是一直没找到好的办法。

上面的代码整理到了这里：pdfParser

后来还尝试过另一种方式：先将原文件转换为Word（.docx）文档，然后用python-docx读取该文档。

import docx
from docx.document import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph


def iter_block_items(parent):
    """Yield each paragraph and table child within *parent*, in document order.

    Args:
        parent: Either a ``Document`` (its body element is walked) or a
            ``_Cell`` (its ``_tc`` element is walked), so the function also
            works for cells that themselves contain paragraphs and tables.

    Yields:
        ``Paragraph`` for every ``CT_P`` child and ``Table`` for every
        ``CT_Tbl`` child. Children of any other type are reported with a
        ``print`` and skipped.

    Raises:
        ValueError: If *parent* is neither a ``Document`` nor a ``_Cell``.
    """
    if isinstance(parent, Document):
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        raise ValueError("something's not right")

    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)
        else:
            # Report the type of the unrecognised child; the original printed
            # the parent element's type, which is the same for every child.
            print('other type: %s' % type(child))


def docx_test():
    """Print the style name of every block, then the document's text.

    Blocks come from ``iter_block_items`` in document order. A table
    contributes one line per row, with the cell texts joined by single
    spaces; a paragraph contributes its text followed by a newline.
    """
    path = ''
    document = docx.Document(path)

    pieces = []
    for block in iter_block_items(document):
        print(block.style.name)

        if isinstance(block, Table):
            pieces.extend(
                ' '.join(cell.text for cell in row.cells) + '\n'
                for row in block.rows
            )
        elif isinstance(block, Paragraph):
            pieces.append(block.text + '\n')

    print(''.join(pieces))

上面的代码只提取了docx文件中的文字和表格，未包含图片对象。

参考：

python PDFMiner 处理pdf，保存文本及图片

pdfplumber是怎么做表格抽取的（一）

今天的文章python解析pdf文件分享到此就结束了，感谢您的阅读，如果确实帮到您，您可以动动手指转发给其他人。

版权声明：本文内容由互联网用户自发贡献，该文观点仅代表作者本人。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容，请发送邮件至举报，一经查实，本站将立刻删除。
如需转载请保留出处：https://bianchenghao.cn/27810.html