加载pdf文件,获取pdf的每一页对象:
import pdfplumber
# Path of the PDF to open; left empty in this example.
path = ""
with pdfplumber.open(path) as pdf_obj:
    # One page object per page of the document.
    pages = pdf_obj.pages
    # The original snippet read `page.width` without ever binding `page`;
    # iterate over the pages so that `page` is defined.
    for page in pages:
        # Width and height of the current page.
        p_width = page.width
        p_height = page.height
获取当页的所有文本:
# Extract all text of the current page into `text`.
# NOTE(review): `page` is presumably one of the page objects taken from
# `pdf_obj.pages`; it is not assigned in the visible code - confirm.
text = page.extract_text()
如果未抽取到文本,text为None。使用该方法时,表格中的文本也会被一并抽取出来,但文本是按“行”读取的,表格内的文字可能会出现“错行”的情况。
如果页面中既有文字也有表格,可以考虑抽取出本页中所有的文本对象及对应的坐标,然后根据坐标将文本按原位置还原。
# `decimal` is needed for the isinstance check below; the original snippet
# referenced `decimal.Decimal` without importing the module.
import decimal

# Every word on the page, each as a dict of fields.
words = page.extract_words()
new_words = []
for word in words:
    # Convert Decimal values to plain floats, in place. Only existing keys are
    # reassigned, so the dict does not change size while being iterated.
    for k, v in word.items():
        if isinstance(v, decimal.Decimal):
            word[k] = float(v)
    new_words.append(word)
也可以单独解析表格对象:
def get_new_cells(cells):
    """Return a copy of *cells* with every ``None`` entry replaced by a placeholder.

    Args:
        cells: Iterable of cell entries; an entry may be ``None``.

    Returns:
        A new list in the same order as *cells*, where each ``None`` entry has
        been replaced by the 4-tuple ``(None, None, None, None)`` and every
        other entry is kept unchanged.
    """
    empty_cell = (None, None, None, None)
    return [empty_cell if item is None else item for item in cells]
# Imported once up front instead of on every loop iteration.
from pdfplumber.table import Table as ptb

# All tables detected on the current page.
ts = page.find_tables()
# Guard against pages without any table; indexing ts[0] on an empty list
# would raise IndexError.
if ts:
    # Only the first table is processed.
    ts1 = ts[0]
    rows = ts1.rows
    for row in rows:
        cells = row.cells
        # Replace None cells with a 4-tuple placeholder before building a Table.
        new_cells = get_new_cells(cells)
        # Build a table from this row's cells alone and print its extracted content.
        nt = ptb(page, new_cells)
        print(nt.extract())
获取本页中的所有图片并保存:
def get_images(page_obj, pdf_path, output_dir='E:/temp/imgs'):
    """Save the images of one page to disk and report what was written.

    Each entry of ``page_obj.images`` is handled independently: a failure on
    one image is logged and the remaining images are still processed.

    The output file is ``<output_dir>/<pdf name>_<image name><ext>``, where the
    extension depends on the stream's ``Filter``:

    * ``FlateDecode``    -> ``.png`` (rebuilt from raw pixels via ``Image.frombytes``)
    * ``DCTDecode``      -> ``.jpg`` (stream bytes written unchanged)
    * ``JPXDecode``      -> ``.jp2`` (stream bytes written unchanged)
    * ``CCITTFaxDecode`` -> ``.tiff`` (stream bytes written unchanged)

    Images with any other filter are logged and skipped.

    Args:
        page_obj: Object exposing an ``images`` iterable of dict-like image
            descriptions, each with a ``'stream'`` entry (and optionally a
            ``'name'`` entry).
        pdf_path: Path of the source PDF, using ``/`` separators; only its
            last component (with ``.pdf`` removed) is used for file names.
        output_dir: Directory the images are written to. It is not created
            here, so it must already exist.

    Returns:
        A list of ``{'name': ..., 'path': ...}`` dicts, one per saved image.

    Note:
        ``Image`` is not imported in this block; it is presumably
        ``PIL.Image`` imported elsewhere in the module - confirm.
    """
    import logging

    # Filters whose stream bytes are written to disk unchanged.
    # NOTE(review): raw CCITTFaxDecode data may need a TIFF header to be
    # readable by image viewers - confirm.
    passthrough_ext = {
        'DCTDecode': '.jpg',
        'JPXDecode': '.jp2',
        'CCITTFaxDecode': '.tiff',
    }

    image_list = []
    pdf_name = pdf_path.split('/')[-1].replace('.pdf', '')
    main_path = '%s/%s' % (output_dir, pdf_name)

    for img in page_obj.images:
        try:
            name = img.get('name', 'abc')
            new_img_path = '%s_%s' % (main_path, name)
            ism = img.get('stream')
            attrs = ism.attrs
            # Filter may be missing or may not be a single name object; in
            # that case the name resolves to None and the image is skipped
            # through the "wrong img_filter_name" branch below.
            img_filter_name = getattr(attrs.get('Filter'), 'name', None)

            if img_filter_name == 'FlateDecode':
                width, height = attrs.get('Width'), attrs.get('Height')
                if not width or not height:
                    continue
                # The colour space only matters when pixels are rebuilt, so it
                # is resolved here rather than for every filter; previously a
                # colour space without a ``name`` made JPEG/JPX/CCITT images
                # fail even though they never use it.
                color_space_name = getattr(attrs.get('ColorSpace'), 'name', None)
                mode = 'RGB' if color_space_name == 'DeviceRGB' else 'P'
                new_img_path = new_img_path + '.png'
                new_img = Image.frombytes(mode, (width, height), ism.get_data())
                new_img.save(new_img_path)
            elif img_filter_name in passthrough_ext:
                new_img_path = new_img_path + passthrough_ext[img_filter_name]
                # The context manager closes the file even if the write fails.
                with open(new_img_path, 'wb') as out_file:
                    out_file.write(ism.get_data())
            else:
                logging.error('wrong img_filter_name: %s' % img_filter_name)
                continue

            image_list.append({'name': name, 'path': new_img_path})
        except Exception as e:
            # Best effort: one broken image must not abort the whole page.
            logging.error('get_images failed, pdf_path: %s, error: %s' % (pdf_path, e))
    return image_list
也尝试过使用pdfminer解析pdf文档:
def pdfminer_test1(path='', page_number=5):
    """Walk the layout of one page with pdfminer, collecting text boxes and images.

    The page's layout objects are scanned; ``LTTextBox`` items go to a local
    ``texts`` list and ``LTImage`` items (which are also printed) go to a
    local ``images`` list. ``LTFigure`` containers are searched recursively
    with an explicit stack. The collected lists are not returned.

    Args:
        path: Path of the PDF file to open. Defaults to the empty string.
        page_number: 1-based number of the page to analyse. Defaults to 5.

    Returns:
        None, in every case (including when the document is not extractable).
    """
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams, LTImage, LTFigure, LTTextBox
    from pdfminer.pdfpage import PDFPage

    # The context manager closes the file on every exit path, including the
    # early return below, which previously leaked the handle.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        parser.set_document(doc)
        if not doc.is_extractable:
            return None

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for count, page in enumerate(PDFPage.create_pages(doc), start=1):
            if count != page_number:
                continue
            texts = []
            images = []
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBox):
                    # The text itself is available through x.get_text().
                    texts.append(x)
                if isinstance(x, LTImage):
                    print(x)
                    images.append(x)
                if isinstance(x, LTFigure):
                    # Figures can nest; walk them with an explicit stack.
                    figurestack = [x]
                    while figurestack:
                        figure = figurestack.pop()
                        for f in figure:
                            if isinstance(f, LTTextBox):
                                texts.append(f)
                            if isinstance(f, LTImage):
                                # Print the nested image itself; the original
                                # printed the enclosing top-level figure.
                                print(f)
                                images.append(f)
                            if isinstance(f, LTFigure):
                                figurestack.append(f)
            # Only one page is analysed, so the remaining pages are skipped.
            break
最开始的目的是:拿到一个pdf文件后,能够按照原文件的版式,把pdf中的所有文本(包括表格中的文本)和图片完整地抽取出来(包括位置还原),但是一直没找到好的办法。
上面的代码整理到了这里:pdfParser
后来还尝试过其他方式:先将原文件转换为docx文档,然后用python-docx读取该docx文档。
import docx
from docx.document import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
def iter_block_items(parent):
    """Yield each paragraph and table child within *parent*, in document order.

    Each yielded value is an instance of either ``Table`` or ``Paragraph``.
    Children of any other element type are reported with ``print`` and
    skipped.

    Args:
        parent: A ``Document`` object or a ``_Cell`` object (a cell can itself
            contain paragraphs and tables).

    Yields:
        ``Paragraph`` for every ``CT_P`` child and ``Table`` for every
        ``CT_Tbl`` child of the parent's container element.

    Raises:
        ValueError: If *parent* is neither a ``Document`` nor a ``_Cell``.
            As this is a generator, the error surfaces on first iteration.
    """
    if isinstance(parent, Document):
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        raise ValueError("something's not right")

    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)
        else:
            # Report the type of the unrecognised child; the original printed
            # the container's type, which is the same on every iteration.
            print('other type: %s' % type(child))
def docx_test():
    """Print the paragraph and table text of a document, in document order.

    The document at ``path`` (empty in this example) is opened with
    ``docx.Document``. For every block yielded by ``iter_block_items`` the
    block's style name is printed. A table contributes one line per row, with
    the cell texts joined by single spaces; a paragraph contributes its text
    as one line. The accumulated text is printed once at the end.
    """
    path = ''
    document = docx.Document(path)

    pieces = []
    for block in iter_block_items(document):
        print(block.style.name)
        if isinstance(block, docx.table.Table):
            # One output line per table row.
            for row in block.rows:
                cell_texts = [cell.text for cell in row.cells]
                pieces.append(' '.join(cell_texts) + '\n')
        if isinstance(block, docx.text.paragraph.Paragraph):
            pieces.append(block.text + '\n')

    content = ''.join(pieces)
    print(content)
上面的代码只提取了docx文件中的段落文字和表格文字,未包含图片对象。
参考:
今天的文章python解析pdf文件分享到此就结束了,感谢您的阅读,如果确实帮到您,您可以动动手指转发给其他人。
版权声明:本文内容由互联网用户自发贡献,该文观点仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 举报,一经查实,本站将立刻删除。
如需转载请保留出处:https://bianchenghao.cn/27810.html