Python使用paddleOCR批量识别pdf的方法-张二牛笔记本

Python使用paddleOCR批量识别pdf的方法

　　import os

　　import cv2

　　from paddleocr import PPStructure,save_structure_res

　　from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx

　　from copy import deepcopy

# Chinese test images.
# Layout-analysis engine shared by every call to img2docx(); lang='ch' selects
# the Chinese models. NOTE(review): recovery=True is presumably what keeps the
# data convert_info_docx needs to rebuild a .docx - confirm against PaddleOCR.
table_engine = PPStructure(recovery=True, lang='ch')

# Folder that holds the scanned page images to recognize. This must be a local
# directory (it is passed to os.listdir), not a URL.
image_path = './imgs/demo-scan'

# Folder that receives the per-image structure results and .docx files; it is
# the same 'txt' directory that img2docx() writes its res.txt into.
save_folder = './txt'

def img2docx(img_path):
    """Run layout OCR on every image in a folder and export the results.

    For each file in ``img_path`` the module-level ``table_engine`` is run on
    the decoded image, the structured result is stored with
    ``save_structure_res``, a Word document is produced with
    ``convert_info_docx`` (both under the module-level ``save_folder``), and
    the recognized text is collected. The text of all images is finally
    written to ``txt/res.txt`` (UTF-8, overwritten on every call).

    Args:
        img_path: Directory containing the page images to recognize.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    result_path = 'txt/res.txt'
    text = []

    # os.listdir() returns names in arbitrary order; sorting keeps the page
    # order in res.txt stable between runs.
    for img_name in sorted(os.listdir(img_path)):
        full_path = os.path.join(img_path, img_name)
        print(full_path)

        img = cv2.imread(full_path)
        if img is None:
            # cv2.imread signals an unreadable / non-image file by returning
            # None instead of raising; skip it rather than crash on img.shape.
            print('skipped (not a readable image): ' + full_path)
            continue

        result = table_engine(img)

        # Name the outputs after the image itself. Using the folder name here
        # would give every page the same name, so each page would overwrite
        # the previous page's results.
        page_name = os.path.splitext(img_name)[0]
        save_structure_res(result, save_folder, page_name)

        width = img.shape[1]
        res = sorted_layout_boxes(result, width)
        convert_info_docx(img, res, save_folder, page_name)

        for region in res:
            # Drop the cropped image so the printed region stays readable.
            region.pop('img')
            print(region)

            paragraphs = region['res']
            # NOTE(review): text regions appear to carry a list of dicts with
            # a 'text' key, while table regions carry a single dict (HTML);
            # only the list form has per-paragraph text - confirm against the
            # installed PaddleOCR version.
            if isinstance(paragraphs, list):
                text.extend(paragraph['text'] for paragraph in paragraphs)
            text.append('\n')

    # Make sure the output directory exists even when no image was processed.
    os.makedirs(os.path.dirname(result_path), exist_ok=True)
    with open(result_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(text))

if __name__ == '__main__':
    # Start the batch run only when executed as a script, so that importing
    # this module does not immediately process the whole image folder.
    img2docx(image_path)

您可能感兴趣的文章:

Python使用paddleOCR批量识别pdf的方法