Python使用PaddleOCR批量识别PDF的方法

  import os

  import cv2

  from paddleocr import PPStructure,save_structure_res

  from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx

  from copy import deepcopy

  # Shared PP-Structure engine for Chinese documents. recovery=True is passed
  # because the results are later rebuilt into a .docx file (layout recovery).
  table_engine = PPStructure(recovery=True, lang='ch')

  # Folder containing the test images to recognise. This must be a local
  # path: os.listdir() cannot read a URL, and the published snippet had the
  # relative path rewritten into a site URL.
  image_path = './imgs/demo-scan'

  # Folder that receives the per-image results and the combined text file.
  save_folder = './txt'

  def img2docx(img_path):
      """Run PP-Structure on every image in a folder and export the results.

      For each readable image directly inside ``img_path`` the structure
      result is saved under the module-level ``save_folder``, a .docx file is
      generated from the sorted layout boxes, and the recognised text lines
      are collected. The text collected from all images is finally written to
      ``res.txt`` inside ``save_folder``.

      Args:
          img_path: Directory containing the images to process.

      Returns:
          None. Results are written to disk and progress is printed.
      """
      text = []

      # os.listdir() returns entries in arbitrary order; sort them so pages
      # are processed, and their text concatenated, in a stable order.
      for img_name in sorted(os.listdir(img_path)):
          img_file = os.path.join(img_path, img_name)
          print(img_file)

          img = cv2.imread(img_file)
          if img is None:
              # cv2.imread() yields None for anything it cannot decode
              # (sub-folders, non-image files); skip instead of crashing.
              print('skipped, not a readable image: ' + img_file)
              continue

          result = table_engine(img)

          # Name the outputs after the image itself. Using the folder name
          # here would make every image overwrite the previous one's output.
          out_name = os.path.splitext(img_name)[0]
          save_structure_res(result, save_folder, out_name)

          # Only the image width is needed to sort the layout boxes.
          w = img.shape[1]
          res = sorted_layout_boxes(result, w)
          convert_info_docx(img, res, save_folder, out_name)

          for line in res:
              # Drop the cropped image array so the printed region is readable.
              line.pop('img', None)
              print(line)

              # NOTE(review): text regions are expected to hold a list of
              # dicts with a 'text' key; table regions appear to hold a dict
              # (HTML) instead - confirm against the PP-Structure docs.
              region_res = line.get('res')
              if isinstance(region_res, (list, tuple)):
                  for pra in region_res:
                      if isinstance(pra, dict) and 'text' in pra:
                          text.append(pra['text'])
              # Separate consecutive regions with an empty line in the output.
              text.append('\n')

      # Make sure the target folder exists even when no image was processed.
      os.makedirs(save_folder, exist_ok=True)
      with open(os.path.join(save_folder, 'res.txt'), 'w', encoding='utf-8') as f:
          f.write('\n'.join(text))

  # Script entry point: process every image found in the configured image_path.
  img2docx(image_path)