文档中的图片或图片本身的OCR识别可以通过第三方工具如PaddleOCR和CNOCR来实现,如下是两个识别过程的实践,以及使用Streamlit构建可视化页面的示例。
PaddleOCR
安装PaddleOCR环境和依赖
# Create and activate an isolated conda environment
conda create -n paddleocr python=3.9 -y
conda activate paddleocr
# CPU build of PaddlePaddle; on a CUDA machine install the GPU build instead:
# pip install paddlepaddle-gpu
pip install paddlepaddle
pip3 install "paddleocr>=2.6.0.3"
pip install opencv-python
pip install PyMuPDF
# PyPDF2 is imported by the PDF parsing example (PyPDF2.PdfReader)
pip install PyPDF2
PaddleOCR文档参考:https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.7/ppstructure/docs/quickstart.md
使用PaddleOCR解析PDF中的图片,或直接解析图片中的Table
def pdf(file):
    """Extract page text and recognized table HTML from a PDF.

    For every page the stripped text is appended first, followed by the HTML
    that PPStructure returns for each image embedded in that page.

    Args:
        file: Anything ``PyPDF2.PdfReader`` accepts (path or binary stream).

    Returns:
        str: The collected text and HTML fragments, one per line.
    """
    table_engine = PPStructure(layout=False, show_log=True)
    pdf_reader = PyPDF2.PdfReader(file)
    file_content = ''
    for page in pdf_reader.pages:
        file_content += page.extract_text().strip() + '\n'
        for image in page.images:
            # cv2.imread needs a path, so the image bytes go through a temp
            # file; it is closed before reading so the data is flushed.
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_image:
                temp_image.write(image.data)
                temp_image_path = temp_image.name
            try:
                img = cv2.imread(temp_image_path)
            finally:
                # delete=False leaves the file behind, so remove it explicitly.
                os.remove(temp_image_path)
            if img is None:
                # OpenCV could not decode this embedded image; skip it.
                continue
            result = table_engine(img)
            for line in result:
                file_content += f"{line['res']['html']}\n"
    return file_content
def image(file):
    """Run table recognition on an image and return the recognized HTML.

    Relies on a module-level ``table_engine`` (see the complete listing in
    the Streamlit section).

    Args:
        file: Path or file-like object that ``PIL.Image.open`` can read.

    Returns:
        str: One HTML fragment per recognized region, newline-terminated.
    """
    image_file = Image.open(file)
    # PIL may yield RGBA / palette / grayscale data; normalise to the
    # 3-channel BGR layout that cv2.imread would have produced.
    img = cv2.cvtColor(np.array(image_file.convert("RGB")), cv2.COLOR_RGB2BGR)
    result = table_engine(img)
    file_content = ''
    for line in result:
        file_content += f"{line['res']['html']}\n"
    print('file_content:', file_content)
    return file_content
CNOCR
使用CNOCR进行解析
依赖如下:
# CnOCR and the ONNX runtime it is installed with here
pip3 install cnocr
pip3 install onnxruntime
代码实现如下,效果一般
def image2(img_path='books.jpg'):
    """Recognize text in an image with CnOCR and print each line with its score.

    Args:
        img_path: Path of the image to read; defaults to ``'books.jpg'``.

    Returns:
        The list returned by ``CnOcr.ocr``; each entry is indexed below by
        ``"text"`` and ``"score"``.
    """
    # Imported lazily so cnocr is only required when this function is used.
    from cnocr import CnOcr
    from cnocr.utils import read_img

    ocr = CnOcr()
    img = read_img(img_path)
    res = ocr.ocr(img)
    for r in res:
        print(f'{r["text"]}, {r["score"]}')
    return res
Streamlit构建前端
使用Streamlit构建前端的完整代码如下:
import copy
import os
import tempfile

import cv2
import numpy as np
import openai
import PyPDF2
import streamlit as st
from paddleocr import PPStructure, save_structure_res
from PIL import Image
# Single PPStructure engine, built when the script runs and used by both
# pdf() and image() below.
# NOTE(review): Streamlit re-executes the script on every interaction, so this
# is presumably rebuilt each time; consider st.cache_resource - confirm.
table_engine = PPStructure(layout=False, show_log=True)
def pdf(file):
    """Extract page text and recognized table HTML from a PDF.

    For every page the stripped text is appended first, followed by the HTML
    that the module-level ``table_engine`` returns for each image embedded in
    that page.

    Args:
        file: Anything ``PyPDF2.PdfReader`` accepts (path or binary stream).

    Returns:
        str: The collected text and HTML fragments, one per line.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    file_content = ''
    for page in pdf_reader.pages:
        file_content += page.extract_text().strip() + '\n'
        for image in page.images:
            # cv2.imread needs a path, so the image bytes go through a temp
            # file; it is closed before reading so the data is flushed.
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_image:
                temp_image.write(image.data)
                temp_image_path = temp_image.name
            try:
                img = cv2.imread(temp_image_path)
            finally:
                # delete=False leaves the file behind, so remove it explicitly.
                os.remove(temp_image_path)
            if img is None:
                # OpenCV could not decode this embedded image; skip it.
                continue
            result = table_engine(img)
            for line in result:
                file_content += f"{line['res']['html']}\n"
    return file_content
def image(file):
    """Run table recognition on an image and return the recognized HTML.

    Args:
        file: Path or file-like object that ``PIL.Image.open`` can read
            (the Streamlit upload is passed in directly by the caller).

    Returns:
        str: One HTML fragment per recognized region, newline-terminated.
    """
    image_file = Image.open(file)
    # PIL may yield RGBA / palette / grayscale data; normalise to the
    # 3-channel BGR layout that cv2.imread would have produced.
    img = cv2.cvtColor(np.array(image_file.convert("RGB")), cv2.COLOR_RGB2BGR)
    result = table_engine(img)
    file_content = ''
    for line in result:
        file_content += f"{line['res']['html']}\n"
    print('file_content:', file_content)
    return file_content
# Set the OpenAI API credentials. The OPENAI_API_KEY environment variable is
# preferred so a real key never has to be written into the source; the
# placeholder is kept as the fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", "sk-xxx")
# Initialise the chat history: `messages` is what gets rendered,
# `hidden_messages` holds prompts sent to the model but not shown.
if "messages" not in st.session_state:
    st.session_state.messages = []
    st.session_state.hidden_messages = []
# Remember the id of the last processed upload.
if "last_file_id" not in st.session_state:
    st.session_state.last_file_id = ""
# Upload a file. Only formats the code below can parse are offered: images go
# through image(), everything else through pdf().
uploaded_file = st.file_uploader("上传文件:", type=["png", "jpg", "jpeg", "pdf"])
if uploaded_file is not None:
    # A different file_id means a new upload: analyse it once and start a
    # fresh conversation.
    if uploaded_file.file_id != st.session_state.last_file_id:
        st.session_state.last_file_id = uploaded_file.file_id
        st.session_state.messages = []
        # Drop the previous document's hidden prompt too, otherwise it would
        # still be inserted into every later request.
        st.session_state.hidden_messages = []
        print('uploaded_file:', uploaded_file)
        with st.chat_message("user"):
            st.markdown("分析中,请等待...")
        # Part after the "/" of the upload's type, compared against image kinds.
        file_suffix = uploaded_file.type.split("/")[1]
        messages = copy.deepcopy(st.session_state.messages)
        # Index of the document prompt within the list sent to the API. It is
        # taken from `messages`, not the display list, because the display
        # list also holds the image entry that is never sent to the model.
        prompt_pos = len(messages)
        if file_suffix in ("png", "jpg", "jpeg"):
            st.session_state.messages.append({"role": "user", "image": uploaded_file})
            file_content = image(uploaded_file)
        else:
            file_content = pdf(uploaded_file)
        prompt = "你是一个文档助手,我将传递给你一个文档内容,在下次交互时用户才会开始提问,你只需要回答是否明白。下面的所有内容均为文档内容:\n" + file_content
        messages.append({"role": "user", "content": prompt})
        st.session_state.hidden_messages.append({"pos": prompt_pos, "role": "user", "content": prompt})
        print("messages:", messages)
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-16k",
            messages=messages,
        )
        # The model's acknowledgement is only logged; the user sees a fixed
        # "ready" message instead.
        reply = response.choices[0].message.content.strip()
        print('file reply:', reply)
        st.session_state.messages.append({"role": "assistant", "content": "可以开始提问了。"})
        # Render the conversation so far (uploaded image, then the ready message).
        for message in st.session_state.messages:
            with st.chat_message(message["role"]):
                if "content" in message:
                    st.markdown(message["content"])
                if "image" in message:
                    st.image(message["image"], caption='上传的图片')
# User input
if user_input := st.chat_input("请输入内容..."):
    st.session_state.messages.append({"role": "user", "content": user_input})
    # Re-render the visible history and, in the same pass, collect the
    # text messages that will be sent to the model.
    messages = []
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            if "content" in message:
                st.markdown(message["content"])
                messages.append(message)
            if "image" in message:
                # Image entries are display-only and are not sent to the model.
                st.image(message["image"], caption='上传的图片')
    # Splice the hidden document prompt(s) back in at their recorded positions.
    for hidden_message in st.session_state.hidden_messages:
        messages.insert(hidden_message["pos"], {"role": hidden_message["role"], "content": hidden_message["content"]})
    print("chat messages:", messages)
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k",
        messages=messages,
    )
    reply = response.choices[0].message.content.strip()
    st.session_state.messages.append({"role": "assistant", "content": reply})
    with st.chat_message("assistant"):
        st.markdown(reply)
参考:https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.7/ppstructure/docs/quickstart.md