# https://pan.baidu.com/s/1kNngtcRUXH9J1CEeE2MaVw?pwd=oj5g
# 提取码:oj5g
##############################################
########### OCR图文识别 ###############
########### tesseract包 ###############
# REFER
# https://tesseract-ocr.github.io/docs/
# https://tesseract-ocr.github.io/tessdoc/Data-Files
# Date:20220318
##############################################
# 安装和加载包。
# 如遇Rcpp报错,建议remove包后重新install
# 特别情况下可直接至library安装路径下手动删除
# if(!require(Rcpp)) install.packages("Rcpp")
# if(!require(tesseract)) install.packages("tesseract")
# Attach the packages used by this script.
# The workspace is deliberately not wiped with rm(list = ls()): that call
# deletes the caller's objects but does not reset the session (attached
# packages and options are left untouched), so it only causes data loss.
library(Rcpp)
library(tesseract) # if no error is raised, attaching may take a while (30 s or more)
# Inspect the installation first: data path, available language models,
# engine version and config names.
tesseract_info()
# > tesseract_info()
# $datapath
# [1] "C:\Users\USER\AppData\Local\tesseract5\tesseract5\tessdata/"
#
# $available
# [1] "chi_sim" "chi_sim_vert" "chi_tra" "chi_tra_vert" "eng" "osd"
#
# $version
# [1] "5.0.1"
#
# $configs
# [1] "alto" "ambigs.train" "api_config" "bigram"
# [5] "box.train" "box.train.stderr" "digits" "get.images"
# [9] "hocr" "inter" "kannada" "linebox"
# [13] "logfile" "lstm.train" "lstmbox" "lstmdebug"
# [17] "makebox" "pdf" "quiet" "rebox"
# [21] "strokewidth" "tsv" "txt" "unlv"
# [25] "wordstrbox"
# 首次要下载下语言库
# 语言包https://blog.csdn.net/qq_41897154/article/details/109499741
# 官方语言包地址(选择更多)https://tesseract-ocr.github.io/tessdoc/Data-Files
# tesseract_download("chi_tra") # 下载繁体中文
# tesseract_download("chi_sim") # 下载简体中文
# 如果是直接在github下载的语言包
# 【需要操作】直接将相应的语言包复制到tesseract_info()路径下即可。
# Inspect and set the working directory (the folder that holds the images).
# getwd()
# Forward slashes are required here: inside an R string literal "\U" starts a
# Unicode escape, so the backslash form of this Windows path fails to parse.
setwd("C:/Users/USER/Desktop")
# JPG and PNG images are supported.
# Recognise the image with the Simplified Chinese language model.
text <- ocr("ec.png", engine = tesseract("chi_sim"))
cat(text)
# PDF input is supported as well: the PDF is first rendered to PNG at 600 dpi,
# and the rendered file(s) are then handed to the OCR engine.
pngfile <- pdftools::pdf_convert(pdf = "ocrscan.pdf", dpi = 600)
## Converting page 1 to ocrscan_1.png... done!
text <- tesseract::ocr(image = pngfile)
cat(text)
# 更多关于OCR图文识别的文档预处理