249 lines
6.0 KiB
Go
249 lines
6.0 KiB
Go
package services
|
||
|
||
import (
|
||
"github.com/otiai10/gosseract/v2"
|
||
"regexp"
|
||
"strconv"
|
||
"strings"
|
||
)
|
||
|
||
// OCRService 提供OCR文字识别服务
|
||
type OCRService struct {
|
||
client *gosseract.Client
|
||
}
|
||
|
||
// NewOCRService 创建新的OCR服务实例
|
||
func NewOCRService() (*OCRService, error) {
|
||
client := gosseract.NewClient()
|
||
return &OCRService{client: client}, nil
|
||
}
|
||
|
||
// Close 关闭OCR服务
|
||
func (s *OCRService) Close() {
|
||
s.client.Close()
|
||
}
|
||
|
||
// ExtractInfo 从图片中提取喜报信息
|
||
func (s *OCRService) ExtractInfo(imagePath string) (string, []int, string, error) {
|
||
// 设置中文语言包和OCR配置
|
||
err := s.client.SetLanguage("chi_sim")
|
||
if err != nil {
|
||
return "", nil, "", err
|
||
}
|
||
|
||
// 设置Page Segmentation Mode为自动
|
||
err = s.client.SetPageSegMode(gosseract.PSM_AUTO)
|
||
if err != nil {
|
||
return "", nil, "", err
|
||
}
|
||
|
||
// 设置OCR引擎参数
|
||
configs := []struct {
|
||
key string
|
||
value string
|
||
}{
|
||
{"tessedit_ocr_engine_mode", "2"}, // LSTM only
|
||
{"tessedit_enable_dict_correction", "1"}, // 启用字典校正
|
||
{"tessedit_pageseg_mode", "3"}, // 完全自动页面分割,但没有OSD
|
||
{"tessedit_do_invert", "0"}, // 不反转图像
|
||
{"textord_heavy_nr", "1"}, // 处理粗体文本
|
||
{"language_model_penalty_non_dict_word", "0.2"}, // 降低非字典词的惩罚
|
||
{"language_model_penalty_non_freq_dict_word", "0.2"}, // 降低非常用词的惩罚
|
||
{"tessedit_write_images", "1"}, // 输出调试图像
|
||
}
|
||
|
||
for _, cfg := range configs {
|
||
err = s.client.SetVariable(gosseract.SettableVariable(cfg.key), cfg.value)
|
||
if err != nil {
|
||
return "", nil, "", err
|
||
}
|
||
}
|
||
|
||
// 设置图片
|
||
err = s.client.SetImage(imagePath)
|
||
if err != nil {
|
||
return "", nil, "", err
|
||
}
|
||
|
||
// 获取文本
|
||
text, err := s.client.Text()
|
||
if err != nil {
|
||
return "", nil, "", err
|
||
}
|
||
|
||
// 提取项目名称
|
||
projectName := extractProjectName(text)
|
||
|
||
// 提取点数
|
||
points := extractPoints(text)
|
||
|
||
// 提取代表处
|
||
representative := extractRepresentative(text)
|
||
|
||
return projectName, points, representative, nil
|
||
}
|
||
|
||
// projectNamePatterns are tried in order against each line of OCR text. The
// first capture group of each pattern holds the candidate project name.
var projectNamePatterns = []*regexp.Regexp{
	// "<name>项目"
	regexp.MustCompile(`([\p{Han}]+)项目`),
	// "项目: <name>" — accepts both the full-width and the ASCII colon.
	regexp.MustCompile(`项目[::]*\s*([\p{Han}]+)`),
	// "<name>工程" or "<name>系统" (the suffix is part of the name).
	regexp.MustCompile(`([\p{Han}]+(?:工程|系统))`),
}

// extractProjectName returns the first project name found in text, or "" when
// no line yields one.
//
// The text is scanned line by line; for each non-blank line the patterns in
// projectNamePatterns are tried in order, and the first captured name that is
// at least two characters long is returned.
func extractProjectName(text string) string {
	for _, line := range strings.Split(text, "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}

		for _, pattern := range projectNamePatterns {
			matches := pattern.FindStringSubmatch(line)
			if len(matches) < 2 {
				continue
			}
			name := strings.TrimSpace(matches[1])
			// Count runes, not bytes: one Han character is three bytes in
			// UTF-8, so a byte-length check would accept a single character.
			if len([]rune(name)) >= 2 {
				return name
			}
		}
	}
	return ""
}
|
||
|
||
// pointsNumberPattern matches one run of decimal digits.
var pointsNumberPattern = regexp.MustCompile(`\d+`)

// extractPoints collects candidate point values from OCR text.
//
// For every non-blank line that contains the character "点", the line itself
// and up to three lines before it are searched for digit runs. Each number in
// the range 1..1000 is a candidate. The result holds the distinct candidates
// in order of first appearance; it is nil when nothing is found.
func extractPoints(text string) []int {
	lines := strings.Split(text, "\n")

	var points []int
	seen := make(map[int]struct{})

	for i, line := range lines {
		if !strings.Contains(strings.TrimSpace(line), "点") {
			continue
		}

		// Look back at most three lines, clamped to the start of the text.
		start := i - 3
		if start < 0 {
			start = 0
		}

		for j := start; j <= i; j++ {
			candidate := strings.TrimSpace(lines[j])
			if candidate == "" {
				continue
			}

			for _, digits := range pointsNumberPattern.FindAllString(candidate, -1) {
				num, err := strconv.Atoi(digits)
				if err != nil {
					// Digit runs too large for an int cannot be valid point
					// values, so they are skipped.
					continue
				}
				if num <= 0 || num > 1000 {
					continue
				}
				// Windows of neighbouring "点" lines overlap, so the same
				// number can be seen repeatedly; keep the first occurrence.
				if _, dup := seen[num]; dup {
					continue
				}
				seen[num] = struct{}{}
				points = append(points, num)
			}
		}
	}

	return points
}
|
||
|
||
// maxInt returns the larger of the two integers a and b.
func maxInt(a, b int) int {
	if a < b {
		return b
	}
	return a
}
|
||
|
||
// minInt returns the smaller of the two integers a and b.
func minInt(a, b int) int {
	if b < a {
		return b
	}
	return a
}
|
||
|
||
// representativePatterns match a fully named unit ("...代表处", "...事业部",
// "...项目组"). They are kept in a slice, not a map, so that they are always
// tried in this order and the result is deterministic when a line contains
// more than one keyword.
var representativePatterns = []*regexp.Regexp{
	regexp.MustCompile(`([\p{Han}]{2,}代表处)`),
	regexp.MustCompile(`([\p{Han}]{2,}事业部)`),
	regexp.MustCompile(`([\p{Han}]{2,}项目组)`),
}

// organizationPatterns are the looser fallback patterns for organization-like
// names, used only when no representativePatterns entry matches any line.
var organizationPatterns = []*regexp.Regexp{
	regexp.MustCompile(`([\p{Han}]{2,}(?:组|部|处|司|中心))`),
	regexp.MustCompile(`([\p{Han}]{2,}(?:公司|单位))`),
}

// firstCapturedInLines scans lines in order and, for each non-blank line,
// tries patterns in order. It returns the first capture group of the first
// match, or "" when nothing matches.
func firstCapturedInLines(lines []string, patterns []*regexp.Regexp) string {
	for _, line := range lines {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		for _, re := range patterns {
			if matches := re.FindStringSubmatch(line); len(matches) > 1 {
				return matches[1]
			}
		}
	}
	return ""
}

// extractRepresentative returns the representative office (or, failing that,
// an organization-like name) found in text, or "" when none is found.
//
// A first pass over all lines looks for a name ending in 代表处, 事业部 or
// 项目组, in that priority order within a line. Only if that pass finds nothing
// does a second pass try the more general organization suffixes.
func extractRepresentative(text string) string {
	lines := strings.Split(text, "\n")

	if name := firstCapturedInLines(lines, representativePatterns); name != "" {
		return name
	}
	return firstCapturedInLines(lines, organizationPatterns)
}