good-news/services/ocr_service.go

249 lines
6.0 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

package services
import (
"github.com/otiai10/gosseract/v2"
"regexp"
"strconv"
"strings"
)
// OCRService provides OCR text recognition backed by a gosseract client.
type OCRService struct {
// client is the gosseract handle that the service's methods call into.
client *gosseract.Client
}
// NewOCRService builds an OCRService around a freshly created gosseract
// client. The error result is always nil in this implementation.
func NewOCRService() (*OCRService, error) {
	svc := &OCRService{
		client: gosseract.NewClient(),
	}
	return svc, nil
}
// Close closes the underlying gosseract client and drops the service's
// reference to it.
//
// Calling Close on a nil service, on a service without a client, or a second
// time is a no-op instead of a nil dereference or a repeated close of the
// same handle.
func (s *OCRService) Close() {
	if s == nil || s.client == nil {
		return
	}
	// Close has no error result of its own, so the client's close error cannot
	// be handed to the caller; discard it explicitly rather than implicitly.
	_ = s.client.Close()
	s.client = nil
}
// ExtractInfo runs OCR on the image at imagePath and extracts the award-notice
// fields from the recognized text.
//
// Results, in order: the project name, the candidate point values, and the
// representative office. If configuring the client or recognizing the image
// fails, that error is returned unchanged together with zero values for the
// other results.
func (s *OCRService) ExtractInfo(imagePath string) (string, []int, string, error) {
	// Recognize Simplified Chinese.
	if err := s.client.SetLanguage("chi_sim"); err != nil {
		return "", nil, "", err
	}
	// Automatic page segmentation.
	if err := s.client.SetPageSegMode(gosseract.PSM_AUTO); err != nil {
		return "", nil, "", err
	}

	// Engine variables as name/value pairs, applied in this order on every call.
	settings := [][2]string{
		// NOTE(review): the original comment called this "LSTM only"; Tesseract
		// documents engine mode 1 as LSTM-only and 2 as legacy+LSTM. Confirm
		// which mode is intended.
		{"tessedit_ocr_engine_mode", "2"},
		// Enable dictionary correction.
		{"tessedit_enable_dict_correction", "1"},
		// Fully automatic page segmentation without OSD; presumably the same
		// mode as the PSM_AUTO call above.
		{"tessedit_pageseg_mode", "3"},
		// Do not invert the image.
		{"tessedit_do_invert", "0"},
		// Original comment: "handle bold text". NOTE(review): the variable
		// name suggests noise removal; confirm the intent.
		{"textord_heavy_nr", "1"},
		// Lower the penalty for words that are not in the dictionary.
		{"language_model_penalty_non_dict_word", "0.2"},
		// Lower the penalty for dictionary words that are not frequent.
		{"language_model_penalty_non_freq_dict_word", "0.2"},
		// Emit debug images. NOTE(review): looks like a debugging leftover;
		// confirm it should stay enabled.
		{"tessedit_write_images", "1"},
	}
	for _, kv := range settings {
		name := gosseract.SettableVariable(kv[0])
		if err := s.client.SetVariable(name, kv[1]); err != nil {
			return "", nil, "", err
		}
	}

	// Point the client at the image, then recognize it.
	if err := s.client.SetImage(imagePath); err != nil {
		return "", nil, "", err
	}
	recognized, err := s.client.Text()
	if err != nil {
		return "", nil, "", err
	}

	return extractProjectName(recognized),
		extractPoints(recognized),
		extractRepresentative(recognized),
		nil
}
// projectNamePatterns lists the expressions tried, in order, against each
// line of OCR text. Capture group 1 is the candidate project name. They are
// compiled once at package scope rather than on every call.
var projectNamePatterns = []*regexp.Regexp{
	// "<name>项目": Han characters directly in front of the word "项目".
	regexp.MustCompile(`([\p{Han}]+)项目`),
	// "项目: <name>": Han characters after "项目", any number of ASCII or
	// full-width (U+FF1A) colons, and optional whitespace.
	regexp.MustCompile(`项目[:\x{FF1A}]*\s*([\p{Han}]+)`),
	// "<name>工程" / "<name>系统": the suffix is kept as part of the name.
	regexp.MustCompile(`([\p{Han}]+(?:工程|系统))`),
}

// minProjectNameRunes is the minimum number of characters a candidate must
// have to be accepted as a project name.
const minProjectNameRunes = 2

// extractProjectName returns the first project name found in text, or "" if
// none is found.
//
// Lines are scanned top to bottom and, within a line, the patterns in
// projectNamePatterns are tried in order. A candidate shorter than
// minProjectNameRunes characters is rejected and the search continues with
// the next pattern or line.
func extractProjectName(text string) string {
	for _, line := range strings.Split(text, "\n") {
		for _, re := range projectNamePatterns {
			m := re.FindStringSubmatch(line)
			if len(m) < 2 {
				continue
			}
			// Count runes, not bytes: a single Han character is three bytes
			// in UTF-8, so a byte-length check would accept one-character
			// names.
			if name := m[1]; len([]rune(name)) >= minProjectNameRunes {
				return name
			}
		}
	}
	return ""
}
// pointsNumberPattern matches each run of ASCII digits in a line. It is
// compiled once at package scope instead of once per scanned line.
var pointsNumberPattern = regexp.MustCompile(`\d+`)

// extractPoints collects candidate point values from OCR text.
//
// For every line containing the character "点", that line and up to three
// lines before it are scanned for digit runs. Each number in the range
// 1..1000 is kept. The result has no duplicates, is ordered by first
// occurrence during the scan, and is nil when nothing qualifies.
func extractPoints(text string) []int {
	lines := strings.Split(text, "\n")

	var points []int
	seen := make(map[int]bool)
	for i, line := range lines {
		if !strings.Contains(line, "点") {
			continue
		}

		// The window is the marker line plus at most three lines above it.
		start := i - 3
		if start < 0 {
			start = 0
		}
		for _, candidate := range lines[start : i+1] {
			for _, digits := range pointsNumberPattern.FindAllString(candidate, -1) {
				n, err := strconv.Atoi(digits)
				// Atoi fails only when the digit run overflows int; such a
				// value is out of range anyway.
				if err != nil || n <= 0 || n > 1000 {
					continue
				}
				// Windows of nearby marker lines overlap, so the same number
				// can be seen more than once.
				if !seen[n] {
					seen[n] = true
					points = append(points, n)
				}
			}
		}
	}
	return points
}
// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
func extractRepresentative(text string) string {
// 将文本按行分割
lines := strings.Split(text, "\n")
// 定义代表处相关的关键词和对应的正则表达式模式
patterns := map[string]string{
"代表处": `([\p{Han}]{2,}代表处)`,
"事业部": `([\p{Han}]{2,}事业部)`,
"项目组": `([\p{Han}]{2,}项目组)`,
}
// 遍历每一行文本
for _, line := range lines {
// 移除多余空格
line = strings.TrimSpace(line)
// 跳过空行
if line == "" {
continue
}
// 遍历所有模式进行匹配
for keyword, pattern := range patterns {
if strings.Contains(line, keyword) {
re := regexp.MustCompile(pattern)
matches := re.FindStringSubmatch(line)
if len(matches) > 1 {
// 返回匹配到的完整名称
return matches[1]
}
}
}
}
// 如果没有找到完整匹配,尝试提取可能的组织名称
orgPatterns := []string{
`([\p{Han}]{2,}(?:组|部|处|司|中心))`,
`([\p{Han}]{2,}(?:公司|单位))`,
}
for _, line := range lines {
line = strings.TrimSpace(line)
if line == "" {
continue
}
for _, pattern := range orgPatterns {
re := regexp.MustCompile(pattern)
matches := re.FindStringSubmatch(line)
if len(matches) > 1 {
return matches[1]
}
}
}
return ""
}