fix(decision+news): correct Unicode regex escaping & precompile telegram patterns

## Critical Fix: Unicode Regex Escaping

### decision/engine.go
- WRONG: ``regexp.MustCompile(`[\u200B...]`)`` (raw string literal; `\u` escapes are not interpreted by the Go compiler)
- FIXED: `regexp.MustCompile("[\u200B...]")` (interpreted string literal; `\u` escapes become the actual Unicode characters)

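**Impact**: Go raw string literals (backticks) do not interpret `\uXXXX` escape sequences; only interpreted (double-quoted) string literals do.
- Raw string: the regexp engine receives the literal text `\u200B`; Go's regexp syntax has no `\u` escape (it uses `\x{200B}`), so the pattern is rejected as an invalid escape sequence instead of matching the zero-width characters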
- Double quotes: matches Unicode characters U+200B, U+200C, U+200D, U+FEFF (correct)

### news/provider/telegram/telegram.go
- Move regex patterns to global precompiled variables
- Eliminates repeated compilation in stripHTML()

## Performance
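- Regex compilation: five `regexp.MustCompile` calls on every `stripHTML()` invocation → each pattern compiled once at package initialization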
- stripHTML() now uses precompiled patterns

## Testing
 Compilation successful
 Unicode characters properly matched
This commit is contained in:
ZhouYongyou
2025-11-05 01:02:49 +08:00
parent dcfc997b59
commit db7c0359f4
2 changed files with 17 additions and 18 deletions

View File

@@ -22,7 +22,7 @@ var (
reJSONArray = regexp.MustCompile(`(?is)\[\s*\{.*?\}\s*\]`)
reArrayHead = regexp.MustCompile(`^\[\s*\{`)
reArrayOpenSpace = regexp.MustCompile(`^\[\s+\{`)
reInvisibleRunes = regexp.MustCompile(`[\u200B\u200C\u200D\uFEFF]`)
reInvisibleRunes = regexp.MustCompile("[\u200B\u200C\u200D\uFEFF]")
)
// PositionInfo 持仓信息

View File

@@ -14,6 +14,13 @@ import (
"github.com/samber/lo"
)
// Precompiled patterns used by stripHTML. Compiling them once at package
// scope avoids recompiling on every call.
var (
// reHTMLTag matches a "<", then one or more characters of any kind
// (including newlines, matched lazily), then the next ">".
reHTMLTag = regexp.MustCompile(`\<[\S\s]+?\>`)
// reStyleBlock matches from "<style" through the first following
// "</style>". The pattern has no case-insensitive flag, so it only
// matches lowercase tag names.
reStyleBlock = regexp.MustCompile(`\<style[\S\s]+?\</style\>`)
// reScriptBlock matches from "<script" through the first following
// "</script>". Like reStyleBlock, it is case-sensitive.
reScriptBlock = regexp.MustCompile(`\<script[\S\s]+?\</script\>`)
// reMultiSpace matches a run of two or more whitespace characters.
reMultiSpace = regexp.MustCompile(`\s{2,}`)
)
// Message 表示 Telegram 消息结构
type Message struct {
MessageID string `json:"messageId"`
@@ -268,27 +275,19 @@ func splitLast(s, sep string) []string {
// stripHTML removes all HTML tags from the string, keeping only the plain text.
//
// NOTE(review): this hunk appears to show the removed statements (those using
// the locally compiled `re`) and the added statements (those using the
// package-level reHTMLTag/reStyleBlock/reScriptBlock/reMultiSpace patterns)
// interleaved without +/- markers — confirm against the actual commit.
func stripHTML(s string) string {
// Convert all HTML tags to lowercase (so that tag matching is case-insensitive)
re := regexp.MustCompile(`\<[\S\s]+?\>`)
s = re.ReplaceAllStringFunc(s, strings.ToLower)
// First normalize HTML tags to lowercase to simplify the matching below.
// strings.ToLower is applied to the whole matched tag text, so attribute
// names and values inside the tag are lowercased as well.
s = reHTMLTag.ReplaceAllStringFunc(s, strings.ToLower)
// Remove <style> tags and their contents
re = regexp.MustCompile(`\<style[\S\s]+?\</style\>`)
s = re.ReplaceAllString(s, "")
// Remove style and script blocks
s = reStyleBlock.ReplaceAllString(s, "")
s = reScriptBlock.ReplaceAllString(s, "")
// Remove <script> tags and their contents
re = regexp.MustCompile(`\<script[\S\s]+?\</script\>`)
s = re.ReplaceAllString(s, "")
// Replace the remaining tags with newlines to preserve the text structure
s = reHTMLTag.ReplaceAllString(s, "\n")
// Remove all HTML code inside angle brackets, replacing it with a newline
re = regexp.MustCompile(`\<[\S\s]+?\>`)
s = re.ReplaceAllString(s, "\n")
// Collapse runs of whitespace into a single newline
s = reMultiSpace.ReplaceAllString(s, "\n")
// Remove consecutive newlines and whitespace characters
re = regexp.MustCompile(`\s{2,}`)
s = re.ReplaceAllString(s, "\n")
// Trim leading and trailing whitespace
return strings.TrimSpace(s)
}