diff --git a/sys/wordfilter/core.go b/sys/wordfilter/core.go new file mode 100644 index 000000000..bb66d5d11 --- /dev/null +++ b/sys/wordfilter/core.go @@ -0,0 +1,119 @@ +package wordfilter + +import "io" + +/* +系统:单词守卫 +描述:DFA 算法实践敏感词过滤 敏感词 过滤 验证 替换 +*/ + +type ( + ISys interface { + //加载文件 + LoadWordDict(path string) error + //加载网络数据 + LoadNetWordDict(url string) error + ///加载 + Load(rd io.Reader) error + ///添加敏感词 + AddWord(words ...string) + ///删除敏感词 + DelWord(words ...string) + ///过滤敏感词 + Filter(text string) string + ///和谐敏感词 + Replace(text string, repl rune) string + ///检测敏感词 + FindIn(text string) (bool, string) + ///找到所有匹配词 + FindAll(text string) []string + ///检测字符串是否合法 + Validate(text string) (bool, string) + ///去除空格等噪音 + RemoveNoise(text string) string + ///更新去噪模式 + UpdateNoisePattern(pattern string) + } +) + +var ( + defsys ISys +) + +func OnInit(config map[string]interface{}, opt ...Option) (err error) { + var option *Options + if option, err = newOptions(config, opt...); err != nil { + return + } + defsys, err = newSys(option) + return +} + +func NewSys(opt ...Option) (sys ISys, err error) { + var option *Options + if option, err = newOptionsByOption(opt...); err != nil { + return + } + sys, err = newSys(option) + return +} + +//加载文件 +func LoadWordDict(path string) error { + return defsys.LoadWordDict(path) +} + +//加载网络数据 +func LoadNetWordDict(url string) error { + return defsys.LoadNetWordDict(url) +} + +///加载 +func Load(rd io.Reader) error { + return defsys.Load(rd) +} + +///添加敏感词 +func AddWord(words ...string) { + defsys.AddWord(words...) +} + +///删除敏感词 +func DelWord(words ...string) { + defsys.DelWord(words...) +} + +///过滤敏感词 +func Filter(text string) string { + return defsys.Filter(text) +} + +///和谐敏感词 +func Replace(text string, repl rune) string { + return defsys.Replace(text, repl) +} + +///检测敏感词 +func FindIn(text string) (bool, string) { + return defsys.FindIn(text) +} + +///找到所有匹配词 +func FindAll(text string) []string { + return defsys.FindAll(text) +} + +///检测字符串是否合法 +func Validate(text string) (bool, string) { + return defsys.Validate(text) +} + +///去除空格等噪音 +func RemoveNoise(text string) string { + return defsys.RemoveNoise(text) +} + +///更新去噪模式 +func UpdateNoisePattern(pattern string) { + defsys.UpdateNoisePattern(pattern) +} diff --git a/sys/wordfilter/options.go b/sys/wordfilter/options.go new file mode 100644 index 000000000..e31639823 --- /dev/null +++ b/sys/wordfilter/options.go @@ -0,0 +1,53 @@ +package wordfilter + +import ( + "errors" + + "go_dreamfactory/lego/sys/log" + "go_dreamfactory/lego/utils/mapstructure" +) + +type Option func(*Options) +type Options struct { + Debug bool //日志是否开启 + Log log.Ilogf +} + +func SetDebug(v bool) Option { + return func(o *Options) { + o.Debug = v + } +} + +func SetLog(v log.Ilogf) Option { + return func(o *Options) { + o.Log = v + } +} + +func newOptions(config map[string]interface{}, opts ...Option) (options *Options, err error) { + options = &Options{} + if config != nil { + mapstructure.Decode(config, &options) + } + for _, o := range opts { + o(options) + } + + if options.Log = log.NewTurnlog(options.Debug, log.Clone("sys.Blockcache", 2)); options.Log == nil { + err = errors.New("log is nil") + } + + return +} + +func newOptionsByOption(opts ...Option) (options *Options, err error) { + options = &Options{} + for _, o := range opts { + o(options) + } + if options.Log = log.NewTurnlog(options.Debug, log.Clone("sys.Blockcache", 2)); options.Log == nil { + err = errors.New("log is nil") + } + return +} diff --git a/sys/wordfilter/sys.go b/sys/wordfilter/sys.go new file mode 100644 index 000000000..c83a87925 --- /dev/null +++ b/sys/wordfilter/sys.go @@ -0,0 +1,113 @@ +package wordfilter + +import ( + "bufio" + "io" + "net/http" + "os" + "regexp" + "time" +) + +func newSys(options *Options) (sys *Sys, err error) { + sys = &Sys{ + options: options, + trie: NewTrie(), + noise: regexp.MustCompile(`[\|\s&%$@*]+`), + } + return +} + +type Sys struct { + options *Options + trie *Trie + noise *regexp.Regexp +} + +// UpdateNoisePattern 更新去噪模式 +func (Sys *Sys) UpdateNoisePattern(pattern string) { + Sys.noise = regexp.MustCompile(pattern) +} + +// LoadWordDict 加载敏感词字典 +func (Sys *Sys) LoadWordDict(path string) error { + f, err := os.Open(path) + if err != nil { + return err + } + defer f.Close() + + return Sys.Load(f) +} + +// LoadNetWordDict 加载网络敏感词字典 +func (Sys *Sys) LoadNetWordDict(url string) error { + c := http.Client{ + Timeout: 5 * time.Second, + } + rsp, err := c.Get(url) + if err != nil { + return err + } + defer rsp.Body.Close() + + return Sys.Load(rsp.Body) +} + +// Load common method to add words +func (Sys *Sys) Load(rd io.Reader) error { + buf := bufio.NewReader(rd) + for { + line, _, err := buf.ReadLine() + if err != nil { + if err != io.EOF { + return err + } + break + } + Sys.trie.Add(string(line)) + } + return nil +} + +// AddWord 添加敏感词 +func (Sys *Sys) AddWord(words ...string) { + Sys.trie.Add(words...) +} + +// DelWord 删除敏感词 +func (Sys *Sys) DelWord(words ...string) { + Sys.trie.Del(words...) +} + +// Sys 过滤敏感词 +func (Sys *Sys) Filter(text string) string { + return Sys.trie.Filter(text) +} + +// Replace 和谐敏感词 +func (Sys *Sys) Replace(text string, repl rune) string { + return Sys.trie.Replace(text, repl) +} + +// FindIn 检测敏感词 +func (Sys *Sys) FindIn(text string) (bool, string) { + text = Sys.RemoveNoise(text) + return Sys.trie.FindIn(text) +} + +// FindAll 找到所有匹配词 +func (Sys *Sys) FindAll(text string) []string { + return Sys.trie.FindAll(text) +} + +// Validate 检测字符串是否合法 +func (Sys *Sys) Validate(text string) (bool, string) { + text = Sys.RemoveNoise(text) + return Sys.trie.Validate(text) +} + +// RemoveNoise 去除空格等噪音 +func (Sys *Sys) RemoveNoise(text string) string { + return Sys.noise.ReplaceAllString(text, "") +} diff --git a/sys/wordfilter/trie.go b/sys/wordfilter/trie.go new file mode 100644 index 000000000..fb3e61c49 --- /dev/null +++ b/sys/wordfilter/trie.go @@ -0,0 +1,272 @@ +package wordfilter + +// Node Trie树上的一个节点. +type Node struct { + isRootNode bool + isPathEnd bool + Character rune + Children map[rune]*Node +} + +// Trie 短语组成的Trie树. +type Trie struct { + Root *Node +} + +// NewTrie 新建一棵Trie +func NewTrie() *Trie { + return &Trie{ + Root: NewRootNode(0), + } +} + +// Add 添加若干个词 +func (tree *Trie) Add(words ...string) { + for _, word := range words { + tree.add(word) + } +} + +func (tree *Trie) add(word string) { + var current = tree.Root + var runes = []rune(word) + for position := 0; position < len(runes); position++ { + r := runes[position] + if next, ok := current.Children[r]; ok { + current = next + } else { + newNode := NewNode(r) + current.Children[r] = newNode + current = newNode + } + if position == len(runes)-1 { + current.isPathEnd = true + } + } +} + +func (tree *Trie) Del(words ...string) { + for _, word := range words { + tree.del(word) + } +} + +func (tree *Trie) del(word string) { + var current = tree.Root + var runes = []rune(word) + for position := 0; position < len(runes); position++ { + r := runes[position] + if next, ok := current.Children[r]; !ok { + return + } else { + current = next + } + + if position == len(runes)-1 { + current.SoftDel() + } + } +} + +// Replace 词语替换 +func (tree *Trie) Replace(text string, character rune) string { + var ( + parent = tree.Root + current *Node + runes = []rune(text) + length = len(runes) + left = 0 + found bool + ) + + for position := 0; position < len(runes); position++ { + current, found = parent.Children[runes[position]] + + if !found || (!current.IsPathEnd() && position == length-1) { + parent = tree.Root + position = left + left++ + continue + } + + // println(string(current.Character), current.IsPathEnd(), left) + if current.IsPathEnd() && left <= position { + for i := left; i <= position; i++ { + runes[i] = character + } + } + + parent = current + } + + return string(runes) +} + +// Filter 直接过滤掉字符串中的敏感词 +func (tree *Trie) Filter(text string) string { + var ( + parent = tree.Root + current *Node + left = 0 + found bool + runes = []rune(text) + length = len(runes) + resultRunes = make([]rune, 0, length) + ) + + for position := 0; position < length; position++ { + current, found = parent.Children[runes[position]] + + if !found || (!current.IsPathEnd() && position == length-1) { + resultRunes = append(resultRunes, runes[left]) + parent = tree.Root + position = left + left++ + continue + } + + if current.IsPathEnd() { + left = position + 1 + parent = tree.Root + } else { + parent = current + } + + } + + resultRunes = append(resultRunes, runes[left:]...) + return string(resultRunes) +} + +// Validate 验证字符串是否合法,如不合法则返回false和检测到 +// 的第一个敏感词 +func (tree *Trie) Validate(text string) (bool, string) { + const ( + Empty = "" + ) + var ( + parent = tree.Root + current *Node + runes = []rune(text) + length = len(runes) + left = 0 + found bool + ) + + for position := 0; position < len(runes); position++ { + current, found = parent.Children[runes[position]] + + if !found || (!current.IsPathEnd() && position == length-1) { + parent = tree.Root + position = left + left++ + continue + } + + if current.IsPathEnd() && left <= position { + return false, string(runes[left : position+1]) + } + + parent = current + } + + return true, Empty +} + +// FindIn 判断text中是否含有词库中的词 +func (tree *Trie) FindIn(text string) (bool, string) { + validated, first := tree.Validate(text) + return !validated, first +} + +// FindAll 找有所有包含在词库中的词 +func (tree *Trie) FindAll(text string) []string { + var matches []string + var ( + parent = tree.Root + current *Node + runes = []rune(text) + length = len(runes) + left = 0 + found bool + ) + + for position := 0; position < length; position++ { + current, found = parent.Children[runes[position]] + + if !found { + parent = tree.Root + position = left + left++ + continue + } + + if current.IsPathEnd() && left <= position { + matches = append(matches, string(runes[left:position+1])) + } + + if position == length-1 { + parent = tree.Root + position = left + left++ + continue + } + + parent = current + } + + var i = 0 + if count := len(matches); count > 0 { + set := make(map[string]struct{}) + for i < count { + _, ok := set[matches[i]] + if !ok { + set[matches[i]] = struct{}{} + i++ + continue + } + count-- + copy(matches[i:], matches[i+1:]) + } + return matches[:count] + } + + return nil +} + +// NewNode 新建子节点 +func NewNode(character rune) *Node { + return &Node{ + Character: character, + Children: make(map[rune]*Node, 0), + } +} + +// NewRootNode 新建根节点 +func NewRootNode(character rune) *Node { + return &Node{ + isRootNode: true, + Character: character, + Children: make(map[rune]*Node, 0), + } +} + +// IsLeafNode 判断是否叶子节点 +func (node *Node) IsLeafNode() bool { + return len(node.Children) == 0 +} + +// IsRootNode 判断是否为根节点 +func (node *Node) IsRootNode() bool { + return node.isRootNode +} + +// IsPathEnd 判断是否为某个路径的结束 +func (node *Node) IsPathEnd() bool { + return node.isPathEnd +} + +// SoftDel 置软删除状态 +func (node *Node) SoftDel() { + node.isPathEnd = false +}