上传敏感词过滤系统

This commit is contained in:
liwei1dao 2022-08-29 10:38:32 +08:00
parent d94e4a4460
commit 994162252b
4 changed files with 557 additions and 0 deletions

119
sys/wordfilter/core.go Normal file
View File

@ -0,0 +1,119 @@
package wordfilter
import "io"
/*
System: word guard.
Description: sensitive-word filtering implemented with the DFA algorithm —
filter, validate and replace sensitive words.
*/
type (
	// ISys is the sensitive-word filter API. The package-level functions of
	// the same names forward to an ISys value.
	ISys interface {
		// LoadWordDict loads a word dictionary from a file.
		LoadWordDict(path string) error
		// LoadNetWordDict loads a word dictionary from the network.
		LoadNetWordDict(url string) error
		// Load loads words from rd.
		Load(rd io.Reader) error
		// AddWord adds sensitive words.
		AddWord(words ...string)
		// DelWord removes sensitive words.
		DelWord(words ...string)
		// Filter filters sensitive words out of text.
		Filter(text string) string
		// Replace masks sensitive words in text with repl.
		Replace(text string, repl rune) string
		// FindIn detects sensitive words in text.
		FindIn(text string) (bool, string)
		// FindAll finds every matching word in text.
		FindAll(text string) []string
		// Validate checks whether text is legal.
		Validate(text string) (bool, string)
		// RemoveNoise strips whitespace and other noise from text.
		RemoveNoise(text string) string
		// UpdateNoisePattern updates the noise-removal pattern.
		UpdateNoisePattern(pattern string)
	}
)
var (
	// defsys is the package-level default instance: OnInit assigns it and
	// the package-level helper functions forward their calls to it.
	defsys ISys
)
// OnInit builds the package-level default filter from config and opt and
// stores it in defsys, which backs the package-level helper functions.
// It returns the error reported by newOptions or newSys, if any.
func OnInit(config map[string]interface{}, opt ...Option) (err error) {
	option, err := newOptions(config, opt...)
	if err != nil {
		return err
	}
	defsys, err = newSys(option)
	return err
}
// NewSys creates an independent filter instance configured only through
// functional options. It returns the error reported by newOptionsByOption or
// newSys, if any.
func NewSys(opt ...Option) (sys ISys, err error) {
	option, err := newOptionsByOption(opt...)
	if err != nil {
		return sys, err
	}
	sys, err = newSys(option)
	return sys, err
}
// LoadWordDict loads a word dictionary file by forwarding path to
// defsys.LoadWordDict. Requires the default instance installed by OnInit.
func LoadWordDict(path string) error {
	return defsys.LoadWordDict(path)
}
// LoadNetWordDict loads a word dictionary from the network by forwarding url
// to defsys.LoadNetWordDict. Requires the default instance installed by OnInit.
func LoadNetWordDict(url string) error {
	return defsys.LoadNetWordDict(url)
}
// Load loads words from rd by forwarding it to defsys.Load.
// Requires the default instance installed by OnInit.
func Load(rd io.Reader) error {
	return defsys.Load(rd)
}
// AddWord adds sensitive words by forwarding them to defsys.AddWord.
// Requires the default instance installed by OnInit.
func AddWord(words ...string) {
	defsys.AddWord(words...)
}
// DelWord removes sensitive words by forwarding them to defsys.DelWord.
// Requires the default instance installed by OnInit.
func DelWord(words ...string) {
	defsys.DelWord(words...)
}
// Filter filters sensitive words out of text and returns the result of
// defsys.Filter. Requires the default instance installed by OnInit.
func Filter(text string) string {
	return defsys.Filter(text)
}
// Replace masks sensitive words in text with repl and returns the result of
// defsys.Replace. Requires the default instance installed by OnInit.
func Replace(text string, repl rune) string {
	return defsys.Replace(text, repl)
}
// FindIn detects sensitive words in text and returns the result of
// defsys.FindIn. Requires the default instance installed by OnInit.
func FindIn(text string) (bool, string) {
	return defsys.FindIn(text)
}
// FindAll finds every matching word in text and returns the result of
// defsys.FindAll. Requires the default instance installed by OnInit.
func FindAll(text string) []string {
	return defsys.FindAll(text)
}
// Validate checks whether text is legal and returns the result of
// defsys.Validate. Requires the default instance installed by OnInit.
func Validate(text string) (bool, string) {
	return defsys.Validate(text)
}
// RemoveNoise strips whitespace and other noise from text and returns the
// result of defsys.RemoveNoise. Requires the default instance installed by
// OnInit.
func RemoveNoise(text string) string {
	return defsys.RemoveNoise(text)
}
// UpdateNoisePattern updates the noise-removal pattern by forwarding pattern
// to defsys.UpdateNoisePattern. Requires the default instance installed by
// OnInit.
func UpdateNoisePattern(pattern string) {
	defsys.UpdateNoisePattern(pattern)
}

53
sys/wordfilter/options.go Normal file
View File

@ -0,0 +1,53 @@
package wordfilter
import (
"errors"
"go_dreamfactory/lego/sys/log"
"go_dreamfactory/lego/utils/mapstructure"
)
// Option is a functional option: a function that mutates an Options value.
type Option func(*Options)
// Options holds the settings of a word-filter instance.
type Options struct {
	Debug bool //whether logging is enabled
	// Log is the logger of the instance; it is set through SetLog or by the
	// options constructors.
	Log log.Ilogf
}
// SetDebug returns an Option that stores v in Options.Debug.
func SetDebug(v bool) Option {
	apply := func(target *Options) { target.Debug = v }
	return apply
}
// SetLog returns an Option that stores v in Options.Log.
func SetLog(v log.Ilogf) Option {
	apply := func(target *Options) { target.Log = v }
	return apply
}
// newOptions builds an Options value from a config map and then applies the
// functional options on top of it.
//
// config may be nil, in which case only opts are applied. A logger supplied
// through SetLog is kept; a default one is created with log.NewTurnlog only
// when none was supplied. A non-nil error is returned when config cannot be
// decoded or when no logger could be created; options is nil in that case.
func newOptions(config map[string]interface{}, opts ...Option) (options *Options, err error) {
	options = &Options{}
	if config != nil {
		// Decode straight into the struct and surface its error instead of
		// silently continuing with a partially populated Options.
		if err = mapstructure.Decode(config, options); err != nil {
			return nil, err
		}
	}
	for _, o := range opts {
		o(options)
	}
	if options.Log == nil {
		// NOTE(review): the "sys.Blockcache" tag looks copied from another
		// package; it is kept unchanged here — confirm the intended tag.
		if options.Log = log.NewTurnlog(options.Debug, log.Clone("sys.Blockcache", 2)); options.Log == nil {
			return nil, errors.New("log is nil")
		}
	}
	return options, nil
}
// newOptionsByOption builds an Options value from functional options only.
//
// A logger supplied through SetLog is kept; a default one is created with
// log.NewTurnlog only when none was supplied. A non-nil error is returned
// when no logger could be created; options is nil in that case.
func newOptionsByOption(opts ...Option) (options *Options, err error) {
	options = &Options{}
	for _, o := range opts {
		o(options)
	}
	if options.Log == nil {
		// NOTE(review): the "sys.Blockcache" tag looks copied from another
		// package; it is kept unchanged here — confirm the intended tag.
		if options.Log = log.NewTurnlog(options.Debug, log.Clone("sys.Blockcache", 2)); options.Log == nil {
			return nil, errors.New("log is nil")
		}
	}
	return options, nil
}

113
sys/wordfilter/sys.go Normal file
View File

@ -0,0 +1,113 @@
package wordfilter
import (
"bufio"
"io"
"net/http"
"os"
"regexp"
"time"
)
// newSys creates a Sys with an empty trie and the default noise pattern,
// which matches runs of '|', whitespace, '&', '%', '$', '@' and '*'.
// The returned error is always nil.
func newSys(options *Options) (sys *Sys, err error) {
	sys = new(Sys)
	sys.options = options
	sys.trie = NewTrie()
	sys.noise = regexp.MustCompile(`[\|\s&%$@*]+`)
	return sys, nil
}
// Sys is the trie-backed word filter.
type Sys struct {
	options *Options       // configuration passed to newSys
	trie    *Trie          // dictionary of sensitive words
	noise   *regexp.Regexp // pattern whose matches RemoveNoise deletes
}
// UpdateNoisePattern replaces the noise pattern used by RemoveNoise with the
// regular expression pattern. It panics when pattern does not compile,
// because the expression is built with regexp.MustCompile.
func (s *Sys) UpdateNoisePattern(pattern string) {
	compiled := regexp.MustCompile(pattern)
	s.noise = compiled
}
// LoadWordDict opens the file at path and feeds it to Load. It returns the
// error from os.Open or from Load; the file is closed before returning.
func (s *Sys) LoadWordDict(path string) error {
	file, openErr := os.Open(path)
	if openErr != nil {
		return openErr
	}
	defer file.Close()
	return s.Load(file)
}
// httpStatusError reports a dictionary download that was answered with a
// non-2xx HTTP status.
type httpStatusError struct {
	url    string
	status string
}

// Error implements the error interface.
func (e *httpStatusError) Error() string {
	return "wordfilter: GET " + e.url + ": unexpected status " + e.status
}

// LoadNetWordDict downloads the dictionary at url with an HTTP GET (5 second
// client timeout) and feeds the response body to Load.
//
// It returns the transport error from the request, an *httpStatusError when
// the server answers with a non-2xx status, or the error from Load. The
// status check matters because http.Client.Get does not treat error statuses
// as failures; without it the body of an error page would be loaded as words.
func (s *Sys) LoadNetWordDict(url string) error {
	client := http.Client{
		Timeout: 5 * time.Second,
	}
	rsp, err := client.Get(url)
	if err != nil {
		return err
	}
	defer rsp.Body.Close()
	if rsp.StatusCode < http.StatusOK || rsp.StatusCode >= http.StatusMultipleChoices {
		return &httpStatusError{url: url, status: rsp.Status}
	}
	return s.Load(rsp.Body)
}
// Load reads rd line by line and adds every line to the trie as one word.
//
// Line terminators are stripped by bufio.Reader.ReadLine. A line longer than
// the reader's buffer is delivered by ReadLine in several fragments; those
// fragments are reassembled here so the line is added as a single word
// instead of several unrelated ones. Load returns nil at end of input and
// any other read error as is.
func (s *Sys) Load(rd io.Reader) error {
	reader := bufio.NewReader(rd)
	var pending []byte // fragments of a line that did not fit the buffer
	for {
		chunk, isPrefix, err := reader.ReadLine()
		if err != nil {
			if err == io.EOF {
				return nil
			}
			return err
		}
		if isPrefix || len(pending) > 0 {
			// chunk is only valid until the next read, so copy it out.
			pending = append(pending, chunk...)
			if isPrefix {
				continue
			}
			chunk, pending = pending, pending[:0]
		}
		s.trie.Add(string(chunk))
	}
}
// AddWord inserts the given words into the filter's trie.
func (s *Sys) AddWord(words ...string) {
	s.trie.Add(words...)
}
// DelWord removes the given words from the filter's trie.
func (s *Sys) DelWord(words ...string) {
	s.trie.Del(words...)
}
// Filter returns the result of the trie's Filter for text, which filters the
// sensitive words out of it. text is passed to the trie as is.
func (s *Sys) Filter(text string) string {
	filtered := s.trie.Filter(text)
	return filtered
}
// Replace returns the result of the trie's Replace for text, which masks
// sensitive words with repl. text is passed to the trie as is.
func (s *Sys) Replace(text string, repl rune) string {
	masked := s.trie.Replace(text, repl)
	return masked
}
// FindIn strips noise from text with RemoveNoise and returns the result of
// the trie's FindIn for the cleaned text.
func (s *Sys) FindIn(text string) (bool, string) {
	cleaned := s.RemoveNoise(text)
	return s.trie.FindIn(cleaned)
}
// FindAll returns the result of the trie's FindAll for text. text is passed
// to the trie as is; no noise removal is applied here.
func (s *Sys) FindAll(text string) []string {
	found := s.trie.FindAll(text)
	return found
}
// Validate strips noise from text with RemoveNoise and returns the result of
// the trie's Validate for the cleaned text.
func (s *Sys) Validate(text string) (bool, string) {
	cleaned := s.RemoveNoise(text)
	return s.trie.Validate(cleaned)
}
// RemoveNoise returns text with every match of the current noise pattern
// deleted.
func (s *Sys) RemoveNoise(text string) string {
	const replacement = ""
	return s.noise.ReplaceAllString(text, replacement)
}

272
sys/wordfilter/trie.go Normal file
View File

@ -0,0 +1,272 @@
package wordfilter
// Node is a single node of the trie.
type Node struct {
	isRootNode bool           // set by NewRootNode
	isPathEnd  bool           // true when the path from the root to this node spells a stored word
	Character  rune           // rune this node stands for
	Children   map[rune]*Node // child nodes keyed by their rune
}
// Trie is a trie built from phrases.
type Trie struct {
	Root *Node // root node; the stored words hang below it
}
// NewTrie returns an empty trie whose root node carries the rune 0.
func NewTrie() *Trie {
	root := NewRootNode(0)
	return &Trie{Root: root}
}
// Add inserts each of the given words into the trie.
func (tree *Trie) Add(words ...string) {
	for i := range words {
		tree.add(words[i])
	}
}
// add inserts a single word: it walks the trie rune by rune, creating the
// nodes that are missing, and marks the node of the last rune as the end of
// a word. An empty word changes nothing.
func (tree *Trie) add(word string) {
	node := tree.Root
	for _, r := range word {
		child, exists := node.Children[r]
		if !exists {
			child = NewNode(r)
			node.Children[r] = child
		}
		node = child
	}
	if word != "" {
		node.isPathEnd = true
	}
}
// Del removes each of the given words from the trie.
func (tree *Trie) Del(words ...string) {
	for i := range words {
		tree.del(words[i])
	}
}
// del removes a single word by soft deletion: it follows the word's path and
// clears the end-of-word mark on the node of its last rune. Nodes are never
// unlinked. If the path does not exist, or word is empty, nothing changes.
func (tree *Trie) del(word string) {
	node := tree.Root
	for _, r := range word {
		child, exists := node.Children[r]
		if !exists {
			return
		}
		node = child
	}
	if word != "" {
		node.SoftDel()
	}
}
// Replace returns text with the runes of matched dictionary words overwritten
// by character; the result has as many runes as text.
//
// From each start position the trie is followed as far as the text allows,
// and every time a node ending a word is reached the runes from the start
// position up to that node are masked, so the longest word found at that
// position determines the masked span. When the walk cannot continue, the
// scan restarts one rune after the start position.
func (tree *Trie) Replace(text string, character rune) string {
	chars := []rune(text)
	last := len(chars) - 1
	node := tree.Root
	start := 0 // index where the current match attempt began
	for pos := 0; pos <= last; pos++ {
		next, ok := node.Children[chars[pos]]
		if !ok || (pos == last && !next.IsPathEnd()) {
			// Dead end: retry from the rune that follows start.
			node = tree.Root
			pos = start
			start++
			continue
		}
		if next.IsPathEnd() && start <= pos {
			masked := chars[start : pos+1]
			for i := range masked {
				masked[i] = character
			}
		}
		node = next
	}
	return string(chars)
}
// Filter returns text with the matched dictionary words removed.
//
// From each start position the trie is followed until the first node that
// ends a word, so the shortest word at that position is the one dropped;
// scanning then resumes right after it. A rune at which no word starts is
// copied to the result unchanged.
func (tree *Trie) Filter(text string) string {
	chars := []rune(text)
	total := len(chars)
	kept := make([]rune, 0, total)
	node := tree.Root
	start := 0 // index of the first rune not yet kept or dropped
	for pos := 0; pos < total; pos++ {
		next, ok := node.Children[chars[pos]]
		switch {
		case !ok || (pos == total-1 && !next.IsPathEnd()):
			// No word starts at start: keep that rune, retry one rune later.
			kept = append(kept, chars[start])
			node = tree.Root
			pos = start
			start++
		case next.IsPathEnd():
			// chars[start..pos] is a stored word: skip it entirely.
			start = pos + 1
			node = tree.Root
		default:
			node = next
		}
	}
	return string(append(kept, chars[start:]...))
}
// Validate reports whether text is free of dictionary words. When a word is
// found it returns false together with the first word detected; otherwise it
// returns true and an empty string.
//
// Start positions are tried from left to right, and the walk stops at the
// first node that ends a word.
func (tree *Trie) Validate(text string) (bool, string) {
	chars := []rune(text)
	last := len(chars) - 1
	node := tree.Root
	start := 0 // index where the current match attempt began
	for pos := 0; pos <= last; pos++ {
		next, ok := node.Children[chars[pos]]
		if !ok || (pos == last && !next.IsPathEnd()) {
			// Dead end: retry from the rune that follows start.
			node = tree.Root
			pos = start
			start++
			continue
		}
		if next.IsPathEnd() && start <= pos {
			return false, string(chars[start : pos+1])
		}
		node = next
	}
	return true, ""
}
// FindIn reports whether text contains a dictionary word and returns the
// first one detected. It is Validate with the boolean inverted.
func (tree *Trie) FindIn(text string) (bool, string) {
	ok, word := tree.Validate(text)
	return !ok, word
}
// FindAll returns every distinct dictionary word that occurs in text, in
// order of first detection, or nil when there is none.
//
// Each start position is tried in turn and the trie is followed as far as
// the text allows, recording a match at every node that ends a word, so
// overlapping and nested words are all reported. Duplicates are then removed
// in a single in-place pass that keeps first occurrences.
func (tree *Trie) FindAll(text string) []string {
	var (
		matches []string
		chars   = []rune(text)
		last    = len(chars) - 1
		node    = tree.Root
		start   = 0 // index where the current match attempt began
	)
	for pos := 0; pos <= last; pos++ {
		next, ok := node.Children[chars[pos]]
		if !ok {
			node = tree.Root
			pos = start
			start++
			continue
		}
		if next.IsPathEnd() && start <= pos {
			matches = append(matches, string(chars[start:pos+1]))
		}
		if pos == last {
			// End of text reached: move on to the next start position.
			node = tree.Root
			pos = start
			start++
			continue
		}
		node = next
	}
	if len(matches) == 0 {
		return nil
	}
	// The write index never passes the read index, so reusing the backing
	// array of matches is safe.
	seen := make(map[string]struct{}, len(matches))
	unique := matches[:0]
	for _, m := range matches {
		if _, dup := seen[m]; dup {
			continue
		}
		seen[m] = struct{}{}
		unique = append(unique, m)
	}
	return unique
}
// NewNode returns a non-root node for character with an empty child map.
func NewNode(character rune) *Node {
	node := new(Node)
	node.Character = character
	node.Children = map[rune]*Node{}
	return node
}
// NewRootNode returns a node for character that is flagged as the root and
// has an empty child map.
func NewRootNode(character rune) *Node {
	root := &Node{Character: character, Children: map[rune]*Node{}}
	root.isRootNode = true
	return root
}
// IsLeafNode reports whether the node is a leaf, i.e. has no children.
func (node *Node) IsLeafNode() bool {
	return len(node.Children) == 0
}
// IsRootNode reports whether the node is flagged as the root node.
func (node *Node) IsRootNode() bool {
	return node.isRootNode
}
// IsPathEnd reports whether the node is the end of some path, i.e. the last
// rune of a stored word.
func (node *Node) IsPathEnd() bool {
	return node.isPathEnd
}
// SoftDel puts the node in the soft-deleted state by clearing its
// end-of-path flag; the node itself and its children stay in place.
func (node *Node) SoftDel() {
	node.isPathEnd = false
}