派生自 projectDept/qhighschool

11
EricsHu
2023-05-26 1bce00f5b3614ca12b683dfe8a3cf733e1ed68b6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
package com.qxueyou.scc.Sensitive;
 
import java.io.*;
import java.util.ArrayList;
import java.util.Collections;
 
 
public final class SensitiveWordFilter {
    public static List wordList;
    private final static char replace = '*'; // 替代字符
    private final static char[] skip = new char[]{ // 遇到这些字符就会跳过,例如,如果"AB"是敏感词,那么"A B","A=B"也会被屏蔽
            '!','*','-','+','_','=',',','.','@'
    };
 
    /**
     * 敏感词替换
     * @param text 待替换文本
     * @return 替换后的文本
     */
    public static String Filter(String text){
        if(wordList == null || wordList.size() == 0) return text;
        char[] __char__ = text.toCharArray(); // 把String转化成char数组,便于遍历
        int i,j;
        Word word;
        boolean flag; // 是否需要替换
        for(i=0;i<__char__.length;i++){ // 遍历所有字符
            char c = __char__[i];
            word = wordList.binaryGet(c); // 使用二分查找来寻找字符,提高效率
            if(word != null){ // word != null说明找到了
                flag = false;
                j = i+1;
                while (j < __char__.length){ // 开始逐个比较后面的字符
                    if(skip(__char__[j])) { // 跳过空格之类的无关字符
                        j++;
                        continue;
                    }
                    if(word.next != null){ // 字符串尚未结束,不确定是否存在敏感词
                        /*
                        以下代码并没有使用二分查找,因为以同一个字符开头的敏感词较少
                        例如,wordList中记录了所有敏感词的开头第一个字,它的数量通常会有上千个
                        假如现在锁定了字符“T”开头的敏感词,而“T”开头的敏感词只有10个,这时使用二分查找的效率反而低于顺序查找
                         */
                        word = word.next.get(__char__[j]);
                        if(word == null){
                            break;
                        }
                        j++;
                    }else { // 字符串已结束,存在敏感词汇
                        flag = true;
                        break;
                    }
                }
                if(word != null && word.next == null){
                    flag = true;
                }
                if(flag){ // 如果flag==true,说明检测出敏感粗,需要替换
                    while (i<j){
                        if(skip(__char__[i])){ // 跳过空格之类的无关字符,如果要把空格也替换成'*',则删除这个if语句
                            i++;
                            continue;
                        }
                        __char__[i] = replace;
                        i++;
                    }
                    i--;
                }
            }
        }
        return new String(__char__);
    }
 
    /**
     * 加载敏感词列表
     * @param words 敏感词数组
     */
    public static void loadWord(ArrayList<String> words){
        if(words == null) return;
        char[] chars;
        List now;
        Word word;
        wordList = new List();
        for(String __word__:words){
            if(__word__ == null) continue;
            chars = __word__.toCharArray();
            now = wordList;
            word = null;
            for(char c:chars){
                if(word != null) {
                    if(word.next == null) word.next = new List();
                    now = word.next;
                }
                word = now.get(c);
                if(word == null) word = now.add(c);
            }
        }
        sort(wordList);
    }
 
    /**
     * 加载敏感词txt文件,每个敏感词独占一行,不可出现空格,空行,逗号等非文字内容,必须使用UTF-8编码
     * @param path txt文件的绝对地址
     */
    public static void loadWordFromFile(String path){
        String encoding = "UTF-8";
        File file = new File(path);
        try{
            if(file.isFile() && file.exists()){
                InputStreamReader inputStreamReader = new InputStreamReader(
                        new FileInputStream(file),encoding
                );
                BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
                String line;
                ArrayList<String> list = new ArrayList<>();
                while ((line = bufferedReader.readLine()) != null){
                    list.add(line);
                }
                bufferedReader.close();
                inputStreamReader.close();
                loadWord(list);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
 
    /**
     * 对敏感词多叉树递增排序
     * @param list 待排序List
     */
    private static void sort(List list){
        if(list == null) return;
        Collections.sort(list); // 递增排序
        for(Word word:list){
            sort(word.next);
        }
    }
 
    /**
     * 判断是否跳过当前字符
     * @param c 待检测字符
     * @return true:需要跳过   false:不需要跳过
     */
    private static boolean skip(char c){
        for(char c1:skip){
            if(c1 == c) return true;
        }
        return false;
    }
}