~jqueryzhang/qhighschool.git - Gitblit for Qxueyou

parent: 5196a1d1 | 补丁 | 提交 | show whitespace

胡仁荣

2023-04-12 3220d856699b2adc93ee528eee78d49409f22fd7

新增敏感字屏蔽功能

2个文件已修改

4个文件已添加

	src/main/java/com/qxueyou/scc/Sensitive/List.java	41 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	src/main/java/com/qxueyou/scc/Sensitive/SensitiveWordFilter.java	149 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	src/main/java/com/qxueyou/scc/Sensitive/Word.java	19 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	src/main/java/com/qxueyou/scc/operation/topic/action/StuTopicController.java	5 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	src/main/java/com/qxueyou/scc/wx/service/impl/WechatService.java	1 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	src/main/resources/SensitiveWordList.txt	14596 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史

 src/main/java/com/qxueyou/scc/Sensitive/List.java

New file
@@ -0,0 +1,41 @@
package com.qxueyou.scc.Sensitive;

import java.util.ArrayList;

public class List extends ArrayList<Word> {
    public Word get(char c){
        for(Word w :this){
            if(w.c == c) return w;
        }
        return null;
    }

    /**
     * 二分查找，必须先升序排序
     * @param c 需要查找的字符
     * @return Word对象：如果找到   null:如果没找到
     */
    public Word binaryGet(char c){
        int left,right,key;
        Word word;
        left = 0;right = this.size()-1;
        while (left <= right){
            key = (left + right) / 2;
            word = get(key);
            if(word.c == c){
                return word;
            }else if(word.c > c){
                right = key - 1;
            }else {
                left = key + 1;
            }
        }
        return null;
    }

    public Word add(char c){
        Word word = new Word(c);
        super.add(word);
        return word;
    }
}

 src/main/java/com/qxueyou/scc/Sensitive/SensitiveWordFilter.java

New file
@@ -0,0 +1,149 @@
package com.qxueyou.scc.Sensitive;

import java.io.*;
import java.util.ArrayList;
import java.util.Collections;


public final class SensitiveWordFilter {
    public static List wordList;
    private final static char replace = '*'; // 替代字符
    private final static char[] skip = new char[]{ // 遇到这些字符就会跳过，例如,如果"AB"是敏感词，那么"A B","A=B"也会被屏蔽
            '!','*','-','+','_','=',',','.','@'
    };

    /**
     * 敏感词替换
     * @param text 待替换文本
     * @return 替换后的文本
     */
    public static String Filter(String text){
        if(wordList == null || wordList.size() == 0) return text;
        char[] __char__ = text.toCharArray(); // 把String转化成char数组，便于遍历
        int i,j;
        Word word;
        boolean flag; // 是否需要替换
        for(i=0;i<__char__.length;i++){ // 遍历所有字符
            char c = __char__[i];
            word = wordList.binaryGet(c); // 使用二分查找来寻找字符，提高效率
            if(word != null){ // word != null说明找到了
                flag = false;
                j = i+1;
                while (j < __char__.length){ // 开始逐个比较后面的字符
                    if(skip(__char__[j])) { // 跳过空格之类的无关字符
                        j++;
                        continue;
                    }
                    if(word.next != null){ // 字符串尚未结束，不确定是否存在敏感词
                        /*
                        以下代码并没有使用二分查找，因为以同一个字符开头的敏感词较少
                        例如，wordList中记录了所有敏感词的开头第一个字，它的数量通常会有上千个
                        假如现在锁定了字符“T”开头的敏感词，而“T”开头的敏感词只有10个，这时使用二分查找的效率反而低于顺序查找
                         */
                        word = word.next.get(__char__[j]);
                        if(word == null){
                            break;
                        }
                        j++;
                    }else { // 字符串已结束，存在敏感词汇
                        flag = true;
                        break;
                    }
                }
                if(word != null && word.next == null){
                    flag = true;
                }
                if(flag){ // 如果flag==true，说明检测出敏感粗，需要替换
                    while (i<j){
                        if(skip(__char__[i])){ // 跳过空格之类的无关字符，如果要把空格也替换成'*'，则删除这个if语句
                            i++;
                            continue;
                        }
                        __char__[i] = replace;
                        i++;
                    }
                    i--;
                }
            }
        }
        return new String(__char__);
    }

    /**
     * 加载敏感词列表
     * @param words 敏感词数组
     */
    public static void loadWord(ArrayList<String> words){
        if(words == null) return;
        char[] chars;
        List now;
        Word word;
        wordList = new List();
        for(String __word__:words){
            if(__word__ == null) continue;
            chars = __word__.toCharArray();
            now = wordList;
            word = null;
            for(char c:chars){
                if(word != null) {
                    if(word.next == null) word.next = new List();
                    now = word.next;
                }
                word = now.get(c);
                if(word == null) word = now.add(c);
            }
        }
        sort(wordList);
    }

    /**
     * 加载敏感词txt文件，每个敏感词独占一行，不可出现空格，空行，逗号等非文字内容,必须使用UTF-8编码
     * @param path txt文件的绝对地址
     */
    public static void loadWordFromFile(String path){
        String encoding = "UTF-8";
        File file = new File(path);
        try{
            if(file.isFile() && file.exists()){
                InputStreamReader inputStreamReader = new InputStreamReader(
                        new FileInputStream(file),encoding
                );
                BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
                String line;
                ArrayList<String> list = new ArrayList<>();
                while ((line = bufferedReader.readLine()) != null){
                    list.add(line);
                }
                bufferedReader.close();
                inputStreamReader.close();
                loadWord(list);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * 对敏感词多叉树递增排序
     * @param list 待排序List
     */
    private static void sort(List list){
        if(list == null) return;
        Collections.sort(list); // 递增排序
        for(Word word:list){
            sort(word.next);
        }
    }

    /**
     * 判断是否跳过当前字符
     * @param c 待检测字符
     * @return true:需要跳过   false:不需要跳过
     */
    private static boolean skip(char c){
        for(char c1:skip){
            if(c1 == c) return true;
        }
        return false;
    }
}

 src/main/java/com/qxueyou/scc/Sensitive/Word.java

New file
@@ -0,0 +1,19 @@
package com.qxueyou.scc.Sensitive;

public class Word implements Comparable<Word>{
    public char c;
    public List next = null;

    public Word(char c){
        this.c = c;
    }

    @Override
    public int compareTo(Word word) {
        return c - word.c;
    }

    public String toString(){
        return c + "(" + (next == null ? null : next.size()) + ")";
    }
}

 src/main/java/com/qxueyou/scc/operation/topic/action/StuTopicController.java

@@ -1,6 +1,7 @@

package com.qxueyou.scc.operation.topic.action;

import com.qxueyou.scc.Sensitive.SensitiveWordFilter;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.bind.annotation.*;
@@ -354,7 +355,9 @@
    @GetMapping(value = "addComment")
    public Result addComment(String content, String commentObjectId, String commentParentId, String commentedId,
            String commentedName) {
        return topicService.addComment(content, commentObjectId, commentParentId, commentedId, commentedName);
        SensitiveWordFilter.loadWordFromFile("D:\\工作\\SensitiveWord\\SensitiveWordList.txt");
        String filter = SensitiveWordFilter.Filter(content);
        return topicService.addComment(filter, commentObjectId, commentParentId, commentedId, commentedName);
    }

    /**

 src/main/java/com/qxueyou/scc/wx/service/impl/WechatService.java

@@ -1,6 +1,5 @@
package com.qxueyou.scc.wx.service.impl;

import cn.hutool.http.HttpUtil;
import com.alibaba.druid.util.HttpClientUtils;
import com.alibaba.druid.util.StringUtils;
import com.alibaba.fastjson.JSON;

 src/main/resources/SensitiveWordList.txt

New file
Diff too large

New file
			@@ -0,0 +1,41 @@
			package com.qxueyou.scc.Sensitive;

			import java.util.ArrayList;

			public class List extends ArrayList<Word> {
			public Word get(char c){
			for(Word w :this){
			if(w.c == c) return w;
			}
			return null;
			}

			/**
			* 二分查找，必须先升序排序
			* @param c 需要查找的字符
			* @return Word对象：如果找到 null:如果没找到
			*/
			public Word binaryGet(char c){
			int left,right,key;
			Word word;
			left = 0;right = this.size()-1;
			while (left <= right){
			key = (left + right) / 2;
			word = get(key);
			if(word.c == c){
			return word;
			}else if(word.c > c){
			right = key - 1;
			}else {
			left = key + 1;
			}
			}
			return null;
			}

			public Word add(char c){
			Word word = new Word(c);
			super.add(word);
			return word;
			}
			}

New file
			@@ -0,0 +1,149 @@
			package com.qxueyou.scc.Sensitive;

			import java.io.*;
			import java.util.ArrayList;
			import java.util.Collections;


			public final class SensitiveWordFilter {
			public static List wordList;
			private final static char replace = '*'; // 替代字符
			private final static char[] skip = new char[]{ // 遇到这些字符就会跳过，例如,如果"AB"是敏感词，那么"A B","A=B"也会被屏蔽
			'!','*','-','+','_','=',',','.','@'
			};

			/**
			* 敏感词替换
			* @param text 待替换文本
			* @return 替换后的文本
			*/
			public static String Filter(String text){
			if(wordList == null \|\| wordList.size() == 0) return text;
			char[] __char__ = text.toCharArray(); // 把String转化成char数组，便于遍历
			int i,j;
			Word word;
			boolean flag; // 是否需要替换
			for(i=0;i<__char__.length;i++){ // 遍历所有字符
			char c = __char__[i];
			word = wordList.binaryGet(c); // 使用二分查找来寻找字符，提高效率
			if(word != null){ // word != null说明找到了
			flag = false;
			j = i+1;
			while (j < __char__.length){ // 开始逐个比较后面的字符
			if(skip(__char__[j])) { // 跳过空格之类的无关字符
			j++;
			continue;
			}
			if(word.next != null){ // 字符串尚未结束，不确定是否存在敏感词
			/*
			以下代码并没有使用二分查找，因为以同一个字符开头的敏感词较少
			例如，wordList中记录了所有敏感词的开头第一个字，它的数量通常会有上千个
			假如现在锁定了字符“T”开头的敏感词，而“T”开头的敏感词只有10个，这时使用二分查找的效率反而低于顺序查找
			*/
			word = word.next.get(__char__[j]);
			if(word == null){
			break;
			}
			j++;
			}else { // 字符串已结束，存在敏感词汇
			flag = true;
			break;
			}
			}
			if(word != null && word.next == null){
			flag = true;
			}
			if(flag){ // 如果flag==true，说明检测出敏感粗，需要替换
			while (i<j){
			if(skip(__char__[i])){ // 跳过空格之类的无关字符，如果要把空格也替换成'*'，则删除这个if语句
			i++;
			continue;
			}
			__char__[i] = replace;
			i++;
			}
			i--;
			}
			}
			}
			return new String(__char__);
			}

			/**
			* 加载敏感词列表
			* @param words 敏感词数组
			*/
			public static void loadWord(ArrayList<String> words){
			if(words == null) return;
			char[] chars;
			List now;
			Word word;
			wordList = new List();
			for(String __word__:words){
			if(__word__ == null) continue;
			chars = __word__.toCharArray();
			now = wordList;
			word = null;
			for(char c:chars){
			if(word != null) {
			if(word.next == null) word.next = new List();
			now = word.next;
			}
			word = now.get(c);
			if(word == null) word = now.add(c);
			}
			}
			sort(wordList);
			}

			/**
			* 加载敏感词txt文件，每个敏感词独占一行，不可出现空格，空行，逗号等非文字内容,必须使用UTF-8编码
			* @param path txt文件的绝对地址
			*/
			public static void loadWordFromFile(String path){
			String encoding = "UTF-8";
			File file = new File(path);
			try{
			if(file.isFile() && file.exists()){
			InputStreamReader inputStreamReader = new InputStreamReader(
			new FileInputStream(file),encoding
			);
			BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
			String line;
			ArrayList<String> list = new ArrayList<>();
			while ((line = bufferedReader.readLine()) != null){
			list.add(line);
			}
			bufferedReader.close();
			inputStreamReader.close();
			loadWord(list);
			}
			} catch (IOException e) {
			e.printStackTrace();
			}
			}

			/**
			* 对敏感词多叉树递增排序
			* @param list 待排序List
			*/
			private static void sort(List list){
			if(list == null) return;
			Collections.sort(list); // 递增排序
			for(Word word:list){
			sort(word.next);
			}
			}

			/**
			* 判断是否跳过当前字符
			* @param c 待检测字符
			* @return true:需要跳过 false:不需要跳过
			*/
			private static boolean skip(char c){
			for(char c1:skip){
			if(c1 == c) return true;
			}
			return false;
			}
			}

New file
			@@ -0,0 +1,19 @@
			package com.qxueyou.scc.Sensitive;

			public class Word implements Comparable<Word>{
			public char c;
			public List next = null;

			public Word(char c){
			this.c = c;
			}

			@Override
			public int compareTo(Word word) {
			return c - word.c;
			}

			public String toString(){
			return c + "(" + (next == null ? null : next.size()) + ")";
			}
			}

			@@ -1,6 +1,7 @@

			package com.qxueyou.scc.operation.topic.action;

			import com.qxueyou.scc.Sensitive.SensitiveWordFilter;
			import org.apache.commons.lang3.StringUtils;
			import org.springframework.beans.factory.annotation.Autowired;
			import org.springframework.web.bind.annotation.*;
			@@ -354,7 +355,9 @@
			@GetMapping(value = "addComment")
			public Result addComment(String content, String commentObjectId, String commentParentId, String commentedId,
			String commentedName) {
			return topicService.addComment(content, commentObjectId, commentParentId, commentedId, commentedName);
			SensitiveWordFilter.loadWordFromFile("D:\\工作\\SensitiveWord\\SensitiveWordList.txt");
			String filter = SensitiveWordFilter.Filter(content);
			return topicService.addComment(filter, commentObjectId, commentParentId, commentedId, commentedName);
			}

			/**

			@@ -1,6 +1,5 @@
			package com.qxueyou.scc.wx.service.impl;

			import cn.hutool.http.HttpUtil;
			import com.alibaba.druid.util.HttpClientUtils;
			import com.alibaba.druid.util.StringUtils;
			import com.alibaba.fastjson.JSON;