本文主要介绍Java敏感词过滤的实现方式,希望能为大家解决编程问题提供一定的参考价值,需要的开发者们跟着小编一起来学习吧!
《java敏感词过滤的实现方式》文章描述了如何搭建敏感词过滤系统来防御用户生成内容中的违规、广告或恶意言论,包括引入依赖、定义敏感词类、非敏感词类、替换词类和工具类等步骤,并指出资源文件应放在src/...
在论坛、聊天、评论等用户生成内容(UGC)为核心的功能中,完全依赖用户自觉是不现实的。
为了防止个别用户发布违规、广告或恶意言论,从而污染社区环境、带来法律风险或伤害其他用户,我们需要自行搭建敏感词过滤系统来防御。
1.引入依赖
<!-- 敏感词工具包 -->
<dependency>
<groupId>com.github.houbb</groupId>
<artifactId>sensitive-word</artifactId>
<version>0.21.0</version>
</dependency>

2.定义自定义敏感词类
package com.heyin.sass.portal.util.sensitive;

import com.github.houbb.sensitive.word.api.IWordDeny;
import lombok.extern.slf4j.Slf4j;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

/**
 * Supplies the project's custom sensitive (denied) words, read from the
 * classpath file {@code sensitive/sensitive_word_deny.txt}, one word per line.
 *
 * @author zenghuilin
 */
@Slf4j
public class MyWordDeny implements IWordDeny {

    /** Classpath location of the custom deny list. */
    private static final String DENY_FILE = "sensitive/sensitive_word_deny.txt";

    /**
     * Reads every line of the deny-list file as UTF-8.
     *
     * @return the lines of the file in order, or an empty list if the file
     *         cannot be read (the failure is logged, not rethrown)
     */
    @Override
    public List<String> deny() {
        Resource resource = new ClassPathResource(DENY_FILE);
        // Read through getInputStream() rather than getFile(): a classpath
        // resource packaged inside a jar cannot be resolved to a java.io.File.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(resource.getInputStream(), StandardCharsets.UTF_8))) {
            List<String> words = new ArrayList<>();
            String line;
            while ((line = reader.readLine()) != null) {
                words.add(line);
            }
            return words;
        } catch (IOException e) {
            // Pass the exception as the last argument so the stack trace is kept.
            log.error("读取敏感词文件错误!{}", e.getMessage(), e);
            return new ArrayList<>();
        }
    }
}
3.定义自定义非敏感词类
package com.heyin.sass.portal.util.sensitive;

import com.github.houbb.sensitive.word.api.IWordAllow;
import lombok.extern.slf4j.Slf4j;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

/**
 * Supplies the project's custom non-sensitive (allowed) words, read from the
 * classpath file {@code sensitive/sensitive_word_allow.txt}, one word per line.
 *
 * @author zenghuilin
 */
@Slf4j
public class MyWordAllow implements IWordAllow {

    /** Classpath location of the custom allow list. */
    private static final String ALLOW_FILE = "sensitive/sensitive_word_allow.txt";

    /**
     * Reads every line of the allow-list file as UTF-8.
     *
     * @return the lines of the file in order, or an empty list if the file
     *         cannot be read (the failure is logged, not rethrown)
     */
    @Override
    public List<String> allow() {
        Resource resource = new ClassPathResource(ALLOW_FILE);
        // Read through getInputStream() rather than getFile(): a classpath
        // resource packaged inside a jar cannot be resolved to a java.io.File.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(resource.getInputStream(), StandardCharsets.UTF_8))) {
            List<String> words = new ArrayList<>();
            String line;
            while ((line = reader.readLine()) != null) {
                words.add(line);
            }
            return words;
        } catch (IOException e) {
            // Pass the exception as the last argument so the stack trace is kept.
            log.error("读取敏感词文件错误!{}", e.getMessage(), e);
            return new ArrayList<>();
        }
    }
}
4.定义自定义替换词类
package com.heyin.sass.portal.util.sensitive;

import lombok.extern.slf4j.Slf4j;
import org.springframework.core.io.ClassPathResource;
import org.springframework.core.io.Resource;
import com.github.houbb.sensitive.word.api.IWordContext;
import com.github.houbb.sensitive.word.api.IWordReplace;
import com.github.houbb.sensitive.word.api.IWordResult;
import com.github.houbb.sensitive.word.utils.InnerWordCharUtils;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

/**
 * Replacement strategy that substitutes a configured replacement for a
 * matched sensitive word, and masks every other match with {@code '*'}.
 *
 * <p>The word-to-replacement table is loaded once, at class initialization,
 * from the classpath file {@code sensitive/sensitive_word_replace.txt}; each
 * line has the form {@code word,replacement}.
 *
 * @author zenghuilin
 */
@Slf4j
public class MyWordReplace implements IWordReplace {

    /** Sensitive word -> replacement text, filled by the static initializer. */
    private static final Map<String, String> SENSITIVE_WORD_MAP = new HashMap<>();

    static {
        // Load the file through ClassPathResource.
        Resource resource = new ClassPathResource("sensitive/sensitive_word_replace.txt");
        // try-with-resources closes the reader even when reading fails midway.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(resource.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Split each line on the comma into key and value;
                // lines that do not yield exactly two parts are skipped.
                String[] parts = line.split(",");
                if (parts.length == 2) {
                    SENSITIVE_WORD_MAP.put(parts[0], parts[1]);
                }
            }
        } catch (Exception e) {
            // Deliberately broad: a failure here must not abort class
            // initialization; the map simply stays (partially) empty.
            log.error("初始化SENSITIVE_WORD_MAP失败:{}", e.getMessage(), e);
        }
    }

    /**
     * Appends the replacement for one matched sensitive word.
     *
     * @param stringBuilder buffer the replacement text is appended to
     * @param rawChars      characters of the original text
     * @param wordResult    match result giving the start and end index of the word
     * @param wordContext   context of the current filter run (not used here)
     */
    @Override
    public void replace(StringBuilder stringBuilder, final char[] rawChars,
                        IWordResult wordResult, IWordContext wordContext) {
        String sensitiveWord = InnerWordCharUtils.getString(rawChars, wordResult);
        // Custom per-word replacement; the table could also come from a database.
        String replacement = SENSITIVE_WORD_MAP.get(sensitiveWord);
        if (replacement != null) {
            stringBuilder.append(replacement);
            return;
        }
        // Every other word is masked with one '*' per matched character.
        int wordLength = wordResult.endIndex() - wordResult.startIndex();
        for (int i = 0; i < wordLength; i++) {
            stringBuilder.append('*');
        }
    }
}
5.最后定义工具类
package com.heyin.sass.portal.util.sensitive;
import com.github.houbb.sensitive.word.api.IWordAllow;
import com.github.houbb.sensitive.word.api.IWordDeny;
import com.github.houbb.sensitive.word.api.IWordReplace;
import com.github.houbb.sensitive.word.bs.SensitiveWordBs;
import com.github.houbb.sensitive.word.support.allow.WordAllows;
import com.github.houbb.sensitive.word.support.deny.WordDenys;
import java.util.List;
/**
 * Static facade over a single shared {@link SensitiveWordBs} instance that is
 * configured once, in the static initializer, with the default word lists plus
 * the project's custom deny list, allow list and replacement strategy.
 *
 * @author zenghuilin
 */
public class SensitiveWordUtil {

    /** Shared filter instance, built in the static initializer below. */
    private static final SensitiveWordBs SENSITIVE_WORD_BS;

    static {
        // Default sensitive words + custom sensitive words.
        IWordDeny wordDeny = WordDenys.chains(WordDenys.defaults(), new MyWordDeny());
        // Default non-sensitive words + custom non-sensitive words.
        IWordAllow wordAllow = WordAllows.chains(WordAllows.defaults(), new MyWordAllow());
        // Custom replacement strategy.
        IWordReplace wordReplace = new MyWordReplace();
        SENSITIVE_WORD_BS = SensitiveWordBs.newInstance()
                // Ignore letter case.
                .ignoreCase(true)
                // Ignore half-width / full-width differences.
                .ignoreWidth(true)
                // Ignore the writing style of digits.
                .ignoreNumStyle(true)
                // Ignore the Chinese writing style (simplified / traditional).
                .ignoreChineseStyle(true)
                // Ignore the English writing style.
                .ignoreEnglishStyle(true)
                // Ignore repeated characters (disabled).
                .ignoreRepeat(false)
                // Number detection (disabled).
                .enableNumCheck(false)
                // E-mail detection (disabled).
                .enableEmailCheck(false)
                // URL detection (disabled).
                .enableUrlCheck(false)
                // Number detection with a custom length:
                // .numCheckLen(8)
                // Custom sensitive words.
                .wordDeny(wordDeny)
                // Custom non-sensitive words.
                .wordAllow(wordAllow)
                // Custom replacement strategy.
                .wordReplace(wordReplace)
                .init();
    }

    /**
     * Refreshes the cached sensitive and non-sensitive word lists by calling
     * {@code init()} again on the shared instance.
     */
    public static void refresh() {
        SENSITIVE_WORD_BS.init();
    }

    /**
     * Checks whether the text contains a sensitive word.
     *
     * @param text the text to check
     * @return {@code true} if the filter reports a sensitive word in {@code text}
     */
    public static boolean contains(String text) {
        return SENSITIVE_WORD_BS.contains(text);
    }

    /**
     * Replaces the sensitive words in the text.
     *
     * @param text the text to filter
     * @return the text as returned by the filter after replacement
     */
    public static String replace(String text) {
        return SENSITIVE_WORD_BS.replace(text);
    }

    /**
     * Returns all sensitive words found in the text.
     *
     * @param text the text to scan
     * @return the sensitive words the filter found in {@code text}
     */
    public static List<String> findAll(String text) {
        return SENSITIVE_WORD_BS.findAll(text);
    }

    /**
     * Small demo: prints the sensitive words found in a sample sentence,
     * then the sentence after replacement.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String text = "五星红旗迎风飘扬";
        System.out.println(findAll(text));
        String replace = replace(text);
        System.out.println(replace);
    }
}

6.资源文件放在src/main/resources目录下

敏感词文件和非敏感词文件,一个词一行

替换词文件,前面是敏感词,后面是想要替换的词

总结
这篇关于Java敏感词过滤的实现方式的文章就介绍到这儿,希望我们推荐的文章对开发者们有所帮助!