/**
 * Created by jpbirdy on 15-5-19.
 */
package jpbirdy.detection;

import jpbirdy.segment.Segmenter;

import java.io.*;
import java.util.ArrayList;
import java.util.List;

/**
 * @author jialou.jp
 * @project Segmentation
 * @class StopWord
 * @date 15-5-19 11:13
 * @desc Static helpers for filtering candidate words: a stop-word check backed by the
 *       classpath resource {@code main/resources/stops.txt}, and a check for characters
 *       outside the ordinary Basic Multilingual Plane range (used as an emoji heuristic).
 */
public class StopWord {
    /** Stop words loaded once by {@link #loadStopWords()}; always contains a single space. */
    private static List<String> stopWords;

    /**
     * Segmenter used by {@link #hasStopWord(String)}. It is {@code null} until a caller
     * assigns it; {@code hasStopWord} dereferences it without a null check.
     */
    public static Segmenter seg = null;

    static {
        try {
            loadStopWords();
        } catch (IOException e) {
            // Best effort: the class stays usable with whatever was loaded before the failure.
            e.printStackTrace();
        }
    }

    /**
     * Loads the stop-word list from the classpath resource {@code main/resources/stops.txt}
     * (read as UTF-8, one stop word per line). Does nothing if the list was already created.
     * When the resource is missing, an error is printed and the list holds only {@code " "}.
     *
     * @throws IOException if reading or closing the resource fails
     */
    private static void loadStopWords() throws IOException {
        if (stopWords != null) {
            return;
        }
        stopWords = new ArrayList<String>();
        stopWords.add(" ");

        InputStream stopFile = StopWord.class.getClassLoader().getResourceAsStream("main/resources/stops.txt");
        if (stopFile == null) {
            System.err.println("停词文件不存在!");
            return;
        }

        BufferedReader br = new BufferedReader(new InputStreamReader(stopFile, "UTF-8"));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                // An empty stop word would match every input, because
                // String.contains("") is always true; skip blank lines in the file.
                if (line.isEmpty()) {
                    continue;
                }
                stopWords.add(line);
            }
        } finally {
            // Closing the reader also closes the underlying resource stream.
            br.close();
        }
        System.out.println("停词加载成功!");
    }

    /**
     * Decides whether a candidate word should be rejected.
     *
     * <p>Returns {@code true} when any of the following holds:
     * <ul>
     *   <li>no character of the word is accepted by {@link #isChinese(char)};</li>
     *   <li>the word contains any loaded stop word as a substring;</li>
     *   <li>the result of {@code seg.segment(word)} has at most one element.</li>
     * </ul>
     *
     * @param word the candidate word; must not be {@code null}
     * @return {@code true} if the word is rejected by one of the rules above
     * @throws NullPointerException if {@code word} is {@code null}, or if {@link #seg}
     *         has not been assigned and the segmentation rule is reached
     */
    public static boolean hasStopWord(String word) {
        boolean hasChinese = false;
        for (int i = 0, len = word.length(); i < len; i++) {
            if (isChinese(word.charAt(i))) {
                hasChinese = true;
                break;
            }
        }
        if (!hasChinese) {
            return true;
        }

        for (String stop : stopWords) {
            if (word.contains(stop)) {
                return true;
            }
        }

        return seg.segment(word).size() <= 1;
    }

    /**
     * Tells whether a character belongs to one of the accepted Unicode blocks: the CJK
     * ideograph blocks (unified, compatibility, extension A, extension B) as well as CJK
     * symbols and punctuation, half-width/full-width forms, and general punctuation.
     *
     * @param c the UTF-16 code unit to classify
     * @return {@code true} if {@code c} lies in one of those blocks
     */
    private static boolean isChinese(char c) {
        Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
        return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B
                || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
                || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION;
    }

    /**
     * Checks whether the string contains at least one UTF-16 code unit flagged by
     * {@link #isEmojiCharacter(char)}.
     *
     * @param source the text to scan; must not be {@code null}
     * @return {@code true} if any code unit is flagged, {@code false} otherwise
     *         (including for the empty string)
     */
    public static boolean containsEmoji(String source) {
        int len = source.length();
        for (int i = 0; i < len; i++) {
            if (isEmojiCharacter(source.charAt(i))) {
                // Reaching this point confirms that an emoji character is present.
                return true;
            }
        }
        return false;
    }

    /**
     * Flags a UTF-16 code unit that is NOT an ordinary character. Ordinary means NUL, tab,
     * line feed, carriage return, or anything in U+0020..U+D7FF or U+E000..U+FFFD. What
     * remains is the surrogate range U+D800..U+DFFF (the halves of supplementary-plane
     * characters), the other C0 control characters, and U+FFFE/U+FFFF.
     *
     * <p>The previous implementation returned the opposite result (true for ordinary
     * characters), which made {@link #containsEmoji(String)} report an emoji in almost
     * every non-empty string.
     *
     * @param codePoint the UTF-16 code unit to classify
     * @return {@code true} if the code unit lies outside the ordinary ranges
     */
    private static boolean isEmojiCharacter(char codePoint) {
        boolean ordinary = (codePoint == 0x0)
                || (codePoint == 0x9)
                || (codePoint == 0xA)
                || (codePoint == 0xD)
                || ((codePoint >= 0x20) && (codePoint <= 0xD7FF))
                || ((codePoint >= 0xE000) && (codePoint <= 0xFFFD));
        return !ordinary;
    }

    /**
     * Manual entry point that triggers stop-word loading. The static initializer has
     * already created the list by the time this runs, so the explicit call returns
     * immediately.
     *
     * @param args unused
     * @throws Exception if loading fails
     */
    public static void main(String[] args) throws Exception {
        loadStopWords();
    }
}