/** * * APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川, * yang-shangchuan@qq.com * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see <http://www.gnu.org/licenses/>. * */ package org.apdplat.superword.tools; import org.apache.commons.lang.StringUtils; import org.apdplat.superword.model.Word; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.*; import java.nio.file.*; import java.nio.file.attribute.BasicFileAttributes; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Pattern; import java.util.stream.Collectors; /** * 文本分析工具 * * @author 杨尚川 */ public class TextAnalyzer { private TextAnalyzer() { } private static final Pattern PATTERN = Pattern.compile("\\d+"); private static final Pattern UNICODE = Pattern.compile("[uU][0-9a-fA-F]{4}"); private static final Logger LOGGER = LoggerFactory.getLogger(TextAnalyzer.class); /** * @param files 文件相对路径或绝对路径 * @return 词频统计数据 */ public static Map<String, AtomicInteger> frequency(Collection<String> files) { Map<String, AtomicInteger> map = new ConcurrentHashMap<>(); files.forEach(file -> { LOGGER.info("parse text file: " + file); //统计词频 Map<String, AtomicInteger> data = frequency(file); //合并结果 data.entrySet().forEach(entry -> { map.putIfAbsent(entry.getKey(), new AtomicInteger()); map.get(entry.getKey()).addAndGet(entry.getValue().get()); }); data.clear(); }); LOGGER.info("total unique words count: " + map.size()); return map; } public static Map<String, AtomicInteger> frequency(String file) { try{ return frequency(new FileInputStream(file)); }catch (IOException e){ e.printStackTrace(); } return Collections.emptyMap(); } public static Map<String, AtomicInteger> frequency(InputStream inputStream) { Map<String, AtomicInteger> map = new ConcurrentHashMap<>(); try (BufferedReader reader = new BufferedReader( new InputStreamReader( new BufferedInputStream( inputStream)))) { String line = null; while ((line = reader.readLine()) != null) { if (StringUtils.isBlank(line)) { continue; } List<String> words = seg(line); words.forEach(word -> { map.putIfAbsent(word, new AtomicInteger()); map.get(word).incrementAndGet(); }); words.clear(); } } catch (IOException ex) { ex.printStackTrace(); } LOGGER.info("unique words count: " + map.size()); return map; } /** * 分词 * @param sentence * @return */ public static List<String> seg(String sentence) { List<String> data = new ArrayList<>(); //以非字母字符切分行 String[] words = sentence.trim().split("[^a-zA-Z0-9]"); StringBuilder log = new StringBuilder(); if (LOGGER.isDebugEnabled()) { LOGGER.debug("句子:" + sentence); } for (String word : words) { if (StringUtils.isBlank(word) || word.length()<2) { continue; } List<String> list = new ArrayList<>(); //转换为全部小写 if (word.length() < 6 //PostgreSQL等 || (Character.isUpperCase(word.charAt(word.length()-1)) && Character.isUpperCase(word.charAt(0))) //P2P,Neo4j等 || PATTERN.matcher(word).find() || StringUtils.isAllUpperCase(word)) { word = word.toLowerCase(); } //按照大写字母进行单词拆分 int last = 0; for (int i = 1; i < word.length(); i++) { if (Character.isUpperCase(word.charAt(i)) && Character.isLowerCase(word.charAt(i - 1))) { list.add(word.substring(last, i)); last = i; } } if (last < word.length()) { list.add(word.substring(last, word.length())); } list.stream() .map(w -> w.toLowerCase()) .forEach(w -> { if (w.length() < 2) { return; } w = irregularity(w); if(StringUtils.isNotBlank(w)) { data.add(w); if (LOGGER.isDebugEnabled()) { log.append(w).append(" "); } } }); } LOGGER.debug("分词:" + log); return data; } /** * 处理分词意外,即无规则情况 * @param word * @return */ private static String irregularity(String word){ if(Character.isDigit(word.charAt(0))){ LOGGER.debug("词以数字开头,忽略:"+word); return null; } if(word.startsWith("0x") || word.startsWith("0X")){ LOGGER.debug("词为16进制,忽略:"+word); return null; } if(word.endsWith("l") && StringUtils.isNumeric(word.substring(0, word.length()-1))){ LOGGER.debug("词为long类型数字,忽略:"+word); return null; } if(UNICODE.matcher(word).find()){ LOGGER.debug("词为UNICODE字符编码,忽略:"+word); return null; } switch (word){ //I’ll do it. You'll see. case "ll": return "will"; //If you’re already building applications using Spring. case "re": return "are"; //package com.manning.sdmia.ch04; case "ch": return "chapter"; //you find you’ve made a case "ve": return "have"; //but it doesn’t stop there. case "doesn": return "does"; //but it isn’t enough. case "isn": return "is"; //<input type="text" name="firstName" /><br/> case "br": return null; } return word; } /** * 将 {词 : 词频} 逆转过来为{词频 : 词数,前10个词} * @param data 词频统计结果 * @return 词频分布统计 */ public static Map<Integer, Stat> distribute(Map<String, AtomicInteger> data) { Map<Integer, Stat> stat = new HashMap<>(); data.entrySet() .forEach(entry -> { Integer key = entry.getValue().get(); stat.putIfAbsent(key, new Stat()); stat.get(key).increment(); stat.get(key).addWords(entry.getKey()); }); return stat; } /** * 解析目录或文件 * @param path */ public static void parse(String path) { //获取目录下的所有文件列表 或 文件本身 Set<String> fileNames = getFileNames(path); //词频统计 Map<String, AtomicInteger> data = frequency(fileNames); //渲染结果 String htmlFragment = HtmlFormatter.toHtmlFragmentForText(data, fileNames); try{ //保存结果 String resultFile = "target/words_" + Paths.get(path).toFile().getName().replace(".txt", "") + ".txt"; Files.write(Paths.get(resultFile), htmlFragment.getBytes("utf-8")); LOGGER.info("统计结果输出到文件:" + resultFile); }catch (IOException e){ e.printStackTrace(); } } public static Set<String> getFileNames(String path){ Set<String> fileNames = new HashSet<>(); if(Files.isDirectory(Paths.get(path))) { LOGGER.info("处理目录:" + path); }else{ LOGGER.info("处理文件:" + path); fileNames.add(path); return fileNames; } try { Files.walkFileTree(Paths.get(path), new SimpleFileVisitor<Path>() { @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { if (file.toFile().getName().startsWith(".")) { return FileVisitResult.CONTINUE; } String fileName = file.toFile().getAbsolutePath(); if (!fileName.endsWith(".txt")) { LOGGER.info("放弃处理非txt文件:" + fileName); return FileVisitResult.CONTINUE; } fileNames.add(fileName); return FileVisitResult.CONTINUE; } }); }catch (IOException e){ e.printStackTrace(); } return fileNames; } /** * * @param path 待分析的文本路径,目录或文件的绝对路径 * @param limit 句子限制 * @param isTopN 是否是分值最高,反之为分值最低 */ public static TreeMap<Float, String> sentence(String path, int limit, boolean isTopN) { //获取目录下的所有文件列表 或 文件本身 Set<String> fileNames = getFileNames(path); //词频统计 Map<String, AtomicInteger> frequency = frequency(fileNames); //有序 TreeMap<Float, String> sentences = new TreeMap<>(); //句子评分 int count = 0; for(String fileName : fileNames) { try (BufferedReader reader = new BufferedReader( new InputStreamReader( new BufferedInputStream( new FileInputStream(fileName))))) { String line = null; while ((line = reader.readLine()) != null) { if (StringUtils.isBlank(line)) { continue; } //计算分值 float score = 0; List<String> words = seg(line); for(String word : words){ AtomicInteger fre = frequency.get(word); if(fre == null || fre.get() == 0){ LOGGER.error("评分句子没有词频信息:" + line); score = 0; break; } score += 1/(float)fre.get(); } words.clear(); if(score > 0) { //保存句子 if(sentences.get(score) != null){ continue; } sentences.put(score, line + " <u><i>" + Paths.get(fileName).toFile().getName().replace(".txt", "") + "</i></u>"); count++; if(count >= limit) { if(isTopN){ //删除分值最低的 sentences.pollFirstEntry(); }else{ //删除分值最高的 sentences.pollLastEntry(); } } } } } catch (IOException ex) { LOGGER.error("句子评分出错", ex); } } return sentences; } /** * 将文本解析为词典 * @param textPath * @param dicPath */ public static void toDic(String textPath, String dicPath){ Map<String, AtomicInteger> data = frequency(getFileNames(textPath)); List<String> words = data .entrySet() .stream() .filter(w -> StringUtils.isAlpha(w.getKey()) && w.getKey().length() < 12) .sorted((a, b) -> b.getValue().get() - a.getValue().get()) .map(e -> e.getValue()+"\t"+e.getKey()) .collect(Collectors.toList()); try { Files.write(Paths.get(dicPath), words); } catch (IOException e) { LOGGER.error("保存词典文件出错", e); } } /** * CET4、CET6、GRE、IELTS、TOEFL、考研英语的词汇 * 有哪些出现在了指定文本中 * @param textPath * @return */ public static String importantWords(String textPath){ Set<Word> wordSet = WordSources.get("/word_CET4.txt", "/word_CET6.txt", "/word_GRE.txt", "/word_IELTS.txt", "/word_TOEFL.txt", "/word_KY.txt"); Map<Word, AtomicInteger> data = WordSources.convert( frequency( getFileNames(textPath))); Set<Map.Entry<Word, AtomicInteger>> entries = data.entrySet() .stream() .filter(entry -> wordSet.contains(entry.getKey())) .collect(Collectors.toSet()); return HtmlFormatter.toHtmlTableFragment(entries, 5); } public static void main(String[] args) throws Exception { //parse("src/main/resources/it/spring/Spring in Action 4th Edition.txt"); //parse("src/main/resources/it/spring"); //parse("src/main/resources/it"); //toDic("src/main/resources/it", "src/main/resources/word_it.txt"); System.out.print(importantWords("src/main/resources/it")); } public static class Stat { private AtomicInteger count = new AtomicInteger(); private List<String> words = new ArrayList<>(); public int count() { return count.get(); } public void increment() { count.incrementAndGet(); } public List<String> getWords() { return words; } public void addWords(String word) { if (this.words.size() < 11) { this.words.add(word); } } } }