/** * * APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川, * yang-shangchuan@qq.com * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see <http://www.gnu.org/licenses/>. * */ package org.apdplat.superword.tools; import org.apdplat.superword.model.Word; import org.apdplat.superword.rule.CompoundWord; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.*; import java.nio.file.attribute.BasicFileAttributes; import java.util.*; import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; /** * 句子摘要 * * @author 杨尚川 */ public class Summary { private static final Logger LOGGER = LoggerFactory.getLogger(Summary.class); public static String summaryForPreciousWords(int topN, String path, String... dics) { return summaryForPreciousWords(0, topN, path, dics); } public static String summaryForPreciousWords(int from, int to, String path, String... dics) { //摘要的词 List<String> words = preciousWords(path, dics); if(to > words.size()){ to = words.size(); } if(from < 0){ from = 0; } words = words.subList(from, to); Map<String, Map<String, List<String>>> data = findEvidence(Paths.get(path), words, 10, 1); String html = toHtmlFragment(data, from); LOGGER.info(words.toString()); return html; } public static List<String> preciousWords(String path, String... dics) { //词典 Set<Word> dic = WordSources.get(dics); return preciousWords(path, dic).stream().map(e -> e.getKey()).collect(Collectors.toList()); } public static List<Map.Entry<String, AtomicInteger>> preciousWords(String path, Set<Word> dic) { //获取目录下的所有文件列表 或 文件本身 Set<String> fileNames = TextAnalyzer.getFileNames(path); //词频统计 Map<String, AtomicInteger> frequency = TextAnalyzer.frequency(fileNames); Map<String, AtomicInteger> unknown = new HashMap<>(); LOGGER.debug("需要检查单词个数:" + frequency.keySet().size()); frequency .keySet() .forEach(key -> { if (!dic.contains(new Word(key.toLowerCase(), ""))) { unknown.put(key, frequency.get(key)); } }); LOGGER.debug("未知的单词个数:" + unknown.size()); AtomicInteger i = new AtomicInteger(); List<Map.Entry<String, AtomicInteger>> result = unknown .entrySet() .stream() .filter(entry -> entry.getKey().length() > 2) .sorted((a, b) -> b.getValue().get() - a.getValue().get()) .collect(Collectors.toList()); return result; } public static String toHtmlFragment(Map<String, Map<String, List<String>>> data) { return toHtmlFragment(data, 0); } public static String toHtmlFragment(Map<String, Map<String, List<String>>> data, int base) { Set<String> books = new HashSet<>(); StringBuilder html = new StringBuilder(); AtomicInteger i = new AtomicInteger(base); data.keySet() .stream() .forEach(word -> { if (data.get(word).isEmpty()) { System.err.println("词:" + word + "没有找到匹配文本"); return; } StringBuilder p = new StringBuilder(); for (char c : word.toCharArray()) { p.append("[") .append(Character.toUpperCase(c)) .append(Character.toLowerCase(c)) .append("]{1}"); } Pattern pattern = Pattern.compile(p.toString()); html.append("<h1>") .append(i.incrementAndGet()) .append("、单词 ") .append(WordLinker.toLink(word)) .append(" 的匹配文本:</h1><br/>\n"); html.append("<ol>\n"); String emPre = "<span style=\"color:red\">"; String emSuf = "</span>"; data.get(word) .entrySet() .forEach(entry -> { books.add(entry.getKey()); String book = " <u><i>" + entry.getKey() + "</i></u>"; entry.getValue() .forEach(t -> { Set<String> targets = new HashSet<>(); Matcher matcher = pattern.matcher(t); while(matcher.find()){ String target = matcher.group(); targets.add(target); } for(String target : targets){ t = t.replaceAll(target, emPre+target+emSuf); } html.append("\t<li>") .append(t) .append(book) .append("</li><br/>\n"); }); }); html.append("</ol><br/>\n"); }); html.append("涉及文献数目:").append(books.size()).append("<br/>\n"); AtomicInteger j = new AtomicInteger(); books.stream().sorted().forEach(b -> html.append("\t").append(j.incrementAndGet()).append("、").append(b).append("<br/>\n")); return html.toString(); } /** * @param dir PDF文档解析之后形成的文本文档所在目录 * @param words 待处理词列表 * @param totalLimitForWord 一个词最多需要多少个句子 * @param bookLimitForBook 一本书里面最多取多少个句子 * @return */ public static Map<String, Map<String, List<String>>> findEvidence(Path dir, List<String> words, int totalLimitForWord, int bookLimitForBook) { LOGGER.info("处理目录:" + dir); Map<String, Map<String, List<String>>> data = new LinkedHashMap<>(); Map<String, AtomicInteger> wordInOneBookCollectCount = new HashMap<>(); Map<String, AtomicInteger> wordInAllBookCollectCount = new HashMap<>(); Set<Integer> hashes =new HashSet<>(); try { Files.walkFileTree(dir, new SimpleFileVisitor<Path>() { @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { String fileName = file.toFile().getAbsolutePath(); if (file.toFile().getName().startsWith(".")) { return FileVisitResult.CONTINUE; } if (!fileName.endsWith(".txt")) { LOGGER.info("放弃处理非txt文件:" + fileName); return FileVisitResult.CONTINUE; } LOGGER.info("处理文件:" + fileName); List<String> lines = Files.readAllLines(file); String book = file.toFile().getName().replace(".txt", ""); lines.forEach(line -> { //忽略重复出现的句子 if(hashes.contains(line.hashCode())){ return; } final List<String> wordSet = TextAnalyzer.seg(line); words .forEach(word -> { String wordBook = word + "_" + book; wordInOneBookCollectCount.putIfAbsent(wordBook, new AtomicInteger()); wordInAllBookCollectCount.putIfAbsent(word, new AtomicInteger()); data.putIfAbsent(word, new HashMap<>()); if (wordSet.contains(word) && wordInOneBookCollectCount.get(wordBook).get() < bookLimitForBook && wordInAllBookCollectCount.get(word).get() < totalLimitForWord) { wordInOneBookCollectCount.get(wordBook).incrementAndGet(); wordInAllBookCollectCount.get(word).incrementAndGet(); data.get(word).putIfAbsent(book, new ArrayList<>()); data.get(word).get(book).add(line); hashes.add(line.hashCode()); } }); }); return FileVisitResult.CONTINUE; } }); } catch (IOException e) { e.printStackTrace(); } return data; } public static String summary() { List<String> words = Arrays.asList("categorically", "misleadingly", "weightings", "uniques", "alphanumerics", "misspell", "conducive", "dissection", "marvel", "graciously", "inspections", "appetite", "visualizations", "commonalities", "dissecting", "fidelity", "creativity", "coyote", "reaction"); Collections.sort(words); Map<String, Map<String, List<String>>> data = findEvidence(Paths.get("src/main/resources/it"), words, 100, 10); String html = toHtmlFragment(data); return html; } public static String summary(int totalLimitForWord, int bookLimitForBook, String... word) { List<String> words = Arrays.asList(word); Map<String, Map<String, List<String>>> data = findEvidence(Paths.get("src/main/resources/it"), words, totalLimitForWord, bookLimitForBook); String html = toHtmlFragment(data); return html; } public static void main(String[] args) throws Exception { //String html = summary(); //String html = summaryForPreciousWords(275, "src/main/resources/it", "/words.txt", "/words_extra.txt", "/words_gre.txt"); //String html = summaryForPreciousWords(275, 550, "src/main/resources/it", "/words.txt", "/words_extra.txt", "/words_gre.txt"); //String html = summaryForPreciousWords(550, 800, "src/main/resources/it", "/words.txt", "/words_extra.txt", "/words_gre.txt"); //String html = summaryForPreciousWords(800, 1100, "src/main/resources/it", "/words.txt", "/words_extra.txt", "/words_gre.txt"); //String html = summaryForPreciousWords(1100, 1400, "src/main/resources/it", "/words.txt", "/words_extra.txt", "/words_gre.txt"); //String html = summaryForPreciousWords(1400, 1700, "src/main/resources/it", "/words.txt", "/words_extra.txt", "/words_gre.txt"); //String html = summaryForPreciousWords(1700, 2000, "src/main/resources/it", "/words.txt", "/words_extra.txt", "/words_gre.txt"); //String html = summaryForPreciousWords(2000, "src/main/resources/it", "/words.txt", "/words_extra.txt", "/words_gre.txt"); //String html = summary(Integer.MAX_VALUE, Integer.MAX_VALUE, "apache"); List<Map.Entry<String, AtomicInteger>> words = preciousWords("src/main/resources/it", WordSources.getAll()); StringBuilder w = new StringBuilder(); AtomicInteger i = new AtomicInteger(); words.forEach(e -> { if(i.get()<=2000){ w.append(i.incrementAndGet()+"\t"+e.getKey()+"\t"+e.getValue()); } }); Files.write(Paths.get("src/main/resources/words_it.txt"), w.toString().getBytes("utf-8")); //Files.write(Paths.get("target/summary.txt"), html.getBytes("utf-8")); } }