/*
* APDPlat - Application Product Development Platform
* Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.apdplat.superword.tools;

import org.apdplat.superword.model.Word;
import org.apdplat.superword.tools.WordLinker.Dictionary;
import org.apdplat.word.WordSegmenter;
import org.apdplat.word.segmentation.SegmentationAlgorithm;

import java.io.IOException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
/**
* 辅助阅读:
* 以电影功夫熊猫使用的单词分析为例
* 你英语四级过了吗? 功夫熊猫看了吗?
* 去除停用词后,功夫熊猫使用了789个英语单词,你会说很简单吧,别急,这些单词中仍然有148个单词不在四级词汇表中,花两分钟时间看看你是否认识这些单词.
* Created by ysc on 11/15/15.
*/
public class AidReading {

    // Contracted forms (don't / can`t / it’s) are dropped from the text before
    // segmentation. Compiled once here instead of being recompiled by
    // String.replaceAll for every single line.
    private static final Pattern APOSTROPHE_CONTRACTION =
            Pattern.compile("[^a-zA-Z0-9]*[a-zA-Z0-9]+'[a-zA-Z0-9]+[^a-zA-Z0-9]*");
    private static final Pattern BACKQUOTE_CONTRACTION =
            Pattern.compile("[^a-zA-Z0-9]*[a-zA-Z0-9]+`[a-zA-Z0-9]+[^a-zA-Z0-9]*");
    private static final Pattern CURLY_QUOTE_CONTRACTION =
            Pattern.compile("[^a-zA-Z0-9]*[a-zA-Z0-9]+’[a-zA-Z0-9]+[^a-zA-Z0-9]*");

    /**
     * Demo entry point: analyses the vocabulary of the movie "Kung Fu Panda"
     * against the CET-4 word list and prints the generated HTML fragment.
     *
     * @throws IOException if a text resource cannot be read
     */
    public static void main(String[] args) throws IOException {
        WordLinker.serverRedirect = null;
        String result = analyse(WordSources.get("/word_CET4.txt"), Dictionary.ICIBA, 6, "/it/movie/kungfupanda.txt");
        //String result = analyse(WordSources.get("/word_CET4.txt"), Dictionary.ICIBA, 6, "/it/movie/kungfupanda.txt", "/it/movie/kungfupanda2.txt");
        /*
        String url = "http://spark.apache.org/docs/latest/streaming-programming-guide.html";
        String text = Jsoup.parse(new URL(url), 60000).text();
        System.out.println(text);
        String result = analyse(WordSources.get("/word_CET4.txt"), Dictionary.ICIBA, 6, false, null, Arrays.asList(text));
        */
        System.out.println(result);
    }

    /** Convenience overload: ICIBA dictionary, no original-text search. */
    public static String analyse(Set<Word> words, int column, String... resources) {
        return analyse(words, Dictionary.ICIBA, column, resources);
    }

    /** Convenience overload: no original-text search. */
    public static String analyse(Set<Word> words, Dictionary dictionary, int column, String... resources) {
        return analyse(words, dictionary, column, false, null, resources);
    }

    /** Convenience overload: ICIBA dictionary, reading text from classpath resources. */
    public static String analyse(Set<Word> words, int column, boolean searchOriginalText, String book, String... resources) {
        return analyse(words, Dictionary.ICIBA, column, searchOriginalText, book, resources);
    }

    /** Loads the given classpath resources and analyses their concatenated lines. */
    public static String analyse(Set<Word> words, Dictionary dictionary, int column, boolean searchOriginalText, String book, String... resources) {
        List<String> text = new ArrayList<>();
        for (String resource : resources) {
            text.addAll(FileUtils.readResource(resource));
        }
        return analyse(words, dictionary, column, searchOriginalText, book, text);
    }

    /** Convenience overload: ICIBA dictionary, pre-loaded lines of text. */
    public static String analyse(Set<Word> words, int column, boolean searchOriginalText, String book, List<String> text) {
        return analyse(words, Dictionary.ICIBA, column, searchOriginalText, book, text);
    }

    /**
     * Segments the text, counts word frequencies, and renders two HTML tables:
     * the words NOT covered by {@code words} (e.g. beyond CET-4), then the words
     * that are covered. Each word is rendered as a link to an online dictionary,
     * sorted by descending frequency.
     *
     * @param words              the known vocabulary (e.g. the CET-4 word list)
     * @param dictionary         online dictionary used for the word links
     * @param column             number of columns in the generated HTML tables
     * @param searchOriginalText when true, each frequency count links to a page
     *                           that searches the word in the book's original text
     * @param book               book identifier used by the original-text search
     *                           link (may be null when searchOriginalText is false)
     * @param text               lines of the text to analyse
     * @return an HTML fragment containing both tables
     */
    public static String analyse(Set<Word> words, Dictionary dictionary, int column, boolean searchOriginalText, String book, List<String> text) {
        Set<String> wordSet = new HashSet<>();
        words.forEach(word -> wordSet.add(word.getWord().toLowerCase()));
        StringBuilder result = new StringBuilder();

        // 1. Tokenize and count normalised word frequencies.
        Map<String, AtomicInteger> map = new ConcurrentHashMap<>();
        text.forEach(line -> {
            // Contracted forms are removed entirely rather than segmented.
            line = APOSTROPHE_CONTRACTION.matcher(line).replaceAll(" ");
            line = BACKQUOTE_CONTRACTION.matcher(line).replaceAll(" ");
            line = CURLY_QUOTE_CONTRACTION.matcher(line).replaceAll(" ");
            for (org.apdplat.word.segmentation.Word term : WordSegmenter.seg(line, SegmentationAlgorithm.PureEnglish)) {
                String normalized = normalize(term.getText());
                if (normalized == null) {
                    continue;
                }
                // single lookup; the original putIfAbsent + get did two
                map.computeIfAbsent(normalized, k -> new AtomicInteger()).incrementAndGet();
            }
        });

        // 2. Split the counted words into "unknown" (rendered immediately into
        //    list) and "known" (collected into map2, rendered afterwards).
        List<String> list = new ArrayList<>();
        Map<String, AtomicInteger> map2 = new ConcurrentHashMap<>();
        // Integer.compare avoids the overflow risk of the original "b - a" comparator.
        map.entrySet().stream()
                .sorted((a, b) -> Integer.compare(b.getValue().get(), a.getValue().get()))
                .forEach(entry -> {
                    AtomicInteger count = entry.getValue();
                    String w = entry.getKey().toLowerCase();
                    if (w.length() < 3) {
                        return;
                    }
                    if (wordSet.contains(w)) {
                        map2.put(w, count);
                        return;
                    }
                    String related = relateToKnownWords(w, wordSet);
                    if (related.length() > w.length()) {
                        // at least one inflection rule recovered a known base form
                        map2.put(related, count);
                        return;
                    }
                    list.add(WordLinker.toLink(entry.getKey(), dictionary)
                            + occurrenceLabel(searchOriginalText, book, dictionary, entry.getKey(), count));
                });
        result.append("<h3>words don't occur in specified set: (").append(list.size()).append(") </h3>\n");
        result.append(HtmlFormatter.toHtmlTableFragment(list, column));
        list.clear();

        // 3. Render the known words (keys may be compound "word_base1_base2" strings).
        map2.entrySet().stream()
                .sorted((a, b) -> Integer.compare(b.getValue().get(), a.getValue().get()))
                .forEach(entry -> {
                    // NOTE(review): for compound keys like "running_run" the search link
                    // receives the whole key, not the surface word — confirm that
                    // book-aid-reading-detail.jsp expects this before changing it.
                    String label = occurrenceLabel(searchOriginalText, book, dictionary, entry.getKey(), entry.getValue());
                    StringBuilder link = new StringBuilder();
                    for (String word : entry.getKey().split("_")) {
                        link.append(WordLinker.toLink(word, dictionary)).append(" | ");
                    }
                    link.setLength(link.length() - 3); // drop the trailing " | "
                    list.add(link + label);
                });
        result.append("<h3>words occur in specified set: (").append(list.size()).append(") </h3>\n");
        result.append(HtmlFormatter.toHtmlTableFragment(list, column));
        return result.toString();
    }

    /**
     * Strips non-alphabetic characters, lower-cases the token, and reduces
     * irregular verbs and irregular plurals to their base form.
     *
     * @param token a raw token produced by the segmenter
     * @return the normalised word, or null when the token should be ignored
     *         (contains an apostrophe, or the normalised form is shorter than
     *         2 or longer than 14 characters)
     */
    private static String normalize(String token) {
        if (token.contains("'")) {
            return null;
        }
        StringBuilder letters = new StringBuilder(token.length());
        for (char c : token.toCharArray()) {
            if (Character.isAlphabetic(c)) {
                letters.append(Character.toLowerCase(c));
            }
        }
        String base = IrregularVerbs.getBaseForm(letters.toString());
        String singular = IrregularPlurals.getSingular(base);
        if (singular.length() < 2 || singular.length() > 14) {
            return null;
        }
        return singular;
    }

    /**
     * Tries to relate an unknown word to the known-word set by stripping common
     * inflection suffixes (plural, past tense, -ing, comparative, superlative,
     * adverbial -ly, f/ves alternation). Every rule that matches appends
     * "_" + the recovered base form to the result.
     *
     * @param w       lower-case word not found verbatim in the known set
     * @param wordSet lower-case known words
     * @return {@code w} itself when nothing matched, otherwise {@code w}
     *         followed by one or more "_base" segments
     */
    private static String relateToKnownWords(String w, Set<String> wordSet) {
        int n = w.length();
        StringBuilder str = new StringBuilder(w);
        // The rules are independent ifs on purpose: a single word may append
        // several candidate base forms (e.g. both the "s" and the "es" rule can fire).
        if (w.endsWith("ly") && wordSet.contains(w.substring(0, n - 2))) {
            str.append('_').append(w, 0, n - 2);
        }
        if (w.endsWith("s") && wordSet.contains(w.substring(0, n - 1))) {
            str.append('_').append(w, 0, n - 1);
        }
        if (w.endsWith("es") && wordSet.contains(w.substring(0, n - 2))) {
            str.append('_').append(w, 0, n - 2);
        }
        if (w.endsWith("ies") && wordSet.contains(w.substring(0, n - 3) + "y")) {
            str.append('_').append(w, 0, n - 3).append('y');
        }
        if (w.endsWith("ed") && wordSet.contains(w.substring(0, n - 1))) {
            str.append('_').append(w, 0, n - 1);
        }
        if (w.endsWith("ed") && wordSet.contains(w.substring(0, n - 2))) {
            str.append('_').append(w, 0, n - 2);
        }
        // doubled final consonant: stopped -> stop
        if (w.endsWith("ed") && n > 5 && wordSet.contains(w.substring(0, n - 3)) && w.charAt(n - 3) == w.charAt(n - 4)) {
            str.append('_').append(w, 0, n - 3);
        }
        if (w.endsWith("ied") && wordSet.contains(w.substring(0, n - 3) + "y")) {
            str.append('_').append(w, 0, n - 3).append('y');
        }
        if (w.endsWith("ing") && wordSet.contains(w.substring(0, n - 3) + "e")) {
            str.append('_').append(w, 0, n - 3).append('e');
        }
        // doubled final consonant: running -> run
        if (w.endsWith("ing") && n > 6 && wordSet.contains(w.substring(0, n - 4)) && w.charAt(n - 4) == w.charAt(n - 5)) {
            str.append('_').append(w, 0, n - 4);
        }
        if (w.endsWith("ing") && wordSet.contains(w.substring(0, n - 3))) {
            str.append('_').append(w, 0, n - 3);
        }
        if (w.endsWith("er") && wordSet.contains(w.substring(0, n - 1))) {
            str.append('_').append(w, 0, n - 1);
        }
        if (w.endsWith("er") && wordSet.contains(w.substring(0, n - 2))) {
            str.append('_').append(w, 0, n - 2);
        }
        // doubled final consonant: bigger -> big
        if (w.endsWith("er") && n > 5 && wordSet.contains(w.substring(0, n - 3)) && w.charAt(n - 3) == w.charAt(n - 4)) {
            str.append('_').append(w, 0, n - 3);
        }
        if (w.endsWith("est") && wordSet.contains(w.substring(0, n - 2))) {
            str.append('_').append(w, 0, n - 2);
        }
        if (w.endsWith("est") && wordSet.contains(w.substring(0, n - 3))) {
            str.append('_').append(w, 0, n - 3);
        }
        // doubled final consonant: biggest -> big
        if (w.endsWith("est") && n > 6 && wordSet.contains(w.substring(0, n - 4)) && w.charAt(n - 4) == w.charAt(n - 5)) {
            str.append('_').append(w, 0, n - 4);
        }
        if (w.endsWith("ier") && wordSet.contains(w.substring(0, n - 3) + "y")) {
            str.append('_').append(w, 0, n - 3).append('y');
        }
        if (w.endsWith("iest") && wordSet.contains(w.substring(0, n - 4) + "y")) {
            str.append('_').append(w, 0, n - 4).append('y');
        }
        if (w.endsWith("ves") && wordSet.contains(w.substring(0, n - 3) + "f")) {
            str.append('_').append(w, 0, n - 3).append('f');
        }
        return str.toString();
    }

    /**
     * Renders the "[count]" label shown behind a word; when original-text search
     * is enabled the label is a link into the book search page.
     * Fix: the original hard-coded {@code dict=ICIBA} in the URL even though a
     * different dictionary may have been selected; the actual parameter is used now.
     */
    private static String occurrenceLabel(boolean searchOriginalText, String book, Dictionary dictionary, String word, AtomicInteger count) {
        if (!searchOriginalText) {
            return "\t[" + count + "]";
        }
        return "\t<a target=\"_blank\" href=\"book-aid-reading-detail.jsp?book=" + book
                + "&word=" + word + "&dict=" + dictionary + "&pageSize=" + count
                + "\">[" + count + "]</a>";
    }
}