/**
*
* APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川,
* yang-shangchuan@qq.com
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package org.apdplat.superword.rule;
import org.apache.commons.lang.StringUtils;
import org.apdplat.superword.model.Suffix;
import org.apdplat.superword.model.Word;
import org.apdplat.superword.tools.WordLinker;
import org.apdplat.superword.tools.WordLinker.Dictionary;
import org.apdplat.superword.tools.WordSources;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
/**
* 动态后缀规则,比如规则为:ise-ize,表示单词集合中
* 有两个词分别以ise和ize结尾
* 且除了后缀外,其他部分都相同
* @author 杨尚川
*/
public class DynamicSuffixRule {
private DynamicSuffixRule(){}
private static final Set<Word> WORDS = WordSources.getAll();
public static List<Word> findBySuffix(Collection<Word> words, List<Suffix> suffixes) {
if(suffixes == null || suffixes.size() < 2){
return Arrays.asList();
}
return words
.parallelStream()
.filter(word -> {
String w = word.getWord();
String p = suffixes.get(0).getSuffix().toLowerCase();
p = p.replaceAll("-", "").replaceAll("\\s+", "");
if (!w.toLowerCase().endsWith(p)) {
return false;
}
String common = w.substring(0, w.length()-p.length());
//这里要用for,忽略第一个元素
for(int i=1; i<suffixes.size(); i++){
String s = suffixes.get(i).getSuffix().toLowerCase();
s = s.replaceAll("-", "").replaceAll("\\s+", "");
if(!words.contains(new Word(common+s, ""))){
return false;
}
}
return true;
})
.sorted()
.collect(Collectors.toList());
}
public static String toHtmlFragment(List<Word> words, List<Suffix> suffixes) {
return toHtmlFragment(words, suffixes, Dictionary.ICIBA);
}
public static String toHtmlFragment(List<Word> words, List<Suffix> suffixes, Dictionary dictionary) {
StringBuilder html = new StringBuilder();
html.append("<h4>common prefix different suffix: ");
suffixes.forEach(suffix -> {
html.append("-").append(suffix.getSuffix());
if(StringUtils.isNotBlank(suffix.getDes())){
html.append(" (")
.append(suffix.getDes())
.append(") ");
}
html.append("\t");
});
html.append(" (hit ")
.append(words.size())
.append(")</h4>\n")
.append("<table>\n");
AtomicInteger wordCounter = new AtomicInteger();
words.forEach(word -> {
String w = word.getWord();
String common = null;
//这里用for比较适合,因为要break
for (Suffix suffix : suffixes) {
String s = suffix.getSuffix().toLowerCase();
s = s.replaceAll("-", "").replaceAll("\\s+", "");
if (w.endsWith(s)) {
common = w.substring(0, w.length() - s.length());
break;
}
}
if (common != null) {
html.append("\t")
.append("<tr><td>")
.append(wordCounter.incrementAndGet())
.append("、</td>");
final String c = common;
suffixes.forEach(suffix -> {
String s = suffix.getSuffix().toLowerCase();
s = s.replaceAll("-", "").replaceAll("\\s+", "");
html.append("<td>")
.append(WordLinker.toLink(c + s, s, dictionary))
.append("</td>");
});
if(WORDS.contains(new Word(c, ""))) {
html.append("<td>")
.append(WordLinker.toLink(c, c, dictionary))
.append("</td>");
}
}
html.append("</tr>\n");
});
html.append("</table>");
return html.toString();
}
public static void main(String[] args) throws Exception {
Set<Word> words = WordSources.getAll();
//List<Suffix> suffixes = Arrays.asList(new Suffix("ise", ""), new Suffix("ize", ""));
//List<Suffix> suffixes = Arrays.asList(new Suffix("ise", ""), new Suffix("ice", ""));
//List<Suffix> suffixes = Arrays.asList(new Suffix("a", ""), new Suffix("um", ""));
//List<Suffix> suffixes = Arrays.asList(new Suffix("ve", ""), new Suffix("ution", ""));
//List<Suffix> suffixes = Arrays.asList(new Suffix("ce", ""), new Suffix("se", ""));
//List<Suffix> suffixes = Arrays.asList(new Suffix("e", ""), new Suffix("ation", ""));
//List<Suffix> suffixes = Arrays.asList(new Suffix("ter", ""), new Suffix("tre", ""));
//List<Suffix> suffixes = Arrays.asList(new Suffix("d", ""), new Suffix("sion", ""));
//List<Suffix> suffixes = Arrays.asList(new Suffix("ize", ""), new Suffix("ization", ""));
//List<Suffix> suffixes = Arrays.asList(new Suffix("e", ""), new Suffix("ity", ""));
//List<Suffix> suffixes = Arrays.asList(new Suffix("nate", ""), new Suffix("nation", ""));
//List<Suffix> suffixes = Arrays.asList(new Suffix("t", ""), new Suffix("tly", ""), new Suffix("ce", ""));
//List<Suffix> suffixes = Arrays.asList(new Suffix("ist", "...人"), new Suffix("ism", "...主义"));
List<Suffix> suffixes = Arrays.asList(new Suffix("or", ""), new Suffix("our", ""));
List<Word> data = DynamicSuffixRule.findBySuffix(words, suffixes);
String htmlFragment = DynamicSuffixRule.toHtmlFragment(data, suffixes, Dictionary.ICIBA);
Files.write(Paths.get("target/dynamic_suffix_rule.txt"), htmlFragment.getBytes("utf-8"));
System.out.println(htmlFragment);
}
}