/*
* APDPlat - Application Product Development Platform
* Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.apdplat.superword.rule;
import org.apdplat.superword.tools.WordSources;
import org.apdplat.word.analysis.*;
import org.apdplat.word.segmentation.SegmentationAlgorithm;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
/**
 * Helps memorising English words by ranking a vocabulary against a query word
 * with the text-similarity algorithms provided by the "word" segmentation library.
 *
 * <p>The class offers a small programmatic API ({@link #compute}) and an
 * interactive console loop started from {@link #main(String[])}.
 *
 * @author 杨尚川
 */
public class SimilarWord {
    private static final Logger LOGGER = LoggerFactory.getLogger(SimilarWord.class);

    /** Visual separator written to the log around each section of output. */
    private static final String SEPARATOR = "----------------------------------------------------------";
    /** Prefix of the console command that changes the number of displayed results. */
    private static final String LIMIT_COMMAND = "limit=";
    /** Prefix of the console command that selects the similarity algorithm. */
    private static final String ALGORITHM_COMMAND = "sa=";
    /** Number of results displayed until the user changes it. */
    private static final int DEFAULT_LIMIT = 45;

    private static final long MS_PER_SECOND = 1000L;
    private static final long MS_PER_MINUTE = 60 * MS_PER_SECOND;
    private static final long MS_PER_HOUR = 60 * MS_PER_MINUTE;
    private static final long MS_PER_DAY = 24 * MS_PER_HOUR;

    /** Every supported similarity algorithm, used when the "all" mode is enabled. */
    private static final List<TextSimilarity> ALL_TEXT_SIMILARITIES = createAllTextSimilarities();

    /** When true, a query is ranked with every algorithm instead of only the selected one. */
    private boolean all = false;
    private int limit = DEFAULT_LIMIT;
    private TextSimilarity textSimilarity;

    /**
     * Creates an instance that uses edit distance as its similarity algorithm.
     */
    public SimilarWord() {
        textSimilarity = pureEnglish(new EditDistanceTextSimilarity());
    }

    /**
     * Builds the read-only list of all supported algorithms.
     */
    private static List<TextSimilarity> createAllTextSimilarities() {
        List<TextSimilarity> similarities = new ArrayList<>();
        similarities.add(pureEnglish(new EditDistanceTextSimilarity()));
        similarities.add(pureEnglish(new JaroDistanceTextSimilarity()));
        similarities.add(pureEnglish(new JaroWinklerDistanceTextSimilarity()));
        return java.util.Collections.unmodifiableList(similarities);
    }

    /**
     * Switches the given algorithm to the PureEnglish segmentation algorithm and returns it.
     */
    private static TextSimilarity pureEnglish(TextSimilarity similarity) {
        similarity.setSegmentationAlgorithm(SegmentationAlgorithm.PureEnglish);
        return similarity;
    }

    /**
     * @return the maximum number of results displayed per query
     */
    public int getLimit() {
        return limit;
    }

    /**
     * Sets the maximum number of results displayed per query and logs the new value.
     *
     * @param limit number of results to display
     */
    public void setLimit(int limit) {
        this.limit = limit;
        LOGGER.info("设置显示结果条数为:{}", limit);
    }

    /**
     * @return the similarity algorithm currently used for single-algorithm queries
     */
    public TextSimilarity getTextSimilarity() {
        return textSimilarity;
    }

    /**
     * Selects the similarity algorithm used for single-algorithm queries.
     * The supplied instance is reconfigured to use the PureEnglish segmentation algorithm.
     *
     * @param textSimilarity the algorithm to use; must not be null
     */
    public void setTextSimilarity(TextSimilarity textSimilarity) {
        this.textSimilarity = pureEnglish(textSimilarity);
        LOGGER.info("设置相似度算法为:{}", textSimilarity.getClass().getName());
    }

    /**
     * Ranks {@code words} against {@code word} with each of the given algorithms.
     *
     * @param word             the query word
     * @param textSimilarities the algorithms to apply
     * @param words            the candidate words
     * @param limit            the limit handed to each algorithm's {@code rank} call
     * @return the hits of every algorithm, keyed by the algorithm's simple class name
     *         with the "TextSimilarity" suffix removed
     */
    public Map<String, Hits> compute(String word, List<TextSimilarity> textSimilarities, List<String> words, int limit) {
        Map<String, Hits> hitsByAlgorithm = new HashMap<>();
        for (TextSimilarity similarity : textSimilarities) {
            String algorithmName = similarity.getClass().getSimpleName().replace("TextSimilarity", "");
            hitsByAlgorithm.put(algorithmName, similarity.rank(word, words, limit));
        }
        return hitsByAlgorithm;
    }

    /**
     * Ranks {@code words} against {@code word} with a single algorithm.
     *
     * @param word           the query word
     * @param textSimilarity the algorithm to apply
     * @param words          the candidate words
     * @param limit          the limit handed to the algorithm's {@code rank} call
     * @return the hits produced by the algorithm
     */
    public Hits compute(String word, TextSimilarity textSimilarity, List<String> words, int limit) {
        return textSimilarity.rank(word, words, limit);
    }

    /**
     * Logs the list of console commands and the input prompt.
     */
    private void tip() {
        LOGGER.info(SEPARATOR);
        LOGGER.info("可通过输入命令sa=edi来指定相似度算法,可用的算法有:");
        LOGGER.info(" 1、sa=edi,编辑距离");
        LOGGER.info(" 2、sa=ja,Jaro距离");
        LOGGER.info(" 3、sa=jaw,Jaro–Winkler距离");
        LOGGER.info("可通过输入命令sa=all来启用所有的相似度算法");
        LOGGER.info("可通过输入命令limit=45来指定显示结果条数");
        LOGGER.info("可通过输入命令exit退出程序");
        LOGGER.info("输入要查询的词或命令:");
    }

    /**
     * Runs the console loop: reads lines from standard input and treats each one
     * either as a command ({@code exit}, {@code limit=N}, {@code sa=NAME}) or as a word to query.
     * Input is trimmed before it is interpreted and blank lines are skipped.
     *
     * @param encoding name of the charset used to decode standard input
     * @param words    the candidate words that queries are ranked against
     * @throws Exception if the encoding is not supported or reading standard input fails
     */
    private void interact(String encoding, List<String> words) throws Exception {
        tip();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(System.in, encoding))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String input = line.trim();
                if (input.isEmpty()) {
                    // Nothing to query; ranking the empty string would only produce noise.
                    continue;
                }
                if ("exit".equals(input)) {
                    // Kept as a hard exit: the JVM is terminated even if background threads are still alive.
                    System.exit(0);
                }
                if (input.startsWith(LIMIT_COMMAND)) {
                    changeLimit(input.substring(LIMIT_COMMAND.length()));
                    continue;
                }
                if (input.startsWith(ALGORITHM_COMMAND)) {
                    selectAlgorithm(input.substring(ALGORITHM_COMMAND.length()).trim());
                    continue;
                }
                LOGGER.info("计算相似词:{}", input);
                LOGGER.info("显示结果数目:{}", limit);
                LOGGER.info(SEPARATOR);
                if (all) {
                    process(input, words, ALL_TEXT_SIMILARITIES);
                } else {
                    process(input, words, getTextSimilarity());
                }
                tip();
            }
        }
    }

    /**
     * Handles the argument of a {@code limit=} command; values that are not
     * a positive integer are rejected with an error message and leave the limit unchanged.
     */
    private void changeLimit(String value) {
        try {
            int requested = Integer.parseInt(value.trim());
            if (requested < 1) {
                // A result list cannot have a zero or negative size.
                LOGGER.error("指令不正确,数字非法");
                return;
            }
            setLimit(requested);
        } catch (NumberFormatException e) {
            LOGGER.error("指令不正确,数字非法");
        }
    }

    /**
     * Handles the argument of an {@code sa=} command; unknown names are reported
     * and leave the current selection unchanged.
     */
    private void selectAlgorithm(String name) {
        switch (name) {
            case "edi":
                setTextSimilarity(new EditDistanceTextSimilarity());
                all = false;
                break;
            case "ja":
                setTextSimilarity(new JaroDistanceTextSimilarity());
                all = false;
                break;
            case "jaw":
                setTextSimilarity(new JaroWinklerDistanceTextSimilarity());
                all = false;
                break;
            case "all":
                LOGGER.info("启用所有的相似度算法");
                all = true;
                break;
            default:
                LOGGER.error("指令不正确,未知的相似度算法:{}", name);
                break;
        }
    }

    /**
     * Ranks and logs the similar words once per given algorithm.
     */
    private void process(String word, List<String> words, List<TextSimilarity> textSimilarities) {
        for (TextSimilarity similarity : textSimilarities) {
            process(word, words, similarity);
        }
    }

    /**
     * Ranks the candidate words with one algorithm and logs the numbered hits
     * followed by the elapsed time.
     */
    private void process(String word, List<String> words, TextSimilarity textSimilarity) {
        LOGGER.info(SEPARATOR);
        LOGGER.info("{} 的相似词({}):", word, textSimilarity.getClass().getSimpleName());
        long start = System.currentTimeMillis();
        Hits hits = compute(word, textSimilarity, words, limit);
        long cost = System.currentTimeMillis() - start;
        int rank = 0;
        for (Hit hit : hits.getHits()) {
            rank++;
            LOGGER.info("\t{}、{} {}", rank, hit.getScore(), hit.getText());
        }
        LOGGER.info("耗时:{}", getTimeDes(cost));
        LOGGER.info(SEPARATOR);
    }

    /**
     * Converts a duration in milliseconds to a human-readable description,
     * listing only the non-zero units from days down to milliseconds.
     *
     * @param ms duration in milliseconds; may be null
     * @return the description; an empty string for null, "0毫秒" for a zero duration.
     *         Negative durations have no positive unit and therefore yield an empty string.
     */
    public String getTimeDes(Long ms) {
        if (ms == null) {
            return "";
        }
        if (ms.longValue() == 0L) {
            // Without this case a zero duration would be rendered as an empty string.
            return "0毫秒";
        }
        long remaining = ms;
        long day = remaining / MS_PER_DAY;
        remaining %= MS_PER_DAY;
        long hour = remaining / MS_PER_HOUR;
        remaining %= MS_PER_HOUR;
        long minute = remaining / MS_PER_MINUTE;
        remaining %= MS_PER_MINUTE;
        long second = remaining / MS_PER_SECOND;
        long milliSecond = remaining % MS_PER_SECOND;

        List<String> parts = new ArrayList<>();
        if (day > 0) {
            parts.add(day + "天");
        }
        if (hour > 0) {
            parts.add(hour + "小时");
        }
        if (minute > 0) {
            parts.add(minute + "分钟");
        }
        if (second > 0) {
            parts.add(second + "秒");
        }
        if (milliSecond > 0) {
            parts.add(milliSecond + "毫秒");
        }
        return String.join(",", parts);
    }

    /**
     * Loads the syllabus vocabulary and starts the interactive console.
     *
     * @param args optionally a single argument naming the charset of standard input
     *             (defaults to utf-8)
     * @throws Exception if the console loop fails
     */
    public static void main(String[] args) throws Exception {
        // All English words that queries are ranked against.
        List<String> words = WordSources.getSyllabusVocabulary()
                .stream()
                .map(word -> word.getWord())
                .collect(Collectors.toList());
        String encoding = "utf-8";
        if (args.length == 1) {
            encoding = args[0];
        }
        SimilarWord similarWord = new SimilarWord();
        similarWord.interact(encoding, words);
    }
}