/**
*
* APDPlat - Application Product Development Platform
* Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package org.apdplat.superword.tools;
import org.apache.commons.lang.StringUtils;
import org.apdplat.superword.model.Word;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
/**
 * Reads words from one or more text files.
 * Each line holds one word; the word and any other information on the line
 * are separated by whitespace.
 * @author 杨尚川
 */
public class WordSources {
    private WordSources(){}

    private static final Logger LOGGER = LoggerFactory.getLogger(WordSources.class);

    /**
     * Parsed word sets. The key combines the column index and the file name so that
     * reading different columns of the same file never returns another column's words.
     */
    private static final Map<String, Set<Word>> CACHE = new ConcurrentHashMap<>();

    /**
     * Syllabus word files (class-path resources) mapped to their level names,
     * in the order in which they are consulted.
     */
    private static final Map<String, String> SYLLABUS_LEVELS = createSyllabusLevels();

    /**
     * Builds the ordered file-to-level table shared by {@link #getLevels(String)}
     * and {@link #getSyllabusVocabulary()}.
     */
    private static Map<String, String> createSyllabusLevels(){
        Map<String, String> levels = new LinkedHashMap<>();
        levels.put("/word_primary_school.txt", "PrimarySchool");
        levels.put("/word_junior_school.txt", "JuniorSchool");
        levels.put("/word_senior_school.txt", "SeniorSchool");
        levels.put("/word_university.txt", "University");
        levels.put("/word_new_conception.txt", "NewConception");
        levels.put("/word_ADULT.txt", "ADULT");
        levels.put("/word_CET4.txt", "CET4");
        levels.put("/word_CET6.txt", "CET6");
        levels.put("/word_TEM4.txt", "TEM4");
        levels.put("/word_TEM8.txt", "TEM8");
        levels.put("/word_CATTI.txt", "CATTI");
        levels.put("/word_GMAT.txt", "GMAT");
        levels.put("/word_GRE.txt", "GRE");
        levels.put("/word_SAT.txt", "SAT");
        levels.put("/word_BEC.txt", "BEC");
        levels.put("/word_MBA.txt", "MBA");
        levels.put("/word_IELTS.txt", "IELTS");
        levels.put("/word_TOEFL.txt", "TOEFL");
        levels.put("/word_TOEIC.txt", "TOEIC");
        levels.put("/word_KY.txt", "KY");
        return Collections.unmodifiableMap(levels);
    }

    /**
     * Lists the syllabus levels whose word file contains the given word.
     * @param word the word to look up
     * @return level names, in the fixed order of the syllabus table; empty when no file contains the word
     */
    public static List<String> getLevels(String word){
        Word w = new Word(word, "");
        List<String> levels = new ArrayList<>();
        for(Map.Entry<String, String> entry : SYLLABUS_LEVELS.entrySet()){
            if(get(entry.getKey()).contains(w)){
                levels.add(entry.getValue());
            }
        }
        return levels;
    }

    /**
     * Syllabus vocabulary: the union of the words of all syllabus word files.
     * @return a new set holding the distinct words
     */
    public static Set<Word> getSyllabusVocabulary(){
        return get(SYLLABUS_LEVELS.keySet().toArray(new String[0]));
    }

    /**
     * Returns the words of {@code /words.txt} and {@code /word_computer.txt}
     * together with the syllabus vocabulary.
     * @return a new set holding the distinct words
     */
    public static Set<Word> getAll(){
        Set<Word> data = get("/words.txt", "/word_computer.txt");
        data.addAll(getSyllabusVocabulary());
        return data;
    }

    /**
     * Reads words from files holding one word per line, where the word and any
     * other information are separated by whitespace.
     * Uses the default column index 1.
     * @param files word files; a name starting with / is resolved as a class-path resource
     * @return the distinct words
     */
    public static Set<Word> get(String... files){
        return get(1, files);
    }

    /**
     * Converts a map keyed by word text into a map keyed by {@link Word}.
     * The counter objects are shared with the input map, not copied.
     * @param words counters keyed by word text
     * @return a new map with the same counters keyed by Word
     */
    public static Map<Word, AtomicInteger> convert(Map<String, AtomicInteger> words){
        Map<Word, AtomicInteger> result = new HashMap<>();
        words.forEach((text, count) -> result.put(new Word(text, ""), count));
        return result;
    }

    /**
     * Tells whether the string consists solely of the ASCII letters a-z and A-Z.
     * The check is independent of the default locale.
     * @param string the text to check
     * @return true when every character is an ASCII letter; an empty string yields true
     */
    public static boolean isEnglish(String string){
        for(int i = 0; i < string.length(); i++){
            char c = string.charAt(i);
            boolean lower = c >= 'a' && c <= 'z';
            boolean upper = c >= 'A' && c <= 'Z';
            if(!lower && !upper){
                return false;
            }
        }
        return true;
    }

    /**
     * Computes the intersection of two word sets.
     * @param first the first set
     * @param second the second set
     * @return a new set with the words contained in both sets
     */
    public static Set<Word> intersection(Set<Word> first, Set<Word> second){
        LOGGER.info("求交集词典1:"+first.size());
        LOGGER.info("求交集词典2:"+second.size());
        Set<Word> result = first
                .stream()
                .filter(second::contains)
                .collect(Collectors.toSet());
        LOGGER.info("交集词典:"+result.size());
        return result;
    }

    /**
     * Computes the set difference {@code minuend - subtrahend}.
     * @param minuend the set to take words from
     * @param subtrahend the words to leave out
     * @return a new set with the words of minuend that are not in subtrahend
     */
    public static Set<Word> minus(Set<Word> minuend, Set<Word> subtrahend){
        LOGGER.info("被减数个数:"+minuend.size());
        LOGGER.info("减数个数:"+subtrahend.size());
        Set<Word> result = minuend
                .stream()
                .filter(word -> !subtrahend.contains(word))
                .collect(Collectors.toSet());
        LOGGER.info("结果个数:" + result.size());
        return result;
    }

    /**
     * Writes the words in sorted order, one per line, as a 1-based sequence number,
     * a tab and the word text. The file is {@code src/main/resources} followed by
     * the given path. Any failure is logged and not rethrown.
     * NOTE(review): sorting relies on Word having a natural order - confirm Word is Comparable.
     * @param words the words to write
     * @param path target path relative to src/main/resources, starting with /
     */
    public static void save(Set<Word> words, String path){
        try {
            path = "src/main/resources" + path;
            LOGGER.info("开始保存词典:" + path);
            AtomicInteger i = new AtomicInteger();
            List<String> list = words
                    .stream()
                    .sorted()
                    .map(word -> i.incrementAndGet() + "\t" + word.getWord())
                    .collect(Collectors.toList());
            Files.write(Paths.get(path), list);
            LOGGER.info("保存成功");
        }catch (Exception e){
            LOGGER.error("保存词典失败", e);
        }
    }

    /**
     * Reads words from files holding one word per line, where the word and any
     * other information are separated by whitespace.
     * Blank lines, lines starting with # and lines with too few columns are skipped,
     * as are words that are not alphanumeric. A file that cannot be resolved is
     * logged and skipped. Parsed files are cached per (index, file) pair.
     * @param index index of the word among the whitespace-separated columns, starting at 0
     * @param files word files; a name starting with / is resolved as a class-path resource,
     *              any other name as a file system path
     * @return a new set holding the distinct words
     * @throws IllegalArgumentException if index is negative
     */
    public static Set<Word> get(int index, String... files){
        if(index < 0){
            throw new IllegalArgumentException("index must not be negative: " + index);
        }
        Set<Word> set = new HashSet<>();
        for(String file : files){
            // The same file parsed with another column index is a different word set.
            String cacheKey = index + ":" + file;
            Set<Word> value = CACHE.get(cacheKey);
            if(value != null){
                LOGGER.info("cache hit word file: " + file);
                set.addAll(value);
                continue;
            }
            URL url = null;
            if(file.startsWith("/")){
                url = WordSources.class.getResource(file);
            }else{
                try {
                    url = Paths.get(file).toUri().toURL();
                }catch (Exception e){
                    LOGGER.error("构造URL出错", e);
                }
            }
            if(url == null){
                LOGGER.error("解析词典失败:"+file);
                continue;
            }
            LOGGER.info("parse word file: " + url);
            List<String> words = getExistWords(url);
            Set<Word> wordSet = words.parallelStream()
                    .map(String::trim)
                    .filter(line -> !line.isEmpty() && !line.startsWith("#"))
                    .map(line -> line.split("\\s+"))
                    .filter(fields -> fields.length > index)
                    .map(fields -> new Word(fields[index], ""))
                    .filter(word -> StringUtils.isAlphanumeric(word.getWord()))
                    .collect(Collectors.toSet());
            set.addAll(wordSet);
            CACHE.put(cacheKey, wordSet);
        }
        LOGGER.info("unique words count: " + set.size());
        return set;
    }

    /**
     * Reads all lines of the file behind the URL.
     * @param url location of the word file
     * @return the lines, or an empty list when reading fails (the failure is logged)
     */
    private static List<String> getExistWords(URL url){
        try {
            return Files.readAllLines(Paths.get(url.toURI()));
        }catch (Exception e){
            LOGGER.error("failed to read word file: " + url, e);
            return Collections.emptyList();
        }
    }

    /**
     * Keeps the words longer than three characters that are not recognised
     * as the plural of another word in the same set.
     * @param words the words to filter; also the dictionary used for the plural check
     * @return a new set with the remaining words
     */
    public static Set<Word> stem(Set<Word> words){
        return words
                .stream()
                .filter(word -> word.getWord().length() > 3)
                .filter(word -> !isPlural(words, word))
                .collect(Collectors.toSet());
    }

    /**
     * Collects the words longer than three characters that are recognised
     * as the plural of another word in the same set.
     * @param words the words to inspect; also the dictionary used for the plural check
     * @return plural word text mapped to the suffix rule that matched ("ies", "es" or "s")
     */
    public static Map<String, String> plural(Set<Word> words){
        Map<String, String> data = new HashMap<>();
        words
                .stream()
                .filter(word -> word.getWord().length() > 3)
                .forEach(word -> isPlural(words, word, data));
        return data;
    }

    /**
     * Tells whether the word is the plural of another word in the set.
     * @param words the dictionary in which the singular form is looked up
     * @param word the candidate plural
     * @return true when a singular form of the word is contained in words
     */
    public static boolean isPlural(Set<Word> words, Word word){
        return isPlural(words, word, new HashMap<>());
    }

    /**
     * Tells whether the word is the plural of another word in the set and,
     * if so, records the matching suffix rule.
     * The rules are tried in order and the first match wins.
     * @param words the dictionary in which the singular form is looked up
     * @param word the candidate plural
     * @param data receives an entry (word text to suffix) when the word is a plural
     * @return true when a singular form of the word is contained in words
     */
    public static boolean isPlural(Set<Word> words, Word word, Map<String, String> data){
        String w = word.getWord();
        // Rule 1: singular ends in consonant + y; the y becomes i and es is added.
        // The length guard keeps the look-back at the character before "ies" in range.
        if (w.length() > 3 && w.endsWith("ies")){
            char c = w.charAt(w.length()-4);
            // Drop exactly the three characters "ies" before appending "y" (cities -> city).
            if(!isVowel(c)
                    && words.contains(new Word(w.substring(0, w.length()-3)+"y", ""))){
                log(w, "ies");
                data.put(w, "ies");
                return true;
            }
        }
        // Rule 2: singular ends in ce, se or ze; s is added.
        if(w.endsWith("ces")
                || w.endsWith("ses")
                || w.endsWith("zes")){
            if(words.contains(new Word(w.substring(0, w.length()-1), ""))){
                log(w, "s");
                data.put(w, "s");
                return true;
            }
        }
        // Rule 3: singular ends in s, sh, ch or x; es is added.
        if(w.endsWith("ses")
                || w.endsWith("shes")
                || w.endsWith("ches")
                || w.endsWith("xes")){
            if(words.contains(new Word(w.substring(0, w.length()-2), ""))){
                log(w, "es");
                data.put(w, "es");
                return true;
            }
        }
        // Rule 4: the general case; s is added.
        if(w.endsWith("s")){
            if(words.contains(new Word(w.substring(0, w.length()-1), ""))){
                log(w, "s");
                data.put(w, "s");
                return true;
            }
        }
        return false;
    }

    /**
     * Logs a detected plural together with the suffix rule that matched.
     */
    private static void log(String word, String suffix){
        LOGGER.debug("发现复数:"+word+"\t"+suffix);
    }

    /**
     * Tells whether the character is one of the lower-case vowels a, e, i, o, u.
     * Upper-case letters are not treated as vowels.
     * @param _char the character to check
     * @return true for a, e, i, o and u
     */
    public static boolean isVowel(char _char){
        return "aeiou".indexOf(_char) >= 0;
    }

    /**
     * Prints the HTML produced by HtmlFormatter for the plurals found
     * in the syllabus vocabulary.
     */
    public static void main(String[] args) {
        //AtomicInteger i = new AtomicInteger();
        //stem(getSyllabusVocabulary()).forEach(w -> System.out.println(i.incrementAndGet() + "、" + w.getWord()));
        String html = HtmlFormatter.toHtmlForPluralFormat(plural(getSyllabusVocabulary()));
        System.out.println(html);
    }
}