WordSources.java example

Explorer

superword-master
- src
  - main
    - java
      - org
        apdplat
        superword
        extract
        ChineseSynonymAntonymExtractor.java
        DefinitionExtractor.java
        HyphenExtractor.java
        PartOfSpeechExtractor.java
        PhraseExtractor.java
        PrefixExtractor.java
        RootAffixExtractor.java
        SentenceExtractor.java
        SuffixExtractor.java
        SynonymAntonymExtractor.java
        SynonymDiscriminationExtractor.java
        freemarker
        TemplateUtils.java
        model
        CharMap.java
        ComplexPrefix.java
        ComplexSuffix.java
        HistoryRecord.java
        MyNewWord.java
        Prefix.java
        QQUser.java
        Quiz.java
        QuizItem.java
        Suffix.java
        SynonymAntonym.java
        SynonymDiscrimination.java
        User.java
        UserBook.java
        UserDynamicPrefix.java
        UserDynamicSuffix.java
        UserSimilarWord.java
        UserText.java
        UserUrl.java
        UserWord.java
        Word.java
        rule
        CharTransformRule.java
        CompoundWord.java
        DefinitionSimilarRule.java
        DependenceWordRule.java
        DynamicPrefixRule.java
        DynamicSuffixRule.java
        IndependentWordRule.java
        PartOfSpeech.java
        PrefixRule.java
        RootAffixRule.java
        RootRule.java
        SimilarWord.java
        SimilarityRule.java
        SuffixRule.java
        WordLengthStatistics.java
        WordVector.java
        system
        AntiRobotFilter.java
        HistoryFilter.java
        InstantTip.java
        InstantTipServlet.java
        ViewWordServlet.java
        WordsFilter.java
        qq
        AfterLoginRedirectServlet.java
        IndexServlet.java
        tools
        AidReading.java
        Definition.java
        DynamicIp.java
        FileUtils.java
        FootNSentence.java
        HtmlFormatter.java
        IPUtils.java
        IcibaSymbol.java
        IrregularPlurals.java
        IrregularVerbs.java
        JavaCodeAnalyzer.java
        MySQLUtils.java
        OxfordPOS.java
        OxfordSymbol.java
        PdfParser.java
        PrefixSuffixOptimizer.java
        Pronunciation.java
        ProxyIp.java
        SentenceScorer.java
        Summary.java
        TextAnalyzer.java
        TimeUtils.java
        TopNSentence.java
        WebsterPOS.java
        WebsterSymbol.java
        WordClassifier.java
        WordClassifierForOxford.java
        WordClassifierForWebster.java
        WordClassifierForYouDao.java
        WordLinker.java
        WordSources.java
        WordsFetcher.java
        YoudaoSymbol.java
  - test
    - java
      - org
        apdplat
        superword
        rule
        CompoundWordTest.java
        WordVectorTest.java

/**
 * 
 * APDPlat - Application Product Development Platform
 * Copyright (c) 2013, 杨尚川, yang-shangchuan@qq.com
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * 
 */

package org.apdplat.superword.tools;

import org.apache.commons.lang.StringUtils;
import org.apdplat.superword.model.Word;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

/**
 * Reads words from one or more text files.
 * Each line holds one word; the word and any other information on the same
 * line are separated by whitespace characters.
 * Parsed files are kept in an in-memory cache for the lifetime of the class.
 * @author 杨尚川
 */
public class WordSources {
    private WordSources(){}
    private static final Logger LOGGER = LoggerFactory.getLogger(WordSources.class);
    /** Column used by {@link #get(String...)} when no index is given. */
    private static final int DEFAULT_INDEX = 1;
    /** Parsed word sets, keyed by column index plus file so different columns never collide. */
    private static final Map<String, Set<Word>> CACHE = new ConcurrentHashMap<>();
    /** Level name to class-path word file, in the order levels are reported. */
    private static final Map<String, String> LEVEL_FILES = createLevelFiles();

    /**
     * Builds the ordered level table shared by {@link #getLevels(String)} and
     * {@link #getSyllabusVocabulary()} so both always agree on the file list.
     */
    private static Map<String, String> createLevelFiles(){
        Map<String, String> levels = new LinkedHashMap<>();
        levels.put("PrimarySchool", "/word_primary_school.txt");
        levels.put("JuniorSchool", "/word_junior_school.txt");
        levels.put("SeniorSchool", "/word_senior_school.txt");
        levels.put("University", "/word_university.txt");
        levels.put("NewConception", "/word_new_conception.txt");
        levels.put("ADULT", "/word_ADULT.txt");
        levels.put("CET4", "/word_CET4.txt");
        levels.put("CET6", "/word_CET6.txt");
        levels.put("TEM4", "/word_TEM4.txt");
        levels.put("TEM8", "/word_TEM8.txt");
        levels.put("CATTI", "/word_CATTI.txt");
        levels.put("GMAT", "/word_GMAT.txt");
        levels.put("GRE", "/word_GRE.txt");
        levels.put("SAT", "/word_SAT.txt");
        levels.put("BEC", "/word_BEC.txt");
        levels.put("MBA", "/word_MBA.txt");
        levels.put("IELTS", "/word_IELTS.txt");
        levels.put("TOEFL", "/word_TOEFL.txt");
        levels.put("TOEIC", "/word_TOEIC.txt");
        levels.put("KY", "/word_KY.txt");
        return Collections.unmodifiableMap(levels);
    }

    /**
     * Lists the names of all syllabus levels whose word file contains the given word.
     * @param word the word to look up
     * @return level names in the fixed order of the level table, empty if none match
     */
    public static List<String> getLevels(String word){
        Word w = new Word(word, "");
        List<String> levels = new ArrayList<>();
        for(Map.Entry<String, String> level : LEVEL_FILES.entrySet()){
            // query the cached set directly instead of copying the whole dictionary per level
            if(load(DEFAULT_INDEX, level.getValue()).contains(w)){
                levels.add(level.getKey());
            }
        }
        return levels;
    }
    /**
     * Syllabus vocabulary: the union of all level word files.
     * @return a new mutable set with the words of every level file
     */
    public static Set<Word> getSyllabusVocabulary(){
        return get(LEVEL_FILES.values().toArray(new String[0]));
    }
    /**
     * Returns the union of the general word list, the computer word list and
     * the syllabus vocabulary.
     * @return a new mutable set
     */
    public static Set<Word> getAll(){
        Set<Word> data = get("/words.txt", "/word_computer.txt");
        data.addAll(getSyllabusVocabulary());
        return data;
    }
    /**
     * Reads words from the given files using the default column index 1.
     * Each line holds one word; the word and other information are separated by whitespace.
     * @param files word files; a name starting with / is resolved on the class path,
     *              any other name is treated as a file system path
     * @return set of distinct words
     */
    public static Set<Word> get(String... files){
        return get(DEFAULT_INDEX, files);
    }

    /**
     * Converts a map keyed by word text into a map keyed by {@link Word}.
     * The counter instances are shared with the input map, not copied.
     * @param words counters keyed by word text
     * @return a new map with the same counters keyed by Word
     */
    public static Map<Word, AtomicInteger> convert(Map<String, AtomicInteger> words){
        Map<Word, AtomicInteger> result = new HashMap<>();
        for(Map.Entry<String, AtomicInteger> entry : words.entrySet()){
            result.put(new Word(entry.getKey(), ""), entry.getValue());
        }
        return result;
    }

    /**
     * Tells whether the string consists only of the ASCII letters a-z and A-Z.
     * The check is done on the raw characters so the result does not depend on
     * the default locale's case-mapping rules.
     * @param string text to check
     * @return true if every character is an ASCII letter; true for the empty string
     */
    public static boolean isEnglish(String string){
        for(int i=0; i<string.length(); i++){
            char c = string.charAt(i);
            boolean lower = c>='a' && c<='z';
            boolean upper = c>='A' && c<='Z';
            if(!lower && !upper){
                return false;
            }
        }
        return true;
    }

    /**
     * Computes the intersection of two word sets.
     * @param first first set
     * @param second second set
     * @return a new set with the words contained in both sets
     */
    public static Set<Word> intersection(Set<Word> first, Set<Word> second){
        LOGGER.info("求交集词典1：{}", first.size());
        LOGGER.info("求交集词典2：{}", second.size());
        Set<Word> result = first
                .stream()
                .filter(second::contains)
                .collect(Collectors.toSet());
        LOGGER.info("交集词典：{}", result.size());
        return result;
    }
    /**
     * Computes the set difference minuend - subtrahend.
     * @param minuend set to take words from
     * @param subtrahend words to exclude
     * @return a new set with the words of minuend that are not in subtrahend
     */
    public static Set<Word> minus(Set<Word> minuend, Set<Word> subtrahend){
        LOGGER.info("被减数个数：{}", minuend.size());
        LOGGER.info("减数个数：{}", subtrahend.size());
        Set<Word> result = minuend
                .stream()
                .filter(word -> !subtrahend.contains(word))
                .collect(Collectors.toSet());
        LOGGER.info("结果个数：{}", result.size());
        return result;
    }
    /**
     * Writes the words in sorted order to src/main/resources + path, one
     * "number TAB word" line per word, numbered from 1.
     * Failures are logged and not propagated.
     * @param words words to save
     * @param path path appended to src/main/resources
     */
    public static void save(Set<Word> words, String path){
        String target = "src/main/resources" + path;
        try {
            LOGGER.info("开始保存词典：{}", target);
            List<Word> sorted = words
                    .stream()
                    .sorted()
                    .collect(Collectors.toList());
            List<String> lines = new ArrayList<>(sorted.size());
            int number = 0;
            for(Word word : sorted){
                number++;
                lines.add(number + "\t" + word.getWord());
            }
            Files.write(Paths.get(target), lines);
            LOGGER.info("保存成功");
        }catch (Exception e){
            // saving is best-effort: report the problem and carry on
            LOGGER.error("保存词典失败", e);
        }
    }
    /**
     * Reads words from the given files.
     * Each line holds one word; the word and other information are separated by whitespace.
     * Files that cannot be resolved or read are logged and skipped.
     * @param index zero-based index of the word after splitting a line on whitespace
     * @param files word files; a name starting with / is resolved on the class path,
     *              any other name is treated as a file system path
     * @return a new mutable set of distinct words
     */
    public static Set<Word> get(int index, String... files){
        Set<Word> set = new HashSet<>();
        for(String file : files){
            set.addAll(load(index, file));
        }
        LOGGER.info("unique words count: {}", set.size());
        return set;
    }
    /**
     * Returns the words of one file, parsing it on first use and serving it
     * from the cache afterwards. The returned set is shared and read-only.
     * @return the parsed words, or an empty set if the file cannot be resolved or read
     */
    private static Set<Word> load(int index, String file){
        // the column index is part of the key: the same file read with another index yields other words
        String key = index + "|" + file;
        Set<Word> cached = CACHE.get(key);
        if(cached != null){
            LOGGER.info("cache hit word file: {}", file);
            return cached;
        }
        URL url = toUrl(file);
        if(url == null){
            LOGGER.error("解析词典失败：{}", file);
            return Collections.emptySet();
        }
        LOGGER.info("parse word file: {}", url);
        List<String> lines;
        try {
            lines = Files.readAllLines(Paths.get(url.toURI()));
        }catch (Exception e){
            // do not cache a failed read, so a later call can try again
            LOGGER.error("failed to read word file: " + url, e);
            return Collections.emptySet();
        }
        Set<Word> wordSet = Collections.unmodifiableSet(parse(lines, index));
        CACHE.put(key, wordSet);
        return wordSet;
    }
    /**
     * Resolves a word file name to a URL: class-path resource if it starts with /,
     * file system path otherwise.
     * @return the URL, or null if it cannot be resolved
     */
    private static URL toUrl(String file){
        if(file.startsWith("/")){
            return WordSources.class.getResource(file);
        }
        try {
            return Paths.get(file).toUri().toURL();
        }catch (Exception e){
            LOGGER.error("构造URL出错", e);
            return null;
        }
    }
    /**
     * Extracts the word at the given column from every line that is neither blank
     * nor a # comment and has enough columns, keeping only alphanumeric words.
     */
    private static Set<Word> parse(List<String> lines, int index){
        return lines.parallelStream()
                .map(String::trim)
                .filter(line -> !line.isEmpty() && !line.startsWith("#"))
                .map(line -> line.split("\\s+"))
                .filter(fields -> fields.length >= index+1)
                .map(fields -> new Word(fields[index], ""))
                .filter(word -> StringUtils.isAlphanumeric(word.getWord()))
                .collect(Collectors.toSet());
    }
    /**
     * Keeps the words that are longer than 3 characters and that
     * {@link #isPlural(Set, Word)} does not recognize as a plural of another
     * word in the same set.
     * @param words words to filter
     * @return a new set with the remaining words
     */
    public static Set<Word> stem(Set<Word> words){
        return words
                .stream()
                .filter(word -> word.getWord().length() > 3)
                .filter(word -> !isPlural(words, word))
                .collect(Collectors.toSet());
    }
    /**
     * Finds the plural forms among the words that are longer than 3 characters.
     * @param words words to inspect
     * @return map from plural word text to the suffix rule that matched ("ies", "es" or "s")
     */
    public static Map<String, String> plural(Set<Word> words){
        Map<String, String> data = new HashMap<>();
        for(Word word : words){
            if(word.getWord().length() > 3){
                isPlural(words, word, data);
            }
        }
        return data;
    }
    /**
     * Tells whether the word is a plural form of another word in the set.
     * @param words dictionary used to look up the singular form
     * @param word word to check
     * @return true if a plural rule matches and the singular is in the set
     */
    public static boolean isPlural(Set<Word> words, Word word){
        return isPlural(words, word, new HashMap<>());
    }
    /**
     * Tells whether the word is a plural form of another word in the set and,
     * if so, records the matched suffix rule.
     * The rules are tried in a fixed order and the first match wins.
     * @param words dictionary used to look up the singular form
     * @param word word to check
     * @param data receives word text to matched suffix ("ies", "es" or "s") on a match
     * @return true if a plural rule matches and the singular is in the set
     */
    public static boolean isPlural(Set<Word> words, Word word, Map<String, String> data){
        String w = word.getWord();
        // 1. ends with consonant + y: y becomes i, then es is added
        // the length check keeps a bare "ies" from indexing before the start of the string
        if(w.length() > 3 && w.endsWith("ies")){
            // the singular keeps the consonant in front of "ies": cities -> cit + y
            String stem = w.substring(0, w.length()-3);
            char beforeSuffix = stem.charAt(stem.length()-1);
            if(!isVowel(beforeSuffix)
                    && words.contains(new Word(stem+"y", ""))){
                return foundPlural(w, "ies", data);
            }
        }
        // 2. singular ends with ce, se, ze: s is added
        if(w.endsWith("ces")
                || w.endsWith("ses")
                || w.endsWith("zes")){
            if(words.contains(new Word(w.substring(0, w.length()-1), ""))){
                return foundPlural(w, "s", data);
            }
        }
        // 3. singular ends with s, sh, ch, x: es is added
        if(w.endsWith("ses")
                || w.endsWith("shes")
                || w.endsWith("ches")
                || w.endsWith("xes")){
            if(words.contains(new Word(w.substring(0, w.length()-2), ""))){
                return foundPlural(w, "es", data);
            }
        }
        // 4. general case: s is added
        if(w.endsWith("s")){
            if(words.contains(new Word(w.substring(0, w.length()-1), ""))){
                return foundPlural(w, "s", data);
            }
        }
        return false;
    }
    /**
     * Logs a detected plural, stores its suffix rule and reports the match.
     * @return always true, so callers can return the result directly
     */
    private static boolean foundPlural(String word, String suffix, Map<String, String> data){
        LOGGER.debug("发现复数：{}\t{}", word, suffix);
        data.put(word, suffix);
        return true;
    }
    /**
     * Tells whether the character is one of the lower-case vowels a, e, i, o, u.
     * Upper-case letters are not treated as vowels.
     * @param _char character to check
     * @return true for a, e, i, o, u
     */
    public static boolean isVowel(char _char){
        switch (_char){
            case 'a':
            case 'e':
            case 'i':
            case 'o':
            case 'u':
                return true;
            default:
                return false;
        }
    }
    /**
     * Prints the plural forms found in the syllabus vocabulary as HTML.
     */
    public static void main(String[] args) {
        String html = HtmlFormatter.toHtmlForPluralFormat(plural(getSyllabusVocabulary()));
        System.out.println(html);
    }
}