/** * * APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川, * yang-shangchuan@qq.com * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see <http://www.gnu.org/licenses/>. * */ package org.apdplat.superword.extract; import org.apache.commons.lang.StringUtils; import org.apdplat.superword.model.Word; import org.apdplat.superword.rule.PartOfSpeech; import org.apdplat.superword.tools.HtmlFormatter; import org.apdplat.superword.tools.WordClassifier; import org.apdplat.superword.tools.WordSources; import org.jsoup.Jsoup; import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.*; import java.net.URL; import java.nio.file.FileSystem; import java.nio.file.*; import java.nio.file.attribute.BasicFileAttributes; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; /** * 词性提取工具 * @author 杨尚川 */ public class PartOfSpeechExtractor { private PartOfSpeechExtractor(){} private static final Logger LOGGER = LoggerFactory.getLogger(PartOfSpeechExtractor.class); private static final String PART_OF_SPEECH_CSS_PATH = "html body.bg_main div#layout div#center div#main_box div#dict_main div.collins div#dict_tab_101.tab_content.tab_authorities div.part_main div.collins_content div.collins_en_cn div.caption span.st"; public static Set<Word> parse(String path){ if(path.endsWith(".zip")){ return parseZip(path); } if(Files.isDirectory(Paths.get(path))){ return parseDir(path); }else{ return parseFile(path); } } public static Set<Word> parseDir(String dir) { Set<Word> data = new HashSet<>(); LOGGER.info("开始解析目录:" + dir); try { Files.walkFileTree(Paths.get(dir), new SimpleFileVisitor<Path>() { @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { data.addAll(parseFile(file.toFile().getAbsolutePath())); return FileVisitResult.CONTINUE; } }); } catch (IOException e) { LOGGER.error("解析文本出错", e); } return data; } public static Set<Word> parseZip(String zipFile){ Set<Word> data = new HashSet<>(); LOGGER.info("开始解析ZIP文件:"+zipFile); try (FileSystem fs = FileSystems.newFileSystem(Paths.get(zipFile), WordClassifier.class.getClassLoader())) { for(Path path : fs.getRootDirectories()){ LOGGER.info("处理目录:"+path); Files.walkFileTree(path, new SimpleFileVisitor<Path>(){ @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { LOGGER.info("处理文件:"+file); // 拷贝到本地文件系统 Path temp = Paths.get("target/origin-html-temp.txt"); Files.copy(file, temp, StandardCopyOption.REPLACE_EXISTING); data.addAll(parseFile(temp.toFile().getAbsolutePath())); return FileVisitResult.CONTINUE; } }); } }catch (Exception e){ LOGGER.error("解析文本出错", e); } return data; } public static Set<Word> parseFile(String file){ Set<Word> data = new HashSet<>(); LOGGER.info("开始解析文件:"+file); try (BufferedReader reader = new BufferedReader( new InputStreamReader( new BufferedInputStream( new FileInputStream(file))))) { String line = null; while ((line = reader.readLine()) != null) { //LOGGER.debug("html:"+line); String[] attr = line.split("杨尚川"); if(attr == null || attr.length != 2){ LOGGER.error("解析文本失败,文本应该以'杨尚川'分割,前面是词,后面是网页,网页内容是去除换行符之后的一整行文本:"+line); continue; } String word = attr[0]; String html = attr[1]; Word w = parseWord(html, word); if(w!=null && !w.getPartOfSpeeches().isEmpty()) { data.add(w); } } } catch (IOException e) { LOGGER.error("解析文本出错", e); } return data; } /** * 解析词性 * @param html * @return */ public static Word parseWord(String html, String word){ LOGGER.info("解析单词:"+word); Word w = new Word(word, ""); try { for(Element element : Jsoup.parse(html).select(PART_OF_SPEECH_CSS_PATH)){ String partOfSpeech = element.text(); LOGGER.debug("解析原始词性:" + partOfSpeech); if(StringUtils.isNotBlank(partOfSpeech) && !partOfSpeech.contains("See also")){ partOfSpeech = partOfSpeech.replace(";", "") //处理组合词 .replace("COMB in ADJ and N-COUNT", "COMB-in-ADJ-and-N-COUNT") .replace("COMB in ADJ and N", "COMB-in-ADJ-and-N") .replace("COMB in ADJ", "COMB-in-ADJ") .replace("COMB in ADJ-GRADED", "COMB-in-ADJ-GRADED") .replace("COMB in N-COUNT", "COMB-in-N-COUNT") .replace("COMB in COLOUR", "COMB-in-COLOUR") .replace("COMB in N", "COMB-in-N") .replace("COMB in N-UNCOUNT", "COMB-in-N-UNCOUNT") .replace("COMB in QUANT", "COMB-in-QUANT") .replace("COMB in VERB", "COMB-in-VERB"); String[] attrs = partOfSpeech.split("\\s+"); for(String attr : attrs){ if(attr.length()<1){ LOGGER.debug("忽略空词性:" + attr); continue; } //短语不归入词性 if(attr.contains("PHR")){ LOGGER.debug("忽略短语:" + attr); continue; } attr = attr.replace(",", ""); char c = attr.charAt(0); if(c>='A' && c<='Z'){ if("VERB".equals(attr)){ attr = "V"; } if("VERB-ERG".equals(attr)){ attr = "V-ERG"; } w.addPartOfSpeech(attr); LOGGER.debug("解析出词性:" + attr); } } } } }catch (Exception e){ LOGGER.error("解析词性出错", e); } return w; } private static Set<Word> inSyllabusVocabulary(Set<Word> words){ Set<Word> voc = WordSources.getSyllabusVocabulary(); return words.stream().filter(w -> voc.contains(w)).collect(Collectors.toSet()); } private static Set<Word> notInSyllabusVocabulary(Set<Word> words){ Set<Word> voc = WordSources.getSyllabusVocabulary(); return words.stream().filter(w -> !voc.contains(w)).collect(Collectors.toSet()); } private static void parseWord(){ Set<Word> words = parse("/Users/apple/百度云同步盘/origin_html.zip"); Set<Word> inSyllabusVocabulary = inSyllabusVocabulary(words); compensate(inSyllabusVocabulary); String inSyllabusVocabularyText = formatPartOfSpeech(inSyllabusVocabulary); Set<Word> notInSyllabusVocabulary = notInSyllabusVocabulary(words); String notInSyllabusVocabularyText = formatPartOfSpeech(notInSyllabusVocabulary); LOGGER.info(formatPartOfSpeechType(words)); try{ Files.write(Paths.get("src/main/resources/part_of_speech_in_syllabus_vocabulary.txt"), inSyllabusVocabularyText.getBytes("utf-8")); Files.write(Paths.get("src/main/resources/part_of_speech_not_in_syllabus_vocabulary.txt"), notInSyllabusVocabularyText.getBytes("utf-8")); Files.write(Paths.get("src/main/resources/group_part_of_speech_in_syllabus_vocabulary.txt"), HtmlFormatter.toHtmlForPartOfSpeech(group(inSyllabusVocabulary)).getBytes("utf-8")); Files.write(Paths.get("src/main/resources/group_part_of_speech_not_in_syllabus_vocabulary.txt"), HtmlFormatter.toHtmlForPartOfSpeech(group(notInSyllabusVocabulary)).getBytes("utf-8")); }catch (Exception e) { LOGGER.error(e.getMessage(), e); } } private static Map<String, Set<String>> group(Set<Word> words){ Map<String, Set<String>> data = new HashMap<>(); words.forEach(w -> { w.getPartOfSpeeches().forEach(pos -> { data.putIfAbsent(pos, new HashSet<>()); data.get(pos).add(w.getWord()); }); }); return data; } private static String formatPartOfSpeech(Set<Word> words){ StringBuilder text = new StringBuilder(); AtomicInteger i = new AtomicInteger(); words.forEach(w -> text.append(i.incrementAndGet()) .append("\t") .append(w.getWord()) .append("\t") .append(w.getFormatPartOfSpeeches()) .append("\n") ); text.append(formatPartOfSpeechType(words)); return text.toString(); } private static String formatPartOfSpeechType(Set<Word> words){ StringBuilder text = new StringBuilder(); Set<String> ps = new HashSet<>(); words.forEach(w -> { ps.addAll(w.getPartOfSpeeches()); }); text.append("#词性种类(").append(ps.size()).append("):").append("\n"); ps.forEach(p -> text.append("#").append(p).append("=").append(PartOfSpeech.getMeaning(p)).append("\n")); return text.toString(); } public static void compensate(Set<Word> words){ Set<Word> minus = WordSources.minus(WordSources.getSyllabusVocabulary(), words); LOGGER.debug("本地文件中没有的考纲词数:"+minus.size()); minus.forEach(w -> { LOGGER.debug(w.getWord()); Word word = parseWord(w.getWord()); if(word!=null && !word.getPartOfSpeeches().isEmpty()){ words.add(word); } }); } public static Word parseWord(String word){ try { return parseWord(Jsoup.parse(new URL("http://www.iciba.com/" + word), 15000).html(), word); }catch (Exception e){ LOGGER.error("解析词性出错", e); } return null; } public static void main(String[] args){ //parseWord("up"); //parseWord("like"); //parseWord("nothing"); parseWord(); } }