/**
*
* APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川,
* yang-shangchuan@qq.com
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package org.apdplat.superword.extract;
import org.apdplat.superword.model.Word;
import org.apdplat.superword.tools.HtmlFormatter;
import org.apdplat.superword.tools.WordClassifier;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.*;
/**
 * Extracts roots and affixes (prefixes and suffixes) from HTML pages.
 *
 * <p>The input is line-oriented text: every line is a word, the separator
 * {@code 杨尚川}, and the HTML of a page with all line breaks removed.
 * Input can be a single such file, a directory tree of such files, or a
 * zip archive containing such files.
 *
 * <p>All parse methods log failures and return whatever was collected so
 * far instead of throwing.
 *
 * @author 杨尚川
 */
public class RootAffixExtractor {
    private static final Logger LOGGER = LoggerFactory.getLogger(RootAffixExtractor.class);

    /** CSS path selecting one element per root/affix entry on a page. */
    private static final String ROOT_AFFIX_CSS_PATH = "html body.bg_main div#layout div#center div#main_box div#dict_main div.simple div#dict_content_6.dict_content.vCigen div.industry_box div.industry";
    /** Selector (relative to an entry) of the element holding the root/affix text. */
    private static final String WORD = "h4";
    /** Selector (relative to an entry) of the element holding the meaning. */
    private static final String MEANING = "div.vCigen_h4";
    /** Selector (relative to an entry) of the links to other words listed for the entry. */
    private static final String OTHER_WORDS = "ul.dl_show li > a";

    /** Separator between word and HTML on input lines; also used when writing root_affix.txt. */
    private static final String FIELD_SEPARATOR = "杨尚川";
    /** Input used by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_SOURCE = "/Users/apple/百度云同步盘/origin_html.zip";
    /** Labels that prefix the entry text and decide which output group it belongs to. */
    private static final String ROOT_LABEL = "词根:";
    private static final String PREFIX_LABEL = "前缀:";
    private static final String SUFFIX_LABEL = "后缀:";

    private RootAffixExtractor() {
    }

    /**
     * Parses {@code path}, dispatching on what it denotes: a name ending in
     * {@code .zip} is handled by {@link #parseZip(String)}, a directory by
     * {@link #parseDir(String)}, anything else by {@link #parseFile(String)}.
     *
     * @param path zip archive, directory or single file
     * @return root/affix mapped to the words associated with it
     */
    public static Map<Word, Set<Word>> parse(String path) {
        if (path.endsWith(".zip")) {
            return parseZip(path);
        }
        if (Files.isDirectory(Paths.get(path))) {
            return parseDir(path);
        }
        return parseFile(path);
    }

    /**
     * Parses every file below {@code dir} and merges the results.
     *
     * @param dir directory to walk recursively
     * @return merged root/affix to words mapping; partial if the walk failed
     */
    public static Map<Word, Set<Word>> parseDir(String dir) {
        Map<Word, Set<Word>> merged = new HashMap<>();
        LOGGER.info("开始解析目录:{}", dir);
        try {
            Files.walkFileTree(Paths.get(dir), new SimpleFileVisitor<Path>() {
                @Override
                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
                    merge(merged, parsePath(file.toAbsolutePath()));
                    return FileVisitResult.CONTINUE;
                }
            });
        } catch (IOException e) {
            LOGGER.error("解析文本出错", e);
        }
        return merged;
    }

    /**
     * Parses every entry of a zip archive and merges the results.
     *
     * <p>Entries are read directly from the zip file system, so no temporary
     * copy on the local disk is needed.
     *
     * @param zipFile path of the zip archive
     * @return merged root/affix to words mapping; partial if reading failed
     */
    public static Map<Word, Set<Word>> parseZip(String zipFile) {
        Map<Word, Set<Word>> merged = new HashMap<>();
        LOGGER.info("开始解析ZIP文件:{}", zipFile);
        try (FileSystem fs = FileSystems.newFileSystem(Paths.get(zipFile), WordClassifier.class.getClassLoader())) {
            for (Path rootDir : fs.getRootDirectories()) {
                LOGGER.info("处理目录:{}", rootDir);
                Files.walkFileTree(rootDir, new SimpleFileVisitor<Path>() {
                    @Override
                    public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
                        LOGGER.info("处理文件:{}", file);
                        merge(merged, parsePath(file));
                        return FileVisitResult.CONTINUE;
                    }
                });
            }
        } catch (IOException | RuntimeException e) {
            // Runtime failures (e.g. no provider for the archive) are logged as well,
            // so that this method never throws.
            LOGGER.error("解析文本出错", e);
        }
        return merged;
    }

    /**
     * Parses a single text file in which each line is
     * {@code word + "杨尚川" + html}.
     *
     * @param file path of the file on the default file system
     * @return root/affix mapped to the line's word plus the other words
     *         listed for that root/affix; partial if reading failed
     */
    public static Map<Word, Set<Word>> parseFile(String file) {
        return parsePath(Paths.get(file));
    }

    /**
     * Parses one file given as a {@link Path}, which may belong to any file
     * system (local disk or zip archive). The file is decoded as UTF-8
     * regardless of the platform default charset.
     */
    private static Map<Word, Set<Word>> parsePath(Path file) {
        Map<Word, Set<Word>> roots = new HashMap<>();
        LOGGER.info("开始解析文件:{}", file);
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(Files.newInputStream(file), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] attr = line.split(FIELD_SEPARATOR);
                if (attr.length != 2) {
                    LOGGER.error("解析文本失败,文本应该以'杨尚川'分割,前面是词,后面是网页,网页内容是去除换行符之后的一整行文本:{}", line);
                    continue;
                }
                String word = attr[0];
                LOGGER.info("解析单词:{}", word);
                for (Map.Entry<Word, Set<Word>> entry : parseRootAffix(attr[1]).entrySet()) {
                    Set<Word> words = roots.computeIfAbsent(entry.getKey(), k -> new HashSet<>());
                    // The word whose page was parsed belongs to the root/affix too.
                    words.add(new Word(word, ""));
                    words.addAll(entry.getValue());
                }
            }
        } catch (IOException e) {
            LOGGER.error("解析文本出错", e);
        }
        return roots;
    }

    /** Adds every word set of {@code source} to the set with the same key in {@code target}. */
    private static void merge(Map<Word, Set<Word>> target, Map<Word, Set<Word>> source) {
        source.forEach((key, words) -> target.computeIfAbsent(key, k -> new HashSet<>()).addAll(words));
    }

    /**
     * Parses the roots and affixes contained in one HTML page. A single page
     * may contain several of them.
     *
     * <p>An entry lacking its title or meaning element is skipped with a
     * warning; the remaining entries of the page are still parsed.
     *
     * @param html page content
     * @return root/affix (with its meaning) mapped to the other words listed
     *         for it; empty if nothing could be parsed
     */
    public static Map<Word, Set<Word>> parseRootAffix(String html) {
        Map<Word, Set<Word>> data = new HashMap<>();
        try {
            for (Element element : Jsoup.parse(html).select(ROOT_AFFIX_CSS_PATH)) {
                Element title = element.select(WORD).first();
                Element description = element.select(MEANING).first();
                if (title == null || description == null) {
                    LOGGER.warn("词根词缀缺少标题或释义,已跳过");
                    continue;
                }
                String rootAffix = title.text().trim();
                String meaning = description.text().replaceAll("[\n\r]", "").trim();
                // Everything from the first "//" onwards is not part of the meaning.
                int index = meaning.indexOf("//");
                if (index != -1) {
                    meaning = meaning.substring(0, index);
                }
                LOGGER.info("解析出词根词缀:{}{}", rootAffix, meaning);
                Set<Word> words = data.computeIfAbsent(new Word(rootAffix, meaning), k -> new HashSet<>());
                for (Element otherWord : element.select(OTHER_WORDS)) {
                    String w = otherWord.text().trim();
                    LOGGER.info("解析出新词:{}", w);
                    words.add(new Word(w, ""));
                }
            }
        } catch (Exception e) {
            LOGGER.error("解析词根词缀出错", e);
        }
        return data;
    }

    /**
     * Parses {@code source} and writes four files under
     * {@code src/main/resources}: the sorted list of all roots/affixes with
     * their meanings, and one HTML table fragment each for roots, prefixes
     * and suffixes together with their words.
     *
     * @param source zip archive, directory or file accepted by {@link #parse(String)}
     */
    private static void parseRootAffixes(String source) {
        Map<Word, Set<Word>> rootAffixes = parse(source);

        List<String> lines = new ArrayList<>(rootAffixes.size());
        rootAffixes.keySet().stream()
                .sorted()
                .forEachOrdered(r -> lines.add(r.getWord() + FIELD_SEPARATOR + r.getMeaning()));

        Map<Word, List<Word>> roots = new HashMap<>();
        Map<Word, List<Word>> prefixes = new HashMap<>();
        Map<Word, List<Word>> suffixes = new HashMap<>();
        rootAffixes.forEach((key, words) -> {
            String text = key.getWord();
            if (text.startsWith(ROOT_LABEL)) {
                roots.put(withoutLabel(key, ROOT_LABEL), new ArrayList<>(words));
            } else if (text.startsWith(PREFIX_LABEL)) {
                prefixes.put(withoutLabel(key, PREFIX_LABEL), new ArrayList<>(words));
            } else if (text.startsWith(SUFFIX_LABEL)) {
                suffixes.put(withoutLabel(key, SUFFIX_LABEL), new ArrayList<>(words));
            }
        });

        String rootsHtml = HtmlFormatter.toHtmlTableFragmentForRootAffix(roots, 6);
        String prefixesHtml = HtmlFormatter.toHtmlTableFragmentForRootAffix(prefixes, 6);
        String suffixesHtml = HtmlFormatter.toHtmlTableFragmentForRootAffix(suffixes, 6);
        try {
            Files.write(Paths.get("src/main/resources/root_affix.txt"), lines);
            Files.write(Paths.get("src/main/resources/roots_with_words.txt"), rootsHtml.getBytes(StandardCharsets.UTF_8));
            Files.write(Paths.get("src/main/resources/prefixes_with_words.txt"), prefixesHtml.getBytes(StandardCharsets.UTF_8));
            Files.write(Paths.get("src/main/resources/suffixes_with_words.txt"), suffixesHtml.getBytes(StandardCharsets.UTF_8));
        } catch (IOException e) {
            LOGGER.error(e.getMessage(), e);
        }
    }

    /** Returns a copy of {@code key} whose text has the leading {@code label} removed. */
    private static Word withoutLabel(Word key, String label) {
        return new Word(key.getWord().substring(label.length()), key.getMeaning());
    }

    /**
     * Command-line entry point.
     *
     * @param args optional: {@code args[0]} is the zip archive, directory or
     *             file to parse; a built-in default path is used when absent
     */
    public static void main(String[] args) {
        parseRootAffixes(args.length > 0 ? args[0] : DEFAULT_SOURCE);
    }
}