/**
*
* APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川,
* yang-shangchuan@qq.com
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package org.apdplat.superword.extract;
import org.apache.commons.lang.StringUtils;
import org.apdplat.superword.model.Word;
import org.apdplat.superword.tools.*;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileSystem;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
/**
 * Extracts compound words built with a single hyphen (for example
 * {@code well-known}) from text files, directories and ZIP archives, counts
 * their occurrences and can optionally check each candidate against the iCIBA
 * online dictionary before writing the result files.
 *
 * @author 杨尚川
 */
public class HyphenExtractor {
    private HyphenExtractor(){}

    private static final Logger LOGGER = LoggerFactory.getLogger(HyphenExtractor.class);
    private static final String ICIBA = "http://www.iciba.com/";
    private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
    private static final String ENCODING = "gzip, deflate";
    private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
    private static final String CONNECTION = "keep-alive";
    private static final String HOST = "www.iciba.com";
    private static final String REFERER = "http://www.iciba.com/";
    private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0";
    private static final String COLLINS_DEFINITION_CSS_PATH = "html body.bg_main div#layout div#center div#main_box div#dict_main div.collins div#dict_tab_101.tab_content.tab_authorities div.part_main div.collins_content div.collins_en_cn div.caption";
    /** Text that appears in the returned page when the site rejects the client for requesting too often. */
    private static final String REQUEST_BLOCKED_MARKER = "非常抱歉,来自您ip的请求异常频繁";
    /** Shared generator for the cache-busting query parameter; one instance avoids re-seeding on every request. */
    private static final Random RANDOM = new Random();

    /**
     * Counts hyphenated words found under {@code path}, dispatching on what the
     * path denotes: a name ending in {@code .zip} is read as an archive, an
     * existing directory is walked recursively, anything else is read as a
     * single text file.
     *
     * @param path location of a ZIP archive, a directory or a text file
     * @return map from lower-cased hyphenated word to its occurrence count
     */
    public static Map<String, AtomicInteger> parse(String path){
        if (path.endsWith(".zip")) {
            return parseZip(path);
        }
        if (Files.isDirectory(Paths.get(path))) {
            return parseDir(path);
        }
        return parseFile(path);
    }

    /**
     * Walks {@code dir} recursively and accumulates the hyphenated-word counts
     * of every regular file below it. An I/O failure during the walk is logged
     * and whatever was collected up to that point is returned.
     *
     * @param dir directory to walk
     * @return map from lower-cased hyphenated word to its total occurrence count
     */
    public static Map<String, AtomicInteger> parseDir(String dir) {
        Map<String, AtomicInteger> data = new HashMap<>();
        LOGGER.info("开始解析目录:{}", dir);
        try {
            Files.walkFileTree(Paths.get(dir), new SimpleFileVisitor<Path>() {
                @Override
                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                    merge(data, parseFile(file.toFile().getAbsolutePath()));
                    return FileVisitResult.CONTINUE;
                }
            });
        } catch (IOException e) {
            LOGGER.error("解析文本出错", e);
        }
        return data;
    }

    /**
     * Opens {@code zipFile} as a file system and accumulates the
     * hyphenated-word counts of every entry in it.
     *
     * <p>Entries are read directly from the archive, so no temporary copy on
     * the local disk is needed. A failure while reading one entry is logged and
     * the remaining entries are still processed; a failure to open or walk the
     * archive itself is logged and the counts gathered so far are returned.
     *
     * @param zipFile path of the ZIP archive
     * @return map from lower-cased hyphenated word to its total occurrence count
     */
    public static Map<String, AtomicInteger> parseZip(String zipFile){
        Map<String, AtomicInteger> data = new HashMap<>();
        LOGGER.info("开始解析ZIP文件:{}", zipFile);
        try (FileSystem fs = FileSystems.newFileSystem(Paths.get(zipFile), WordClassifier.class.getClassLoader())) {
            for (Path root : fs.getRootDirectories()) {
                LOGGER.info("处理目录:{}", root);
                Files.walkFileTree(root, new SimpleFileVisitor<Path>() {
                    @Override
                    public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                        LOGGER.info("处理文件:{}", file);
                        try (InputStream in = new BufferedInputStream(Files.newInputStream(file))) {
                            countHyphenWords(in, data);
                        } catch (IOException e) {
                            // One unreadable entry must not abort the whole archive.
                            LOGGER.error("解析文本出错", e);
                        }
                        return FileVisitResult.CONTINUE;
                    }
                });
            }
        } catch (IOException | RuntimeException e) {
            LOGGER.error("解析文本出错", e);
        }
        return data;
    }

    /**
     * Counts the hyphenated words in a single text file. The file is decoded as
     * UTF-8. An I/O failure is logged and the counts gathered so far are
     * returned.
     *
     * @param file path of the text file
     * @return map from lower-cased hyphenated word to its occurrence count
     */
    public static Map<String, AtomicInteger> parseFile(String file){
        Map<String, AtomicInteger> data = new HashMap<>();
        LOGGER.info("开始解析文件:{}", file);
        try (InputStream in = new BufferedInputStream(new FileInputStream(file))) {
            countHyphenWords(in, data);
        } catch (IOException e) {
            LOGGER.error("解析文本出错", e);
        }
        return data;
    }

    /**
     * Reads {@code in} line by line as UTF-8 text and adds every hyphenated word
     * it finds to {@code data}. The caller owns and closes the stream.
     *
     * <p>A whitespace-separated token qualifies when splitting it on
     * {@code '-'} yields exactly two parts, each longer than one character and
     * each accepted by {@code WordSources.isEnglish}.
     *
     * @param in   stream to read
     * @param data accumulator that receives the counts
     * @throws IOException if reading from the stream fails
     */
    private static void countHyphenWords(InputStream in, Map<String, AtomicInteger> data) throws IOException {
        // Explicit charset so the result does not depend on the platform default.
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
        String line;
        while ((line = reader.readLine()) != null) {
            for (String token : line.split("\\s+")) {
                if (!token.contains("-")) {
                    continue;
                }
                String[] parts = token.split("-");
                if (parts.length == 2
                        && parts[0].length() > 1
                        && parts[1].length() > 1
                        && WordSources.isEnglish(parts[0])
                        && WordSources.isEnglish(parts[1])) {
                    LOGGER.debug("发现连字符:{}", token);
                    // String.split drops trailing empty strings, so a token such as
                    // "well-known-" also reaches this point. Rebuild the key from the
                    // two validated parts so trailing hyphens never end up in it.
                    // Locale.ENGLISH keeps lower-casing independent of the default locale.
                    String key = (parts[0] + "-" + parts[1]).toLowerCase(Locale.ENGLISH);
                    data.computeIfAbsent(key, k -> new AtomicInteger()).incrementAndGet();
                }
            }
        }
    }

    /**
     * Adds every count of {@code source} to the matching counter in
     * {@code target}, creating counters that do not exist yet.
     *
     * @param target map that is updated in place
     * @param source counts to add; left unmodified
     */
    private static void merge(Map<String, AtomicInteger> target, Map<String, AtomicInteger> source) {
        source.forEach((word, count) ->
                target.computeIfAbsent(word, k -> new AtomicInteger()).addAndGet(count.get()));
    }

    /**
     * Extracts the definitions of {@code word} from a dictionary page.
     *
     * <p>Every element matched by the Collins definition CSS path whose text is
     * not blank and contains {@code word} (compared case-insensitively) is added
     * to the returned {@link Word} as a definition. Any exception raised while
     * parsing is logged and the definitions collected so far are kept.
     *
     * @param html HTML source of the dictionary page
     * @param word word whose definitions are wanted
     * @return a {@link Word} for {@code word} carrying the definitions found, possibly none
     */
    public static Word parseWord(String html, String word){
        LOGGER.info("解析单词:{}", word);
        Word w = new Word(word, "");
        try {
            String needle = word.toLowerCase(Locale.ENGLISH);
            for (Element element : Jsoup.parse(html).select(COLLINS_DEFINITION_CSS_PATH)) {
                String definition = element.text().trim();
                if (StringUtils.isNotBlank(definition)
                        && definition.toLowerCase(Locale.ENGLISH).contains(needle)) {
                    w.addDefinition(definition);
                    LOGGER.debug("解析出定义:{}", definition);
                }
            }
        } catch (Exception e) {
            LOGGER.error("解析定义出错", e);
        }
        return w;
    }

    /**
     * Fetches the iCIBA page for {@code word} and returns its HTML with all
     * line breaks removed.
     *
     * <p>A random six-digit {@code renovate} query parameter is appended to the
     * URL. Any failure is logged and an empty string is returned instead.
     *
     * @param word word to look up
     * @return page HTML without {@code \n} and {@code \r}, or {@code ""} on failure
     */
    public static String getContent(String word) {
        String url = ICIBA + word + "?renovate=" + (RANDOM.nextInt(899999) + 100000);
        LOGGER.debug("url:{}", url);
        Connection conn = Jsoup.connect(url)
                .header("Accept", ACCEPT)
                .header("Accept-Encoding", ENCODING)
                .header("Accept-Language", LANGUAGE)
                .header("Connection", CONNECTION)
                .header("Referer", REFERER)
                .header("Host", HOST)
                .header("User-Agent", USER_AGENT)
                .ignoreContentType(true);
        String html = "";
        try {
            html = conn.post().html();
            html = html.replaceAll("[\n\r]", "");
        } catch (Exception e) {
            LOGGER.error("获取URL:"+url+"页面出错", e);
        }
        return html;
    }

    /**
     * Checks whether the online dictionary knows {@code word}.
     *
     * <p>The page is fetched up to four times while the result is blank, calling
     * {@code DynamicIp.toNewIp()} before each retry. While the page contains the
     * "requests too frequent" notice the fetch is repeated, again after
     * {@code DynamicIp.toNewIp()}. The word passes when {@link #parseWord}
     * finds at least one definition in the final page.
     *
     * @param word word to verify
     * @return {@code true} if at least one definition was found
     */
    public static boolean verify(String word){
        String html = getContent(word);
        int times = 1;
        while (StringUtils.isBlank(html) && times < 4) {
            times++;
            // presumably switches the outgoing IP address — confirm against DynamicIp
            DynamicIp.toNewIp();
            html = getContent(word);
        }
        // NOTE(review): this loop has no upper bound; it only ends once the site
        // stops answering with the block notice (or a fetch fails and returns "").
        while (html.contains(REQUEST_BLOCKED_MARKER)) {
            DynamicIp.toNewIp();
            html = getContent(word);
        }
        if (StringUtils.isNotBlank(html)) {
            Word w = parseWord(html, word);
            if (!w.getDefinitions().isEmpty()) {
                LOGGER.debug("词{}验证通过", word);
                return true;
            }
        }
        LOGGER.debug("词{}验证失败", word);
        return false;
    }

    /**
     * Collects hyphenated words from the configured sources and writes three
     * result files.
     *
     * <ol>
     *   <li>{@code allPath}: every candidate as {@code count<TAB>word}, most frequent first;</li>
     *   <li>{@code wordPath}: the same format, restricted to words passing
     *       {@link #verify(String)} when {@code verify} is {@code true};</li>
     *   <li>{@code htmlPath}: the words of the second file rendered by
     *       {@code HtmlFormatter.toHtmlTableFragment} with the argument {@code 3},
     *       written as UTF-8.</li>
     * </ol>
     *
     * <p>Any exception while sorting, verifying or writing is logged; files
     * already written stay in place.
     *
     * @param allPath  output file for all candidates
     * @param wordPath output file for the accepted words
     * @param htmlPath output file for the HTML fragment
     * @param verify   whether to check every candidate against the online dictionary
     */
    public static void extract(String allPath, String wordPath, String htmlPath, boolean verify){
        // NOTE(review): the input locations below are hard-coded, machine-specific paths.
        Map<String, AtomicInteger> data = parse("/Users/apple/百度云同步盘/origin_html.zip");
        merge(data, parse("src/main/resources/it"));
        merge(data, parse("/Library/Java/JavaVirtualMachines/jdk1.8.0_11.jdk/Contents/Home/src.zip"));
        try {
            // Sort once and reuse the ranking for all outputs. Integer.compare avoids
            // the subtraction idiom, which is only safe for non-negative values.
            List<Map.Entry<String, AtomicInteger>> ranked = data
                    .entrySet()
                    .stream()
                    .sorted((a, b) -> Integer.compare(b.getValue().get(), a.getValue().get()))
                    .collect(Collectors.toList());
            Files.write(Paths.get(allPath), toLines(ranked));

            List<Map.Entry<String, AtomicInteger>> accepted = ranked
                    .stream()
                    .filter(e -> !verify || verify(e.getKey()))
                    .collect(Collectors.toList());
            Files.write(Paths.get(wordPath), toLines(accepted));

            List<String> forHtmlResult = accepted
                    .stream()
                    .map(e -> WordLinker.toLink(e.getKey()) + "(" + e.getValue() + ")")
                    .collect(Collectors.toList());
            Files.write(Paths.get(htmlPath),
                    HtmlFormatter.toHtmlTableFragment(forHtmlResult, 3).getBytes(StandardCharsets.UTF_8));
            LOGGER.info("完成");
        } catch (Exception e) {
            LOGGER.error("保存文件出错", e);
        }
    }

    /**
     * Formats each entry as {@code count<TAB>word}, keeping the list order.
     *
     * @param entries word/count pairs
     * @return one output line per entry
     */
    private static List<String> toLines(List<Map.Entry<String, AtomicInteger>> entries) {
        return entries
                .stream()
                .map(e -> e.getValue() + "\t" + e.getKey())
                .collect(Collectors.toList());
    }

    /**
     * Runs the extraction with verification enabled and the default output
     * files under {@code src/main/resources}.
     *
     * @param args ignored
     * @throws Exception declared for compatibility; {@link #extract} logs its own failures
     */
    public static void main(String[] args) throws Exception{
        extract("src/main/resources/hyphen_word_all.txt", "src/main/resources/hyphen_word.txt", "src/main/resources/hyphen.txt", true);
    }
}