/**
*
* APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川,
* yang-shangchuan@qq.com
*
* This program is free software: you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
* Foundation, either version 3 of the License, or (at your option) any later
* version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package org.apdplat.superword.extract;
import org.apache.commons.lang.StringUtils;
import org.apdplat.superword.model.SynonymAntonym;
import org.apdplat.superword.model.Word;
import org.apdplat.superword.tools.ProxyIp;
import org.eclipse.jetty.util.ConcurrentHashSet;
import org.jsoup.Connection;
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
* 汉语同义词反义词提取工具
* @author 杨尚川
*/
public class ChineseSynonymAntonymExtractor {
// Every member of this class is static; the private constructor prevents instantiation.
private ChineseSynonymAntonymExtractor(){}
private static final Logger LOGGER = LoggerFactory.getLogger(ChineseSynonymAntonymExtractor.class);
// CSS selector handed to Jsoup to locate the synonym/antonym container of a dictionary page.
private static final String SYNONYM_ANTONYM_CSS_PATH = "html body.bg_main div#layout div#center div#main_box div#dict_main div.simple div#dict_content_3.dict_content div.industry_box div.industry.cn_synon_box";
// Values of the HTTP request headers that getContent attaches to every request.
private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
private static final String ENCODING = "gzip, deflate";
private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3";
private static final String CONNECTION = "keep-alive";
private static final String HOST = "www.iciba.com";
private static final String REFERER = "http://www.iciba.com/";
// User-Agent values; getContent picks one per request, indexed by the uac counter below.
private static final List<String> USER_AGENTS = Arrays.asList("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:27.0) Gecko",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36",
"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.117 Safari",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.102 Safari/537.36 OPR"
);
// Request counter; incremented by getContent on every call to select the next User-Agent.
private static final AtomicInteger uac = new AtomicInteger();
// Word -> space-separated antonyms; filled by the batch parser and written out by save().
private static final Map<String, String> ANTONYM = new ConcurrentHashMap<>();
// Pool on which the batch parser submits its periodic save task.
private static final ExecutorService EXECUTOR_SERVICE = Executors.newCachedThreadPool();
// Words that have been processed in this run; save() merges them into CHECKED_WORDS_PATH.
private static final Set<String> CHECKED_WORDS = new ConcurrentHashSet<>();
// Used to merge different entries: word -> its synonyms, written out by save().
private static final Map<Word, Set<Word>> SYNONYM_MAP = new ConcurrentHashMap<>();
// File listing the words that were already checked.
private static final Path CHECKED_WORDS_PATH = Paths.get("src/main/resources/checked_words.txt");
// Output file for synonym records.
private static final Path CHINESE_SYNONYM = Paths.get("src/main/resources/chinese_synonym.txt");
// Output file for antonym records.
private static final Path CHINESE_ANTONYM = Paths.get("src/main/resources/chinese_antonym.txt");
/**
 * Extracts the synonyms and antonyms of {@code word} from the HTML of its dictionary page.
 * <p>
 * Every element matched by {@code SYNONYM_ANTONYM_CSS_PATH} is read as a sequence of
 * (label, content) child pairs. The content following a "同义词" label is collected as
 * synonyms, the content following a "反义词" label as antonyms. Tokens shorter than two
 * characters or containing a non-Chinese character are dropped; a synonym equal to
 * {@code word} itself is dropped as well.
 * <p>
 * Any exception raised while parsing is logged and the (possibly partially filled)
 * result is returned, so this method never returns {@code null}.
 *
 * @param html HTML of the dictionary page
 * @param word the word the page belongs to
 * @return the synonyms and antonyms found for {@code word}
 */
public static SynonymAntonym parseSynonymAntonym(String html, String word){
    SynonymAntonym synonymAntonym = new SynonymAntonym();
    synonymAntonym.setWord(new Word(word, ""));
    try {
        for (Element element : Jsoup.parse(html).select(SYNONYM_ANTONYM_CSS_PATH)) {
            int size = element.children().size();
            LOGGER.debug("element size:{}", size);
            // Children come in (label, content) pairs; a trailing unpaired child is ignored.
            for (int i = 0; i + 1 < size; i += 2) {
                String type = element.child(i).text();
                LOGGER.debug("type:{}", type);
                if ("同义词".equals(type)) {
                    String synonym = element.child(i + 1).text();
                    LOGGER.debug("synonym:{}", synonym);
                    for (Word w : extractChineseWords(synonym, word)) {
                        synonymAntonym.addSynonym(w);
                    }
                } else if ("反义词".equals(type)) {
                    String antonym = element.child(i + 1).text();
                    LOGGER.debug("antonym:{}", antonym);
                    // Antonyms are not compared against the looked-up word.
                    for (Word w : extractChineseWords(antonym, null)) {
                        synonymAntonym.addAntonym(w);
                    }
                }
            }
        }
        if (!synonymAntonym.getAntonym().isEmpty() || !synonymAntonym.getSynonym().isEmpty()) {
            LOGGER.info("解析出同义词反义词:{}", synonymAntonym);
        }
    } catch (Exception e) {
        LOGGER.error("解析同义词反义词出错", e);
    }
    return synonymAntonym;
}

/**
 * Splits whitespace-separated text into words, keeping only tokens that are at least two
 * characters long, consist solely of Chinese characters and differ from {@code excluded}.
 *
 * @param text     whitespace-separated candidate words
 * @param excluded a word to leave out, or {@code null} to exclude nothing
 * @return the accepted words in their order of appearance
 */
private static List<Word> extractChineseWords(String text, String excluded) {
    List<Word> accepted = new ArrayList<>();
    for (String token : text.split("\\s+")) {
        if (token.length() < 2) {
            continue;
        }
        if (isNotChineseChar(token)) {
            LOGGER.debug("非中文字符:{}", token);
            continue;
        }
        if (token.equals(excluded)) {
            continue;
        }
        LOGGER.debug("word:{}", token);
        accepted.add(new Word(token, ""));
    }
    return accepted;
}
/**
 * Looks up every word of {@code words} on iciba.com and records its synonyms and antonyms.
 * <p>
 * Words listed in {@code CHECKED_WORDS_PATH} are skipped. The remaining words are processed
 * with a parallel stream; results are collected in {@code SYNONYM_MAP} and {@code ANTONYM}.
 * While the batch is running, a background task calls {@code save()} once a minute. When
 * the batch is finished the background task is told to stop, a final {@code save()} is
 * performed and both output files are de-duplicated.
 *
 * @param words the words to look up
 */
public static void parseSynonymAntonym(List<String> words){
    LOGGER.info("开始解析,词数:{}", words.size());
    Set<String> skipWords = new ConcurrentSkipListSet<>();
    try {
        if (Files.notExists(CHECKED_WORDS_PATH)) {
            CHECKED_WORDS_PATH.toFile().createNewFile();
        }
        skipWords.addAll(Files.readAllLines(CHECKED_WORDS_PATH));
    } catch (Exception e) {
        LOGGER.error("读取文件失败", e);
    }
    int total = words.size() - skipWords.size();
    LOGGER.info("之前已经解析的词数:{}", skipWords.size());
    LOGGER.info("现在还需解析的词数:{}", total);
    String url = "http://www.iciba.com/";
    AtomicInteger progress = new AtomicInteger();
    // Released once the batch is done so the periodic saver below terminates instead of
    // looping forever on a non-daemon pool thread (which would keep the JVM alive).
    CountDownLatch finished = new CountDownLatch(1);
    EXECUTOR_SERVICE.submit(() -> {
        try {
            // await() returns false on timeout (save and keep going) and true once the
            // batch has finished (stop; the caller performs the final save itself).
            while (!finished.await(60, TimeUnit.SECONDS)) {
                save();
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag and end the task.
            Thread.currentThread().interrupt();
        }
    });
    try {
        words.parallelStream().forEach(word -> processWord(word, url, skipWords, total, progress));
    } finally {
        finished.countDown();
    }
    save();
    filterSameRecord(CHINESE_SYNONYM);
    filterSameRecord(CHINESE_ANTONYM);
}

/**
 * Fetches and parses the page of a single word and stores the result.
 * <p>
 * A word is added to {@code CHECKED_WORDS} only when a usable page was obtained; if the
 * page is still blank or still shows the rate-limit notice after the retries, the word is
 * left unchecked so that a later run can try it again.
 *
 * @param rawWord   the word as read from the input list
 * @param url       base URL the word is appended to
 * @param skipWords words that were already checked in an earlier run
 * @param total     number of words expected to be processed, used for progress logging
 * @param progress  shared counter of words started so far
 */
private static void processWord(String rawWord, String url, Set<String> skipWords, int total, AtomicInteger progress) {
    try {
        String word = rawWord.trim();
        // Checked words are stored trimmed, so look up the trimmed form as well.
        if (skipWords.contains(rawWord) || skipWords.contains(word)) {
            return;
        }
        LOGGER.info("进度:{}/{} 来自线程:{}", total, progress.incrementAndGet(), Thread.currentThread());
        if (word.isEmpty() || isNotChineseChar(word) || word.length() < 2) {
            return;
        }
        String pageUrl = url + word;
        String blockedNotice = "非常抱歉,来自您ip的请求异常频繁";
        String html = getContent(pageUrl);
        int times = 1;
        while (StringUtils.isBlank(html) && times < 3) {
            times++;
            // Switch to a new IP address before retrying.
            ProxyIp.toNewIp();
            html = getContent(pageUrl);
        }
        if (StringUtils.isBlank(html)) {
            LOGGER.error("获取页面失败:{}", pageUrl);
            return;
        }
        times = 1;
        while (html.contains(blockedNotice) && times < 3) {
            times++;
            // Switch to a new IP address before retrying.
            ProxyIp.toNewIp();
            html = getContent(pageUrl);
        }
        if (StringUtils.isBlank(html) || html.contains(blockedNotice)) {
            // Still no usable page: do not mark the word as checked.
            LOGGER.error("获取页面失败:{}", pageUrl);
            return;
        }
        SynonymAntonym synonymAntonym = parseSynonymAntonym(html, word);
        if (!synonymAntonym.getSynonym().isEmpty()) {
            SYNONYM_MAP.put(synonymAntonym.getWord(), synonymAntonym.getSynonym());
        }
        if (!synonymAntonym.getAntonym().isEmpty()) {
            StringBuilder str = new StringBuilder();
            synonymAntonym.getAntonym().forEach(w -> str.append(w.getWord()).append(" "));
            ANTONYM.put(word, str.toString().trim());
        }
        CHECKED_WORDS.add(word);
    } catch (Exception e) {
        LOGGER.error("错误:", e);
    }
}
/**
 * Merges the in-memory results with the records already on disk and rewrites the synonym
 * file, the antonym file and the checked-words file.
 * <p>
 * Each synonym record is the word followed by its sorted synonyms, separated by spaces;
 * each antonym record is the word followed by its antonyms. Records are written in sorted
 * order. The method is synchronized because it is called both by the periodic background
 * task and at the end of a batch. Failures are logged, not propagated.
 */
private static synchronized void save(){
    LOGGER.info("开始保存文件");
    List<String> synonymLines = null;
    List<String> antonymLines = null;
    try {
        createIfMissing(CHINESE_SYNONYM);
        createIfMissing(CHINESE_ANTONYM);
        createIfMissing(CHECKED_WORDS_PATH);

        LOGGER.info("同义词数:{}", SYNONYM_MAP.size());
        Set<String> synonymRecords = new HashSet<>();
        for (Map.Entry<Word, Set<Word>> entry : SYNONYM_MAP.entrySet()) {
            StringBuilder record = new StringBuilder();
            record.append(entry.getKey().getWord()).append(" ");
            entry.getValue().stream().sorted().forEach(w -> record.append(w.getWord()).append(" "));
            synonymRecords.add(record.toString().trim());
        }
        synonymRecords.addAll(Files.readAllLines(CHINESE_SYNONYM));
        synonymLines = synonymRecords.stream().sorted().collect(Collectors.toList());
        LOGGER.info("总的同义词数:{}", synonymLines.size());
        Files.write(CHINESE_SYNONYM, synonymLines);

        // An explicit HashSet is used because the set must accept further additions.
        Set<String> antonymRecords = new HashSet<>();
        for (Map.Entry<String, String> entry : ANTONYM.entrySet()) {
            antonymRecords.add(entry.getKey() + " " + entry.getValue());
        }
        antonymRecords.addAll(Files.readAllLines(CHINESE_ANTONYM));
        antonymLines = antonymRecords.stream().sorted().collect(Collectors.toList());
        LOGGER.info("总的反义词数:{}", antonymLines.size());
        Files.write(CHINESE_ANTONYM, antonymLines);

        CHECKED_WORDS.addAll(Files.readAllLines(CHECKED_WORDS_PATH));
        LOGGER.info("总的已检查词数:{}", CHECKED_WORDS.size());
        Files.write(CHECKED_WORDS_PATH, CHECKED_WORDS);
    } catch (Exception e) {
        // The lists are still null when the failure happened before they were built;
        // parameterized logging prints "null" instead of throwing.
        LOGGER.error("同义词:{}", synonymLines);
        LOGGER.error("反义词:{}", antonymLines);
        LOGGER.error("保存文件失败", e);
    }
}

/**
 * Creates an empty file at {@code path} when no file exists there yet.
 *
 * @param path the file to create
 * @throws java.io.IOException if the file cannot be created
 */
private static void createIfMissing(Path path) throws java.io.IOException {
    if (Files.notExists(path)) {
        path.toFile().createNewFile();
    }
}
/**
 * Requests {@code url} with a POST and returns the HTML of the response.
 * <p>
 * Each request carries browser-like headers, the next User-Agent from {@code USER_AGENTS}
 * and randomly generated {@code X-Forwarded-For}, {@code Proxy-Client-IP} and
 * {@code WL-Proxy-Client-IP} header values.
 *
 * @param url the address to request
 * @return the response HTML; the string {@code "404"} when the server answers with status
 *         404; an empty string when the request fails for any other reason
 */
public static String getContent(String url) {
    LOGGER.debug("url:{}", url);
    // floorMod keeps the index non-negative even after the counter overflows.
    String userAgent = USER_AGENTS.get(Math.floorMod(uac.incrementAndGet(), USER_AGENTS.size()));
    Connection conn = Jsoup.connect(url)
            .header("Accept", ACCEPT)
            .header("Accept-Encoding", ENCODING)
            .header("Accept-Language", LANGUAGE)
            .header("Connection", CONNECTION)
            .header("Referer", REFERER)
            .header("Host", HOST)
            .header("User-Agent", userAgent)
            .header("X-Forwarded-For", getRandomIp())
            .header("Proxy-Client-IP", getRandomIp())
            .header("WL-Proxy-Client-IP", getRandomIp())
            .ignoreContentType(true);
    try {
        return conn.post().html();
    } catch (HttpStatusException e) {
        LOGGER.error("error code:{}", e.getStatusCode());
        if (e.getStatusCode() == 404) {
            return "404";
        }
        LOGGER.error("获取URL:" + url + " 页面出错", e);
    } catch (Exception e) {
        LOGGER.error("获取URL:" + url + " 页面出错", e);
    }
    return "";
}
/**
 * Matches a single character outside the range U+4E00 to U+9FA5.
 * Compiled once because the check runs for every candidate token.
 */
private static final Pattern NON_CHINESE_CHAR = Pattern.compile("[^\u4e00-\u9fa5]");

/**
 * Tells whether {@code str} contains at least one character outside the range
 * U+4E00 to U+9FA5.
 *
 * @param str the text to inspect; must not be {@code null}
 * @return {@code true} if any such character is present, {@code false} if every character
 *         lies inside the range (which includes the empty string)
 */
public static boolean isNotChineseChar(String str){
    return NON_CHINESE_CHAR.matcher(str).find();
}
/**
 * Downloads the iciba.com page of {@code word} (15 second timeout) and parses the
 * synonyms and antonyms found on it.
 *
 * @param word the word to look up
 * @return the parsed result, or {@code null} when downloading or parsing throws
 */
public static SynonymAntonym parseSynonymAntonym(String word){
    SynonymAntonym result = null;
    try {
        URL pageUrl = new URL("http://www.iciba.com/" + word);
        String html = Jsoup.parse(pageUrl, 15000).html();
        result = parseSynonymAntonym(html, word);
    } catch (Exception e) {
        LOGGER.error("解析同义词反义词出错", e);
    }
    return result;
}
/**
 * Generates a random dotted-quad IPv4 address whose octets each lie in 1..254 and which
 * does not fall into one of the private ranges 10.0.0.0/8, 172.16.0.0/12 or
 * 192.168.0.0/16.
 *
 * @return the address as text, e.g. {@code "83.17.202.9"}
 */
public static String getRandomIp(){
    ThreadLocalRandom random = ThreadLocalRandom.current();
    int first;
    int second;
    // Re-draw both leading octets together until the pair is outside every private range,
    // so a re-draw can never slip back into a range that was checked earlier.
    do {
        first = random.nextInt(1, 255);
        second = random.nextInt(1, 255);
    } while (isPrivateRange(first, second));
    int third = random.nextInt(1, 255);
    int fourth = random.nextInt(1, 255);
    return first + "." + second + "." + third + "." + fourth;
}

/**
 * Tells whether the two leading octets identify a private IPv4 range.
 *
 * @param first  first octet
 * @param second second octet
 * @return {@code true} for 10.x, 172.16 to 172.31 and 192.168
 */
private static boolean isPrivateRange(int first, int second) {
    return first == 10
            || (first == 172 && second >= 16 && second <= 31)
            || (first == 192 && second == 168);
}
/**
 * Removes records that consist of the same words in a different order, for example
 * <pre>
 * 一丘之貉 比众不同
 * 比众不同 一丘之貉
 * </pre>
 * Only the first such record is kept. The remaining records are sorted and written back
 * to the same file. Failures are logged, not propagated.
 *
 * @param path the file to de-duplicate in place
 */
private static void filterSameRecord(Path path){
    try {
        Set<String> seenKeys = new HashSet<>();
        List<String> kept = new ArrayList<>();
        int removed = 0;
        for (String line : Files.readAllLines(path)) {
            // The sorted words of a line form an order-independent key.
            String[] tokens = line.split("\\s+");
            Arrays.sort(tokens);
            String key = Arrays.toString(tokens);
            if (seenKeys.add(key)) {
                kept.add(line);
            } else {
                removed++;
                LOGGER.info("去掉重复的记录:" + line);
            }
        }
        Collections.sort(kept);
        Files.write(path, kept);
        LOGGER.info("去掉重复的记录数:" + removed);
    } catch (Exception e) {
        LOGGER.error("去掉重复的记录出错", e);
    }
}
/**
 * Reads the word list from {@code src/main/resources/dic.txt}, orders it by word length
 * (shortest first) and runs the batch extraction on it.
 *
 * @param args not used
 * @throws Exception if the word list cannot be read
 */
public static void main(String[] args) throws Exception{
    List<String> words = Files.readAllLines(Paths.get("src/main/resources/dic.txt"))
            .stream()
            .sorted(Comparator.comparingInt(String::length))
            .collect(Collectors.toList());
    parseSynonymAntonym(words);
}
}