/** * * APDPlat - Application Product Development Platform Copyright (c) 2013, 杨尚川, * yang-shangchuan@qq.com * * This program is free software: you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free Software * Foundation, either version 3 of the License, or (at your option) any later * version. * * This program is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License along with * this program. If not, see <http://www.gnu.org/licenses/>. * */ package org.apdplat.superword.extract; import com.gargoylesoftware.htmlunit.BrowserVersion; import com.gargoylesoftware.htmlunit.WebClient; import com.gargoylesoftware.htmlunit.html.HtmlPage; import org.apache.commons.lang.StringUtils; import org.apdplat.superword.model.Word; import org.apdplat.superword.tools.DynamicIp; import org.apdplat.superword.tools.HtmlFormatter; import org.apdplat.superword.tools.ProxyIp; import org.apdplat.superword.tools.TextAnalyzer; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.net.URL; import java.nio.file.Files; import java.nio.file.Paths; import java.util.HashMap; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; /** * 每日一句提取工具 * @author 杨尚川 */ public class SentenceExtractor { private SentenceExtractor(){} private static final Logger LOGGER = LoggerFactory.getLogger(SentenceExtractor.class); private static final String SENTENCE_CSS_PATH = "html body div#content.clear div.main.fl div.reading div.r_bd.clear div.reading_rg.fr.pr div.reading_txt ul"; private static final String EN_CSS_PATH = "li.en a"; private static final String CN_CSS_PATH = "li.cn a"; private static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"; private static final String ENCODING = "gzip, deflate"; private static final String LANGUAGE = "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"; private static final String CONNECTION = "keep-alive"; private static final String USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:36.0) Gecko/20100101 Firefox/36.0"; private static final WebClient WEB_CLIENT = new WebClient(BrowserVersion.INTERNET_EXPLORER_11); private static final Map<Word, AtomicInteger> WORD_FREQUENCE = new HashMap<>(); static { WEB_CLIENT.getOptions().setThrowExceptionOnFailingStatusCode(false); WEB_CLIENT.getOptions().setThrowExceptionOnScriptError(false); WEB_CLIENT.getOptions().setJavaScriptEnabled(false); WEB_CLIENT.getOptions().setCssEnabled(false); } public static Map<String, String> extract(int totalPageNumber){ Map<String, String> sentences = new HashMap<>(); for (int i=1; i<=totalPageNumber; i++){ String url = "http://news.iciba.com/dailysentence/detail-"+i+".html"; String html = getContent(url); int times = 0; while (StringUtils.isBlank(html) && times<10){ times++; //使用新的IP地址 DynamicIp.toNewIp(); html = getContent(url); } if(StringUtils.isBlank(html)){ LOGGER.error("页面获取失败:"+url); continue; } //LOGGER.debug("获取到的HTML:" +html); while(html.contains("非常抱歉,来自您ip的请求异常频繁")){ //使用新的IP地址 DynamicIp.toNewIp(); html = getContent(url+i); } sentences.putAll(parse(html)); LOGGER.info("进度 "+totalPageNumber+"/"+i); } LOGGER.debug("期望获取句子数:" + totalPageNumber); LOGGER.debug("实际获取句子数:" + sentences.size()); return sentences; } public static Map<String, String> extract2(int totalPageNumber){ Map<String, String> sentences = new HashMap<>(); int start = 29351; int total = totalPageNumber - start; for (int i=start; i<=totalPageNumber; i++){ String url = "http://en.dict.cn/news/view/"+i; String html = getContent2(url); int times = 1; while (StringUtils.isBlank(html) && times<10){ times++; //使用新的IP地址 ProxyIp.toNewIp(); html = getContent2(url); } if(StringUtils.isBlank(html)){ LOGGER.error("页面获取失败:"+url); continue; } //LOGGER.debug("获取到的HTML:" +html); while(html.contains("非常抱歉,来自您ip的请求异常频繁")){ //使用新的IP地址 ProxyIp.toNewIp(); html = getContent2(url + i); } sentences.putAll(parse2(html)); LOGGER.info("进度 "+total+"/"+(i-start+1)); } LOGGER.debug("期望获取句子数:" + totalPageNumber); LOGGER.debug("实际获取句子数:" + sentences.size()); return sentences; } public static String getContent2(String url) { try{ LOGGER.debug("url:"+url); HtmlPage htmlPage = WEB_CLIENT.getPage(url); String html = htmlPage.asXml(); //LOGGER.debug("html:"+html); return html; }catch (Exception e) { e.printStackTrace(); LOGGER.error("获取URL:"+url+"页面出错", e); } return ""; } public static Map<String, String> parse(String html){ Map<String, String> sentences = new HashMap<>(); try { for(Element element : Jsoup.parse(html).select(SENTENCE_CSS_PATH)){ String en = null; String cn = null; Elements elements = element.select(EN_CSS_PATH); if(elements.size()==1){ en = elements.get(0).text().trim(); LOGGER.info("解析出句子英文:" + en); if(en.split("\\s+").length<2){ LOGGER.debug("不是句子,放弃"); continue; } } elements = element.select(CN_CSS_PATH); if(elements.size()==1){ cn = elements.get(0).text().trim(); LOGGER.info("解析出句子中文:" + cn); } if(StringUtils.isNotBlank(en) && StringUtils.isNotBlank(cn)){ sentences.put(en, cn); //统计词频 TextAnalyzer.seg(en).forEach(w -> { Word word = new Word(w, ""); WORD_FREQUENCE.putIfAbsent(word, new AtomicInteger()); WORD_FREQUENCE.get(word).incrementAndGet(); }); } } }catch (Exception e){ LOGGER.error("解析句子出错", e); } return sentences; } public static Map<String, String> parse2(String html){ Map<String, String> sentences = new HashMap<>(); try { Document document = Jsoup.parse(html); String title = document.select("html head title").text(); if(!title.startsWith("每日一句")){ LOGGER.error("不是每日一句:"+title); return sentences; } for(Element element : document.select("html body div#main div.main_sl div.info div.info-body")){ String en = element.child(3).text().trim(); LOGGER.info("解析出句子英文:" + en); if(en.split("\\s+").length<2){ LOGGER.debug("不是句子,放弃"); continue; } String cn = element.child(4).text().trim()+element.child(5).text().trim(); LOGGER.info("解析出句子中文:" + cn); if(StringUtils.isNotBlank(en) && StringUtils.isNotBlank(cn)){ sentences.put(en, cn); //统计词频 TextAnalyzer.seg(en).forEach(w -> { Word word = new Word(w, ""); WORD_FREQUENCE.putIfAbsent(word, new AtomicInteger()); WORD_FREQUENCE.get(word).incrementAndGet(); }); } } }catch (Exception e){ LOGGER.error("解析句子出错", e); } return sentences; } public static String getContent(String url) { LOGGER.debug("url:"+url); String html = ""; try { String host = new URL(url).getHost(); String referer = "http://"+host+"/"; Connection conn = Jsoup.connect(url) .header("Accept", ACCEPT) .header("Accept-Encoding", ENCODING) .header("Accept-Language", LANGUAGE) .header("Connection", CONNECTION) .header("Referer", referer) .header("Host", host) .header("User-Agent", USER_AGENT) .ignoreContentType(true); html = conn.post().html(); html = html.replaceAll("[\n\r]", ""); }catch (Exception e){ e.printStackTrace(); LOGGER.error("获取URL:"+url+"页面出错", e); } return html; } public static void main(String[] args) throws Exception{ Map<String, String> data = extract(1549); LOGGER.info("data 1 size:"+data.size()); String html = HtmlFormatter.toHtmlForSentence(data, WORD_FREQUENCE); Files.write(Paths.get("src/main/resources/sentences_1.txt"), html.toString().getBytes("utf-8")); Map<String, String> data2 = extract2(30364); LOGGER.info("data 2 size:"+data2.size()); String html2 = HtmlFormatter.toHtmlForSentence(data2, WORD_FREQUENCE); Files.write(Paths.get("src/main/resources/sentences_2.txt"), html2.toString().getBytes("utf-8")); data.putAll(data2); LOGGER.info("total size:"+data.size()); html = HtmlFormatter.toHtmlForSentence(data, WORD_FREQUENCE); Files.write(Paths.get("src/main/resources/sentences.txt"), html.toString().getBytes("utf-8")); } }