package org.cneng.httpclient;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.DataInputStream;
import java.io.FileInputStream;
import static org.cneng.httpclient.DateUtil.toDate;
/**
 * Parsing helpers built on jsoup (HTML) and fastjson (JSON) for the
 * company-registry pages this package scrapes.
 *
 * @author XiongNeng
 * @version 1.0
 * @since 2015/2/4
 */
public class JSoupUtil {
    private static final Logger _log = LoggerFactory.getLogger(JSoupUtil.class);

    /** Leading part of the selector for "the td that follows the th whose text matches a regex". */
    private static final String HEADER_CELL_PREFIX =
            "table[class=detailsList]:eq(0) > tbody > tr >th:matches(";

    /** Trailing part of the selector started by {@link #HEADER_CELL_PREFIX}. */
    private static final String HEADER_CELL_SUFFIX = ") + td";

    /**
     * Extracts the detail-page link from a search-result page.
     *
     * <p>The link is the {@code href} of the first child of the first element
     * whose {@code class} attribute is exactly {@code font16}. An href that is
     * not already absolute has its first three characters dropped and is then
     * appended to {@code QueryManager.getInstance().getHomepageUrl()}.
     *
     * @param htmlContent html of the search-result page, may be {@code null}
     * @return {@code null} when the input is {@code null} or no usable link is
     *         found; the literal {@code "404"} when the page contains the
     *         "no matching record" text; otherwise the absolute link
     * @throws Exception declared for callers; jsoup parsing itself does not
     *         throw checked exceptions
     */
    public static String parseLink(String htmlContent) throws Exception {
        if (htmlContent == null) {
            return null;
        }
        if (htmlContent.contains("暂未查询到相关记录")) {
            return "404";
        }
        Document doc = Jsoup.parse(htmlContent);
        Elements candidates = doc.getElementsByAttributeValue("class", "font16");
        if (candidates.isEmpty()) {
            return null;
        }
        Element holder = candidates.get(0);
        if (holder.children().isEmpty()) {
            // No child element to read the href from; previously this threw
            // IndexOutOfBoundsException from child(0).
            return null;
        }
        String href = holder.child(0).attr("href");
        if (href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        _log.info("获取的相对链接href={}", href);
        if (href.length() < 3) {
            // Empty or too short to strip the prefix below; previously this
            // threw StringIndexOutOfBoundsException from substring(3).
            return null;
        }
        // NOTE(review): the first three characters are dropped unconditionally,
        // presumably a leading "../" — confirm against the real pages.
        return QueryManager.getInstance().getHomepageUrl() + href.substring(3);
    }

    /**
     * Extracts the business premises / address from a company detail page.
     *
     * <p>Reads the {@code td} that follows the first {@code th} whose text
     * ends with "所" in the first {@code detailsList} table.
     *
     * @param htmlContent html of the detail page, may be {@code null}
     * @return the cell text, or {@code null} when the input is {@code null} or
     *         the page has no such cell
     * @throws Exception declared for callers; jsoup parsing itself does not
     *         throw checked exceptions
     */
    public static String parseLocation(String htmlContent) throws Exception {
        if (htmlContent == null) {
            return null;
        }
        Document doc = Jsoup.parse(htmlContent);
        Element location = doc.select(
                "table[class=detailsList]:eq(0) > tbody > tr >th:matches(所$):eq(0) + td").first();
        if (location == null) {
            // Page layout did not match; previously this was a NullPointerException.
            return null;
        }
        String result = location.text();
        _log.info("location={}", result);
        return result;
    }

    /**
     * Fills {@code c} with the company details found in a detail page.
     *
     * <p>Each field is set only when its header cell is present in the page;
     * the stockholder field is always set from {@code investorHtml} (possibly
     * to {@code null} or an empty string). When the registration status is
     * present and is not "存续", the result type is set to {@code 2}.
     *
     * @param htmlContent  html of the company detail page
     * @param investorHtml body of the separate investor request, either an
     *                     html page or a JSON document; may be blank
     * @param c            company object to populate
     * @throws Exception if {@code toDate} rejects the registration date text
     */
    public static void parseCompany(String htmlContent, String investorHtml, Company c) throws Exception {
        Document doc = Jsoup.parse(htmlContent);

        // Company name
        String name = headerCellText(doc, "^名称$");
        if (name != null) {
            c.setCompanyName(name);
        }

        // Registration number
        String taxno = headerCellText(doc, "^注册号$");
        if (taxno != null) {
            c.setTaxno(taxno);
        }

        // Legal representative.
        // NOTE(review): the alternation is not grouped, so "^" binds only to the
        // first alternative and "$" only to the last; the middle ones match
        // anywhere in the header text. Left unchanged — confirm the intent
        // before tightening to ^(...)$.
        String lawPerson = headerCellText(doc, "^法定代表人|负责人|投资人|经营者$");
        if (lawPerson != null) {
            c.setLawPerson(lawPerson);
        }

        // Date of establishment (same un-grouped alternation caveat as above).
        String regDate = headerCellText(doc, "^成立日期|注册日期$");
        if (regDate != null) {
            c.setRegDate(toDate(regDate));
        }

        // Domicile: any header ending in "所"
        String location = headerCellText(doc, "所$");
        if (location != null) {
            c.setLocation(location);
        }

        // Business scope
        String business = headerCellText(doc, "^经营范围$");
        if (business != null) {
            c.setBusiness(business);
        }

        // Shareholders / promoters come from a separate request's response.
        c.setStockholder(fetchInvestor(investorHtml));

        // Registration status. The guard must test the status cell itself: the
        // earlier code tested the business-scope cell, which crashed when the
        // status was missing and skipped it when the business scope was missing.
        String status = headerCellText(doc, "^登记状态$");
        if (status != null) {
            c.setStatus(status);
            if (!"存续".equals(status)) {
                c.setResultType(2); // no longer a valid (active) company
            }
        }
    }

    /**
     * Returns the text of the first {@code td} that immediately follows a
     * {@code th} whose text matches {@code headerRegex} in the first
     * {@code detailsList} table, or {@code null} when there is none.
     */
    private static String headerCellText(Document doc, String headerRegex) {
        Element cell = doc.select(HEADER_CELL_PREFIX + headerRegex + HEADER_CELL_SUFFIX).first();
        return cell == null ? null : cell.text();
    }

    /**
     * Builds a "/"-separated list of investor names from the investor response.
     *
     * <p>If the text contains both {@code <html} and {@code touziren} it is
     * parsed as html and the second cell of every row of
     * {@code table#touziren} is collected; otherwise it is parsed as JSON and
     * the {@code inv} value of every entry of {@code investorList} is collected.
     *
     * @param investorHtml html or JSON text, may be blank
     * @return {@code null} for blank input or when the JSON yields no names;
     *         an empty string when the html table yields no names; otherwise
     *         the joined names
     */
    private static String fetchInvestor(String investorHtml) {
        if (StringUtil.isBlank(investorHtml)) {
            return null;
        }
        if (investorHtml.contains("<html") && investorHtml.contains("touziren")) {
            Document doc = Jsoup.parse(investorHtml);
            Elements nameCells = doc.select("table#touziren tr >td:eq(1)");
            StringBuilder names = new StringBuilder();
            for (Element cell : nameCells) {
                names.append(cell.text()).append("/");
            }
            return dropTrailingSeparator(names);
        }

        JSONObject root = JSON.parseObject(investorHtml);
        JSONArray investors = (root == null) ? null : root.getJSONArray("investorList");
        if (investors == null) {
            // JSON without an investor list; previously a NullPointerException.
            return null;
        }
        StringBuilder names = new StringBuilder();
        for (Object entry : investors) {
            JSONObject investor = (JSONObject) entry;
            names.append(investor.getString("inv")).append("/");
        }
        String joined = dropTrailingSeparator(names);
        return joined.isEmpty() ? null : joined;
    }

    /** Returns the builder's content without its final character, or "" when it is empty. */
    private static String dropTrailingSeparator(StringBuilder sb) {
        if (sb.length() == 0) {
            return "";
        }
        return sb.substring(0, sb.length() - 1);
    }

    /**
     * Manual smoke test: runs the investor extraction on a sample html
     * fragment and prints the joined names.
     */
    public static void main(String[] args) throws Exception {
        String investorHtml = "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">" +
                "<html xmlns=\"http://www.w3.org/1999/xhtml\">" +
                "<table cellpadding=\"0\" cellspacing=\"0\" class=\"detailsList\" id=\"touziren\" style=\"\" >\n" +
                " <tr>\n" +
                " <td width=\"20%\" style=\"text-align:center;\">自然人股东</td>\n" +
                " <td width=\"20%\" style=\"text-align:center;\">李芳生</td>\n" +
                " <td width=\"25%\" style=\"text-align:center;\"></td>\n" +
                " <td width=\"25%\" style=\"text-align:center;\">不公示</td>\n" +
                " <td width=\"10%\" style=\"text-align:center;\">\n" +
                " <a href=\"#\" onclick=\"return alert('该公司的股东及出资信息在2014年3月1日后发生变化的,股东详情企业自主公示');\">详情</a>\n" +
                " </td>\n" +
                " </tr>\n" +
                " <tr>\n" +
                " <td width=\"20%\" style=\"text-align:center;\">自然人股东</td>\n" +
                " <td width=\"20%\" style=\"text-align:center;\">李晓毅</td>\n" +
                " <td width=\"25%\" style=\"text-align:center;\"></td>\n" +
                " <td width=\"25%\" style=\"text-align:center;\">不公示</td>\n" +
                " <td width=\"10%\" style=\"text-align:center;\">\n" +
                " <a href=\"#\" onclick=\"return alert('该公司的股东及出资信息在2014年3月1日后发生变化的,股东详情企业自主公示');\">详情</a>\n" +
                " </td>\n" +
                " </tr>\n" +
                " </table>\n" +
                "</html>";
        // Exercise the same code path production uses instead of a copy of it.
        System.out.println(fetchInvestor(investorHtml));
    }
}