package com.v2ex.v2droid; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.HttpStatus; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.util.EntityUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class HtmlParser { static final String KEY_ID = "id"; static final String KEY_TITLE = "title"; static final String KEY_REPLIES = "replies"; static final String KEY_USERNAME = "username"; static final String KEY_AVATAR = "avatar"; static final String KEY_NODE = "node"; static final String KEY_HEADER_ID = "header_id"; static final String KEY_HEADER = "header"; static final String KEY_NAME = "name"; static final String KEY_LINK = "link"; // constructor public HtmlParser() { } /** * 根据URL获得所有的html信息 * * @param url * @return */ public static String getHtmlByUrl(String url) { String html = null; HttpClient httpClient = new DefaultHttpClient();// 创建httpClient对象 HttpGet httpget = new HttpGet(url);// 以get方式请求该URL try { HttpResponse responce = httpClient.execute(httpget);// 得到responce对象 int resStatu = responce.getStatusLine().getStatusCode();// 返回码 if (resStatu == HttpStatus.SC_OK) {// 200正常 其他就不对 // 获得相应实体 HttpEntity entity = responce.getEntity(); if (entity != null) { html = EntityUtils.toString(entity);// 获得html源代码 } } } catch (Exception e) { System.out.println("访问[" + url + "]出现异常!"); e.printStackTrace(); } finally { httpClient.getConnectionManager().shutdown(); } return html; } public static ArrayList<HashMap<String, String>> getTopics(String url, ArrayList<HashMap<String, String>> topics) { try { // String html = getHtmlByUrl(url); // Document doc = Jsoup.parse(html); Document doc = Jsoup.connect(url).get(); Elements items = doc.select("div[class=cell item]"); if (!items.isEmpty() && !topics.isEmpty()) { topics.remove(topics.size() - 1); } for (Element item : items) { //System.out.println("item======>" + item.toString()); Element titleElement = item.select("span[class=item_title]>a") .get(0); String href = titleElement.attr("href"); String id = getMatcher("/t/([\\d]+)", href); String replies = getMatcher("#reply([\\d]+)", href); String title = titleElement.text(); Element usernameElement = item.select("td>a").get(0); String href2 = usernameElement.attr("href"); String username = getMatcher("/member/([0-9a-zA-Z]+)", href2); Element avatarElement = usernameElement.select("img").get(0); String avatar = avatarElement.attr("src"); Element nodeElement = item.select("span[class=small fade]>a") .get(0); String node = nodeElement.text(); System.out.println(node); // creating new HashMap HashMap<String, String> map = new HashMap<String, String>(); // adding each child node to HashMap key => // value map.put(KEY_ID, id); map.put(KEY_TITLE, title); map.put(KEY_USERNAME, username); map.put(KEY_REPLIES, replies); map.put(KEY_AVATAR, avatar); map.put(KEY_NODE, node); // adding HashList to ArrayList topics.add(map); } } catch (IOException e) { System.out.println("访问[" + url + "]出现异常!"); e.printStackTrace(); } HashMap<String, String> mapMore = new HashMap<String, String>(); mapMore.put(KEY_ID, MainActivity.MORE_TAG); mapMore.put(KEY_TITLE, MainActivity.MORE_TAG); mapMore.put(KEY_USERNAME, MainActivity.MORE_TAG); mapMore.put(KEY_REPLIES, MainActivity.MORE_TAG); mapMore.put(KEY_AVATAR, MainActivity.MORE_TAG); mapMore.put(KEY_NODE, MainActivity.MORE_TAG); // adding HashList to ArrayList topics.add(mapMore); return topics; } public static ArrayList<HashMap<String, String>> getNodes(String url, ArrayList<HashMap<String, String>> nodes) { try { // String html = getHtmlByUrl(url); // Document doc = Jsoup.parse(html); Document doc = Jsoup.connect(url).get(); Elements items = doc.select("div#Wrapper") .select("div[class=content]").select("div[class=box]") .select("table"); int headerId = 0; int nodeId = 0; for (Element item : items) { Elements headerElements = item.select("span[class=fade"); if (headerElements.isEmpty()) { continue; } Element headerElement = headerElements.get(0); Elements nodeItems = item.select("a"); for (Element nodeItem : nodeItems) { HashMap<String, String> map = new HashMap<String, String>(); map.put(KEY_ID, Integer.toString(nodeId)); map.put(KEY_HEADER_ID, Integer.toString(headerId)); map.put(KEY_HEADER, headerElement.text()); map.put(KEY_NAME, nodeItem.text()); map.put(KEY_LINK, nodeItem.attr("href")); nodes.add(map); nodeId++; } headerId++; System.out.println("table===> " + headerElement.toString()); } } catch (IOException e) { System.out.println("访问[" + url + "]出现异常!"); e.printStackTrace(); } return nodes; } public static ArrayList<HashMap<String, String>> getNodeTopics(String url, String nodeName, ArrayList<HashMap<String, String>> topics) { //try { String html = getHtmlByUrl(url); System.out.println("getHtmlByUrl======>" + html); Document doc = Jsoup.parse(html); //Document doc = Jsoup.connect(url).get(); Elements items = doc.select("div#TopicsNode").select("table"); System.out.println("item======>" + items.toString()); if (!items.isEmpty() && !topics.isEmpty()) { topics.remove(topics.size() - 1); } for (Element item : items) { System.out.println("item======>" + item.toString()); Element titleElement = item.select("span[class=item_title]>a") .get(0); String href = titleElement.attr("href"); System.out.println("href======>" + href); String id = getMatcher("/t/([\\d]+)", href); System.out.println("id======>" + id); String replies = getMatcher("#reply([\\d]+)", href); System.out.println("replies======>" + replies); String title = titleElement.text(); System.out.println("title======>" + title); Element usernameElement = item.select("td>a").get(0); String href2 = usernameElement.attr("href"); System.out.println("href2======>" + href2); String username = getMatcher("/member/([0-9a-zA-Z]+)", href2); System.out.println("username======>" + username); Element avatarElement = usernameElement.select("img").get(0); String avatar = avatarElement.attr("src"); System.out.println("avatar======>" + avatar); // creating new HashMap HashMap<String, String> map = new HashMap<String, String>(); // adding each child node to HashMap key => // value map.put(KEY_ID, id); map.put(KEY_TITLE, title); map.put(KEY_USERNAME, username); map.put(KEY_REPLIES, replies); map.put(KEY_AVATAR, avatar); map.put(KEY_NODE, nodeName); // adding HashList to ArrayList topics.add(map); } //} catch (IOException e) { // System.out.println("访问[" + url + "]出现异常!"); // e.printStackTrace(); //} HashMap<String, String> mapMore = new HashMap<String, String>(); mapMore.put(KEY_ID, MainActivity.MORE_TAG); mapMore.put(KEY_TITLE, MainActivity.MORE_TAG); mapMore.put(KEY_USERNAME, MainActivity.MORE_TAG); mapMore.put(KEY_REPLIES, MainActivity.MORE_TAG); mapMore.put(KEY_AVATAR, MainActivity.MORE_TAG); mapMore.put(KEY_NODE, MainActivity.MORE_TAG); // adding HashList to ArrayList topics.add(mapMore); return topics; } public static String getMatcher(String regex, String source) { String result = ""; Pattern pattern = Pattern.compile(regex); Matcher matcher = pattern.matcher(source); while (matcher.find()) { result = matcher.group(1);// 只取第一组 } return result; } public static String getTopicOnce(String url) { System.out.println("getTopicOnce======>"); String once = ""; String html = getHtmlByUrl(url); System.out.println("html======>" + html); Document doc = Jsoup.parse(html); Element item = doc.select("input[name=once]").first(); if (item != null) { once = item.attr("value"); System.out.println("once======>" + item.attr("value")); } return once; } public static String getTopicOnce2(String html) { System.out.println("getTopicOnce======>"); String once = ""; // System.out.println("html======>" + html); Document doc = Jsoup.parse(html); Element item = doc.select("input[name=once]").first(); if (item != null) { once = item.attr("value"); System.out.println("once======>" + item.attr("value")); } return once; } }