package reptile.gif;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import utils.HttpUtils;
import utils.OfficeUtils;

/**
 * Crawls the paginated GIF listing of hnbang.com and stores the extracted
 * records in batch files under {@code temp/reptile/gif/hnbang}.
 *
 * <p>Pages are downloaded one at a time; the records of every
 * {@link #PAGES_PER_FILE} pages are written to one file named after the last
 * page of that batch ({@code page600.txt}, {@code page700.txt}, ...). Whatever
 * remains after the last full batch is written to {@code page_last.txt}.
 */
public class Hnbang {

    /** Root URL of the crawled site. */
    private static final String HOST_URL = "http://hnbang.com";

    /** Format string of a listing page URL; the placeholder is the page number. */
    private static final String IMAGE_PAGE_URL = HOST_URL + "/page/%s";

    /** First listing page to download (inclusive). */
    private static final int FIRST_PAGE = 501;

    /** Last listing page to download (inclusive). */
    private static final int LAST_PAGE = 780;

    /** Number of listing pages whose records share one output file. */
    private static final int PAGES_PER_FILE = 100;

    public static void main(String[] args) throws Exception {
        savePages();
    }

    /**
     * Downloads the configured page range and writes the records in batches.
     *
     * <p>A batch whose output file already exists is skipped as a whole, so an
     * interrupted run can be restarted without downloading finished batches
     * again. The trailing partial batch is always downloaded and
     * {@code page_last.txt} is always rewritten.
     *
     * @throws Exception if downloading, parsing or saving fails
     */
    private static void savePages() throws Exception {
        File path = new File("temp" + File.separator + "reptile"
                + File.separator + "gif" + File.separator + "hnbang");
        // Whether OfficeUtils.saveCVS creates missing parent directories is not
        // visible from this file, so make sure the target directory exists.
        if (!path.isDirectory() && !path.mkdirs()) {
            throw new IllegalStateException("cannot create output directory: " + path);
        }

        List<GifInfo> infos = new ArrayList<>();
        for (int page = FIRST_PAGE; page <= LAST_PAGE; page++) {
            // Last page of the batch this page belongs to (next multiple of
            // PAGES_PER_FILE that is >= page); the batch file carries its number.
            int batchEnd = ((page + PAGES_PER_FILE - 1) / PAGES_PER_FILE) * PAGES_PER_FILE;
            File batchFile = new File(path, "page" + batchEnd + ".txt");
            if (batchFile.exists()) {
                // Batch already saved by an earlier run: jump to its last page,
                // the loop increment then moves on to the next batch.
                page = batchEnd;
                continue;
            }

            infos.addAll(getAllGif(String.format(IMAGE_PAGE_URL, page)));

            if (page == batchEnd) {
                OfficeUtils.saveCVS(infos, batchFile);
                infos.clear();
            }
        }

        // Records collected after the last complete batch.
        OfficeUtils.saveCVS(infos, new File(path, "page_last.txt"));
    }

    /**
     * Downloads one listing page and extracts a {@link GifInfo} for every
     * {@code article} element that contains an image.
     *
     * <p>Articles without an {@code img} element are skipped. A page without a
     * {@code content} container yields an empty list, and an unreadable like
     * count is recorded as {@code 0}, so that a single malformed page or
     * article does not abort the whole crawl.
     *
     * @param appUrl URL of the listing page
     * @return the records found on the page, possibly empty, never {@code null}
     * @throws Exception if the page cannot be downloaded
     */
    private static List<GifInfo> getAllGif(String appUrl) throws Exception {
        System.out.println("get gif ... url = " + appUrl);
        List<GifInfo> infos = new ArrayList<>();

        String response = HttpUtils.getString(appUrl);
        if (response == null) {
            // Defensive: HttpUtils is defined elsewhere; guard in case it
            // yields null, which Jsoup.parse does not accept.
            System.err.println("empty response, page skipped: " + appUrl);
            return infos;
        }

        Document document = Jsoup.parse(response);
        Elements containers = document.getElementsByClass("content");
        if (containers.isEmpty()) {
            System.err.println("no content container, page skipped: " + appUrl);
            return infos;
        }

        for (Element article : containers.get(0).getElementsByTag("article")) {
            Elements images = article.getElementsByTag("img");
            if (images.isEmpty()) {
                continue;
            }

            GifInfo info = new GifInfo();
            // The note text carries a fixed "click for full size" hint that is
            // not part of the title.
            info.title = firstText(article.getElementsByClass("note"))
                    .replace("猛击图片查看大图!", "").trim();
            info.imgUrl = images.get(0).attr("data-original");
            info.favCount = parseFavCount(firstText(
                    article.getElementsByAttributeValueEnding("class", "post-like")));
            info.tag = article.getElementsByClass("post-tags").text()
                    .replace("标签:", "").replace("gif", "").replace(" ", "");
            infos.add(info);
        }
        return infos;
    }

    /** Returns the text of the first matched element, or {@code ""} if there is none. */
    private static String firstText(Elements elements) {
        return elements.isEmpty() ? "" : elements.get(0).text();
    }

    /**
     * Extracts the number from a like label by removing the label prefix, the
     * closing parenthesis and spaces.
     *
     * @param likeText text of the like element
     * @return the parsed count, or {@code 0} if no number can be read
     */
    private static int parseFavCount(String likeText) {
        String digits = likeText.replace("赞 (", "").replace(")", "").replace(" ", "");
        try {
            return Integer.parseInt(digits);
        } catch (NumberFormatException ignored) {
            // A missing or reformatted counter must not abort the crawl.
            return 0;
        }
    }

    /**
     * One crawled GIF entry. The fields are public and mutable; this crawler
     * fills in {@code title}, {@code tag}, {@code imgUrl} and {@code favCount}
     * and leaves the remaining fields at their defaults.
     */
    public static class GifInfo {
        /** Title. */
        public String title;

        /** Tags. */
        public String tag;

        /** Thumbnail image URL (not populated by this crawler). */
        public String thumbnailImgUrl;

        /** URL of the animated image. */
        public String imgUrl;

        /** Number of likes / favorites. */
        public int favCount;

        /** Number of comments (not populated by this crawler). */
        public int commentCount;
    }
}