package com.withiter.common.parser;

import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.withiter.models.merchant.Category;
import com.withiter.models.merchant.Merchant;
import com.withiter.utils.ExceptionUtil;
import com.withiter.utils.StringUtils;

/**
 * Command-line scraper that walks a koubei.com merchant listing, pages through
 * each category and downloads the merchant thumbnail images to a local folder.
 *
 * <p>All state is kept in static fields, so the class is not safe for
 * concurrent use.
 */
public class ParserTest {

    private static final Logger logger = LoggerFactory.getLogger(ParserTest.class);

    private static final Map<String, Merchant> merchantMap = new HashMap<String, Merchant>();
    private static final Set<Category> categorySet = new HashSet<Category>();
    private static final List<String> menuList = new ArrayList<String>();

    /** Matches "name(count)": group 1 is the category name, group 2 the digits. */
    private static final Pattern cateAndNumberPattern = Pattern.compile("(.*)\\(([\\d]*)\\)");
    /** Matches "open-close": group 1 is the text before the dash(es), group 2 the text after. */
    private static final Pattern openAndCloseTimePattern = Pattern.compile("([^-]*)[-]+(.*)");
    /** Characters replaced by '_' before scraped text is used as a file name. */
    private static final Pattern ILLEGAL_FILE_NAME_CHARS = Pattern.compile("[\\\\/:*?\"<>|]");

    /** Categories that {@link #categoryPage} does not crawl. */
    private static final String[] SKIPPED_CATEGORIES = { "本帮菜", "中餐菜系", "火锅", "西餐", "韩国料理", "粤菜馆",
            "日本料理", "面包蛋糕", "甜品饮品", "湘菜", "小吃快餐" };

    /** Closing tags that {@link #parseListChild} repairs, in processing order. */
    private static final String[] REPAIRED_CLOSING_TAGS = { "span", "a", "div", "em", "p" };

    private static final int MERCHANTS_PER_PAGE = 10;
    private static final int MAX_PAGES = 100;
    private static final long PAGE_DELAY_MILLIS = 1000L;
    private static final int IMAGE_TIMEOUT_MILLIS = 20000;
    private static final int COPY_BUFFER_SIZE = 1024;
    private static final String IMAGE_DIR = "c:/testimage/";

    /** Name of the category most recently parsed; used as a prefix for saved image files. */
    public static String categoryName = "";

    /**
     * Loads the top-level listing page, records the category menu links and
     * starts crawling the category block found on it.
     *
     * @param url address of the listing page
     * @throws IOException if a page cannot be fetched
     * @throws InterruptedException if the delay between page requests is interrupted
     */
    private static void mainListPage(String url) throws IOException, InterruptedException {
        logger.info(ParserTest.class.getName() + ": mainListPage start");
        Document doc = fetch(url);

        // get list of all categories
        Elements allCategories = doc.select("div[class=repast list_nav]");
        if (!checkElementsSize(allCategories)) {
            logger.warn(ParserTest.class.getName() + ": no category navigation found at " + url);
            return;
        }
        Element nav = allCategories.get(0);

        Elements list = nav.getElementsByAttributeValue("class", "list");
        if (checkElementsSize(list)) {
            logger.info(ParserTest.class.getName() + ": All categories url start");
            for (Element item : list.first().children()) {
                Element link = item.children().first();
                if (link == null) {
                    // An entry without a child element carries no link to record.
                    continue;
                }
                String menuUrl = link.attr("href");
                menuList.add(menuUrl);
                logger.info(menuUrl);
            }
            logger.info(ParserTest.class.getName() + ": All categories url end");
        }

        // The category block is expected to be the second child of the navigation element.
        if (menuList.isEmpty() || nav.children().size() < 2) {
            logger.warn(ParserTest.class.getName() + ": unexpected navigation layout at " + url);
            return;
        }
        categoryPage(menuList.get(0), nav.child(1));
        logger.info(ParserTest.class.getName() + ": mainListPage end");
    }

    /**
     * Crawls every category listed under {@code e}. Each child is expected to
     * read "name(count)"; children that do not match, or whose count is not a
     * valid integer, are skipped, as are the names in {@link #SKIPPED_CATEGORIES}.
     *
     * @param url currently unused; kept so the signature stays unchanged
     * @param e element whose children describe one category each
     * @throws IOException if a listing page cannot be fetched
     * @throws InterruptedException if the delay between page requests is interrupted
     */
    private static void categoryPage(String url, Element e) throws IOException, InterruptedException {
        for (Element node : e.children()) {
            Matcher cateAndNumberMatcher = cateAndNumberPattern.matcher(node.text());
            String cate = null; // cuisine / category name
            int number = 0; // number of merchants in the category
            while (cateAndNumberMatcher.find()) {
                int parsed;
                try {
                    parsed = Integer.parseInt(cateAndNumberMatcher.group(2));
                } catch (NumberFormatException ex) {
                    // The pattern also accepts "()" and digit runs too long for an int.
                    logger.warn("ignoring category with unparsable count: " + node.text());
                    continue;
                }
                cate = cateAndNumberMatcher.group(1);
                number = parsed;
                categorySet.add(new Category(cate, number));
                categoryName = cate;
            }
            if (cate == null || isSkippedCategory(cate)) {
                continue;
            }

            int pages = (number % MERCHANTS_PER_PAGE) != 0 ? (number / MERCHANTS_PER_PAGE) + 1
                    : (number / MERCHANTS_PER_PAGE);
            pages = (pages > MAX_PAGES) ? MAX_PAGES : pages;

            String categoryUrl = node.select("a[href]").attr("href");
            if (StringUtils.isEmpty(categoryUrl)) {
                logger.warn("category " + cate + " has no link, skipping");
                continue;
            }

            // NOTE(review): the CSV output was never implemented; only the log lines remain.
            logger.info("start to write to file /Users/user/c/quhao/data/shangjia/" + cate + ".csv");
            for (int i = 1; i <= pages; i++) {
                parseListChild(formatUrlWithPage(categoryUrl, i));
                logger.info("writing to file /Users/user/c/quhao/data/shangjia/" + cate + ".csv");
                merchantMap.clear();
                Thread.sleep(PAGE_DELAY_MILLIS);
            }
            logger.info("end to write to file /Users/user/c/quhao/data/shangjia/" + cate + ".csv");
        }
    }

    /** Returns whether {@code cate} equals, ignoring case, one of {@link #SKIPPED_CATEGORIES}. */
    private static boolean isSkippedCategory(String cate) {
        for (String skipped : SKIPPED_CATEGORIES) {
            if (cate.equalsIgnoreCase(skipped)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Appends the page suffix to a category URL.
     *
     * @param url category URL
     * @param page page number
     * @return {@code url} followed by {@code "--page-"} and {@code page}
     */
    private static String formatUrlWithPage(String url, int page) {
        return url + "--page-" + page;
    }

    /**
     * @param es selection to test, may be {@code null}
     * @return {@code true} if {@code es} is non-null and holds at least one element
     */
    private static boolean checkElementsSize(Elements es) {
        return es != null && !es.isEmpty();
    }

    /** Fetches and parses {@code url} with the request timeout set to 0. */
    private static Document fetch(String url) throws IOException {
        Connection conn = Jsoup.connect(url);
        conn.timeout(0);
        return conn.get();
    }

    /** Returns the trimmed text of the first element of {@code es}, or {@code null} if there is none. */
    private static String firstText(Elements es) {
        return checkElementsSize(es) ? es.first().text().trim() : null;
    }

    /**
     * Fills {@code m} from a merchant detail page. Only dd.taobao.com and
     * detail.koubei.com pages are understood; any other URL leaves {@code m}
     * untouched. A field is assigned only when its element is present.
     *
     * @param url detail page address; {@code null} or empty is ignored
     * @param m merchant to populate
     * @throws IOException if the page cannot be fetched
     */
    private static void parseDetails(String url, Merchant m) throws IOException {
        if (StringUtils.isEmpty(url)) {
            return;
        }
        boolean taobao = url.startsWith("http://dd.taobao.com/");
        boolean koubei = url.startsWith("http://detail.koubei.com/");
        if (!taobao && !koubei) {
            return;
        }
        Document doc = fetch(url);

        String name = firstText(doc.getElementsByAttributeValue("class", "shop-name"));
        if (name != null) {
            m.name = name;
        }
        String nickName = firstText(doc.getElementsByAttributeValue("class", "shop-pos"));
        if (nickName != null) {
            m.nickName = nickName;
        }

        // The two sites differ only in where the address and the phone number live.
        if (taobao) {
            String address = firstText(doc.getElementsByAttributeValue("class", "shop-addr"));
            if (address != null) {
                m.address = address;
            }
        }
        if (koubei) {
            Elements shopAddress = doc.select("dd[title]");
            if (checkElementsSize(shopAddress) && shopAddress.size() > 1) {
                // The address is read from the second dd[title] element.
                m.address = shopAddress.get(1).text().trim();
            }
        }
        String telephone = firstText(doc.getElementsByAttributeValue("class", taobao ? "shop-tel" : "strong-tel"));
        if (telephone != null) {
            m.telephone = new String[] { telephone };
        }

        String openCloseTime = firstText(doc.getElementsByAttributeValue("class", "time-num"));
        if (!StringUtils.isEmpty(openCloseTime)) {
            Matcher timeMatcher = openAndCloseTimePattern.matcher(openCloseTime);
            while (timeMatcher.find()) {
                m.openTime = timeMatcher.group(1);
                m.closeTime = timeMatcher.group(2);
            }
        }
    }

    /**
     * Parses the list items on a category listing page and downloads the
     * thumbnail image of each merchant found.
     *
     * @param url address of the listing page
     * @throws IOException if the page cannot be fetched
     */
    private static void parseListChild(String url) throws IOException {
        Document doc = fetch(url);

        // Drop the lazy-load textarea wrappers and turn the HTML-escaped
        // markup (&lt; &gt; &quot;) back into real tags so it can be re-parsed.
        String docHtml = doc.html()
                .replaceAll("<textarea class=\"ks-datalazyload hidden\">", "")
                .replaceAll("</textarea>", "")
                .replaceAll("&lt;", "<")
                .replaceAll("&gt;", ">")
                .replaceAll("&quot;", "\"");
        // Strip the '>' from these closing tags, then re-append it everywhere,
        // so that closing tags which arrived without a '>' gain one.
        for (String tag : REPAIRED_CLOSING_TAGS) {
            docHtml = docHtml.replaceAll("</" + tag + ">", "</" + tag);
        }
        for (String tag : REPAIRED_CLOSING_TAGS) {
            docHtml = docHtml.replaceAll("</" + tag, "</" + tag + ">");
        }
        doc = Jsoup.parse(docHtml);

        Elements es = doc.select("li[class=clearfix place-item]");
        for (Element e : es) {
            Merchant m = new Merchant();
            // image element
            String src = e.select("div[class=photo]").select("img").attr("src");
            m.name = e.select("div[class=clearfix]").text();
            saveImageFromURL(m.name, src);
        }
    }

    /**
     * Downloads the image at {@code url} into {@value #IMAGE_DIR}. The file is
     * named {@code <categoryName>_<merchantName>_<last URL segment>} with
     * characters that are illegal in file names replaced by {@code '_'}.
     * Failures are logged and never propagated.
     *
     * @param merchantName merchant the image belongs to
     * @param url address of the image; {@code null} or empty is ignored
     */
    public static void saveImageFromURL(String merchantName, String url) {
        if (url == null || url.length() == 0) {
            logger.warn("no image url for merchant " + merchantName);
            return;
        }
        HttpURLConnection conn = null;
        InputStream is = null;
        OutputStream os = null;
        try {
            URLConnection raw = new URL(url).openConnection();
            if (!(raw instanceof HttpURLConnection)) {
                logger.warn("not an http(s) image url: " + url);
                return;
            }
            conn = (HttpURLConnection) raw;
            conn.setConnectTimeout(IMAGE_TIMEOUT_MILLIS);
            conn.setReadTimeout(IMAGE_TIMEOUT_MILLIS);
            conn.connect();
            is = conn.getInputStream();

            File folder = new File(IMAGE_DIR);
            if (!folder.isDirectory() && !folder.mkdirs()) {
                logger.error("cannot create image folder " + folder);
                return;
            }
            String fileName = url.substring(url.lastIndexOf("/") + 1);
            // The merchant name is scraped text: neutralise path separators and
            // other characters that are illegal in file names.
            String safeName = ILLEGAL_FILE_NAME_CHARS.matcher(categoryName + "_" + merchantName + "_" + fileName)
                    .replaceAll("_");
            os = new FileOutputStream(new File(folder, safeName));

            byte[] bytes = new byte[COPY_BUFFER_SIZE];
            int count;
            while ((count = is.read(bytes, 0, COPY_BUFFER_SIZE)) != -1) {
                os.write(bytes, 0, count);
            }
        } catch (MalformedURLException e) {
            logger.error(ExceptionUtil.getTrace(e));
        } catch (FileNotFoundException e) {
            logger.error(ExceptionUtil.getTrace(e));
        } catch (IOException e) {
            logger.error(ExceptionUtil.getTrace(e));
        } finally {
            closeQuietly(os);
            closeQuietly(is);
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /** Closes {@code c} if it is non-null, logging instead of throwing on failure. */
    private static void closeQuietly(Closeable c) {
        if (c == null) {
            return;
        }
        try {
            c.close();
        } catch (IOException e) {
            logger.warn(ExceptionUtil.getTrace(e));
        }
    }

    /**
     * Entry point: crawls the Shanghai listing.
     *
     * @param args ignored
     * @throws IOException if a page cannot be fetched
     * @throws InterruptedException if the delay between page requests is interrupted
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        // Alternative entry page:
        // "http://bendi.koubei.com/list.htm?spm=0.0.0.0.1NrzyZ&city=310100";
        String baseUrl = "http://bendi.koubei.com/shanghai/list--c1-1000243?spm=5026.1000614.0.0.jySz57";
        mainListPage(baseUrl);
    }
}