package org.ripple.power.news; import org.json.JSONArray; import org.json.JSONObject; import org.ripple.power.utils.HttpRequest; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import java.io.ByteArrayInputStream; import java.io.IOException; import java.net.URLEncoder; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Calendar; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; public class NewsParser { public static class News { public String title; public String url; public News(String t, String u) { this.title = t.replace(""", ""); this.url = u; } public boolean equals(Object o) { if (o == null) { return false; } if (!(o instanceof News)) { return false; } if (o == this) { return true; } if (url.equals(((News) o).url) || title.equals(((News) o).title)) { return true; } return false; } } public synchronized static List<News> getAllNew(String query) { ArrayList<News> list = new ArrayList<News>(100); List<RssItem> baidu = null; List<RssItem> google = null; List<Feedzilla> feedzilla = null; try { feedzilla = getFeedzillaRssItem(query, -1); } catch (Exception e) { } try { google = getGoogleRssItem(query); } catch (Exception e) { } try { baidu = getBaiduRssItem(query); } catch (Exception e) { } if (feedzilla != null) { for (Feedzilla feed : feedzilla) { list.add(new News(feed.title, feed.url)); } } try { feedzilla = getFeedzillaRssItem(query, 0); } catch (Exception e) { } if (feedzilla != null) { for (Feedzilla feed : feedzilla) { list.add(new News(feed.title, feed.url)); } } if (google != null) { for (RssItem item : google) { News news = new News(item.getTitle(), item.getLink().toString()); if (!list.contains(news)) { list.add(news); } } } if (baidu != null) { for (RssItem item : baidu) { News news = new News(item.getTitle(), item.getLink().toString()); if (!list.contains(news)) { list.add(news); } } } return list; } public static List<RssItem> getBaiduRssItem(String query) throws Exception { String uri = String .format("http://news.baidu.com/ns?word=%s&tn=newsrss&sr=0&cl=1&rn=20&ct=0", URLEncoder.encode(query.trim(), "gb2312")); HttpRequest request = HttpRequest.get(uri); if (request.ok()) { RssFeed feed = NewsParser.parse(request.body(), "gb2312"); return feed.getRssItems(); } return null; } public static List<RssItem> getGoogleRssItem(String query) throws Exception { String uri = String .format("http://news.google.co.kr/news?&hl=en&ie=UTF-8&q=" + URLEncoder.encode(query, "UTF-8") + "&output=rss"); HttpRequest request = HttpRequest.get(uri); if (request.ok()) { RssFeed feed = NewsParser.parse(request.body(), "UTF-8"); return feed.getRssItems(); } return null; } public static ArrayList<Feedzilla> getFeedzillaRssItem(String query, int offset) throws Exception { SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd"); Calendar cal = Calendar.getInstance(); if (offset < 0) { cal.add(Calendar.DATE, offset); } String yesterday = dateFormat.format(cal.getTime()); HttpRequest request = HttpRequest .get("http://api.feedzilla.com/v1/articles/search.json?q=" + URLEncoder.encode(query, "UTF-8") + "&order=date&since=" + yesterday + "&count=50"); if (request.ok()) { String result = request.body(); JSONObject json = new JSONObject(result); JSONArray articles = json.getJSONArray("articles"); ArrayList<Feedzilla> list = new ArrayList<Feedzilla>(50); for (int i = 0; i < articles.length(); i++) { JSONObject article = articles.getJSONObject(i); String title = article.has("title") ? article .getString("title") : null; String summary = article.has("summary") ? article .getString("summary") : null; String source = article.has("source") ? article .getString("source") : null; String sourceUrl = article.has("source") ? article .getString("source_url") : null; String published = article.has("publish_date") ? article .getString("publish_date") : null; String author = null; if (article.has("author")) { author = article.getString("author"); } String url = article.has("url") ? article.getString("url") : null; int feedzillaId = 0; if (url != null) { Pattern p = Pattern.compile("([0-9]+)"); Matcher m = p.matcher(url); if (m.find()) { feedzillaId = Integer.parseInt(m.group(1)); } } Feedzilla feed = new Feedzilla(title, published, source, sourceUrl, url, summary, author, feedzillaId); list.add(feed); } return list; } return null; } public static RssFeed parse(byte[] data, String encoding) throws IOException, SAXException, ParserConfigurationException { return parse(new String(data, encoding), encoding); } public static RssFeed parse(String data, String encoding) throws ParserConfigurationException, SAXException, IOException { SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); SAXParser saxParser = saxParserFactory.newSAXParser(); XMLReader xmlReader = saxParser.getXMLReader(); RssHandler handler = new RssHandler(); xmlReader.setContentHandler(handler); InputSource source = new InputSource(new ByteArrayInputStream( data.getBytes(encoding))); xmlReader.parse(source); return handler.getRssFeed(); } }