/* * Copyright 2010 John R. Hicks * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.determinato.feeddroid.parser; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; import java.text.SimpleDateFormat; import java.util.Date; import java.util.HashMap; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.apache.http.impl.cookie.DateParseException; import org.apache.http.impl.cookie.DateUtils; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import android.content.ContentResolver; import android.content.ContentValues; import android.database.SQLException; import android.net.Uri; import android.os.Handler; import android.text.TextUtils; import android.util.Log; import com.determinato.feeddroid.provider.FeedDroid; import com.determinato.feeddroid.util.FeedDroidUtils; /** * Class to parse RSS feeds from the internet and store * RSS channels and posts in the database. * * @author John R. Hicks <john@determinato.com> * */ public class RssParser extends DefaultHandler { private static final String TAG = "RssParser"; private static final int STATE_IN_ITEM = (1 << 2); private static final int STATE_IN_ITEM_TITLE = (1 << 3); private static final int STATE_IN_ITEM_LINK = (1 << 4); private static final int STATE_IN_ITEM_DESC = (1 << 5); private static final int STATE_IN_ITEM_DATE = (1 << 6); private static final int STATE_IN_ITEM_AUTHOR = (1 << 7); private static final int STATE_IN_TITLE = (1 << 8); private static final int STATE_MEDIA_CONTENT = (1 << 9); private static final int STATE_IN_IMAGE = (1 << 10); private static HashMap<String, Integer> mStateMap; private Handler mHandler; private String mRssUrl; private ContentResolver mResolver; private ChannelPost mPostBuf; private long mId; private long mFolderId; private int mState; static { mStateMap = new HashMap<String, Integer>(); mStateMap.put("item", STATE_IN_ITEM); mStateMap.put("entry", STATE_IN_ITEM); mStateMap.put("title", STATE_IN_ITEM_TITLE); mStateMap.put("link", STATE_IN_ITEM_LINK); mStateMap.put("feedburner:origLink", STATE_IN_ITEM_LINK); mStateMap.put("description", STATE_IN_ITEM_DESC); mStateMap.put("summary", STATE_IN_ITEM_DESC); mStateMap.put("content", STATE_IN_ITEM_DESC); mStateMap.put("content:encoded", STATE_IN_ITEM_DESC); mStateMap.put("dc:date", STATE_IN_ITEM_DATE); mStateMap.put("updated", STATE_IN_ITEM_DATE); mStateMap.put("modified", STATE_IN_ITEM_DATE); mStateMap.put("pubDate", STATE_IN_ITEM_DATE); mStateMap.put("dc:author", STATE_IN_ITEM_AUTHOR); mStateMap.put("author", STATE_IN_ITEM_AUTHOR); mStateMap.put("name", STATE_IN_ITEM_AUTHOR); mStateMap.put("media:content", STATE_MEDIA_CONTENT); mStateMap.put("enclosure", STATE_MEDIA_CONTENT); mStateMap.put("image", STATE_IN_IMAGE); } /** * Constructor. * @param resolver ContentResolver to gain access to the database. */ public RssParser(ContentResolver resolver) { super(); mResolver = resolver; } /** * Persists RSS item to the database. * @param handler Handler to notify main application thread of parser events * @param id ID of channel * @param rssurl URL of channel * @return ID * @throws Exception */ public long syncDb(Handler handler, long id, String rssurl) throws Exception { mHandler = handler; return syncDb(id, rssurl); } /** * Persists RSS item to the database. * @param id * @param rssurl RSS feed URL. * @return ID * @throws Exception */ public long syncDb(long id, String rssurl) throws Exception { long folderId = 1; return syncDb(id, folderId, rssurl); } /** * Persists RSS item to the database. * @param id item ID * @param folderId ID of containing folder * @param rssurl URL of RSS feed * @return long containing ID of inserted item * @throws Exception */ public long syncDb(long id, long folderId, String rssurl) throws Exception { mId = id; mFolderId = folderId; mRssUrl = rssurl; SAXParserFactory factory = SAXParserFactory.newInstance(); SAXParser parser = factory.newSAXParser(); XMLReader reader = parser.getXMLReader(); reader.setContentHandler(this); reader.setErrorHandler(this); URL url = new URL(mRssUrl); URLConnection c = url.openConnection(); // TODO: Is this a known user agent, or do I need to come up with my own? c.setRequestProperty("User-Agent", "Android/m3-rc37a"); try { BufferedReader bufReader = new BufferedReader(new InputStreamReader(c.getInputStream()), 65535); reader.parse(new InputSource(bufReader)); } catch (NullPointerException e) { Log.e(TAG, Log.getStackTraceString(e)); Log.e(TAG, "Failed to load URL" + url.toString()); } return mId; } /** * Returns indication of updated RSS feed icon. * @param id ID of channel * @param iconUrl RSS channel icon URL * @return true if updated, false otherwise * @throws MalformedURLException */ public boolean updateFavicon(long id, String iconUrl) throws MalformedURLException { return updateFavicon(id, new URL(iconUrl)); } /** * Returns indication of updated RSS feed icon. * @param id ID of channel * @param iconUrl RSS channel icon URL * @return true if updated, false otherwise */ public boolean updateFavicon(long id, URL iconUrl) { InputStream in = null; OutputStream out = null; boolean r = false; Uri iconUri = FeedDroid.Channels.CONTENT_URI .buildUpon() .appendPath(String.valueOf(id)) .appendPath("icon") .build(); try { in = iconUrl.openStream(); out = mResolver.openOutputStream(iconUri); byte[] b = new byte[1024]; int n; while ((n = in.read(b)) != -1) out.write(b, 0, n); r = true; } catch (Exception e) { Log.d(TAG, Log.getStackTraceString(e)); } finally { try { if (in != null) in.close(); if (out != null) out.close(); } catch (IOException e) {} } return r; } /** * {@inheritDoc} */ public void startElement(String uri, String name, String qName, Attributes attrs) { if (mId == -1 && name.equals("title") && (mState & STATE_IN_ITEM) == 0) { mState |= STATE_IN_TITLE; return; } Integer state = mStateMap.get(name); if (state != null) { mState |= state.intValue(); if (state.intValue() == STATE_IN_ITEM) mPostBuf = new ChannelPost(); else if ((mState & STATE_IN_ITEM) != 0 && state.intValue() == STATE_IN_ITEM_LINK) { String href = attrs.getValue("href"); if (href != null) mPostBuf.link = href; } else if ((mState & STATE_IN_ITEM) != 0 && state.intValue() == STATE_MEDIA_CONTENT) { String url = attrs.getValue("url"); String type = attrs.getValue("type"); if (!TextUtils.isEmpty(url) && FeedDroidUtils.isPodcast(type)) { Log.d(TAG, "Podcast: " + url); mPostBuf.podcastUrl = url; mPostBuf.podcastMimeType = type; } } } } /** * {@inheritDoc} */ public void endElement(String uri, String name, String qName) { Integer state = mStateMap.get(name); if (state != null) { mState &= ~(state.intValue()); if (state.intValue() == STATE_IN_ITEM) { if (mId == -1) { Log.d(TAG, "</item> found before feed title."); return; } ContentValues values = new ContentValues(); values.put(FeedDroid.Posts.CHANNEL_ID, mId); values.put(FeedDroid.Posts.TITLE, mPostBuf.title); values.put(FeedDroid.Posts.URL, mPostBuf.link); values.put(FeedDroid.Posts.AUTHOR, mPostBuf.author); values.put(FeedDroid.Posts.DATE, mPostBuf.getDate()); values.put(FeedDroid.Posts.BODY, reEncodeHtml(mPostBuf.desc)); if (!TextUtils.isEmpty(mPostBuf.podcastUrl)) { values.put(FeedDroid.Posts.PODCAST_URL, mPostBuf.podcastUrl); values.put(FeedDroid.Posts.PODCAST_MIME_TYPE, mPostBuf.podcastMimeType); } try { mResolver.insert(FeedDroid.Posts.CONTENT_URI, values); } catch (SQLException e) { // Eating the exception since it's likely due to a duplicate post. } } } } /** * {@inheritDoc} */ public void characters(char[] ch, int start, int length) { // Are we in the Channel or in a Post? if ((mId == -1) && (mState & STATE_IN_TITLE) != 0) { ContentValues values = new ContentValues(); values.put(FeedDroid.Channels.TITLE, new String(ch, start, length)); values.put(FeedDroid.Channels.URL, mRssUrl); values.put(FeedDroid.Channels.FOLDER_ID, mFolderId); Uri added = mResolver.insert(FeedDroid.Channels.CONTENT_URI, values); mId = Long.parseLong(added.getPathSegments().get(1)); mState &= ~STATE_IN_TITLE; return; } if ((mState & STATE_IN_ITEM) == 0) return; StringBuilder str = new StringBuilder(); switch(mState) { case STATE_IN_ITEM | STATE_IN_ITEM_TITLE: str.append(new String(ch, start, length).trim()); if (mPostBuf.title == null) mPostBuf.title = str.toString(); else mPostBuf.title += str.toString(); break; case STATE_IN_ITEM | STATE_IN_ITEM_DESC: str.append(new String(ch, start, length).trim()); if (mPostBuf.desc == null) mPostBuf.desc = str.toString(); else mPostBuf.desc += str.toString(); break; case STATE_IN_ITEM | STATE_IN_ITEM_LINK: mPostBuf.link = new String(ch, start, length).trim(); break; case STATE_IN_ITEM | STATE_IN_ITEM_DATE: mPostBuf.setDate(new String(ch, start, length).trim()); break; case STATE_IN_ITEM | STATE_IN_ITEM_AUTHOR: mPostBuf.author = new String(ch, start, length).trim(); if (mPostBuf.author == null) mPostBuf.author = ""; break; default: } } /** * Examines string and replaces XML-escaped HTML entities with * their appropriate equivalents. * @param str String to examine * @return String with proper HTML elements */ private String reEncodeHtml(String str) { StringBuilder builder = new StringBuilder(); if (str == null) return ""; String[] sources = new String[] {"<![CDATA[", "]]>", ">", "<", "&", "’", "“", "”"}; String[] dests = new String[] {"", "", ">", "<", "&", "'", "\"", "\""}; builder.append(TextUtils.replace(str, sources, dests)); return builder.toString(); } /** * Container for RSS posts * */ private class ChannelPost { public String title; public Date date; public String desc; public String link; public String author; public String podcastUrl; public String podcastMimeType; public ChannelPost() {} private String strDate; public void setDate(String str) { try { date = DateUtils.parseDate(str); } catch (DateParseException e) { Log.i(TAG, "Unable to parse date. Defaulting to system."); date = new Date(); } if (date == null) date = new Date(); SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); strDate = dateFormat.format(date); } public String getDate() { return strDate; } } /** * {@inheritDoc} */ @Override public void endDocument() throws SAXException { Log.d(TAG, "Parsing of " + mRssUrl + " finished."); super.endDocument(); } /** * {@inheritDoc} */ @Override public void startDocument() throws SAXException { Log.d(TAG, "Parsing RSS XML: " + mRssUrl); super.startDocument(); } }