/*******************************************************************************
* BBC News Reader
* Released under the BSD License. See README or LICENSE.
* Copyright (c) 2011, Digital Lizard (Oscar Key, Thomas Boby)
* All rights reserved.
******************************************************************************/
package com.digitallizard.bbcnewsreader.resource.web;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.util.ByteArrayBuffer;
/**
 * Static helpers that download a web page and cut the article markup out of it.
 */
public class HtmlParser {
    /** User-Agent header sent with every request; it identifies as an Android mobile browser. */
    private static final String USER_AGENT = "Mozilla/5.0 (Linux; U; Android 2.2.1; en-us; MB525 Build/3.4.2-107_JDN-9) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1";

    /** Charset used to decode downloaded pages, named explicitly so decoding does not vary by platform. */
    private static final String PAGE_CHARSET = "UTF-8";

    /** Number of bytes requested from the network stream per read. */
    private static final int READ_CHUNK_SIZE = 4096;

    /** Markers that delimit the story on a news page. */
    private static final String NEWS_START = "<div class=\"story-body\">";
    private static final String NEWS_END = "<div class=\"share-this\">";

    /** Markers that delimit the story on a sport page. */
    private static final String SPORT_START = "<article class=\"mod story\">";
    private static final String SPORT_END = "</article>";

    /**
     * Downloads the resource at the given URL and returns its raw bytes.
     *
     * @param stringUrl the address of the page to download
     * @return every byte of the response body, in order
     * @throws Exception if the URL is malformed or the connection cannot be opened or read
     */
    public static byte[] getPage(String stringUrl) throws Exception {
        URL url = new URL(stringUrl);
        URLConnection connection = url.openConnection();
        // NOTE(review): presumably this stops the platform from adding its own agent string to
        // the header set below. It changes a JVM-wide property, so it affects other code too.
        System.setProperty("http.agent", "");
        connection.setRequestProperty("User-Agent", USER_AGENT);

        InputStream stream = connection.getInputStream();
        try {
            ByteArrayOutputStream page = new ByteArrayOutputStream();
            byte[] chunk = new byte[READ_CHUNK_SIZE];
            int read;
            while ((read = stream.read(chunk)) != -1) {
                page.write(chunk, 0, read);
            }
            return page.toByteArray();
        } finally {
            // always release the connection's stream, even when reading fails part way through
            stream.close();
        }
    }

    /**
     * Extracts the story markup from a downloaded page.
     * <p>
     * The page is decoded as UTF-8. The news layout is tried first, then the sport layout. If
     * neither start marker is present the whole page is returned unchanged.
     *
     * @param bytes the raw page, as returned by {@link #getPage(String)}; may be null
     * @return the markup between the start and end markers, the whole page if no start marker
     *         matches, or an empty string if {@code bytes} is null
     */
    public static String parsePage(byte[] bytes) {
        if (bytes == null) {
            return "";
        }
        String html = decode(bytes);

        String story = extractSection(html, NEWS_START, NEWS_END);
        if (story == null) {
            story = extractSection(html, SPORT_START, SPORT_END);
        }
        // fall back to the entire page as a last resort
        return story != null ? story : html;
    }

    /** Decodes page bytes using {@link #PAGE_CHARSET}. */
    private static String decode(byte[] bytes) {
        try {
            return new String(bytes, PAGE_CHARSET);
        } catch (UnsupportedEncodingException e) {
            // UTF-8 is a charset every Java platform must provide, so this is not expected
            throw new IllegalStateException(PAGE_CHARSET + " is not supported", e);
        }
    }

    /**
     * Returns the text after the first {@code startMarker} up to the first following
     * {@code endMarker}, or up to the end of the page when there is no end marker.
     * Returns null when {@code startMarker} does not occur in {@code html}.
     */
    private static String extractSection(String html, String startMarker, String endMarker) {
        int start = html.indexOf(startMarker);
        if (start < 0) {
            return null;
        }
        int from = start + startMarker.length();
        int end = html.indexOf(endMarker, from);
        return end < 0 ? html.substring(from) : html.substring(from, end);
    }
}