package com.vn.newsspeak.parsers; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.MalformedURLException; import java.net.URL; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import com.vn.newsspeak.ArticleParser; public class USATodayParser extends ArticleParser { public USATodayParser() { super(); } @Override protected String getContent(String link, String type) throws ParserException { String content = ""; TagNameFilter paraFilter = new TagNameFilter("p"); StringBuilder builder = new StringBuilder(); URL url; try { url = new URL(link); BufferedReader reader = new BufferedReader(new InputStreamReader(url.openConnection().getInputStream())); String oneLine; while ((oneLine = reader.readLine()) != null) { builder.append(oneLine); } reader.close(); } catch (MalformedURLException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } parser.setInputHTML(builder.toString()); NodeList list = parser.parse(paraFilter); if (type.equalsIgnoreCase("html")) { for (int i = 0; i < list.size(); ++i) { content += list.elementAt(i).toHtml(); } } else { for (int i = 0; i < list.size(); ++i) { content += list.elementAt(i).toPlainTextString(); } } // Remove special characters, such as   Pattern regex = Pattern.compile("&.*?;"); Matcher matcher = regex.matcher(content); while (matcher.find()) content = matcher.replaceAll(""); // Remove extra whitespaces content = content.replaceAll("(\\t|\\n)", ""); return content; } }