/* * Copyright 2013 SciFY NPO <info@scify.org>. * * This product is part of the NewSum Free Software. * For more information about NewSum visit * * http://www.scify.gr/site/en/our-projects/completed-projects/newsum-menu-en * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * If this code or its output is used, extended, re-engineered, integrated, * or embedded to any extent in another software or hardware, there MUST be * an explicit attribution to this work in the resulting source code, * the packaging (where such packaging exists), or user interface * (where such an interface exists). * The attribution must be of the form "Powered by NewSum, SciFY" */ package org.scify.NewSumServer.Server.Sources; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.sql.Time; import java.util.ArrayList; import java.util.Calendar; import java.util.Collection; import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.ListIterator; import java.util.Map; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.feedparser.*; import org.apache.commons.feedparser.network.NetworkException; import org.apache.commons.feedparser.network.ResourceRequest; import org.apache.commons.feedparser.network.ResourceRequestFactory; import org.scify.NewSumServer.Server.Comms.Communicator; import static org.scify.NewSumServer.Server.Sources.RssParser.hsSwitches; import org.scify.NewSumServer.Server.Storage.IDataStorage; import org.scify.NewSumServer.Server.Structures.Article; import org.scify.NewSumServer.Server.Structures.UnlabeledArticle; import org.scify.NewSumServer.Server.Utils.Utilities; /** * Class RssParser parses a URL * * @author ggianna * @author George K. <gkiom@scify.org> */ public class RssParser implements ISourceParser { /** * The Storage module used for various I/O operations */ private IDataStorage ids; /** * Filtered Articles counter. */ private static int f = 0; /** * Counter for Dates taken from previous runs. */ private int ifromOld = 0; /** * Counter for new Dates. */ private int inewDate = 0; /** * Counter for articles removed due to their Old Date values */ private int iOldDate = 0; /** * Regular expressions PATTERN separator. */ private static String sPatternSep = "==="; /** * The label for the articles that should be parsed by the Classifier. */ public static final String UNCLASSIFIED = "UNCLASSIFIED"; /** * Limit number in days to keep articles. Articles older that this * number of days are ignored */ private final long iArticleDays; /** * A List Containing each Item found in the feed in an {@link Article} form. */ private List<Article> lsItems; /** * All the Articles fetched by the Parser. */ private List<Article> lsFullItems = new ArrayList<Article>(); /** * The Logger class. */ protected static final Logger LOGGER = org.scify.NewSumServer.Server.Utils.Main.getLogger(); /** * Pattern to ommit text from the Articles. */ protected static final HashMap<String, String> hsSwitches = Communicator.getSwitches(); /** * The String containing various regular expression * patterns for the English Articles. */ protected static final String PATTERN_EN = readPattern("EN"); /** * The String containing various regular expression patterns * for the Greek Articles. */ protected static final String PATTERN_GR = readPattern("GR"); /** * The PATTERN to use for the text filtering. */ protected static final String PATTERN = (hsSwitches.get("PathToSources").endsWith("EN.txt")) ? PATTERN_EN : PATTERN_GR; /** * Constructor of the RssParser Class. Initializes the {@link #lsItems} list * * @param iDataS The Data Storage module to use * @param iArtDaysArg The max number of days old that articles are accepted */ public RssParser(IDataStorage iDataS, long iArtDaysArg) { this.ids = iDataS; this.iArticleDays = iArtDaysArg; lsItems = new ArrayList<Article>(); LOGGER.log(Level.INFO, "Processing pattern {0}", PATTERN.equals(PATTERN_GR) ? "GR" : "EN"); } /** * Processes the feeds from the given URL string and adds them to a List * containing an {@link Article} for each item found. * * @param urlString the URL string to parse * @param sCategory The category that the specified URL is about * @throws NetworkException * @throws IOException */ public void ProcessFeed(final String urlString, final String sCategory) throws NetworkException, IOException { //create a listener for handling our callbacks FeedParserListener listener; listener = new DefaultFeedParserListener() { @Override public void onItem(FeedParserState state, String title, String link, String description, String permalink) throws FeedParserException { // Use first 30 characters for title... if ((title == null) || (title.trim().length() == 0)) { title = description.substring(0, 30) + "..."; } // TODO for later version // check if category is "Γενικά" || "Top News" and if such, create // new UnlabeledArticle so that it gets category from the // classification Module. if (sCategory.equals(UNCLASSIFIED)) { // Initiate an Unlabeled Article (null Category) with boolean // toWrap = false, so that // it is not accessed by the classification trainer UnlabeledArticle tmpUnArt = new UnlabeledArticle(permalink, title.trim(), description, null, urlString, false); //filter Article text tmpUnArt = (UnlabeledArticle) preProcessArticle(tmpUnArt, 9); // Add the Article found to the list, avoid possible duplicates if (tmpUnArt != null) { Utilities.addItemToList(lsItems, tmpUnArt); } // Otherwise procceed normally with provided category } else { // Initiate a new article with toWrap = true, // so that it feeds the classification trainer Article tmpArt = new Article(permalink, title.trim(), description, sCategory, urlString, true); //filter article text tmpArt = preProcessArticle(tmpArt, 10); // Add the Article found to the list, avoid possible duplicates if (tmpArt != null) { Utilities.addItemToList(lsItems, tmpArt); } } } @Override public void onCreated(FeedParserState state, Date date) throws FeedParserException { if (!lsItems.isEmpty()) { //Adding date to current Article -- Some feeds don't provide date Article tmpArt = lsItems.get(lsItems.size() - 1); tmpArt.setDate(date); } } }; // debug // System.out.println("Fetching resource: " + urlString); // debug //use the FeedParser network IO package to fetch our resource URL ResourceRequest request = ResourceRequestFactory.getResourceRequest(urlString); request.setRequestHeaderField("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0"); FeedParser parser = null; try { // Grab input stream InputStream is = request.getInputStream(); parser = FeedParserFactory.newFeedParser(); parser.parse(listener, is, urlString); } catch (FeedParserException ex) { LOGGER.log(Level.WARNING, ex.getMessage(), ex); } catch (Exception ex) { LOGGER.log(Level.WARNING, ex.getMessage()); } } @Override public List<Article> getAllArticles(final HashMap<String, String> Sources) { final List<Article> AllArticles = new ArrayList<Article>(); Collection<String> sCategories = new HashSet<String>(Sources.values()); for (final String each : sCategories) { List<String> Links = new ArrayList<String>( (HashSet<String>) Utilities.getKeysByValue(Sources, each)); List<Article> lsAr = getAllNewsByCategory(Links, each); if (!lsAr.isEmpty()) { AllArticles.addAll(lsAr); } } return AllArticles; } /** * * @return the Articles of a specific parse */ public List<Article> getArticles() { return this.lsItems; } @Override public List<Article> getAllNewsByCategory(List<String> LinksToLoad, String sCategory) { LOGGER.log(Level.INFO, "Processing category {0}", sCategory); // Create and Initialize the Article List List<Article> lsResults = new ArrayList<Article>(); // Ommit bad URLs from the input list LinksToLoad = getValidLinks(LinksToLoad); // Iterate the list and getArticles for each Link for (String each : LinksToLoad) { List<Article> tmpList; try { tmpList = getNewsFromFeed(each, sCategory); if (tmpList != null || !tmpList.isEmpty()) { // Add the Articles to the list lsResults.addAll(tmpList); } } catch (NetworkException ex) { LOGGER.log(Level.SEVERE, ex.getMessage(), ex); } catch (IOException ex) { LOGGER.log(Level.SEVERE, ex.getMessage(), ex); } catch (FeedParserException ex) { LOGGER.log(Level.SEVERE, ex.getMessage(), ex); } } // System.err.println("Category " + sCategory + " : "+ lsResults.size()); this.lsFullItems.addAll(lsResults);//add all to save to file return lsResults; } @Override public List<Article> getNewsFromFeed(String sLinkToFeed, String sCategory) throws NetworkException, IOException, FeedParserException { List<Article> lsArticles = null; try { ProcessFeed(sLinkToFeed, sCategory); lsArticles = new ArrayList<Article>(getArticles()); } catch (IOException ex) { LOGGER.log(Level.WARNING, ex.getMessage(), ex); } lsItems.clear();//reset list for next feed List<Article> lsNewArticles = postProcessArticles(lsArticles); // filter //debug // System.out.println( lsNewArticles.size() + " articles from feed " + sLinkToFeed); //debug return lsNewArticles; } @Override public void saveAllArticles() { ArrayList co = new ArrayList(this.lsFullItems); //debug LOGGER.log(Level.INFO, "\nFiltered a total of {0} matches\nAdded " + "new Date to {1} articles\nAcquired date from previous run to {2} articles\n" + "Removed {3} Articles older than {4} days", new Object[]{String.valueOf(f), String.valueOf(inewDate), String.valueOf(ifromOld), String.valueOf(iOldDate), String.valueOf(iArticleDays)}); //debug LOGGER.log(Level.INFO, "Saving {0} Articles...", this.lsFullItems.size()); if (this.ids.objectExists("AllArticles", "feeds")) { this.ids.deleteObject("AllArticles", "feeds"); } this.ids.SaveObject(co, "AllArticles", "feeds"); } /** * Executes various filtering operations in the article list * * @param lsArts the Articles upon the operations will be executed * @return the list of the Articles filtered, removed unwanted articles. */ private List<Article> postProcessArticles(List<Article> lsArts) { //check if list has duplicates //Items cannot be Set, because i need index searching to getDate //so this conversion is unavoidable HashSet<Article> hsArts = new HashSet<Article>(new ArrayList(lsArts)); if (hsArts.size() != lsArts.size()) { LOGGER.log(Level.INFO, "{0} Duplicates omitted...", (lsArts.size() - hsArts.size())); } ArrayList<Article> clear = new ArrayList<Article>(hsArts); int Initial = clear.size(); // Specific filtering Iterator it = clear.iterator(); if (PATTERN.equals(PATTERN_EN)) { //english articles while (it.hasNext()) { Article each = (Article) it.next(); if (each.getFeed().contains("bbci")) { if (each.getText().matches("\\A[Pp]rovides an[d]* [oO]verview(.|\\\n)*")) { it.remove(); //BBC europe } } if (each.getFeed().contains("euronews")) { if (each.getText(). matches("\\A[Aa]t the [Ll]e[Ww]eb\\s*\\d+\\s*conference in\\s*\\w+(.|\\\n)*")) { it.remove(); //Euronews Leweb } } if (each.getFeed().contains("scientist")) { if (each.getTitle().matches("(?is)\\Aimage\\s*of\\s*[the ]*day.*")) { it.remove(); // remove articles 'Image of the Day from scientist' //FIXME maybe need to accept again when we have images } } } } else { // greek pattern while (it.hasNext()) { Article each = (Article) it.next(); if (each.getFeed().contains("epikaira")) { // remove [category] tags from titles for these feeds each.setTitle(filterTitle(each.getTitle(), "\\A\\[\\S+[-]*\\S+\\]")); } else if (each.getFeed().contains("enikos")) { // remove 'ΒΙΝΤΕΟ' from some titles each.setTitle(filterTitle(each.getTitle(), "(?iu)\\A[BΒ][ΙI][ΝND][ΤT]*[ΕE][ΟO]\\s*-\\s*")); } } } // add date to articles that do not have one, avoiding // articles older than iArticleDays from now List<Article> lsFinalArticles = addDateToArticles(clear); int Final = lsFinalArticles.size(); if (Final < Initial) { LOGGER.log(Level.INFO, "Removed {0} Articles", (Initial - Final)); } return lsFinalArticles; } /** * Filters the given list removing invalid URL links * * @param LinksToLoad The List containing all the links * @return a list with the invalid URL links omitted */ private List<String> getValidLinks(List<String> LinksToLoad) { List<String> ValidLinks = new ArrayList<String>(); ListIterator<String> iter = LinksToLoad.listIterator(); while (iter.hasNext()) { String nextLink = iter.next().trim(); if (Utilities.ValidURL(nextLink)) { ValidLinks.add(nextLink); } } return ValidLinks; } /** * filters a given text, removing all ocurrencies of unwanted text * * @param des The text to check * @param pattern The SPAM text PATTERN that is unwanted * @return A text with the unwanted part omitted */ private String filter(String des, String pattern) { String[] aPat = pattern.split(sPatternSep); for (String regex : aPat) { Pattern MyPattern = Pattern.compile(regex); Matcher MatchFound = MyPattern.matcher(des); if (MatchFound.find()) { f++; des = des.replaceAll(regex, ""); } } return des; } /** * Filters a specific text from a title. * * @param sTitle the title to process * @param PATTERN the PATTERN to apply * @return the title with the PATTERN text filtered */ private String filterTitle(String sTitle, String pattern) { String[] aPat = pattern.split(sPatternSep); for (String regex : aPat) { Pattern MyPattern = Pattern.compile(regex); Matcher MatchFound = MyPattern.matcher(sTitle); if (MatchFound.find()) { f++; sTitle = sTitle.replaceAll(regex, ""); } } return sTitle; } /** * * @param lang the language of the txt PATTERN file * @return all the Regular Expression patterns contained in the PATTERN * file, splitted by a separator */ private static String readPattern(String lang) { String sRes = null; String sToolPath = hsSwitches.get("ToolPath"); String sPath = sToolPath + "regexPat_" + lang + ".txt"; File fFile = new File(sPath); if (fFile.canRead()) { sRes = Utilities.readFromFile(sPath, sPatternSep); } else { LOGGER.log(Level.SEVERE, "Could not read file {0}", fFile.getPath()); } return sRes; } /** * Does some preprocessing operations in the article provided, such as * clearing garbage data from Article description, etc * * @param aArt the article to be processed * @param minWords The minimum number of words to accept * @return a cleaner article, or null if Article does not fulfill certain * criteria */ private Article preProcessArticle(Article aArt, int minWords) { //filter Article text if (aArt.getText() != null && !aArt.getText().trim().isEmpty()) { String sClearDescription = filter(aArt.getText(), PATTERN); // Accept article only if description length has more than 9 words if (sClearDescription.split("\\s+").length > minWords) { aArt.setText(sClearDescription); return aArt; } } return null; } /** * Adds a date to each Article in the list passed * * @param lsArts The Articles to process * @return the list of articles, with articles older than * {@link #iArticleDays} days removed. */ private List<Article> addDateToArticles(List<Article> lsArts) { // get Current date in millis Calendar now = Calendar.getInstance(); long iDays; ArrayList<Article> oldArticles; try { // get the category-daystokeep map from file to memory HashMap<String, Integer> hsCategoryDays = Utilities.readDaysPerCategoryFile(hsSwitches.get("sCatsDaysFile")); // load articles from previous run oldArticles = (ArrayList<Article>) this.ids.loadObject("AllArticles", "feeds"); // for each current article for (ListIterator<Article> arit = lsArts.listIterator(); arit.hasNext();) { Article each = arit.next(); // if it has not acquired a date yet if (each.getDate() == null) { // iterate the articles from previous run Iterator it = oldArticles.iterator(); while (it.hasNext()) { Article old = (Article) it.next(); // if the same article if (each.getText().hashCode() == old.getText().hashCode()) { // Set Current Article's Date from the previous run if (old.getDate() != null) { each.setDate(old.getDate()); ifromOld++; // update counter break; // continue to the next current article } else { // set Date as Now each.setDate(new Date()); inewDate++; break; // continue to the next article } } } // Remove all articles from date older than now - iArticleDays } else { // check if article category is in the Categories - Days Map. // and if so, assing days to keep from this file, not from global if (hsCategoryDays.containsKey(each.getCategory())) { iDays = hsCategoryDays.get(each.getCategory()); // System.err.println("Found DAYS LIMIT FOR CATEGORY " + each.getCategory() ); // System.err.println("DAYS " + iDays); } else { iDays = iArticleDays; } // if after the now - iArticleDays Date, accept, else, ignore if ((now.getTimeInMillis() - each.getDate().getTimeInMillis()) > (iDays * 1000 * 60 * 60 * 24)) { iOldDate++; arit.remove(); } } } // if old articles cannot be loaded to memory // (1st run or problem with file) } catch (Exception e) { LOGGER.log(Level.WARNING, e.getMessage()); for (Article each : lsArts) { // if current article does not have date if (each.getDate() == null) { // add date as now each.setDate(new Date()); inewDate++; } } // process possible missed articles } finally { // again for (Article each : lsArts) { if (each.getDate() == null) { // if null date and not in previous run each.setDate(new Date()); inewDate++; } } return lsArts; } } }