RssParser.java example

Explorer
NewSumServer-master
- src
  - org
    - scify
      - NewSumServer
        Server
        Comms
        Communicator.java
        MachineLearning
        INSECTDBWithDir.java
        classificationModule.java
        dataSets.java
        labelTagging.java
        util.java
        vector.java
        writeToFile.java
        OCR
        TextHandler.java
        Searching
        Indexer.java
        Searcher.java
        Sources
        BlogParser.java
        ISourceParser.java
        RSSSources.java
        RssParser.java
        Storage
        IDataStorage.java
        InsectFileIO.java
        SimpleFileIO.java
        Structures
        Article.java
        Sentence.java
        Topic.java
        UnlabeledArticle.java
        User.java
        Summarisation
        ArticleClusterer.java
        RedundancyRemover.java
        Summariser.java
        dumpClusterer.java
        Utils
        Main.java
        Utilities.java
/*
 * Copyright 2013 SciFY NPO <info@scify.org>.
 *
 * This product is part of the NewSum Free Software.
 * For more information about NewSum visit
 * 
 * 	http://www.scify.gr/site/en/our-projects/completed-projects/newsum-menu-en
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * If this code or its output is used, extended, re-engineered, integrated, 
 * or embedded to any extent in another software or hardware, there MUST be 
 * an explicit attribution to this work in the resulting source code, 
 * the packaging (where such packaging exists), or user interface 
 * (where such an interface exists). 
 * The attribution must be of the form "Powered by NewSum, SciFY"
 */ 

package org.scify.NewSumServer.Server.Sources;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Time;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.feedparser.*;
import org.apache.commons.feedparser.network.NetworkException;
import org.apache.commons.feedparser.network.ResourceRequest;
import org.apache.commons.feedparser.network.ResourceRequestFactory;
import org.scify.NewSumServer.Server.Comms.Communicator;
import static org.scify.NewSumServer.Server.Sources.RssParser.hsSwitches;
import org.scify.NewSumServer.Server.Storage.IDataStorage;
import org.scify.NewSumServer.Server.Structures.Article;
import org.scify.NewSumServer.Server.Structures.UnlabeledArticle;
import org.scify.NewSumServer.Server.Utils.Utilities;

/**
 * Parses the feed found at a URL and turns each feed item into an
 * {@link Article}.
 *
 * @author ggianna
 * @author George K. <gkiom@scify.org>
 */
public class RssParser implements ISourceParser {

    /**
     * The Storage module used to load the articles of the previous run and
     * to save the articles of the current one.
     */
    private IDataStorage                ids;
    /**
     * Counter of regular-expression matches that were filtered out of
     * article texts and titles. Static, so it is shared by all instances.
     */
    private static int                  f = 0;
    /**
     * Counter for articles whose date was taken from the previous run.
     */
    private int                         ifromOld = 0;
    /**
     * Counter for articles that were given the current time as their date.
     */
    private int                         inewDate = 0;
    /**
     * Counter for articles removed because their date was too old.
     */
    private int                         iOldDate = 0;
    /**
     * Separator between the individual regular expressions inside a
     * pattern string.
     */
    private static String               sPatternSep = "===";
    /**
     * The category label for the articles that should be parsed by the
     * Classifier.
     */
    public static final String          UNCLASSIFIED = "UNCLASSIFIED";
    /**
     * Limit, in days, for keeping articles. Articles older than this
     * number of days are ignored, unless their category has its own limit.
     */
    private final long                  iArticleDays;
    /**
     * The items found in the feed currently being parsed, each in
     * {@link Article} form. Cleared after every feed.
     */
    private List<Article>               lsItems;
    /**
     * All the Articles fetched by the Parser, accumulated over every
     * category processed; this is what {@link #saveAllArticles()} stores.
     */
    private List<Article>               lsFullItems = new ArrayList<Article>();

    /**
     * The Logger shared with the rest of the server.
     */
    protected static final Logger       LOGGER =
            org.scify.NewSumServer.Server.Utils.Main.getLogger();
    /**
     * The configuration switches obtained from the {@link Communicator}.
     * Must be initialized before the PATTERN constants below, which read it.
     */
    protected static final 
            HashMap<String, String>     hsSwitches =
            Communicator.getSwitches();
    /**
     * The String containing various regular expression
     * patterns for the English Articles; {@code null} if the pattern file
     * could not be read.
     */
    protected static final String       PATTERN_EN = readPattern("EN");
    /**
     * The String containing various regular expression patterns
     * for the Greek Articles; {@code null} if the pattern file could not
     * be read.
     */
    protected static final String       PATTERN_GR = readPattern("GR");
    /**
     * The PATTERN to use for the text filtering: the English one when the
     * "PathToSources" switch ends with "EN.txt", the Greek one otherwise.
     */
    protected static final String       PATTERN =
    (hsSwitches.get("PathToSources").endsWith("EN.txt")) ? PATTERN_EN : PATTERN_GR;

    /**
     * Constructor of the RssParser Class. Initializes the {@link #lsItems} list
     *
     * @param iDataS The Data Storage module to use
     * @param iArtDaysArg The max number of days old that articles are accepted
     */
    public RssParser(IDataStorage iDataS, long iArtDaysArg) {
        this.ids = iDataS;
        this.iArticleDays = iArtDaysArg;
        lsItems = new ArrayList<Article>();
        LOGGER.log(Level.INFO, "Processing pattern {0}",
                PATTERN.equals(PATTERN_GR) ? "GR" : "EN");
    }
    /**
     * Fetches and parses the feed at the given URL. Every feed item that
     * passes {@link #preProcessArticle} is added to {@link #lsItems} as an
     * {@link Article}, or as an {@link UnlabeledArticle} when the category
     * is {@link #UNCLASSIFIED}.
     * <p>
     * Failures while opening the stream or parsing are logged and not
     * rethrown; the items collected up to that point stay in the list.
     *
     * @param urlString the URL string to parse
     * @param sCategory The category that the specified URL is about
     * @throws NetworkException
     * @throws IOException
     */
    public void ProcessFeed(final String urlString, final String sCategory)
            throws NetworkException, IOException {

        //create a listener for handling our callbacks
        FeedParserListener listener = new DefaultFeedParserListener() {
            /**
             * The article produced by the most recent onItem call, or null
             * when that item was rejected. onCreated dates this article so
             * that the date of a rejected item is never written onto the
             * previously accepted one.
             */
            private Article aCurrent = null;

            @Override
            public void onItem(FeedParserState state,
                    String title,
                    String link,
                    String description,
                    String permalink) throws FeedParserException {
                aCurrent = null;
                // No usable title: fall back to (at most) the first 30
                // characters of the description, tolerating a short or
                // missing description.
                if ((title == null) || (title.trim().length() == 0)) {
                    String sDesc = (description == null) ? "" : description;
                    title = (sDesc.length() >= 30)
                            ? sDesc.substring(0, 30) + "..." : sDesc;
                }
                // TODO for later version
                // check if category is "Γενικά" || "Top News" and if such, create
                // new UnlabeledArticle so that it gets category from the
                // classification Module.

                if (sCategory.equals(UNCLASSIFIED)) {
                    // Unlabeled Article (null Category) with toWrap = false,
                    // so that it is not accessed by the classification trainer
                    UnlabeledArticle tmpUnArt =
                            new UnlabeledArticle(permalink, title.trim(),
                            description, null, urlString, false);
                    //filter Article text
                    tmpUnArt = (UnlabeledArticle) preProcessArticle(tmpUnArt, 9);
                    // Add the Article found to the list, avoid possible duplicates
                    if (tmpUnArt != null) {
                        Utilities.addItemToList(lsItems, tmpUnArt);
                        aCurrent = tmpUnArt;
                    }
                } else {
                    // Labeled article with toWrap = true,
                    // so that it feeds the classification trainer
                    Article tmpArt =
                            new Article(permalink, title.trim(),
                            description, sCategory, urlString, true);
                    //filter article text
                    tmpArt = preProcessArticle(tmpArt, 10);
                    // Add the Article found to the list, avoid possible duplicates
                    if (tmpArt != null) {
                        Utilities.addItemToList(lsItems, tmpArt);
                        aCurrent = tmpArt;
                    }
                }
            }

            @Override
            public void onCreated(FeedParserState state, Date date) throws FeedParserException {
                // Some feeds don't provide a date; when one is provided it
                // belongs to the item handled by the latest onItem call.
                if (aCurrent != null) {
                    aCurrent.setDate(date);
                }
            }
        };
        //use the FeedParser network IO package to fetch our resource URL
        ResourceRequest request = ResourceRequestFactory.getResourceRequest(urlString);
        request.setRequestHeaderField("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0");
        InputStream is = null;
        try {
            // Grab input stream
            is = request.getInputStream();
            FeedParser parser = FeedParserFactory.newFeedParser();
            parser.parse(listener, is, urlString);
        } catch (FeedParserException ex) {
            LOGGER.log(Level.WARNING, ex.getMessage(), ex);
        } catch (Exception ex) {
            LOGGER.log(Level.WARNING, ex.getMessage());
        } finally {
            // Always release the connection's stream
            if (is != null) {
                try {
                    is.close();
                } catch (IOException ex) {
                    LOGGER.log(Level.FINE, ex.getMessage(), ex);
                }
            }
        }
    }
    /**
     * Collects the articles of every category found in the given map.
     *
     * @param Sources maps each feed link (key) to its category (value)
     * @return the articles of all categories, in one list
     */
    @Override
    public List<Article> getAllArticles(final HashMap<String, String> Sources) {
        final List<Article> lsCollected = new ArrayList<Article>();
        // Each distinct category is processed exactly once
        final Collection<String> csDistinct = new HashSet<String>(Sources.values());
        final Iterator<String> itCategory = csDistinct.iterator();
        while (itCategory.hasNext()) {
            final String sCurrent = itCategory.next();
            // All feed links registered under the current category
            final HashSet<String> hsFeeds =
                    (HashSet<String>) Utilities.getKeysByValue(Sources, sCurrent);
            final List<String> lsFeeds = new ArrayList<String>(hsFeeds);
            final List<Article> lsFound = getAllNewsByCategory(lsFeeds, sCurrent);
            if (lsFound.isEmpty()) {
                continue;
            }
            lsCollected.addAll(lsFound);
        }
        return lsCollected;
    }

    /**
     * Gives access to the articles collected by the parse in progress.
     *
     * @return the {@link #lsItems} list itself, not a copy
     */
    public List<Article> getArticles() {
        return lsItems;
    }

    /**
     * Fetches the articles of every valid link in the list and records them
     * in {@link #lsFullItems} so that {@link #saveAllArticles()} can store
     * them. A feed that fails with a checked exception is logged and
     * skipped; the remaining feeds are still processed.
     *
     * @param LinksToLoad the feed links of the category
     * @param sCategory the category the links belong to
     * @return the articles fetched from all the links, possibly empty
     */
    @Override
    public List<Article> getAllNewsByCategory(List<String> LinksToLoad, String sCategory) {
        LOGGER.log(Level.INFO, "Processing category {0}", sCategory);
        // Create and Initialize the Article List
        List<Article> lsResults = new ArrayList<Article>();
        // Omit bad URLs from the input list
        LinksToLoad = getValidLinks(LinksToLoad);
        // Iterate the list and get the articles of each Link
        for (String each : LinksToLoad) {
            try {
                List<Article> tmpList = getNewsFromFeed(each, sCategory);
                // Both conditions must hold; with '||' a null list was
                // dereferenced by the isEmpty() call.
                if (tmpList != null && !tmpList.isEmpty()) {
                    lsResults.addAll(tmpList);
                }
            } catch (NetworkException ex) {
                LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
            } catch (IOException ex) {
                LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
            } catch (FeedParserException ex) {
                LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
            }
        }
        this.lsFullItems.addAll(lsResults);//add all to save to file

        return lsResults;
    }

    /**
     * Parses a single feed and returns its post-processed articles.
     * {@link #lsItems} is always emptied before returning, so items of one
     * feed never leak into the next one, even when parsing fails.
     *
     * @param sLinkToFeed the link of the feed to parse
     * @param sCategory the category the feed belongs to
     * @return the filtered articles of the feed; an empty list when the
     * feed could not be read
     * @throws NetworkException
     * @throws IOException
     * @throws FeedParserException
     */
    @Override
    public List<Article> getNewsFromFeed(String sLinkToFeed, String sCategory)
            throws NetworkException, IOException, FeedParserException {

        List<Article> lsArticles;
        try {
            ProcessFeed(sLinkToFeed, sCategory);
            // Copy, because lsItems is reset below
            lsArticles = new ArrayList<Article>(getArticles());
        } catch (IOException ex) {
            LOGGER.log(Level.WARNING, ex.getMessage(), ex);
            // Nothing usable from this feed; do not hand null to the filter
            return new ArrayList<Article>();
        } finally {
            lsItems.clear();//reset list for next feed
        }
        return postProcessArticles(lsArticles); // filter
    }

    /**
     * Logs the filtering statistics of this run and stores a copy of
     * {@link #lsFullItems} as object "AllArticles" in category "feeds",
     * replacing any object already stored under that name.
     */
    @Override
    public void saveAllArticles() {
        final ArrayList<Article> lsSnapshot = new ArrayList<Article>(this.lsFullItems);
        // Statistics gathered while filtering and dating the articles
        final Object[] aoStats = new Object[]{
            String.valueOf(f),
            String.valueOf(inewDate),
            String.valueOf(ifromOld),
            String.valueOf(iOldDate),
            String.valueOf(iArticleDays)
        };
        LOGGER.log(Level.INFO, "\nFiltered a total of {0} matches\nAdded "
                + "new Date to {1} articles\nAcquired date from previous run to {2} articles\n"
                + "Removed {3} Articles older than {4} days", aoStats);
        LOGGER.log(Level.INFO, "Saving {0} Articles...", this.lsFullItems.size());
        // Drop the object of the previous run before writing the new one
        final boolean bAlreadyStored = this.ids.objectExists("AllArticles", "feeds");
        if (bAlreadyStored) {
            this.ids.deleteObject("AllArticles", "feeds");
        }
        this.ids.SaveObject(lsSnapshot, "AllArticles", "feeds");
    }

    /**
     * Executes various filtering operations on the article list: drops
     * duplicates, removes known unwanted articles (English feeds) or cleans
     * known title prefixes (Greek feeds), and finally hands the list to
     * {@link #addDateToArticles}.
     *
     * @param lsArts the Articles upon which the operations will be executed;
     * may be {@code null}
     * @return the list of the Articles filtered, with unwanted articles
     * removed; an empty list when {@code lsArts} is {@code null}
     */
    private List<Article> postProcessArticles(List<Article> lsArts) {
        if (lsArts == null) {
            return new ArrayList<Article>();
        }
        //check if list has duplicates
        //Items cannot be Set, because i need index searching to getDate
        //so this conversion is unavoidable
        HashSet<Article> hsArts = new HashSet<Article>(lsArts);
        if (hsArts.size() != lsArts.size()) {
            LOGGER.log(Level.INFO,
                    "{0} Duplicates omitted...", (lsArts.size() - hsArts.size()));
        }
        ArrayList<Article> clear = new ArrayList<Article>(hsArts);
        int Initial = clear.size();
        // Specific filtering
        boolean bEnglish = PATTERN.equals(PATTERN_EN);
        for (Iterator<Article> it = clear.iterator(); it.hasNext();) {
            Article each = it.next();
            if (bEnglish) {
                // Decide first, remove once: calling it.remove() twice for
                // the same element throws IllegalStateException.
                if (isUnwantedEnglishArticle(each)) {
                    it.remove();
                }
            } else if (each.getFeed().contains("epikaira")) { // remove [category] tags from titles for these feeds
                each.setTitle(filterTitle(each.getTitle(), "\\A\\[\\S+[-]*\\S+\\]"));
            } else if (each.getFeed().contains("enikos")) { // remove 'ΒΙΝΤΕΟ' from some titles
                each.setTitle(filterTitle(each.getTitle(), "(?iu)\\A[BΒ][ΙI][ΝND][ΤT]*[ΕE][ΟO]\\s*-\\s*"));
            }
        }
        // add date to articles that do not have one, avoiding
        // articles older than iArticleDays from now
        List<Article> lsFinalArticles = addDateToArticles(clear);

        int Final = lsFinalArticles.size();
        if (Final < Initial) {
            LOGGER.log(Level.INFO, "Removed {0} Articles", (Initial - Final));
        }
        return lsFinalArticles;
    }

    /**
     * Tells whether an article of an English feed is one of the known
     * unwanted kinds.
     *
     * @param aArt the article to check
     * @return true if the article should be removed
     */
    private static boolean isUnwantedEnglishArticle(Article aArt) {
        String sFeed = aArt.getFeed();
        if (sFeed.contains("bbci")
                && aArt.getText().matches("\\A[Pp]rovides an[d]* [oO]verview(.|\\\n)*")) {
            return true; //BBC europe
        }
        if (sFeed.contains("euronews")
                && aArt.getText().matches(
                "\\A[Aa]t the [Ll]e[Ww]eb\\s*\\d+\\s*conference in\\s*\\w+(.|\\\n)*")) {
            return true; //Euronews Leweb
        }
        // articles 'Image of the Day' from scientist
        //FIXME maybe need to accept again when we have images
        return sFeed.contains("scientist")
                && aArt.getTitle().matches("(?is)\\Aimage\\s*of\\s*[the ]*day.*");
    }

    /**
     * Builds a new list holding only the links accepted by
     * {@link Utilities#ValidURL}. Each link is trimmed before it is checked
     * and stored.
     *
     * @param LinksToLoad The List containing all the links
     * @return a list with the invalid URL links omitted
     */
    private List<String> getValidLinks(List<String> LinksToLoad) {
        final List<String> lsAccepted = new ArrayList<String>();
        for (String sCandidate : LinksToLoad) {
            final String sTrimmed = sCandidate.trim();
            if (!Utilities.ValidURL(sTrimmed)) {
                continue; // skip bad URL
            }
            lsAccepted.add(sTrimmed);
        }
        return lsAccepted;
    }

    /**
     * Strips from a text every occurrence of each regular expression held
     * in the given pattern string. The expressions are applied one after
     * the other, each on the result of the previous one, and the
     * {@link #f} counter grows by one for every expression that matched.
     *
     * @param des The text to check
     * @param pattern The unwanted-text regular expressions, separated by
     * {@link #sPatternSep}
     * @return A text with the unwanted part omitted
     */
    private String filter(String des, String pattern) {
        String sCleaned = des;
        for (String sExpression : pattern.split(sPatternSep)) {
            final Matcher mHit = Pattern.compile(sExpression).matcher(sCleaned);
            if (!mHit.find()) {
                continue;
            }
            f++;
            // replaceAll resets the matcher, so every occurrence is removed
            sCleaned = mHit.replaceAll("");
        }
        return sCleaned;
    }

    /**
     * Filters a specific text from a title. Titles are filtered exactly
     * like article texts, so this delegates to {@link #filter} rather than
     * repeating the same loop.
     *
     * @param sTitle the title to process
     * @param pattern the regular expressions to remove, separated by
     * {@link #sPatternSep}
     * @return the title with the matching text filtered out
     */
    private String filterTitle(String sTitle, String pattern) {
        return filter(sTitle, pattern);
    }

    /**
     * Reads the pattern file "regexPat_&lt;lang&gt;.txt" located under the
     * "ToolPath" switch.
     *
     * @param lang the language of the txt PATTERN file
     * @return all the Regular Expression patterns contained in the PATTERN
     * file, as returned by {@link Utilities#readFromFile} for the
     * {@link #sPatternSep} separator, or {@code null} (after logging) if the
     * file cannot be read
     */
    private static String readPattern(String lang) {
        final String sPath = hsSwitches.get("ToolPath") + "regexPat_" + lang + ".txt";
        final File fPatterns = new File(sPath);
        if (!fPatterns.canRead()) {
            LOGGER.log(Level.SEVERE, "Could not read file {0}", fPatterns.getPath());
            return null;
        }
        return Utilities.readFromFile(sPath, sPatternSep);
    }

    /**
     * Cleans the text of an article with {@link #filter} and decides whether
     * the article is worth keeping.
     *
     * @param aArt the article to be processed
     * @param minWords the filtered text must contain more words than this
     * number for the article to be accepted
     * @return the same article with its text replaced by the filtered one,
     * or null if the text is missing, blank, or too short after filtering
     */
    private Article preProcessArticle(Article aArt, int minWords) {
        final String sRaw = aArt.getText();
        // Nothing to work with
        if (sRaw == null || sRaw.trim().isEmpty()) {
            return null;
        }
        final String sFiltered = filter(sRaw, PATTERN);
        final int iWords = sFiltered.split("\\s+").length;
        // Too short to be useful
        if (iWords <= minWords) {
            return null;
        }
        aArt.setText(sFiltered);
        return aArt;
    }

    /**
     * Gives a date to every article that lacks one and removes the articles
     * whose feed-provided date is too old.
     * <ul>
     * <li>An article without a date takes the date of the article with the
     * same text from the previous run, if there is one with a date;
     * otherwise it is dated now.</li>
     * <li>An article that came with a date is removed when it is older than
     * the number of days configured for its category, or than
     * {@link #iArticleDays} when its category has no own limit.</li>
     * </ul>
     * If the previous run or the days-per-category file cannot be loaded
     * (e.g. on the first run), the failure is logged and processing goes on
     * without them, so the age filtering still takes place.
     * <p>
     * NOTE(review): articles dated from the previous run are not checked
     * against the age limit, as before - confirm this is intended.
     *
     * @param lsArts The Articles to process; modified in place
     * @return the same list, with articles older than the applicable limit
     * removed.
     */
    private List<Article> addDateToArticles(List<Article> lsArts) {
        // get Current date in millis
        Calendar now = Calendar.getInstance();

        // get the category-daystokeep map from file to memory
        HashMap<String, Integer> hsCategoryDays = null;
        try {
            hsCategoryDays =
                    Utilities.readDaysPerCategoryFile(hsSwitches.get("sCatsDaysFile"));
        } catch (Exception e) {
            LOGGER.log(Level.WARNING, e.getMessage());
        }
        if (hsCategoryDays == null) {
            // no per-category limits: the global limit applies everywhere
            hsCategoryDays = new HashMap<String, Integer>();
        }

        // load articles from previous run (none on 1st run or problem with file)
        List<Article> oldArticles = null;
        try {
            oldArticles = (ArrayList<Article>) this.ids.loadObject("AllArticles", "feeds");
        } catch (Exception e) {
            LOGGER.log(Level.WARNING, e.getMessage());
        }
        if (oldArticles == null) {
            oldArticles = new ArrayList<Article>();
        }

        for (Iterator<Article> arit = lsArts.iterator(); arit.hasNext();) {
            Article each = arit.next();
            if (each.getDate() == null) {
                // look for the same article in the previous run; compare the
                // texts themselves, since equal hash codes do not imply
                // equal texts
                String sText = each.getText();
                for (Article old : oldArticles) {
                    if (old == null || sText == null || !sText.equals(old.getText())) {
                        continue;
                    }
                    if (old.getDate() != null) {
                        // Set Current Article's Date from the previous run
                        each.setDate(old.getDate());
                        ifromOld++;
                    } else {
                        // set Date as Now
                        each.setDate(new Date());
                        inewDate++;
                    }
                    break; // continue to the next current article
                }
                // not found in the previous run: set Date as Now
                if (each.getDate() == null) {
                    each.setDate(new Date());
                    inewDate++;
                }
            } else {
                // days to keep: from the Categories - Days Map if the
                // category is listed there, else the global limit
                Integer iCategoryDays = hsCategoryDays.get(each.getCategory());
                long iDays = (iCategoryDays != null)
                        ? iCategoryDays.longValue() : iArticleDays;
                long lAgeMillis =
                        now.getTimeInMillis() - each.getDate().getTimeInMillis();
                // older than the limit: ignore the article
                if (lAgeMillis > (iDays * 1000 * 60 * 60 * 24)) {
                    iOldDate++;
                    arit.remove();
                }
            }
        }
        return lsArts;
    }

}