Summariser.java example

Explorer
NewSumServer-master
- src
  - org
    - scify
      - NewSumServer
        - Server
          - Comms
            Communicator.java
          - MachineLearning
            INSECTDBWithDir.java
            classificationModule.java
            dataSets.java
            labelTagging.java
            util.java
            vector.java
            writeToFile.java
          - OCR
            TextHandler.java
          - Searching
            Indexer.java
            Searcher.java
          - Sources
            BlogParser.java
            ISourceParser.java
            RSSSources.java
            RssParser.java
          - Storage
            IDataStorage.java
            InsectFileIO.java
            SimpleFileIO.java
          - Structures
            Article.java
            Sentence.java
            Topic.java
            UnlabeledArticle.java
            User.java
          - Summarisation
            ArticleClusterer.java
            RedundancyRemover.java
            Summariser.java
            dumpClusterer.java
          - Utils
            Main.java
            Utilities.java
/*
 * Copyright 2013 SciFY NPO <info@scify.org>.
 *
 * This product is part of the NewSum Free Software.
 * For more information about NewSum visit
 * 
 * 	http://www.scify.gr/site/en/our-projects/completed-projects/newsum-menu-en
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * If this code or its output is used, extended, re-engineered, integrated, 
 * or embedded to any extent in another software or hardware, there MUST be 
 * an explicit attribution to this work in the resulting source code, 
 * the packaging (where such packaging exists), or user interface 
 * (where such an interface exists). 
 * The attribution must be of the form "Powered by NewSum, SciFY"
 */ 

package org.scify.NewSumServer.Server.Summarisation;

import Jama.Matrix;
import gr.demokritos.iit.jinsect.documentModel.comparators.NGramCachedGraphComparator;
import gr.demokritos.iit.jinsect.documentModel.comparators.NGramCachedNonSymmGraphComparator;
import gr.demokritos.iit.jinsect.documentModel.representations.DocumentNGramGraph;
import gr.demokritos.iit.jinsect.documentModel.representations.DocumentNGramSymWinGraph;
import gr.demokritos.iit.jinsect.storage.INSECTDB;
import gr.demokritos.iit.jinsect.structs.GraphSimilarity;
import gr.demokritos.iit.jinsect.utils;
import java.io.*;
import java.nio.charset.Charset;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.sentdetect.SentenceSampleStream;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import org.scify.NewSumServer.Server.Structures.Article;
import org.scify.NewSumServer.Server.Structures.Sentence;
import org.scify.NewSumServer.Server.Structures.Topic;
import static org.scify.NewSumServer.Server.Summarisation.Summariser.LOGGER;
import org.scify.NewSumServer.Server.Utils.Main;

/**
 *
 * @author ggianna
 */
public class Summariser {

    /**
     * The Logger shared by this class, obtained from {@code Main.getLogger()}.
     */
    protected final static Logger LOGGER = Main.getLogger();

    // Sentence model constants.
    // NOTE(review): SENTENCE_MODEL_OBJNAME and SENTENCE_MODEL_OBJTYPE are not
    // referenced by the code in this class (the splitter model is read from and
    // written to a plain file instead) - presumably meant as storage name and
    // category for the model; confirm before removing.
    protected final String SENTENCE_MODEL_OBJNAME = "SentenceModel";
    protected final String SENTENCE_MODEL_OBJTYPE = "NLPModel";
    // Object category passed to SummaryStorage when checking for, loading and
    // saving a summary (see getSummary).
    protected final String SUMMARY_OBJTYPE = "Summary";

    //DONE SAVE SUMMARY, for each Summary Created using InsectFileIO.saveSummary
    /**
     * In-memory summaries, keyed by topic ID. Assigned by
     * {@link #getSummaries()} and consulted first by
     * {@link #getSummary(Topic)}; remains {@code null} until
     * {@link #getSummaries()} has completed once.
     */
    protected Map<String, List<Sentence>>   hsSentencesPerCluster;

    /**
     * The topics to summarise, as handed to the constructor.
     */
    protected Set<Topic>    stTopics;

    /**
     * Storage used to look up, load and save summaries, as handed to the
     * constructor.
     */
    protected INSECTDB SummaryStorage;

    /**
     * Sentence splitter model, set by {@code initSplitter()}. While it is
     * {@code null}, {@link #getSummary(Topic)} falls back to splitting text
     * on punctuation characters.
     */
    protected SentenceModel smSplitter = null;

    /**
     * Builds a summariser over the given topics and prepares the sentence
     * splitter model used when cutting article text into sentences.
     * @param stTopics The topics to summarize from
     * @param SummaryStorage The module used for storage
     */
    public Summariser(Set<Topic> stTopics, INSECTDB SummaryStorage) {
        // Keep references to the collaborators supplied by the caller
        this.SummaryStorage = SummaryStorage;
        this.stTopics = stTopics;

        // Load (or train) the sentence splitter up front
        initSplitter();
    }

    /**
     * Initialises the sentence splitter model ({@link #smSplitter}).
     * <p>
     * The model is read from the file {@code Main.sToolPath + "splitModel.dat"}
     * when that file exists and can be loaded. Otherwise a new model is
     * trained from the class-path resource {@code SentenceSplitterTraining.txt}
     * and then cached in that file (best effort).
     * <p>
     * A model that was loaded or trained successfully is always used, even if
     * caching it on disk fails. {@link #smSplitter} stays {@code null} only
     * when no model could be loaded or trained.
     */
    private void initSplitter() {
        File fModelFile = new File(Main.sToolPath +  "splitModel.dat");

        // Prefer the model cached on disk, if there is one
        SentenceModel model = null;
        if (fModelFile.exists()) {
            model = loadSplitterModel(fModelFile);
        }

        // If the model was not loaded normally, train it and cache the result.
        // A model that came from disk is not written back: it is already there.
        if (model == null) {
            model = trainSplitterModel();
            if (model == null) {
                // Nothing usable; callers fall back to plain splitting
                return;
            }
            saveSplitterModel(model, fModelFile);
        }

        this.smSplitter = model;
    }

    /**
     * Tries to load a sentence splitter model from the given file.
     * @param fModelFile The file holding a serialised model
     * @return The loaded model, or {@code null} if reading or closing the
     * file failed
     */
    private SentenceModel loadSplitterModel(File fModelFile) {
        SentenceModel model = null;
        InputStream modelIn = null;
        try {
            modelIn = new FileInputStream(fModelFile);
            model = new SentenceModel(modelIn);
        } catch (IOException e) {
            LOGGER.log(Level.WARNING, "Could not load sentence splitter model.", e);
        } finally {
            // Finalize model file access, if possible
            if (modelIn != null) {
                try {
                    modelIn.close();
                } catch (IOException e) {
                    // Treat a failed close as a failed load, so that the
                    // model gets re-created - but do not hide the reason
                    LOGGER.log(Level.WARNING, "Could not close sentence splitter model file.", e);
                    model = null;
                }
            }
        }
        return model;
    }

    /**
     * Trains a new sentence splitter model from the bundled training resource
     * {@code SentenceSplitterTraining.txt}, read as UTF-8.
     * @return The trained model, or {@code null} if the resource is missing
     * or training failed
     */
    private SentenceModel trainSplitterModel() {
        InputStream trainingIn = getClass().getResourceAsStream("SentenceSplitterTraining.txt");
        // getResourceAsStream returns null (rather than throwing) when the
        // resource cannot be found
        if (trainingIn == null) {
            LOGGER.log(Level.WARNING,
                    "Could not create sentence splitter model: training resource not found.");
            return null;
        }

        ObjectStream<String> lineStream = new PlainTextByLineStream(
                trainingIn, Charset.forName("UTF-8"));
        ObjectStream<SentenceSample> sampleStream = new SentenceSampleStream(lineStream);
        try {
            return SentenceDetectorME.train("gr", sampleStream, true, null);
        } catch (IOException ex) {
            LOGGER.log(Level.WARNING, "Could not create sentence splitter model.", ex);
            return null;
        } finally {
            try {
                sampleStream.close();
            } catch (IOException ex) {
                LOGGER.log(Level.WARNING, "Could not close sentence splitter training data.", ex);
            }
        }
    }

    /**
     * Writes the model to the given file so that it can be re-used by later
     * runs. Failures are logged and otherwise ignored: the in-memory model
     * remains usable.
     * @param model The model to serialise
     * @param fModelFile The file to (over)write
     */
    private void saveSplitterModel(SentenceModel model, File fModelFile) {
        OutputStream modelOut = null;
        boolean bSaved = false;
        try {
            modelOut = new BufferedOutputStream(new FileOutputStream(fModelFile));
            model.serialize(modelOut);
            bSaved = true;
        } catch (IOException ex) {
            LOGGER.log(Level.WARNING, "Could not save sentence splitter model.", ex);
        } finally {
            if (modelOut != null) {
                try {
                    modelOut.close();
                } catch (IOException ex) {
                    bSaved = false;
                    LOGGER.log(Level.SEVERE, "Could not finalize sentence splitter model.", ex);
                }
            }
        }

        if (!bSaved) {
            // Best effort: do not leave a partially written file behind,
            // since it would only fail to load on the next start-up
            fModelFile.delete();
        }
    }

    /**
     * Builds the summary of every known topic and remembers the outcome as
     * the in-memory summary map.
     * @return A map from topic ID to the list of summary sentences of that
     * topic
     */
    public Map<String, List<Sentence>> getSummaries() {
        LOGGER.log(Level.INFO, "Obtaining Summaries...");

        final Map<String, List<Sentence>> mSummariesById =
                new HashMap<String, List<Sentence>>();

        // Walk over the topics, summarising one at a time
        final Iterator<Topic> itTopics = stTopics.iterator();
        while (itTopics.hasNext()) {
            final Topic tNext = itTopics.next();
            final String sTopicId = tNext.getID();
            final List<Sentence> lTopicSummary = getSummary(tNext);
            mSummariesById.put(sTopicId, lTopicSummary);
        }

        // Publish the complete map only after all topics have been processed
        hsSentencesPerCluster = mSummariesById;
        LOGGER.log(Level.INFO, "Summaries obtained Succesfully");
        return mSummariesById;
    }
    /**
     * Returns the summary of a topic, creating it if necessary.
     * <p>
     * The summary is looked up, in this order: in the in-memory map
     * ({@link #hsSentencesPerCluster}), then in {@link #SummaryStorage}.
     * If neither holds it, the summary is built from the topic's articles:
     * <ul>
     * <li>a topic with exactly one article yields that article's sentences
     * in their original order (this result is not stored);</li>
     * <li>otherwise all sentences are clustered, a content graph is merged
     * from the cluster graphs, the sentences are sorted by their Value
     * Similarity to that graph and the result is saved to storage (unless an
     * object for this topic already exists there).</li>
     * </ul>
     * @param tTopic The Topic that will be processed
     * @return The list of Sentence objects making up the summary of the topic
     */
    public List<Sentence> getSummary(Topic tTopic) {
        // Check if already loaded in-memory
        if (hsSentencesPerCluster != null
                && hsSentencesPerCluster.containsKey(tTopic.getID())) {
            return hsSentencesPerCluster.get(tTopic.getID());
        }

        // Check if on disk
        if (SummaryStorage.existsObject(tTopic.getID(), SUMMARY_OBJTYPE)) {
            Object oStored = SummaryStorage.loadObject(tTopic.getID(), SUMMARY_OBJTYPE);
            // Accept any List implementation; anything else (including null)
            // counts as "not loaded" and the summary is re-created below
            if (oStored instanceof List) {
                @SuppressWarnings("unchecked")
                List<Sentence> lStored = (List<Sentence>) oStored;
                return lStored;
            }
        }

        // Not available: build it. A LinkedList is kept on purpose, so that
        // saved summaries keep the concrete type they were always stored with.
        final LinkedList<Sentence> lAllSentences = new LinkedList<Sentence>();

        // If only a single document, its sentences are the summary
        if (tTopic.size() == 1) {
            appendSentences(tTopic.get(0), lAllSentences);
            return lAllSentences;
        }

        // For every article in cluster
        for (Article aCur : tTopic) {
            appendSentences(aCur, lAllSentences);
        }

        // Get sentence clusters
        Set<Set<Sentence>> sSentenceClusters = getClusters(lAllSentences);

        // Merge the common graph of every cluster into one content graph,
        // the k-th cluster being merged with weight 1/k
        double dCnt = 0.0;
        final DocumentNGramSymWinGraph dgContentGraph = new DocumentNGramSymWinGraph();
        for (Set<Sentence> ssCurCluster : sSentenceClusters) {
            DocumentNGramGraph dgCluster = getGraphFromCluster(ssCurCluster);
            dgContentGraph.merge(dgCluster, 1.0 / ++dCnt);
        }

        // Score every sentence once, by Value Similarity (and not NVS) to the
        // content graph. Doing this up front avoids rebuilding graphs and
        // recomputing similarities on every comparison made by the sort.
        // Keys are compared by identity, so Sentence.equals/hashCode are not
        // relied upon.
        final Map<Sentence, Double> mScores = new IdentityHashMap<Sentence, Double>();
        for (Sentence sCur : lAllSentences) {
            DocumentNGramGraph dgSentence = new DocumentNGramSymWinGraph();
            dgSentence.setDataString(sCur.getSnippet());
            NGramCachedGraphComparator ngc = new NGramCachedNonSymmGraphComparator();
            mScores.put(sCur,
                    ngc.getSimilarityBetween(dgSentence, dgContentGraph).ValueSimilarity);
        }

        // Order sentences by score, ascending (lowest similarity first).
        // NOTE(review): ascending order is what the code has always produced;
        // confirm that consumers expect the best sentences at the end.
        Collections.sort(lAllSentences, new Comparator<Sentence>() {

            @Override
            public int compare(Sentence t, Sentence t1) {
                return (int)Math.signum(mScores.get(t) - mScores.get(t1));
            }
        });
        // TODO: Check sentences with most Named Entities?
        // TODO: Extract other features?

        // Save summary (best effort), only if it is not already available
        try {
            if (!SummaryStorage.existsObject(tTopic.getID(), SUMMARY_OBJTYPE)) {
                SummaryStorage.saveObject(lAllSentences,
                  tTopic.getID(), SUMMARY_OBJTYPE);
            }
        } catch (Exception ex) {
            LOGGER.log(Level.WARNING,
                    "Could Not Save Summary with Topic ID " + tTopic.getID(), ex);
        }
        // Return sorted sentences
        return lAllSentences;
    }

    /**
     * Splits the text of an article into sentences and appends one Sentence
     * object per non-blank sentence to the given list.
     * @param aSource The article to split
     * @param lTarget The list that receives the sentences
     */
    private void appendSentences(Article aSource, List<Sentence> lTarget) {
        String[] saSentences;
        if (smSplitter == null) {
            // No model available: use plain splitting on punctuation
            saSentences = aSource.getText().split("[.!?;:\"']");
        } else {
            SentenceDetectorME sentenceDetector = new SentenceDetectorME(smSplitter);
            saSentences = sentenceDetector.sentDetect(aSource.getText());
        }

        for (String sCurSentence : saSentences) {
            String sTrimmed = sCurSentence.trim();
            // Skip fragments that are empty after trimming
            if (sTrimmed.length() > 0) {
                lTarget.add(new Sentence(sTrimmed, aSource.getSource(), aSource.getFeed()));
            }
        }
    }

    /**
     * Builds the n-gram graph of a sentence cluster: the graph is seeded with
     * the snippet of the first sentence and {@code intersectGraph} is then
     * invoked on it with the graph of every further sentence.
     * @param ssCluster The Sentences to process
     * @return The graph for the specified set of sentences; an empty graph if
     * the set is empty
     */
    protected DocumentNGramGraph getGraphFromCluster(Set<Sentence> ssCluster) {
        final DocumentNGramSymWinGraph dgCommon = new DocumentNGramSymWinGraph();
        final Iterator<Sentence> itSentences = ssCluster.iterator();

        // The first sentence (if any) seeds the graph
        if (itSentences.hasNext()) {
            dgCommon.setDataString(itSentences.next().getSnippet());
        }

        // Every remaining sentence is intersected with it
        while (itSentences.hasNext()) {
            final DocumentNGramSymWinGraph dgNext = new DocumentNGramSymWinGraph();
            dgNext.setDataString(itSentences.next().getSnippet());
            // NOTE(review): the return value of intersectGraph is ignored. If
            // the library returns the intersection as a new graph instead of
            // modifying the receiver, the result is lost here - confirm
            // against the JInsect API.
            dgCommon.intersectGraph(dgNext);
        }

        return dgCommon;
    }

    /**
     * Clusters a set of sentences. Uses Markov Clustering (MCL): the
     * column-normalised similarity matrix is repeatedly squared (expansion)
     * and re-normalised after raising its elements to the power of 2
     * (inflation), until two successive matrices differ by less than 0.001
     * (infinity norm) or 100 iterations have been performed.
     * <p>
     * Every row of the final matrix that has at least one element above 0.01
     * yields a cluster made of the sentences of those columns. Rows without
     * such an element do not describe a cluster and are skipped, so the
     * result never contains an empty set.
     * @param lAllSentences The List of sentences to cluster.
     * @return A set of Set<Sentence> objects, which constitute clusters
     * of a set of given sentences.
     * @throws IllegalStateException if the similarity matrix could not be
     * computed (getSimilarityMatrix returned null)
     */
    protected NavigableSet<Set<Sentence>> getClusters(List<Sentence> lAllSentences) {
        // Create navigable set
        TreeSet<Set<Sentence>> tsRes = new TreeSet<Set<Sentence>>(new Comparator<Set<Sentence>>() {
            @Override
            public int compare(Set<Sentence> t, Set<Sentence> t1) {
                // Use string representations for comparison
                return utils.printIterable(t, "***").compareTo(utils.printIterable(t1, "***"));
            }
        });

        // Get similarities
        Matrix mSims = getSimilarityMatrix(lAllSentences);
        if (mSims == null) {
            // Fail with a clear message instead of a NullPointerException below
            throw new IllegalStateException(
                    "Could not cluster sentences: the similarity matrix is not available.");
        }

        // Initial step
        // Normalize per column to render stochastic
        normalizeMatrixPerColumn(mSims, 1.0);
        Matrix mLastRes = null;

        // Until convergence or 100 iterations
        for (int iIter = 0; iIter < 100; iIter++) {
            // Expand by squaring the matrix
            mLastRes = mSims.times(mSims);
            // Inflate
            normalizeMatrixPerColumn(mLastRes, 2.0);
            // If convergence has been achieved
            if (mSims.minus(mLastRes).normInf() < 0.001)
                break;
            // Continue from the last result. No copy is needed: mLastRes is
            // replaced by a new matrix at the start of the next iteration.
            mSims = mLastRes;
        }

        // Random access below; avoid positional get() on a linked list
        List<Sentence> lIndexed = new ArrayList<Sentence>(lAllSentences);

        // Final step: Interpret results
        // For each row
        for (int iRow = 0; iRow < mLastRes.getRowDimension(); iRow++) {
            Set<Sentence> sCluster = new HashSet<Sentence>();
            // For all columns
            for (int iCol = 0; iCol < mLastRes.getColumnDimension(); iCol++)
            {
                // If it contains a non-zero element (above 0.01)
                if (mLastRes.get(iRow, iCol) > 0.01)
                    // Add it to the current cluster
                    sCluster.add(lIndexed.get(iCol));
            }
            // Add cluster to result set, ignoring rows that attract nothing
            if (!sCluster.isEmpty())
                tsRes.add(sCluster);
        }

        // Return clusters
        return tsRes;
    }

    /**
     * Calculates a similarity matrix (including self-similarity), by using NVS
     * calculation. Element (i, j) holds the Value Similarity of sentences i
     * and j divided by their Size Similarity (0.0 if the Size Similarity is
     * 0.0); the diagonal is set to 1.0. One task per row is executed on a
     * thread pool sized to the number of available processors.
     * @param lAllSentences The sentences to compare to each other; indices in
     * the matrix follow the iteration order of this list
     * @return The square similarity matrix, or {@code null} if the calling
     * thread was interrupted while waiting for the computation to finish (the
     * interrupt status of the thread is restored in that case)
     */
    protected Matrix getSimilarityMatrix(List<Sentence> lAllSentences) {
        // Init sim matrix
        final Matrix mSims = new Matrix(lAllSentences.size(), lAllSentences.size());
        // Perform parallel execution
        ExecutorService es = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
        // Init final vars
        final List<Sentence> lAllSentencesArg = lAllSentences;
        int iFirstCnt = 0;

        // For every sentence pair in cluster
        for (final Sentence sFirst : lAllSentences) {
            final int iFirstCntArg = iFirstCnt;
            es.submit(new Runnable() {

                @Override
                public void run() {
                    try {
                        double dSim = 0.0;
                        int iSecondCnt = 0;
                        NGramCachedGraphComparator ngc = new NGramCachedGraphComparator();
                        // Create first graph
                        DocumentNGramGraph gFirst = new DocumentNGramSymWinGraph();
                        gFirst.setDataString(sFirst.getSnippet());

                        for (Sentence sSecond : lAllSentencesArg) {
                            if (iSecondCnt == iFirstCntArg)
                                dSim = 1.0;
                            else {
                                // Create second graph
                                // TODO: Use cache?
                                DocumentNGramGraph gSecond = new DocumentNGramSymWinGraph();
                                gSecond.setDataString(sSecond.getSnippet());
                                // Calculate Normalized Value Similarity
                                GraphSimilarity gsCur = ngc.getSimilarityBetween(gFirst, gSecond);
                                dSim = gsCur.SizeSimilarity == 0.0 ? 0.0 :
                                        gsCur.ValueSimilarity / gsCur.SizeSimilarity;
                            }
                            // Set to matrix
                            synchronized (mSims) {
                                mSims.set(iFirstCntArg, iSecondCnt, dSim);
                            }
                            iSecondCnt++;
                        }
                    } catch (RuntimeException ex) {
                        // The executor would otherwise swallow the failure
                        // silently, leaving part of the row at zero
                        LOGGER.log(Level.WARNING,
                                "Could not compute similarities for sentence #" + iFirstCntArg, ex);
                    }
                }
            });
            iFirstCnt++;
        }
        // Complete comparisons
        es.shutdown();
        try {
            if (!es.awaitTermination(1, TimeUnit.DAYS)) {
                LOGGER.log(Level.WARNING,
                        "Sentence similarity calculation timed out; the similarity matrix may be incomplete.");
            }
        } catch (InterruptedException ex) {
            // Stop the remaining work and keep the interrupt visible to callers
            es.shutdownNow();
            Thread.currentThread().interrupt();
            LOGGER.log(Level.SEVERE, "Interrupted while calculating sentence similarities.", ex);
            return null;
        }

        return mSims;
    }

    /**
     * Normalizes a matrix on a per column basis: every element is raised to
     * the given power and then divided by the sum of the (powered) elements
     * of its column, so that each column sums to 1.
     * <p>
     * A column whose powered elements sum to zero cannot be scaled; its
     * elements are left at their powered values instead of being divided by
     * zero (which would fill the column with NaN or infinite values).
     * @param mToNormalize The matrix to normalize <b>in place</b>.
     * @param dPower The power to raise the elements to, before normalization
     * @return The normalized matrix (the same instance that was passed in).
     */
    protected Matrix normalizeMatrixPerColumn(Matrix mToNormalize, double dPower) {
        final int iRows = mToNormalize.getRowDimension();
        final int iColumns = mToNormalize.getColumnDimension();

        // For every column
        for (int iColumnCnt = 0; iColumnCnt < iColumns; iColumnCnt++) {
            // First pass: raise to the power and determine the column sum
            double dColSum = 0.0;
            for (int iRowCnt = 0; iRowCnt < iRows; iRowCnt++) {
                double dPowered = Math.pow(mToNormalize.get(iRowCnt, iColumnCnt), dPower);
                // Update matrix value
                mToNormalize.set(iRowCnt, iColumnCnt, dPowered);
                // Update sum
                dColSum += dPowered;
            }

            // Nothing to scale by: keep the powered values, avoid 0/0
            if (dColSum == 0.0) {
                continue;
            }

            // Second pass: scale the column so that it sums to 1
            for (int iRowCnt = 0; iRowCnt < iRows; iRowCnt++) {
                double dNormalized = mToNormalize.get(iRowCnt, iColumnCnt) / dColSum;
                // Update matrix value to normalized value
                mToNormalize.set(iRowCnt, iColumnCnt, dNormalized);
            }
        }

        return mToNormalize;
    }
}