/*
* Copyright 2013 SciFY NPO <info@scify.org>.
*
* This product is part of the NewSum Free Software.
* For more information about NewSum visit
*
* http://www.scify.gr/site/en/our-projects/completed-projects/newsum-menu-en
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* If this code or its output is used, extended, re-engineered, integrated,
* or embedded to any extent in another software or hardware, there MUST be
* an explicit attribution to this work in the resulting source code,
* the packaging (where such packaging exists), or user interface
* (where such an interface exists).
* The attribution must be of the form "Powered by NewSum, SciFY"
*/
package org.scify.NewSumServer.Server.Summarisation;
import Jama.Matrix;
import gr.demokritos.iit.jinsect.documentModel.comparators.NGramCachedGraphComparator;
import gr.demokritos.iit.jinsect.documentModel.comparators.NGramCachedNonSymmGraphComparator;
import gr.demokritos.iit.jinsect.documentModel.representations.DocumentNGramGraph;
import gr.demokritos.iit.jinsect.documentModel.representations.DocumentNGramSymWinGraph;
import gr.demokritos.iit.jinsect.storage.INSECTDB;
import gr.demokritos.iit.jinsect.structs.GraphSimilarity;
import gr.demokritos.iit.jinsect.utils;
import java.io.*;
import java.nio.charset.Charset;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.sentdetect.SentenceSampleStream;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import org.scify.NewSumServer.Server.Structures.Article;
import org.scify.NewSumServer.Server.Structures.Sentence;
import org.scify.NewSumServer.Server.Structures.Topic;
import static org.scify.NewSumServer.Server.Summarisation.Summariser.LOGGER;
import org.scify.NewSumServer.Server.Utils.Main;
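/**
 * Produces per-topic summaries. The articles of a topic are split into
 * sentences, the sentences are clustered (Markov Clustering over n-gram graph
 * similarities) and finally ordered by their value similarity to a content
 * graph built from the clusters. Computed summaries are persisted through the
 * supplied {@link INSECTDB} storage.
 *
 * @author ggianna
 */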
public class Summariser {
/**
 * The Logger used throughout this class, obtained from {@code Main.getLogger()}.
 */
protected final static Logger LOGGER = Main.getLogger();
// Sentence model constants
// NOTE(review): SENTENCE_MODEL_OBJNAME and SENTENCE_MODEL_OBJTYPE are not
// referenced anywhere else in this class; presumably kept for subclasses or
// for storing the splitter model through the storage module -- confirm.
protected final String SENTENCE_MODEL_OBJNAME = "SentenceModel";
protected final String SENTENCE_MODEL_OBJTYPE = "NLPModel";
// Object category under which summaries are looked up, loaded and saved
// in SummaryStorage (the object name being the topic ID).
protected final String SUMMARY_OBJTYPE = "Summary";
//DONE SAVE SUMMARY, for each Summary Created using InsectFileIO.saveSummary
/**
 * In-memory map from topic ID to the summary sentences of that topic.
 * It is assigned by {@code getSummaries()} and consulted first by
 * {@code getSummary(Topic)}; it stays {@code null} until
 * {@code getSummaries()} has completed once.
 */
protected Map<String, List<Sentence>> hsSentencesPerCluster;
/**
 * The Topics to summarize, as passed to the constructor.
 */
protected Set<Topic> stTopics;
/**
 * Storage used to check for, load and save summaries.
 */
protected INSECTDB SummaryStorage;
/**
 * Sentence splitter model. Remains {@code null} when no model could be
 * initialized, in which case sentence splitting falls back to a plain
 * punctuation-based regular expression.
 */
protected SentenceModel smSplitter = null;
/**
 * Builds a Summariser over a set of topics, backed by a storage module, and
 * prepares the sentence splitter model as part of construction.
 *
 * @param stTopics The topics to summarize from
 * @param SummaryStorage The module used for storing and retrieving summaries
 */
public Summariser(Set<Topic> stTopics,
        INSECTDB SummaryStorage) {
    // Remember the collaborators handed in by the caller
    this.SummaryStorage = SummaryStorage;
    this.stTopics = stTopics;
    // Load (or train) the sentence splitter model
    this.initSplitter();
}
/**
 * Initializes the sentence splitter model ({@code smSplitter}).
 * <p>
 * The model is first looked up in the file {@code splitModel.dat} under
 * {@code Main.sToolPath}. If it cannot be loaded from there, a new model is
 * trained from the {@code SentenceSplitterTraining.txt} class resource and a
 * best-effort attempt is made to persist it to that file. A model that was
 * loaded from disk is not written back, and a model that was obtained
 * successfully is used even if persisting it fails.
 * <p>
 * If no model can be obtained, {@code smSplitter} is left {@code null}.
 */
private void initSplitter() {
    File fModel = new File(Main.sToolPath + "splitModel.dat");
    // Prefer the model already available on disk
    SentenceModel model = loadSplitterModel(fModel);
    if (model == null) {
        // Not available (or unreadable): train a fresh one
        model = trainSplitterModel();
        if (model == null) {
            return;
        }
        // Only a newly trained model needs to be written out
        saveSplitterModel(model, fModel);
    }
    this.smSplitter = model;
}

/**
 * Tries to load the sentence splitter model from the given file.
 *
 * @param fModel The file expected to hold a serialized model
 * @return The loaded model, or {@code null} if the file does not exist or
 * could not be read as a model
 */
private SentenceModel loadSplitterModel(File fModel) {
    if (!fModel.exists()) {
        return null;
    }
    SentenceModel model = null;
    InputStream modelIn = null;
    try {
        modelIn = new FileInputStream(fModel);
        model = new SentenceModel(modelIn);
    } catch (IOException e) {
        LOGGER.log(Level.WARNING, "Could not load sentence splitter model.", e);
    } finally {
        // Finalize model file access, if possible
        if (modelIn != null) {
            try {
                modelIn.close();
            } catch (IOException e) {
                // The model (if any) was already constructed from the stream,
                // so a failure to close does not invalidate it
                LOGGER.log(Level.WARNING, "Could not close sentence splitter model file.", e);
            }
        }
    }
    return model;
}

/**
 * Trains a sentence splitter model from the bundled training resource.
 *
 * @return The trained model, or {@code null} if the training resource is
 * missing or training failed with an I/O error
 */
private SentenceModel trainSplitterModel() {
    InputStream isTraining = getClass().getResourceAsStream("SentenceSplitterTraining.txt");
    if (isTraining == null) {
        // Without this guard the stream wrappers below would fail on null
        LOGGER.log(Level.WARNING,
                "Could not create sentence splitter model: training resource not found.");
        return null;
    }
    Charset charset = Charset.forName("UTF-8");
    ObjectStream<String> lineStream = new PlainTextByLineStream(isTraining, charset);
    ObjectStream<SentenceSample> sampleStream = new SentenceSampleStream(lineStream);
    try {
        return SentenceDetectorME.train("gr", sampleStream, true, null);
    } catch (IOException ex) {
        LOGGER.log(Level.WARNING, "Could not create sentence splitter model.", ex);
        return null;
    } finally {
        try {
            sampleStream.close();
        } catch (IOException ex) {
            LOGGER.log(Level.WARNING, "Could not create sentence splitter model.", ex);
        }
    }
}

/**
 * Serializes the given model to the given file. Failures are logged only;
 * the in-memory model remains usable regardless of the outcome.
 *
 * @param model The model to persist
 * @param fModel The destination file
 */
private void saveSplitterModel(SentenceModel model, File fModel) {
    OutputStream modelOut = null;
    try {
        modelOut = new BufferedOutputStream(new FileOutputStream(fModel));
        model.serialize(modelOut);
    } catch (IOException ex) {
        LOGGER.log(Level.WARNING, "Could not create sentence splitter model.", ex);
    } finally {
        if (modelOut != null) {
            try {
                modelOut.close();
            } catch (IOException ex) {
                LOGGER.log(Level.SEVERE, "Could not finalize sentence splitter model.", ex);
            }
        }
    }
}
/**
 * Obtains the summary of every topic known to this Summariser and keeps the
 * resulting map as the in-memory summary map.
 * @return A map from topic ID to the list of summary sentences of that topic
 */
public Map<String, List<Sentence>> getSummaries() {
    LOGGER.log(Level.INFO, "Obtaining Summaries...");
    // Collect one summary per topic, keyed by the topic ID
    final Map<String, List<Sentence>> mSummaryPerTopic = new HashMap<String, List<Sentence>>();
    final Iterator<Topic> itTopics = stTopics.iterator();
    while (itTopics.hasNext()) {
        final Topic tNext = itTopics.next();
        final String sTopicID = tNext.getID();
        final List<Sentence> lTopicSummary = getSummary(tNext);
        mSummaryPerTopic.put(sTopicID, lTopicSummary);
    }
    // Publish the map only after all topics have been processed
    hsSentencesPerCluster = mSummaryPerTopic;
    LOGGER.log(Level.INFO, "Summaries obtained Succesfully");
    return mSummaryPerTopic;
}
/**
 * Creates (or retrieves) the summary of a topic.
 * <p>
 * The summary is looked up first in the in-memory map, then in the storage.
 * If it is found in neither, it is computed:
 * <ul>
 * <li>for a topic holding a single article, the sentences of that article are
 * returned in document order (this result is not saved);</li>
 * <li>otherwise the sentences of all articles are clustered, a content graph
 * is merged from the cluster graphs, the sentences are sorted in ascending
 * order of their value similarity to that content graph, and the result is
 * saved to the storage unless an object already exists there for the
 * topic.</li>
 * </ul>
 * @param tTopic The Topic that will be processed
 * @return A List of Sentence Objects for the specified topic.
 */
public List<Sentence> getSummary(Topic tTopic) {
    final String sTopicID = tTopic.getID();
    // Check if already loaded in-memory
    if (hsSentencesPerCluster != null && hsSentencesPerCluster.containsKey(sTopicID)) {
        return hsSentencesPerCluster.get(sTopicID);
    }
    // Check if on disk
    if (SummaryStorage.existsObject(sTopicID, SUMMARY_OBJTYPE)) {
        Object oStored = SummaryStorage.loadObject(sTopicID, SUMMARY_OBJTYPE);
        if (oStored instanceof List) {
            @SuppressWarnings("unchecked") // summaries are saved below as lists of Sentence
            List<Sentence> lStored = (List<Sentence>) oStored;
            return lStored;
        }
        if (oStored != null) {
            // Incompatible stored object: recompute instead of failing on a cast
            LOGGER.log(Level.WARNING, "Stored summary for Topic ID {0} has an unexpected type",
                    sTopicID);
        }
    }
    // Not available anywhere: build it from the articles
    LinkedList<Sentence> lAllSentences = new LinkedList<Sentence>();
    // One detector serves all articles of this call; null means plain splitting
    SentenceDetectorME sentenceDetector =
            (smSplitter == null) ? null : new SentenceDetectorME(smSplitter);
    // If only a single document, its sentences are the summary as they are
    if (tTopic.size() == 1) {
        addSentencesOf(tTopic.get(0), sentenceDetector, lAllSentences);
        return lAllSentences;
    }
    // For every article in cluster
    for (Article aCur : tTopic) {
        addSentencesOf(aCur, sentenceDetector, lAllSentences);
    }
    // Get sentence clusters
    Set<Set<Sentence>> sSentenceClusters = getClusters(lAllSentences);
    // Merge the graph of each cluster into the content graph, with a
    // learning factor of 1/k for the k-th cluster
    double dCnt = 0.0;
    final DocumentNGramSymWinGraph dgContentGraph = new DocumentNGramSymWinGraph();
    for (Set<Sentence> ssCurCluster : sSentenceClusters) {
        DocumentNGramGraph dgCluster = getGraphFromCluster(ssCurCluster);
        dgContentGraph.merge(dgCluster, 1.0 / ++dCnt);
    }
    // Compute the Value Similarity (and not NVS) of each sentence to the
    // content graph once, instead of rebuilding graphs on every comparison.
    // Keys are compared by identity, so the map is independent of how
    // Sentence implements equals/hashCode.
    final Map<Sentence, Double> mSimilarity = new IdentityHashMap<Sentence, Double>();
    NGramCachedGraphComparator ngc = new NGramCachedNonSymmGraphComparator();
    for (Sentence sCur : lAllSentences) {
        DocumentNGramGraph dgSentence = new DocumentNGramSymWinGraph();
        dgSentence.setDataString(sCur.getSnippet());
        double dVS = ngc.getSimilarityBetween(dgSentence, dgContentGraph).ValueSimilarity;
        mSimilarity.put(sCur, dVS);
    }
    // Order sentences by the precomputed similarity
    Collections.sort(lAllSentences, new Comparator<Sentence>() {
        @Override
        public int compare(Sentence t, Sentence t1) {
            return (int) Math.signum(mSimilarity.get(t) - mSimilarity.get(t1));
        }
    });
    // TODO: Check sentences with most Named Entities?
    // TODO: Extract other features?
    // Save summary, only if nothing is stored for this topic yet
    try {
        if (!SummaryStorage.existsObject(sTopicID, SUMMARY_OBJTYPE)) {
            SummaryStorage.saveObject(lAllSentences, sTopicID, SUMMARY_OBJTYPE);
        }
    } catch (Exception ex) {
        // Best effort: the summary is still returned, but keep the cause in the log
        LOGGER.log(Level.WARNING, "Could Not Save Summary with Topic ID " + sTopicID + " ", ex);
    }
    // Return sorted sentences
    return lAllSentences;
}

/**
 * Splits the text of an article into sentences and appends every non-empty
 * (after trimming) sentence to the target list.
 *
 * @param aCur The article to split
 * @param sentenceDetector The model-based detector to use, or {@code null}
 * to split on the punctuation characters {@code . ! ? ; : " '}
 * @param lTarget The list receiving the created Sentence objects
 */
private void addSentencesOf(Article aCur, SentenceDetectorME sentenceDetector,
        List<Sentence> lTarget) {
    String sText = aCur.getText();
    if (sText == null) {
        // Defensive: an article without text contributes no sentences
        return;
    }
    String[] saSentences;
    if (sentenceDetector == null) {
        // Use plain splitting
        saSentences = sText.split("[.!?;:\"']");
    } else {
        saSentences = sentenceDetector.sentDetect(sText);
    }
    for (String sCurSentence : saSentences) {
        String sTrimmed = sCurSentence.trim();
        if (sTrimmed.length() > 0) {
            lTarget.add(new Sentence(sTrimmed, aCur.getSource(), aCur.getFeed()));
        }
    }
}
/**
 * Builds the n-gram graph of a sentence cluster: the graph is created from
 * the snippet of the first sentence met, and every further sentence graph is
 * passed to {@code intersectGraph} on it. An empty cluster yields a graph
 * with no data set.
 *
 * @param ssCluster The Sentences to process
 * @return The graph for the specified set of sentences
 */
protected DocumentNGramGraph getGraphFromCluster(Set<Sentence> ssCluster) {
    DocumentNGramSymWinGraph dgCommon = new DocumentNGramSymWinGraph();
    Iterator<Sentence> itSentences = ssCluster.iterator();
    // The first sentence (if any) seeds the graph
    if (itSentences.hasNext()) {
        dgCommon.setDataString(itSentences.next().getSnippet());
    }
    // Every remaining sentence is intersected with it
    while (itSentences.hasNext()) {
        DocumentNGramSymWinGraph dgOther = new DocumentNGramSymWinGraph();
        dgOther.setDataString(itSentences.next().getSnippet());
        // NOTE(review): whatever intersectGraph returns is ignored here. If the
        // library returns the intersection as a new graph instead of narrowing
        // dgCommon in place, the result is just the first sentence's graph --
        // confirm against the JInsect API.
        dgCommon.intersectGraph(dgOther);
    }
    return dgCommon;
}
/**
 * Clusters a list of sentences with Markov Clustering (MCL) over their
 * similarity matrix: the column-normalized matrix is repeatedly squared
 * (expansion) and re-normalized after raising its elements to the power of 2
 * (inflation), until two successive matrices differ by less than 0.001 in
 * infinity norm or 100 iterations have run. Each row of the final matrix
 * then yields one cluster, made of the sentences whose column value in that
 * row exceeds 0.01.
 * <p>
 * The returned set orders clusters by their printed form, so clusters that
 * print identically are kept only once.
 *
 * @param lAllSentences The List of sentences to cluster.
 * @return A set of Set<Sentence> objects, which constitute clusters
 * of a set of given sentences.
 */
protected NavigableSet<Set<Sentence>> getClusters(List<Sentence> lAllSentences) {
    // Clusters are compared through their string representations
    final Comparator<Set<Sentence>> cmpByPrintedForm = new Comparator<Set<Sentence>>() {
        @Override
        public int compare(Set<Sentence> t, Set<Sentence> t1) {
            return utils.printIterable(t, "***").compareTo(utils.printIterable(t1, "***"));
        }
    };
    final TreeSet<Set<Sentence>> tsClusters = new TreeSet<Set<Sentence>>(cmpByPrintedForm);

    // Start from the similarities, rendered column-stochastic.
    // NOTE: getSimilarityMatrix returns null when interrupted, in which case
    // the normalization call right below fails with a NullPointerException.
    Matrix mCurrent = getSimilarityMatrix(lAllSentences);
    normalizeMatrixPerColumn(mCurrent, 1.0);

    Matrix mNext = null;
    int iIter = 0;
    while (iIter < 100) {
        // Expansion: square the matrix
        mNext = mCurrent.times(mCurrent);
        // Inflation: element-wise power of 2, then column normalization
        normalizeMatrixPerColumn(mNext, 2.0);
        // Stop as soon as the matrix no longer changes noticeably
        final boolean bConverged = mCurrent.minus(mNext).normInf() < 0.001;
        if (bConverged) {
            break;
        }
        // Otherwise carry a copy of the result into the next round
        mCurrent = mNext.copy();
        iIter++;
    }

    // Interpretation: one (possibly empty) cluster per row
    final int iRows = mNext.getRowDimension();
    final int iCols = mNext.getColumnDimension();
    for (int iRow = 0; iRow < iRows; iRow++) {
        final Set<Sentence> sMembers = new HashSet<Sentence>();
        for (int iCol = 0; iCol < iCols; iCol++) {
            // Values above 0.01 count as non-zero membership
            if (mNext.get(iRow, iCol) > 0.01) {
                sMembers.add(lAllSentences.get(iCol));
            }
        }
        tsClusters.add(sMembers);
    }
    return tsClusters;
}
/**
 * Calculates the pairwise similarity matrix of the given sentences. The
 * diagonal (self-similarity) is set to 1.0; every other cell (i, j) holds the
 * Normalized Value Similarity of sentence i to sentence j, i.e. value
 * similarity divided by size similarity, or 0.0 when the size similarity is
 * 0.0.
 * <p>
 * Each row is computed by a separate task on a fixed thread pool sized to
 * the number of available processors. A task that fails with a runtime
 * exception is logged and leaves the rest of its row unset.
 *
 * @param lAllSentences The sentences to compare; cell indices follow the
 * iteration order of this list
 * @return The similarity matrix, or {@code null} if the calling thread was
 * interrupted while waiting for the computation (the interrupt flag is
 * restored in that case)
 */
protected Matrix getSimilarityMatrix(List<Sentence> lAllSentences) {
    // Init sim matrix
    final Matrix mSims = new Matrix(lAllSentences.size(), lAllSentences.size());
    // Perform parallel execution
    ExecutorService es = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
    // Init final vars
    final List<Sentence> lAllSentencesArg = lAllSentences;
    int iFirstCnt = 0;
    // One task per sentence, filling that sentence's row
    for (final Sentence sFirst : lAllSentences) {
        final int iFirstCntArg = iFirstCnt;
        es.submit(new Runnable() {
            @Override
            public void run() {
                try {
                    double dSim = 0.0;
                    int iSecondCnt = 0;
                    NGramCachedGraphComparator ngc = new NGramCachedGraphComparator();
                    // Create first graph
                    DocumentNGramGraph gFirst = new DocumentNGramSymWinGraph();
                    gFirst.setDataString(sFirst.getSnippet());
                    for (Sentence sSecond : lAllSentencesArg) {
                        if (iSecondCnt == iFirstCntArg) {
                            dSim = 1.0;
                        } else {
                            // Create second graph
                            // TODO: Use cache?
                            DocumentNGramGraph gSecond = new DocumentNGramSymWinGraph();
                            gSecond.setDataString(sSecond.getSnippet());
                            // Calculate Normalized Value Similarity
                            GraphSimilarity gsCur = ngc.getSimilarityBetween(gFirst, gSecond);
                            dSim = gsCur.SizeSimilarity == 0.0 ? 0.0 :
                                    gsCur.ValueSimilarity / gsCur.SizeSimilarity;
                        }
                        // Set to matrix
                        synchronized (mSims) {
                            mSims.set(iFirstCntArg, iSecondCnt, dSim);
                        }
                        iSecondCnt++;
                    }
                } catch (RuntimeException ex) {
                    // submit() would otherwise hide the failure inside an
                    // unobserved Future; make it visible in the log
                    LOGGER.log(Level.SEVERE,
                            "Could not compute similarities for sentence #" + iFirstCntArg, ex);
                }
            }
        });
        iFirstCnt++;
    }
    // Complete comparisons
    es.shutdown();
    try {
        if (!es.awaitTermination(1, TimeUnit.DAYS)) {
            LOGGER.log(Level.WARNING,
                    "Timed out waiting for similarity calculation; matrix may be incomplete.");
            es.shutdownNow();
        }
    } catch (InterruptedException ex) {
        // Stop the workers and keep the interruption visible to callers
        es.shutdownNow();
        Thread.currentThread().interrupt();
        LOGGER.log(Level.SEVERE, "Interrupted while calculating sentence similarities.", ex);
        return null;
    }
    return mSims;
}
/**
 * Normalizes a matrix on a per column basis: every element is first raised
 * to the given power, then divided by the sum of the powered elements of its
 * column. A column whose powered elements sum to exactly zero is left as it
 * is after the power step, instead of being turned into NaN values by a
 * division by zero.
 *
 * @param mToNormalize The matrix to normalize <b>in place</b>.
 * @param dPower The power to raise the elements to, before normalization
 * @return The normalized matrix (the same instance that was passed in).
 */
protected Matrix normalizeMatrixPerColumn(Matrix mToNormalize, double dPower) {
    final int iRows = mToNormalize.getRowDimension();
    final int iColumns = mToNormalize.getColumnDimension();
    // For every column
    for (int iColumnCnt = 0; iColumnCnt < iColumns; iColumnCnt++) {
        // First pass: apply the power and accumulate the column sum
        double dColSum = 0.0;
        for (int iRowCnt = 0; iRowCnt < iRows; iRowCnt++) {
            double dPowered = Math.pow(mToNormalize.get(iRowCnt, iColumnCnt), dPower);
            mToNormalize.set(iRowCnt, iColumnCnt, dPowered);
            dColSum += dPowered;
        }
        // Nothing to scale in a zero-sum column; dividing would only yield NaN
        if (dColSum == 0.0) {
            continue;
        }
        // Second pass: scale the column so that it sums to one
        for (int iRowCnt = 0; iRowCnt < iRows; iRowCnt++) {
            double dNormalized = mToNormalize.get(iRowCnt, iColumnCnt) / dColSum;
            mToNormalize.set(iRowCnt, iColumnCnt, dNormalized);
        }
    }
    return mToNormalize;
}
}