/*
* Copyright 2013 SciFY NPO <info@scify.org>.
*
* This product is part of the NewSum Free Software.
* For more information about NewSum visit
*
* http://www.scify.gr/site/en/our-projects/completed-projects/newsum-menu-en
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* If this code or its output is used, extended, re-engineered, integrated,
* or embedded to any extent in another software or hardware, there MUST be
* an explicit attribution to this work in the resulting source code,
* the packaging (where such packaging exists), or user interface
* (where such an interface exists).
* The attribution must be of the form "Powered by NewSum, SciFY"
*/
package org.scify.NewSumServer.Server.Summarisation;
import gr.demokritos.iit.conceptualIndex.structs.Distribution;
import gr.demokritos.iit.jinsect.documentModel.comparators.NGramCachedGraphComparator;
import gr.demokritos.iit.jinsect.documentModel.representations.DocumentNGramSymWinGraph;
import gr.demokritos.iit.jinsect.documentModel.representations.DocumentWordGraph;
import gr.demokritos.iit.jinsect.events.WordEvaluatorListener;
import gr.demokritos.iit.jinsect.structs.GraphSimilarity;
import gr.demokritos.iit.jinsect.structs.Pair;
import gr.demokritos.iit.jinsect.utils;
import java.io.*;
import java.nio.charset.Charset;
import java.text.Collator;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.search.ScoreDoc;
import org.scify.NewSumServer.Server.Searching.Indexer;
import org.scify.NewSumServer.Server.Searching.Searcher;
import org.scify.NewSumServer.Server.Storage.IDataStorage;
import org.scify.NewSumServer.Server.Structures.Article;
import org.scify.NewSumServer.Server.Structures.Topic;
import static org.scify.NewSumServer.Server.Summarisation.ArticleClusterer.LOGGER;
import org.scify.NewSumServer.Server.Utils.Main;
import org.scify.NewSumServer.Server.Utils.Utilities;
/**
* The Clusterer Class. Parses a given list of {@link Article}s and groups all
* articles talking about the same subject in one {@link Topic}.
* @author ggianna
*/
public class ArticleClusterer {
// Version marker of the clustering implementation.
// Will change when clusterer is updated
private final int VERSION = 1;
/**
* The logger used for info and higher level messages, obtained from
* {@link Main#getLogger()}.
*/
protected final static Logger LOGGER = Main.getLogger();
/**
* The separator written between a header key and its value in the
* Article text files (e.g. <code>ClusterID === &lt;id&gt;</code>); the same
* string is used to split those header lines when they are read back.
*/
protected final static String sSeparator = " === ";
/**
* Maps each cluster (Topic) ID to the {@link Topic} holding its articles.
*/
protected HashMap<String, Topic> hsArticlesPerCluster;
/**
* The cluster ID to {@link Topic} map of the previous run, as loaded
* from storage right before the current map is saved.
*/
protected HashMap<String, Topic> PreviousClusteredTopics;
/**
* Maps each clustered {@link Article} to the ID of the cluster it belongs to
* (the reverse of {@link #hsArticlesPerCluster}).
*/
protected HashMap<Article, String> hsClusterPerArticle;
/**
* A copy of the original list of Articles to process
*/
protected List<Article> origArticles;
/**
* The folder where the Articles will be saved. File names are appended
* to this string directly, so it presumably has to end with a path
* separator -- TODO confirm against callers.
*/
protected String ArticlePath;
/**
* The Data Storage Module for various I/O operations
*/
protected IDataStorage ids;
/**
* Counts the Topics that were assigned an older ID
* (incremented while comparing against the previous run)
*/
private Integer tChanged = 0;
/**
* The list containing all the pairs of articles to be fed to the
* cluster calculation engine. Wrapped in a synchronized list.
*/
private List<Pair> lsArticlePairs = Collections.synchronizedList(new ArrayList());
// Matching thresholds: normalized value similarity (NVS) and size
// similarity (SS). Both can be overridden through calculateClusters.
private double NVSThreshold = 0.30, SSThreshold = 0.125; // default
/**
 * Main constructor of the ArticleClusterer class.
 * Only initializes the internal state; no clustering happens here. Call
 * {@link #calculateClusters(double, double)} afterwards to build the clusters.
 * @param lsArticles The Articles that will be clustered (e.g. the list
 * returned by the getAllNews() method of the SourceParser class). The list
 * is copied, so later changes to it do not affect this instance.
 * @param ids The module used for storage
 * @param ArticlePath The path where the clustered Articles will be
 * stored as text files
 */
public ArticleClusterer(List<Article> lsArticles,
        IDataStorage ids,
        String ArticlePath) {
    // Keep a typed copy of the articles (the original used a raw ArrayList)
    origArticles = new ArrayList<Article>(lsArticles);
    // Init maps
    hsArticlesPerCluster = new HashMap<String, Topic>();
    hsClusterPerArticle = new HashMap<Article, String>();
    this.ids = ids;
    this.ArticlePath = ArticlePath;
}
/**
 * Compares two articles through their word graphs. Each graph is built
 * from the article title followed by a space and the article text.
 * Only words accepted by the shared word filter take part: words longer
 * than three characters that start with an upper case letter, and words
 * consisting only of digits.
 * @param aOne The First Article
 * @param aTwo The Second Article
 * @return A graph similarity object between the two articles
 */
protected GraphSimilarity compareArticles(Article aOne,
        Article aTwo) {
    // One filter instance, shared by both graphs
    final WordEvaluatorListener wordFilter = new WordEvaluatorListener() {
        @Override
        public boolean evaluateWord(String string) {
            // purely numeric tokens always pass
            if (string.matches("\\d+")) {
                return true;
            }
            // otherwise keep only "long" capitalized words
            return (string.length() > 3)
                    && string.matches("\\p{javaUpperCase}+.*");
        }
    };
    DocumentWordGraph firstGraph = new DocumentWordGraph();
    DocumentWordGraph secondGraph = new DocumentWordGraph();
    firstGraph.WordEvaluator = wordFilter;
    secondGraph.WordEvaluator = wordFilter;
    // Feed title + text of each article to its graph
    firstGraph.setDataString(aOne.getTitle() + " " + aOne.getText());
    secondGraph.setDataString(aTwo.getTitle() + " " + aTwo.getText());
    NGramCachedGraphComparator comparator = new NGramCachedGraphComparator();
    return comparator.getSimilarityBetween(firstGraph, secondGraph);
}
/**
 * Clusters the Articles and updates the
 * {@link #hsClusterPerArticle} and {@link #hsArticlesPerCluster} Maps.
 * <p>The candidate article pairs are created first, every pair is then
 * evaluated in parallel using {@link #getMatch(Article, Article)}, and
 * finally the results are folded sequentially into topics: matching
 * articles end up in the same topic, non matching ones get a topic of
 * their own. Afterwards each topic gets its newest date and title set and
 * everything is saved.</p>
 * <p>NOTE(review): topics are only created for articles that take part in
 * at least one pair; an article that never forms a pair is not clustered.</p>
 * @param NVSThresholdArg the normalized value similarity threshold to use;
 * 0 keeps the current value
 * @param SSThresholdArg the size similarity threshold to use;
 * 0 keeps the current value
 */
public void calculateClusters(double NVSThresholdArg, double SSThresholdArg) {
    if (NVSThresholdArg != 0) {
        NVSThreshold = NVSThresholdArg;
    }
    if (SSThresholdArg != 0) {
        SSThreshold = SSThresholdArg;
    }
    // Log the thresholds that are actually in effect (not the raw arguments)
    LOGGER.log(Level.INFO, "Thresholds:\nNVS :{0} - SS {1} ",
            new Object[] {String.valueOf(NVSThreshold), String.valueOf(SSThreshold)});
    // Get pairs of articles, without repetitions
    List<Pair> lsPairs = getPairs(origArticles);
    // Init parallel execution
    ExecutorService es = Executors.newFixedThreadPool(
            Runtime.getRuntime().availableProcessors());
    final ConcurrentHashMap<Pair<Article,Article>,Boolean> hmResults = new
            ConcurrentHashMap<Pair<Article, Article>, Boolean>();
    LOGGER.log(Level.INFO, "Examining pairs...");
    for (final Pair p : lsPairs) {
        es.submit(new Runnable() {
            @Override
            public void run() {
                Article aA = (Article) p.getFirst();
                Article aB = (Article) p.getSecond();
                boolean bMatch;
                try {
                    // Check whether articles match
                    bMatch = getMatch(aA, aB);
                } catch (RuntimeException ex) {
                    // Do not lose the pair silently: report the problem
                    // and treat the pair as not matching
                    LOGGER.log(Level.WARNING, "Could not compare article pair", ex);
                    bMatch = false;
                }
                // the map is concurrent, no extra locking is required
                hmResults.put(p, bMatch);
            }
        });
    }
    // Await completion
    es.shutdown();
    try {
        es.awaitTermination(1, TimeUnit.DAYS);
        LOGGER.log(Level.INFO, "Examining pairs DONE.");
    } catch (InterruptedException ex) {
        LOGGER.log(Level.SEVERE, ex.getMessage(), ex);
        // stop the workers and keep the interrupt visible to the caller
        es.shutdownNow();
        Thread.currentThread().interrupt();
        return;
    }
    // Assign clusters, for every examined pair
    for (Map.Entry<Pair<Article, Article>, Boolean> eResult : hmResults.entrySet()) {
        Article aA = eResult.getKey().getFirst();
        Article aB = eResult.getKey().getSecond();
        boolean bMatch = eResult.getValue();
        // In every case aA must belong to a cluster
        if (!hsClusterPerArticle.containsKey(aA)) {
            assignToNewTopic(aA);
        }
        if (!bMatch) {
            // No match: aB simply needs a cluster of its own, if it has none
            if (!hsClusterPerArticle.containsKey(aB)) {
                assignToNewTopic(aB);
            }
            continue;
        }
        // On a match
        if (hsClusterPerArticle.containsKey(aB)) {
            // both belong to a cluster: collapse the two clusters
            collapseTopics(hsClusterPerArticle.get(aA), hsClusterPerArticle.get(aB));
        } else {
            // aB is not clustered yet: it joins the cluster of aA.
            // (It used to get a separate new cluster, which kept matching
            // articles apart.)
            String sClusterID = hsClusterPerArticle.get(aA);
            hsArticlesPerCluster.get(sClusterID).add(aB);
            hsClusterPerArticle.put(aB, sClusterID);
        }
    }
    // debugging Method
    checkForInconsistencies();
    for (Topic tmpTopic : hsArticlesPerCluster.values()) {
        tmpTopic.setNewestDate(true);
        // Also set as the Topic Title for each Topic the Title from its newest Article
        tmpTopic.setTitleFromNewest();
    }
    // Save all articles to file, in Article Path in order to be indexed by lucene
    // Also saves the hsArticlesPerCluster Map to file, for future access
    try {
        LOGGER.log(Level.INFO, "Saving Clusters...");
        saveAllClusteredArticles();
        LOGGER.log(Level.INFO, "Clusters saved succesfully");
    } catch (IOException ex) {
        // pass the exception itself, so that the cause is not lost
        LOGGER.log(Level.SEVERE, "Could not save CLustered Articles ", ex);
    }
}
/**
 * Creates a new topic that contains only the given article and registers
 * it in both the cluster and the article maps.
 * @param aArticle the article to place in a topic of its own
 * @return the ID of the topic that was created
 */
private String assignToNewTopic(Article aArticle) {
    Topic tNew = new Topic();
    String sClusterID = tNew.getID();
    tNew.add(aArticle);
    hsArticlesPerCluster.put(sClusterID, tNew);
    hsClusterPerArticle.put(aArticle, sClusterID);
    return sClusterID;
}
/**
 * Merges two topics (clusters) into one. Every article of the second
 * topic is moved into the first one and the lookup maps are updated.
 * @param sTopic1ID The ID of the topic that survives and receives the articles.
 * @param sTopic2ID The ID of the topic that is emptied and removed.
 * @return True if a modification took place, false if both IDs
 * resolve to the same topic
 */
protected boolean collapseTopics(String sTopic1ID, String sTopic2ID) {
    final Topic survivor = hsArticlesPerCluster.get(sTopic1ID);
    final Topic absorbed = hsArticlesPerCluster.get(sTopic2ID);
    // Same topic on both sides: nothing to merge
    if (survivor == absorbed) {
        return false;
    }
    // Move the articles one by one
    Iterator<Article> members = absorbed.iterator();
    while (members.hasNext()) {
        Article moved = members.next();
        survivor.add(moved);
        // keep both indices in sync
        hsClusterPerArticle.put(moved, survivor.getID());
        hsArticlesPerCluster.put(survivor.getID(), survivor);
    }
    // Drop the now obsolete topic
    absorbed.clear();
    hsArticlesPerCluster.remove(absorbed.getID());
    return true;
}
/**
 * Decides whether two articles talk about the same subject. They do when
 * either their titles look alike, or their graph similarity passes both
 * thresholds: the normalized value similarity (value similarity divided
 * by size similarity, 0 when the size similarity is 0) must be at least
 * the NVS threshold and the size similarity must exceed the SS threshold.
 * @param aA The First Article
 * @param aB The Second Article
 * @return true if two articles talk about the same subject,
 * false otherwise.
 */
public boolean getMatch(Article aA, Article aB) {
    GraphSimilarity similarity = compareArticles(aA, aB);
    // normalized value similarity, guarding against division by zero
    double normalized;
    if (similarity.SizeSimilarity == 0.0) {
        normalized = 0.0;
    } else {
        normalized = similarity.ValueSimilarity / similarity.SizeSimilarity;
    }
    boolean graphsMatch = (normalized >= NVSThreshold)
            && (similarity.SizeSimilarity > SSThreshold);
    // check titles for word similarity
    boolean titlesMatch = isPossiblySameSentence(aA.getTitle(), aB.getTitle());
    if (titlesMatch) {
        return true;
    }
    return graphsMatch;
}
/**
 * Checks whether two sentences (titles) are possibly the same, by
 * comparing their words. The sentences are split on spaces and on the
 * characters <code>: - ; ! ?</code>, words shorter than four characters
 * are ignored, and the similarity is computed as
 * 2 * (words of s1 with a similar word in s2) / (words of s1 + words of s2).
 * @param s1 the first sentence, may be null
 * @param s2 the second sentence, may be null
 * @return true if the similarity is above 0.50; false otherwise, also
 * when a sentence is null or no word is long enough to compare
 */
private boolean isPossiblySameSentence(String s1, String s2) {
    if (s1 == null || s2 == null) {
        return false;
    }
    // The hyphen has to be escaped: an unescaped ":-;" inside a character
    // class is the range ':'..';' and leaves '-' out of the delimiters.
    final String sDelimiters = "[ :\\-;!?]+";
    // keep the words with at least 4 letters
    ArrayList<String> ls1 = new ArrayList<String>();
    for (String a : s1.split(sDelimiters)) {
        if (a.length() > 3) {
            ls1.add(a);
        }
    }
    ArrayList<String> ls2 = new ArrayList<String>();
    for (String b : s2.split(sDelimiters)) {
        if (b.length() > 3) {
            ls2.add(b);
        }
    }
    int iTotal = ls1.size() + ls2.size();
    if (iTotal == 0) {
        // nothing to compare
        return false;
    }
    int iEqual = 0;
    // count the words of the first sentence that have a similar word in the second
    for (String aWord : ls1) {
        for (String bWord : ls2) {
            if (isPossiblyEqualWord(aWord, bWord)) {
                iEqual++;
                break; // continue from another base word
            }
        }
    }
    float fSim = 2f * iEqual / iTotal;
    return fSim > 0.50;
}
/**
 * Tells whether both words are Greek, as decided by
 * {@link Utilities#isGreekWord(String)} for each one of them.
 * @param aWord the first word
 * @param bWord the second word
 * @return true when both words are Greek, false otherwise
 */
private boolean isBothGreekLocale(String aWord, String bWord) {
    if (!Utilities.isGreekWord(aWord)) {
        return false;
    }
    return Utilities.isGreekWord(bWord);
}
/**
 * Checks whether two words are possibly the same word. After trimming,
 * the words are considered similar when they are equal ignoring case, or
 * when their common prefix (compared character by character with a
 * PRIMARY strength collator, Greek or English) covers the whole shorter
 * word or at least 70% of the longer word.
 * @param aWord The first word
 * @param bWord The second word
 * @return True when the two words are possibly similar,
 * by counting letter equality
 */
private boolean isPossiblyEqualWord(String aWord, String bWord) {
    // trim words
    aWord = aWord.trim();
    bWord = bWord.trim();
    // if words equal return
    if (aWord.equalsIgnoreCase(bWord)) {
        return true;
    }
    int iMax = Math.max(aWord.length(), bWord.length());
    int iMin = Math.min(aWord.length(), bWord.length());
    if (iMin == 0) {
        // an empty word would otherwise count as a prefix of every word
        return false;
    }
    // set collator locale and strength
    Collator col;
    if (isBothGreekLocale(aWord, bWord)) {
        col = Collator.getInstance(new Locale("el", "gr"));
    } else {
        col = Collator.getInstance(Locale.ENGLISH);
    }
    col.setStrength(Collator.PRIMARY);
    // length of the common prefix; the match must be continuous
    int iSame = 0;
    while (iSame < iMin
            && col.compare(aWord.substring(iSame, iSame + 1),
                    bWord.substring(iSame, iSame + 1)) == 0) {
        iSame++;
    }
    // Integer arithmetic for the 70% rule: a float ratio of exactly 0.7
    // compares as smaller than the double constant 0.70.
    return (iSame == iMin) || (iSame * 10 >= iMax * 7);
}
/**
 * Creates the Article pairs that have to be examined. Two articles form
 * a pair when they belong to the same category but come from different
 * sources. Every unordered pair is produced once, with the article that
 * comes first in the list as the first member.
 * <p>The pairs are created sequentially: this only costs a cheap
 * comparison per pair, while the previous multi threaded version shared a
 * single list iterator between the worker threads and never terminated
 * when there were fewer articles than processors.</p>
 * <p>The result replaces the contents of the internal pair list, so
 * repeated calls do not accumulate pairs.</p>
 * @param lsArticleList the List of Articles to combine
 * @return A list of article Pairs (the internal, synchronized pair list)
 */
private List<Pair> getPairs(final List<Article> lsArticleList) {
    LOGGER.log(Level.INFO, "Creating Pairs...");
    // snapshot for cheap indexed access
    Article[] aArticles = lsArticleList.toArray(new Article[lsArticleList.size()]);
    List<Pair> lsNewPairs = new ArrayList<Pair>();
    for (int i = 0; i < aArticles.length; i++) {
        Article aFirst = aArticles[i];
        // only look forward, so that no pair is created twice
        for (int j = i + 1; j < aArticles.length; j++) {
            Article aSecond = aArticles[j];
            // compare category and source
            if (aFirst.getCategory().equals(aSecond.getCategory())
                    && !aFirst.getSource().equals(aSecond.getSource())) {
                lsNewPairs.add(new Pair(aFirst, aSecond));
            }
        }
    }
    synchronized (lsArticlePairs) {
        lsArticlePairs.clear();
        lsArticlePairs.addAll(lsNewPairs);
    }
    LOGGER.log(Level.INFO, "Created {0} Article Pairs", lsArticlePairs.size());
    return lsArticlePairs;
}
/**
 * Returns the clusters. The in-memory map is preferred; when it is
 * missing or empty, the map is read from the storage module instead.
 * @return A map containing a Unique identifier for
 * each Cluster and the Topic (article list) that the cluster is about,
 * or null if the map could not be read from storage
 */
public HashMap<String, Topic> getArticlesPerCluster() {
    boolean bInMemory = (this.hsArticlesPerCluster != null)
            && !this.hsArticlesPerCluster.isEmpty();
    if (bInMemory) {
        return this.hsArticlesPerCluster;
    }
    // fall back to what the storage module has
    try {
        return this.ids.readClusteredTopics();
    } catch (Exception ex) {
        LOGGER.log(Level.SEVERE, ex.getMessage());
    }
    return null;
}
/**
 * Gives access to the article to cluster lookup.
 * @return The map that holds, for every clustered Article, the Unique
 * Identifier of the cluster it belongs to (the internal map, not a copy)
 */
public HashMap<Article, String> getClusterPerArticle() {
    return hsClusterPerArticle;
}
/**
 * Initializes a new {@link org.scify.NewSumServer.Server.Searching.Searcher} object and
 * searches the Index with the specified query. The score of every hit is
 * added to the total score of the cluster its article file belongs to.
 * Hits whose article file does not provide the required information
 * (feed or cluster ID) are skipped.
 * @param ind The Indexer to be used
 * @param sKeyword The Search Query
 * @param sUserSources The separator-delimited URL sources accepted by user;
 * null or "All" accepts every source
 * @param iMaxHits The max number of hits to accept
 * @param loc The locale of the text to process
 * @return A list of Topic IDs that contain articles related to the
 * search query, in the order given by
 * {@link Utilities#entriesSortedByValues} (documented by the original
 * author as descending), or null when nothing was found
 * @throws FileNotFoundException if an article file cannot be opened
 * @throws IOException if an article file cannot be read
 */
public ArrayList<String> getTopicIDsByKeyword(Indexer ind, String sKeyword,
        String sUserSources, int iMaxHits, Locale loc)
        throws FileNotFoundException, IOException {
    LOGGER.log(Level.INFO, "Searching for {0}...", sKeyword);
    // Initialise a new Searcher and get the ScoreDocs found for the query
    Searcher se = new Searcher();
    // Greek keywords are lower cased with the given locale
    String sQuery = Utilities.isGreekWord(sKeyword)
            ? sKeyword.toLowerCase(loc)
            : sKeyword.toLowerCase();
    List<ScoreDoc> lsResults = se.searchIndex(ind.getIndexDirectory(),
            loc, sQuery, iMaxHits);
    if (lsResults == null || lsResults.isEmpty()) {
        return null;
    }
    // get the <docId, filename> mappings
    HashMap<Integer, String> docFiles = se.getDocFiles();
    // Initialize the <ClusterID, List<filename>> mapping
    // NOTE(review): this map is filled but not read afterwards in this method
    HashMap<String, List<String>> docClusters = new HashMap<String, List<String>>();
    // Create the <UUID, TotalScore> Distribution and update it according to the data
    Distribution<String> d = new Distribution<String>();
    boolean bAcceptAll = (sUserSources == null) || "All".equals(sUserSources);
    for (ScoreDoc sd : lsResults) {
        String sFile = docFiles.get(sd.doc);
        if (!bAcceptAll) {
            // only if feed is accepted by user; a missing feed can not be accepted
            String ArticleFeed = getInfofromFile(sFile, "Feed");
            if (ArticleFeed == null || !sUserSources.contains(ArticleFeed)) {
                continue;
            }
        }
        String ClusterID = getInfofromFile(sFile, "ClusterID");
        if (ClusterID == null) {
            LOGGER.log(Level.WARNING, "Skipping search hit without ClusterID: {0}", sFile);
            continue;
        }
        d.increaseValue(ClusterID, sd.score);
        updateDocClusters(docClusters, ClusterID, docFiles, sd);
    }
    SortedSet<Map.Entry> sorted_d = (SortedSet) Utilities.entriesSortedByValues(d.asTreeMap());
    ArrayList<String> TopicIDsHits = new ArrayList<String>();
    for (Map.Entry each : sorted_d) {
        TopicIDsHits.add((String) each.getKey());
    }
    if (TopicIDsHits.isEmpty()) {
        LOGGER.log(Level.INFO, " No Topics Found");
        return null;
    }
    return TopicIDsHits;
}
/**
 * Records the file of a search hit under the cluster it belongs to,
 * creating the file list of the cluster on first use.
 * @param docClusters the cluster ID to file names map to update
 * @param ClusterID the ID of the cluster of the hit
 * @param docFiles the document id to file name map
 * @param sd the search hit
 */
private void updateDocClusters(HashMap<String, List<String>> docClusters,
        String ClusterID, HashMap<Integer, String> docFiles,
        ScoreDoc sd) {
    // make sure the cluster has a list
    if (!docClusters.containsKey(ClusterID)) {
        docClusters.put(ClusterID, new ArrayList<String>());
    }
    List<String> clusterFiles = docClusters.get(ClusterID);
    clusterFiles.add(docFiles.get(sd.doc));
}
/**
 * Used by the getTopicIDsByKeyword method
 * to retrieve info about the Cluster ID, title, etc.
 * Looks for the first line of the file that starts with the given key and
 * contains the separator, and returns what follows the separator.
 * The file is read as UTF-8, the charset the article files are written in,
 * and is always closed before returning.
 * @param sFileName The filename to read, relative to the Article path
 * @param Info The information (header key) we want to retrieve from the file
 * @return The trimmed value that the file holds for the key (possibly
 * empty), or null if the file cannot be read or has no such line
 * @throws FileNotFoundException if the file cannot be opened
 * @throws IOException if reading the file fails
 */
private String getInfofromFile(String sFileName, String Info)
        throws FileNotFoundException, IOException {
    File fFile = new File(this.ArticlePath + sFileName);
    if (!fFile.canRead()) {
        LOGGER.log(Level.SEVERE, "Error: Cannot read from file: {0}", fFile.toString());
        return null;
    }
    BufferedReader br = new BufferedReader(new InputStreamReader(
            new FileInputStream(fFile), Charset.forName("UTF-8")));
    try {
        String sLine;
        while ((sLine = br.readLine()) != null) {
            if (sLine.startsWith(Info)) {
                // limit 2 keeps an empty value as "" instead of dropping it
                String[] asParts = sLine.split(sSeparator, 2);
                if (asParts.length == 2) {
                    return asParts[1].trim();
                }
            }
        }
        return null;
    } finally {
        // close on every path, including the early return above
        br.close();
    }
}
/**
 * <p>Saves all Clustered Articles to file, one Article per file.</p>
 * <p>- Stores data about the ClusterID, the feed, and the Category
 * in the beginning of the file</p>
 * <p>- Before saving the Clustered Topics Map, it calls
 * {@link #compareTopics(java.util.HashMap, java.util.HashMap)} first,
 * provided that a map from a previous run could be loaded</p>
 * <p>- Also stores the {@link #hsArticlesPerCluster} map to file,
 * using the {@link #ids} module</p>
 * <p>- The existing .txt files of the Article directory are deleted
 * before the new ones are written</p>
 * @throws IOException
 */
private void saveAllClusteredArticles() throws IOException {
    // Save the Map that contains the list of articles per cluster
    try {
        // load the old topics map in memory before deleting
        this.PreviousClusteredTopics = (HashMap<String, Topic>) this.ids.readClusteredTopics();
        if (this.PreviousClusteredTopics == null) {
            // e.g. first run: there is nothing to compare against
            LOGGER.log(Level.INFO, "No Clustered Topics from a previous run were found");
        } else if (compareTopics(this.PreviousClusteredTopics, this.hsArticlesPerCluster)) {
            // same topics were found and got the Topic IDs of the previous run
            LOGGER.log(Level.INFO, "Found {0} Identical Topics and switched to old IDs", String.valueOf(tChanged));
        }
    } catch (Exception ex) {
        // log the exception itself, its message alone may be null
        LOGGER.log(Level.SEVERE, "Could Not Load Clustered Topics from Previous Run", ex);
    } finally {
        // delete the old map
        this.ids.deleteObject("ClusteredTopics", this.ids.getGeneric());
        // Save the final Map, either updated with the comparison results or not
        this.ids.writeClusteredTopics(this.hsArticlesPerCluster);
    }
    // delete all text files in Article Directory, in order to write the new
    // ones afterwards
    File f = new File(this.ArticlePath);
    if (f.isDirectory()) {
        f.setWritable(true);
        File[] aOldFiles = f.listFiles(new FileFilter() {
            @Override
            public boolean accept(File pathname) {
                return pathname.getPath().endsWith(".txt");
            }
        });
        if (aOldFiles == null) {
            // listFiles returns null when the directory cannot be listed
            LOGGER.log(Level.WARNING, "Could not list the files of {0}", f.getPath());
        } else {
            for (File each : aOldFiles) {
                if (!each.delete()) {
                    LOGGER.log(Level.WARNING, "File {0} could not be deleted", each.getName());
                }
            }
        }
    }
    // Save Each Article to a single Text file, so that it is used by the indexer later
    // Each Text File has the ClusterID information in it
    int counter = 1; // used for distinction between articles in the same topic
    for (Map.Entry<Article, String> pair : this.hsClusterPerArticle.entrySet()) {
        writeArticleToFile(pair.getKey(), this.ArticlePath, pair.getValue(), counter);
        counter++;
    }
}
/**
 * Saves an Article to a simple UTF-8 text file named
 * <code>&lt;cluster ID&gt;-&lt;counter&gt;.txt</code>. The file starts with
 * one header line per piece of information (ClusterID first, then
 * category, feed, source and date), each written as key, separator and
 * value, followed by the title and the text of the article.
 * Failures are logged and not propagated.
 * @param aArt The Article to store
 * @param sPathToFile The path where the file is saved; the file name is
 * appended to it directly
 * @param sCluster The Cluster ID of the Article
 * @param counter a number that keeps the file names distinct
 * @throws IOException declared for compatibility; I/O failures are
 * caught and logged inside the method
 */
private void writeArticleToFile(Article aArt, String sPathToFile,
        String sCluster, int counter) throws IOException {
    try {
        String sFullFileName = sPathToFile + sCluster +
                "-" + String.valueOf(counter) + ".txt";
        File fFile = new File(sFullFileName);
        fFile.createNewFile();
        BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(fFile), Charset.forName("UTF-8")));
        try {
            bw.write("ClusterID" + sSeparator + sCluster);
            bw.newLine();
            bw.write("Κατηγορία" + sSeparator + aArt.getCategory());
            bw.newLine();
            bw.write("Feed" + sSeparator + aArt.getFeed());
            bw.newLine();
            bw.write("Πηγή" + sSeparator + aArt.getSource());
            bw.newLine();
            bw.write("Date" + sSeparator + aArt.getDatetoString());
            bw.newLine();
            bw.write(aArt.getTitle());
            bw.newLine();
            bw.write(aArt.getText());
        } finally {
            // release the file even when a write fails
            bw.close();
        }
    } catch (IOException ex) {
        // hand the exception to the logger, so that the cause is reported
        LOGGER.log(Level.SEVERE, "Could Not Write Article " + aArt.getTitle() + " to File",
                ex);
    }
}
/**
 * Compares two Topic maps and searches for equal topics, using the
 * {@link #isTopicEqual(org.scify.NewSumServer.Server.Structures.Topic, org.scify.NewSumServer.Server.Structures.Topic) }
 * method. If an equality is found, the {@link Topic} on the newer map is assigned the
 * ID of the Topic from the older map, and the {@link #hsArticlesPerCluster}
 * and {@link #hsClusterPerArticle} maps are updated accordingly.
 * Every ID of the older map is handed out at most once.
 * @param Prev The map containing the topics from the previous run, may be null
 * @param Current The map containing the topics from the current run, may be null
 * @return true if at least one pair of equal topics was found,
 * false otherwise (also when one of the maps is null)
 */
private boolean compareTopics(HashMap<String, Topic> Prev, HashMap<String, Topic> Current) {
    if (Prev == null || Current == null) {
        return false; // nothing to compare against
    }
    // The hsRes map will map the current ID values to the
    // old ID values, if the topics are the same
    HashMap<String, String> hsRes = new HashMap<String, String>();
    // the hsTops map will keep the (previousID, currentTopic) mapping
    // in order to update the global maps afterwards.
    HashMap<String, Topic> hsTops = new HashMap<String, Topic>();
    // Iterate over the Current Map
    for (Map.Entry<String, Topic> cEntry : Current.entrySet()) {
        String cID = cEntry.getKey();
        Topic cTopic = cEntry.getValue();
        // Iterate over the Previous Map (The map from the previous run)
        for (Map.Entry<String, Topic> pEntry : Prev.entrySet()) {
            String pID = pEntry.getKey();
            if (hsTops.containsKey(pID)) {
                // this old ID is already taken by another current topic;
                // reusing it would overwrite that topic in the maps
                continue;
            }
            if (isTopicEqual(cTopic, pEntry.getValue())) {
                hsRes.put(cID, pID); // keep the ID pairs
                hsTops.put(pID, cTopic); // Keep the articles for this ID
                break; // proceed to the next current topic
            }
        }
    }
    if (hsRes.isEmpty()) {
        return false; // no same topics, nothing to change
    }
    // iterate over the (currentID, previousID) mapping and make the required changes
    for (Map.Entry<String, String> nPair : hsRes.entrySet()) {
        String cID = nPair.getKey(); // the current ID
        String pID = nPair.getValue(); // the ID from the old map, to restore
        if (!this.hsArticlesPerCluster.containsKey(cID)) { // should always contain that key
            continue;
        }
        Topic tmpTopic = this.hsArticlesPerCluster.get(cID);
        tmpTopic.setID(pID); // Assign the old ID to this Topic
        // remove the entry from the map and add the new one
        this.hsArticlesPerCluster.remove(cID);
        this.hsArticlesPerCluster.put(tmpTopic.getID(), hsTops.get(pID));
        tChanged++; // Counter of operations done
        // update the reverse map for this topic
        for (Article each : this.hsArticlesPerCluster.get(pID)) {
            if (this.hsClusterPerArticle.containsKey(each)) { // should always be true
                // update mappings with new ID
                this.hsClusterPerArticle.put(each, pID);
            } else {
                LOGGER.log(Level.WARNING, "Unexpected behaviour: {0} -- {1}", new Object[] {each, pID});
            }
        }
    }
    return true; // changed IDs for same topics
}
/**
 * Compares two given topics, using Ordered Text Concatenation and Topic Date.
 * Topics are equal only when their first articles are in the same category
 * and they hold the same number of articles. Single article topics are
 * then compared by title and date; larger topics by the concatenation of
 * their sorted texts followed by the topic date string.
 * @param tA The first {@link Topic}
 * @param tB The Second {@link Topic}
 * @return true if the two topics are the same, false otherwise
 * (also when one of the topics has no articles)
 */
private boolean isTopicEqual(Topic tA, Topic tB) {
    // an empty topic has no article to read the category from
    if (tA.size() == 0 || tB.size() == 0) {
        return false;
    }
    // they have to be in the same category to compare
    if (!tA.get(0).getCategory().equals(tB.get(0).getCategory())) {
        return false;
    }
    if (tA.size() != tB.size()) {
        return false;
    }
    // if the topics have only one article, check title and date.
    // Compared with equals: equal hash codes do not imply equal values.
    if (tA.size() == 1) {
        return tA.getTitle().equals(tB.getTitle())
                && tA.getDate().equals(tB.getDate());
    }
    // Otherwise get all text from the topic articles, and sort (simple unicode sorting)
    ArrayList<String> lsA = (ArrayList<String>) Utilities.getListOfStrings(tA);
    Collections.sort(lsA, String.CASE_INSENSITIVE_ORDER);
    ArrayList<String> lsB = (ArrayList<String>) Utilities.getListOfStrings(tB);
    Collections.sort(lsB, String.CASE_INSENSITIVE_ORDER);
    // for every text, construct a single string and append the date at the end
    StringBuilder sbA = new StringBuilder();
    for (String each : lsA) {
        sbA.append(each);
    }
    sbA.append(tA.getDateToString());
    // same for Topic B
    StringBuilder sbB = new StringBuilder();
    for (String each : lsB) {
        sbB.append(each);
    }
    sbB.append(tB.getDateToString());
    // Compare the contents: StringBuilder does not override hashCode, so
    // comparing the builders' hash codes would compare object identities.
    return sbA.toString().equals(sbB.toString());
}
/**
 * Debugging aid: checks that {@link #hsClusterPerArticle} and
 * {@link #hsArticlesPerCluster} are indeed the reverse of each other and
 * logs every mismatch that is found. A missing topic or a missing article
 * mapping is reported as a mismatch as well, instead of failing the check.
 */
private void checkForInconsistencies() {
    // article -> cluster: the topic of the cluster must contain the article
    int iCnt = 0;
    for (Map.Entry<Article, String> eArticle : hsClusterPerArticle.entrySet()) {
        Topic tMapped = hsArticlesPerCluster.get(eArticle.getValue());
        if (tMapped == null || !tMapped.contains(eArticle.getKey())) {
            LOGGER.log(Level.SEVERE, "Mismatch found!");
        }
        iCnt++;
    }
    LOGGER.log(Level.INFO, "Checked {0} items.", iCnt);
    // cluster -> article: every article of a topic must map back to its cluster
    for (Map.Entry<String, Topic> eCluster : hsArticlesPerCluster.entrySet()) {
        String sCurCluster = eCluster.getKey();
        for (Article aCurArticle : eCluster.getValue()) {
            String sMapped = hsClusterPerArticle.get(aCurArticle);
            if (sMapped == null
                    || sMapped.trim().compareTo(sCurCluster.trim()) != 0) {
                LOGGER.log(Level.SEVERE, "Mismatch found (reverse)!\n{0} != \n{1}\n", new
                        Object[] {sMapped, sCurCluster});
            }
        }
    }
    LOGGER.log(Level.INFO, "Reversed Checked Mappings Done");
}
/**
 * Parses the {@link #hsArticlesPerCluster} map and removes some single
 * (one-article) topics. A single topic is removed when it is older than
 * iDays from the current date and its category still holds more than
 * iMinSingleTopics single topics. The reverse map
 * {@link #hsClusterPerArticle} is updated for every removed topic.
 *
 * @param iMinSingleTopics the minimum number of single topics to keep
 * per category
 * @param iDays the distance in days from the current date
 */
private void removeSingleTopics(int iMinSingleTopics, int iDays) {
    int initial = this.hsArticlesPerCluster.size();
    Collection<String> sCategs = this.ids.readGenericCategories();
    Distribution<String> Count = new Distribution<String>();
    // get Single Topics Count per Category
    for (String sCurCateg : sCategs) {
        for (Object oTopic : this.hsArticlesPerCluster.values()) {
            Topic tmpTopic = (Topic) oTopic;
            if (tmpTopic.getCategory().equals(sCurCateg) && tmpTopic.size() == 1) {
                Count.increaseValue(sCurCateg, 1);
            }
        }
    }
    long lNowMillis = Calendar.getInstance().getTimeInMillis();
    // Computed as a long: the int product iDays*1000*60*60*24 overflows
    // for iDays >= 25, which made the age test meaningless.
    long lMaxAgeMillis = TimeUnit.DAYS.toMillis(iDays);
    for (String sCurCateg : sCategs) {
        // removing through the values() iterator also removes the map entry
        Iterator<?> it = this.hsArticlesPerCluster.values().iterator();
        while (it.hasNext()) {
            Topic tmpTopic = (Topic) it.next();
            if (!tmpTopic.getCategory().equals(sCurCateg) || tmpTopic.size() != 1) {
                continue;
            }
            // keep at least iMinSingleTopics single topics in this category
            if (!(Count.getValue(sCurCateg) > iMinSingleTopics)) {
                continue;
            }
            if (lNowMillis - tmpTopic.getDate().getTimeInMillis() > lMaxAgeMillis) {
                Count.setValue(sCurCateg, Count.getValue(sCurCateg) - 1); // decrease count by one
                it.remove();
                // update the reverse map
                this.hsClusterPerArticle.remove(tmpTopic.get(0));
            }
        }
    }
    int iFinal = this.hsArticlesPerCluster.size();
    LOGGER.log(Level.INFO, "Removed {0} single Topics", initial - iFinal);
}
// DEBUG LINES
// public static void main(String[] args) {
// String[] saWords = {"testing", "USA", "Γιώργος", " ΜΑΡΙΝΑ ΦΛΟΙΣΒΟΥ", "Έξυπνος"};
// DocumentWordGraph dgFirstGraph =
// new DocumentWordGraph();
// dgFirstGraph.WordEvaluator = new WordEvaluatorListener() {
//
// @Override
// public boolean evaluateWord(String string) {
// // Keep only capitalized words!
// // TODO: IMPROVE!!!
// boolean bPass = (string.matches("\\p{javaUpperCase}+.*"));
// // DEBUG LINES
// if (bPass)
// System.out.println(string);
// //////////////
// return bPass;
// }
// };
// dgFirstGraph.setDataString("Αυτή είναι μία χαρακτηριστική δοκιμή. Νομίζω. Γιώργος Γ.");
//
// for (String sWord : saWords)
// System.out.println(sWord + ":" +
// String.valueOf(sWord.matches("\\p{javaUpperCase}+.*")));
// }
/**
 * Returns a copy of the given words without the short ones, i.e. without
 * every word reported by {@link #findSmallWords(List, int)}.
 *
 * @deprecated marked deprecated by the original authors; no replacement
 * is indicated here
 * @param sSent the words of a sentence
 * @param iCharCount the maximum length of a word that is dropped
 * @return the remaining words, in their original order
 */
private String[] ommitSmallWords(String[] sSent, int iCharCount) {
    List<String> words = new ArrayList<String>(sSent.length);
    Collections.addAll(words, sSent);
    Collection<String> tooShort = findSmallWords(words, iCharCount);
    words.removeAll(tooShort);
    return words.toArray(new String[words.size()]);
}
/**
 * Collects the words whose length does not exceed the given limit.
 *
 * @deprecated marked deprecated by the original authors; no replacement
 * is indicated here
 * @param lsSen the words to inspect
 * @param iCount the maximum length (inclusive) of a word that is collected
 * @return the short words, in encounter order (duplicates included)
 */
private Collection<String> findSmallWords(List<String> lsSen, int iCount) {
    Collection<String> shortWords = new ArrayList<String>();
    Iterator<String> words = lsSen.iterator();
    while (words.hasNext()) {
        String word = words.next();
        if (word.length() > iCount) {
            continue; // long enough, not collected
        }
        shortWords.add(word);
    }
    return shortWords;
}
/**
 * Returns the smallest of three integers.
 *
 * @deprecated marked deprecated by the original authors; no replacement
 * is indicated here
 * @param a the first value
 * @param b the second value
 * @param c the third value
 * @return the minimum of a, b and c
 */
private int minimum(int a, int b, int c) {
    int smallest = a;
    if (b < smallest) {
        smallest = b;
    }
    if (c < smallest) {
        smallest = c;
    }
    return smallest;
}
/**
 * Computes the Levenshtein (edit) distance between two character
 * sequences with the full dynamic-programming table: insertions,
 * deletions and substitutions each cost 1.
 *
 * @deprecated marked deprecated by the original authors; no replacement
 * is indicated here
 * @param str1 the first sequence
 * @param str2 the second sequence
 * @return the number of single-character edits turning str1 into str2
 */
private int computeLevenshteinDistance(CharSequence str1,
        CharSequence str2) {
    final int rows = str1.length();
    final int cols = str2.length();
    int[][] table = new int[rows + 1][cols + 1];
    // first column: deleting the first r characters of str1
    for (int r = 0; r <= rows; r++) {
        table[r][0] = r;
    }
    // first row: inserting the first c characters of str2
    for (int c = 1; c <= cols; c++) {
        table[0][c] = c;
    }
    for (int r = 1; r <= rows; r++) {
        for (int c = 1; c <= cols; c++) {
            int substitutionCost = 1;
            if (str1.charAt(r - 1) == str2.charAt(c - 1)) {
                substitutionCost = 0;
            }
            int deletion = table[r - 1][c] + 1;
            int insertion = table[r][c - 1] + 1;
            int substitution = table[r - 1][c - 1] + substitutionCost;
            table[r][c] = Math.min(Math.min(deletion, insertion), substitution);
        }
    }
    return table[rows][cols];
}
/**
 * Computes a normalised edit distance between two sentences. Each sentence
 * is split on single spaces, words of at most 2 characters are dropped,
 * and the remaining words are concatenated without separators. The result
 * is the Levenshtein distance of the two concatenations divided by the
 * length of the longer one.
 *
 * @deprecated marked deprecated by the original authors; no replacement
 * is indicated here
 * @param sSenA the first sentence
 * @param sSenb the second sentence
 * @return the normalised distance; 0 when both sentences are empty after
 * filtering
 */
private float compareSentences(String sSenA, String sSenb) {
    String aa = getStringFromArray(ommitSmallWords(sSenA.split(" "), 2));
    String bb = getStringFromArray(ommitSmallWords(sSenb.split(" "), 2));
    int maxLen = Math.max(aa.length(), bb.length());
    if (maxLen == 0) {
        // both filtered sentences are empty: they are identical, and
        // dividing 0 by 0 would yield NaN
        return 0f;
    }
    return (float) computeLevenshteinDistance(aa, bb) / maxLen;
}
/**
 * Compares two sentences through their n-gram graphs and returns the
 * value similarity reported by {@link NGramCachedGraphComparator}.
 *
 * @deprecated marked deprecated by the original authors; no replacement
 * is indicated here
 * @param sSenA the first sentence
 * @param sSenB the second sentence
 * @return the {@code ValueSimilarity} of the two sentence graphs
 */
private double nggCompare(String sSenA, String sSenB) {
    DocumentNGramSymWinGraph dgA = new DocumentNGramSymWinGraph();
    dgA.setDataString(sSenA);
    DocumentNGramSymWinGraph dgB = new DocumentNGramSymWinGraph();
    // the second sentence belongs to the second graph; it used to be
    // loaded into dgA, leaving dgB without any data
    dgB.setDataString(sSenB);
    NGramCachedGraphComparator ngc = new NGramCachedGraphComparator();
    return ngc.getSimilarityBetween(dgA, dgB).ValueSimilarity;
}
/**
 * Runs {@link #compareSentences(String, String)} on every title pair
 * returned by {@link #getAllTitlePairs()}. The computed distances are
 * discarded.
 *
 * @deprecated marked deprecated by the original authors; no replacement
 * is indicated here
 */
private void compareAllSentences() {
    Iterator<Pair> pairs = getAllTitlePairs().iterator();
    while (pairs.hasNext()) {
        Pair titles = pairs.next();
        String firstTitle = (String) titles.getFirst();
        String secondTitle = (String) titles.getSecond();
        compareSentences(firstTitle, secondTitle);
    }
}
/**
 * Concatenates all elements of the array, in order, without any
 * separator.
 *
 * @deprecated marked deprecated by the original authors; no replacement
 * is indicated here
 * @param sStr the strings to join
 * @return the concatenation of all elements
 */
private String getStringFromArray(String[] sStr) {
    StringBuilder joined = new StringBuilder();
    for (String part : sStr) {
        joined.append(part);
    }
    return joined.toString();
}
/**
 * Builds the title pairs of all article combinations in
 * {@code origArticles} that share a category but come from different
 * feeds. Each unordered combination is visited once, the earlier article
 * providing the first title of the pair.
 *
 * @deprecated marked deprecated by the original authors; no replacement
 * is indicated here
 * @return the list of title pairs
 */
private List<Pair> getAllTitlePairs() {
    List<Article> articles = this.origArticles;
    int total = articles.size();
    List<Pair> titlePairs = new ArrayList<Pair>();
    for (int i = 0; i < total - 1; i++) {
        Article first = articles.get(i);
        for (int j = i + 1; j < total; j++) {
            Article second = articles.get(j);
            // only articles of the same category are paired
            if (!first.getCategory().equals(second.getCategory())) {
                continue;
            }
            // articles of the same feed are not compared with each other
            if (first.getFeed().equals(second.getFeed())) {
                continue;
            }
            titlePairs.add(new Pair(first.getTitle(), second.getTitle()));
        }
    }
    return titlePairs;
}
}