/*
* Copyright 2013 SciFY NPO <info@scify.org>.
*
* This product is part of the NewSum Free Software.
* For more information about NewSum visit
*
* http://www.scify.gr/site/en/our-projects/completed-projects/newsum-menu-en
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* If this code or its output is used, extended, re-engineered, integrated,
* or embedded to any extent in another software or hardware, there MUST be
* an explicit attribution to this work in the resulting source code,
* the packaging (where such packaging exists), or user interface
* (where such an interface exists).
* The attribution must be of the form "Powered by NewSum, SciFY"
*/
package org.scify.NewSumServer.Server.Summarisation;
import gr.demokritos.iit.jinsect.documentModel.comparators.NGramCachedGraphComparator;
import gr.demokritos.iit.jinsect.documentModel.representations.DocumentNGramSymWinGraph;
import gr.demokritos.iit.jinsect.structs.GraphSimilarity;
import gr.demokritos.iit.jinsect.structs.Pair;
import java.awt.Dimension;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import javax.swing.JOptionPane;
import javax.swing.JScrollPane;
import javax.swing.JTextArea;
import org.scify.NewSumServer.Server.Storage.IDataStorage;
import org.scify.NewSumServer.Server.Structures.Article;
import org.scify.NewSumServer.Server.Structures.Topic;
import org.scify.NewSumServer.Server.Utils.Utilities;
/**
 * Human comparison of articles for data acquisition.
 * <p>
 * On construction, candidate article pairs are built from the supplied
 * articles. {@link #ClusterFeeds(double)} then compares each pair, shows the
 * sufficiently similar ones to a human in a dialog, and writes the similarity
 * figures together with the human verdict through
 * {@code Utilities.writeClusterCheckFile}.
 *
 * @author George K. <gkiom@scify.org>
 */
public class dumpClusterer {

    protected final static String sSeparator = " === ";

    /**
     * Topics keyed by a String.
     * NOTE(review): presumably the key is a cluster UUID; this class only
     * creates the map and never fills it - confirm against subclasses.
     */
    protected HashMap<String, Topic> hsArticlesPerCluster;

    /**
     * An Article,UUID map. Created empty; not filled by this class.
     */
    protected HashMap<Article, String> hsClusterPerArticle;

    /** Private copy of the articles handed to the constructor. */
    protected List<Article> origArticles;

    /** The folder where the Articles will be saved (stored, not read in this class). */
    protected String ArticlePath;

    /** Storage handle supplied by the caller (stored, not read in this class). */
    protected IDataStorage ids;

    /** Shuffled candidate article pairs produced at construction time. */
    protected List<Pair> lsfeeds;

    /**
     * Copies the article list, initialises the (empty) cluster maps and
     * builds the shuffled list of candidate pairs.
     *
     * @param lsArticles  the articles to pair up
     * @param ids         the data storage to keep a reference to
     * @param ArticlePath the folder where the Articles will be saved
     */
    public dumpClusterer(List<Article> lsArticles,
                         IDataStorage ids,
                         String ArticlePath) {
        origArticles = new ArrayList<Article>(lsArticles);
        hsArticlesPerCluster = new HashMap<String, Topic>();
        hsClusterPerArticle = new HashMap<Article, String>();
        this.ids = ids;
        this.ArticlePath = ArticlePath;
        // Pair up from the ArrayList copy: same elements in the same order,
        // but indexed access is guaranteed to be constant-time.
        this.lsfeeds = getPairs(origArticles);
    }

    /**
     * Builds an n-gram graph from the text of each article and compares the
     * two graphs.
     *
     * @param aOne the first article
     * @param aTwo the second article
     * @return the similarity reported by the graph comparator
     */
    protected GraphSimilarity compareArticles(Article aOne,
                                              Article aTwo) {
        DocumentNGramSymWinGraph dgFirstGraph = new DocumentNGramSymWinGraph();
        DocumentNGramSymWinGraph dgSecondGraph = new DocumentNGramSymWinGraph();
        dgFirstGraph.setDataString(aOne.getText());
        dgSecondGraph.setDataString(aTwo.getText());
        NGramCachedGraphComparator ngc = new NGramCachedGraphComparator();
        return ngc.getSimilarityBetween(dgFirstGraph, dgSecondGraph);
    }

    /**
     * Creates the candidate pairs: every two articles that share a category
     * but differ in both source and title. The result is shuffled.
     *
     * @param lsArticleList the articles to combine
     * @return the shuffled list of article pairs
     */
    private List<Pair> getPairs(List<Article> lsArticleList) {
        List<Pair> lsArticlePairs = new LinkedList<Pair>();
        System.out.println("Creating Pairs...");
        int iCount = lsArticleList.size();
        for (int i = 0; i < iCount - 1; i++) {
            Article aFirst = lsArticleList.get(i);
            for (int j = i + 1; j < iCount; j++) {
                Article aSecond = lsArticleList.get(j);
                boolean bCandidate =
                        aFirst.getCategory().equals(aSecond.getCategory())
                        && !aFirst.getSource().equals(aSecond.getSource())
                        && !aFirst.getTitle().equals(aSecond.getTitle());
                if (!bCandidate) {
                    continue;
                }
                Pair<Article, Article> tmpPair =
                        new Pair<Article, Article>(aFirst, aSecond);
                Pair<Article, Article> reverse =
                        new Pair<Article, Article>(aSecond, aFirst);
                // Guards against the same pairing being stored twice, in
                // either order. NOTE(review): only effective if Pair
                // defines value-based equals() - confirm.
                if (!lsArticlePairs.contains(tmpPair)
                        && !lsArticlePairs.contains(reverse)) {
                    lsArticlePairs.add(0, tmpPair);
                }
            }
        }
        Collections.shuffle(lsArticlePairs);
        System.out.println("Created " + lsArticlePairs.size() + " Pairs");
        return lsArticlePairs;
    }

    /**
     * Walks over the candidate pairs and asks a human whether the two
     * articles of each sufficiently similar pair match.
     * <p>
     * For every pair the NVS is computed as value similarity divided by size
     * similarity (0 when the size similarity is 0). Pairs whose NVS is below
     * {@code iValue} are skipped silently. For the others a Yes/No/Cancel
     * dialog is shown; on Yes or No a line with the similarity figures and
     * the verdict ("Match" / "NotMatch") is written under the category of the
     * first article. Cancel, or closing the dialog, skips the pair without
     * writing anything, so that a non-answer is never stored as "NotMatch".
     *
     * @param iValue the minimum NVS a pair needs in order to be shown
     */
    public void ClusterFeeds(double iValue) {
        int iVisited = 0;
        for (Pair each : this.lsfeeds) {
            iVisited++;
            Article First = (Article) each.getFirst();
            Article Second = (Article) each.getSecond();
            GraphSimilarity gs = compareArticles(First, Second);
            double NVS = (gs.SizeSimilarity == 0.0)
                    ? 0.0
                    : gs.ValueSimilarity / gs.SizeSimilarity;
            if (NVS < iValue) {
                continue;
            }
            // Build the display text only for pairs that are actually shown.
            String s1 = First.getTitle() + "\n" + First.getText();
            String s2 = Second.getTitle() + "\n" + Second.getText();
            JTextArea text = new JTextArea(
                    s1 + "\n\n" + s2 + "\nNVS: " + Double.toString(NVS));
            text.setLineWrap(true);
            JScrollPane scroll = new JScrollPane(text);
            scroll.setPreferredSize(new Dimension(1000, 500));
            // Counts every pair not yet visited, including those that will
            // later be filtered out by the threshold (an upper bound).
            int iTmpIndex = lsfeeds.size() - iVisited;
            String sTitle = "Only " + Integer.toString(iTmpIndex)
                    + " Comparisons Remaining...";
            int iAnswer = JOptionPane.showConfirmDialog(null, scroll,
                    sTitle, JOptionPane.YES_NO_CANCEL_OPTION);
            if (iAnswer != JOptionPane.YES_OPTION
                    && iAnswer != JOptionPane.NO_OPTION) {
                // Cancel or window closed: no judgement was given.
                continue;
            }
            String sMatches =
                    (iAnswer == JOptionPane.YES_OPTION) ? "Match" : "NotMatch";
            // Add Data to The List
            String sTmpLine = Utilities.MakeTmpHumanLine(",",
                    gs.ValueSimilarity, gs.ContainmentSimilarity,
                    gs.SizeSimilarity, NVS, sMatches);
            Utilities.writeClusterCheckFile(First.getCategory(), sTmpLine);
        }
    }
}