/*
* Copyright 2004-2010 Information & Software Engineering Group (188/1)
* Institute of Software Technology and Interactive Systems
* Vienna University of Technology, Austria
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package at.tuwien.ifs.somtoolbox.summarisation.output;
import java.util.ArrayList;
import cern.colt.list.DoubleArrayList;
import at.tuwien.ifs.somtoolbox.util.ArrayUtils;
/**
 * Selects sentences from several parsed documents, either by score threshold ({@link #createAllResults(double)}) or
 * by cross-document sentence similarity ({@link #find_similarities(double)}), and then filters near-duplicate
 * sentences out of the selection.
 * <p>
 * Each parsed document is a list of strings whose element 0 is treated as the title and skipped; the score at index
 * {@code h} of a document's score list is used for the sentence at index {@code h + 1} of that document. Scores must
 * be supplied through {@link #storeScores(DoubleArrayList[])} before they are read.
 * </p>
 * 
 * @author Julius Penaranda
 * @version $Id: MultiDocumentHandler.java 3589 2010-05-21 10:42:01Z mayer $
 */
public class MultiDocumentHandler {
    /** Characters that separate words inside a sentence. */
    private static final String WORD_DELIMITER = " \n.,!?";

    /**
     * Regular expression matching a run of one or more delimiter characters. {@link String#split(String)} expects a
     * regular expression, so the raw delimiter list must be wrapped in a character class to be usable there.
     */
    private static final String WORD_DELIMITER_REGEX = "[" + WORD_DELIMITER + "]+";

    /** Minimum similarity at which two selected sentences are considered redundant. */
    private static final double REDUNDANCY_SIMILARITY = 0.5;

    private ArrayList<String>[] parsedDocuments = null;

    private DoubleArrayList[] allscores = null;

    private ArrayList<String> resultdoc = null;

    private ArrayList<String> resultfilenames = null;

    private DoubleArrayList resultscores = null;

    private double degree = 0;

    private Object[] itemNames = null;

    /**
     * @param items names of the documents; {@code items[j]} labels {@code parsedDoc[j]}
     * @param parsedDoc the parsed documents, one list of strings per document (element 0 is treated as the title)
     */
    MultiDocumentHandler(Object[] items, ArrayList<String>[] parsedDoc) {
        this.itemNames = items;
        this.parsedDocuments = parsedDoc;
    }

    /**
     * Collects every sentence whose score is at least <code>threshold</code> from all documents, then removes
     * redundant sentences from that selection. The outcome is available through the result getters.
     * 
     * @param threshold minimum score (inclusive) a sentence needs to be selected
     */
    public void createAllResults(double threshold) {
        this.resultdoc = new ArrayList<String>();
        this.resultfilenames = new ArrayList<String>();
        this.resultscores = new DoubleArrayList();
        System.out.println("multidocumenthandler: threshold: " + threshold);

        for (int j = 0; j < this.parsedDocuments.length; j++) {
            DoubleArrayList scores = this.allscores[j];
            for (int h = 0; h < scores.size(); h++) {
                double sentenceScore = scores.get(h);
                if (sentenceScore >= threshold) {
                    // score h belongs to sentence h + 1, because element 0 of the document is the title
                    this.resultdoc.add(this.parsedDocuments[j].get(h + 1));
                    this.resultfilenames.add(itemName(j));
                    this.resultscores.add(sentenceScore);
                }
            }
        }
        removeRedundancy(this.resultdoc, this.resultscores, this.resultfilenames);
    }

    /**
     * Filters redundant sentences out of the given parallel lists and stores the filtered lists as the current
     * result (replacing whatever the result getters returned before). The argument lists are only read.
     * <p>
     * For every sentence the most similar other sentence with a similarity of at least 0.5 is looked up. A sentence
     * without such a partner is kept. If the partner comes later in the list, the one of the two with the higher
     * score is kept (the earlier one on a tie). If the partner comes earlier, nothing is added for this sentence,
     * since the pair was already looked at from the partner's side. No sentence is added to the result more than
     * once.
     * </p>
     * 
     * @param resultd the candidate sentences
     * @param resultsc the score of each candidate sentence, parallel to <code>resultd</code>
     * @param resultfile the file name(s) of each candidate sentence, parallel to <code>resultd</code>
     */
    public void removeRedundancy(ArrayList<String> resultd, DoubleArrayList resultsc, ArrayList<String> resultfile) {
        int count = resultd.size();
        ArrayList<String> doc = new ArrayList<String>();
        DoubleArrayList score = new DoubleArrayList();
        ArrayList<String> files = new ArrayList<String>();
        // marks the indices already copied to the output, so a sentence that wins several comparisons is added once
        boolean[] kept = new boolean[count];

        for (int i = 0; i < count; i++) {
            String sent = resultd.get(i);
            double simMax = 0; // highest similarity found so far
            int sentMax = -1; // index of the most similar sentence, -1 if none reaches the threshold
            for (int j = 0; j < count; j++) {
                if (i == j) {
                    continue;
                }
                double sim = computeSimilarity(sent, resultd.get(j));
                if (sim >= REDUNDANCY_SIMILARITY && simMax < sim) {
                    simMax = sim;
                    sentMax = j;
                }
            }

            int winner;
            if (sentMax < 0) {
                // no sufficiently similar sentence: keep this one
                winner = i;
            } else if (sentMax > i) {
                // similar sentence has not been examined yet: keep the higher scored of the two
                winner = resultsc.get(i) >= resultsc.get(sentMax) ? i : sentMax;
            } else {
                // similar sentence was examined earlier: nothing to add for this one
                continue;
            }

            if (!kept[winner]) {
                kept[winner] = true;
                doc.add(resultd.get(winner));
                score.add(resultsc.get(winner));
                files.add(resultfile.get(winner));
            }
        }
        this.resultdoc = doc;
        this.resultscores = score;
        this.resultfilenames = files;
    }

    /**
     * @return the sentences of the current result (the internal list, not a copy); <code>null</code> if no result
     *         has been computed yet
     */
    public ArrayList<String> getResultDocs() {
        return resultdoc;
    }

    /**
     * @return the scores of the current result, parallel to {@link #getResultDocs()}; <code>null</code> if no result
     *         has been computed yet
     */
    public DoubleArrayList getResultScores() {
        return resultscores;
    }

    /**
     * @return the file name(s) of the current result, parallel to {@link #getResultDocs()}; <code>null</code> if no
     *         result has been computed yet
     */
    public ArrayList<String> getResultFileNames() {
        return resultfilenames;
    }

    /**
     * Identifies similar sentences across documents: every pair of distinct documents is compared once, the matches
     * are collected as the new result, and redundant sentences are removed from it afterwards.
     * 
     * @param degr minimum similarity (inclusive) two sentences of different documents need to count as a match
     */
    void find_similarities(double degr) {
        this.degree = degr;
        this.resultdoc = new ArrayList<String>();
        this.resultscores = new DoubleArrayList();
        this.resultfilenames = new ArrayList<String>();

        int documentCount = this.parsedDocuments.length;
        for (int i = 0; i < documentCount; i++) {
            // starting at i + 1 visits each unordered pair of documents exactly once
            for (int h = i + 1; h < documentCount; h++) {
                // the document with more entries goes first; on equal size the lower index goes first
                if (this.parsedDocuments[i].size() < this.parsedDocuments[h].size()) {
                    compareDocuments(h, i);
                } else {
                    compareDocuments(i, h);
                }
            }
        }
        removeRedundancy(this.resultdoc, this.resultscores, this.resultfilenames);
    }

    /**
     * Compares each sentence of document <code>d1</code> with all sentences of document <code>d2</code> (titles are
     * skipped, comparison is done on lower-cased text). For a sentence with at least one match of non-zero
     * similarity reaching the current similarity degree, the best match is determined (the last one on ties), the
     * pair is printed to standard output, and the higher scored sentence of the pair is appended to the result
     * together with its score and both document names.
     * 
     * @param d1 index of the first document
     * @param d2 index of the second document
     */
    void compareDocuments(int d1, int d2) {
        ArrayList<String> doc = this.parsedDocuments[d1];
        ArrayList<String> doc2 = this.parsedDocuments[d2];

        // index 0 holds the title, so sentences start at 1
        for (int a = 1; a < doc.size(); a++) {
            String sent = doc.get(a).toLowerCase();
            double simMax = 0;
            int sentMax = 0;
            String finalsent2 = "";

            for (int b = 1; b < doc2.size(); b++) {
                String sent2 = doc2.get(b).toLowerCase();
                double sim = computeSimilarity(sent, sent2);
                if (sim >= this.degree && simMax <= sim) {
                    simMax = sim;
                    sentMax = b;
                    finalsent2 = sent2;
                }
            }

            if (simMax != 0.0) {
                // scores are stored without the title, hence the - 1
                double score1 = this.allscores[d1].get(a - 1);
                System.out.println(this.itemNames[d1] + ": ");
                System.out.println(sent + ": " + score1);
                double score2 = this.allscores[d2].get(sentMax - 1);
                System.out.println(this.itemNames[d2] + ": ");
                System.out.println(finalsent2 + ": " + score2);
                System.out.println("simMax: " + simMax);
                System.out.println("");
                System.out.println("-------------------------------------");

                if (score1 >= score2) {
                    this.resultdoc.add(doc.get(a));
                    this.resultscores.add(score1);
                } else {
                    this.resultdoc.add(doc2.get(sentMax));
                    this.resultscores.add(score2);
                }
                this.resultfilenames.add(this.itemNames[d1] + ", " + this.itemNames[d2]);
            }
        }
    }

    /**
     * Computes the word overlap of two sentences as <code>2 * overlap / (words1 + words2)</code>, where the
     * sentences are split into words at the characters of {@link #WORD_DELIMITER} and <code>overlap</code> sums, over
     * the distinct words of the first sentence that occur in both, the smaller of the two occurrence counts. The
     * comparison itself does not change the case of the words.
     * 
     * @param sent the first sentence
     * @param sent2 the second sentence
     * @return the similarity; 0 if neither sentence contains a word
     */
    double computeSimilarity(String sent, String sent2) {
        String[] tok = tokenize(sent);
        String[] tok2 = tokenize(sent2);
        int totalWords = tok.length + tok2.length;
        if (totalWords == 0) {
            // nothing to compare; avoids 0 / 0 = NaN
            return 0.0;
        }

        ArrayList<String> counted = new ArrayList<String>();
        int overlap = 0;
        for (String word : tok) {
            // each distinct shared word contributes only once
            if (counted.contains(word)) {
                continue;
            }
            int num1 = ArrayUtils.countOccurrences(word, tok);
            int num2 = ArrayUtils.countOccurrences(word, tok2);
            if (num1 != 0 && num2 != 0) {
                overlap += Math.min(num1, num2);
                counted.add(word);
            }
        }
        return 2.0 * overlap / totalWords;
    }

    /**
     * Stores the sentence scores used by the selection methods.
     * 
     * @param scores one score list per document, in the order of the parsed documents
     */
    void storeScores(DoubleArrayList[] scores) {
        this.allscores = scores;
    }

    /** Splits a sentence into its words, dropping the empty tokens a leading delimiter would produce. */
    private static String[] tokenize(String sentence) {
        String[] raw = sentence.split(WORD_DELIMITER_REGEX);
        ArrayList<String> words = new ArrayList<String>(raw.length);
        for (String token : raw) {
            if (token.length() > 0) {
                words.add(token);
            }
        }
        return words.toArray(new String[words.size()]);
    }

    /** Returns the name of the document at the given index as a string, or <code>null</code> if it has none. */
    private String itemName(int index) {
        Object item = this.itemNames[index];
        return item == null ? null : item.toString();
    }
}