/* * Copyright 2004-2010 Information & Software Engineering Group (188/1) * Institute of Software Technology and Interactive Systems * Vienna University of Technology, Austria * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package at.tuwien.ifs.somtoolbox.summarisation.methods; import java.util.ArrayList; import cern.colt.list.DoubleArrayList; import cern.colt.list.IntArrayList; import cern.colt.matrix.DoubleMatrix1D; import at.tuwien.ifs.somtoolbox.data.InputData; import at.tuwien.ifs.somtoolbox.data.InputDatum; import at.tuwien.ifs.somtoolbox.data.SOMLibTemplateVector; import at.tuwien.ifs.somtoolbox.summarisation.parser.Scorer; /** * @author Julius Penaranda * @version $Id: TFxIDF.java 3589 2010-05-21 10:42:01Z mayer $ */ public class TFxIDF { private InputData inputVector = null; private SOMLibTemplateVector templateVector = null; private InputDatum inputd = null; private DoubleMatrix1D doublevec = null; private ArrayList<String> doc = null; private IntArrayList intarray = new IntArrayList(); private DoubleArrayList doubarray = new DoubleArrayList(); private DoubleArrayList tfxidfarray = null; private PartOfSpeech pos = null; public TFxIDF() { // do nothing } public TFxIDF(PartOfSpeech p) { this.pos = p; } public void setVectors(InputData input, SOMLibTemplateVector template) { this.inputVector = input; this.templateVector = template; } /** Sets document */ public void setDocument(String filename, ArrayList<String> doc) { this.inputd = this.inputVector.getInputDatum(filename); this.doc = doc; } /** * computes scores of each sentence */ public DoubleArrayList computeScores(String type) { this.intarray = new IntArrayList(); this.doubarray = new DoubleArrayList(); this.doublevec = this.inputd.getVector(); this.doublevec.getNonZeros(intarray, doubarray); tfxidfarray = new DoubleArrayList(); // stores score values String sent; // for each sentence, ignore title for (int a = 1; a < this.doc.size(); a++) { sent = doc.get(a).toLowerCase(); // converts to lower case double sentvalue = 0.0; int numWords = 0; // store scores of all words if (type == Scorer.ALL) { // for each word in templatevector for (int b = 0; b < intarray.size(); b++) { String word = templateVector.getLabel(intarray.get(b)); // add score value to 'sentvalue' while (sent.indexOf(word) != -1) { sentvalue = sentvalue + doubarray.get(b); sent = sent.substring(sent.indexOf(word) + word.length()); numWords++; } } } // store scores of other words, i.e. noun, adjective,.. else { ArrayList<String> resulttokens = this.pos.getTokens(sent, type); for (int b = 0; b < intarray.size(); b++) { String word = templateVector.getLabel(intarray.get(b)); for (int i = 0; i < resulttokens.size(); i++) { if (resulttokens.get(i).indexOf(word) != -1) { // System.out.println("word gefunden: "+word+"("+doubarray.get(b)+") value: "+sentvalue); sentvalue = sentvalue + doubarray.get(b); numWords++; } } } } // normalize and add score of sentence to tfxidf-Array if (sentvalue != 0) { sentvalue = sentvalue / new Integer(numWords).doubleValue(); } tfxidfarray.add(sentvalue); } return tfxidfarray; } }