/*******************************************************************************
* Copyright 2010 Stephen O'Rourke (stephen.orourke@sydney.edu.au)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
package tml.vectorspace.operations;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import tml.corpus.Corpus;
import tml.corpus.TextDocument;
import tml.vectorspace.operations.results.RapidAutomaticKeywordExtractionResult;
/**
* This operation extracts keywords from all documents in a {@link Corpus}.
* Keyword extraction is performed using the Rapid Automatic Keyword Extraction
* (RAKE) method described in:
*
* Rose, S., Engel, D., Cramer, N., & Cowley, W. (2010). Automatic Keyword
* Extraction from Individual Documents. In M. W. Berry & J. Kogan (Eds.), Text
* Mining: Theory and Applications: John Wiley & Sons.
*
* @author Stephen O'Rourke
*
*/
public class RapidAutomaticKeywordExtraction extends AbstractOperation<RapidAutomaticKeywordExtractionResult> {

    /** Matches tokens made up entirely of non-word characters (punctuation/symbols); such tokens break a candidate phrase. */
    private static final Pattern NON_WORD = Pattern.compile("\\W+");

    /**
     * Splits a candidate keyword phrase into its constituent words. {@code \s+}
     * (rather than {@code \s}) so that consecutive whitespace — e.g. a line
     * break followed by a space inside a phrase — cannot produce empty tokens
     * that would pollute the frequency/degree maps.
     */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    public RapidAutomaticKeywordExtraction() {
        this.name = "Rapid Automatic Keyword Extraction";
    }

    /**
     * Returns the results table; convenience alias for {@link #getResultsTable()}.
     */
    public Object[][] getInnerData() {
        return getResultsTable();
    }

    /**
     * Builds a two-column table of results: column 0 is the keyword phrase,
     * column 1 is its RAKE weighting. One row per result, in result order.
     */
    @Override
    public Object[][] getResultsTable() {
        Object[][] resultsTable = new Object[results.size()][2];
        for (int i = 0; i < results.size(); i++) {
            resultsTable[i][0] = results.get(i).getKeyword();
            resultsTable[i][1] = results.get(i).getWeighting();
        }
        return resultsTable;
    }

    /**
     * Column headers matching {@link #getResultsTable()}.
     */
    @Override
    public Object[] getResultsTableHeader() {
        Object[] data = new Object[2];
        data[0] = "Keyword";
        data[1] = "Weighting";
        return data;
    }

    /**
     * Runs RAKE over every document in the corpus:
     * <ol>
     * <li>split documents into candidate phrases at stopwords and punctuation;</li>
     * <li>tally per-word frequency and degree (co-occurrence length) over all candidates;</li>
     * <li>score each distinct candidate as the sum of degree(word)/frequency(word);</li>
     * <li>sort results by descending weighting.</li>
     * </ol>
     *
     * @throws Exception if the repository cannot supply a document
     */
    @Override
    public void start() throws Exception {
        // Single locale for sentence/word breaking AND case folding, so that
        // tokenization and lower-casing agree (avoids the platform-default
        // locale, e.g. the Turkish dotless-i problem).
        Locale locale = corpus.getRepository().getLocale();
        Set<String> stopwords = new HashSet<String>(Arrays.asList(repository.getStopwords()));

        // extract candidate keyword phrases
        List<String> keywords = extractCandidateKeywords(stopwords, locale);

        // calculate word frequency and degree
        Map<String, Integer> wordFrequency = new HashMap<String, Integer>();
        Map<String, Integer> wordDegree = new HashMap<String, Integer>();
        for (String keyword : keywords) {
            String[] words = WHITESPACE.split(keyword);
            for (String word : words) {
                if (wordFrequency.containsKey(word)) {
                    wordFrequency.put(word, wordFrequency.get(word) + 1);
                    // degree counts co-occurrence: each word accrues the length
                    // of every phrase it appears in (including itself)
                    wordDegree.put(word, wordDegree.get(word) + words.length);
                } else {
                    wordFrequency.put(word, 1);
                    wordDegree.put(word, words.length);
                }
            }
        }

        // calculate keyword weighting results; LinkedHashSet keeps first-seen
        // order while scoring each distinct phrase exactly once
        results = new ArrayList<RapidAutomaticKeywordExtractionResult>();
        for (String keyword : new LinkedHashSet<String>(keywords)) {
            double weighting = 0;
            for (String word : WHITESPACE.split(keyword)) {
                weighting += (double) wordDegree.get(word) / (double) wordFrequency.get(word);
            }
            RapidAutomaticKeywordExtractionResult result = new RapidAutomaticKeywordExtractionResult();
            result.setKeyword(keyword);
            result.setWeighting(weighting);
            results.add(result);
        }

        // sort results by keyword weighting, highest first
        Collections.sort(results, Collections.reverseOrder());
    }

    /**
     * Scans every passage in the corpus and collects candidate keyword
     * phrases: maximal runs of consecutive words that contain no stopword and
     * no punctuation-only token. Candidates may repeat; the caller counts them.
     *
     * @param stopwords lower-cased stopword set that terminates a phrase
     * @param locale    locale used for sentence/word breaking and case folding
     * @return candidate phrases in document order (possibly with duplicates)
     * @throws Exception if a document cannot be loaded from the repository
     */
    private List<String> extractCandidateKeywords(Set<String> stopwords, Locale locale) throws Exception {
        List<String> keywords = new LinkedList<String>();
        // Break iterators are stateful but reusable via setText(); create them
        // once instead of once per document/sentence.
        BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(locale);
        BreakIterator wordIterator = BreakIterator.getWordInstance(locale);
        for (String textDocumentId : corpus.getPassages()) {
            TextDocument textDocument = repository.getTextDocument(textDocumentId);
            String text = textDocument.getContent();
            sentenceIterator.setText(text);
            int sentenceStart = sentenceIterator.first();
            int sentenceEnd;
            while ((sentenceEnd = sentenceIterator.next()) != BreakIterator.DONE) {
                String sentence = text.substring(sentenceStart, sentenceEnd);
                wordIterator.setText(sentence);
                int wordStart = wordIterator.first();
                int keywordStart = wordStart;
                int wordEnd;
                while ((wordEnd = wordIterator.next()) != BreakIterator.DONE) {
                    String word = cleanWord(sentence.substring(wordStart, wordEnd), locale);
                    if (stopwords.contains(word) || NON_WORD.matcher(word).matches()) {
                        // stopword or punctuation terminates the current phrase
                        String keyword = cleanWord(sentence.substring(keywordStart, wordStart), locale);
                        if (keyword.length() > 0) {
                            keywords.add(keyword);
                        }
                        keywordStart = wordEnd;
                    } else if (wordEnd == sentence.length()) {
                        // sentence end terminates the current phrase
                        String keyword = cleanWord(sentence.substring(keywordStart, wordEnd), locale);
                        if (keyword.length() > 0) {
                            keywords.add(keyword);
                        }
                    }
                    wordStart = wordEnd;
                }
                sentenceStart = sentenceEnd;
            }
        }
        return keywords;
    }

    /**
     * Normalizes a token: trims surrounding whitespace and lower-cases it in
     * the given locale (the repository's locale, matching the break iterators,
     * rather than the platform default).
     */
    private String cleanWord(String word, Locale locale) {
        return word.trim().toLowerCase(locale);
    }
}