//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.coreference.impl.enhancers;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.Mention;
import uk.gov.dstl.baleen.annotators.coreference.impl.data.MentionType;
import uk.gov.dstl.baleen.types.language.WordToken;
/**
 * Generates acronyms for a mention.
 *
 * <p>Pronoun mentions are left untouched. A mention that is already flagged as an acronym gets
 * its own upper-cased text as its only acronym. For every other mention, candidate acronyms are
 * built from the initials of the whitespace-separated words of the mention text and from the
 * initials of the words tagged {@code NNP}.
 */
public class AcronymEnhancer implements MentionEnhancer {

    /**
     * An acronym is only generated when strictly more than this many upper case initials are
     * found; this avoids treating ordinary capitalisation (e.g. the start of a sentence) as an
     * acronym.
     */
    private static final int UPPER_CASE_INITIALS_THRESHOLD = 2;

    /**
     * Computes the acronyms for the mention and stores them through
     * {@link Mention#setAcronym}.
     *
     * @param mention the mention to enhance; pronouns are ignored
     */
    @Override
    public void enhance(Mention mention) {
        // Stanford just use the upper case initials, and only on the NNP parts.
        // We look at every word as well as at the NNP words, and when there are more than
        // two upper case initials we also generate the variant that includes the initials
        // of the lower case words.
        if (mention.getType() == MentionType.PRONOUN) {
            return;
        }

        Set<String> allAcronyms;
        if (mention.isAcronym()) {
            // Locale.ROOT keeps the result independent of the JVM default locale
            allAcronyms = Collections.singleton(mention.getText().toUpperCase(Locale.ROOT));
        } else {
            final Collection<WordToken> words = mention.getWords();
            allAcronyms = new HashSet<>();
            allAcronyms.addAll(acronyms(mention));
            allAcronyms.addAll(acronymsNNP(words));
        }

        if (allAcronyms.isEmpty()) {
            // Normalise "nothing found" to the shared immutable empty set
            allAcronyms = Collections.emptySet();
        }

        mention.setAcronym(allAcronyms);
    }

    /**
     * Generate acronyms from the covered text, using the first character of every
     * whitespace-separated word.
     *
     * @param mention the mention whose text is examined
     * @return the generated acronyms, empty if there are too few upper case initials
     */
    private Set<String> acronyms(Mention mention) {
        final String text = mention.getText();
        final StringBuilder upperCase = new StringBuilder();
        final StringBuilder upperAndLowerCase = new StringBuilder();

        boolean atWordStart = true;
        for (int i = 0; i < text.length(); i++) {
            final char c = text.charAt(i);
            if (Character.isWhitespace(c)) {
                // Whitespace is never an initial (this also covers leading and repeated
                // whitespace); the next non-whitespace character starts a new word.
                atWordStart = true;
                continue;
            }
            if (atWordStart) {
                appendInitial(c, upperCase, upperAndLowerCase);
                atWordStart = false;
            }
        }

        return toAcronyms(upperCase, upperAndLowerCase);
    }

    /**
     * Create acronyms based on just the NNP words but, unlike Stanford, also use the lower case
     * initials.
     *
     * @param words the word tokens of the mention
     * @return the generated acronyms, empty if there are too few upper case initials
     */
    private Set<String> acronymsNNP(Collection<WordToken> words) {
        final StringBuilder upperCaseNNP = new StringBuilder();
        final StringBuilder upperAndLowerCaseNNP = new StringBuilder();

        for (final WordToken word : words) {
            if (!"NNP".equalsIgnoreCase(word.getPartOfSpeech())) {
                continue;
            }
            final String covered = word.getCoveredText();
            if (covered == null || covered.isEmpty()) {
                // No first character to take; skip rather than fail on charAt(0)
                continue;
            }
            appendInitial(covered.charAt(0), upperCaseNNP, upperAndLowerCaseNNP);
        }

        return toAcronyms(upperCaseNNP, upperAndLowerCaseNNP);
    }

    /**
     * Record a word initial: upper case initials go to both buffers, anything else only to the
     * combined buffer.
     */
    private static void appendInitial(char initial, StringBuilder upperCase, StringBuilder upperAndLowerCase) {
        if (Character.isUpperCase(initial)) {
            upperCase.append(initial);
        }
        upperAndLowerCase.append(initial);
    }

    /**
     * Turn the collected initials into acronyms.
     *
     * <p>Nothing is produced unless there are more than
     * {@link #UPPER_CASE_INITIALS_THRESHOLD} upper case initials. When there are, the upper case
     * only acronym is added and, if some words had non upper case initials, the upper-cased
     * acronym built from all the initials is added too.
     */
    private static Set<String> toAcronyms(CharSequence upperCase, CharSequence upperAndLowerCase) {
        final Set<String> acronyms = new HashSet<>();
        if (upperCase.length() > UPPER_CASE_INITIALS_THRESHOLD) {
            acronyms.add(upperCase.toString());
            if (upperAndLowerCase.length() != upperCase.length()) {
                acronyms.add(upperAndLowerCase.toString().toUpperCase(Locale.ROOT));
            }
        }
        return acronyms;
    }
}