package semanticMarkup.ling.learn;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import semanticMarkup.core.Treatment;
import semanticMarkup.io.input.lib.db.ParentTagProvider;
import semanticMarkup.know.IGlossary;
import semanticMarkup.know.lib.WordNetPOSKnowledgeBase;
import semanticMarkup.ling.Token;
import semanticMarkup.ling.learn.auxiliary.AjectiveReplacementForNoun;
import semanticMarkup.ling.learn.auxiliary.StringPair;
import semanticMarkup.ling.learn.dataholder.DataHolder;
import semanticMarkup.ling.learn.dataholder.SentenceStructure;
import semanticMarkup.ling.learn.knowledge.Constant;
import semanticMarkup.ling.learn.utility.LearnerUtility;
import semanticMarkup.ling.learn.utility.StringUtility;
import semanticMarkup.ling.transform.ITokenizer;
public class UnsupervisedClauseMarkup implements ITerminologyLearner {
// Date holder
public DataHolder myDataHolder;
// Configuration
private Configuration myConfiguration;
// Learner
private Learner myLearner;
// Utility
private LearnerUtility myLearnerUtility;
protected List<String> adjnouns;
protected Map<String, String> adjnounsent;
protected Set<String> bracketTags;
protected Map<String, Set<String>> categoryTerms;
protected Map<String, String> heuristicNouns;
protected Set<String> modifiers;
protected Map<String, Set<String>> roleToWords;
protected Set<String> sentences;
protected Map<Treatment, LinkedHashMap<String, String>> sentencesForOrganStateMarker;
protected Map<Treatment, LinkedHashMap<String, String>> sentenceTags;
protected Set<String> tags;
protected Map<String, Set<String>> termCategories;
protected Set<String> wordRoleTags;
protected Map<String, Set<String>> wordsToRoles;
protected Map<String, Set<String>> wordToSources;
protected Map<String, Treatment> fileTreatments = new HashMap<String, Treatment>();
private Map<String, AjectiveReplacementForNoun> adjectiveReplacementsForNouns;
private IGlossary glossary;
private String markupMode;
private ParentTagProvider parentTagProvider;
private ITokenizer sentenceDetector;
private Set<String> selectedSources;
private ITokenizer tokenizer;
/**
* Constructor of UnsupervisedClauseMarkup class. Create a new
* UnsupervisedClauseMarkup object.
*
*/
public UnsupervisedClauseMarkup(String markupMode,
IGlossary glossary,
ParentTagProvider parentTagProvider,
Set<String> selectedSources,
ITokenizer sentenceDetector,
ITokenizer tokenizer) {
//this.chrDir = desDir.replaceAll("descriptions.*", "characters/");
this.glossary = glossary;
this.markupMode = markupMode;
this.parentTagProvider = parentTagProvider;
this.sentenceDetector = sentenceDetector;
this.selectedSources = new HashSet<String>();
this.selectedSources.addAll(selectedSources);
this.tokenizer = tokenizer;
this.myConfiguration = new Configuration();
WordNetPOSKnowledgeBase wordNetPOSKnowledgeBase = null;
try {
wordNetPOSKnowledgeBase = new WordNetPOSKnowledgeBase(this.myConfiguration.getWordNetDictDir(), false);
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
this.myLearnerUtility = new LearnerUtility(this.sentenceDetector, this.tokenizer, wordNetPOSKnowledgeBase);
this.myDataHolder = new DataHolder(this.myConfiguration, this.myLearnerUtility.getConstant(), this.myLearnerUtility.getWordFormUtility());
this.myLearner = new Learner(this.myConfiguration, this.tokenizer, this.myLearnerUtility);
}
// learn
public void learn(List<Treatment> treatments, IGlossary glossary) {
this.myDataHolder = this.myLearner.learn(treatments, glossary, this.markupMode);
readResults(treatments);
}
@Override
public void readResults(List<Treatment> treatments) {
// import data from data holder
this.adjnouns = this.readAdjNouns();
this.adjnounsent = this.readAdjNounSent();
this.bracketTags = this.readBracketTags();
this.heuristicNouns = this.readHeuristicNouns();
this.categoryTerms = this.readCategoryTerms();
this.modifiers = this.readModifiers();
this.roleToWords = this.readRoleToWords();
this.sentences = this.readSentences(treatments);
this.sentencesForOrganStateMarker = this.readSentencesForOrganStateMarker(treatments);
this.sentenceTags = this.readSentenceTags(treatments);
this.tags = this.readTags();
this.termCategories = this.readTermCategories();
this.wordRoleTags = this.readWordRoleTags();
this.wordsToRoles = this.readWordsToRoles();
this.wordToSources = this.readWordToSources();
this.adjectiveReplacementsForNouns = readAdjectiveReplacementsForNouns();
}
// interface methods
@Override
public List<String> getAdjNouns() {
return this.adjnouns;
}
@Override
public Map<String, String> getAdjNounSent() {
return this.adjnounsent;
}
@Override
public Set<String> getBracketTags() {
return this.bracketTags;
}
@Override
public Map<String, Set<String>> getCategoryTerms() {
return this.categoryTerms;
}
@Override
public Map<String, String> getHeuristicNouns() {
return this.heuristicNouns;
}
@Override
public Set<String> getModifiers() {
return this.modifiers;
}
@Override
public Map<String, Set<String>> getRoleToWords() {
return this.roleToWords;
}
@Override
public Set<String> getSentences() {
return this.sentences;
}
@Override
public Map<Treatment, LinkedHashMap<String, String>> getSentencesForOrganStateMarker() {
return this.sentencesForOrganStateMarker;
}
@Override
public Map<Treatment, LinkedHashMap<String, String>> getSentenceTags() {
return this.sentenceTags;
}
@Override
public Set<String> getTags() {
return this.tags;
}
@Override
public Map<String, Set<String>> getTermCategories() {
return this.termCategories;
}
@Override
public Set<String> getWordRoleTags() {
return this.wordRoleTags;
}
@Override
public Map<String, Set<String>> getWordsToRoles() {
return this.wordsToRoles;
}
@Override
public Map<String, Set<String>> getWordToSources() {
return this.wordToSources;
}
// methods dealing with ajectiveReplacementForNoun
public Map<String, AjectiveReplacementForNoun> readAdjectiveReplacementsForNouns() {
Map<String, AjectiveReplacementForNoun> result = new HashMap<String, AjectiveReplacementForNoun>();
Iterator<SentenceStructure> iter = this.getDataHolder().getSentenceHolder()
.iterator();
while (iter.hasNext()) {
SentenceStructure sentenceObject = iter.next();
String modifier = sentenceObject.getModifier();
String tag = sentenceObject.getTag();
if ((!StringUtils.equals(modifier, ""))
&& (StringUtility.isEntireMatchedNullSafe(tag, "^\\[.*$"))) {
String source = sentenceObject.getSource();
modifier = modifier.replaceAll("\\[|\\]|>|<|(|)", "");
tag = tag.replaceAll("\\[|\\]|>|<|(|)", "");
result.put(source, new AjectiveReplacementForNoun(modifier,tag, source));
}
}
return result;
}
@Override
public Map<String, AjectiveReplacementForNoun> getAdjectiveReplacementsForNouns() {
return this.adjectiveReplacementsForNouns;
}
// methods importing data from data holder to class variables
public List<String> readAdjNouns() {
PropertyConfigurator.configure( "conf/log4j.properties" );
Logger myLogger = Logger.getLogger("unsupervisedClauseMarkup.getAdjectiveReplacementsForNouns");
if (this.myDataHolder == null) {
return null;
}
Set<String> myAdjNounSet = new HashSet<String>();
Iterator<SentenceStructure> iter = this.myDataHolder.getSentenceHolder()
.iterator();
while (iter.hasNext()) {
SentenceStructure sentenceObject = iter.next();
String modifier = sentenceObject.getModifier();
String tag = sentenceObject.getTag();
myLogger.trace("tag: "+tag);
if (tag != null) {
if (tag.matches("^\\[.*$")) {
modifier = modifier.replaceAll("\\[.*?\\]", "").trim();
myAdjNounSet.add(modifier);
}
}
}
List<String> myAdjNouns = new ArrayList<String>();
myAdjNouns.addAll(myAdjNounSet);
return myAdjNouns;
}
public Map<String, String> readAdjNounSent() {
PropertyConfigurator.configure( "conf/log4j.properties" );
Logger myLogger = Logger.getLogger("unsupervisedClauseMarkup.readAdjNounSent");
if (this.myDataHolder == null) {
return null;
}
Map<String, String> myAdjNounSent = new HashMap<String, String>();
// collect sentences that need adj-nn disambiguation
Iterator<SentenceStructure> iter = this.myDataHolder.getSentenceHolder()
.iterator();
while (iter.hasNext()) {
SentenceStructure sentenceObject = iter.next();
String modifier = sentenceObject.getModifier();
String tag = sentenceObject.getTag();
myLogger.trace("tag: "+tag);
if ((modifier != null)&&(tag != null)) {
if ((!(modifier.equals(""))) && (tag.matches("^\\[.*$"))) {
modifier = modifier.replaceAll("\\[.*?\\]", "").trim();
myAdjNounSent.put(tag, modifier);
}
}
}
return myAdjNounSent;
}
public Set<String> readBracketTags() {
if (this.myDataHolder == null) {
return null;
}
Set<String> tags = new HashSet<String>();
Iterator<SentenceStructure> iter = this.getDataHolder().getSentenceHolder().iterator();
while (iter.hasNext()) {
SentenceStructure sentence = iter.next();
String thisTag = sentence.getTag();
if (thisTag != null) {
if (StringUtility.createMatcher(thisTag, "^\\[.*\\]$").find()) {
String thisModifier = sentence.getModifier();
String modifier = thisModifier
.replaceAll("\\[^\\[*\\]", "");
if (!modifier.equals("")) {
String tag;
if (modifier.lastIndexOf(" ") < 0) {
tag = modifier;
} else {
// last word from modifier
tag = modifier
.substring(modifier.lastIndexOf(" ") + 1);
}
if (tag.indexOf("[") >= 0
|| tag.matches(".*?(\\d|" + this.myLearnerUtility.getConstant().STOP
+ ").*"))
continue;
tags.add(tag);
}
}
}
}
return tags;
}
protected Map<String, Set<String>> readCategoryTerms() {
if (this.myDataHolder == null) {
return null;
}
Map<String, Set<String>> categoryNames = new HashMap<String, Set<String>>();
Iterator<StringPair> iter = this.getDataHolder().getTermCategoryHolder().iterator();
while (iter.hasNext()) {
StringPair termCategoryObject = iter.next();
String term = termCategoryObject.getHead();
String category = termCategoryObject.getTail();
if (!categoryNames.containsKey(category))
categoryNames.put(category, new HashSet<String>());
categoryNames.get(category).add(term);
}
return categoryNames;
}
public Map<String, String> readHeuristicNouns() {
if (this.myDataHolder == null) {
return null;
}
Map<String, String> myHeuristicNouns = new HashMap<String, String>();
myHeuristicNouns.putAll(this.getDataHolder().getHeuristicNounHolder());
return myHeuristicNouns;
}
public Set<String> readModifiers() {
if (this.myDataHolder == null) {
return null;
}
Set<String> modifiers = new HashSet<String>();
Iterator<SentenceStructure> iter = this.getDataHolder().getSentenceHolder()
.iterator();
while (iter.hasNext()) {
String modifier = iter.next().getTag();
modifiers.add(modifier);
}
return modifiers;
}
public Map<String, Set<String>> readRoleToWords() {
if (this.myDataHolder == null) {
return null;
}
Map<String, Set<String>> roleToWords = new HashMap<String, Set<String>>();
Iterator<Entry<StringPair, String>> iter = this.getDataHolder()
.getWordRoleHolder().entrySet().iterator();
while (iter.hasNext()) {
Entry<StringPair, String> wordRoleObject = iter.next();
String word = wordRoleObject.getKey().getHead();
// perl treated hyphens as underscores
word = word.replaceAll("_", "-");
String semanticRole = wordRoleObject.getKey().getTail();
if (!roleToWords.containsKey(semanticRole))
roleToWords.put(semanticRole, new HashSet<String>());
roleToWords.get(semanticRole).add(word);
}
return roleToWords;
}
public Set<String> readSentences(List<Treatment> treatments) {
if (this.myDataHolder == null) {
return null;
}
Set<String> result = new HashSet<String>();
Iterator<SentenceStructure> iter = this.getDataHolder().getSentenceHolder().iterator();
while (iter.hasNext()) {
SentenceStructure sentenceObject = iter.next();
String sentence = sentenceObject.getSentence();
result.add(sentence);
}
return sentences;
}
public HashMap<Treatment, LinkedHashMap<String, String>> readSentencesForOrganStateMarker(List<Treatment> treatments) {
if (this.myDataHolder == null) {
return null;
}
HashMap<Treatment, LinkedHashMap<String, String>> sentences = new HashMap<Treatment, LinkedHashMap<String, String>>();
List<SentenceStructure> sentenceHolder = this.getDataHolder().getSentenceHolder();
String previousTreatmentId = "-1";
for (int i = sentenceHolder.size()-1;i>=0;i--) {
SentenceStructure sentenceObject = sentenceHolder.get(i);
String source = this.getSource(sentenceObject.getSource());
String modifier = sentenceObject.getModifier();
String tag = sentenceObject.getTag();
String sentence = sentenceObject.getSentence().trim();
if(sentence.length()!=0){
String treatmentId = getTreatmentId(source);
if(selectedSources.isEmpty() || selectedSources.contains(source)) {
if(!treatmentId.equals(previousTreatmentId)) {
previousTreatmentId = treatmentId;
}
String text = sentence;
text = text.replaceAll("[ _-]+\\s*shaped", "-shaped").replaceAll("(?<=\\s)µ\\s+m\\b", "um");
text = text.replaceAll("°", "°");
text = text.replaceAll("\\bca\\s*\\.", "ca");
text = modifier+"##"+tag+"##"+text;
Treatment treatment = fileTreatments.get(treatmentId);
if(!sentences.containsKey(treatment))
sentences.put(treatment, new LinkedHashMap<String, String>());
sentences.get(treatment).put(source, text);
}
}
}
return sentences;
}
public Map<Treatment, LinkedHashMap<String, String>> readSentenceTags(List<Treatment> treatments) {
if (this.myDataHolder == null) {
return null;
}
Map<Treatment, LinkedHashMap<String, String>> tags = new HashMap<Treatment, LinkedHashMap<String, String>>();
String previousTag = null;
String previousTreatmentId = "-1";
Iterator<SentenceStructure> iter = this.getDataHolder().getSentenceHolder().iterator();
while (iter.hasNext()) {
SentenceStructure sentenceObject = iter.next();
String source = this.getSource(sentenceObject.getSource());
String treatmentId = getTreatmentId(source);
if(selectedSources.isEmpty() || selectedSources.contains(source)) {
if(!treatmentId.equals(previousTreatmentId)) {
previousTreatmentId = treatmentId;
//listId++;
}
String tag = sentenceObject.getTag();
if(tag == null)
tag = "";
tag = tag.replaceAll("\\W", "");
Treatment treatment = fileTreatments.get(treatmentId);
if(!tags.containsKey(treatment))
tags.put(treatment, new LinkedHashMap<String, String>());
if(!tag.equals("ditto")) {
tags.get(treatment).put(source, tag);
previousTag = tag;
} else {
tags.get(treatment).put(source, previousTag);
}
}
}
return tags;
}
public Set<String> readTags() {
if (this.myDataHolder == null) {
return null;
}
Set<String> tags = new HashSet<String>();
Iterator<SentenceStructure> iter = this.getDataHolder().getSentenceHolder()
.iterator();
while (iter.hasNext()) {
String tag = iter.next().getTag();
tags.add(tag);
}
return tags;
}
public Map<String, Set<String>> readTermCategories() {
if (this.myDataHolder == null) {
return null;
}
Map<String, Set<String>> termCategories = new HashMap<String, Set<String>>();
Iterator<StringPair> iter = this.getDataHolder()
.getTermCategoryHolder().iterator();
StringPair myTermCategoryPair;
while (iter.hasNext()) {
myTermCategoryPair = iter.next();
String term = myTermCategoryPair.getHead();
String category = myTermCategoryPair.getTail();
if (!termCategories.containsKey(term))
termCategories.put(term, new HashSet<String>());
termCategories.get(term).add(category);
}
return termCategories;
}
public Set<String> readWordRoleTags() {
if (this.myDataHolder == null) {
return null;
}
Set<String> tags = new HashSet<String>();
Iterator<Entry<StringPair, String>> iter = this.getDataHolder()
.getWordRoleHolder().entrySet().iterator();
while (iter.hasNext()) {
Entry<StringPair, String> wordRoleObject = iter.next();
String role = wordRoleObject.getKey().getTail();
if (StringUtils.equals(role, "op")
|| StringUtils.equals(role, "os")) {
String word = wordRoleObject.getKey().getHead();
String tag = word.replaceAll("_", "-").trim();
if (!tag.isEmpty())
tags.add(tag);
}
}
return tags;
}
public Map<String, Set<String>> readWordsToRoles() {
if (this.myDataHolder == null) {
return null;
}
Map<String, Set<String>> wordsToRoles = new HashMap<String, Set<String>>();
Iterator<Entry<StringPair, String>> iter = this.getDataHolder()
.getWordRoleHolder().entrySet().iterator();
while (iter.hasNext()) {
Entry<StringPair, String> wordRoleObject = iter.next();
String word = wordRoleObject.getKey().getHead();
// learner treated hyphens as underscores
word = word.replaceAll("_", "-");
String semanticRole = wordRoleObject.getKey().getTail();
if (!wordsToRoles.containsKey(word))
wordsToRoles.put(word, new HashSet<String>());
wordsToRoles.get(word).add(semanticRole);
}
return wordsToRoles;
}
public Map<String, Set<String>> readWordToSources() {
if (this.myDataHolder == null) {
return null;
}
Map<String, Set<String>> myWordToSources = new HashMap<String, Set<String>>();
Iterator<SentenceStructure> iter = this.myDataHolder.getSentenceHolder()
.iterator();
while (iter.hasNext()) {
SentenceStructure sentenceObject = iter.next();
String source = this.getSource(sentenceObject.getSource());
String sentence = sentenceObject.getSentence();
List<Token> tokens = this.tokenizer.tokenize(sentence);
for(Token token : tokens) {
String word = token.getContent();
if(!myWordToSources.containsKey(word))
myWordToSources.put(word, new HashSet<String>());
myWordToSources.get(word).add(source);
}
}
return myWordToSources;
}
// Miscellaneous
public void initParentTagProvider(ParentTagProvider parentTagProvider2) {
HashMap<String, String> parentTags = new HashMap<String, String>();
HashMap<String, String> grandParentTags = new HashMap<String, String>();
Iterator<SentenceStructure> iter = this.getDataHolder().getSentenceHolder()
.iterator();
while (iter.hasNext()) {
SentenceStructure sentenceObject = iter.next();
String parentTag = "";
String grandParentTag = "";
String source = getSource(sentenceObject.getSource());
String tag = sentenceObject.getTag();
parentTags.put(source, parentTag);
grandParentTags.put(source, grandParentTag);
grandParentTag = parentTag;
if (tag != null && !tag.equals("ditto"))
parentTag = tag;
else if (tag == null)
parentTag = "";
}
this.parentTagProvider.init(parentTags, grandParentTags);
}
//Utilities
public DataHolder getDataHolder() {
return this.myDataHolder;
}
protected String getTreatmentId(String sourceString) {
String[] sourceParts = sourceString.split("\\.");
return sourceParts[0];
}
protected String getSource(String sourceString) {
String[] sourceParts = sourceString.split("\\.");
return sourceParts[0] + "." + sourceParts[2];
}
}