/**
* Copyright 2014, Emory University
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.emory.clir.clearnlp.component.mode.pos;
import java.io.Serializable;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import edu.emory.clir.clearnlp.collection.map.ObjectIntHashMap;
import edu.emory.clir.clearnlp.collection.ngram.Bigram;
import edu.emory.clir.clearnlp.collection.pair.ObjectDoublePair;
import edu.emory.clir.clearnlp.dependency.DEPNode;
import edu.emory.clir.clearnlp.util.DSUtils;
import edu.emory.clir.clearnlp.util.Joiner;
import edu.emory.clir.clearnlp.util.StringUtils;
import edu.emory.clir.clearnlp.util.constant.StringConst;
/**
* @since 3.0.0
* @author Jinho D. Choi ({@code jinho.choi@emory.edu})
*/
public class POSLexicon implements Serializable
{
private static final long serialVersionUID = 8363531867786160098L;
private ObjectIntHashMap<String> document_frequencies;
private Map<String,String> ambiguity_class_features;
private Bigram<String,String> ambiguity_classes;
private String[] word_vector_paths;
private Set<String> document;
private double ambiguity_class_threshold;
private int document_frequency_cutoff;
private int document_size;
private int tree_count;
public POSLexicon(POSConfiguration configuration)
{
document_frequencies = new ObjectIntHashMap<>();
ambiguity_class_features = new HashMap<>();
ambiguity_classes = new Bigram<>();
initDocument();
setAmbiguityClassThreshold(configuration.getAmbiguityClassThreshold());
setDocumentFrequencyCutoff(configuration.getDocumentFrequencyCutoff());
setDocumentSize(configuration.getDocumentSize());
}
// ============================== Collect ==============================
public void collect(POSState state)
{
String sf;
for (DEPNode node : state.getTree())
{
sf = node.getSimplifiedWordForm();
ambiguity_classes.add(sf, node.getPOSTag());
document.add(StringUtils.toLowerCase(sf));
}
if (++tree_count == document_size)
initDocument();
}
private void initDocument()
{
if (document != null) document_frequencies.addAll(document);
document = new HashSet<>();
tree_count = 0;
}
public void finalizeCollect()
{
finalizeCollect(ambiguity_classes.getBigramSet());
}
private void finalizeCollect(Set<String> simplifiedWordForms)
{
List<ObjectDoublePair<String>> ps;
initDocument();
for (String key : simplifiedWordForms)
{
if (!includeForm(StringUtils.toLowerCase(key))) continue;
ps = ambiguity_classes.toList(key, ambiguity_class_threshold);
if (!ps.isEmpty())
{
DSUtils.sortReverseOrder(ps);
ambiguity_class_features.put(key, Joiner.joinObject(ps, StringConst.UNDERSCORE));
}
}
}
// ============================== Getters ==============================
public String getAmbiguityClassFeature(String simplifiedWordForm)
{
return ambiguity_class_features.get(simplifiedWordForm);
}
public String[] getWordVectorPaths()
{
return word_vector_paths;
}
public boolean includeForm(String lowerSimplifiedWordForm)
{
return document_frequencies.get(lowerSimplifiedWordForm) > document_frequency_cutoff;
}
// ============================== Setters ==============================
public void setAmbiguityClassThreshold(double threshold)
{
ambiguity_class_threshold = threshold;
}
public void setDocumentFrequencyCutoff(int cutoff)
{
document_frequency_cutoff = cutoff;
}
public void setDocumentSize(int size)
{
document_size = size;
}
public void setWordVectorPaths(String[] paths)
{
word_vector_paths = paths;
}
}