/**
* Copyright 2015, Emory University
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.emory.clir.clearnlp.experiment;
import java.io.PrintStream;
import java.util.List;
import edu.emory.clir.clearnlp.collection.ngram.Unigram;
import edu.emory.clir.clearnlp.collection.pair.ObjectIntPair;
import edu.emory.clir.clearnlp.constituent.CTLib;
import edu.emory.clir.clearnlp.constituent.CTLibEn;
import edu.emory.clir.clearnlp.constituent.CTNode;
import edu.emory.clir.clearnlp.constituent.CTReader;
import edu.emory.clir.clearnlp.constituent.CTTree;
import edu.emory.clir.clearnlp.pos.POSLibEn;
import edu.emory.clir.clearnlp.util.DSUtils;
import edu.emory.clir.clearnlp.util.IOUtils;
import edu.emory.clir.clearnlp.util.StringUtils;
import edu.emory.clir.clearnlp.util.constant.StringConst;
/**
* @author Jinho D. Choi ({@code jinho.choi@emory.edu})
*/
public class MWEExtractor
{
public void extract(String filename)
{
CTReader reader = new CTReader(IOUtils.createFileInputStream(filename));
Unigram<String> qpPre = new Unigram<>();
Unigram<String> qpPost = new Unigram<>();
CTTree tree;
while ((tree = reader.nextTree()) != null)
extract(tree.getRoot(), qpPre, qpPost);
printMap(qpPre , filename+".qp_pre");
printMap(qpPost, filename+".qp_post");
}
private void printMap(Unigram<String> map, String outputFile)
{
PrintStream fout = IOUtils.createBufferedPrintStream(outputFile);
List<ObjectIntPair<String>> list = map.toList(0);
DSUtils.sortReverseOrder(list);
for (ObjectIntPair<String> p : list)
fout.println(p.o+" "+p.i);
fout.close();
}
public void extract(CTNode node, Unigram<String> qpPre, Unigram<String> qpPost)
{
if (node.isConstituentTag(CTLibEn.C_QP))
extractQP(node, qpPre, qpPost);
else
{
for (CTNode child : node.getChildrenList())
extract(child, qpPre, qpPost);
}
}
private void extractQP(CTNode node, Unigram<String> pre, Unigram<String> post)
{
List<CTNode> tokens = node.getTokenList();
int i, size = tokens.size();
CTNode token;
for (i=0; i<size; i++)
{
token = tokens.get(i);
if (token.isConstituentTagAny(POSLibEn.POS_CD, POSLibEn.POS_DOLLAR))
{
if (i > 0) pre.add(StringUtils.toLowerCase(CTLib.toForms(tokens, 0, i, StringConst.SPACE)));
break;
}
}
for (i=size-1; i>=0; i--)
{
token = tokens.get(i);
if (token.isConstituentTag(POSLibEn.POS_CD))
{
if (i+1 < size) post.add(StringUtils.toLowerCase(CTLib.toForms(tokens, i+1, size, StringConst.SPACE)));
break;
}
}
}
static public void main(String[] args)
{
MWEExtractor mwe = new MWEExtractor();
mwe.extract("/Users/jdchoi/Documents/Data/ontonotes/data/english/onto.parse");
}
}