/**
* Copyright (C) 2012 cogroo <cogroo@cogroo.org>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.cogroo.formats.ad;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
import org.cogroo.tools.featurizer.FeatureSample;
import opennlp.tools.formats.ad.ADSentenceStream;
import opennlp.tools.formats.ad.ADSentenceStream.Sentence;
import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.Leaf;
import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.Node;
import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.TreeElement;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
/**
 * Parser for the Floresta Sintá(c)tica Arvores Deitadas corpus; the output is
 * used for the Portuguese Featurizer training.
* <p>
* Data can be found on this web site:<br>
* http://www.linguateca.pt/floresta/corpus.html
* <p>
* Information about the format:<br>
* Susana Afonso.
* "Árvores deitadas: Descrição do formato e das opções de análise na Floresta Sintáctica"
* .<br>
* 12 de Fevereiro de 2006.
* http://www.linguateca.pt/documentos/Afonso2006ArvoresDeitadas.pdf
* <p>
* Detailed info about the NER tagset:
* http://beta.visl.sdu.dk/visl/pt/info/portsymbol.html#semtags_names
* <p>
* <b>Note:</b> Do not use this class, internal use only!
*/
public class ADFeaturizerSampleStream implements ObjectStream<FeatureSample> {

  private final ObjectStream<ADSentenceStream.Sentence> adSentenceStream;

  // Sentence window: indexes below start are skipped and reading stops once
  // index reaches end. A value of -1 disables the respective bound.
  private int start = -1;
  private int end = -1;

  // Number of sentences consumed (skipped or returned) so far.
  private int index = 0;

  // Whether multiword expressions (lexemes joined by '_') are expanded into
  // individual tokens carrying B-/I- prefixed tags.
  private final boolean expandME;

  // This is used to control changing "aspas" (quotation mark) representation:
  // in some sentences we keep the original guillemets, in others we change
  // them to a plain '"' (see processLeaf).
  private int callsCount = 0;

  /**
   * Creates a new {@link FeatureSample} stream from a line stream, i.e.
   * {@link ObjectStream}< {@link String}>, that could be a
   * {@link PlainTextByLineStream} object.
   *
   * @param lineStream
   *          a stream of lines as {@link String}
   * @param expandME
   *          whether multiword expressions should be expanded into separate
   *          tokens
   */
  public ADFeaturizerSampleStream(ObjectStream<String> lineStream,
      boolean expandME) {
    this.expandME = expandME;
    this.adSentenceStream = new ADSentenceStream(lineStream);
  }

  /**
   * Creates a new {@link FeatureSample} stream from a corpus
   * {@link InputStreamFactory}.
   *
   * @param in
   *          the Corpus {@link InputStreamFactory}
   * @param charsetName
   *          the charset of the Arvores Deitadas Corpus
   * @param expandME
   *          whether multiword expressions should be expanded into separate
   *          tokens
   * @throws IOException
   *           if the underlying line stream cannot be created
   */
  public ADFeaturizerSampleStream(InputStreamFactory in, String charsetName,
      boolean expandME) throws IOException {
    this.expandME = expandME;
    try {
      this.adSentenceStream = new ADSentenceStream(new PlainTextByLineStream(
          in, charsetName));
    } catch (UnsupportedEncodingException e) {
      // charsetName is caller-supplied, so an unsupported charset is a
      // configuration error, not an I/O failure; rethrow with the cause kept.
      throw new IllegalStateException(e);
    }
  }

  /**
   * Reads the next sentence inside the configured [start, end) window and
   * converts it to a {@link FeatureSample}.
   *
   * @return the next sample, or {@code null} when the stream or the window is
   *         exhausted
   * @throws IOException
   *           if reading from the underlying sentence stream fails
   */
  public FeatureSample read() throws IOException {
    // Incremented once per call so processLeaf can alternate the quote
    // normalization between consecutive samples.
    callsCount++;

    Sentence paragraph;
    while ((paragraph = this.adSentenceStream.read()) != null) {

      if (end > -1 && index >= end) {
        // reached the upper bound of the window
        return null;
      }

      if (start > -1 && index < start) {
        index++;
        // skip this one
      } else {
        Node root = paragraph.getRoot();
        List<String> sentence = new ArrayList<String>();
        List<String> lemma = new ArrayList<String>();
        List<String> tags = new ArrayList<String>();
        List<String> target = new ArrayList<String>();

        processRoot(root, sentence, lemma, tags, target);

        // Sentences that yield no tokens (e.g. all leaves filtered out) are
        // silently skipped instead of being returned empty.
        if (!sentence.isEmpty()) {
          index++;
          return new FeatureSample(sentence, lemma, tags, target);
        }
      }
    }
    return null;
  }

  /**
   * Walks the direct children of the sentence root, dispatching leaves and
   * phrase nodes into the parallel output lists.
   */
  private void processRoot(Node root, List<String> sentence,
      List<String> lemmas, List<String> tags, List<String> target) {
    if (root != null) {
      TreeElement[] elements = root.getElements();
      for (int i = 0; i < elements.length; i++) {
        if (elements[i].isLeaf()) {
          // Top-level leaves get the "outside" chunk tag.
          processLeaf((Leaf) elements[i], false, "O", sentence, lemmas, tags,
              target);
        } else {
          processNode((Node) elements[i], sentence, lemmas, tags, target, null);
        }
      }
    }
  }

  /**
   * Recursively processes a phrase node, propagating the chunk tag of the
   * nearest recognized ancestor phrase to leaves of unrecognized sub-phrases.
   */
  private void processNode(Node node, List<String> sentence,
      List<String> lemmas, List<String> tags, List<String> target,
      String inheritedTag) {
    String phraseTag = getChunkTag(node.getSyntacticTag());

    boolean inherited = false;
    if (phraseTag.equals("O") && inheritedTag != null) {
      // Unrecognized phrase: fall back to the enclosing phrase's tag.
      phraseTag = inheritedTag;
      inherited = true;
    }

    TreeElement[] elements = node.getElements();
    for (int i = 0; i < elements.length; i++) {
      if (elements[i].isLeaf()) {
        boolean isIntermediate = false;
        // A leaf directly following another leaf inside the same tagged
        // phrase continues the chunk rather than starting a new one.
        if (i > 0 && elements[i - 1].isLeaf() && phraseTag != null
            && !phraseTag.equals("O")) {
          isIntermediate = true;
        }
        // With an inherited tag, also continue if the previous emitted
        // target already ends with this phrase tag.
        if (inherited && target.size() > 0
            && target.get(target.size() - 1).endsWith(phraseTag)) {
          isIntermediate = true;
        }
        processLeaf((Leaf) elements[i], isIntermediate, phraseTag, sentence,
            lemmas, tags, target);
      } else {
        processNode((Node) elements[i], sentence, lemmas, tags, target,
            phraseTag);
      }
    }
  }

  /**
   * Converts one leaf into token/lemma/POS/feature entries in the parallel
   * output lists. A leaf whose functional tag converts to {@code null} is
   * dropped entirely.
   * <p>
   * NOTE(review): {@code isIntermediate} and {@code phraseTag} are currently
   * unused here — presumably kept for parity with the chunker sample stream;
   * confirm before removing.
   */
  private void processLeaf(Leaf leaf, boolean isIntermediate, String phraseTag,
      List<String> sentence, List<String> lemmas, List<String> tags,
      List<String> target) {

    String lemma = leaf.getLemma();
    String lexeme = leaf.getLexeme();
    String featureTag = leaf.getMorphologicalTag();

    // This will change half of the "aspas": every other read() call the
    // guillemets are normalized to a plain double quote.
    if ("«".equals(lexeme) || "»".equals(lexeme)) {
      if (callsCount % 2 == 0) {
        lexeme = "\"";
      }
    }

    if (featureTag == null) {
      featureTag = "-";
    } else {
      // The AD morphological tag is space-separated; join with '=' so it
      // stays a single outcome token.
      featureTag = featureTag.replace(" ", "=");
    }

    String postag;
    if (leaf.getSyntacticTag() == null) {
      // Leaves without a syntactic tag (e.g. punctuation) use the lexeme
      // itself as both POS tag and lemma.
      postag = lexeme;
      lemma = lexeme;
    } else {
      postag = convertFuncTag(leaf.getFunctionalTag());
    }

    if (postag == null) {
      return;
    }

    if (expandME && lexeme.contains("_") && !"prop".equals(postag)) {
      // Expand a multiword expression into its parts, tagging the first part
      // B-<postag> and the rest I-<postag>.
      StringTokenizer tokenizer = new StringTokenizer(lexeme, "_");
      if (tokenizer.countTokens() > 0) {
        List<String> toks = new ArrayList<String>(tokenizer.countTokens());
        List<String> tagsWithCont = new ArrayList<String>(
            tokenizer.countTokens());
        toks.add(tokenizer.nextToken());
        tagsWithCont.add("B-" + postag);
        target.add(featureTag);
        while (tokenizer.hasMoreTokens()) {
          toks.add(tokenizer.nextToken());
          tagsWithCont.add("I-" + postag);
          target.add(featureTag);
        }
        // NOTE(review): each part is used as its own lemma; the lemma of the
        // joined expression is dropped here — confirm this is intended.
        lemmas.addAll(toks);
        sentence.addAll(toks);
        tags.addAll(tagsWithCont);
      } else {
        addToken(lexeme, lemma, postag, featureTag, sentence, lemmas, tags,
            target);
      }
    } else {
      addToken(lexeme, lemma, postag, featureTag, sentence, lemmas, tags,
          target);
    }
  }

  // Appends one token and its annotations to the parallel output lists.
  private static void addToken(String lexeme, String lemma, String postag,
      String featureTag, List<String> sentence, List<String> lemmas,
      List<String> tags, List<String> target) {
    sentence.add(lexeme);
    lemmas.add(lemma);
    target.add(featureTag);
    tags.add(postag);
  }

  /**
   * Converts the AD functional tag into the POS tag used by the featurizer.
   * Currently an identity mapping.
   */
  private static String convertFuncTag(String t) {
    // XXX: this should be removed when using Floresta tagger!
    return t;
  }

  /**
   * Extracts the chunk tag from an AD syntactic tag: the text after the last
   * ':' upper-cased if it is a tracked phrase type, otherwise "O".
   */
  private String getChunkTag(String tag) {
    String phraseTag = tag.substring(tag.lastIndexOf(":") + 1);

    // maybe we should use only np, vp and pp, but will keep ap and advp.
    if (phraseTag.equals("np") || phraseTag.equals("vp")
        || phraseTag.equals("pp") || phraseTag.equals("ap")
        || phraseTag.equals("advp")) {
      phraseTag = phraseTag.toUpperCase();
    } else {
      phraseTag = "O";
    }
    return phraseTag;
  }

  /** Sets the first sentence index to emit; -1 disables the lower bound. */
  public void setStart(int aStart) {
    this.start = aStart;
  }

  /** Sets the sentence index (exclusive) to stop at; -1 disables the bound. */
  public void setEnd(int aEnd) {
    this.end = aEnd;
  }

  public void reset() throws IOException, UnsupportedOperationException {
    adSentenceStream.reset();
  }

  public void close() throws IOException {
    adSentenceStream.close();
  }
}