/**
* Copyright (C) 2012 cogroo <cogroo@cogroo.org>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.cogroo.formats.ad;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import opennlp.tools.formats.ad.ADSentenceStream;
import opennlp.tools.formats.ad.ADSentenceStream.Sentence;
import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.Leaf;
import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.Node;
import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.TreeElement;
import opennlp.tools.formats.ad.PortugueseContractionUtility;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
/**
 * Parser for the Floresta Sintá(c)tica "Árvores Deitadas" corpus; produces
 * output for Portuguese NER training.
* <p>
* The data contains common multiword expressions. The categories are:<br>
* intj, spec, conj-s, num, pron-indef, n, prop, adj, prp, adv
* <p>
* Data can be found on this web site:<br>
* http://www.linguateca.pt/floresta/corpus.html
* <p>
* Information about the format:<br>
* Susana Afonso.
* "Árvores deitadas: Descrição do formato e das opções de análise na Floresta Sintáctica"
* .<br>
* 12 de Fevereiro de 2006.
* http://www.linguateca.pt/documentos/Afonso2006ArvoresDeitadas.pdf
* <p>
* Detailed info about the NER tagset:
* http://beta.visl.sdu.dk/visl/pt/info/portsymbol.html#semtags_names
* <p>
* <b>Note:</b> Do not use this class, internal use only!
*/
public class ADExpNameSampleStream implements ObjectStream<NameSample> {
private final ObjectStream<ADSentenceStream.Sentence> adSentenceStream;
/**
* To keep the last left contraction part
*/
private String leftContractionPart = null;
/**
* The tags we are looking for
*/
private Set<String> tags;
private final boolean useAdaptativeFeatures;
/**
* Creates a new {@link NameSample} stream from a line stream, i.e.
* {@link ObjectStream}< {@link String}>, that could be a
* {@link PlainTextByLineStream} object.
*
* @param lineStream
* a stream of lines as {@link String}
* @param tags
* the tags we are looking for, or null for all
*/
public ADExpNameSampleStream(ObjectStream<String> lineStream,
Set<String> tags, boolean useAdaptativeFeatures) {
this.adSentenceStream = new ADSentenceStream(lineStream);
this.tags = tags;
this.useAdaptativeFeatures = useAdaptativeFeatures;
}
/**
* Creates a new {@link NameSample} stream from a {@link InputStream}
*
* @param in
* the Corpus {@link InputStream}
* @param charsetName
* the charset of the Arvores Deitadas Corpus
* @param tags
* the tags we are looking for, or null for all
*/
public ADExpNameSampleStream(InputStreamFactory in, String charsetName,
Set<String> tags, boolean useAdaptativeFeatures) throws IOException {
this.useAdaptativeFeatures = useAdaptativeFeatures;
try {
this.adSentenceStream = new ADSentenceStream(new PlainTextByLineStream(
in, charsetName));
this.tags = tags;
} catch (UnsupportedEncodingException e) {
// UTF-8 is available on all JVMs, will never happen
throw new IllegalStateException(e);
}
}
int textID = -1;
public NameSample read() throws IOException {
Sentence paragraph;
while ((paragraph = this.adSentenceStream.read()) != null) {
boolean clearData = false;
if (useAdaptativeFeatures) {
int currentTextID = getTextID(paragraph);
if (currentTextID != textID) {
clearData = true;
textID = currentTextID;
}
} else {
clearData = true;
}
Node root = paragraph.getRoot();
List<String> sentence = new ArrayList<String>();
List<Span> names = new ArrayList<Span>();
process(root, sentence, names);
return new NameSample(sentence.toArray(new String[sentence.size()]),
names.toArray(new Span[names.size()]), clearData);
}
return null;
}
enum Type {
ama, cie, lit
}
private Type corpusType = null;
private Pattern metaPattern;
private int textIdMeta2 = -1;
private String textMeta2 = "";
private int getTextID(Sentence paragraph) {
String meta = paragraph.getMetadata();
if (corpusType == null) {
if (meta.startsWith("LIT")) {
corpusType = Type.lit;
metaPattern = Pattern.compile("^([a-zA-Z\\-]+)(\\d+).*?p=(\\d+).*");
} else if (meta.startsWith("CIE")) {
corpusType = Type.cie;
metaPattern = Pattern.compile("^.*?source=\"(.*?)\".*");
} else { // ama
corpusType = Type.ama;
metaPattern = Pattern.compile("^(?:[a-zA-Z\\-]*(\\d+)).*?p=(\\d+).*");
}
}
if (corpusType.equals(Type.lit)) {
Matcher m2 = metaPattern.matcher(meta);
if (m2.matches()) {
String textId = m2.group(1);
if (!textId.equals(textMeta2)) {
textIdMeta2++;
textMeta2 = textId;
}
return textIdMeta2;
} else {
throw new RuntimeException("Invalid metadata: " + meta);
}
} else if (corpusType.equals(Type.cie)) {
Matcher m2 = metaPattern.matcher(meta);
if (m2.matches()) {
String textId = m2.group(1);
if (!textId.equals(textMeta2)) {
textIdMeta2++;
textMeta2 = textId;
}
return textIdMeta2;
} else {
throw new RuntimeException("Invalid metadata: " + meta);
}
} else if (corpusType.equals(Type.ama)) {
Matcher m2 = metaPattern.matcher(meta);
if (m2.matches()) {
return Integer.parseInt(m2.group(1));
// currentPara = Integer.parseInt(m.group(2));
} else {
throw new RuntimeException("Invalid metadata: " + meta);
}
}
return 0;
}
/**
* Recursive method to process a node in Arvores Deitadas format.
*
* @param node
* the node to be processed
* @param sentence
* the sentence tokens we got so far
* @param names
* the names we got so far
*/
private void process(Node node, List<String> sentence, List<Span> names) {
if (node != null) {
for (TreeElement element : node.getElements()) {
if (element.isLeaf()) {
processLeaf((Leaf) element, sentence, names);
} else {
process((Node) element, sentence, names);
}
}
}
}
/**
* Process a Leaf of Arvores Detaitadas format
*
* @param leaf
* the leaf to be processed
* @param sentence
* the sentence tokens we got so far
* @param names
* the names we got so far
*/
private void processLeaf(Leaf leaf, List<String> sentence, List<Span> names) {
if (leaf != null && leftContractionPart == null) {
String namedEntityTag = null;
int startOfNamedEntity = -1;
String leafTag = leaf.getSecondaryTag();
if (leafTag != null) {
if (leafTag.contains("<sam->")) {
String[] lexemes = leaf.getLexeme().split("_");
if (lexemes.length > 1) {
for (int i = 0; i < lexemes.length - 1; i++) {
sentence.add(lexemes[i]);
}
}
leftContractionPart = lexemes[lexemes.length - 1];
return;
}
if (leaf.getLexeme().contains("_") && leaf.getLexeme().length() > 3) {
String tag = leaf.getFunctionalTag();
if (tags != null) {
if (tags.contains(tag)) {
namedEntityTag = leaf.getFunctionalTag();
}
} else {
namedEntityTag = leaf.getFunctionalTag();
}
}
}
if (namedEntityTag != null) {
startOfNamedEntity = sentence.size();
}
sentence.addAll(Arrays.asList(leaf.getLexeme().split("_")));
if (namedEntityTag != null) {
names
.add(new Span(startOfNamedEntity, sentence.size(), namedEntityTag));
}
} else {
// will handle the contraction
String tag = leaf.getSecondaryTag();
String right = leaf.getLexeme();
if (tag != null && tag.contains("<-sam>")) {
right = leaf.getLexeme();
String c = PortugueseContractionUtility.toContraction(
leftContractionPart, right);
if (c != null) {
sentence.add(c);
} else {
System.err.println("missing " + leftContractionPart + " + " + right);
sentence.add(leftContractionPart);
sentence.add(right);
}
} else {
System.err.println("unmatch" + leftContractionPart + " + " + right);
}
leftContractionPart = null;
}
}
public void reset() throws IOException, UnsupportedOperationException {
adSentenceStream.reset();
}
public void close() throws IOException {
adSentenceStream.close();
}
}