/** * Copyright (C) 2012 cogroo <cogroo@cogroo.org> * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.cogroo.formats.ad; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeSet; import java.util.concurrent.atomic.AtomicInteger; import org.cogroo.tools.featurizer.WordTag; import org.cogroo.tools.shallowparser.ShallowParserSequenceValidator; import opennlp.tools.chunker.ChunkSample; import opennlp.tools.formats.ad.ADSentenceStream.Sentence; import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.Leaf; import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.Node; import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.TreeElement; import opennlp.tools.namefind.NameSample; import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; import opennlp.tools.util.Span; /** * Parser for Floresta Sita(c)tica Arvores Deitadas corpus, output to for the * Portuguese Chunker training. * <p> * The heuristic to extract chunks where based o paper 'A Machine Learning * Approach to Portuguese Clause Identification', (Eraldo Fernandes, Cicero * Santos and Ruy Milidiú).<br> * <p> * Data can be found on this web site:<br> * http://www.linguateca.pt/floresta/corpus.html * <p> * Information about the format:<br> * Susana Afonso. * "Árvores deitadas: Descrição do formato e das opções de análise na Floresta Sintáctica" * .<br> * 12 de Fevereiro de 2006. * http://www.linguateca.pt/documentos/Afonso2006ArvoresDeitadas.pdf * <p> * Detailed info about the NER tagset: * http://beta.visl.sdu.dk/visl/pt/info/portsymbol.html#semtags_names * <p> * <b>Note:</b> Do not use this class, internal use only! */ public class ADChunkBasedShallowParserSampleStream extends ADChunk2SampleStream { private final Set<String> functTagSet; private String[] defaultFunctTags = { "SUBJ", "ACC", "DAT", "PIV", "ADVS", "ADVO", "SC", "OC", "P", "NPHR", "SA", "ADVL", "APP", // "MV","PMV", "PAUX", "AUX", }; private boolean readChunk; private ShallowParserSequenceValidator sv = new ShallowParserSequenceValidator(); private ArrayList<String> chunks; private SubjectTypes subjectTypes = new SubjectTypes(); public ADChunkBasedShallowParserSampleStream(ObjectStream<String> lineStream, String commaSeparatedFunctTags, boolean isIncludePOSTags, boolean useCGTag, boolean expandME) { super(lineStream); if (commaSeparatedFunctTags == null || commaSeparatedFunctTags.trim().isEmpty()) { Set<String> functTagsSet = new HashSet<String>(); functTagsSet.addAll(Arrays.asList(defaultFunctTags)); functTagSet = Collections.unmodifiableSet(functTagsSet); } else { String[] tags = commaSeparatedFunctTags.split(","); Set<String> functTagsSet = new HashSet<String>(); functTagsSet.addAll(Arrays.asList(tags)); functTagSet = Collections.unmodifiableSet(functTagsSet); } } /** * Creates a new {@link NameSample} stream from a {@link InputStream} * * @param in * the Corpus {@link InputStream} * @param charsetName * the charset of the Arvores Deitadas Corpus */ public ADChunkBasedShallowParserSampleStream(InputStreamFactory in, String charsetName, String commaSeparatedFunctTags, boolean isIncludePOSTags, boolean useCGTag, boolean expandME) throws IOException { super(in, charsetName); if (commaSeparatedFunctTags == null || commaSeparatedFunctTags.trim().isEmpty()) { Set<String> functTagsSet = new HashSet<String>(); functTagsSet.addAll(Arrays.asList(defaultFunctTags)); functTagSet = Collections.unmodifiableSet(functTagsSet); } else { String[] tags = commaSeparatedFunctTags.split(","); Set<String> functTagsSet = new HashSet<String>(); functTagsSet.addAll(Arrays.asList(tags)); functTagSet = Collections.unmodifiableSet(functTagsSet); } } public ChunkSample read() throws IOException { Sentence paragraph; while ((paragraph = this.adSentenceStream.read()) != null) { this.readChunk = true; Node root = paragraph.getRoot(); List<String> sentence = new ArrayList<String>(); List<String> tags = new ArrayList<String>(); chunks = new ArrayList<String>(); processRoot(root, sentence, tags, chunks); this.readChunk = false; sentence.clear(); tags.clear(); List<String> target = new ArrayList<String>(); processRoot(root, sentence, tags, target); for (int i = 0; i < tags.size(); i++) { tags.set(i, tags.get(i) + "|" + chunks.get(i)); } if (sentence.size() > 0) { ChunkSample cs = new ChunkSample(sentence, tags, target); // System.out.println(cs); for (int i = 0; i < sentence.size(); i++) { String[] outcomes; if(i > 0) { outcomes = target.subList(0, i).toArray(new String[i]); } else { outcomes = new String[0]; } if(!sv.validSequence(i, WordTag.create(cs), outcomes, target.get(i))) { //sv.validSequence(i, WordTag.create(cs), outcomes, target.get(i)); System.out.println("failed, invalid outcome: " + target.get(i)); } } // System.out.println(cs); // this.subjectTypes.add(cs); return cs; } } // this.subjectTypes.print(); return null; } protected String getChunkTag(Leaf leaf) { if(this.readChunk) return super.getChunkTag(leaf); String tag = leaf.getSyntacticTag(); if(functTagSet.contains(tag)) { return tag; } return null; } @Override protected String getChunkTag(Node node, String parent, int index) { if(this.readChunk) return super.getChunkTag(node, parent, index); else { String tag = node.getSyntacticTag(); String funcTag = tag.substring(0, tag.lastIndexOf(":")); if (!functTagSet.contains(funcTag)) { funcTag = "O"; } if(funcTag.equals(parent)) return "O"; if(funcTag.equals("O")) return funcTag; // check for nested... // we can check the index, and the size of this node (number of leafs) int leafs = countLeafs(node); // check if we have a complete chunk group inside String s = chunks.get(index); boolean valid = s.equals("O") || s.startsWith("B-"); if(valid) { if(chunks.size() == index + leafs) { // last chunk... return funcTag; } String end1 = chunks.get(index + leafs); valid = end1.equals("O") || end1.startsWith("B-"); } if(valid) return funcTag; return "O"; } } private int countLeafs(Node node) { int counter = 0; for (TreeElement element : node.getElements()) { if(element.isLeaf()) { counter++; } else { counter += countLeafs((Node)element); } } return counter; } protected String getPhraseTagFromPosTag(String functionalTag) { return OTHER; } @Override protected boolean isIncludePunctuations() { if(this.readChunk) return super.isIncludePunctuations(); return true; } static class SubjectTypes { private Map<String, AtomicInteger> subjects = new HashMap<String, AtomicInteger>(); private Map<String, String> examples = new HashMap<String, String>(); public void add(ChunkSample sample) { for (Span subj : sample.getPhrasesAsSpanList()) { if(subj.getType().equals("SUBJ")) { String[] chunks = extractChunk(Arrays.copyOfRange(sample.getTags(), subj.getStart(), subj.getEnd())); Span[] c = ChunkSample.phrasesAsSpanList(chunks, chunks, chunks); StringBuilder sb = new StringBuilder(); for (Span span : c) { sb.append(span.getType()).append(" "); } String value = sb.toString().trim(); if(!subjects.containsKey(value)) { subjects.put(value, new AtomicInteger(1)); examples.put(value, Arrays.toString(Arrays.copyOfRange(sample.getSentence(), subj.getStart(), subj.getEnd()))); } else { subjects.get(value).incrementAndGet(); } } } } public void print() { Set<String> chunks = new TreeSet<String>(new Comparator<String>() { @Override public int compare(String arg0, String arg1) { if(arg0.equals(arg1)) return 0; return subjects.get(arg0).intValue() - subjects.get(arg1).intValue(); } }); chunks.addAll(subjects.keySet()); for (String string : chunks) { System.out.println(string + " -> " + subjects.get(string) + "->" + examples.get(string)); } } private String[] extractChunk(String[] postags) { String[] out = new String[postags.length]; for (int i = 0; i < postags.length; i++) { out[i] = extractChunk(postags[i]); } return out; } private String extractChunk(String postag) { int i = postag.indexOf('|'); return postag.substring(i + 1); } } }