/** * Copyright (C) 2012 cogroo <cogroo@cogroo.org> * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.cogroo.formats.ad; import java.io.IOException; import java.util.ArrayList; import java.util.List; import opennlp.tools.chunker.ChunkSample; import opennlp.tools.formats.ad.ADSentenceStream.Sentence; import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.Leaf; import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.Node; import opennlp.tools.util.InputStreamFactory; import opennlp.tools.util.ObjectStream; public class ADChunkBasedHeadFinderSampleStream extends ADChunk2SampleStream { List<String> newTags = null; List<String> headTags = null; public ChunkSample read() throws IOException { Sentence paragraph; while ((paragraph = this.adSentenceStream.read()) != null) { Node root = paragraph.getRoot(); List<String> sentence = new ArrayList<String>(); List<String> tags = new ArrayList<String>(); List<String> target = new ArrayList<String>(); newTags = new ArrayList<String>(); headTags = new ArrayList<String>(); processRoot(root, sentence, tags, target); if (sentence.size() > 0) { return new ChunkSample(sentence, newTags, headTags); } } return null; } public ADChunkBasedHeadFinderSampleStream(InputStreamFactory in, String charsetName) throws IOException { super(in, charsetName); } public ADChunkBasedHeadFinderSampleStream(ObjectStream<String> lineStream) { super(lineStream); } protected void processLeaf(Leaf leaf, boolean isIntermediate, String phraseTag, List<String> sentence, List<String> tags, List<String> target) { super.processLeaf(leaf, isIntermediate, phraseTag, sentence, tags, target); int i = target.size() - 1; // check the previous... if the current chunk is B- or O, and the previous is B-, that should be the head... if(i > 0 && ( target.get(i).startsWith("B-") || OTHER.equals(target.get(i)))) { String prev = target.get(i-1).substring(target.get(i-1).indexOf('|') + 1); if(prev.startsWith("B-") && !headTags.get(i-1).equals("B-H")) { headTags.set(i-1, "B-H"); } } // change the tags newTags.add(tags.get(i) + "|" + target.get(i)); // tags.set(i, tags.get(i) + "|" + target.get(i)); if (/*!isInherited &&*/ ("H".equals(leaf.getSyntacticTag()) || "MV".equals(leaf.getSyntacticTag())) && !OTHER.equals(phraseTag) && isFirstHead(target, headTags)) { headTags.add("B-H"); // target.set(i, "B-H"); } else { // target.set(i, OTHER); headTags.add(OTHER); } } private boolean isFirstHead(List<String> target, List<String> heads) { // look back for existing heads // trivial case: this is a boundary if(target.get(target.size() - 1).startsWith("B-")) return true; for(int i = target.size() - 2; i >= 0; i--) { if(target.get(i).startsWith("I-")) { if(heads.get(i).equals("B-H")) return false; } else if(target.get(i).startsWith("B-")) { if(heads.get(i).equals("B-H")) return false; else return true; } } return true; } }