/*******************************************************************************
 *  Copyright 2007, 2009 Jorge Villalon (jorge.villalon@uai.cl)
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 *******************************************************************************/
package tml.annotators;

import java.io.IOException;
import java.io.ObjectInputStream;

import org.apache.log4j.Logger;

import tml.Configuration;
import tml.utils.StanfordUtils;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import java.io.FileInputStream;

/**
 * Annotator that produces Penn Treebank style parse trees using the Stanford
 * parser. It obtains the Penn tree string for a text and returns it so it can
 * be stored for further processing.
 *
 * <p>The Stanford parser, the treebank language pack and the grammatical
 * structure factory are kept in static fields that are shared by every
 * instance. They are populated by {@link #init()}, either explicitly or
 * lazily the first time one of the static getters finds its field unset.</p>
 *
 * @author Jorge Villalon
 */
public class PennTreeAnnotator extends AbstractAnnotator implements Annotator {

    /** Name of the field under which this annotator registers itself. */
    public static String FIELD_NAME = "penntree";

    /** Types handed to the superclass constructor. */
    private static final String[] types = {"sentence"};
    private static final Logger logger = Logger.getLogger(PennTreeAnnotator.class);

    // Shared state, written only by init().
    private static LexicalizedParser parser = null;
    private static TreebankLanguagePack treeBankLanguagePack = null;
    private static GrammaticalStructureFactory grammaticalStructureFactory = null;

    /** Whether {@link #init()} loads the factored model instead of the PCFG one. */
    private boolean lexicalized = false;

    /**
     * Returns the shared grammatical structure factory, running a default
     * initialization first if it has not been set yet.
     *
     * @return the grammaticalStructureFactory, or <code>null</code> if the
     *         initialization failed (failures are logged by {@link #init()})
     * @throws IOException if creating the annotator used for the lazy
     *         initialization fails
     */
    public static GrammaticalStructureFactory getGrammaticalStructureFactory() throws IOException {
        if (grammaticalStructureFactory == null) {
            initializeWithDefaults();
        }
        return grammaticalStructureFactory;
    }

    /**
     * Returns the shared Stanford parser, running a default initialization
     * first if it has not been set yet.
     *
     * @return the lexicalizedParser, or <code>null</code> if the
     *         initialization failed (failures are logged by {@link #init()})
     * @throws IOException if creating the annotator used for the lazy
     *         initialization fails
     */
    public static LexicalizedParser getParser() throws IOException {
        if (parser == null) {
            initializeWithDefaults();
        }
        return parser;
    }

    /**
     * Returns the shared treebank language pack, running a default
     * initialization first if it has not been set yet.
     *
     * @return the treeBankLanguagePack, or <code>null</code> if the
     *         initialization failed (failures are logged by {@link #init()})
     * @throws IOException if creating the annotator used for the lazy
     *         initialization fails
     */
    public static TreebankLanguagePack getTreeBankLanguagePack() throws IOException {
        if (treeBankLanguagePack == null) {
            initializeWithDefaults();
        }
        return treeBankLanguagePack;
    }

    /**
     * Runs {@link #init()} on a freshly created annotator, i.e. with the
     * default (non lexicalized) setting. Shared by the static getters.
     *
     * @throws IOException if the annotator cannot be created
     */
    private static void initializeWithDefaults() throws IOException {
        logger.debug("PennTreeAnnotator was not initialized, initializing");
        new PennTreeAnnotator().init();
    }

    /**
     * Reads a serialized Stanford parser from the given file and makes sure
     * the file handle is released whether or not loading succeeds.
     *
     * @param parserFile path of the serialized parser
     * @return the parser read from the file
     * @throws Exception if the file cannot be opened or the parser cannot be
     *         read from it
     */
    private static LexicalizedParser loadParser(String parserFile) throws Exception {
        FileInputStream fileStream = new FileInputStream(parserFile);
        try {
            // NOTE(review): assumes the constructor finishes reading the
            // stream before returning, so closing afterwards is safe.
            return new LexicalizedParser(new ObjectInputStream(fileStream));
        } finally {
            // Closing the file stream releases the handle even when the
            // ObjectInputStream wrapper could not be created.
            fileStream.close();
        }
    }

    /**
     * Creates the annotator for the {@link #FIELD_NAME} field.
     *
     * @throws IOException declared for callers; propagated from the superclass
     *         constructor if it throws
     */
    public PennTreeAnnotator() throws IOException {
        super(FIELD_NAME, types);
    }

    /**
     * Not implemented yet.
     *
     * @param annotationLabel ignored
     * @return always <code>null</code>
     */
    @Override
    public String[] getAnnotatedText(String annotationLabel) {
        // TODO Redo when UIMA is added
        return null;
    }

    /**
     * Obtains the Penn tree of the text through {@link StanfordUtils} and
     * returns its string form.
     *
     * @param text the text to annotate
     * @return the Penn tree string, or <code>null</code> if an
     *         {@link IOException} occurred (the error is logged)
     */
    @Override
    public String getAnnotations(String text) {
        try {
            return StanfordUtils.getPennString(StanfordUtils.getPennTree(text));
        } catch (IOException e) {
            // Pass the exception as throwable so the stack trace is logged too.
            logger.error("Couldn't obtain the Penn tree annotation", e);
            return null;
        }
    }

    /**
     * @return always <code>null</code>, this annotator has no schema
     */
    @Override
    public Object getSchema() {
        // We don't have a schema, we should use UIMA
        return null;
    }

    /**
     * Loads the Stanford parser from the TML folder and sets up the shared
     * parser, language pack and grammatical structure factory.
     *
     * <p>The factored model (<code>englishFactored.ser</code>) is used when
     * {@link #isLexicalized()} is true, the PCFG model
     * (<code>englishPCFG.ser</code>) otherwise. If the TML properties or the
     * parser file cannot be loaded the problem is logged and the method
     * returns without touching the shared fields.</p>
     */
    @Override
    public void init() {
        try {
            Configuration.getTmlProperties();
        } catch (IOException e) {
            logger.error("No properties", e);
            return;
        }

        String modelFile = this.isLexicalized() ? "englishFactored.ser" : "englishPCFG.ser";
        String parserFile = Configuration.getTmlFolder() + "/stanford/" + modelFile;

        LexicalizedParser loadedParser;
        try {
            loadedParser = loadParser(parserFile);
        } catch (Exception e) {
            logger.error("Couldn't load Stanford parser! " + e.getLocalizedMessage(), e);
            return;
        }

        // Configure the parser before publishing it in the static field, so
        // the shared reference never points to a parser without its flags.
        loadedParser.setOptionFlags(new String[] { "-maxLength", "800", "-retainTmpSubcategories" });
        parser = loadedParser;

        treeBankLanguagePack = new PennTreebankLanguagePack();
        grammaticalStructureFactory = treeBankLanguagePack.grammaticalStructureFactory();

        logger.info("PennTreeAnnotator initialized, using " + parserFile);
    }

    /**
     * @return true if {@link #init()} will load the factored (lexicalized)
     *         model, false if it will load the PCFG model
     */
    public boolean isLexicalized() {
        return lexicalized;
    }

    /**
     * Chooses which model {@link #init()} loads. It has no effect on a parser
     * that is already loaded until {@link #init()} is called again.
     *
     * @param lexicalized true to use the factored (lexicalized) model
     */
    public void setLexicalized(boolean lexicalized) {
        this.lexicalized = lexicalized;
    }
}