/** * Copyright 2014, Emory University * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package edu.emory.clir.clearnlp.feature.common; import java.io.InputStream; import org.w3c.dom.Element; import edu.emory.clir.clearnlp.collection.pair.Pair; import edu.emory.clir.clearnlp.component.state.AbstractState; import edu.emory.clir.clearnlp.component.utils.GlobalLexica; import edu.emory.clir.clearnlp.dependency.DEPNode; import edu.emory.clir.clearnlp.feature.AbstractFeatureExtractor; import edu.emory.clir.clearnlp.feature.type.DirectionType; import edu.emory.clir.clearnlp.feature.type.FieldType; import edu.emory.clir.clearnlp.util.StringUtils; /** * @since 3.0.0 * @author Jinho D. Choi ({@code jinho.choi@emory.edu}) */ public class CommonFeatureExtractor<StateType extends AbstractState<?,?>> extends AbstractFeatureExtractor<CommonFeatureTemplate,CommonFeatureToken,StateType> { private static final long serialVersionUID = -3522042349865325347L; public CommonFeatureExtractor(InputStream in) { super(in); } @Override protected CommonFeatureTemplate createFeatureTemplate(Element eFeature) { return new CommonFeatureTemplate(eFeature); } @Override @SuppressWarnings("unchecked") protected String getFeature(CommonFeatureToken token, StateType state, DEPNode node) { switch (token.getField()) { case f : return node.getWordForm(); case f2: return node.getSimplifiedWordForm(); case f3: return node.getLowerSimplifiedWordForm(); case f4: return node.getWordShape(2); case pf: return StringUtils.getPrefix(node.getSimplifiedWordForm(), (int)token.getValue()); case sf: return StringUtils.getSuffix(node.getSimplifiedWordForm(), (int)token.getValue()); case m : return node.getLemma(); case p : return node.getPOSTag(); case n : return node.getNamedEntityTag(); case d : return node.getLabel(); case v : return node.getValency((DirectionType)token.getValue()); case ft: return node.getFeat((String)token.getValue()); case subcat: Pair<DirectionType,FieldType> p = (Pair<DirectionType,FieldType>)token.getValue(); return node.getSubcategorization(p.o1, p.o2); case b : return getBooleanFeatureValue(token, state, node); default: return null; } } @Override protected String[] getFeatures(CommonFeatureToken token, StateType state, DEPNode node) { switch (token.getField()) { case ds : return toLabelArray(node.getDependentList(), (FieldType)token.getValue()); case ds2 : return toLabelArray(node.getGrandDependentList(), (FieldType)token.getValue()); case dsw : return GlobalLexica.getDistributionalSemanticFeatures((int)token.getValue(), node.getWordForm()); case dsls: return GlobalLexica.getDistributionalSemanticFeatures((int)token.getValue(), node.getLowerSimplifiedWordForm()); case orth: return getOrthographicFeatures(state, node); default : return null; } } protected String getBooleanFeatureValue(CommonFeatureToken token, StateType state, DEPNode node) { int field = (int)token.getValue(); boolean b = false; switch (field) { case 0: b = state.isFirstNode(node); break; case 1: b = state.isLastNode(node); break; default: throw new IllegalArgumentException("Unsupported feature: b"+token.getValue()); } return b ? token.getBinaryFeatureKey() : null; } }