/**
* Copyright (c) 2010, Regents of the University of Colorado All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer. Redistributions in binary
* form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials provided
* with the distribution. Neither the name of the University of Colorado at
* Boulder nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
package clear.ftr.xml;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
/**
* Abstract feature template.
*
* @author Jinho D. Choi <b>Last update:</b> 4/12/2011
*/
public abstract class AbstractFtrXml {
    protected static final String TEMPLATE = "feature_template";
    /** Name of the element holding the global cutoffs (see {@link #initCutoffs(Document)}). */
    protected static final String CUTOFF = "cutoff";
    /** Name of the label-cutoff attribute on the {@link #CUTOFF} element. */
    protected static final String LABEL = "label";
    /**
     * N-gram feature: name of the n-gram template elements, and also of the
     * n-gram cutoff attribute on the {@link #CUTOFF} element.
     */
    protected static final String NGRAM = "ngram";
    /**
     * Extra feature: name of the extra-feature cutoff attribute on the
     * {@link #CUTOFF} element.
     */
    protected static final String EXTRA = "extra";
    /**
     * Number of tokens in a template (required integer attribute).
     */
    protected static final String N = "n";
    /**
     * Cutoff (&gt;= 0) of a single template; treated as 0 when the attribute is absent.
     */
    protected static final String C = "c";
    /**
     * Type (e.g., "pp", "ump"). When absent, the type is the concatenation of
     * the template's token fields.
     */
    protected static final String T = "t";
    /**
     * Prefix of the per-token attributes: the i-th token specification is read
     * from the attribute named {@code F + i} (e.g., "f0", "f1").
     */
    protected static final String F = "f";
    /**
     * "true" | "false"; a template whose value is "false" is skipped.
     */
    protected static final String VISIBLE = "visible";
    /**
     * Field delimiter (e.g., "l+1:f").
     */
    protected static final String DELIM_F = ":";
    /**
     * Relation delimiter (e.g., "l_hd").
     */
    protected static final String DELIM_R = "_";

    /**
     * N-gram feature [type][templates]; the first dimension follows the
     * natural (sorted) order of the template types.
     */
    public FtrTemplate[][] a_ngram_templates;
    public int n_cutoff_label;
    public int n_cutoff_ngram;
    public int n_cutoff_extra;

    /**
     * Loads the feature template from a file.
     * <p>
     * The file is closed once it has been parsed. If the file cannot be opened
     * (or closed), the stack trace is printed and this instance is left
     * uninitialized; parse failures are handled by {@link #init(InputStream)}.
     *
     * @param featureXml path of the feature template XML file
     */
    public AbstractFtrXml(String featureXml) {
        // try-with-resources: the original never closed the stream it opened here.
        try (InputStream fin = new FileInputStream(featureXml)) {
            init(fin);
        } catch (FileNotFoundException e) {
            // The template file does not exist or cannot be opened.
            e.printStackTrace();
        } catch (IOException e) {
            // Closing the stream failed; parsing itself has already completed.
            e.printStackTrace();
        }
    }

    /**
     * Loads the feature template from an already opened stream.
     *
     * @param fin stream providing the feature template XML; it is not closed here
     */
    public AbstractFtrXml(InputStream fin) {
        init(fin);
    }

    /**
     * Parses the XML document and initializes cutoffs, n-gram templates and the
     * subclass-specific features, in that order.
     * <p>
     * Any failure prints the stack trace and terminates the JVM with status 1.
     * The stream is not closed by this method.
     *
     * @param fin stream providing the feature template XML
     */
    public void init(InputStream fin) {
        // NOTE(review): the parser uses the factory's default settings (DTDs and
        // external entities are not explicitly disabled) - consider hardening it
        // if templates can come from untrusted sources.
        DocumentBuilderFactory dFactory = DocumentBuilderFactory.newInstance();
        try {
            DocumentBuilder builder = dFactory.newDocumentBuilder();
            Document doc = builder.parse(fin);
            initCutoffs(doc);
            initNgrams(doc);
            initFeatures(doc);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Initializes cutoffs from the first {@link #CUTOFF} element. Cutoffs whose
     * attribute is absent are set to 0; if the document has no such element the
     * cutoff fields are left untouched.
     *
     * @param doc parsed feature template document
     * @throws Exception if a cutoff attribute is not a valid integer
     */
    protected void initCutoffs(Document doc) throws Exception {
        NodeList eList = doc.getElementsByTagName(CUTOFF);
        if (eList.getLength() <= 0) {
            return;
        }
        Element eCutoff = (Element) eList.item(0);
        n_cutoff_label = intAttribute(eCutoff, LABEL, 0);
        n_cutoff_ngram = intAttribute(eCutoff, NGRAM, 0);
        n_cutoff_extra = intAttribute(eCutoff, EXTRA, 0);
    }

    /**
     * Initializes {@link #a_ngram_templates} from all {@link #NGRAM} elements,
     * skipping those marked {@code visible="false"}. Templates are grouped by
     * type, and the groups are stored in sorted order of their types.
     *
     * @param doc parsed feature template document
     * @throws Exception if a template cannot be converted
     */
    protected void initNgrams(Document doc) throws Exception {
        NodeList eList = doc.getElementsByTagName(NGRAM);
        Map<String, List<FtrTemplate>> byType = new HashMap<>();

        for (int i = 0, n = eList.getLength(); i < n; i++) {
            Element eFeature = (Element) eList.item(i);
            if ("false".equals(eFeature.getAttribute(VISIBLE).trim())) {
                continue;
            }
            FtrTemplate ftr = getFtrTemplate(eFeature);
            List<FtrTemplate> group = byType.get(ftr.type);
            if (group == null) {
                group = new ArrayList<>();
                byType.put(ftr.type, group);
            }
            group.add(ftr);
        }

        // Sort the types so the array layout does not depend on hash order.
        List<String> types = new ArrayList<>(byType.keySet());
        Collections.sort(types);

        a_ngram_templates = new FtrTemplate[types.size()][];
        for (int i = 0; i < types.size(); i++) {
            List<FtrTemplate> group = byType.get(types.get(i));
            a_ngram_templates[i] = group.toArray(new FtrTemplate[group.size()]);
        }
    }

    /**
     * Convert the element to {@link FtrTemplate}.
     * <p>
     * Reads the token count from {@link #N}, the optional cutoff from
     * {@link #C}, and one token specification per attribute {@code F + i}. The
     * template type is the trimmed {@link #T} attribute if present, otherwise
     * the concatenation of the token fields.
     *
     * @param eFeature template element
     * @return the converted template
     * @throws NumberFormatException if {@link #N} or {@link #C} is not an integer
     */
    protected FtrTemplate getFtrTemplate(Element eFeature) {
        int nToken = Integer.parseInt(eFeature.getAttribute(N));
        int cutoff = intAttribute(eFeature, C, 0);
        FtrTemplate ftr = new FtrTemplate(nToken, cutoff);
        StringBuilder fields = new StringBuilder();

        for (int i = 0; i < nToken; i++) {
            FtrToken tok = getFtrToken(eFeature.getAttribute(F + i));
            ftr.addFtrToken(i, tok);
            fields.append(tok.field);
        }

        String type = eFeature.hasAttribute(T) ? eFeature.getAttribute(T).trim() : fields.toString();
        ftr.setType(type);
        return ftr;
    }

    /**
     * Parses a token specification of the form
     * {@code <source>[<offset>][_<relation>]:<field>}.
     * <p>
     * Every invalid specification is reported through {@link #xmlError(String)}.
     * Structurally malformed input (no field part, or an empty source part)
     * is reported the same way instead of failing with an index exception.
     *
     * @param ftr (e.g., "l:f", "l+1:m", "l-1:p", "l0_hd:d")
     * @return the parsed token
     * @throws IllegalArgumentException if the specification is structurally
     *         malformed and {@link #xmlError(String)} returns; for a non-numeric
     *         offset this is the original {@link NumberFormatException}
     */
    protected FtrToken getFtrToken(String ftr) {
        String[] aField = ftr.split(DELIM_F);           // {"l-1_hd", "p"}
        if (aField.length < 2) {
            // Missing attribute (empty string) or no field delimiter.
            throw malformed(ftr);
        }
        String[] aRelation = aField[0].split(DELIM_R);  // {"l-1", "hd"}
        if (aRelation.length == 0 || aRelation[0].isEmpty()) {
            // Nothing before the relation/field delimiter, so there is no source.
            throw malformed(ftr);
        }

        String node = aRelation[0];
        char source = node.charAt(0);
        if (!validSource(source)) {
            xmlError(ftr);
        }
        int offset = parseOffset(node, ftr);

        String relation = null;
        if (aRelation.length > 1) {
            relation = aRelation[1];
            if (!validRelation(relation)) {
                xmlError(ftr);
            }
        }

        String field = aField[1];
        if (!validField(field)) {
            xmlError(ftr);
        }
        return new FtrToken(source, offset, relation, field);
    }

    /**
     * Prints system error and exits.
     *
     * @param error the offending feature specification
     */
    protected void xmlError(String error) {
        System.err.println("Invalid feature: " + error);
        System.exit(1);
    }

    /**
     * Initializes other kinds of features.
     */
    protected abstract void initFeatures(Document doc) throws Exception;

    /** Returns whether {@code source} is an accepted source character of a token. */
    protected abstract boolean validSource(char source);

    /** Returns whether {@code relation} is an accepted relation of a token. */
    protected abstract boolean validRelation(String relation);

    /** Returns whether {@code field} is an accepted field of a token. */
    protected abstract boolean validField(String field);

    /**
     * Appends one template as a self-closing XML element to {@code build}.
     *
     * @param build buffer to append to
     * @param type  element name to emit
     * @param ftr   template whose {@code toString()} supplies the attributes
     */
    protected void toStringAux(StringBuilder build, String type, FtrTemplate ftr) {
        build.append(" <")
             .append(type)
             .append(" ")
             .append(ftr.toString())
             .append("/>\n");
    }

    /** Returns the integer value of attribute {@code name}, or {@code defaultValue} when it is absent. */
    private static int intAttribute(Element element, String name, int defaultValue) {
        return element.hasAttribute(name) ? Integer.parseInt(element.getAttribute(name)) : defaultValue;
    }

    /** Parses the signed offset following the source character of {@code node}; 0 when there is none. */
    private int parseOffset(String node, String ftr) {
        if (node.length() < 2) {
            return 0;
        }
        // An explicit '+' is skipped; a '-' is left for parseInt to interpret.
        String digits = (node.charAt(1) == '+') ? node.substring(2) : node.substring(1);
        try {
            return Integer.parseInt(digits);
        } catch (NumberFormatException e) {
            xmlError(ftr);
            throw e;    // only reached when xmlError() is overridden and returns
        }
    }

    /** Reports {@code ftr} via {@link #xmlError(String)} and builds the exception to throw if that returns. */
    private IllegalArgumentException malformed(String ftr) {
        xmlError(ftr);
        return new IllegalArgumentException("Invalid feature: " + ftr);
    }
}