package org.maltparser.core.feature.spec.reader;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.regex.Pattern;
import org.maltparser.core.exception.MaltChainedException;
import org.maltparser.core.feature.FeatureException;
import org.maltparser.core.feature.spec.SpecificationModels;
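/**
 * Reads a tab-separated feature specification file and adds one feature
 * function expression per specification line to a SpecificationModels instance.
 *
 * @author Johan Hall
 */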
public class ParReader implements FeatureSpecReader {
    /** Identifiers of the data structures that can be keys of the data structure name mapping. */
    public enum DataStructures {
        STACK, INPUT, LEFTCONTEXT, RIGHTCONTEXT
    }

    /** Short column names that can be keys of the column name mapping. */
    public enum ColumnNames {
        POS, DEP, LEX, LEMMA, CPOS, FEATS
    }

    /** Separator between the columns of a specification line. */
    private static final Pattern TAB_PATTERN = Pattern.compile("\t");

    private EnumMap<ColumnNames, String> columnNameMap;
    private EnumMap<DataStructures, String> dataStructuresMap;
    private boolean useSplitFeats = true;
    private boolean covington = false;
    private boolean pppath;
    private boolean pplifted;
    private boolean ppcoveredRoot;

    /**
     * Creates a reader with the default column name and data structure mappings
     * and with all pseudo-projective flags switched off.
     *
     * @throws MaltChainedException declared for callers; not thrown by the visible code
     */
    public ParReader() throws MaltChainedException {
        initializeColumnNameMap();
        initializeDataStructuresMap();
        setPppath(false);
        setPplifted(false);
        setPpcoveredRoot(false);
    }

    /**
     * Reads the specification file at the given URL and adds one feature
     * function expression per specification line to {@code featureSpecModels},
     * all under the index returned by {@code featureSpecModels.getNextIndex()}.
     *
     * <p>Blank lines and lines whose first non-blank characters are {@code --}
     * are skipped. Every other line must contain at least two tab-separated
     * columns: a column name and a data structure name, optionally followed by
     * up to six integer columns (offset, linear offset, head offset, dependent
     * offset, sibling offset and suffix length).
     *
     * @param specModelURL      location of the specification file
     * @param featureSpecModels receiver of the generated feature expressions
     * @throws MaltChainedException (as a FeatureException) if the URL is
     *         {@code null}, the file cannot be read or closed, or a line is malformed
     */
    public void load(URL specModelURL, SpecificationModels featureSpecModels) throws MaltChainedException {
        if (specModelURL == null) {
            throw new FeatureException("The feature specification file cannot be found. ");
        }
        final BufferedReader reader;
        try {
            reader = new BufferedReader(new InputStreamReader(specModelURL.openStream()));
        } catch (IOException e) {
            throw new FeatureException("Could not read the feature specification file '" + specModelURL.toString() + "'. ", e);
        }

        int specModelIndex;
        ArrayList<String> specLines;
        boolean completed = false;
        try {
            specModelIndex = featureSpecModels.getNextIndex();
            specLines = readSpecificationLines(reader, specModelURL);
            completed = true;
        } finally {
            // Always release the stream, also when reading fails half-way.
            closeReader(reader, specModelURL, completed);
        }

        for (String specLine : specLines) {
            featureSpecModels.add(specModelIndex, buildFeatureSpec(specLine, specModelURL));
        }
    }

    /** Reads all lines from the reader, dropping blank lines and "--" comment lines. */
    private static ArrayList<String> readSpecificationLines(BufferedReader reader, URL specModelURL) throws MaltChainedException {
        final ArrayList<String> specLines = new ArrayList<String>();
        try {
            String fileLine;
            while ((fileLine = reader.readLine()) != null) {
                final String trimmed = fileLine.trim();
                if (trimmed.length() == 0 || trimmed.startsWith("--")) {
                    continue;
                }
                specLines.add(fileLine);
            }
        } catch (IOException e) {
            throw new FeatureException("Could not read the feature specification file '" + specModelURL.toString() + "'. ", e);
        }
        return specLines;
    }

    /** Closes the reader; a close failure is only reported when no earlier failure is already propagating. */
    private static void closeReader(BufferedReader reader, URL specModelURL, boolean reportFailure) throws MaltChainedException {
        try {
            reader.close();
        } catch (IOException e) {
            if (reportFailure) {
                throw new FeatureException("Could not close the feature specification file '" + specModelURL.toString() + "'. ", e);
            }
            // Otherwise the exception that interrupted reading is more informative; let it propagate.
        }
    }

    /** Translates one specification line into its feature function expression. */
    private String buildFeatureSpec(String specLine, URL specModelURL) throws MaltChainedException {
        final String[] items = TAB_PATTERN.split(specLine);
        if (items.length < 2) {
            throw new FeatureException("The feature specification file '" + specModelURL.toString() + "' must contain at least two columns.");
        }

        // Column one: the column name, either a ColumnNames key or a mapped value.
        final String columnText = items[0].trim();
        final ColumnNames columnName = findColumnName(columnText);
        final boolean knownKey = columnName != null && columnNameMap.containsKey(columnName);
        if (!knownKey && !columnNameMap.containsValue(columnText)) {
            throw new FeatureException("Column one in the feature specification file '" + specModelURL.toString() + "' contains an unknown value '" + columnText + "'. ");
        }
        final boolean deprel = columnText.equalsIgnoreCase("DEP") || columnText.equalsIgnoreCase("DEPREL");
        final StringBuilder featureText = new StringBuilder();
        if (deprel) {
            featureText.append("OutputColumn(DEPREL, ");
        } else if (knownKey) {
            featureText.append("InputColumn(").append(columnNameMap.get(columnName)).append(", ");
        } else {
            featureText.append("InputColumn(").append(columnText).append(", ");
        }
        final boolean split = !deprel && columnText.equalsIgnoreCase("FEATS") && isUseSplitFeats();

        // Column two: the data structure the feature addresses.
        final String structText = items[1].trim();
        final boolean context = structText.equalsIgnoreCase("CONTEXT");
        if (!(structText.equalsIgnoreCase("STACK") || structText.equalsIgnoreCase("INPUT") || context)) {
            throw new FeatureException("Column two in the feature specification file '" + specModelURL.toString() + "' should be either 'STACK', 'INPUT' or 'CONTEXT' (Covington), not '" + structText + "'. ");
        }

        final int offset = parseIntColumn(items, 2, specModelURL);
        String functionArg = buildAddress(structText, context, offset, specModelURL);

        // Columns four to eight wrap the address in navigation functions, innermost first.
        final int linearOffset = parseIntColumn(items, 3, specModelURL);
        final int headOffset = parseIntColumn(items, 4, specModelURL);
        final int depOffset = parseIntColumn(items, 5, specModelURL);
        final int sibOffset = parseIntColumn(items, 6, specModelURL);
        final int suffixLength = parseIntColumn(items, 7, specModelURL);

        functionArg = nestDirectional(functionArg, linearOffset, "pred", "succ");
        if (headOffset < 0) {
            throw new FeatureException("The feature specification file '" + specModelURL.toString() + "' should not contain a negative head function value. ");
        }
        functionArg = nest(functionArg, "head", headOffset);
        functionArg = nestDirectional(functionArg, depOffset, "ldep", "rdep");
        functionArg = nestDirectional(functionArg, sibOffset, "lsib", "rsib");

        if (deprel && (pppath || pplifted || ppcoveredRoot)) {
            return mergePseudoProjColumns(functionArg);
        }
        if (suffixLength != 0) {
            return "Suffix(" + featureText.toString() + functionArg + ")," + suffixLength + ")";
        }
        if (split) {
            return "Split(" + featureText.toString() + functionArg + "),\\|)";
        }
        return featureText.toString() + functionArg + ")";
    }

    /** Builds the address expression (for example "Stack[0]") for column two and its offset. */
    private String buildAddress(String structText, boolean context, int offset, URL specModelURL) throws MaltChainedException {
        if (context) {
            // NOTE: initializeDataStructuresMap() registers no LEFTCONTEXT/RIGHTCONTEXT entries,
            // so with the default mapping the looked-up name is null here (see the covington TODO).
            if (offset >= 0) {
                return dataStructuresMap.get(DataStructures.LEFTCONTEXT) + "[" + offset + "]";
            }
            return dataStructuresMap.get(DataStructures.RIGHTCONTEXT) + "[" + Math.abs(offset + 1) + "]";
        }

        final DataStructures dataStructure = findDataStructure(structText);
        final String structureName;
        if (dataStructure != null && dataStructuresMap.containsKey(dataStructure)) {
            structureName = dataStructuresMap.get(dataStructure);
        } else if (dataStructuresMap.containsValue(structText)) {
            structureName = structText;
        } else {
            throw new FeatureException("Column two in the feature specification file '" + specModelURL.toString() + "' should not contain the value '" + structText + "'. ");
        }

        if (covington) {
            // In Covington mode the stack is addressed as "Left" and everything else as "Right".
            return (structureName.equalsIgnoreCase("Stack") ? "Left[" : "Right[") + offset + "]";
        }
        return structureName + "[" + offset + "]";
    }

    /** Returns the enum constant whose name equals the text ignoring case, or null if there is none. */
    private static ColumnNames findColumnName(String text) {
        for (ColumnNames candidate : ColumnNames.values()) {
            if (candidate.name().equalsIgnoreCase(text)) {
                return candidate;
            }
        }
        return null;
    }

    /** Returns the enum constant whose name equals the text ignoring case, or null if there is none. */
    private static DataStructures findDataStructure(String text) {
        for (DataStructures candidate : DataStructures.values()) {
            if (candidate.name().equalsIgnoreCase(text)) {
                return candidate;
            }
        }
        return null;
    }

    /** Parses the integer in the given column, or returns 0 when the line has no such column. */
    private static int parseIntColumn(String[] items, int index, URL specModelURL) throws MaltChainedException {
        if (items.length <= index) {
            return 0;
        }
        try {
            return Integer.parseInt(items[index].trim());
        } catch (NumberFormatException e) {
            throw new FeatureException("The feature specification file '" + specModelURL.toString() + "' contains a illegal integer value. ", e);
        }
    }

    /** Wraps the argument |offset| times in the negative or positive function, depending on the sign. */
    private static String nestDirectional(String functionArg, int offset, String negativeFunction, String positiveFunction) {
        if (offset < 0) {
            return nest(functionArg, negativeFunction, Math.abs(offset));
        }
        return nest(functionArg, positiveFunction, offset);
    }

    /** Wraps the argument the given number of times in a call of the named function. */
    private static String nest(String functionArg, String functionName, int times) {
        String nested = functionArg;
        for (int i = 0; i < times; i++) {
            nested = functionName + "(" + nested + ")";
        }
        return nested;
    }

    /**
     * Builds the DEPREL feature expression, merged with one OutputTable
     * expression for every enabled pseudo-projective flag (PPLIFTED, PPPATH,
     * PPCOVERED, in that order).
     *
     * @param functionArg the address expression used as argument of every column/table
     * @return the plain DEPREL column, or a Merge/Merge3 expression around it
     */
    private String mergePseudoProjColumns(String functionArg) {
        final String deprelColumn = "OutputColumn(DEPREL, " + functionArg + ")";
        final ArrayList<String> tables = new ArrayList<String>();
        if (pplifted) {
            tables.add("OutputTable(PPLIFTED, " + functionArg + ")");
        }
        if (pppath) {
            tables.add("OutputTable(PPPATH, " + functionArg + ")");
        }
        if (ppcoveredRoot) {
            tables.add("OutputTable(PPCOVERED, " + functionArg + ")");
        }

        switch (tables.size()) {
            case 0: // no merge
                return deprelColumn;
            case 1:
                return "Merge(" + deprelColumn + ", " + tables.get(0) + ")";
            case 2:
                return "Merge3(" + deprelColumn + ", " + tables.get(0) + ", " + tables.get(1) + ")";
            default: // all three flags: two nested binary merges
                return "Merge(Merge(" + deprelColumn + ", " + tables.get(0) + "), Merge(" + tables.get(1) + ", " + tables.get(2) + "))";
        }
    }

    /** @return the mapping from short column names to the column names written into expressions */
    public EnumMap<ColumnNames, String> getColumnNameMap() {
        return columnNameMap;
    }

    /** Replaces the column name mapping with the default one. */
    public void initializeColumnNameMap() {
        columnNameMap = new EnumMap<ColumnNames, String>(ColumnNames.class);
        columnNameMap.put(ColumnNames.POS, "POSTAG");
        columnNameMap.put(ColumnNames.CPOS, "CPOSTAG");
        columnNameMap.put(ColumnNames.DEP, "DEPREL");
        columnNameMap.put(ColumnNames.LEX, "FORM");
        columnNameMap.put(ColumnNames.LEMMA, "LEMMA");
        columnNameMap.put(ColumnNames.FEATS, "FEATS");
    }

    /** @param columnNameMap the column name mapping to use from now on */
    public void setColumnNameMap(EnumMap<ColumnNames, String> columnNameMap) {
        this.columnNameMap = columnNameMap;
    }

    /** @return the mapping from data structure identifiers to the names written into expressions */
    public EnumMap<DataStructures, String> getDataStructuresMap() {
        return dataStructuresMap;
    }

    //TODO Fix covington
    /** Replaces the data structure mapping with the default one (STACK and INPUT only). */
    public void initializeDataStructuresMap() {
        dataStructuresMap = new EnumMap<DataStructures, String>(DataStructures.class);
        dataStructuresMap.put(DataStructures.STACK, "Stack");
        dataStructuresMap.put(DataStructures.INPUT, "Input");
    }

    /** @param dataStructuresMap the data structure mapping to use from now on */
    public void setDataStructuresMap(EnumMap<DataStructures, String> dataStructuresMap) {
        this.dataStructuresMap = dataStructuresMap;
    }

    /** @return whether FEATS features are wrapped in a Split expression */
    public boolean isUseSplitFeats() {
        return useSplitFeats;
    }

    /** @param useSplitFeats whether FEATS features are wrapped in a Split expression */
    public void setUseSplitFeats(boolean useSplitFeats) {
        this.useSplitFeats = useSplitFeats;
    }

    /** @return whether addresses are written as Left/Right instead of the mapped structure names */
    public boolean isCovington() {
        return covington;
    }

    /** @param covington whether addresses are written as Left/Right instead of the mapped structure names */
    public void setCovington(boolean covington) {
        this.covington = covington;
    }

    /** @return whether DEPREL features are merged with the PPPATH table */
    public boolean isPppath() {
        return pppath;
    }

    /** @param pppath whether DEPREL features are merged with the PPPATH table */
    public void setPppath(boolean pppath) {
        this.pppath = pppath;
    }

    /** @return whether DEPREL features are merged with the PPLIFTED table */
    public boolean isPplifted() {
        return pplifted;
    }

    /** @param pplifted whether DEPREL features are merged with the PPLIFTED table */
    public void setPplifted(boolean pplifted) {
        this.pplifted = pplifted;
    }

    /** @return whether DEPREL features are merged with the PPCOVERED table */
    public boolean isPpcoveredRoot() {
        return ppcoveredRoot;
    }

    /** @param ppcoveredRoot whether DEPREL features are merged with the PPCOVERED table */
    public void setPpcoveredRoot(boolean ppcoveredRoot) {
        this.ppcoveredRoot = ppcoveredRoot;
    }

    /** Lists both mappings (one tab-separated entry per enum constant) and the split-FEATS flag. */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("Mapping of column names:\n");
        for (ColumnNames columnName : ColumnNames.values()) {
            sb.append(columnName.toString()).append("\t").append(columnNameMap.get(columnName)).append("\n");
        }
        sb.append("Mapping of data structures:\n");
        for (DataStructures dataStruct : DataStructures.values()) {
            sb.append(dataStruct.toString()).append("\t").append(dataStructuresMap.get(dataStruct)).append("\n");
        }
        sb.append("Split FEATS column: ").append(useSplitFeats).append("\n");
        return sb.toString();
    }
}