package org.maltparser.core.syntaxgraph.reader; import java.io.*; import java.net.URL; import java.util.Iterator; import java.util.SortedMap; import java.util.TreeMap; import java.util.regex.PatternSyntaxException; import org.maltparser.core.exception.MaltChainedException; import org.maltparser.core.io.dataformat.ColumnDescription; import org.maltparser.core.io.dataformat.DataFormatException; import org.maltparser.core.io.dataformat.DataFormatInstance; import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph; import org.maltparser.core.syntaxgraph.PhraseStructure; import org.maltparser.core.syntaxgraph.TokenStructure; import org.maltparser.core.syntaxgraph.edge.Edge; import org.maltparser.core.syntaxgraph.node.PhraseStructureNode; /** * * * @author Johan Hall */ public class NegraReader implements SyntaxGraphReader { private enum NegraTables { ORIGIN, EDITOR, WORDTAG, MORPHTAG, NODETAG, EDGETAG, SECEDGETAG, SENTENCE, UNDEF }; private BufferedReader reader; private DataFormatInstance dataFormatInstance; private int sentenceCount; private String optionString; private int formatVersion; private NegraTables currentHeaderTable; private int currentTerminalSize; private int currentNonTerminalSize; private SortedMap<Integer, PhraseStructureNode> nonterminals; private StringBuilder edgelabelSymbol; private StringBuilder edgelabelTableName; private int START_ID_OF_NONTERMINALS = 500; private String fileName = null; private URL url = null; private String charsetName; private int nIterations; private int cIterations; private boolean closeStream = true; public NegraReader() { currentHeaderTable = NegraTables.UNDEF; edgelabelSymbol = new StringBuilder(); edgelabelTableName = new StringBuilder(); nonterminals = new TreeMap<Integer, PhraseStructureNode>(); nIterations = 1; cIterations = 1; } private void reopen() throws MaltChainedException { close(); if (fileName != null) { open(fileName, charsetName); } else if (url != null) { open(url, charsetName); } else { throw new DataFormatException("The input stream cannot be reopen. "); } } public void open(String fileName, String charsetName) throws MaltChainedException { setFileName(fileName); setCharsetName(charsetName); try { open(new FileInputStream(fileName), charsetName); } catch (FileNotFoundException e) { throw new DataFormatException("The input file '" + fileName + "' cannot be found. ", e); } } public void open(URL url, String charsetName) throws MaltChainedException { setUrl(url); setCharsetName(charsetName); try { open(url.openStream(), charsetName); } catch (IOException e) { throw new DataFormatException("The URL '" + url.toString() + "' cannot be opened. ", e); } } public void open(InputStream is, String charsetName) throws MaltChainedException { try { if (is == System.in) { closeStream = false; } open(new InputStreamReader(is, charsetName)); } catch (UnsupportedEncodingException e) { throw new DataFormatException("The character encoding set '" + charsetName + "' isn't supported. ", e); } } private void open(InputStreamReader isr) throws MaltChainedException { setReader(new BufferedReader(isr)); setSentenceCount(0); } public void readProlog() throws MaltChainedException { } public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException { if (syntaxGraph == null || !(syntaxGraph instanceof PhraseStructure)) { return false; } syntaxGraph.clear(); final PhraseStructure phraseStructure = (PhraseStructure) syntaxGraph; PhraseStructureNode parent; PhraseStructureNode child = null; currentHeaderTable = NegraTables.UNDEF; String line; syntaxGraph.clear(); nonterminals.clear(); try { while (true) { line = reader.readLine(); if (line == null) { if (syntaxGraph.hasTokens()) { sentenceCount++; if (syntaxGraph instanceof MappablePhraseStructureGraph) { ((MappablePhraseStructureGraph) syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph) syntaxGraph), ((PhraseStructure) syntaxGraph).getPhraseStructureRoot()); } } if (cIterations < nIterations) { cIterations++; reopen(); return true; } return false; } else if (line.startsWith("#EOS")) { currentTerminalSize = 0; currentNonTerminalSize = 0; currentHeaderTable = NegraTables.UNDEF; if (syntaxGraph instanceof MappablePhraseStructureGraph) { ((MappablePhraseStructureGraph) syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph) syntaxGraph), ((PhraseStructure) syntaxGraph).getPhraseStructureRoot()); } return true; } else if (line.startsWith("#BOS")) { currentHeaderTable = NegraTables.SENTENCE; int s = -1, e = -1; for (int i = 5, n = line.length(); i < n; i++) { if (Character.isDigit(line.charAt(i)) && s == -1) { s = i; } if (line.charAt(i) == ' ') { e = i; break; } } if (s != e && s != -1 && e != -1) { phraseStructure.setSentenceID(Integer.parseInt(line.substring(s, e))); } sentenceCount++; } else if (currentHeaderTable == NegraTables.SENTENCE) { if (line.length() >= 2 && line.charAt(0) == '#' && Character.isDigit(line.charAt(1))) { // Non-terminal Iterator<ColumnDescription> columns = dataFormatInstance.iterator(); ColumnDescription column = null; currentNonTerminalSize++; char[] lineChars = line.toCharArray(); int start = 0; int secedgecounter = 0; for (int i = 0, n = lineChars.length; i < n; i++) { if (lineChars[i] == '\t' && start == i) { start++; } else if (lineChars[i] == '\t' || i == n - 1) { if (columns.hasNext()) { column = columns.next(); } if (column.getPosition() == 0) { int index = Integer.parseInt((i == n - 1) ? line.substring(start + 1) : line.substring(start + 1, i)); child = nonterminals.get(index); if (child == null) { if (index != 0) { child = ((PhraseStructure) syntaxGraph).addNonTerminalNode(index - START_ID_OF_NONTERMINALS + 1); } nonterminals.put(index, child); } } else if (column.getPosition() == 2 && child != null) { syntaxGraph.addLabel(child, "CAT", (i == n - 1) ? line.substring(start) : line.substring(start, i)); } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL) { edgelabelSymbol.setLength(0); edgelabelSymbol.append((i == n - 1) ? line.substring(start) : line.substring(start, i)); edgelabelTableName.setLength(0); edgelabelTableName.append(column.getName()); } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) { int index = Integer.parseInt((i == n - 1) ? line.substring(start) : line.substring(start, i)); parent = nonterminals.get(index); if (parent == null) { if (index == 0) { parent = phraseStructure.getPhraseStructureRoot(); } else { parent = phraseStructure.addNonTerminalNode(index - START_ID_OF_NONTERMINALS + 1); } nonterminals.put(index, parent); } Edge e = phraseStructure.addPhraseStructureEdge(parent, child); syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString()); } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) { if (secedgecounter % 2 == 0) { edgelabelSymbol.setLength(0); edgelabelSymbol.append((i == n - 1) ? line.substring(start) : line.substring(start, i)); secedgecounter++; } else { int index = Integer.parseInt((i == n - 1) ? line.substring(start) : line.substring(start, i)); if (index == 0) { parent = phraseStructure.getPhraseStructureRoot(); } else if (index < START_ID_OF_NONTERMINALS) { parent = phraseStructure.getTokenNode(index); } else { parent = nonterminals.get(index); if (parent == null) { parent = phraseStructure.addNonTerminalNode(index - START_ID_OF_NONTERMINALS + 1); nonterminals.put(index, parent); } } Edge e = phraseStructure.addSecondaryEdge(parent, child); e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString()); secedgecounter++; } } start = i + 1; } } } else { // Terminal Iterator<ColumnDescription> columns = dataFormatInstance.iterator(); ColumnDescription column = null; currentTerminalSize++; child = syntaxGraph.addTokenNode(currentTerminalSize); char[] lineChars = line.toCharArray(); int start = 0; int secedgecounter = 0; for (int i = 0, n = lineChars.length; i < n; i++) { if (lineChars[i] == '\t' && start == i) { start++; } else if (lineChars[i] == '\t' || i == n - 1) { if (columns.hasNext()) { column = columns.next(); } if (column.getCategory() == ColumnDescription.INPUT && child != null) { syntaxGraph.addLabel(child, column.getName(), (i == n - 1) ? line.substring(start) : line.substring(start, i)); } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_EDGE_LABEL && child != null) { // && column.getName().equals("EDGELABEL")) { edgelabelSymbol.setLength(0); edgelabelSymbol.append((i == n - 1) ? line.substring(start) : line.substring(start, i)); edgelabelTableName.setLength(0); edgelabelTableName.append(column.getName()); } else if (column.getCategory() == ColumnDescription.PHRASE_STRUCTURE_NODE_LABEL && child != null) { int index = Integer.parseInt((i == n - 1) ? line.substring(start) : line.substring(start, i)); parent = nonterminals.get(index); if (parent == null) { if (index == 0) { parent = phraseStructure.getPhraseStructureRoot(); } else { parent = phraseStructure.addNonTerminalNode(index - START_ID_OF_NONTERMINALS + 1); } nonterminals.put(index, parent); } Edge e = phraseStructure.addPhraseStructureEdge(parent, child); syntaxGraph.addLabel(e, edgelabelTableName.toString(), edgelabelSymbol.toString()); } else if (column.getCategory() == ColumnDescription.SECONDARY_EDGE_LABEL && child != null) { if (secedgecounter % 2 == 0) { edgelabelSymbol.setLength(0); edgelabelSymbol.append((i == n - 1) ? line.substring(start) : line.substring(start, i)); secedgecounter++; } else { int index = Integer.parseInt((i == n - 1) ? line.substring(start) : line.substring(start, i)); if (index == 0) { parent = phraseStructure.getPhraseStructureRoot(); } else if (index < START_ID_OF_NONTERMINALS) { parent = phraseStructure.getTokenNode(index); } else { parent = nonterminals.get(index); if (parent == null) { parent = phraseStructure.addNonTerminalNode(index - START_ID_OF_NONTERMINALS + 1); nonterminals.put(index, parent); } } Edge e = phraseStructure.addSecondaryEdge(parent, child); e.addLabel(column.getSymbolTable(), edgelabelSymbol.toString()); secedgecounter++; } } start = i + 1; } } } } else if (line.startsWith("%%")) { // comment skip } else if (line.startsWith("#FORMAT")) { // int index = line.indexOf(' '); // if (index > -1) { // try { // formatVersion = Integer.parseInt(line.substring(index+1)); // } catch (NumberFormatException e) { // // } // } } else if (line.startsWith("#BOT")) { // int index = line.indexOf(' '); // if (index > -1) { // if (line.substring(index+1).equals("ORIGIN")) { // currentHeaderTable = NegraTables.ORIGIN; // } else if (line.substring(index+1).equals("EDITOR")) { // currentHeaderTable = NegraTables.EDITOR; // } else if (line.substring(index+1).equals("WORDTAG")) { // currentHeaderTable = NegraTables.WORDTAG; // } else if (line.substring(index+1).equals("MORPHTAG")) { // currentHeaderTable = NegraTables.MORPHTAG; // } else if (line.substring(index+1).equals("NODETAG")) { // currentHeaderTable = NegraTables.NODETAG; // } else if (line.substring(index+1).equals("EDGETAG")) { // currentHeaderTable = NegraTables.EDGETAG; // } else if (line.substring(index+1).equals("SECEDGETAG")) { // currentHeaderTable = NegraTables.SECEDGETAG; // } else { // currentHeaderTable = NegraTables.UNDEF; // } // } } else if (line.startsWith("#EOT")) { currentHeaderTable = NegraTables.UNDEF; } } } catch (IOException e) { throw new DataFormatException("Error when reading from the input file. ", e); } } public void readEpilog() throws MaltChainedException { } public BufferedReader getReader() { return reader; } public void setReader(BufferedReader reader) { this.reader = reader; } public int getSentenceCount() { return sentenceCount; } public void setSentenceCount(int sentenceCount) { this.sentenceCount = sentenceCount; } public int getFormatVersion() { return formatVersion; } public void setFormatVersion(int formatVersion) { this.formatVersion = formatVersion; } public DataFormatInstance getDataFormatInstance() { return dataFormatInstance; } public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) { this.dataFormatInstance = inputDataFormatInstance; } public String getOptions() { return optionString; } public void setOptions(String optionString) throws MaltChainedException { this.optionString = optionString; String[] argv; try { argv = optionString.split("[_\\p{Blank}]"); } catch (PatternSyntaxException e) { throw new DataFormatException("Could not split the penn writer option '" + optionString + "'. ", e); } for (int i = 0; i < argv.length - 1; i++) { if (argv[i].charAt(0) != '-') { throw new DataFormatException("The argument flag should start with the following character '-', not with " + argv[i].charAt(0)); } if (++i >= argv.length) { throw new DataFormatException("The last argument does not have any value. "); } switch (argv[i - 1].charAt(1)) { case 's': try { START_ID_OF_NONTERMINALS = Integer.parseInt(argv[i]); } catch (NumberFormatException e) { throw new MaltChainedException("The TigerXML Reader option -s must be an integer value. "); } break; default: throw new DataFormatException("Unknown NegraReader parameter: '" + argv[i - 1] + "' with value '" + argv[i] + "'. "); } } } public String getFileName() { return fileName; } public void setFileName(String fileName) { this.fileName = fileName; } public URL getUrl() { return url; } public void setUrl(URL url) { this.url = url; } public String getCharsetName() { return charsetName; } public void setCharsetName(String charsetName) { this.charsetName = charsetName; } public int getNIterations() { return nIterations; } public void setNIterations(int iterations) { nIterations = iterations; } public int getIterationCounter() { return cIterations; } public void close() throws MaltChainedException { try { if (reader != null) { if (closeStream) { reader.close(); } reader = null; } } catch (IOException e) { throw new DataFormatException("Error when closing the input file.", e); } } }