package org.maltparser.core.syntaxgraph.reader; import java.io.*; import java.net.URL; import java.util.Iterator; import java.util.SortedMap; import org.maltparser.core.exception.MaltChainedException; import org.maltparser.core.io.dataformat.ColumnDescription; import org.maltparser.core.io.dataformat.DataFormatException; import org.maltparser.core.io.dataformat.DataFormatInstance; import org.maltparser.core.syntaxgraph.MappablePhraseStructureGraph; import org.maltparser.core.syntaxgraph.PhraseStructure; import org.maltparser.core.syntaxgraph.TokenStructure; import org.maltparser.core.syntaxgraph.edge.Edge; import org.maltparser.core.syntaxgraph.node.PhraseStructureNode; import org.maltparser.core.syntaxgraph.node.TokenNode; /** * * * @author Johan Hall */ public class BracketReader implements SyntaxGraphReader { private BufferedReader reader; private DataFormatInstance dataFormatInstance; private int sentenceCount; private StringBuilder input; private int terminalCounter; private int nonTerminalCounter; private String optionString; private SortedMap<String, ColumnDescription> inputColumns; private SortedMap<String, ColumnDescription> edgeLabelColumns; private SortedMap<String, ColumnDescription> phraseLabelColumns; private String fileName = null; private URL url = null; private String charsetName; private int nIterations; private int cIterations; private boolean closeStream = true; private char STARTING_BRACKET = '('; private char CLOSING_BRACKET = ')'; private char INPUT_SEPARATOR = ' '; private char EDGELABEL_SEPARATOR = '-'; private char SENTENCE_SEPARATOR = '\n'; private char BLANK = ' '; private char CARRIAGE_RETURN = '\r'; private char TAB = '\t'; public BracketReader() { input = new StringBuilder(); nIterations = 1; cIterations = 1; } private void reopen() throws MaltChainedException { close(); if (fileName != null) { open(fileName, charsetName); } else if (url != null) { open(url, charsetName); } else { throw new DataFormatException("The input stream cannot be reopen. "); } } public void open(String fileName, String charsetName) throws MaltChainedException { setFileName(fileName); setCharsetName(charsetName); try { open(new FileInputStream(fileName), charsetName); } catch (FileNotFoundException e) { throw new DataFormatException("The input file '" + fileName + "' cannot be found. ", e); } } public void open(URL url, String charsetName) throws MaltChainedException { setUrl(url); setCharsetName(charsetName); try { open(url.openStream(), charsetName); } catch (IOException e) { throw new DataFormatException("The URL '" + url.toString() + "' cannot be opened. ", e); } } public void open(InputStream is, String charsetName) throws MaltChainedException { try { if (is == System.in) { closeStream = false; } open(new InputStreamReader(is, charsetName)); } catch (UnsupportedEncodingException e) { throw new DataFormatException("The character encoding set '" + charsetName + "' isn't supported. ", e); } } private void open(InputStreamReader isr) throws MaltChainedException { setReader(new BufferedReader(isr)); setSentenceCount(0); } public void readProlog() throws MaltChainedException { } public boolean readSentence(TokenStructure syntaxGraph) throws MaltChainedException { if (syntaxGraph == null || dataFormatInstance == null) { return false; } syntaxGraph.clear(); int brackets = 0; try { int l = reader.read(); char c; input.setLength(0); while (true) { if (l == -1) { input.setLength(0); return false; } c = (char) l; l = reader.read(); if (c == SENTENCE_SEPARATOR || c == CARRIAGE_RETURN || c == TAB || c == -1) { } else if (c == STARTING_BRACKET) { input.append(c); brackets++; } else if (c == CLOSING_BRACKET) { input.append(c); brackets--; } else if (c == INPUT_SEPARATOR) { if (l != STARTING_BRACKET && l != CLOSING_BRACKET && l != INPUT_SEPARATOR && l != SENTENCE_SEPARATOR && l != CARRIAGE_RETURN && l != TAB && l != -1) { input.append(c); } // Start BracketProgLangReader } else if (c == '\\') { c = (char) l; l = reader.read(); if (c != ' ' && c != '(' && c != ')' && c != '\\' && c != 'n' && c != 'r' && c != 't' && c != '\"' && c != '\'') { // System.out.println("Error"); System.exit(1); } else { input.append("\\").append(c); } // End BracketProgLangReader } else if (brackets != 0) { input.append(c); } if (brackets == 0 && input.length() != 0) { sentenceCount++; terminalCounter = 1; nonTerminalCounter = 1; if (syntaxGraph instanceof PhraseStructure) { bracketing((PhraseStructure) syntaxGraph, 0, input.length(), null); if (syntaxGraph instanceof MappablePhraseStructureGraph) { ((MappablePhraseStructureGraph) syntaxGraph).getMapping().updateDependenyGraph(((MappablePhraseStructureGraph) syntaxGraph), ((PhraseStructure) syntaxGraph).getPhraseStructureRoot()); } } return true; } if (c == -1) { if (brackets != 0) { close(); throw new MaltChainedException("Error when reading from the input file. "); } if (cIterations < nIterations) { cIterations++; reopen(); return true; } return false; } } } catch (IOException e) { close(); throw new MaltChainedException("Error when reading from the input file. ", e); } } private void bracketing(PhraseStructure phraseStructure, int start, int end, PhraseStructureNode parent) throws MaltChainedException { int bracketsdepth = 0; int startpos = start - 1; for (int i = start, n = end; i < n; i++) { if (input.charAt(i) == STARTING_BRACKET // Start BracketProgLangReader && (i == 0 || input.charAt(i - 1) != '\\') // end BracketProgLangReader ) { if (bracketsdepth == 0) { startpos = i; } bracketsdepth++; } else if (input.charAt(i) == CLOSING_BRACKET // Start BracketProgLangReader && (i == 0 || input.charAt(i - 1) != '\\') // end BracketProgLangReader ) { bracketsdepth--; if (bracketsdepth == 0) { extract(phraseStructure, startpos + 1, i, parent); } } } } private void extract(PhraseStructure phraseStructure, int begin, int end, PhraseStructureNode parent) throws MaltChainedException { int index = -1; for (int i = begin; i < end; i++) { if (input.charAt(i) == STARTING_BRACKET // Start BracketProgLangReader && (i == begin || input.charAt(i - 1) != '\\') // end BracketProgLangReader ) { index = i; break; } } if (index == -1) { TokenNode t = phraseStructure.addTokenNode(terminalCounter); if (t == null) { close(); throw new MaltChainedException("Bracket Reader error: could not create a terminal node. "); } terminalCounter++; Edge e = null; if (parent != null) { e = phraseStructure.addPhraseStructureEdge(parent, (PhraseStructureNode) t); } else { close(); throw new MaltChainedException("Bracket Reader error: could not find the parent node. "); } int start = begin; Iterator<String> inputColumnsIterator = inputColumns.keySet().iterator(); Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator(); boolean noneNode = false; boolean edgeLabels = false; for (int i = begin; i < end; i++) { if (input.charAt(i) == EDGELABEL_SEPARATOR || (input.charAt(i) == INPUT_SEPARATOR // Start BracketProgLangReader && (i == begin || input.charAt(i - 1) != '\\') // end BracketProgLangReader ) || i == end - 1) { if (i == begin && input.charAt(i) == EDGELABEL_SEPARATOR) { noneNode = true; } else if (start == begin) { if ((noneNode && input.charAt(i) != EDGELABEL_SEPARATOR) || !noneNode) { if (inputColumnsIterator.hasNext()) { t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(), // Start BracketProgLangReader decodeString( // end BracketProgLangReader (i == end - 1) ? input.substring(start, end) : input.substring(start, i) // Start BracketProgLangReader ) // end BracketProgLangReader ); } start = i + 1; if (input.charAt(i) == EDGELABEL_SEPARATOR) { edgeLabels = true; } } } else if (edgeLabels && e != null) { if (edgeLabelsColumnsIterator.hasNext()) { e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), (i == end - 1) ? input.substring(start, end) : input.substring(start, i)); } start = i + 1; if (input.charAt(i) == INPUT_SEPARATOR // Start BracketProgLangReader && (i == begin || input.charAt(i - 1) != '\\') // end BracketProgLangReader ) { edgeLabels = false; } } else if (input.charAt(i) == EDGELABEL_SEPARATOR && i != end - 1 && (input.charAt(i + 1) != INPUT_SEPARATOR // Start BracketProgLangReader && (i == begin || input.charAt(i - 1) != '\\') // end BracketProgLangReader )) { } else { if (inputColumnsIterator.hasNext()) { t.addLabel(inputColumns.get(inputColumnsIterator.next()).getSymbolTable(), (i == end - 1) ? input.substring(start, end) : input.substring(start, i)); } start = i + 1; } } } } else { PhraseStructureNode nt; Edge e = null; if (parent == null) { nt = phraseStructure.getPhraseStructureRoot(); } else { nt = phraseStructure.addNonTerminalNode(nonTerminalCounter); if (nt == null) { close(); throw new MaltChainedException("Bracket Reader error: could not create a nonterminal node. "); } nonTerminalCounter++; e = phraseStructure.addPhraseStructureEdge(parent, nt); } Iterator<String> phraseLabelColumnsIterator = phraseLabelColumns.keySet().iterator(); Iterator<String> edgeLabelsColumnsIterator = edgeLabelColumns.keySet().iterator(); int newbegin = begin; int start = begin; for (int i = begin; i < index; i++) { if (input.charAt(i) == EDGELABEL_SEPARATOR || i == index - 1) { if (start == newbegin) { if (phraseLabelColumnsIterator.hasNext()) { nt.addLabel(phraseLabelColumns.get(phraseLabelColumnsIterator.next()).getSymbolTable(), (i == index - 1) ? input.substring(start, index) : input.substring(start, i)); } start = i + 1; } else if (e != null) { if (edgeLabelsColumnsIterator.hasNext()) { e.addLabel(edgeLabelColumns.get(edgeLabelsColumnsIterator.next()).getSymbolTable(), (i == index - 1) ? input.substring(start, index) : input.substring(start, i)); } start = i + 1; } } else if (input.charAt(i) == BLANK) { start++; newbegin++; } } bracketing(phraseStructure, index, end, nt); } } private String decodeString(String string) { return string.replace("\\(", "(").replace("\\)", ")").replace("\\ ", " "); } public void readEpilog() throws MaltChainedException { } public BufferedReader getReader() { return reader; } public void setReader(BufferedReader reader) { this.reader = reader; } public int getSentenceCount() throws MaltChainedException { return sentenceCount; } public void setSentenceCount(int sentenceCount) { this.sentenceCount = sentenceCount; } public DataFormatInstance getDataFormatInstance() { return dataFormatInstance; } public void setDataFormatInstance(DataFormatInstance inputDataFormatInstance) { this.dataFormatInstance = inputDataFormatInstance; inputColumns = dataFormatInstance.getInputColumnDescriptions(); edgeLabelColumns = dataFormatInstance.getPhraseStructureEdgeLabelColumnDescriptions(); phraseLabelColumns = dataFormatInstance.getPhraseStructureNodeLabelColumnDescriptions(); } public String getOptions() { return optionString; } public void setOptions(String optionString) throws MaltChainedException { this.optionString = optionString; } public String getFileName() { return fileName; } public void setFileName(String fileName) { this.fileName = fileName; } public URL getUrl() { return url; } public void setUrl(URL url) { this.url = url; } public String getCharsetName() { return charsetName; } public void setCharsetName(String charsetName) { this.charsetName = charsetName; } public int getNIterations() { return nIterations; } public void setNIterations(int iterations) { nIterations = iterations; } public int getIterationCounter() { return cIterations; } public void close() throws MaltChainedException { try { if (reader != null) { if (closeStream) { reader.close(); } reader = null; } } catch (IOException e) { throw new DataFormatException("Error when closing the input file.", e); } } }