package edu.jhu.agiga;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.logging.Level;
import java.util.logging.Logger;

import com.ximpleware.AutoPilot;
import com.ximpleware.NavException;
import com.ximpleware.ParseException;
import com.ximpleware.PilotException;
import com.ximpleware.VTDException;
import com.ximpleware.VTDGen;
import com.ximpleware.VTDNav;

import edu.jhu.agiga.AgigaConstants.DependencyForm;

/**
 * Provides an iterator over AgigaSentence objects given an Annotated Gigaword
 * file. This class should usually not be used directly since VTD-XML will load
 * the entire XML file into memory, and requires that the file be unzipped.
 * Instead, StreamingSentenceReader should be used which provides a fast,
 * memory-efficient version of this iterator.
 *
 * This implementation using VTD-XML should handle XML files up to 2GB in size.
 * For larger files, we can switch to extended VTD-XML as described here:
 * <url>http://vtd-xml.sourceforge.net/codeSample/cs12.html</url>
 *
 * The object is its own iterator: {@link #iterator()} returns {@code this}, so
 * a reader can only be traversed once.
 *
 * @author mgormley
 *
 */
class AgigaSentenceReader implements Iterable<AgigaSentence>, Iterator<AgigaSentence> {

    private static final String NULL_NER_TAG = "0";

    private static final Logger log = Logger.getLogger(AgigaSentenceReader.class.getName());

    /** Number of sentences returned by {@link #next()} so far. */
    private int numSentences;
    /** Navigator over the indexed document; positioned on the pending sentence while iterating. */
    private VTDNav vn;
    /** XPath cursor that walks the sentence elements. */
    private AutoPilot sentAp;
    private final AgigaPrefs prefs;
    /** Result of the last XPath evaluation; -1 once there are no more sentences. */
    private int nextIdx = -1;

    /**
     * Loads the whole file into memory and indexes it.
     *
     * @param inputFile path of the (uncompressed) XML file
     * @param prefs     selects which annotations are read for each sentence
     * @throws RuntimeException wrapping the IOException if the file cannot be
     *                          read completely, or the VTD-XML exception if it
     *                          cannot be parsed
     */
    public AgigaSentenceReader(String inputFile, AgigaPrefs prefs) {
        this.prefs = prefs;
        try {
            // Read the file into a byte array
            log.fine("Reading file into byte array");
            init(readFully(new File(inputFile)));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Indexes an XML document that is already in memory.
     *
     * @param b     the bytes of the XML document
     * @param prefs selects which annotations are read for each sentence
     */
    public AgigaSentenceReader(byte[] b, AgigaPrefs prefs) {
        this.prefs = prefs;
        init(b);
    }

    /**
     * Iterates over the sentences reachable from an existing navigator.
     *
     * @param vn    a navigator over an already indexed document
     * @param prefs selects which annotations are read for each sentence
     */
    public AgigaSentenceReader(VTDNav vn, AgigaPrefs prefs) {
        this.prefs = prefs;
        this.vn = vn;
        init();
    }

    /**
     * Reads the complete content of a file.
     *
     * A single InputStream.read(byte[]) call may return fewer bytes than
     * requested, so the read is repeated until the buffer is full.
     */
    private static byte[] readFully(File f) throws IOException {
        long length = f.length();
        if (length > Integer.MAX_VALUE) {
            // The buffer is a Java array, so it cannot hold more than this.
            throw new IOException("File too large to load into memory (" + length + " bytes): " + f);
        }
        InputStream fis = new FileInputStream(f);
        try {
            log.fine("File size: " + length);
            byte[] b = new byte[(int) length];
            int off = 0;
            while (off < b.length) {
                int n = fis.read(b, off, b.length - off);
                if (n < 0) {
                    throw new IOException("Unexpected end of file after " + off + " of "
                            + b.length + " bytes: " + f);
                }
                off += n;
            }
            return b;
        } finally {
            fis.close();
        }
    }

    /** Builds the VTD index for the document and positions the sentence cursor. */
    private void init(byte[] b) {
        try {
            // Index the xml with VTD-XML
            log.fine("Building VTD index");
            VTDGen vg = new VTDGen();
            vg.setDoc(b);
            vg.parse(false);
            vn = vg.getNav();
            numSentences = 0;
            vn.toElement(VTDNav.ROOT);
            // Initialize auto pilot
            init();
        } catch (NavException e) {
            throw new RuntimeException(e);
        } catch (ParseException e) {
            throw new RuntimeException(e);
        }
    }

    /** Creates the sentence XPath cursor and advances it to the first match, if any. */
    private void init() {
        try {
            sentAp = new AutoPilot(this.vn);
            sentAp.selectXPath(String.format("//%s/%s", AgigaConstants.SENTENCES, AgigaConstants.SENTENCE));
            nextIdx = sentAp.evalXPath();
        } catch (VTDException e) {
            throw new RuntimeException(e);
        }
    }

    /** Returns this object; see the class comment about single traversal. */
    @Override
    public Iterator<AgigaSentence> iterator() {
        return this;
    }

    @Override
    public boolean hasNext() {
        return nextIdx != -1;
    }

    /**
     * Reads the sentence the cursor is currently on and advances the cursor.
     *
     * @return the sentence, populated according to the preferences
     * @throws NoSuchElementException if there are no more sentences
     * @throws RuntimeException       wrapping any VTD-XML failure
     */
    @Override
    public AgigaSentence next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        try {
            int sentId = vn.parseInt(vn.getAttrVal(AgigaConstants.TOKEN_ID));
            log.finer("sentence id=" + sentId);
            StanfordAgigaSentence agigaSent = getSentenceInstance(prefs);
            // Subtract one, since the sentences are one-indexed in the XML but
            // zero-indexed in this API
            agigaSent.setSentIdx(sentId - 1);

            // Each parser below gets its own clone of the navigator, so vn
            // itself stays on the sentence element and we never have to find
            // that element again (by XPath or by walking back up the tree).
            if (prefs.readWord || prefs.readLemma || prefs.readOffsets
                    || prefs.readPos || prefs.readNer || prefs.readNormNer) {
                List<AgigaToken> agigaTokens = parseTokens(vn.cloneNav());
                agigaSent.setTokens(agigaTokens);
            }
            if (prefs.readParse) {
                String parseText = parseParse(vn.cloneNav());
                agigaSent.setParseText(parseText);
            }
            if (prefs.readBasicDeps) {
                List<AgigaTypedDependency> basicDeps =
                        parseDependencies(vn.cloneNav(), DependencyForm.BASIC_DEPS);
                agigaSent.setBasicDeps(basicDeps);
            }
            if (prefs.readColDeps) {
                List<AgigaTypedDependency> colDeps =
                        parseDependencies(vn.cloneNav(), DependencyForm.COL_DEPS);
                agigaSent.setColDeps(colDeps);
            }
            if (prefs.readColCcprocDeps) {
                List<AgigaTypedDependency> colCcprocDeps =
                        parseDependencies(vn.cloneNav(), DependencyForm.COL_CCPROC_DEPS);
                agigaSent.setColCcprocDeps(colCcprocDeps);
            }

            numSentences++;
            nextIdx = sentAp.evalXPath();
            return agigaSent;
        } catch (VTDException e) {
            throw new RuntimeException(e);
        }
    }

    /** Factory hook for the sentence object that {@link #next()} fills in. */
    protected StanfordAgigaSentence getSentenceInstance(AgigaPrefs prefs) {
        return new StanfordAgigaSentence(prefs);
    }

    /**
     * Not supported.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException("not implemented");
    }

    /** Returns the number of sentences returned by {@link #next()} so far. */
    public int getNumSentences() {
        return numSentences;
    }

    /**
     * Reads the tokens of one sentence.
     *
     * Assumes the position of vn is at a AgigaConstants.SENTENCE tag.
     *
     * @param vn navigator positioned on the sentence element; it is moved
     * @return the tokens in document order
     * @throws IllegalStateException if the sentence does not have the expected
     *                               structure
     */
    private List<AgigaToken> parseTokens(VTDNav vn) throws PilotException, NavException {
        require(vn.matchElement(AgigaConstants.SENTENCE), "navigator is not on a SENTENCE element");

        int tokId = -1;
        List<AgigaToken> agigaTokens = new ArrayList<AgigaToken>();

        // Loop through each token
        AutoPilot tokAp = new AutoPilot(vn);
        tokAp.selectElement(AgigaConstants.TOKEN);
        while (tokAp.iterate()) {
            // Just double check that the tokens are in order: the first id
            // seeds the counter and every later id must follow on from it.
            int xmlTokId = vn.parseInt(vn.getAttrVal(AgigaConstants.TOKEN_ID));
            if (tokId < 0) {
                tokId = xmlTokId;
            }
            require(xmlTokId == tokId, "token ids are not consecutive");

            AgigaToken agigaToken = new AgigaToken();
            // Subtract one, since the tokens are one-indexed in the XML but
            // zero-indexed in this API
            agigaToken.setTokIdx(tokId - 1);

            // Read the word, lemma, token offsets, POS tag, NER tag, and
            // normalized NER.
            // We have to move to the word (first child) so that the next
            // sibling moves succeed.
            require(vn.toElement(VTDNav.FC, AgigaConstants.WORD), "token has no WORD child element");
            if (prefs.readWord) {
                agigaToken.setWord(vn.toString(vn.getText()));
            }
            if (prefs.readLemma) {
                require(vn.toElement(VTDNav.NS, AgigaConstants.LEMMA), "token has no LEMMA element");
                agigaToken.setLemma(vn.toString(vn.getText()));
            }
            if (prefs.readOffsets) {
                require(vn.toElement(VTDNav.NS, AgigaConstants.CHARACTER_OFFSET_BEGIN),
                        "token has no CHARACTER_OFFSET_BEGIN element");
                agigaToken.setCharOffBegin(readIntText(vn));
                require(vn.toElement(VTDNav.NS, AgigaConstants.CHARACTER_OFFSET_END),
                        "token has no CHARACTER_OFFSET_END element");
                agigaToken.setCharOffEnd(readIntText(vn));
            }
            if (prefs.readPos) {
                require(vn.toElement(VTDNav.NS, AgigaConstants.POS), "token has no POS element");
                agigaToken.setPosTag(vn.toString(vn.getText()));
            }
            if (prefs.readNer) {
                // The NER element is mandatory only in strict mode; otherwise
                // a missing element yields a null tag.
                boolean hasNer = vn.toElement(VTDNav.NS, AgigaConstants.NER);
                if (prefs.strict) {
                    require(hasNer, "token has no NER element");
                }
                agigaToken.setNerTag(hasNer ? vn.toString(vn.getText()) : null);
            }
            if (prefs.readNormNer) {
                // NormNER only applies to some tokens
                String normNer = null;
                if (vn.toElement(VTDNav.NS, AgigaConstants.NORM_NER)) {
                    normNer = vn.toString(vn.getText());
                }
                agigaToken.setNormNer(normNer);
            }

            agigaTokens.add(agigaToken);
            tokId++;
        }
        return agigaTokens;
    }

    /**
     * Parses the text of the current element as an integer. Outside strict
     * mode, unexpected whitespace surrounding the integer is removed first.
     */
    private int readIntText(VTDNav vn) throws NavException {
        String text = vn.toString(vn.getText());
        return Integer.parseInt(prefs.strict ? text : text.trim());
    }

    /**
     * Reads the parse text of one sentence.
     *
     * Assumes the position of vn is at a AgigaConstants.SENTENCE tag.
     *
     * @param vn navigator positioned on the sentence element; it is moved
     * @return the text of the sentence's AgigaConstants.PARSE child
     * @throws IllegalStateException if the sentence has no such child
     */
    private String parseParse(VTDNav vn) throws NavException, PilotException {
        require(vn.matchElement(AgigaConstants.SENTENCE), "navigator is not on a SENTENCE element");

        // Move to the <parse> tag
        require(vn.toElement(VTDNav.FC, AgigaConstants.PARSE), "sentence has no PARSE child element");
        return vn.toString(vn.getText());
    }

    /**
     * Reads one set of typed dependencies of one sentence.
     *
     * Assumes the position of vn is at a AgigaConstants.SENTENCE tag.
     *
     * @param vn   navigator positioned on the sentence element; it is moved
     * @param form which dependency set to read; its XML tag names the child
     *             element that holds the dependencies
     * @return the dependencies in document order, with zero-based token indices
     * @throws IllegalStateException if an expected element is missing
     */
    private List<AgigaTypedDependency> parseDependencies(VTDNav vn, DependencyForm form)
            throws NavException, PilotException {
        require(vn.matchElement(AgigaConstants.SENTENCE), "navigator is not on a SENTENCE element");

        // Move to the element holding the requested dependency form
        require(vn.toElement(VTDNav.FC, form.getXmlTag()),
                "sentence has no child element for the requested dependency form");

        List<AgigaTypedDependency> agigaDeps = new ArrayList<AgigaTypedDependency>();

        // Loop through the dep tags
        AutoPilot depAp = new AutoPilot(vn);
        depAp.selectElement(AgigaConstants.DEP);
        while (depAp.iterate()) {
            // Read the type, governor, and dependent
            String type = vn.toString(vn.getAttrVal(AgigaConstants.DEP_TYPE));

            require(vn.toElement(VTDNav.FC, AgigaConstants.GOVERNOR), "dependency has no GOVERNOR child element");
            int governorId = vn.parseInt(vn.getText());

            require(vn.toElement(VTDNav.NS, AgigaConstants.DEPENDENT), "dependency has no DEPENDENT element");
            int dependentId = vn.parseInt(vn.getText());

            log.finer(String.format("\tdep type=%s\t%d-->%d", type, governorId, dependentId));

            // Subtract one, since the tokens are one-indexed in the XML but
            // zero-indexed in this API
            agigaDeps.add(new AgigaTypedDependency(type, governorId - 1, dependentId - 1));
        }
        return agigaDeps;
    }

    /**
     * This method will print out the XML from the current position of
     * <code>vn</code>. Very useful for debugging.
     *
     * The bytes are decoded with the platform default charset.
     *
     * @param b  the bytes of the document that vn navigates
     * @param vn navigator positioned on the element of interest
     * @return the text of that element's fragment
     */
    public static String getElementFragmentAsString(byte[] b, VTDNav vn) throws NavException {
        // The fragment is packed into a long: offset in the lower 32 bits,
        // length in the upper 32 bits.
        long l = vn.getElementFragment();
        int offset = (int) l;
        int len = (int) (l >> 32);
        return new String(Arrays.copyOfRange(b, offset, offset + len));
    }

    /**
     * Reads every file named on the command line and logs its sentence count.
     */
    public static void main(String args[]) throws Exception {
        // Must be Level.FINER for debug logging
        Util.initializeLogging(Level.FINE);

        // Parse each file provided on the command line.
        for (int i = 0; i < args.length; i++) {
            AgigaSentenceReader reader = new AgigaSentenceReader(args[i], new AgigaPrefs());
            log.fine("Parsing XML");
            for (AgigaSentence agigaSent : reader) {
                // Do nothing
            }
            log.info("Number of sentences: " + reader.getNumSentences());
        }
    }

    /**
     * Throws an IllegalStateException without a message if the condition is false.
     * Prefer {@link #require(boolean, String)} so the failure explains itself.
     */
    public static void require(boolean truth) {
        if (!truth) {
            throw new IllegalStateException();
        }
    }

    /**
     * Throws an IllegalStateException carrying the message if the condition is false.
     *
     * @param truth   the condition that must hold
     * @param message explains which condition was not met
     */
    public static void require(boolean truth, String message) {
        if (!truth) {
            throw new IllegalStateException(message);
        }
    }

}