/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 * Created on Jan 18, 2008
 *
 */

package org.biojava.nbio.ontology.obo;

import org.biojava.nbio.ontology.Synonym;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.*;

/**
 * A class to parse the content of an OBO file. It delegates handling of the
 * content to the registered {@link OboFileEventListener} implementations.
 *
 * This file contains parts of the OBO-Edit file OBOParseEngine, (particularly the encoding and decoding part)
 *
 * http://geneontology.cvs.sourceforge.net/geneontology/go-dev/java/oboedit/sources/org/geneontology/oboedit/dataadapter/OBOParseEngine.java?revision=1.10&view=markup
 * Thanks to the OboEdit developers for giving permission to release this in BioJava.
 *
 * <p>The parser keeps per-parse state in the {@link #line}, {@link #linenum} and
 * {@link #tempBuffer} fields, so a single instance must not be used by several
 * threads at the same time.
 *
 * @author Andreas Prlic
 * @author John Day Richter
 * @since 1.6
 */
public class OboFileParser {

    private static final Logger logger = LoggerFactory.getLogger(OboFileParser.class);

    /** Listeners notified of stanzas, keys and synonyms, in registration order. */
    List<OboFileEventListener> listeners;

    /** The line currently being parsed by {@link #parseOBO(BufferedReader)}; used in error messages. */
    protected String line;

    /** 1-based number of the line currently being parsed; used in error messages. */
    protected int linenum = 0;

    /** Not used by the code in this file. */
    protected int totalSize = 0;

    /** Not used by the code in this file. */
    protected int bytesRead = 0;

    /** Scratch buffer handed out (emptied) by {@link #getTempBuffer()}. */
    protected StringBuffer tempBuffer = new StringBuffer();

    /** Not used by the code in this file. */
    protected SimpleDateFormat dateFormat = new SimpleDateFormat("dd:MM:yyyy HH:mm", Locale.US);

    /** Maps the character following a backslash to the character it stands for. */
    protected static final Map<Character, Character> escapeChars =
            new HashMap<Character, Character>();

    /** Inverse of {@link #escapeChars}: maps a literal character to its escape letter. */
    protected static final Map<Character, Character> unescapeChars =
            new HashMap<Character, Character>();

    static {
        escapeChars.put('n', '\n');
        escapeChars.put('W', ' ');
        escapeChars.put('t', '\t');
        escapeChars.put(':', ':');
        escapeChars.put(',', ',');
        escapeChars.put('"', '"');
        escapeChars.put('\'', '\'');
        escapeChars.put('\\', '\\');
        escapeChars.put('{', '{');
        escapeChars.put('}', '}');
        escapeChars.put('(', '(');
        escapeChars.put(')', ')');
        escapeChars.put('[', '[');
        escapeChars.put(']', ']');
        escapeChars.put('!', '!');

        // All values above are distinct, so the inverse mapping is well defined.
        for (Map.Entry<Character, Character> entry : escapeChars.entrySet()) {
            unescapeChars.put(entry.getValue(), entry.getKey());
        }
    }

    /**
     * A string together with the index (and optionally the end index) at which
     * it was found in the text it was extracted from. An index of -1 means
     * "not set".
     */
    public static class SOPair {
        public String str = null;

        public int index = -1;

        public int endIndex = -1;

        public SOPair(String str, int index) {
            this(str, index, -1);
        }

        public SOPair(String str, int index, int endIndex) {
            this.str = str;
            this.index = index;
            this.endIndex = endIndex;
        }
    }

    /** Creates a parser with no listeners registered. */
    public OboFileParser() {
        listeners = new ArrayList<OboFileEventListener>();
    }

    /**
     * Registers a listener that will be notified of parse events.
     *
     * @param listener the listener to add
     */
    public void addOboFileEventListener(OboFileEventListener listener) {
        listeners.add(listener);
    }

    /**
     * Returns the live list of registered listeners (not a copy).
     *
     * @return the registered listeners
     */
    public List<OboFileEventListener> getOboFileEventListener() {
        return listeners;
    }

    /**
     * Parses an ontology file line by line and notifies the registered
     * listeners of every stanza header, synonym and key/value pair.
     *
     * <p>Empty lines, whitespace-only lines and lines whose first
     * non-whitespace character is {@code '!'} (comment-only lines) are skipped.
     * The current line and its 1-based number are stored in {@link #line} and
     * {@link #linenum} so that error messages raised further down can report
     * them.
     *
     * @param oboFile the reader to read the OBO content from; it is not closed
     * @throws IOException if reading fails or a line is malformed
     */
    public void parseOBO(BufferedReader oboFile) throws IOException {
        linenum = 0;

        // Read into the field (not a local) so helper methods can report the line.
        while ((line = oboFile.readLine()) != null) {
            linenum++;

            if (isBlankOrComment(line)) {
                continue;
            }

            if (line.charAt(0) == '[') {
                if (line.charAt(line.length() - 1) != ']') {
                    throw new IOException("Unclosed stanza: \"" + line + "\"");
                }
                String stanzaname = line.substring(1, line.length() - 1);
                if (stanzaname.length() < 1) {
                    throw new IOException("Empty stanza: \"" + line + "\"");
                }
                triggerNewStanza(stanzaname);
            } else {
                parseContentLine(line);
            }
        }
    }

    /** Tells whether a line is empty, whitespace-only, or starts (after whitespace) with '!'. */
    private static boolean isBlankOrComment(String text) {
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (!Character.isWhitespace(c)) {
                return c == '!';
            }
        }
        return true;
    }

    /** Parses one "tag: value {modifier} ! comment" line and fires the matching listener event. */
    private void parseContentLine(String text) throws IOException {
        SOPair pair = unescape(text, ':', 0, true);
        String name = pair.str;

        // Everything from the first unescaped, unquoted '!' onwards is a comment.
        int lineEnd = findUnescaped(text, '!', 0, text.length(), true);
        if (lineEnd == -1) {
            lineEnd = text.length();
        }

        int trailingEndIndex = findTrailingModifierEnd(text, lineEnd);
        int trailingStartIndex = -1;
        if (trailingEndIndex != -1) {
            // No break here: the scan keeps going and settles on the
            // leftmost unescaped '{' before the closing brace.
            for (int i = trailingEndIndex - 1; i >= 0; i--) {
                if (text.charAt(i) == '{') {
                    if (i >= 1 && text.charAt(i - 1) == '\\') {
                        continue;
                    }
                    trailingStartIndex = i + 1;
                }
            }
        }

        int valueStopIndex;
        if (trailingStartIndex == -1 && trailingEndIndex != -1) {
            throw new IOException("Unterminated trailing modifier. " + text);
        } else if (trailingStartIndex != -1) {
            valueStopIndex = trailingStartIndex - 1;
            String trailing = text.substring(trailingStartIndex, trailingEndIndex).trim();
            // The parsed modifier is not forwarded to listeners; the call
            // still rejects malformed modifiers with an IOException.
            getNestedValue(new NestedValue(), trailing, 0);
        } else {
            valueStopIndex = lineEnd;
        }

        // The tag's ':' may sit after the comment marker or inside the
        // trailing modifier; report that as a parse error rather than letting
        // substring() fail with an unchecked exception.
        if (pair.index >= valueStopIndex) {
            throw new IOException("Tag found with no value " + text);
        }

        String value = text.substring(pair.index + 1, valueStopIndex).trim();
        if (value.length() == 0) {
            throw new IOException("Tag found with no value " + text);
        }

        if (isSynonym(name)) {
            triggerNewSynonym(parseSynonym(name, value));
        } else {
            triggerNewKey(name, value);
        }
    }

    /**
     * Scans backwards from {@code lineEnd} over whitespace and returns the index of an
     * unescaped closing brace if that is the first thing found, or -1 otherwise.
     */
    private static int findTrailingModifierEnd(String text, int lineEnd) {
        for (int i = lineEnd - 1; i >= 0; i--) {
            char c = text.charAt(i);
            if (Character.isWhitespace(c)) {
                continue;
            }
            if (c != '}') {
                return -1;
            }
            if (i >= 1 && text.charAt(i - 1) == '\\') {
                // escaped brace: keep scanning to the left
                continue;
            }
            return i;
        }
        return -1;
    }

    /**
     * Tells whether a tag is routed through {@link #parseSynonym(String, String)}.
     * Only {@code OboFileHandler.SYNONYM} and {@code OboFileHandler.EXACT_SYNONYM}
     * qualify; every other tag is reported as a plain key.
     */
    private boolean isSynonym(String key) {
        return key.equals(OboFileHandler.SYNONYM) || key.equals(OboFileHandler.EXACT_SYNONYM);
    }

    /**
     * Parses the synonym string from the term.
     * value can be:
     * <pre>"ca_bind" RELATED [uniprot:curation]</pre>
     *
     * @param key the tag name; selects the default scope
     * @param value the tag value: quoted text, optional scope, optional category, dbxref list
     * @return the parsed synonym
     * @throws IOException if the quoted text or the dbxref list is missing, or the
     *         tokens between them are not a scope followed by at most one category
     */
    private Synonym parseSynonym(String key, String value) throws IOException {
        int startIndex = findUnescaped(value, '"', 0, value.length());
        if (startIndex == -1) {
            throw new IOException("Expected \"" + line + " " + linenum);
        }

        SOPair p = unescape(value, '"', startIndex + 1, value.length(), true);

        int defIndex = findUnescaped(value, '[', p.index, value.length());
        if (defIndex == -1) {
            throw new IOException("Badly formatted synonym. "
                    + "No dbxref list found." + line + " " + linenum);
        }

        // Whatever sits between the closing quote and '[' is "<scope> [<category>]".
        String leftovers = value.substring(p.index + 1, defIndex).trim();
        StringTokenizer tokenizer = new StringTokenizer(leftovers, " \t");

        int scope = Synonym.RELATED_SYNONYM;
        if (key.equals(OboFileHandler.EXACT_SYNONYM)) {
            scope = Synonym.EXACT_SYNONYM;
        } else if (key.equals(OboFileHandler.BROAD_SYNONYM)) {
            scope = Synonym.BROAD_SYNONYM;
        } else if (key.equals(OboFileHandler.NARROW_SYNONYM)) {
            scope = Synonym.NARROW_SYNONYM;
        }

        String catID = null;
        for (int i = 0; tokenizer.hasMoreTokens(); i++) {
            String token = tokenizer.nextToken();
            if (i == 0) {
                if (token.equals("RELATED")) {
                    scope = Synonym.RELATED_SYNONYM;
                } else if (token.equals("UNSPECIFIED")) {
                    scope = Synonym.RELATED_SYNONYM;
                } else if (token.equals("EXACT")) {
                    scope = Synonym.EXACT_SYNONYM;
                } else if (token.equals("BROAD")) {
                    scope = Synonym.BROAD_SYNONYM;
                } else if (token.equals("NARROW")) {
                    scope = Synonym.NARROW_SYNONYM;
                } else {
                    throw new IOException("Found unexpected scope "
                            + "identifier " + token + line);
                }
            } else if (i == 1) {
                catID = token;
            } else {
                throw new IOException("Expected dbxref list,"
                        + " instead found " + token + line);
            }
        }

        Synonym synonym = new Synonym();
        synonym.setScope(scope);
        synonym.setCategory(catID);
        synonym.setName(p.str);

        // The dbxref list is parsed so that malformed lists are rejected.
        // TODO: attach the parsed xref / desc / nv entries to the synonym.
        getDbxrefList(value, defIndex + 1, value.length());

        return synonym;
    }

    /**
     * Parses a comma separated dbxref list terminated by {@code ']'}.
     *
     * @param line the text containing the list
     * @param startoffset index of the first character after the opening {@code '['}
     * @param endoffset index at which scanning stops (exclusive)
     * @return one map per dbxref with the keys {@code "xref"}, {@code "desc"} and,
     *         when trailing properties were present, {@code "nv"}
     * @throws IOException if the list is not terminated or an entry is malformed
     */
    protected Map<String,Object>[] getDbxrefList(String line, int startoffset, int endoffset) throws IOException {
        List<Map<String, Object>> temp = new ArrayList<Map<String, Object>>();
        boolean stop = false;
        while (!stop) {
            int braceIndex = findUnescaped(line, '{', startoffset, endoffset);
            int endIndex = findUnescaped(line, ',', startoffset, endoffset, true);
            boolean trailing = false;
            if (endIndex == -1) {
                // no further comma: this is the last entry, ended by ']'
                endIndex = findUnescaped(line, ']', startoffset, endoffset, true);
                if (endIndex == -1) {
                    throw new IOException("Unterminated xref list " + line);
                }
                stop = true;
            }
            if (braceIndex != -1 && braceIndex < endIndex) {
                endIndex = braceIndex;
                trailing = true;
            }

            Map<String, Object> pair = parseXref(line, startoffset, endIndex);
            if (pair == null) {
                // empty entry: advance one character and try again
                startoffset++;
                continue;
            }

            if (trailing) {
                NestedValue nv = new NestedValue();
                endIndex = getNestedValue(nv, line, endIndex + 1);
                if (endIndex == -1) {
                    throw new IOException("Badly formatted "
                            + "trailing properties " + line);
                }
                pair.put("nv", nv);
            }

            temp.add(pair);
            startoffset = endIndex + 1;
        }

        // Generic array creation is not possible; the array only ever holds
        // the Map<String, Object> instances collected above.
        @SuppressWarnings("unchecked")
        Map<String, Object>[] out = new HashMap[temp.size()];
        return temp.toArray(out);
    }

    /**
     * Parses a single dbxref of the form {@code xref "description"}.
     *
     * @param line the text containing the dbxref
     * @param startoffset index of the first character of the dbxref
     * @param endoffset index at which the dbxref ends (exclusive)
     * @return a map with the keys {@code "xref"} and {@code "desc"} (the latter
     *         {@code null} when no quoted description is present), or {@code null}
     *         when the xref is empty
     * @throws IOException if an escape sequence is invalid or the description is
     *         not terminated by a quote
     */
    protected Map<String,Object> parseXref(String line, int startoffset, int endoffset) throws IOException {
        String desc_str = null;

        SOPair xref = unescape(line, '"', startoffset, endoffset, false);
        String xref_str = xref.str.trim();
        if (xref_str.length() == 0) {
            return null;
        }

        if (xref.index != -1) {
            SOPair desc = unescape(line, '"', xref.index + 1, endoffset, true);
            desc_str = desc.str.trim();
        }

        Map<String, Object> m = new HashMap<String, Object>();
        m.put("xref", xref_str);
        m.put("desc", desc_str);
        return m;
    }

    /** Notifies every listener of a new stanza header. */
    private void triggerNewStanza(String stanza) {
        for (OboFileEventListener li : listeners) {
            li.newStanza(stanza);
        }
    }

    /** Notifies every listener of a key/value pair. */
    private void triggerNewKey(String key, String value) {
        for (OboFileEventListener li : listeners) {
            li.newKey(key, value);
        }
    }

    /** Notifies every listener of a parsed synonym. */
    private void triggerNewSynonym(Synonym synonym) {
        for (OboFileEventListener li : listeners) {
            li.newSynonym(synonym);
        }
    }

    /**
     * Replaces every character that has an entry in {@link #unescapeChars} by a
     * backslash followed by its escape letter.
     *
     * @param str the text to escape
     * @param escapespaces if {@code false}, spaces and tabs are copied unchanged
     * @return the escaped text
     */
    public static String escape(String str, boolean escapespaces) {
        StringBuilder out = new StringBuilder();
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            Character mapped = unescapeChars.get(c);
            boolean keepLiteral = mapped == null
                    || (!escapespaces && (c == ' ' || c == '\t'));
            if (keepLiteral) {
                out.append(c);
            } else {
                out.append('\\').append(mapped.charValue());
            }
        }
        return out.toString();
    }

    /**
     * Resolves all escape sequences in a string.
     *
     * @param str the text to unescape
     * @return the unescaped text, up to the first unescaped NUL character if any
     * @throws IOException if an escape sequence is unknown or incomplete
     */
    public String unescape(String str) throws IOException {
        return unescape(str, '\0', 0, str.length(), false).str;
    }

    /**
     * Same as {@link #unescape(String, char, int, int, boolean)} with the end of
     * the string as end index.
     */
    public SOPair unescape(String str, char toChar, int startindex,
            boolean mustFindChar) throws IOException {
        return unescape(str, toChar, startindex, str.length(), mustFindChar);
    }

    /**
     * Unescapes {@code str} from {@code startindex} up to, but not including,
     * the first unescaped occurrence of {@code toChar}.
     *
     * @param str the text to read
     * @param toChar the character that ends the scan
     * @param startindex index to start at (inclusive)
     * @param endindex index to stop at (exclusive)
     * @param mustFindChar whether a missing {@code toChar} is an error
     * @return the unescaped text and the index of {@code toChar}, or -1 as index
     *         when it was not found
     * @throws IOException if an escape sequence is unknown or incomplete, or if
     *         {@code toChar} is required but missing
     */
    public SOPair unescape(String str, char toChar, int startindex,
            int endindex, boolean mustFindChar) throws IOException {
        StringBuilder out = new StringBuilder();
        int endValue = -1;
        for (int i = startindex; i < endindex; i++) {
            char c = str.charAt(i);
            if (c == '\\') {
                i++;
                // A backslash as the very last character has nothing to escape.
                if (i >= str.length()) {
                    throw new IOException("Incomplete escape sequence. " + str);
                }
                c = str.charAt(i);
                Character mapchar = escapeChars.get(c);
                if (mapchar == null) {
                    throw new IOException("Unrecognized escape"
                            + " character " + c + " found.");
                }
                out.append(mapchar);
            } else if (c == toChar) {
                endValue = i;
                break;
            } else {
                out.append(c);
            }
        }
        if (endValue == -1 && mustFindChar) {
            throw new IOException("Expected " + toChar + "." + str);
        }
        return new SOPair(out.toString(), endValue);
    }

    /** Searches the whole string; see {@link #findUnescaped(String, char, int, int, boolean)}. */
    public static int findUnescaped(String str, char toChar) {
        return findUnescaped(str, toChar, 0, str.length());
    }

    /** Searches without honoring quotes; see {@link #findUnescaped(String, char, int, int, boolean)}. */
    public static int findUnescaped(String str, char toChar, int startIndex,
            int endIndex) {
        return findUnescaped(str, toChar, startIndex, endIndex, false);
    }

    /**
     * Finds the first occurrence of {@code toChar} that is not preceded by a
     * backslash.
     *
     * @param str the text to search
     * @param toChar the character to look for
     * @param startindex index to start at (inclusive)
     * @param endindex index to stop at (exclusive)
     * @param honorQuotes if {@code true}, characters between double quotes are skipped
     * @return the index of the character, or -1 if it was not found
     */
    public static int findUnescaped(String str, char toChar, int startindex,
            int endindex, boolean honorQuotes) {
        boolean inQuotes = false;
        char quoteChar = '\0';
        for (int i = startindex; i < endindex; i++) {
            char c = str.charAt(i);
            if (c == '\\') {
                // skip the escaped character
                i++;
            } else if (inQuotes) {
                if (c == quoteChar) {
                    inQuotes = false;
                }
            } else if (c == toChar) {
                return i;
            } else if (honorQuotes && isQuote(c)) {
                inQuotes = true;
                quoteChar = c;
            }
        }
        return -1;
    }

    /** Tells whether {@code c} is the backslash that starts an escape sequence. */
    public static boolean isEscapeStarter(char c) {
        return c == '\\';
    }

    /** Tells whether {@code c} is a double quote. */
    public static boolean isQuote(char c) {
        return c == '"';
    }

    /**
     * Empties and returns the shared scratch buffer. The same instance is
     * returned on every call.
     */
    protected StringBuffer getTempBuffer() {
        tempBuffer.delete(0, tempBuffer.length());
        return tempBuffer;
    }

    /**
     * Reads a value that is either enclosed in double quotes or, unquoted,
     * runs up to {@code terminatingChar}. A backslash causes the following
     * character to be copied literally.
     *
     * @param value the text to read from
     * @param startIndex index to start at (inclusive); leading whitespace is skipped
     * @param stopIndex index to stop at (exclusive)
     * @param terminatingChar the character that ends an unquoted value
     * @param requireQuotes whether an unquoted value is an error
     * @param legalEndOfLine whether an unquoted value may run up to {@code stopIndex}
     * @return the value; its {@code index} is {@code startIndex}, its
     *         {@code endIndex} is the index of the closing quote, the index just
     *         before the terminating character, or the index where scanning
     *         stopped. Unquoted values are trimmed.
     * @throws IOException if quotes are required but missing, an escape
     *         sequence is incomplete, or the value is not terminated
     */
    protected SOPair readQuotedString(String value, int startIndex,
            int stopIndex, char terminatingChar, boolean requireQuotes,
            boolean legalEndOfLine) throws IOException {

        char quoteChar = '\0';
        StringBuffer out = getTempBuffer();
        int i = startIndex;
        boolean useQuotes = false;

        for (; i < stopIndex; i++) {
            char c = value.charAt(i);
            // burn through any leading whitespace
            if (Character.isWhitespace(c)) {
                continue;
            }
            if (isQuote(c)) {
                useQuotes = true;
                quoteChar = c;
                i++;
                break;
            }
            // the first non-whitespace character is not a quote:
            // proceed in non-quoted mode
            if (requireQuotes) {
                throw new IOException(
                        "Expected start of quoted string. " + line + " " + value+ " at linenr " + linenum);
            }
            useQuotes = false;
            break;
        }

        // look for a closing quote or final delimiter
        for (; i < stopIndex; i++) {
            char c = value.charAt(i);
            if (isEscapeStarter(c)) {
                i++;
                if (i >= value.length()) {
                    throw new IOException("Incomplete escape sequence. " + line);
                }
                out.append(value.charAt(i));
            } else if ((useQuotes && c == quoteChar)
                    || (!useQuotes && c == terminatingChar)) {
                if (!useQuotes) {
                    return new SOPair(out.toString().trim(), startIndex, i - 1);
                }
                return new SOPair(out.toString(), startIndex, i);
            } else {
                out.append(c);
            }
        }

        if (!useQuotes && legalEndOfLine) {
            return new SOPair(out.toString().trim(), startIndex, i);
        }
        throw new IOException("Unterminated quoted string. " + line);
    }

    /**
     * Parses a comma separated list of {@code name=value} pairs into
     * {@code nv}, reading until the end of {@code str}.
     *
     * @param nv receives the parsed properties
     * @param str the text containing the pairs
     * @param startIndex index of the first pair
     * @return {@code str.length()}
     * @throws IOException if a pair has no {@code '='}, a value is malformed, or
     *         something other than whitespace or a comma follows a value
     */
    protected int getNestedValue(NestedValue nv, String str, int startIndex)
            throws IOException {
        while (startIndex < str.length()) {
            int equalsIndex = findUnescaped(str, '=', startIndex, str.length());
            if (equalsIndex == -1) {
                throw new IOException("Expected = in trailing modifier " + line);
            }
            String name = str.substring(startIndex, equalsIndex).trim();
            SOPair value = readQuotedString(str, equalsIndex + 1,
                    str.length(), ',', false, true);

            Properties pv = new Properties();
            pv.setProperty(unescape(name), value.str);
            nv.addPropertyValue(pv);

            // skip whitespace up to and including the separating comma
            startIndex = value.endIndex + 1;
            for (; startIndex < str.length(); startIndex++) {
                char c = str.charAt(startIndex);
                if (Character.isWhitespace(c)) {
                    continue;
                }
                if (c == ',') {
                    startIndex++;
                    break;
                }
                logger.error("found character |{}|", c);
                throw new IOException("Expected comma in trailing modifier. "
                        + line + " linenr: " + linenum);
            }
        }
        return str.length();
    }
}

/**
 * A bag of string properties attached to a tag value or dbxref (a
 * "trailing modifier"), plus an optional name and suggested comment.
 */
class NestedValue implements Cloneable {

    protected Properties propertyValues = new Properties();

    /** Never assigned by the code in this file. */
    protected String name;

    protected String suggestedComment;

    public NestedValue() {
    }

    /** Renders the properties as {@code NestedValue:  [key:value] [key:value]...}. */
    @Override
    public String toString() {
        StringBuilder txt = new StringBuilder("NestedValue: ");
        for (Object k : propertyValues.keySet()) {
            String key = k.toString();
            String value = propertyValues.get(key).toString();
            txt.append(" [").append(key).append(":").append(value).append("]");
        }
        return txt.toString();
    }

    public String getName() {
        return name;
    }

    /**
     * Returns the live property set (not a copy).
     *
     * @return the properties of this value
     */
    public Properties getPropertyValues() {
        return propertyValues;
    }

    /**
     * Copies every entry of {@code pv} into this value, converting keys and
     * values with {@code toString()}. Existing keys are overwritten.
     *
     * @param pv the properties to add
     */
    public void addPropertyValue(Properties pv) {
        for (Object k : pv.keySet()) {
            String key = k.toString();
            String value = pv.get(key).toString();
            propertyValues.setProperty(key, value);
        }
    }

    /**
     * Returns a copy of this value with its own property set, so changes to
     * the copy's properties do not affect the original.
     *
     * @return the copy
     */
    @Override
    public Object clone() {
        try {
            NestedValue copy = (NestedValue) super.clone();
            copy.propertyValues = (Properties) propertyValues.clone();
            return copy;
        } catch (CloneNotSupportedException ex) {
            // cannot happen: this class implements Cloneable
            throw new AssertionError(ex);
        }
    }

    public String getSuggestedComment() {
        return suggestedComment;
    }

    public void setSuggestedComment(String suggestedComment) {
        this.suggestedComment = suggestedComment;
    }
}