GOParser.java example

Explorer
biojava-master
/*
 *                    BioJava development code
 *
 * This code may be freely distributed and modified under the
 * terms of the GNU Lesser General Public Licence.  This should
 * be distributed with the code.  If you do not have a copy,
 * see:
 *
 *      http://www.gnu.org/copyleft/lesser.html
 *
 * Copyright for this code is held jointly by the individual
 * authors.  These should be listed in @author doc comments.
 *
 * For more information on the BioJava project and its aims,
 * or to join the biojava-l mailing list, visit the home page
 * at:
 *
 *      http://www.biojava.org/
 *
 */

package org.biojava.nbio.ontology.io;

import org.biojava.nbio.ontology.*;

import java.io.BufferedReader;
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

/**
 * Simple parser for the Gene Ontology (GO) flatfile format.
 *
 * @author Thomas Down
 * @since 1.4
 */

public class GOParser {
	public Ontology parseGO(BufferedReader goFile,
							String ontoName,
							String ontoDescription,
							OntologyFactory factory)
		throws ParseException, IOException
	{
		try {
			Ontology onto = factory.createOntology(ontoName, ontoDescription);
			Term isa = onto.importTerm(OntoTools.IS_A, null);
			Term partof = null; // fixme: onto.importTerm(OntoTools.PART_OF, null);
			List<Term> termStack = new ArrayList<Term>();
			String line;
			while ((line = goFile.readLine()) != null) {
				int leadSpaces = 0;
				while (line.charAt(leadSpaces) == ' ') {
					++leadSpaces;
				}
				line = line.trim();
				if (line.startsWith("!")) {
					continue;
				}

				StringTokenizer toke = new StringTokenizer(line, "%<$", true);
				String parentRel = toke.nextToken();
				Term term = parseTerm(onto, toke.nextToken());
				if (parentRel.equals("%")) {
					safeAddTriple(onto, term, termStack.get(leadSpaces - 1), isa);
				} else if (parentRel.equals("<")) {
					safeAddTriple(onto, term, termStack.get(leadSpaces - 1), partof);
				}
				while (toke.hasMoreTokens()) {
					String altRel = toke.nextToken();
					Term altTerm = parseTerm(onto, toke.nextToken());
					if (altRel.equals("%")) {
						safeAddTriple(onto, term, altTerm, isa);
					} else if (altRel.equals("<")) {
						safeAddTriple(onto, term, altTerm, partof);
					}
				}

				if (termStack.size() == leadSpaces) {
					termStack.add(term);
				} else {
					termStack.set(leadSpaces, term);
				}
			}
			return onto;
		} catch (AlreadyExistsException ex) {
			throw new RuntimeException( "Duplication in ontology");
		} catch (OntologyException ex) {
			throw new RuntimeException(ex);
		}
	}

	private void safeAddTriple(Ontology onto, Term s, Term o, Term p)
		throws AlreadyExistsException
	{
		if (!onto.containsTriple(s, o, p)) {
			onto.createTriple(s, o, p, null, null);
		}
	}

	private Term parseTerm(Ontology onto, String s)
		throws ParseException, AlreadyExistsException
	{
		int semi = s.indexOf(';');
		int semi2 = s.indexOf(';', semi + 1);
		if (semi < 0) {
			throw new RuntimeException("No semicolon in " + s);
		}
		String termDesc = s.substring(0, semi).trim();
		String termName;
		if (semi2 < 0) {
			termName = s.substring(semi + 1).trim();
		} else {
			termName = s.substring(semi + 1, semi2).trim();
		}
		StringTokenizer toke = new StringTokenizer(termName, ", ");
		termName = toke.nextToken();
		if (onto.containsTerm(termName)) {
			return onto.getTerm(termName);
		} else {
			Term t = onto.createTerm(termName, termDesc);
			if (toke.hasMoreTokens()) {
				List<String> secondaries = new ArrayList<String>();
				while (toke.hasMoreTokens()) {
					secondaries.add(toke.nextToken());
				}
				t.getAnnotation().setProperty("go.secondary_ids", secondaries);
			}
			return t;
		}
	}
}