/**
*
*/
package org.genedb.db.loading;
import org.apache.log4j.Logger;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Represents the feature table (FT section) of an EMBL file.
*
* @author rh11
*
*/
class FeatureTable extends EmblFile.Section {
/** Logger shared by the feature table and its nested feature classes. */
private static final Logger logger = Logger.getLogger(FeatureTable.class);

/** File path reported by the owning EmblFile when this table was created. */
private String filePath;

/**
 * Creates an empty feature table for the given EMBL file.
 *
 * @param emblFile the file this table belongs to; only its file path is recorded here
 */
FeatureTable(EmblFile emblFile) {
    String path = emblFile.getFilePath();
    this.filePath = path;
}
class Feature {
/** Feature key (for example "CDS"), taken from the key column of the feature's first line. */
String type;
/** Line number that was passed to addData for the first line of this feature. */
int lineNumber;
/** Parsed location; assigned once the complete location string has been read. */
EmblLocation location;
/** Qualifiers of this feature, in the order in which they were parsed. */
List<Qualifier> qualifiers = new ArrayList<Qualifier>();
/**
 * Returns the file path recorded by the enclosing feature table.
 *
 * @return the file path of the table this feature belongs to
 */
public String getFilePath() {
    return FeatureTable.this.filePath;
}
/**
 * Collects the values of every qualifier whose name equals one of the given keys.
 * <p>
 * Values are returned in the order the qualifiers appear on the feature. Qualifiers
 * that are currently ignored for this feature type are skipped. Every qualifier
 * whose value is returned is flagged as used, so that it no longer shows up in
 * the unused-qualifier reports.
 *
 * @param keys the name(s) of the qualifier(s) to look up
 * @return the matching values, possibly empty but never <code>null</code>;
 *          a qualifier without a value contributes a <code>null</code> element
 */
public List<String> getQualifierValues(String... keys) {
    List<String> values = new ArrayList<String>();
    for (Qualifier candidate : qualifiers) {
        for (String wanted : keys) {
            if (!candidate.name.equals(wanted)) {
                continue;
            }
            if (isQualifierIgnored(type, wanted)) {
                continue;
            }
            candidate.used = true;
            values.add(candidate.value);
        }
    }
    return values;
}
/**
 * Tests whether the feature carries the named qualifier.
 * <p>
 * This is implemented as a lookup through {@link #getQualifierValues(String...)},
 * so a matching qualifier is flagged as used, and an ignored qualifier is
 * reported as absent.
 *
 * @param key the name of the qualifier
 * @return <code>true</code> if at least one (non-ignored) qualifier of that name
 *          is present, <code>false</code> otherwise
 */
public boolean hasQualifier(String key) {
    List<String> found = getQualifierValues(key);
    return found.size() > 0;
}
/**
 * Get the (unique) value of the specified qualifier.
 * <p>
 * A qualifier that is repeated with exactly the same value each time is
 * tolerated: a warning is logged and that value is returned. Occurrences
 * without a value (such as a bare <code>/pseudo</code>) are compared
 * null-safely, so a mixture of valueless and valued occurrences counts as
 * "different values" regardless of the order in which they appear.
 *
 * @param key the name of the qualifier
 * @return the value of the qualifier, or <code>null</code> if the qualifier
 *          is not present on the feature (or is present without a value)
 * @throws DataError if the qualifier appears more than once with differing values
 */
public String getQualifierValue(String key) throws DataError {
    List<String> values = getQualifierValues(key);
    if (values.isEmpty()) {
        return null;
    }

    String first = values.get(0);
    if (values.size() == 1) {
        return first;
    }

    // Compare every occurrence against the first one. Using the first element as the
    // reference (rather than a null sentinel) keeps valueless qualifiers from being
    // mistaken for "nothing seen yet".
    for (String value : values) {
        boolean sameAsFirst = (first == null) ? (value == null) : first.equals(value);
        if (!sameAsFirst) {
            throw new DataError(String.format("%s:The qualifier '%s' appears more than once in feature '%s' at line %d (with different values)",
                this.getFilePath(), key, type, lineNumber));
        }
    }

    // If the qualifier is simply repeated, with the same value, that's okay.
    logger.warn(String.format("The qualifier /%s=\"%s\" is repeated in feature '%s' at line %d", key, first, type, lineNumber));
    return first;
}
/**
 * Lists the qualifiers of this feature that have not been flagged as used,
 * each rendered through its <code>toString</code> form. Identical renderings
 * are collapsed because the result is backed by a set.
 *
 * @return the distinct string forms of the unused qualifiers, in no particular order
 */
public Iterable<String> getUnusedQualifiers() {
    Set<String> rendered = new HashSet<String>();
    for (Qualifier candidate : qualifiers) {
        if (candidate.used) {
            continue;
        }
        rendered.add(candidate.toString());
    }
    return rendered;
}
/**
 * Lists the names of the qualifiers of this feature that have not been
 * flagged as used. Each name appears at most once.
 *
 * @return the distinct names of the unused qualifiers, in no particular order
 */
public Collection<String> getUnusedQualifierNames() {
    Set<String> names = new HashSet<String>();
    for (Qualifier candidate : qualifiers) {
        if (candidate.used) {
            continue;
        }
        names.add(candidate.name);
    }
    return names;
}
/**
 * Renders the feature as "<type> at <location>: <qualifiers>", with the
 * qualifiers separated by "; ".
 */
@Override
public String toString() {
    StringBuilder joined = new StringBuilder();
    for (Qualifier qualifier : qualifiers) {
        // Only put a separator in front once something has been written.
        String separator = (joined.length() == 0) ? "" : "; ";
        joined.append(separator).append(qualifier);
    }
    return String.format("%s at %s: %s", type, location, joined);
}
/**
 * Determines the uniqueName of the transcript this feature represents or
 * corresponds to, failing if none can be found.
 * This makes sense for CDS and UTR features, at least.
 * <p>
 * Equivalent to {@link #getUniqueName(boolean) getUniqueName(true)}: the name is
 * taken from <code>/temporary_systematic_id</code> or <code>/systematic_id</code>,
 * falling back to <code>/FEAT_NAME</code> and then <code>/locus_tag</code>.
 *
 * @return the unique name of the feature, never <code>null</code>
 * @throws DataError if both <code>/systematic_id</code> and
 *          <code>/temporary_systematic_id</code> are present, if none of the
 *          candidate qualifiers is present, or if one of them is repeated
 *          with differing values
 */
public String getUniqueName() throws DataError {
    final boolean failIfNotFound = true;
    return getUniqueName(failIfNotFound);
}
/**
 * Determines the uniqueName of the transcript this feature represents or
 * corresponds to.
 * This makes sense for CDS and UTR features, at least.
 * <p>
 * Candidate qualifiers, in order of preference:
 * <code>/temporary_systematic_id</code>, <code>/systematic_id</code>,
 * <code>/FEAT_NAME</code>, <code>/locus_tag</code>. Falling back to either of
 * the last two is logged as a warning. All whitespace is removed from the
 * chosen value, because an id that wraps onto a second line in the EMBL file
 * acquires a space at the join.
 *
 * @param failIfNotFound whether to throw a DataError if no suitable
 *          name can be found
 * @return the unique name of the feature, or <code>null</code> if no suitable
 *          qualifier is present and <code>failIfNotFound</code> is false
 * @throws DataError if both <code>/systematic_id</code> and
 *          <code>/temporary_systematic_id</code> are present; if one of the
 *          candidate qualifiers is repeated with differing values; or, when
 *          <code>failIfNotFound</code> is true, if none of them is present
 */
public String getUniqueName(boolean failIfNotFound) throws DataError {
    // All four lookups are performed up front, whichever one ends up being used:
    // each lookup flags its qualifier as used and may itself raise a DataError.
    String temporaryId = this.getQualifierValue("temporary_systematic_id");
    String permanentId = this.getQualifierValue("systematic_id");
    String featName = this.getQualifierValue("FEAT_NAME");
    String locusTag = this.getQualifierValue("locus_tag");

    if (temporaryId != null && permanentId != null) {
        throw new DataError(
            String.format("%s feature has both /systematic_id and /temporary_systematic_id", this.type));
    }

    if (temporaryId != null) {
        // When ids wrap lines in the EMBL file a space is introduced
        return temporaryId.replaceAll("\\s","").trim();
    }

    if (permanentId != null) {
        // When ids wrap lines in the EMBL file a space is introduced
        return permanentId.replaceAll("\\s","").trim();
    }

    if (featName != null) {
        // When ids wrap lines in the EMBL file a space is introduced
        String cleaned = featName.replaceAll("\\s","").trim();
        logger.warn(
            String.format("%s feature has neither /systematic_id nor /temporary_systematic_id; " +
                "using /FEAT_NAME=\"%s\"", this.type, cleaned));
        return cleaned;
    }

    if (locusTag != null) {
        // When ids wrap lines in the EMBL file a space is introduced
        String cleaned = locusTag.replaceAll("\\s","").trim();
        logger.warn(
            String.format("%s feature has no /systematic_id or /FEAT_NAME; using /locus_tag=\"%s\"",
                this.type, cleaned));
        return cleaned;
    }

    if (failIfNotFound) {
        throw new DataError(
            String.format("%s feature has none of /systematic_id, /temporary_systematic_id, /FEAT_NAME or /locus_tag", this.type));
    }
    return null;
}
}
class CDSFeature extends Feature {
/**
 * Works out the gene name from the qualifiers present.
 * <p>
 * Preference order: <code>/primary_name</code>, then <code>/FEAT_NAME</code>,
 * then <code>/gene</code> (only when it appears exactly once). A
 * <code>/gene</code> value is cut off at its first semicolon, if it has one.
 *
 * @return the gene name, or <code>null</code> if we couldn't find one
 * @throws DataError if <code>/primary_name</code> or <code>/FEAT_NAME</code>
 *          is repeated with differing values
 */
public String getGeneName() throws DataError {
    // The three lookups are done before any decision is taken, so each of them
    // flags its qualifier as used (and may throw) regardless of which one wins.
    String primaryName = getQualifierValue("primary_name");
    List<String> geneValues = getQualifierValues("gene");
    String featName = getQualifierValue("FEAT_NAME");

    if (primaryName != null) {
        return primaryName;
    }
    if (featName != null) {
        return featName;
    }
    if (geneValues.size() != 1) {
        return null;
    }

    String gene = geneValues.get(0);
    // S. mansoni has some qualifiers of the form /gene="RPN1; ORFNames=CaO19.4956;"
    int semicolonAt = gene.indexOf(';');
    return (semicolonAt < 0) ? gene : gene.substring(0, semicolonAt);
}
/**
 * Returns the uniqueName of the gene when this CDS feature represents an
 * alternative splice-form, or <code>null</code> when it represents a
 * singly-spliced gene.
 * <p>
 * An explicit <code>/shared_id</code> wins. Failing that, the presence of any
 * <code>/other_transcript</code> qualifier marks the feature as alternately
 * spliced, and the gene name is derived by stripping a trailing
 * <code>.&lt;n&gt;</code> from the transcript's unique name.
 *
 * @return the <code>uniqueName</code> of the associated gene, or <code>null</code>
 *          if this is a singly-spliced gene
 * @throws DataError if the unique name cannot be determined, or if the feature
 *          has <code>/other_transcript</code> but no <code>/shared_id</code> and
 *          its unique name does not end with <code>.&lt;n&gt;</code>
 */
public String getSharedId() throws DataError {
    // Resolved first, as in the original flow: a feature without a usable
    // unique name fails here even if it carries /shared_id.
    String transcriptName = getUniqueName();

    String explicitSharedId = getQualifierValue("shared_id");
    if (explicitSharedId != null) {
        return explicitSharedId;
    }

    List<String> otherTranscripts = getQualifierValues("other_transcript");
    if (otherTranscripts.isEmpty()) {
        // We have neither /shared_id nor /other_transcript, so
        // assume this is a singly-spliced gene.
        return null;
    }

    // An alternately-spliced transcript does not always have a /shared_id qualifier.
    // Sometimes there are just a selection of /other_transcript qualifiers. In that
    // case, we try to take the stem of the transcript ID.
    Matcher stemMatcher = Pattern.compile("(.*)\\.\\d+").matcher(transcriptName);
    if (stemMatcher.matches()) {
        String derivedSharedId = stemMatcher.group(1);
        logger.info(String.format("[CDS %s] assuming /shared_id of '%s'", transcriptName, derivedSharedId));
        return derivedSharedId;
    }

    throw new DataError (String.format(
        "Alternately-spliced transcript '%s' has no /shared_id qualifier, and its systematic name doesn't end with .<n>",
        transcriptName));
}
/**
 * Tests whether this CDS feature represents a pseudogenic transcript,
 * i.e. whether it carries a (non-ignored) <code>/pseudo</code> qualifier.
 *
 * @return <code>true</code> if it represents a pseudogenic transcript,
 *          <code>false</code> if it doesn't.
 */
public boolean isPseudo() {
    boolean pseudo = hasQualifier("pseudo");
    return pseudo;
}
/**
 * Tests whether this feature is marked obsolete, i.e. whether one of its
 * <code>/note</code> qualifiers has the exact value <code>obsolete</code>.
 *
 * @return <code>true</code> if such a note is present,
 *          <code>false</code> if it isn't.
 */
public boolean isObsolete() {
    List<String> notes = getQualifierValues("note");
    return notes.contains("obsolete");
}
}
/**
 * A single qualifier of a feature: a name, an optional value, and a record of
 * whether the value was written as a quoted string and whether any lookup has
 * consumed it yet.
 */
private class Qualifier {
    String name;
    String value;
    boolean valueIsQuoted;
    /** Set by the lookup methods of Feature once the value has been handed out. */
    private boolean used = false;

    /**
     * @param name the qualifier name, without the leading slash
     * @param value the qualifier value, or <code>null</code> if there is none
     * @param valueIsQuoted whether the value was a quoted string in the file
     */
    public Qualifier(String name, String value, boolean valueIsQuoted) {
        this.name = name;
        this.value = value;
        this.valueIsQuoted = valueIsQuoted;
    }

    /**
     * Creates a qualifier that has no value, e.g. <code>/pseudo</code>.
     *
     * @param name the qualifier name, without the leading slash
     */
    public Qualifier(String name) {
        this(name, null, false);
    }

    /**
     * Renders the qualifier in feature-table syntax: <code>/name="value"</code>
     * (with embedded quotes doubled) for quoted values, <code>/name</code> when
     * there is no value, and <code>/name=value</code> otherwise.
     */
    @Override
    public String toString() {
        if (valueIsQuoted) {
            String escaped = value.replaceAll("\"", "\"\"");
            return String.format("/%s=\"%s\"", name, escaped);
        }
        if (value == null) {
            return String.format("/%s", name);
        }
        return String.format("/%s=%s", name, value);
    }
}
/** Every completed feature, in the order it was parsed (ignored types included). */
private List<Feature> features = new ArrayList<Feature>();

/**
 * Returns the parsed features whose type is not currently ignored. A fresh
 * list is built on every call, and each skipped feature is logged at INFO level.
 *
 * @return the non-ignored features, in parse order
 */
public Iterable<Feature> getFeatures() {
    List<Feature> kept = new ArrayList<Feature>();
    for (Feature candidate : features) {
        if (!isFeatureIgnored(candidate)) {
            kept.add(candidate);
            continue;
        }
        logger.info(String.format("Ignoring '%s' feature at line %d", candidate.type, candidate.lineNumber));
    }
    return kept;
}
/** The feature currently being assembled, or null before the first feature line. */
private Feature currentFeature = null;
/** Accumulates a location that is split over several lines; null when none is pending. */
private StringBuilder currentLocation = null;

/**
 * Consumes one line of the feature table.
 * <p>
 * The first 16 characters of <code>data</code> form the feature-key column and
 * the remainder is the location or qualifier text. A non-blank key starts a new
 * feature (completing the previous one); a blank key marks a continuation line,
 * which is treated as more location text if a multi-line location is pending and
 * as qualifier text otherwise.
 *
 * @param lineNumber the line number to record against a newly started feature
 * @param data the content of the line
 * @throws ParsingException if the line is shorter than the key column, if a
 *          continuation line arrives before any feature has been started, if a
 *          new feature starts while a location or quoted qualifier value is
 *          still incomplete, or if the location or qualifier text cannot be parsed
 */
@Override
public void addData(int lineNumber, String data) throws ParsingException {
    final int keyColumnWidth = 16;
    if (data.length() < keyColumnWidth) {
        // Without this check substring() would fail with an unchecked
        // StringIndexOutOfBoundsException instead of a parsing error.
        throw new SyntaxError(String.format(
            "Feature table line is too short (%d characters): '%s'", data.length(), data));
    }

    String featureType = data.substring(0, keyColumnWidth).trim();
    String featureData = data.substring(keyColumnWidth);

    if ("".equals(featureType)) {
        // continuation of current feature
        if (currentFeature == null) {
            throw new SyntaxError("Continuation line found before any feature");
        }
        if (currentLocation != null) {
            parseLocationLine(featureData);
        } else {
            parseQualifierLine(featureData);
        }
    } else {
        if (currentLocation != null) {
            throw new SyntaxError("Feature found while location incomplete");
        }
        if (currentQualifier != null) {
            throw new SyntaxError("Feature found while qualifier incomplete");
        }

        // Store the feature that has just ended before starting the next one.
        finished();

        currentFeature = featureType.equals("CDS") ? new CDSFeature() : new Feature();
        currentFeature.lineNumber = lineNumber;
        currentFeature.type = featureType;

        if (featureData.endsWith(",")) {
            // Location is split over multiple lines
            currentLocation = new StringBuilder(featureData);
        } else {
            currentFeature.location = EmblLocation.parse(featureData);
        }
    }
}
/*
 * Add the current feature to the list of features.
 * Called at the end of each feature:
 * we call it from addData when another feature is encountered,
 * and the EmblFile parser calls it at the end of the feature table.
 *
 * The current feature is cleared once it has been stored, so calling this
 * method again without an intervening feature line is a no-op rather than
 * adding the same feature a second time.
 */
@Override
public void finished() {
    if (currentFeature != null) {
        features.add(currentFeature);
        currentFeature = null;
    }
}
/**
 * Appends a continuation line to the pending multi-line location. A line that
 * ends with a comma means more is to come; any other line completes the
 * location, which is then parsed and assigned to the current feature.
 *
 * @param line the location text of the continuation line
 * @throws ParsingException if the completed location cannot be parsed
 */
private void parseLocationLine(String line) throws ParsingException {
    currentLocation.append(line);

    boolean moreToCome = line.endsWith(",");
    if (moreToCome) {
        return;
    }

    String completeLocation = currentLocation.toString();
    currentFeature.location = EmblLocation.parse(completeLocation);
    currentLocation = null;
}
/**
 * Regular expression for a qualifier name: word characters, single quote,
 * asterisk, plus and hyphen, with at least one letter somewhere.
 * The hyphen is placed last in each character class so that it is a literal;
 * written between '*' and '+' it would form a range covering only those two
 * characters and would not match a hyphen at all.
 */
private static final String symbolPattern = "[\\w'*+-]*[A-Za-z][\\w'*+-]*";
/** Matches a whole qualifier line: group 1 is the name, group 2 the text after '=' (if any). */
static final Pattern qualifierPattern = Pattern.compile("/(" + symbolPattern + ")(?:=(.*))?");
/** Matches a quoted string with no quote characters inside; group 1 is its content. */
static final Pattern quotedStringPattern = Pattern.compile("\"([^\"]*)\"");
/** Name of the qualifier whose quoted value is still open, or null if none is. */
private String currentQualifier = null;
/** Accumulated content of the open quoted value, or null if none is open. */
private StringBuilder currentString = null;
/**
 * Parses one line of qualifier text for the current feature.
 * <p>
 * If a quoted value is still open from an earlier line, the line is appended
 * to it (joined with a single space); an odd number of double-quotes on the
 * line closes the value. Otherwise the line must be a qualifier of one of the
 * forms <code>/name</code>, <code>/name=value</code>, <code>/name="value"</code>
 * or <code>/name="start of a value continued on later lines</code>. Inside a
 * quoted value, a doubled quote (<code>""</code>) stands for one literal quote,
 * including when the whole value sits on a single line.
 *
 * @param data the qualifier text of the line
 * @throws ParsingException if the line is not a well-formed qualifier or the
 *          quoting is inconsistent
 */
private void parseQualifierLine(String data) throws ParsingException {
    if (currentString != null) {
        // There's a quoted string on a previous line that hasn't been closed
        currentString.append(' ');
        if (quotesMatch(data)) {
            // The string continues on the next line
            currentString.append(data.replaceAll("\"\"", "\""));
        }
        else {
            // This is the last line of the string
            if (! data.endsWith("\"")) {
                throw new SyntaxError("Failed to parse string data: unbalanced quotes");
            }
            currentString.append(data.substring(0, data.length() - 1).replaceAll("\"\"", "\""));
            currentFeature.qualifiers.add(new Qualifier(currentQualifier, currentString.toString(), true));
            currentQualifier = null;
            currentString = null;
        }
        return;
    }

    // We are not in the middle of a quoted string, so expect a qualifier
    Matcher qualifierMatcher = qualifierPattern.matcher(data);
    if (!qualifierMatcher.matches()) {
        throw new SyntaxError(String.format("Expected a qualifier, found '%s'", data));
    }
    String qualifierName = qualifierMatcher.group(1);
    String qualifierData = qualifierMatcher.group(2);

    if (qualifierData == null) {
        // e.g. /pseudo
        currentFeature.qualifiers.add(new Qualifier(qualifierName));
        return;
    }

    Matcher quotedStringMatcher = quotedStringPattern.matcher(qualifierData);
    if (quotedStringMatcher.matches()) {
        // Quoted string all on this line, like /foo="bar"
        currentFeature.qualifiers.add(new Qualifier(qualifierName, quotedStringMatcher.group(1).replaceAll("\"\"", "\""), true));
    } else if (qualifierData.startsWith("\"")) {
        if (quotesMatch(qualifierData)) {
            // An even number of quotes means the string is closed on this line.
            // That is only legal if the closing quote is the last character and
            // every quote in between is part of a doubled pair,
            // e.g. /foo="bar ""baz"" qux"
            if (!isCompleteQuotedString(qualifierData)) {
                throw new SyntaxError("Failed to parse string data: unbalanced quotes");
            }
            String inner = qualifierData.substring(1, qualifierData.length() - 1);
            currentFeature.qualifiers.add(new Qualifier(qualifierName, inner.replaceAll("\"\"", "\""), true));
        } else {
            // Quoted string that continues on the next line, e.g. /foo="bar "" baz ...\n
            currentQualifier = qualifierName;
            currentString = new StringBuilder(qualifierData.substring(1).replaceAll("\"\"", "\""));
        }
    } else {
        // Not a quoted string. Treat the qualifier value as a simple identifier.
        currentFeature.qualifiers.add(new Qualifier(qualifierName, qualifierData, false));
    }
}

/**
 * Is the text a complete quoted string, i.e. does it start and end with a
 * double-quote, with every double-quote in between belonging to a doubled pair?
 */
private static boolean isCompleteQuotedString(String text) {
    if (text.length() < 2 || !text.startsWith("\"") || !text.endsWith("\"")) {
        return false;
    }
    String inner = text.substring(1, text.length() - 1);
    // Removing the doubled pairs left-to-right must leave no quote behind.
    return inner.replace("\"\"", "").indexOf('"') < 0;
}
/**
 * Reports whether the number of double-quote characters in the string is even.
 *
 * @param string the text to inspect
 * @return <code>true</code> if the string contains an even number of
 *          double-quotes (including none), <code>false</code> if the number is odd
 */
static boolean quotesMatch(String string) {
    int quoteCount = 0;
    for (int i = 0; i < string.length(); i++) {
        if (string.charAt(i) == '"') {
            quoteCount++;
        }
    }
    return quoteCount % 2 == 0;
}
/** Feature types that getFeatures() leaves out. */
private Set<String> ignoredFeatureTypes = new HashSet<String>();
/** Qualifier names that are ignored on every feature type. */
private Set<String> ignoredQualifiers = new HashSet<String>();
/** Qualifier names that are ignored on one feature type only, keyed by that feature type. */
private Map<String,Set<String>> ignoredQualifiersByFeatureType
= new HashMap<String,Set<String>>();
/**
 * Tests whether the type of the given feature has been registered as ignored.
 *
 * @param feature the feature to test
 * @return <code>true</code> if its type is in the set of ignored feature types
 */
private boolean isFeatureIgnored(Feature feature) {
    String featureType = feature.type;
    return ignoredFeatureTypes.contains(featureType);
}
/**
 * Ignore every feature of the named type, i.e. leave such features out of
 * the result of {@link #getFeatures()}. The request is logged at INFO level.
 *
 * @param featureType the type of feature to ignore
 */
public void ignoreFeature(String featureType) {
    String message = String.format("Ignoring features of type '%s'", featureType);
    logger.info(message);
    ignoredFeatureTypes.add(featureType);
}
/**
 * Tests whether a qualifier is ignored, either on all feature types or on
 * the given feature type specifically.
 *
 * @param featureType the type of the feature the qualifier appears on
 * @param qualifier the name of the qualifier
 * @return <code>true</code> if the qualifier is ignored for that feature type
 */
private boolean isQualifierIgnored(String featureType, String qualifier) {
    if (ignoredQualifiers.contains(qualifier)) {
        return true;
    }
    synchronized(ignoredQualifiersByFeatureType) {
        // Entries are only ever created with a non-null set, so a null result
        // here means the feature type has no entry.
        Set<String> ignoredForType = ignoredQualifiersByFeatureType.get(featureType);
        return ignoredForType != null && ignoredForType.contains(qualifier);
    }
}
/**
 * Ignore the named qualifier on every feature type, i.e. do not return any
 * values for it from
 * {@link Feature#getQualifierValue(String)}
 * or {@link Feature#getQualifierValues(String...)}. Because ignored qualifiers
 * are never flagged as used by those lookups, they continue to be reported by
 * {@link Feature#getUnusedQualifiers()}
 * and {@link Feature#getUnusedQualifierNames()}.
 *
 * @param qualifier the name of the qualifier to ignore
 */
public void ignoreQualifier(String qualifier) {
    String message = String.format("Ignoring qualifier /%s on all feature types", qualifier);
    logger.info(message);
    ignoredQualifiers.add(qualifier);
}
/**
 * Ignore the named qualifier when it appears on a feature of the specified type,
 * i.e. do not return any values for it from
 * {@link Feature#getQualifierValue(String)}
 * or {@link Feature#getQualifierValues(String...)}. Because ignored qualifiers
 * are never flagged as used by those lookups, they continue to be reported by
 * {@link Feature#getUnusedQualifiers()}
 * and {@link Feature#getUnusedQualifierNames()}.
 *
 * @param qualifier the name of the qualifier to ignore
 * @param featureType the type of feature on which to ignore the named qualifier
 */
public void ignoreQualifier(String qualifier, String featureType) {
    logger.info(String.format("Ignoring qualifier /%s on '%s' features", qualifier, featureType));
    synchronized(ignoredQualifiersByFeatureType) {
        Set<String> ignoredForType = ignoredQualifiersByFeatureType.get(featureType);
        if (ignoredForType == null) {
            // First qualifier registered for this feature type
            ignoredForType = new HashSet<String>();
            ignoredQualifiersByFeatureType.put(featureType, ignoredForType);
        }
        ignoredForType.add(qualifier);
    }
}
/**
 * Reset the list of ignored qualifiers, so that no qualifier
 * is ignored, neither globally nor for any particular feature type.
 * Ignored feature types are not affected.
 * <p>
 * The per-feature-type map is cleared while holding the same lock that the
 * other methods of this class take when they read or modify it.
 */
public void resetIgnoredQualifiers() {
    ignoredQualifiers.clear();
    synchronized(ignoredQualifiersByFeatureType) {
        ignoredQualifiersByFeatureType.clear();
    }
}
}