/*
* Copyright (c) 2003-2012 Fred Hutchinson Cancer Research Center
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.fhcrc.cpl.toolbox.proteomics;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
/**
* User: peter@labkey.com
* Date: Nov 18, 2007
* Time: 11:52:56 PM
*
* this class implements a regular expression-based recognition of identifiers parsed from the fasta files.
*
*
*/
public class IdPattern
{
private String _typeName;
private Pattern _pattern;
private String _strReplace;
private String _typeAfter;
private SortedMap<Integer, Pattern> _mapReplPatterns =null;
// poplulate the map of id patterns
public static final Map<String,IdPattern> ID_PATTERN_MAP = new HashMap<String,IdPattern>();
public static final List<String> UNTYPED_ID_PATTERN_LIST = new ArrayList<String>();
public static final List<String> TYPED_ID_PATTERN_LIST = new ArrayList<String>();
public static final List<String> WHOLE_HEADER_ID_PATTERN_LIST = new ArrayList<String>();
static
{
addUntypedIdPattern("IPI", "IPI..*");
addUntypedIdPattern("COG", "COG[0-9][0-9][0-9][0-9][0-9]");
addUntypedIdPattern("SwissProt", "[A-Z,0-9]{3,6}_[A-Z,0-9]{3,5}");
addUntypedIdPattern("SwissProtAccn", "[A-Z][0-9][A-Z,0-9][A-Z,0-9][A-Z,0-9][0-9]", null, "SwissProt");
addUntypedIdPattern("UniRef100", "UniRef100_([A-Z][0-9][A-Z,0-9][A-Z,0-9][A-Z,0-9][0-9])", "$1", null);
//TypedID patterns may still need tranformation
addTypedIdPattern("SI", "([A-Z,0-9]+)_.*", "$1");
addWholeHeaderIdPattern("ENSEMBL", "^([YRQ][-A-Z,0-9]{4,9})[ ][A-Z]{1}[A-Z,0-9,-]{3,9}[ ]SGDID:[S][0-9]{9}.*","$1");
// According to Phil, SGD gene names may have a trailing letter or a trailing dash and letter
addWholeHeaderIdPattern("SGD_GN", "^[YRQ][-A-Z,0-9]{4,9}[ ]([A-Z]{3}[0-9]+-?[A-Z]?)[ ]SGDID:[S][0-9]{9}.*","$1");
addWholeHeaderIdPattern("SGDID", "^[YRQ][-A-Z,0-9]{4,9}[ ][A-Z]{3}[0-9]+-?[A-Z]?[ ]SGDID:([S][0-9]{9}).*","$1");
addWholeHeaderIdPattern("GN", ".*Gene_Symbol=([^ ]*).*","$1");
}
/**
* This object handles "unqualified" tokens in the fasta header line; ie those that aren't prefaced
* with a <identtype>| that is in the Protein.IdentTypeMap. It could also be used to verify
* qualified token values, but it would need to handle semi-colon delimited id sets and also handle
* two-part tokens differntly.
*
* @param type: the type name for the identifier, from the set of values of the Protein.IdentTypeMap
* @param match a regular expression which matches identifiers of that type
* @param replace an optional replacement string for use when the identifer needs to be extracted or built up
* from the string being tested. Can use regular expression capture groups $0 through $9, where
* $0 matches the entire identifier token
* @param following an optional type name of an identifier that normally follows this type of identifer. Used
* for example for Swissprot syntax of <accessionId>|<sprot_name>
*
* @throws PatternSyntaxException
*/
public IdPattern(String type, String match, String replace, String following) throws PatternSyntaxException
{
_typeName = type;
_pattern = Pattern.compile(match);
_strReplace = replace;
_typeAfter = following;
if (null != _strReplace)
{
_mapReplPatterns = new TreeMap<Integer,Pattern>();
int c=0;
while (c <= 9)
{
String exp = "\\$" + c;
Pattern r = Pattern.compile(exp);
if (c>0 && !r.matcher(_strReplace).find())
break;
_mapReplPatterns.put(c, r);
c++;
}
}
}
public Map<String, Set<String>> getIdFromPattern(String[] tokens, int idx)
{
String idValue=tokens[idx];
Map<String, Set<String>> idAfter=null;
Matcher matcher = _pattern.matcher(idValue);
if (!matcher.matches())
return null;
if ((null!= _typeAfter) && (tokens.length-1 > idx))
{
idAfter = ID_PATTERN_MAP.get(_typeAfter).getIdFromPattern(tokens, idx + 1);
if (null==idAfter)
return null;
}
if (null!= _mapReplPatterns)
{
idValue = _strReplace;
for (Integer ref : _mapReplPatterns.keySet())
{
Matcher matchRepl = _mapReplPatterns.get(ref).matcher(idValue);
if (ref <= matcher.groupCount())
idValue = matchRepl.replaceAll(matcher.group(ref));
}
}
return addIdMap(createIdMap(_typeName, idValue), idAfter);
}
/**
* Adds the contents of one identifer map to another
*
* @param mapExisting an identifier map to be added to
* @param mapNew an identifier map, may be null or empty
* @return always returns a map, but may be empty.
*/
public static Map<String, Set<String>> addIdMap(Map<String, Set<String>> mapExisting, Map<String, Set<String>> mapNew)
{
if (null != mapNew)
{
if (null == mapExisting || mapExisting.size()==0)
return mapNew;
for (String key : mapNew.keySet())
{
if (mapExisting.containsKey(key))
{
Set<String> vals = mapExisting.get(key);
vals.addAll(mapNew.get(key));
}
else
mapExisting.put(key, mapNew.get(key));
}
}
return mapExisting;
}
/**
* Method to create an identifier map from a key, value pair. Value can be
* semicolon dilimited.
*
* @param key type of the identifier, from the IdentMap for the type abbreviation key
* @param value the value of the identifier, can be a semi-colon divided list
* @return an idMapStructure. does not return null, returns an empty map if val is null or blank.
*/
public static Map<String, Set<String>> createIdMap(String key, String value)
{
Map<String, Set<String>> idMap = new HashMap<String, Set<String>>();
Set<String> vals = new HashSet<String>();
if (null!=value)
{
String[] valArray = value.split(";");
for (String v : valArray)
{
v = v.trim();
if (v.length() > 50)
v = v.substring(0, 50);
if (v.length()>0)
vals.add(v);
}
if (vals.size() > 0)
idMap.put(key, vals);
}
return idMap;
}
private static void addUntypedIdPattern(String typeName, String pattern)
{
addUntypedIdPattern(typeName, pattern, null, null);
}
private static void addUntypedIdPattern(String typeName, String pattern, String replace, String typeAfter)
{
addIdPattern(typeName, pattern, replace, typeAfter);
UNTYPED_ID_PATTERN_LIST.add(typeName);
}
private static void addTypedIdPattern(String typeName, String pattern, String replace)
{
addIdPattern(typeName, pattern, replace, null);
TYPED_ID_PATTERN_LIST.add(typeName);
}
private static void addWholeHeaderIdPattern(String typeName, String pattern, String replace)
{
addIdPattern(typeName, pattern, replace, null);
WHOLE_HEADER_ID_PATTERN_LIST.add(typeName);
}
private static void addIdPattern(String typeName, String match, String replace, String typeAfter)
{
try
{
IdPattern idPattern = new IdPattern(typeName, match, replace, typeAfter);
ID_PATTERN_MAP.put(typeName, idPattern);
}
catch (PatternSyntaxException e)
{
throw new RuntimeException(e);
}
}
}