package org.phenoscape.util; import java.util.HashSet; import java.util.List; import java.util.Set; import org.apache.log4j.Logger; import org.obo.datamodel.OBOClass; import org.phenoscape.model.Character; import org.phenoscape.model.DataSet; import org.phenoscape.model.MultipleState; import org.phenoscape.model.Specimen; import org.phenoscape.model.State; import org.phenoscape.model.Taxon; /** * @author Jim Balhoff */ public class DataMerger { /** * Merge values from a data set into an existing data set. Merges characters, * taxa, matrix data, and document data, in that order. */ public static void mergeDataSets(DataSet newData, DataSet existingData) { mergeCharacters(newData, existingData); mergeTaxa(newData, existingData); mergeMatrix(newData, existingData); mergeDocumentData(newData, existingData); } /** * Merge characters and EQ annotations from a data set into an existing data set. The * "Character Number" and "State Number" columns are used to match a character (by index) and state (by symbol) * in the existing data set. If the index falls outside the current range of characters, * a new character is appended to the existing data set. If a state with the given symbol does * not exist, a new state is appended to the given character. Description information for the new * characters overwrites that of the existing characters. EQ annotations are replaced. */ public static void mergeCharacters(DataSet newData, DataSet existingData) { for (int i = 0; i < newData.getCharacters().size(); i++) { final Character newCharacter = newData.getCharacters().get(i); if (newCharacter == null) continue; // this handles a "sparse" collection of characters - only certain indexes represented if (i >= existingData.getCharacters().size()) { //add character existingData.addCharacter(newCharacter); } else { //merge character data final Character character = existingData.getCharacters().get(i); character.setLabel(newCharacter.getLabel()); for (State newState : newCharacter.getStates()) { final State state = findState(character.getStates(), newState.getSymbol()); if (state != null) { state.setLabel(newState.getLabel()); state.getPhenotypes().clear(); state.getPhenotypes().addAll(newState.getPhenotypes()); } else { character.addState(newState); } } } } } /** * Merge taxa from a dataset into an existing data set. Taxa are matched * via their "Publication Name" first. TTO identifiers and specimen lists are applied to the matched * taxa. Any taxa in the tab file that do not match a taxon in the existing data set are added * to the data set. If a match wasn't found by publication name, taxa are matched by TTO identifer. */ public static void mergeTaxa(DataSet newData, DataSet existingData) { final List<Taxon> importedTaxa = newData.getTaxa(); for (Taxon importedTaxon : importedTaxa) { final Taxon pubNameMatch = findTaxon(existingData.getTaxa(), importedTaxon.getPublicationName()); final Taxon existingTaxon = (pubNameMatch == null) ? (findTaxon(existingData.getTaxa(), importedTaxon.getValidName())) : pubNameMatch; if (existingTaxon != null) { existingTaxon.setValidName(importedTaxon.getValidName()); existingTaxon.setComment(importedTaxon.getComment()); for (Specimen specimen : importedTaxon.getSpecimens()) { existingTaxon.addSpecimen(specimen); } } else { existingData.addTaxon(importedTaxon); } } } /** * Merge matrix values into an existing data set. Characters are matched via their index. * Only matrix values for existing taxa and characters are applied. Values are matched by comparing * the symbol - if a state with that symbol is not available for the character in the existing data set, a * new state with that symbol is added to the character. Taxa are matched via their Publication * Name. Matrix values for unmatched taxa are unaltered. */ public static void mergeMatrix(DataSet newData, DataSet existingData) { for (int i = 0; i < existingData.getCharacters().size(); i++) { final Character currentCharacter = existingData.getCharacters().get(i); for (Taxon taxon : existingData.getTaxa()) { final String taxonName = taxon.getPublicationName(); final Taxon newTaxon = findTaxon(newData.getTaxa(), taxonName); if (newTaxon == null) { continue; } final Character newCharacter = newData.getCharacters().get(i); final State newStateValue = newData.getStateForTaxon(newTaxon, newCharacter); final State state; if (newStateValue == null) { state = null; } else if (newStateValue instanceof MultipleState) { state = mapMultipleStateIntoExistingStates(currentCharacter.getStates(), (MultipleState)newStateValue); } else { final String valueSymbol = newStateValue.getSymbol(); final State existingState = findState(currentCharacter.getStates(), valueSymbol); if (existingState == null) { state = newStateValue; currentCharacter.addState(state); } else { state = existingState; } } existingData.setStateForTaxon(taxon, currentCharacter, state); } } } /** * Replace values such as publication, publication notes, and curators in the * existing data with any non-null values in the new data. If the new data is null * for a field, the existing value is unchanged. */ public static void mergeDocumentData(DataSet newData, DataSet existingData) { if (newData.getPublication() != null) { existingData.setPublication(newData.getPublication()); } if (newData.getCurators() != null) { existingData.setCurators(newData.getCurators()); } if (newData.getPublicationNotes() != null) { existingData.setPublicationNotes(newData.getPublicationNotes()); } } private static Taxon findTaxon(List<Taxon> taxa, String pubName) { if ((pubName == null) || (pubName.equals(""))) { return null; } for (Taxon taxon : taxa) { if (pubName.equals(taxon.getPublicationName())) { return taxon; } } return null; } private static Taxon findTaxon(List<Taxon> taxa, OBOClass validName) { if (validName == null) { return null; } for (Taxon taxon : taxa) { if (validName.equals(taxon.getValidName())) { return taxon; } } return null; } private static State findState(List<State> states, String symbol) { for (State state: states) { if (symbol.equals(state.getSymbol())) { return state; } } return null; } private static MultipleState mapMultipleStateIntoExistingStates(List<State> allExistingStates, MultipleState newState) { final Set<State> existingStates = new HashSet<State>(); for (State state : newState.getStates()) { final State foundState = findState(allExistingStates, state.getSymbol()); if (foundState != null) { existingStates.add(foundState); } else { existingStates.add(state); } } return new MultipleState(existingStates, newState.getMode()); } @SuppressWarnings("unused") private static Logger log() { return Logger.getLogger(DataMerger.class); } }