/*
* This is eMonocot, a global online biodiversity information resource.
*
* Copyright © 2011–2015 The Board of Trustees of the Royal Botanic Gardens, Kew and The University of Oxford
*
* eMonocot is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* eMonocot is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
* the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* The complete text of the GNU Affero General Public License is in the source repository as the file
* ‘COPYING’. It is also available from <http://www.gnu.org/licenses/>.
*/
package org.emonocot.job.delta;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.util.List;
import org.emonocot.api.match.Match;
import org.emonocot.api.match.taxon.TaxonMatcher;
import org.emonocot.model.Description;
import org.emonocot.model.Taxon;
import org.gbif.ecat.parser.UnparsableException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.batch.item.ItemProcessor;
import au.org.ala.delta.DeltaContext;
import au.org.ala.delta.model.Attribute;
import au.org.ala.delta.model.Character;
import au.org.ala.delta.model.Item;
import au.org.ala.delta.model.MutableDeltaDataSet;
import au.org.ala.delta.model.format.AttributeFormatter;
import au.org.ala.delta.model.format.CharacterFormatter;
import au.org.ala.delta.model.format.ItemFormatter;
import au.org.ala.delta.translation.DataSetFilter;
import au.org.ala.delta.translation.FormatterFactory;
import au.org.ala.delta.translation.ItemListTypeSetter;
import au.org.ala.delta.translation.IterativeTranslator;
import au.org.ala.delta.translation.PrintFile;
import au.org.ala.delta.translation.TypeSetterFactory;
import au.org.ala.delta.translation.naturallanguage.NaturalLanguageTranslator;
/**
 * Spring Batch processor that turns a DELTA {@link Item} into a
 * {@link Description}.
 *
 * <p>The description text is produced by running the item's filtered
 * attributes through a {@link NaturalLanguageTranslator}; the taxon is looked
 * up from the item's description string via the configured
 * {@link TaxonMatcher}; and, when {@code characterForLink} is set, the source
 * is built from the value of that character wrapped in the configured link
 * prefix and suffix.</p>
 */
public class DeltaNaturalLanguageProcessor implements ItemProcessor<Item,Description> {

    /** Encoding used both to write and to read back the translator output. */
    private static final String OUTPUT_ENCODING = "UTF-8";

    private static final Logger logger = LoggerFactory.getLogger(DeltaNaturalLanguageProcessor.class);

    private DeltaContext deltaContext;

    private DataSetFilter filter;

    private TaxonMatcher taxonMatcher;

    private Integer characterForLink;

    private String linkPrefix;

    private String linkSuffix;

    /**
     * @param characterForLink number of the character whose value is used to
     *        build the source link, or {@code null} to set no source
     */
    public void setCharacterForLink(Integer characterForLink) {
        this.characterForLink = characterForLink;
    }

    /**
     * @param linkPrefix text placed before the link character's value
     */
    public void setLinkPrefix(String linkPrefix) {
        this.linkPrefix = linkPrefix;
    }

    /**
     * @param linkSuffix text placed after the link character's value
     */
    public void setLinkSuffix(String linkSuffix) {
        this.linkSuffix = linkSuffix;
    }

    /**
     * @param deltaContextHolder holder from which the DELTA context is read;
     *        must not be {@code null}
     */
    public void setDeltaContextHolder(DeltaContextHolder deltaContextHolder) {
        assert deltaContextHolder != null;
        this.deltaContext = deltaContextHolder.getDeltaContext();
    }

    /**
     * @param filter decides which (item, character) pairs are translated
     */
    public void setFilter(DataSetFilter filter) {
        this.filter = filter;
    }

    /**
     * @param taxonMatcher matcher used to resolve an item's name to a taxon
     */
    public void setTaxonMatcher(TaxonMatcher taxonMatcher) {
        this.taxonMatcher = taxonMatcher;
    }

    /**
     * Renders the attributes of an item that pass the filter as natural
     * language text.
     *
     * @param item the item to translate
     * @return the translator output, trimmed
     * @throws UnsupportedEncodingException if UTF-8 is not supported
     */
    private String translate(Item item) throws UnsupportedEncodingException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        // Write with the same encoding that is used to decode the buffer
        // below; relying on the platform default charset here would corrupt
        // non-ASCII characters whenever that default is not UTF-8.
        PrintStream printStream = new PrintStream(buffer, false, OUTPUT_ENCODING);
        try {
            PrintFile printFile = new PrintFile(printStream, 0);
            printFile.setTrimInput(false);

            FormatterFactory formatterFactory = new FormatterFactory(deltaContext);
            ItemListTypeSetter typeSetter = new TypeSetterFactory().createTypeSetter(deltaContext, printFile);
            ItemFormatter itemFormatter = formatterFactory.createItemFormatter(typeSetter);
            CharacterFormatter characterFormatter = formatterFactory.createCharacterFormatter();
            AttributeFormatter attributeFormatter = formatterFactory.createAttributeFormatter();
            IterativeTranslator translator = new NaturalLanguageTranslator(deltaContext, typeSetter, printFile,
                    itemFormatter, characterFormatter, attributeFormatter);

            MutableDeltaDataSet dataSet = deltaContext.getDataSet();
            int numChars = dataSet.getNumberOfCharacters();
            // Character numbers are 1-based.
            for (int i = 1; i <= numChars; i++) {
                au.org.ala.delta.model.Character character = dataSet.getCharacter(i);
                Attribute attribute = item.getAttribute(character);
                if (filter.filter(item, character)) {
                    logger.info("{}, {} = {}", new Object[] {
                            item.getItemNumber(), character.getCharacterId(), attribute.getValueAsString()});
                    translator.beforeAttribute(attribute);
                    logger.info("translating {} : {}", character, attribute.getValueAsString());
                    translator.afterAttribute(attribute);
                }
            }
            translator.afterItem(item);
            translator.afterLastItem();
            printFile.close();
        } finally {
            // Always release the stream, even when translation fails part-way.
            printStream.close();
        }

        String output = buffer.toString(OUTPUT_ENCODING).trim();
        logger.debug(output);
        return output;
    }

    /**
     * Resolves a scientific name to a taxon.
     *
     * @param scientificName the name to match
     * @return the matched taxon when there is exactly one match, otherwise
     *         {@code null} (no matches, or more than one)
     * @throws UnparsableException if the matcher cannot parse the name
     */
    private Taxon getTaxon(String scientificName) throws UnparsableException {
        List<Match<Taxon>> matches = taxonMatcher.match(scientificName);
        if (matches.isEmpty()) {
            logger.warn("No matches for {}", scientificName);
            return null;
        }
        if (matches.size() > 1) {
            logger.warn("{} matches for {}", matches.size(), scientificName);
            return null;
        }

        Match<Taxon> match = matches.get(0);
        switch (match.getStatus()) {
        case EXACT:
            logger.info("{} matches {}", scientificName, match.getInternal().getScientificName());
            break;
        case PARTIAL:
            logger.warn("Partial match for {} to {}", scientificName, match.getInternal().getScientificName());
            break;
        default:
            // Other statuses are accepted without logging.
            break;
        }
        return match.getInternal();
    }

    /**
     * Builds a {@link Description} for the given item.
     *
     * @param item the DELTA item to process
     * @return a description carrying the translated text, the matched taxon
     *         (possibly {@code null}) and, when a link character is configured
     *         and has a usable value, the source link
     * @throws Exception if translation or taxon matching fails
     */
    @Override
    public Description process(Item item) throws Exception {
        Description description = new Description();
        description.setDescription(translate(item));
        description.setTaxon(getTaxon(item.getDescription()));

        if (characterForLink != null) {
            Character character = deltaContext.getCharacter(characterForLink);
            Attribute attribute = item.getAttribute(character);
            String value = (attribute == null) ? null : attribute.getValueAsString();
            if (value != null) {
                if (value.length() >= 2) {
                    // Assumes value is enclosed in angle brackets i.e. is a comment,
                    // so the first and last characters are dropped.
                    String identifier = value.substring(1, value.length() - 1);
                    // An unset prefix/suffix contributes nothing rather than the text "null".
                    String prefix = (linkPrefix == null) ? "" : linkPrefix;
                    String suffix = (linkSuffix == null) ? "" : linkSuffix;
                    description.setSource(prefix + identifier + suffix);
                } else {
                    // Too short to carry the enclosing brackets; stripping them
                    // would throw StringIndexOutOfBoundsException.
                    logger.warn("Value of link character {} is too short to build a source link: '{}'",
                            characterForLink, value);
                }
            }
        }
        return description;
    }
}