package eu.europeana.cloud.service.dps.storm.transform.text.oai;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import com.google.gson.JsonPrimitive;
import eu.europeana.cloud.service.dps.storm.transform.text.MethodsEnumeration;
import eu.europeana.cloud.service.dps.storm.transform.text.TextExtractor;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.JDOMException;
import org.jdom2.Namespace;
import org.jdom2.input.SAXBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Simple text extractor for OAI files.
* @author Pavel Kefurt <Pavel.Kefurt@gmail.com>
*/
public class DcExtractor implements TextExtractor
{
private final Map<String, String> tags;
private static final Logger LOGGER = LoggerFactory.getLogger(DcExtractor.class);
/**
* Constructor of simple oai_dc extractor.
*/
public DcExtractor()
{
//TODO: refactor these values!
tags = new HashMap<>();
tags.put("title", "dc:title");
tags.put("description", "dc:description");
tags.put("creator", "dc:creator");
tags.put("publisher", "dc:publisher");
tags.put("date", "dc:date");
tags.put("type", "dc:type");
tags.put("format", "dc:format");
tags.put("source", "dc:cource");
tags.put("language", "dc:language");
}
/**
* Constructor of simple oai_dc extractor.
* @param tags map of tag_output_name -> tag_name_in_file (with namespace)
*/
public DcExtractor(Map<String, String> tags)
{
this.tags = tags;
}
@Override
public String extractText(InputStream is)
{
if(is == null)
{
LOGGER.warn("No data for extraction.");
return null;
}
JsonObject ret = new JsonObject();
SAXBuilder builder = new SAXBuilder();
try
{
Document document = builder.build(is);
Element metadataNode = document.getRootElement().getChild("metadata");
if(metadataNode == null)
{
LOGGER.warn("Can not extract data from oai-dc because: metadata tag is missing.");
return null;
}
Element dcNode = metadataNode.getChild("dc", Namespace.getNamespace("http://www.openarchives.org/OAI/2.0/oai_dc/"));
if(dcNode == null)
{
LOGGER.warn("Can not extract data from oai-dc because: oai_dc:dc tag is missing.");
return null;
}
for(Map.Entry<String, String> tag: tags.entrySet())
{
List<Element> list;
//use namespace?
String[] tagInfo = tag.getValue().split(":", 2);
if(tagInfo.length > 1)
{
list = dcNode.getChildren(tagInfo[1], dcNode.getNamespace(tagInfo[0]));
}
else
{
list = dcNode.getChildren(tagInfo[0]);
}
if(list.isEmpty())
{
continue;
}
//add as array or single value?
if(list.size() == 1)
{
String s = list.get(0).getTextTrim();
if(!s.isEmpty())
{
ret.addProperty(tag.getKey(), s);
}
}
else
{
JsonArray array = new JsonArray();
for (Element element: list)
{
String s = element.getTextTrim();
if(!s.isEmpty())
{
array.add(new JsonPrimitive(s));
}
}
ret.add(tag.getKey(), array);
}
}
return new Gson().toJson(ret);
}
catch (JDOMException | IOException ex)
{
LOGGER.warn("Can not extract data from oai-dc because: " + ex.getMessage());
}
return null;
}
@Override
public MethodsEnumeration getExtractionMethod()
{
return OaiExtractionMethods.DC_EXTRACTOR;
}
@Override
public Map<String, String> getExtractedMetadata()
{
return null;
}
@Override
public String getRepresentationName()
{
return "json-extracted-from-oai-dc";
}
}