package uk.bl.wa.hadoop.indexer; /** * Purloined from the TitleLevelMetadata project. * @author rcoram */ import java.util.Collection; import org.apache.solr.common.SolrInputDocument; import org.jdom.Element; import org.jdom.Namespace; import org.jdom.output.Format; import org.jdom.output.XMLOutputter; @SuppressWarnings("unchecked") public class MetadataBuilder { public static final String[] fieldNames = { "id", "collections", "url", "domain", "title", "crawl_date" }; public static final String[] mandatoryFieldNames = { "id", "url", "domain", "crawl_date" }; public static final Namespace oa = Namespace .getNamespace("http://www.openarchives.org/OAI/2.0/"); public static final Namespace oaidc = Namespace.getNamespace("oai_dc", "http://www.openarchives.org/OAI/2.0/oai_dc/"); public static final Namespace dc = Namespace.getNamespace("dc", "http://purl.org/dc/elements/1.1/"); private static XMLOutputter output = new XMLOutputter(Format.getPrettyFormat()); public static String SolrDocumentToElement(SolrInputDocument doc) { Collection<String> fields = doc.getFieldNames(); for (String field : mandatoryFieldNames) { if (!fields.contains(field)) { System.err.println("Missing field '" + field + "' for " + doc.getFieldValues("url").toArray()[0]); return null; } } Element record = new Element("record", oa); Element header = new Element("header", oa); Element identifier = new Element("identifier", oa); identifier.setText((String) doc.getFieldValues("id").toArray()[0]); header.getChildren().add(identifier); record.getChildren().add(header); Element metdata = new Element("metadata", oa); Element oai_dc = new Element("dc", oaidc); if (doc.containsKey("collections")) { Element subject; Object[] collections = doc.getFieldValues("collections").toArray(); for (int i = 0; i < collections.length; i++) { subject = new Element("subject", dc); subject.setText((String) collections[i]); oai_dc.getChildren().add(subject); } } Element source = new Element("source", dc); source.setText((String) doc.getFieldValues("url").toArray()[0]); oai_dc.getChildren().add(source); Element publisher = new Element("publisher", dc); publisher.setText((String) doc.getFieldValues("domain").toArray()[0]); oai_dc.getChildren().add(publisher); Element title = new Element("title", dc); if (doc.containsKey("title")) { title.setText((String) doc.getFieldValues("title").toArray()[0]); } else { title.setText((String) doc.getFieldValues("domain").toArray()[0]); } oai_dc.getChildren().add(title); Element date = new Element("date", dc); String jDate = (String) doc.getFieldValues("crawl_date").toArray()[0]; date.setText(jDate); oai_dc.getChildren().add(date); metdata.getChildren().add(oai_dc); record.getChildren().add(metdata); return output.outputString(record); } }