package uk.bl.odin.orcid.htmlmeta;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import uk.bl.odin.orcid.htmlmeta.dc.DC_KEYS;
import uk.bl.odin.orcid.htmlmeta.dc.DublinCoreMeta;
import uk.bl.odin.orcid.htmlmeta.eprints.EP_KEYS;
import uk.bl.odin.orcid.htmlmeta.eprints.EPrintsMeta;
import uk.bl.odin.orcid.htmlmeta.highwire.HW_KEYS;
import uk.bl.odin.orcid.htmlmeta.highwire.HighwireMeta;
import uk.bl.odin.orcid.htmlmeta.prism.PrismMeta;
/**
* Class that delegates to various HTML meta extractors and returns a best
* matching composite view.
*
* Supports the same meta as google scholar: Google Scholar supports Highwire
* Press tags (e.g., citation_title), Eprints tags (e.g., eprints.title), BE
* Press tags (e.g., bepress_citation_title), and PRISM tags (e.g.,
* prism.title). Use Dublin Core tags (e.g., DC.title) as a last resort - they
* work poorly for journal papers because Dublin Core doesn't have unambiguous
* fields for journal title, volume, issue, and page numbers. To check that
* these tags are present, visit several abstracts and view their HTML source.
*
* THEY ALL NEED WORK - THEY'VE BEEN BUNGED TOGETHER QUICKLY
*
* @see http://scholar.google.com/intl/en/scholar/inclusion.html#indexing
*
* @author tom
*
*/
public class HTMLMetaBuilder {
private DublinCoreMeta dc;
private HighwireMeta hw;
private PrismMeta pm;
private EPrintsMeta ep;
private Document htmldoc;
public HTMLMetaBuilder(Document htmldoc) {
this.htmldoc=htmldoc;
}
public DublinCoreMeta getDublinCoreMeta(){
if (dc == null){
dc = new DublinCoreMeta();
//build dublin core
for (DC_KEYS key : DC_KEYS.values()){
Elements matching = htmldoc.select("meta[name="+DublinCoreMeta.DC_PREFIX+"."+key+"]");
for (Element e: matching){
String value = e.attr("content");
if (!value.isEmpty())
dc.put(key, value);
}
}
for (DC_KEYS key : DC_KEYS.values()){
Elements matching = htmldoc.select("meta[name="+DublinCoreMeta.DCTERMS_PREFIX+"."+key+"]");
for (Element e: matching){
String value = e.attr("content");
if (!value.isEmpty())
dc.put(key, value);
}
}
}
return dc;
}
public HighwireMeta getHighwireMeta(){
if (hw == null){
hw = new HighwireMeta();
for (HW_KEYS key : HW_KEYS.values()){
Elements matching = htmldoc.select("meta[name="+key+"]");
for (Element e: matching){
String value = e.attr("content");
if (!value.isEmpty())
hw.put(key, value);
}
}
}
return hw;
}
public PrismMeta getPrismMeta(){
if (pm == null){
pm = new PrismMeta();
}
return pm;
}
public EPrintsMeta getEPrintsMeta(){
if (ep == null){
ep = new EPrintsMeta();
for (EP_KEYS key : EP_KEYS.values()){
Elements matching = htmldoc.select("meta[name="+key+"]");
for (Element e: matching){
String value = e.attr("content");
if (!value.isEmpty())
ep.put(key, value);
}
}
}
return ep;
}
}