package uk.bl.odin.orcid.ethos;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import uk.bl.odin.orcid.domain.IsOrcidWorkProvider;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
/**
* Simple scraper that extracts meta information from HTML pages fetched from
* ethos.bl.uk.
*/
public class EthosMetaScraper implements IsOrcidWorkProvider {
public static final String JSOUP_URL = "http://ethos.bl.uk/OrderDetails.do?uin=";
// cache results for 30 minutes.
public static final Cache<String, ThesisMetadata> cache = CacheBuilder.newBuilder()
.expireAfterWrite(30, TimeUnit.MINUTES).maximumSize(100).build();
// TODO:
// DC.identifier not always present.
// sometimes it's <meta name="citation_abstract_html_url"
// content="http://hdl.handle.net/2381/8951" />
// <meta name="DC.identifier" content="http://hdl.handle.net/2381/8951" />
/**
* Scrape the DC metadata from the ETHOS HTML result.
*
* @param ethosID
* @return a populated OrcidWorkMetadata
* @throws IOException
* if JSOUP fails to retrieve document
*/
public ThesisMetadata fetch(String ethosID) throws IOException {
ThesisMetadata meta = cache.getIfPresent(ethosID);
if (meta != null)
return meta;
else
meta = new ThesisMetadata();
String url = JSOUP_URL + ethosID;
Document doc = Jsoup.connect(url).timeout(10000).get();
String creator = doc.select("meta[name=DC.creator]").first().attr("content").toString();
String publisher = doc.select("meta[name=DC.publisher]").first().attr("content").toString();
String title = doc.select("meta[name=DC.title]").first().attr("content").toString();
String year = doc.select("meta[name=DCTERMS.issued]").first().attr("content").toString();
String abstract_ = doc.select("meta[name=DCTERMS.abstract]").first().attr("content").toString();
String id = doc.select("meta[name=DC.identifier]").first().attr("content").toString();
String thesisType = doc.select("meta[name=thesis_type]").first().attr("content").toString();
meta.setAbstract(abstract_);
meta.setCreator(creator);
meta.setPublisher(publisher);
meta.setTitle(title);
meta.setUrl(url);
meta.setYear(year);
meta.setThesisType(thesisType);// could use DC, but it's often
// multi-valued.
meta.getIdentifiers().add(ethosID);
if (id != null && !id.isEmpty())
meta.getIdentifiers().add(id);
cache.put(ethosID, meta);
return meta;
}
}