package uk.bl.documents; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Calendar; import org.jsoup.select.Elements; import play.Logger; import models.Document; public class MetadataExtractor { String nameSelector; String datePublishedSelector; String authorSelector; public MetadataExtractor(String nameSelector, String datePublishedSelector, String authorSelector) { this.nameSelector = nameSelector; this.datePublishedSelector = datePublishedSelector; this.authorSelector = authorSelector; } public void extract(Document document, org.jsoup.nodes.Document doc) { Elements name = doc.select(nameSelector); if (!name.isEmpty()) { document.title = name.get(0).text(); if (datePublishedSelector != null) { Elements datePublished = doc.select(datePublishedSelector); if (!datePublished.isEmpty()) { try { document.publicationDate = new SimpleDateFormat("yyyy-MM-dd").parse(datePublished.get(0).attr("content")); Calendar calendar = Calendar.getInstance(); calendar.setTime(document.publicationDate); document.publicationYear = calendar.get(Calendar.YEAR); } catch (ParseException e) {} } } if (authorSelector != null) { Elements author = doc.select(authorSelector); if (!author.isEmpty()) { String authorsString = author.get(0).text(); String[] authors = authorsString.split(",|and"); if (authors.length >= 1) { String[] a = authors[0].trim().split("\\s+", 2); document.author1Fn = a[0]; document.author1Ln = a[1]; } if (authors.length >= 2) { String[] a = authors[1].trim().split("\\s+", 2); document.author2Fn = a[0]; document.author2Ln = a[1]; } if (authors.length >= 3) { String[] a = authors[2].trim().split("\\s+", 2); document.author3Fn = a[0]; document.author3Ln = a[1]; } } } } else { Logger.error("No "+nameSelector+" found!"); } } }