package org.bbaw.wsp.cms.dochandler.parser.text.parser; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; import java.util.Calendar; import java.util.GregorianCalendar; import java.util.Scanner; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.bbaw.wsp.cms.document.MetadataRecord; import org.bbaw.wsp.cms.dochandler.parser.text.reader.IResourceReader; import org.bbaw.wsp.cms.dochandler.parser.text.reader.ResourceReaderImpl; import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException; /** * This tool class provides methods to fetch DC fields into a given * {@link MetadataRecord}. Last change: - Added fields documentType, isbn, * creationDate, publishingDate - 06.09.12: throws {@link ApplicationException} * now - Added methods to check if a file is an eDoc index.html file * * @author Sascha Feldmann (wsp-shk1) * */ public class EdocIndexMetadataFetcherTool { private static IResourceReader reader = new ResourceReaderImpl(); /** * This class reads from an URL and fetches the DC tags directly (with String * operations). * * It's designed for the eDoc server. * * @param srcUrl * - the basic Url as String * @param mdRecord * - the {@link MetadataRecord} to fill * * Last change: bugfixed the publishingDate * @return the complete {@link MetadataRecord} * @throws ApplicationException */ public static MetadataRecord fetchHtmlDirectly(final String srcUrl, final MetadataRecord mdRecord) throws ApplicationException { InputStream in; try { in = reader.read(srcUrl); Scanner scanner = new Scanner(in); scanner.useDelimiter("\n"); // delimiter via line break StringBuilder builder = new StringBuilder(); while (scanner.hasNext()) { builder.append(scanner.next()); // concat to one String } String line = builder.toString(); StringBuilder creatorBuilder = new StringBuilder(); // fix: more than one // creator Pattern p = Pattern.compile("(?i)<META NAME=\"(.*?)\" CONTENT=\"(.*?)\">(?i)"); // meta // pattern for (Matcher m = p.matcher(line); m.find();) { String tag = m.group(1); String content = m.group(2); if (tag.equals("DC.Date.Creation_of_intellectual_content")) { // creation // date Calendar cal = new GregorianCalendar(); cal.set(Calendar.YEAR, Integer.parseInt(content)); cal.set(Calendar.DAY_OF_YEAR, 1); cal.set(Calendar.HOUR, 0); cal.set(Calendar.MINUTE, 0); cal.set(Calendar.SECOND, 0); cal.set(Calendar.MILLISECOND, 0); mdRecord.setCreationDate(cal.getTime()); } else if (tag.equals("DC.Title")) { mdRecord.setTitle(content); } else if (tag.equals("DC.Creator")) { if (creatorBuilder.toString().length() == 0) { creatorBuilder.append(content); } else { creatorBuilder.append(" ; " + content); } mdRecord.setCreator(creatorBuilder.toString()); } else if (tag.equals("DC.Subject")) { mdRecord.setSwd(content); // DC.Subject follows the // Schlagwortnormdatei } else if (tag.equals("DC.Description")) { mdRecord.setDescription(content); } else if (tag.equals("DC.Identifier")) { if (content.contains("http://")) { mdRecord.setUri(content); } else if (content.contains("urn:")) { mdRecord.setUrn(content); } } } Pattern p2 = Pattern.compile("(?i)<TD class=\"frontdoor\" valign=\"top\"><B>(.*?)</B></TD>.*?<TD class=\"frontdoor\" valign=\"top\">(.*?)</TD><"); for (Matcher m = p2.matcher(line); m.find();) { String key = m.group(1); String value = m.group(2).trim(); if (key.contains("pdf-Format")) { Pattern pLink = Pattern.compile("(?i)<a href=\"(.*?)(\".*?)\">.*?</a>"); Matcher mLink = pLink.matcher(key); mLink.find(); mdRecord.setRealDocUrl(mLink.group(1)); // System.out.println(mLink.group(1)); } else if (key.contains("Freie Schlagwörter")) { mdRecord.setSubject(value); } else if (key.contains("DDC-Sachgruppe")) { mdRecord.setDdc(value); } else if (key.contains("Sprache")) { mdRecord.setLanguage(value); } else if (key.contains("Dokumentart")) { mdRecord.setDocumentType(value); } else if (key.contains("Publikationsdatum")) { final int day = Integer.parseInt(value.substring(0, value.indexOf("."))); final int month = Integer.parseInt(value.substring(value.indexOf(".") + 1, value.lastIndexOf("."))); final int year = Integer.parseInt(value.substring(value.lastIndexOf(".") + 1)); Calendar cal = new GregorianCalendar(); cal.set(year, month-1, day); // bugfixed: month is 0 based! mdRecord.setPublishingDate(cal.getTime()); } else if (key.contains("ISBN")) { mdRecord.setIsbn(value); } else if (key.contains("Institut")) { mdRecord.setPublisher(value); } else if (key.contains("Collection")) { Pattern pColl = Pattern.compile("(?i)<a.*?>(.*?)</a>"); Matcher mColl = pColl.matcher(value); mColl.find(); String collections = mColl.group(1); mdRecord.setCollectionNames(collections); } } // Bugfix: Institut Pattern p3 = Pattern.compile("(?i)<TD class=\"frontdoor\" valign=\"top\"><B>Institut:</B></TD>.*?<TD class=\"frontdoor\" valign=\"top\">(.*?)</TD><"); for (Matcher m = p3.matcher(line); m.find();) { mdRecord.setPublisher(m.group(1)); } in.close(); return mdRecord; } catch (IOException e) { throw new ApplicationException("Problem while parsing " + srcUrl + " for DC tags " + e.getMessage()); } } /** * Check if the file is an index.html file to an eDoc. * * LastChange: Performance optimation - check URI before reading from input stream * * @param uri * - the URL as string to the index.html file. * @return true if the index.html file belongs to an eDoc. * @throws ApplicationException * if the stream couldn't get opened. */ public static boolean isEDocIndex(final String uri) throws ApplicationException { if((uri.contains("edoc.bbaw.de") || uri.endsWith(".html")) && !uri.endsWith(".pdf")) { InputStream in = reader.read(uri); if (in != null) { Scanner scanner = new Scanner(in); scanner.useDelimiter("\n"); StringBuilder builder = new StringBuilder(); while (scanner.hasNext()) { builder.append(scanner.next()); } String content = builder.toString(); Pattern p = Pattern.compile("(?i)<META NAME=\"(.*?)\" CONTENT=\"(.*?)\">(?i)"); for (Matcher m = p.matcher(content); m.find();) { String tag = m.group(1); String value = m.group(2); if (tag.equals("DC.Identifier") && value.contains("edoc.bbaw.de/")) { return true; } } } } return false; } /** * Check if the resource is an eDoc. * * @param uri * - the resource's URI. * @return true if the resource is an (KOBV) eDoc. */ public static boolean isEDoc(String uri) { // test local file system File f = new File(uri); if (f.getParentFile().getName().equals("pdf") && new File(f.getParentFile().getParentFile(), "index.html").exists()) { return true; } else { // test HTTP try { URL url = new URL(uri); int pos = url.toExternalForm().lastIndexOf("/pdf"); if (pos != -1) { String newUrl = url.toExternalForm().substring(0, pos) + "/index.html"; URL indexUrl = new URL(newUrl); @SuppressWarnings("unused") URLConnection conn = indexUrl.openConnection(); return true; } return false; } catch (MalformedURLException e) { return false; } catch (IOException e) { return false; } } } /** * Fetch the eDoc's id as it's stored on the file system. * This id can be used for an OAI/ORE aggregation for example. * @param eDocUrl {@link String} the URL to the eDoc. This will be parsed for the id. * @return {@link Integer} the docID or -1 if the ID couldn'T be parsed. * @throws ApplicationException */ public static int getDocId(final String eDocUrl) throws ApplicationException { if(eDocUrl == null || eDocUrl.isEmpty()) { throw new ApplicationException("The value for the eDocUrl in getDocId mustn't be null or empty."); } Pattern p = Pattern.compile(".*/(.*?)/pdf/.*"); Matcher m = p.matcher(eDocUrl); if(m.find()) { return Integer.parseInt(m.group(1)); } return -1; } }