package org.bbaw.wsp.cms.dochandler.parser.text.parser;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import de.mpg.mpiwg.berlin.mpdl.exception.ApplicationException;

/**
 * A (static) helper class which offers methods to parse a special eDoc URI.
 *
 * @author Sascha Feldmann (wsp-shk1)
 * @date 15.08.2012
 *
 */
public class EdocUriParser {

  /** Substring of a document URI that is replaced by the index file name. */
  private static final String PDF_SEGMENT = "pdf/";

  /** Name of the metadata file that takes the place of the PDF part. */
  private static final String INDEX_FILE = "index.html";

  /**
   * Matches a case-insensitive "URL: " label followed by a run of
   * alphanumerics, slashes, dots and colons; group 1 is that run. Compiled
   * once because {@link Pattern} instances are immutable and reusable.
   */
  private static final Pattern DOC_URL_PATTERN = Pattern.compile("(?i)URL: ([/.:\\p{Alnum}]+)");

  /**
   * Return the {@link URL} to the associated index.html file. This contains the
   * metadata.
   * <p>
   * Everything from the last occurrence of {@code "pdf/"} in the input onwards
   * is cut off and replaced by {@code "index.html"}.
   *
   * @param docURI
   *          the (input) URI (the URI to be parsed) of the eDoc.
   * @return the {@link URL} to the index.html file
   * @throws ApplicationException
   *           if the input contains no {@code "pdf/"} part or if the resulting
   *           URL is invalid.
   */
  public static URL getIndexURI(final URL docURI) throws ApplicationException {
    final String docLocation = docURI.toString();
    final int pdfSegmentStart = docLocation.lastIndexOf(PDF_SEGMENT);
    if (pdfSegmentStart < 0) {
      // Without this guard substring(0, -1) would fail with an unchecked
      // StringIndexOutOfBoundsException instead of the documented exception.
      throw new ApplicationException(
          "Cannot derive the " + INDEX_FILE + " URL: \"" + docLocation + "\" contains no \"" + PDF_SEGMENT + "\" part");
    }

    final String indexLocation = docLocation.substring(0, pdfSegmentStart) + INDEX_FILE;
    try {
      return new URL(indexLocation);
    } catch (MalformedURLException e) {
      // NOTE(review): only the message is forwarded; chain the cause if
      // ApplicationException offers a suitable constructor.
      throw new ApplicationException(e.getMessage());
    }
  }

  /**
   * Fetch the reference to the underlying eDoc within an eDoc index.html file.
   * <p>
   * The first "URL: ..." entry whose value contains {@code "http"} is used; the
   * literal suffix {@code "pdf"} is appended to the matched value.
   *
   * @param textOrig
   *          - the (HTML parsed) index.html content
   * @return the URL as String to the underlying eDoc. Returns null if no
   *         matching entry is found or if {@code textOrig} is null.
   */
  public static String getDocURI(String textOrig) {
    if (textOrig == null) {
      return null;
    }

    final Matcher matcher = DOC_URL_PATTERN.matcher(textOrig);
    while (matcher.find()) {
      final String candidate = matcher.group(1);
      if (candidate.contains("http")) {
        return candidate + "pdf";
      }
    }
    return null;
  }
}