//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.consumers.utils; import java.io.File; import org.apache.commons.io.FilenameUtils; import org.apache.commons.lang.StringUtils; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.DocumentAnnotation; import uk.gov.dstl.baleen.uima.BaleenAnnotator; import uk.gov.dstl.baleen.uima.UimaSupport; /** * Helper for dealing with sourceUrl. */ public class SourceUtils { private SourceUtils() { // Singleton } /** Convert a url into a directory structure to save a file to. * * This is a very basic implementation and will likely need to be made more robust to different * url types. * * @param basePath the parent to save into * @param url the url (to save as) * @return */ public static final File urlToFile(File basePath, String url) { String subUrl; if(url.startsWith("http") || url.startsWith("ftp")) { // Looks lie an actual URL int indexOf = url.indexOf("//"); if(indexOf == -1) { subUrl = url; } else { subUrl = url.substring(indexOf+2); } } else if(url.startsWith("\\\\")) { // Looks like a network path subUrl = url.substring(2); } else if(url.length() > 2 && url.charAt(1) == ':') { // Looks like C:\ subUrl = url.substring(2); } else { // Just use the raw subUrl = url; } subUrl = subUrl.replaceAll("\\\\+", "/") .replaceAll(File.pathSeparator, "/"); subUrl = subUrl.replace("/", File.separator); subUrl = StringUtils.strip(subUrl, File.separator); if(basePath != null) { return new File(basePath, subUrl); } else { return new File(subUrl); } } /** * Returns the base filename from DocumentAnnotation source URI in the given * JCas. * <p> * The basename is the main part of the filename, without extension or * enclosing paths, e.g. for path '/some/directory/SomeFile.txt' this method * will return 'SomeFile'. * </p> * * @param jCas * the {@link JCas} from which to get the document annotation. * @param support * an appropriately initialised {@link UimaSupport} instance * (typically obtained through * {@link BaleenAnnotator#getSupport()}). * @return the filename * @throws IllegalArgumentException * if there is an error parsing the document source URI. */ public static String getDocumentSourceBaseName(final JCas jCas, final UimaSupport support) { DocumentAnnotation documentAnnotation = support.getDocumentAnnotation(jCas); String sourceUri = documentAnnotation.getSourceUri(); return FilenameUtils.getName(sourceUri); } }