/* * Copyright 1998-2009 University Corporation for Atmospheric Research/Unidata * * Portions of this software were developed by the Unidata Program at the * University Corporation for Atmospheric Research. * * Access and use of this software shall impose the following obligations * and understandings on the user. The user is granted the right, without * any fee or cost, to use, copy, modify, alter, enhance and distribute * this software, and any derivative works thereof, and its supporting * documentation for any purpose whatsoever, provided that this entire * notice appears in all copies of the software, derivative works and * supporting documentation. Further, UCAR requests that the user credit * UCAR/Unidata in any publications that result from the use of this * software or in any product that includes this software. The names UCAR * and/or Unidata, however, may not be used in any advertising or publicity * to endorse or promote any products or commercial entity unless specific * written permission is obtained from UCAR/Unidata. The user also * understands that UCAR/Unidata is not obligated to provide the user with * any support, consulting, training or assistance of any kind with regard * to the use, operation and performance of this software nor to provide * the user with any updates, revisions, new versions or "bug fixes." * * THIS SOFTWARE IS PROVIDED BY UCAR/UNIDATA "AS IS" AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL UCAR/UNIDATA BE LIABLE FOR ANY SPECIAL, * INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING * FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION * WITH THE ACCESS, USE OR PERFORMANCE OF THIS SOFTWARE. */ package thredds.crawlabledataset; import java.io.IOException; import java.lang.reflect.InvocationTargetException; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; import java.util.Calendar; import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import thredds.util.DodsURLExtractor; /** * A description * * @author Ethan Davis * @author Bas Retsios * @since Jun 8, 2005 15:34:04 -0600 */ public class CrawlableDatasetDods implements CrawlableDataset { static private org.slf4j.Logger log = org.slf4j.LoggerFactory .getLogger(CrawlableDatasetDods.class); private static DodsURLExtractor urlExtractor = null; private static Map listDatasetsMap = null; // maintain an in-memory copy for performance reasons .. TODO: add a version-check private String path; private URLConnection pathUrlConnection = null; // store this, for performance reasons private String name; private Object configObj = null; protected CrawlableDatasetDods() { } protected CrawlableDatasetDods(String path, Object configObj) { if (urlExtractor == null) urlExtractor = new DodsURLExtractor(); if (listDatasetsMap == null) // for performance listDatasetsMap = new HashMap(); if (configObj != null) { log.debug("CrawlableDatasetDods(): config object not null, it will be ignored <" + configObj.toString() + ">."); this.configObj = configObj; } if (path.startsWith("http:")) { this.path = path; try { new URI(path); // check syntax .. URISyntaxException if its not good name = getName(path); } catch (URISyntaxException e) { String tmpMsg = "Bad URI syntax for path <" + path + ">: " + e.getMessage(); log.debug( "CrawlableDatasetDods(): " + tmpMsg); throw new IllegalArgumentException( tmpMsg); } // Check if this accessPoint URL is an OPeNDAP server URL. // For now commented-out because it takes far too long when expanding a directory: // all links would be tested, because a CrawlableDataset is new-ed too fast (when its parent is expanded). /* String apVersionString = path + (path.endsWith("/") ? "version" : "/version"); String apVersionResultContent = null; try { apVersionResultContent = urlExtractor.getTextContent( apVersionString); } catch (java.io.IOException e) { String tmpMsg = "The accessPoint URL is not an OPeNDAP server URL (no version info) <" + apVersionString + ">"; log.error( "CrawlableDatasetDods(): " + tmpMsg, e); } if ( apVersionResultContent == null || (apVersionResultContent.indexOf( "DODS") == -1 && apVersionResultContent.indexOf( "OPeNDAP") == -1 && apVersionResultContent.indexOf( "DAP") == -1)) { String tmpMsg = "The accessPoint URL version info is not valid <" + apVersionResultContent + ">"; log.error( "CrawlableDatasetDods(): " + tmpMsg); } */ } else { String tmpMsg = "Invalid url <" + path + ">."; log.debug("CrawlableDatasetDods(): " + tmpMsg); throw new IllegalArgumentException(tmpMsg); } } private CrawlableDatasetDods(CrawlableDatasetDods parent, String childPath) { String normalChildPath = childPath.startsWith("/")?childPath.substring(1):childPath; this.path = parent.getPath(); this.path += this.path.endsWith("/") ? normalChildPath : "/" + normalChildPath; this.name = getName(path); this.configObj = null; } private String getName(String path) { // Attempt to return the last name in the path name sequence. if (!path.equals("/")) { String tmpName = path.endsWith("/") ? path.substring(0, path.length() - 1) : path; int index = tmpName.lastIndexOf("/"); if (index != -1) tmpName = tmpName.substring(index + 1); return tmpName; } else return path; } /** * Provide access to the java.net.URI that this CrawlableDataset represents. * * @return the java.net.URI that this CrawlableDataset represents. */ public URI getUri() { try { return new URI( this.path); } catch ( URISyntaxException e ) { return null; } } public Object getConfigObject() { return configObj; } public String getPath() { return (this.path); } public String getName() { return (this.name); } public boolean isCollection() { return isCollection(path); } public CrawlableDataset getDescendant( String relativePath ) { if ( relativePath.startsWith( "/")) throw new IllegalArgumentException( "Path must be relative <" + relativePath + ">."); return new CrawlableDatasetDods(this, relativePath); } // how do we determine if a url is a collection? // we can't count on a trailing backslash, as this was removed by CrawlableDatasetFactory // for now, assume collection unless a known file extension is encountered private static String [] knownFileExtensions = {".hdf", ".xml", ".nc", ".bz2", ".cdp", ".jpg"}; private static boolean isCollection(String path) { String testPath = path.toLowerCase(); // otherwise our matches may fail if (isDodsDataset(testPath)) return false; else { int i = 0; while ((i < (knownFileExtensions.length)) && !testPath.endsWith(knownFileExtensions[i])) ++i; return (i >= knownFileExtensions.length); // i < length means we deal with a known file ==> no collection } } private static String [] dodsExtensions = {".html", ".htm", ".das", ".dds", ".info"}; private static String getDodsExtension(String path) { String extension = ""; String testPath = path.toLowerCase(); // otherwise our matches may fail int i = 0; while ((i < (dodsExtensions.length)) && !testPath.endsWith(dodsExtensions[i])) ++i; if (i < dodsExtensions.length) extension = dodsExtensions[i]; return extension; } private static boolean isDodsDataset(String path) { return getDodsExtension(path).length() > 0; } private static String removeDodsExtension(String path) { String dodsExtension = getDodsExtension(path); if (dodsExtension.length() > 0) path = path.substring(0, path.length() - dodsExtension.length()); return path; } // This function shouldn't be here !!! // It is a workaround for many OPeNDAP servers that crop part of their urls (the /opendap-bin/nph-dods/ part) // e.g. of server with problem (2-Nov-2006): http://acdisc.sci.gsfc.nasa.gov/opendap-bin/nph-dods/OPENDAP/Giovanni/ private String forceChild(String url) { String prefix = path; if (prefix.endsWith("/")) prefix = path.substring(0, path.length() - 1); // because the url also contains a '/' that we will use int j = url.substring(0, url.length() - 1).lastIndexOf('/'); // url.length() - 1 was intentional .. if the last char is a '/', we're interested in the previous one. if (j >= 0) { String ret = prefix + url.substring(j); return ret; } else // relative paths .. leave intact return url; } public List listDatasets() throws IOException { if (!this.isCollection()) { String tmpMsg = "This dataset <" + this.getPath() + "> is not a collection dataset."; log.error("listDatasets(): " + tmpMsg); throw new IllegalStateException(tmpMsg); } if (listDatasetsMap.containsKey(path)) // shortcut .. for performance return (List)listDatasetsMap.get(path); else { List list = new ArrayList(); List pathList = new ArrayList(); // only for detecting duplicates (after removing the extension, sometimes we end up with duplicates) // Get list of possible datasets from current URL. List possibleDsList = null; try { String openPath = path; if (!openPath.endsWith("/")) // if you skip this, you will find that relative URLs don't work (fails in "extract", and in particular in URL u = new URL(baseURL, value)) openPath += "/"; possibleDsList = urlExtractor.extract(openPath); } catch (java.io.IOException e) { log.warn("listDatasets(): IOException while extracting dataset info from given OPeNDAP directory <" + path + ">, return empty list: " + e.getMessage()); return (list); } // Handle each link in the current access path. String curDsUrlString = null; for (Iterator it = possibleDsList.iterator(); it.hasNext(); ) { curDsUrlString = (String) it.next(); // Perform some tests on curDsUrlString // Skip datasets that aren't OPeNDAP datasets (".html") or // collection datasets ("/"). if ((!isDodsDataset(curDsUrlString)) && (!isCollection(curDsUrlString))) { log.warn("expandThisLevel(): Dataset isn't an OPeNDAP dataset or collection dataset, skip <" + path + ">."); continue; } curDsUrlString = removeDodsExtension(curDsUrlString); // This function goes a bit too far trying to recover from servers that drop part of URL path. // However, it also converts URLs that point to external servers to be subdirectories of this CrDS. //curDsUrlString = forceChild(curDsUrlString); // Skip any URLs that aren't children of this CrDs if ( !curDsUrlString.startsWith( path ) ) { log.debug( "listDatasets(): skipping URL <" + curDsUrlString + ">, not child of this CrDs <" + path + ">." ); continue; } if (pathList.contains(curDsUrlString)) continue; // duplicate else pathList.add(curDsUrlString); // Avoid links back down the path hierarchy (i.e., parent directory links). // Comment: this call was taken over from CrawlableDatasetFile. Since we use forceChild, this call is currently useless. if (!curDsUrlString.startsWith(path)) { log.debug("listDatasets(): current path <" + curDsUrlString + "> not child of given" + " location <" + path + ">, skip."); continue; } try { new URI(curDsUrlString); // syntax check } catch (URISyntaxException e) { log.error("listDatasets(): Skipping dataset <" + curDsUrlString + "> due to URISyntaxException: " + e.getMessage()); continue; } log.debug("listDatasets(): handle dataset (" + curDsUrlString + ")"); // So far so good .. curDsUrlString passed all tests, thus add it to the list try { list.add(CrawlableDatasetFactory.createCrawlableDataset( curDsUrlString, this.getClass().getName(), null)); } catch (ClassNotFoundException e) { log.warn("listDatasets(): Can't make CrawlableDataset for child url <" + curDsUrlString + ">: " + e.getMessage()); } catch (NoSuchMethodException e) { log.warn("listDatasets(): Can't make CrawlableDataset for child url <" + curDsUrlString + ">: " + e.getMessage()); } catch (IllegalAccessException e) { log.warn("listDatasets(): Can't make CrawlableDataset for child url <" + curDsUrlString + ">: " + e.getMessage()); } catch (InvocationTargetException e) { log.warn("listDatasets(): Can't make CrawlableDataset for child url <" + curDsUrlString + ">: " + e.getMessage()); } catch (InstantiationException e) { log.warn("listDatasets(): Can't make CrawlableDataset for child url <" + curDsUrlString + ">: " + e.getMessage()); } } listDatasetsMap.put(path, list); // remember it next time, for performance return list; } } public List listDatasets(CrawlableDatasetFilter filter) throws IOException { List list = this.listDatasets(); if (filter == null) return list; List retList = new ArrayList(); for (Iterator it = list.iterator(); it.hasNext();) { CrawlableDataset curDs = (CrawlableDataset) it.next(); if (filter.accept(curDs)) { retList.add(curDs); } } return (retList); } public CrawlableDataset getParentDataset() { if (!path.equals("/")) { String parentPath = path; int index = parentPath.lastIndexOf( "/", parentPath.endsWith( "/") ? parentPath.length() - 2 : parentPath.length() - 1 ); if ( index != -1 ) parentPath = parentPath.substring( 0, index + 1 ); return new CrawlableDatasetDods( parentPath, null); } else return null; } public boolean exists() { if (pathUrlConnection == null) try { URL u = new URL(path); pathUrlConnection = u.openConnection(); } catch (MalformedURLException e) { } catch (IOException e) { } if ( pathUrlConnection != null ) try { int responseCode = ((HttpURLConnection)pathUrlConnection).getResponseCode(); if (responseCode >= 200 && responseCode < 300) // Successful return true; } catch (IOException e) { } return false; } public long length() { if (this.isCollection()) return (0); if (pathUrlConnection == null) { try { URL u = new URL(path); pathUrlConnection = u.openConnection(); } catch (MalformedURLException e) { } catch (IOException e) { } } if (pathUrlConnection != null) return pathUrlConnection.getContentLength(); else return (-1); } public Date lastModified() { if (pathUrlConnection == null) { try { URL u = new URL(path); pathUrlConnection = u.openConnection(); } catch (MalformedURLException e) { } catch (IOException e) { } } if (pathUrlConnection != null) { long lastModified = pathUrlConnection.getLastModified(); if (lastModified != 0) { Calendar cal = Calendar.getInstance(); cal.clear(); cal.setTimeInMillis(lastModified); return (cal.getTime()); } else return null; } else return null; } public String toString() { return this.path; } }