package org.cdlib.xtf.xslt;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.lang.reflect.Method;
import java.math.BigInteger;
import java.net.URL;
import java.net.URLConnection;
import java.security.MessageDigest;
import java.text.DecimalFormat;
import java.text.FieldPosition;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import javax.xml.transform.Source;
import javax.xml.transform.sax.SAXSource;
import javax.xml.transform.stream.StreamSource;
import org.cdlib.xtf.textIndexer.HTMLToString;
import org.cdlib.xtf.util.Path;
import org.xml.sax.InputSource;
import net.sf.saxon.expr.XPathContext;
import net.sf.saxon.om.DocumentInfo;
import net.sf.saxon.trace.InstructionInfo;
import net.sf.saxon.trans.XPathException;
/*
* Copyright (c) 2006, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Acknowledgements:
*
* A significant amount of new and/or modified code in this module
* was made possible by a grant from the Andrew W. Mellon Foundation,
* as part of the Melvyl Recommender Project.
*/
/*
* This file created on Apr 21, 2005 by Martin Haye
*/
/**
* Provides file-related utilities to be called by XSLT stylesheets through
* Saxon's extension function mechanism.
*
* @author Martin Haye
*/
public class FileUtils
{
/**
 * Cache of date formatters keyed by their pattern string, so that a
 * formatter for a given pattern only has to be constructed once.
 */
private static final HashMap<String, SimpleDateFormat> dateFormatCache =
  new HashMap<String, SimpleDateFormat>();

/**
 * Temporary files handed out by {@link #createTempFile}; each thread
 * gets its own list.
 */
private static final ThreadLocal<ArrayList<File>> tempFiles =
  new ThreadLocal<ArrayList<File>>();
/**
 * Tells whether the file at the given path is readable. A relative path
 * is resolved against the location of the calling stylesheet.
 *
 * @param context   Context used to figure out which stylesheet is calling
 *                  the function.
 * @param filePath  Path to the file in question
 * @return          true if the file exists and can be read, else false
 */
public static boolean exists(XPathContext context, String filePath) {
  return resolveFile(context, filePath).canRead();
} // exists()
/**
 * Gets the last-modified time of the file with the given path, provided
 * the file can be read. A relative path is resolved against the location
 * of the calling stylesheet.
 *
 * @param context   Context used to figure out which stylesheet is calling
 *                  the function.
 * @param filePath  Path to the file in question
 * @param formatStr A simple format string; see {@link SimpleDateFormat}.
 * @return          The formatted date/time if the file can be read;
 *                  null otherwise.
 */
public static String lastModified(XPathContext context, String filePath,
                                  String formatStr)
{
  File target = resolveFile(context, filePath);
  if (target.canRead()) {
    SimpleDateFormat formatter = getDateFormat(formatStr);
    Date stamp = new Date(target.lastModified());
    return formatter.format(stamp);
  }
  return null;
} // lastModified()
/**
 * Gets the size in bytes of the file with the given path, provided the
 * file can be read. A relative path is resolved against the location of
 * the calling stylesheet.
 *
 * @param context   Context used to figure out which stylesheet is calling
 *                  the function.
 * @param filePath  Path to the file in question
 * @return          The file size, or -1 if the file cannot be read.
 */
public static long length(XPathContext context, String filePath) {
  File target = resolveFile(context, filePath);
  return target.canRead() ? target.length() : -1;
} // length()
/**
 * Converts the size of a file to a human-readable string, e.g.
 * "36.00 Kb", "1.20 Mb", etc. Contributor: Michael A. Russell
 *
 * Sizes below 512 are returned as the plain number. Larger sizes are
 * expressed, with two decimal places, in the smallest unit (Kb, Mb, Gb,
 * Tb, Pb, Eb; each 1024 times the previous) for which the value is less
 * than 512. Eb is the last unit because a long holds at most 2**63 - 1,
 * which is about 8 exabytes.
 *
 * @param longFileSize  The size to convert
 * @return              Human-readable string approximating that size, or
 *                      a zero-length string if the size is null or
 *                      negative.
 */
public static String humanFileSize(Long longFileSize)
{
  /* A missing size is treated like a negative one instead of failing
   * with a NullPointerException during unboxing.
   */
  if (longFileSize == null) return "";

  long size = longFileSize.longValue();
  if (size < 0) return "";

  /* If it's up to 512, use the number itself. */
  if (size < 512) return Long.toString(size);

  final String[] unitSuffixes = { " Kb", " Mb", " Gb", " Tb", " Pb", " Eb" };

  /* Step up one unit at a time for as long as the value would be 512 or
   * more in the current unit. The index test comes first so that the
   * multiplication below is never evaluated for the last unit, where it
   * would overflow a long.
   */
  long unitBytes = 1024L;
  int unit = 0;
  while (unit < unitSuffixes.length - 1 && size >= unitBytes * 512L) {
    unitBytes *= 1024L;
    unit++;
  }

  /* We want exactly two digits to the right of the decimal point. */
  DecimalFormat outputFormat = new DecimalFormat("0.00");
  return outputFormat.format((double) size / (double) unitBytes)
         + unitSuffixes[unit];
}
/**
 * Calculate the MD5 digest of a string. Contributor: Michael A. Russell
 *
 * The string is encoded as UTF-8 before hashing, so the result does not
 * depend on the platform's default character set, and every byte of the
 * encoded string is digested (including multi-byte characters).
 *
 * @param inputString   String to digest
 * @return              The string's MD5 hash as 32 lower-case hex digits,
 *                      or a zero-length string if the digest could not be
 *                      computed.
 */
public static String md5Hash(String inputString)
{
  byte[] digest;
  try {
    MessageDigest msgDigest = MessageDigest.getInstance("MD5");

    /* Digest the complete byte array. The number of bytes can exceed
     * inputString.length() whenever a character needs more than one
     * byte, so the character count must not be used as the byte count.
     */
    digest = msgDigest.digest(inputString.getBytes("UTF-8"));
  }
  catch (java.security.NoSuchAlgorithmException e) {
    /* Keep the original best-effort contract: no digest, empty result. */
    return "";
  }
  catch (java.io.UnsupportedEncodingException e) {
    /* Keep the original best-effort contract: no digest, empty result. */
    return "";
  }

  /* Emit two hex digits per byte so that leading zero bytes are kept and
   * the result always has the full length of the digest.
   */
  StringBuilder hex = new StringBuilder(digest.length * 2);
  for (byte b : digest) {
    hex.append(Character.forDigit((b >> 4) & 0xF, 16));
    hex.append(Character.forDigit(b & 0xF, 16));
  }
  return hex.toString();
}
/**
 * Obtains the system ID associated with the given XPath context. The way
 * to get at it changed between Saxon 9.0 and Saxon 9.1, so the direct
 * call is tried first and, if that method is missing at run time, a
 * reflective lookup of "getOrigin" is used instead.
 *
 * @param context   Context to interrogate
 * @return          The system ID, or null if the reflective fallback
 *                  finds no method named "getOrigin".
 */
private static String getSystemId(XPathContext context)
{
  try {
    // Saxon 9.0 and below
    return context.getOrigin().getInstructionInfo().getSystemId();
  }
  catch (NoSuchMethodError missingInThisSaxon)
  {
    // Saxon 9.1 and above
    Method[] candidates = context.getClass().getMethods();
    for (int i = 0; i < candidates.length; i++) {
      Method candidate = candidates[i];
      if (!"getOrigin".equals(candidate.getName()))
        continue;
      try {
        Object origin = candidate.invoke(context);
        return ((InstructionInfo) origin).getSystemId();
      }
      catch (Exception reflectionFailure) {
        throw new RuntimeException(reflectionFailure);
      }
    }
  }
  return null;
}
/**
 * Locates a file relative to the calling stylesheet. A leading "file:"
 * is stripped and every "%20" is turned back into a space, both in the
 * stylesheet's system ID and in the requested path, before the path is
 * resolved against the directory that contains the stylesheet.
 *
 * @param context   Context used to figure out which stylesheet is calling
 *                  the function.
 * @param filePath  Relative or absolute path to resolve
 * @return          A File for the resolved path
 */
public static File resolveFile(XPathContext context, String filePath)
{
  String sheetLocation = getSystemId(context)
      .replaceFirst("^file:", "")
      .replaceAll("%20", " "); // fix spaces from Saxon on Windows
  File sheetDir = new File(sheetLocation).getParentFile();

  String cleanedPath = filePath
      .replaceFirst("^file:", "")
      .replaceAll("%20", " "); // fix spaces from Saxon on Windows

  return new File(Path.resolveRelOrAbs(sheetDir, cleanedPath));
} // resolveFile
/**
 * Resolves a path relative to the calling stylesheet and returns it as a
 * normalized string. A leading "file:" is stripped and every "%20" is
 * turned back into a space, both in the stylesheet's system ID and in the
 * requested path; the path is then resolved against the directory that
 * contains the stylesheet and passed through Path.normalize.
 *
 * @param context   Context used to figure out which stylesheet is calling
 *                  the function.
 * @param filePath  Relative or absolute path to resolve
 * @return          The resolved, normalized path
 */
public static String resolvePath(XPathContext context, String filePath)
{
  String callerLocation = getSystemId(context);
  callerLocation = callerLocation.replaceFirst("^file:", "")
                                 .replaceAll("%20", " "); // fix spaces from Saxon on Windows
  File callerDir = new File(callerLocation).getParentFile();

  String requested = filePath.replaceFirst("^file:", "")
                             .replaceAll("%20", " "); // fix spaces from Saxon on Windows

  return Path.normalize(Path.resolveRelOrAbs(callerDir, requested));
}
/**
 * Formats the current date and time.
 *
 * @param context   Context used to figure out which stylesheet is calling
 *                  the function (not consulted by this method).
 * @param formatStr A simple format string; see {@link SimpleDateFormat}.
 * @return          The current date/time, formatted as requested.
 */
public static String curDateTime(XPathContext context, String formatStr) {
  return getDateFormat(formatStr).format(new Date());
} // curDateTime()
/**
 * Number of milliseconds in a minute. This holds for every minute except
 * one into which a leap second has been inserted.
 */
private static final long MILLISECS_PER_MINUTE = 60L * 1000L;

/**
 * Number of milliseconds in an hour, except when a leap second is
 * inserted.
 */
private static final long MILLISECS_PER_HOUR = 60L * MILLISECS_PER_MINUTE;

/**
 * Number of milliseconds in a day, except on
 * <BR/>1. days when a leap second has been inserted, e.g. 1999 JAN 1.
 * <BR/>2. Daylight-savings "spring forward" or "fall back" days.
 */
private static final long MILLISECS_PER_DAY = 24L * MILLISECS_PER_HOUR;
/**
 * Compute the number of days, hours, or minutes that have elapsed between
 * the given time and now. The result is the millisecond difference divided
 * by the length of the unit using integer division, so partial units are
 * dropped.
 *
 * @param context       Context used to figure out which stylesheet is calling
 *                      the function (not consulted by this method).
 * @param targetDateStr The target date
 * @param units         Units to return: 'days', 'hours', or 'minutes'. Plural
 *                      is optional, and single-letter abbreviations are accepted.
 * @param formatStr     The format of the target date; see {@link SimpleDateFormat}.
 * @return              number of days, hours or minutes elapsed
 * @throws RuntimeException if the units are not recognized, or if the
 *                      target date cannot be parsed (the ParseException
 *                      is attached as the cause).
 */
public static long timeSince(XPathContext context,
                             String targetDateStr, String units, String formatStr)
{
  try {
    // First, parse the target time.
    SimpleDateFormat fmt = getDateFormat(formatStr);
    Date targetDate = fmt.parse(targetDateStr);
    Calendar tmpCal = Calendar.getInstance();
    tmpCal.setTime(targetDate);
    long targetMillis = adjustedMillis(tmpCal);

    // Now get the current time for comparison
    tmpCal.setTime(new Date());
    long currentMillis = adjustedMillis(tmpCal);
    long diff = currentMillis - targetMillis;

    // Compute the answer in the desired units.
    if (units.matches("d|D|day|Day|days|Days"))
      return diff / MILLISECS_PER_DAY;
    else if (units.matches("h|H|hour|Hour|hours|Hours"))
      return diff / MILLISECS_PER_HOUR;
    else if (units.matches("m|M|min|Min|minute|Minute|minutes|Minutes"))
      return diff / MILLISECS_PER_MINUTE;
    else
      throw new RuntimeException("timeSince units must be days, hours, or minutes (or d/h/m)");
  }
  catch (ParseException e) {
    // Keep the original exception as the cause so the parse position and
    // stack trace are not lost.
    throw new RuntimeException("error parsing date '" + targetDateStr + "'", e);
  }
}
/**
 * Returns the calendar's time in milliseconds plus the offset that the
 * calendar's time zone has at that instant, so that day subtraction
 * works properly.
 *
 * @param cal   Calendar holding the instant and the time zone
 * @return      Milliseconds since the epoch, shifted by the zone offset
 */
private static long adjustedMillis(Calendar cal) {
  long instant = cal.getTimeInMillis();
  int zoneOffset = cal.getTimeZone().getOffset(instant);
  return instant + zoneOffset;
}
/**
 * Get a SimpleDateFormat for the given format string. One master instance
 * per format string is kept in the cache; callers receive their own copy.
 *
 * SimpleDateFormat instances are mutable and must not be used by several
 * threads at once, and the cache map is shared by all threads. Access to
 * the cache is therefore synchronized, and the cached instance itself is
 * never handed out, only a clone of it.
 *
 * @param formatStr is the format string to use
 * @return          a SimpleDateFormat for that format string, private to
 *                  the caller.
 */
private static SimpleDateFormat getDateFormat(String formatStr) {
  synchronized (dateFormatCache) {
    SimpleDateFormat master = (SimpleDateFormat) dateFormatCache.get(formatStr);
    if (master == null) {
      master = new SimpleDateFormat(formatStr);
      dateFormatCache.put(formatStr, master);
    }
    // Cloning is cheaper than re-parsing the pattern, and keeps the
    // master copy untouched by callers.
    return (SimpleDateFormat) master.clone();
  }
}
/**
 * Produces a unique temporary file name in the default temporary-file
 * directory, built from the given prefix and suffix, and records it in
 * the calling thread's list of temp files so that
 * {@link #deleteTempFiles} can remove it later.
 *
 * The file is created and then deleted again straight away, so what the
 * caller receives is a path at which no file exists at that moment.
 * NOTE(review): presumably the caller is expected to create the file
 * itself -- confirm.
 *
 * @param context   Context used to figure out which stylesheet is calling
 *                  the function (not consulted by this method).
 * @param prefix    Prefix for the resulting file name.
 * @param suffix    Suffix for the resulting file name.
 * @return          The new temporary file name, as an absolute path.
 * @throws IOException if the file could not be created
 */
public static String createTempFile(XPathContext context,
                                    String prefix, String suffix)
  throws IOException
{
  File reserved = File.createTempFile(prefix, suffix);
  reserved.delete();

  // Lazily set up this thread's list the first time it is needed.
  ArrayList<File> registry = tempFiles.get();
  if (registry == null) {
    registry = new ArrayList<File>();
    tempFiles.set(registry);
  }
  registry.add(reserved);

  return reserved.getAbsolutePath();
}
/**
 * Deletes all temporary files created by the current thread using
 * {@link #createTempFile}.
 *
 * Entries whose file was deleted, or whose file does not exist (so there
 * is nothing left to delete), are dropped from the thread's list. Files
 * that still exist but could not be deleted stay in the list, so that a
 * later call can try again.
 */
public static void deleteTempFiles()
{
  ArrayList<File> files = tempFiles.get();
  if (files == null)
    return;

  // Collect the survivors separately instead of removing from the list
  // while iterating over it, which would make the iterator throw
  // ConcurrentModificationException.
  ArrayList<File> undeleted = new ArrayList<File>();
  for (File f : files) {
    if (!f.delete() && f.exists())
      undeleted.add(f);
  }

  files.clear();
  files.addAll(undeleted);
}
/**
 * Reads in the first part of an XML file by parsing it through an
 * XMLStubReader (which, per the original documentation of this method,
 * stops at the first close-element marker). Generally this captures
 * enough information to identify which kind of XML data is inside the
 * file. The file is closed before this method returns, whether or not
 * parsing succeeded.
 *
 * @param context   Context used to resolve the path and to build the
 *                  document.
 * @param filePath  Path to the file; if relative, it is resolved relative
 *                  to the calling stylesheet.
 * @return          The document built from the file
 * @throws IOException    if the file can't be read
 * @throws XPathException if the document cannot be parsed
 */
public static DocumentInfo readXMLStub(XPathContext context, String filePath)
  throws IOException, XPathException
{
  // First, locate the file
  File file = resolveFile(context, filePath);
  if (!file.canRead())
    throw new IOException("Cannot read file '" + file.toString() + "'");

  // Now read it in, up to the first close-element marker.
  XMLStubReader xmlReader = new XMLStubReader();
  InputStream bufStream = new BufferedInputStream(new FileInputStream(file));
  try {
    InputSource inputSrc = new InputSource(bufStream);
    inputSrc.setSystemId(file.toURI().toString());
    Source saxSrc = new SAXSource(xmlReader, inputSrc);
    return context.getConfiguration().buildDocument(saxSrc);
  }
  finally {
    // Always release the file handle; nothing else in this method closes
    // the stream.
    try {
      bufStream.close();
    }
    catch (IOException ignored) {
      // The stream was only read from, so a failure to close it loses no
      // data and must not mask an exception from the parse above.
    }
  }
}
/**
 * Fetches an HTML page from the given URL, converts it to an XML string
 * with HTMLToString.convert (JTidy-based, per the original documentation
 * of this method), and builds an in-memory document from that string so
 * it can be processed by a stylesheet.
 *
 * @param context   Context whose configuration builds the document
 * @param urlStr    URL of the page; also used as the system ID of the
 *                  resulting document
 * @return          The page as an XML document
 * @throws IOException    if the file can't be read
 * @throws XPathException if the document cannot be parsed
 */
public static DocumentInfo readHTMLPage(XPathContext context, String urlStr)
  throws IOException, XPathException
{
  // Read the HTML page, and convert it to an XML string
  InputStream htmlIn = null;
  String xmlText;
  try {
    URLConnection link = new URL(urlStr).openConnection();
    htmlIn = link.getInputStream();
    xmlText = HTMLToString.convert(htmlIn);
  }
  finally {
    if (htmlIn != null)
      htmlIn.close();
  }

  // And convert that string to an in-memory XML document.
  StreamSource xmlSource = new StreamSource(new StringReader(xmlText), urlStr);
  return context.getConfiguration().buildDocument(xmlSource);
}
} // class FileUtils