package org.cdlib.xtf.textIndexer;
/*
* Copyright (c) 2004, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Acknowledgements:
*
* A significant amount of new and/or modified code in this module
* was made possible by a grant from the Andrew W. Mellon Foundation,
* as part of the Melvyl Recommender Project.
*/
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import javax.xml.parsers.SAXParser;
import javax.xml.transform.Templates;
import org.cdlib.xtf.textEngine.IndexUtil;
import org.cdlib.xtf.util.Normalizer;
import org.cdlib.xtf.util.Path;
import org.cdlib.xtf.util.StructuredStore;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
* Supplies a single file containing a single record to the
* {@link XMLTextProcessor}.
*
* @author Martin Haye
*/
public class XMLIndexSource extends IndexSource
{
  /** Source of XML data */
  private final InputSource inSrc;

  /** Path to the file, or null if it's not a local file. */
  private final File path;

  /** Key used to identify this file in the index */
  private final String key;

  /** XSLT pre-filters used to massage the XML document (null for none) */
  private final Templates[] preFilters;

  /**
   * Stylesheet from which to gather XSLT key definitions to be computed
   * and cached on disk. Typically, one would use the actual display
   * stylesheet for this purpose, guaranteeing that all of its keys will be
   * pre-cached.<br><br>
   *
   * Background: stylesheet processing can be optimized by using XSLT 'keys',
   * which are declared with an &lt;xsl:key&gt; tag. The first time a key
   * is used in a given source document, it must be calculated and its values
   * stored on disk. The text indexer can optionally pre-compute the keys so
   * they need not be calculated later during the display process.
   */
  private final Templates displayStyle;

  /**
   * Empty storage in which to build the persistent version of the
   * document (aka the "lazy tree"), or null to avoid building it.
   */
  private final StructuredStore lazyStore;

  /** Whether to remove DOCTYPE decl (this is kind of a kludge) */
  private boolean removeDoctypeDecl = false;

  /** Keep track of whether we've processed this file yet */
  private boolean isDone = false;

  /** A parser we can use to tell whether we need to apply crimson workaround */
  private static final SAXParser saxParser = IndexUtil.createSAXParser();

  /**
   * Simple constructor. The file path is derived from the system ID of the
   * input source (null if it has none); no pre-filters, display stylesheet
   * or lazy store are used.
   *
   * @param inSrc source of XML data
   * @param key   key used to identify this file in the index
   */
  public XMLIndexSource(InputSource inSrc, String key)
  {
    String sysId = inSrc.getSystemId();
    this.inSrc = inSrc;
    this.path = (sysId == null) ? null : new File(sysId);
    this.key = key;
    this.preFilters = null;
    this.displayStyle = null;
    this.lazyStore = null;
  }

  /**
   * Constructor -- initializes all the fields
   *
   * @param inSrc        source of XML data
   * @param path         path to the file, or null if it's not a local file
   * @param key          key used to identify this file in the index
   * @param preFilters   XSLT pre-filters to apply (null for none)
   * @param displayStyle stylesheet from which to gather XSLT keys (may be
   *                     null)
   * @param lazyStore    empty storage for the lazy tree, or null to avoid
   *                     building it
   */
  public XMLIndexSource(InputSource inSrc, File path, String key,
                        Templates[] preFilters, Templates displayStyle,
                        StructuredStore lazyStore)
  {
    this.inSrc = inSrc;
    this.path = path;
    this.key = key;
    this.preFilters = preFilters;
    this.displayStyle = displayStyle;
    this.lazyStore = lazyStore;
  }

  /**
   * Sets whether the DOCTYPE declaration should be removed when the input
   * is filtered (see {@link #filterInput()}).
   *
   * @param flag true to remove the DOCTYPE declaration
   */
  public void removeDoctypeDecl(boolean flag) {
    this.removeDoctypeDecl = flag;
  }

  // inherit JavaDoc
  public File path() {
    return path;
  }

  // inherit JavaDoc
  public String key() {
    return key;
  }

  // inherit JavaDoc
  public Templates[] preFilters() {
    return preFilters;
  }

  // inherit JavaDoc
  public Templates displayStyle() {
    return displayStyle;
  }

  // inherit JavaDoc
  public long totalSize()
  {
    // Without a local file there is nothing to measure; report a nominal 1.
    if (path == null)
      return 1;
    return path.length();
  }

  /**
   * Returns the one and only record of this source (covering the whole
   * file) on the first call, and null on every subsequent call.
   */
  public IndexRecord nextRecord()
    throws SAXException, IOException
  {
    // Don't process the record twice.
    if (isDone)
      return null;
    isDone = true;

    // Okay, construct a record for the whole file. The input is filtered
    // lazily, only when the XML source is actually requested.
    return new IndexRecord()
    {
      public InputSource xmlSource()
        throws IOException
      {
        return filterInput();
      }

      public int recordNum() {
        return 0;
      }

      public int percentDone() {
        return 100;
      }

      public StructuredStore lazyStore() {
        return lazyStore;
      }
    };
  } // nextRecord()

  /**
   * Filter the input, if necessary, to remove DOCTYPE declarations, or
   * work around a bug in the Crimson parser.
   *
   * @return the original input source if no filtering is required (or if it
   *         is backed by a character stream), otherwise a new input source
   *         wrapping the filtered byte stream and carrying the same system ID
   * @throws FileNotFoundException if the system ID refers to a file that
   *                               cannot be read
   * @throws IOException           if the input source supplies neither a
   *                               reader, a byte stream nor a system ID
   */
  protected InputSource filterInput()
    throws IOException
  {
    // If the input source is a reader, don't filter it.
    if (inSrc.getCharacterStream() != null)
      return inSrc;

    // If no kludgy steps to perform, skip this step.
    boolean applyCrimsonWorkaround = saxParser.getClass().getName().equals(
      "org.apache.crimson.jaxp.SAXParserImpl");
    if (!applyCrimsonWorkaround && !removeDoctypeDecl)
      return inSrc;

    // Convert the input source to an input stream if it isn't one already.
    String sysId = inSrc.getSystemId();
    InputStream rawStream;
    boolean openedHere = false;
    if (inSrc.getByteStream() != null)
      rawStream = inSrc.getByteStream();
    else if (sysId != null && sysId.length() > 0)
    {
      // Drop the "file:" scheme (plus one slash of a leading "//"), always
      // keeping a leading '/' on the remaining path.
      String filePath = Path.normalizeFileName(sysId);
      if (filePath.startsWith("file://"))
        filePath = filePath.substring(6);
      else if (filePath.startsWith("file:/"))
        filePath = filePath.substring(5);

      // Make sure we can read the file.
      if (!(new File(filePath).canRead()))
        throw new FileNotFoundException(sysId);
      rawStream = new FileInputStream(filePath);
      openedHere = true;
    }
    else
      throw new IOException(
        "Must pass a Reader, InputStream or system ID to index");

    // Apply kludgy filters if necessary. If filtering fails, close the
    // stream we opened ourselves so the file handle isn't leaked; a stream
    // supplied by the caller remains the caller's responsibility.
    InputStream filtered;
    boolean succeeded = false;
    try {
      filtered = IndexUtil.filterXMLDocument(rawStream,
                                             applyCrimsonWorkaround,
                                             removeDoctypeDecl);
      succeeded = true;
    }
    finally {
      if (!succeeded && openedHere) {
        try {
          rawStream.close();
        }
        catch (IOException ignored) {
          // Let the original failure propagate rather than the close error.
        }
      }
    }

    // Finally, make a new InputSource from the filtered stream.
    InputSource finalSrc = new InputSource(filtered);
    if (sysId != null)
      finalSrc.setSystemId(sysId);
    return finalSrc;
  } // filterInput()

  /**
   * Prepare a string for inclusion in an XML document. Unicode strings are
   * normalized to their canonical equivalents, a few characters are
   * escaped as entities, and invalid characters are removed.
   * <p>
   * Specifically: '&amp;' and '&lt;' are replaced by their entity
   * references; control characters below U+0020 other than tab, line feed
   * and carriage return are removed; U+FFFE and U+FFFF are removed; and
   * unpaired UTF-16 surrogates are removed. Well-formed surrogate pairs
   * (i.e. supplementary characters, which are legal in XML) are kept.
   *
   * @param s string to normalize (must not be null)
   * @return possibly changed version of the string; the (possibly
   *         Unicode-normalized) input itself if nothing had to be escaped
   *         or removed
   */
  public static String normalize(String s)
  {
    // Pure ASCII text is never touched by Unicode normalization, so only
    // invoke the normalizer when a non-ASCII character is present.
    boolean needNormalize = false;
    for (int i = 0; i < s.length() && !needNormalize; i++) {
      if ((s.charAt(i) & ~0x7f) != 0)
        needNormalize = true;
    }
    if (needNormalize)
    {
      String s2 = Normalizer.normalize(s);
      if (!s.equals(s2))
        s = s2;
    }

    // Scan the final text (after normalization, so the figures always match
    // what will be written) to see whether anything needs escaping or
    // removal, and how much extra room the entity references need.
    char[] ch = s.toCharArray();
    int extraSpace = 0;
    boolean mustRewrite = false;
    for (int i = 0; i < ch.length; i++)
    {
      char c = ch[i];
      if (c == '&') {
        extraSpace += 4; // "&amp;" is four characters longer than "&"
        mustRewrite = true;
      }
      else if (c == '<') {
        extraSpace += 3; // "&lt;" is three characters longer than "<"
        mustRewrite = true;
      }
      else if (isSurrogatePairAt(ch, i))
        i++; // valid supplementary character; skip its low surrogate
      else if (isInvalidXmlChar(c))
        mustRewrite = true;
    }
    if (!mustRewrite)
      return s;

    // Build the escaped version. Deletions only ever shrink the output, so
    // the original length plus the entity overhead is a safe upper bound.
    char[] newCh = new char[ch.length + extraSpace];
    int dp = 0;
    for (int sp = 0; sp < ch.length; sp++)
    {
      char c = ch[sp];
      if (c == '&') {
        newCh[dp++] = '&';
        newCh[dp++] = 'a';
        newCh[dp++] = 'm';
        newCh[dp++] = 'p';
        newCh[dp++] = ';';
      }
      else if (c == '<') {
        newCh[dp++] = '&';
        newCh[dp++] = 'l';
        newCh[dp++] = 't';
        newCh[dp++] = ';';
      }
      else if (isSurrogatePairAt(ch, sp)) {
        // Copy both halves of the pair together.
        newCh[dp++] = c;
        newCh[dp++] = ch[++sp];
      }
      else if (!isInvalidXmlChar(c))
        newCh[dp++] = c;
      // else: delete invalid character
    }
    return new String(newCh, 0, dp);
  } // normalize()

  /**
   * Tells whether a well-formed UTF-16 surrogate pair (high surrogate
   * immediately followed by a low surrogate) starts at the given index.
   */
  private static boolean isSurrogatePairAt(char[] ch, int i)
  {
    return ch[i] >= '\uD800' && ch[i] <= '\uDBFF' &&
           i + 1 < ch.length &&
           ch[i + 1] >= '\uDC00' && ch[i + 1] <= '\uDFFF';
  }

  /**
   * Tells whether a single UTF-16 code unit, taken on its own, must be
   * removed: a control character other than tab, line feed or carriage
   * return; any surrogate (callers handle valid pairs beforehand); or
   * U+FFFE / U+FFFF.
   */
  private static boolean isInvalidXmlChar(char c)
  {
    if (c < '\u0020')
      return c != '\t' && c != '\n' && c != '\r';
    if (c >= '\uD800' && c <= '\uDFFF')
      return true;
    return c >= '\uFFFE' && c <= '\uFFFF';
  }
} // class XMLIndexSource