package org.cdlib.xtf.textIndexer;
/*
* Copyright (c) 2004, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Acknowledgements:
*
* A significant amount of new and/or modified code in this module
* was made possible by a grant from the Andrew W. Mellon Foundation,
* as part of the Melvyl Recommender Project.
*/
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Vector;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Templates;
import javax.xml.transform.sax.SAXResult;
import javax.xml.transform.sax.SAXSource;
import org.apache.lucene.util.CountedInputStream;
import org.cdlib.xtf.util.Normalizer;
import org.cdlib.xtf.util.StructuredStore;
import org.marc4j.marc.MarcConstants;
import org.marc4j.marcxml.Converter;
import org.marc4j.marcxml.DoctypeDecl;
import org.marc4j.marcxml.MarcXmlReader;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
/**
* Supplies MARC data to an XTF index, breaking it up into individual MARCXML
* records.
*
* @author Martin Haye
*/
public class MARCIndexSource extends IndexSource
{
/**
 * Creates an index source for one MARC file.
 *
 * @param path         the MARC file to read
 * @param key          key used to identify this file in the index
 * @param preFilters   XSLT pre-filters to apply (null for none)
 * @param displayStyle stylesheet from which XSLT key definitions are gathered
 */
public MARCIndexSource(File path, String key, Templates[] preFilters,
                       Templates displayStyle)
{
  this.displayStyle = displayStyle;
  this.preFilters = preFilters;
  this.key = key;
  this.path = path;

  // The total size is captured up front; percent-done figures are
  // computed against it later.
  this.fileSize = this.path.length();
}
/**
 * Path to the MARC file.
 * NOTE(review): an earlier version of this comment said the path may be
 * null for non-local files, but the constructor calls path.length() and
 * openFile() passes it to FileInputStream, so null is not usable here.
 */
private File path;
/** Key used to identify this file in the index */
private String key;
/** XSLT pre-filters used to massage the XML document (null for none) */
private Templates[] preFilters;
/** Stylesheet from which to gather XSLT key definitions to be computed
* and cached on disk. Typically, one would use the actual display
* stylesheet for this purpose, guaranteeing that all of its keys will be
* pre-cached.<br><br>
*
* Background: stylesheet processing can be optimized by using XSLT 'keys',
* which are declared with an &lt;xsl:key&gt; tag. The first time a key
* is used in a given source document, it must be calculated and its values
* stored on disk. The text indexer can optionally pre-compute the keys so
* they need not be calculated later during the display process.
*/
private Templates displayStyle;
/** Size of the whole input file, as reported by File.length() in the constructor */
private long fileSize = -1;
/** Input stream for the raw data; stays null until openFile() has run */
private CountedInputStream rawStream = null;
/** Record handling thread; created and started by openFile() */
private RecordHandler recordHandler;
/** Set once nextRecord() has determined that no more records will arrive */
private boolean isDone = false;
/** Number of records taken from the handler thread so far by nextRecord() */
private int recordNum = 0;
/** Returns the file path that was handed to the constructor. */
public File path()
{
  return this.path;
}
/** Returns the key used to identify this file in the index. */
public String key()
{
  return this.key;
}
/** Returns the XSLT pre-filters given to the constructor (null for none). */
public Templates[] preFilters()
{
  return this.preFilters;
}
/** Returns the display stylesheet given to the constructor. */
public Templates displayStyle()
{
  return this.displayStyle;
}
/** Returns the input file's size as measured when this object was constructed. */
public long totalSize()
{
  return this.fileSize;
}
/**
 * Returns the next MARCXML record from the file, or null when there are no
 * more. The first call opens the file and starts the converter thread; each
 * call then blocks until that thread has either published a record or
 * finished.
 *
 * @return the next record, or null once the input is exhausted
 * @throws SAXException not raised by the code in this method; kept in the
 *                      signature for callers
 * @throws IOException  if the MARC file cannot be opened
 */
public IndexRecord nextRecord()
  throws SAXException, IOException
{
  // If we're done, say so.
  if (isDone)
    return null;

  // Open the MARC file (and start the converter thread) if we haven't
  // already.
  openFile();

  // Get the next record from the handler thread.
  String parsedMarcXML = null;
  synchronized (recordHandler)
  {
    while (true)
    {
      // Look for a pending record BEFORE looking at the 'done' flag. The
      // handler publishes its last record and then terminates without
      // waiting for that record to be picked up, so both conditions can be
      // true at once; testing 'isDone' first would silently drop the final
      // record of the file.
      if (recordHandler.parsedMarcXML != null) {
        parsedMarcXML = recordHandler.parsedMarcXML;
        ++recordNum;
        recordHandler.parsedMarcXML = null;

        // Wake the handler, which may be waiting to publish the next one.
        recordHandler.notifyAll();
        break;
      }

      // Nothing pending and the handler has finished: end of input.
      // NOTE(review): recordHandler.error is still not reported here;
      // whether the converter routinely fails at end-of-file is not
      // visible from this file, so surfacing it needs confirmation.
      if (recordHandler.isDone) {
        isDone = true;
        break;
      }

      try {
        recordHandler.wait();
      }
      catch (InterruptedException e) {
        // Preserve the interrupt for whoever is above us on the stack.
        Thread.currentThread().interrupt();
        assert false : "how could this thread be interrupted??";
        isDone = true;
        break;
      }
    }
  } // sync

  // If we ran out of records, say so.
  if (isDone)
    return null;

  // Okay, make a record out of it. The record number is captured now so
  // that the record keeps reporting its own number even after later
  // records have been fetched.
  final Reader reader = new StringReader(parsedMarcXML);
  final int thisRecordNum = recordNum;
  return new IndexRecord()
  {
    public InputSource xmlSource()
      throws IOException
    {
      return new InputSource(reader);
    }

    public int recordNum() {
      return thisRecordNum;
    }

    public int percentDone() {
      // A zero (or unknown) file size would otherwise divide by zero.
      if (fileSize <= 0)
        return 0;

      // The "+1" can push small files past 100; clamp the result.
      long pct = (rawStream.nRead() + 1) * 100 / fileSize;
      return (int)Math.min(pct, 100L);
    }

    public StructuredStore lazyStore() {
      return null;
    }
  };
} // nextRecord()
/**
 * Opens the MARC file and launches the conversion thread. Only the first
 * call does any work; later calls find the stream already set and return.
 *
 * @throws IOException if the file cannot be opened
 */
private void openFile()
  throws IOException
{
  if (rawStream == null)
  {
    // Wrap the file so that the number of bytes consumed can be tracked.
    FileInputStream fileIn = new FileInputStream(path);
    rawStream = new CountedInputStream(new BufferedInputStream(fileIn));

    // The handler is both the SAX sink for the converter's output and the
    // thread that drives the conversion.
    recordHandler = new RecordHandler();
    recordHandler.start();
  }
} // openFile()
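/**
* Background thread that runs the MARC-to-MARCXML conversion and hands the
* resulting records, one at a time, to the thread calling nextRecord().
* It is also the SAX ContentHandler that receives the converter's output.
*/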
private class RecordHandler extends Thread implements ContentHandler
{
/**
 * A single parsed MARCXML record waiting to be picked up by nextRecord(),
 * or null when no record is pending. Written by endChunk() and cleared by
 * nextRecord(), both while synchronized on this handler.
 */
public String parsedMarcXML = null;
/** Set to true when this thread has finished its business. */
public boolean isDone = false;
/**
 * If an exception occurred, it is recorded here.
 * NOTE(review): nothing in the visible code reads this field.
 */
public Throwable error = null;
/** Names of XML namespace prefixes, in the order they were declared */
private Vector prefixNames = new Vector();
/** URIs of XML namespace prefixes, parallel to prefixNames */
private Vector prefixUris = new Vector();
/** Mapping from URI to name */
private HashMap prefixUriToName = new HashMap();
/** Accumulates the current MARCXML record */
private StringBuffer buffer = new StringBuffer();
/** Number of records completed by endChunk(); used in skipBadRecord()'s message */
private int recordNum = 0;
/**
 * Thread body: repeatedly runs the converter over the raw stream, trying to
 * resynchronize after each stop, until no further progress can be made.
 * Any Throwable that escapes the loop is stored in 'error'. On the way out,
 * 'isDone' is set and all threads waiting on this handler are woken.
 */
public void run()
{
try
{
while (true)
{
// Remember where the stream stood before this pass, so we can tell
// afterwards whether the converter consumed anything
// (nRead() is presumably the count of bytes read so far -- confirm
// against CountedInputStream).
long startPos = rawStream.nRead();
try {
convertRecords();
long endPos = rawStream.nRead();
// Nothing was consumed on this pass: stop looping.
if (endPos == startPos)
break;
// Something was consumed: scan forward to the next record terminator
// and, if one is found, run the converter again from there.
else if (skipBadRecord())
continue;
else
break;
}
// Note that this also catches anything thrown by skipBadRecord() in the
// try block above, not just converter failures.
catch (Throwable t) {
long endPos = rawStream.nRead();
// Failure without consuming any input: hand it to the outer catch.
if (endPos == startPos)
throw t;
// Otherwise try to skip past the trouble spot and carry on.
else if (skipBadRecord())
continue;
else
break;
}
}
}
catch (Throwable t) {
// Keep the failure; this thread has no caller to throw it to.
error = t;
}
finally {
// Mark completion, then wake anyone blocked in wait() on this handler
// so they can observe the flag.
isDone = true;
synchronized (this) {
notifyAll();
}
}
}
/**
 * Runs the MARC converter over the raw stream from its current position,
 * sending the resulting SAX events to this handler.
 *
 * @throws Exception whatever the reader set-up or the converter throws
 */
private void convertRecords()
  throws Exception
{
  // Bytes are mapped one-to-one onto characters.
  Reader marcChars = new InputStreamReader(rawStream, "ISO8859_1");

  // The MARC-aware SAX producer. Failure to set the doctype property is
  // tolerated.
  MarcXmlReader marcParser = new MarcXmlReader();
  try {
    marcParser.setProperty(
      "http://marc4j.org/properties/document-type-declaration",
      new DoctypeDecl());
  }
  catch (SAXException e) {
    /*ignore*/
  }

  // Wire producer -> converter -> this ContentHandler, and go.
  Source marcSource = new SAXSource(marcParser, new InputSource(marcChars));
  Result sink = new SAXResult(this);
  Converter marcConverter = new Converter();
  marcConverter.convert(marcSource, sink);
} // convertRecords
/**
 * Reads the raw stream forward until a MARC record terminator
 * (MarcConstants.RT) has been consumed.
 *
 * @return true if a terminator was found; false if end of stream was
 *         reached first
 * @throws IOException if reading the stream fails
 */
private boolean skipBadRecord()
  throws IOException
{
  int skippedBytes = 0;
  int b;
  while ((b = rawStream.read()) >= 0)
  {
    if (b == MarcConstants.RT)
    {
      // Only mention it if something actually had to be thrown away.
      if (skippedBytes > 0) {
        System.err.flush();
        System.out.println(
          "Bad MARC data near record " + recordNum + ". Attempting to resume.");
      }
      return true;
    }
    ++skippedBytes;
  }

  // Ran off the end of the stream without seeing a terminator.
  System.err.flush();
  System.out.println("Bad MARC data near end of file. Skipping.");
  return false;
} // skipBadRecord()
/**
 * Starts a fresh record: empties the buffer, then writes the XML
 * declaration and an opening &lt;record&gt; tag carrying one xmlns
 * attribute per namespace prefix recorded so far. URIs are written
 * verbatim, without escaping.
 */
private void beginChunk()
  throws SAXException
{
  buffer.setLength(0);
  buffer.append("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n")
        .append("<record");

  for (int idx = 0; idx < prefixNames.size(); idx++)
  {
    String name = (String)prefixNames.get(idx);
    String nsUri = (String)prefixUris.get(idx);

    // An empty or missing prefix denotes the default namespace.
    boolean named = (name != null) && (name.length() != 0);
    buffer.append(named ? " xmlns:" + name : " xmlns");
    buffer.append("=\"").append(nsUri).append('\"');
  }

  buffer.append(">\n");
}
/**
 * Finishes the current record and publishes it for the consuming thread.
 * Blocks until the previously published record, if any, has been taken.
 *
 * @throws RuntimeException wrapping an InterruptedException if the wait is
 *                          interrupted
 */
private void endChunk()
  throws SAXException
{
  buffer.append("</record>\n");

  synchronized (this)
  {
    // The hand-off slot holds one record; wait until it is free.
    try {
      while (parsedMarcXML != null)
        wait();
    }
    catch (InterruptedException e) {
      throw new RuntimeException(e);
    }

    // Publish the new record and let the consumer know.
    parsedMarcXML = buffer.toString();
    ++recordNum;
    notifyAll();
  }
} // endChunk()
/** SAX callback: discards every namespace prefix mapping recorded so far. */
public void startDocument()
  throws SAXException
{
  prefixUriToName.clear();
  prefixUris.clear();
  prefixNames.clear();
}
/** SAX callback: no action is taken at the end of a document. */
public void endDocument() throws SAXException
{
  // Nothing to do.
}
/**
 * SAX callback: a "record" element starts a new chunk; any other element
 * is serialized into the buffer as an opening tag with its attributes.
 *
 * Attribute values are XML-escaped before being written, so that a quote,
 * ampersand or angle bracket in the data (for instance a damaged indicator
 * or subfield code) cannot break the well-formedness of the record.
 *
 * @param uri       namespace URI of the element
 * @param localName local name of the element
 * @param qName     qualified name of the element
 * @param atts      the element's attributes
 */
public void startElement(String uri, String localName, String qName,
                         Attributes atts)
  throws SAXException
{
  if (localName.equals("record")) {
    beginChunk();
    return;
  }

  // First, the "<"
  buffer.append("<");

  // Then the element name.
  if (qName != null)
    buffer.append(qName);
  else if (uri != null) {
    String prefix = (String)prefixUriToName.get(uri);
    assert prefix != null : "invalid URI??";
    buffer.append(prefix);
    buffer.append(':');
    buffer.append(localName);
  }
  else
    buffer.append(localName);

  // Then each attribute.
  for (int i = 0; i < atts.getLength(); i++)
  {
    buffer.append(' ');
    if (atts.getQName(i) != null)
      buffer.append(atts.getQName(i));
    else if (atts.getURI(i) != null) {
      String prefix = (String)prefixUriToName.get(atts.getURI(i));
      assert prefix != null : "invalid URI??";
      buffer.append(prefix);
      buffer.append(':');
      buffer.append(atts.getLocalName(i));
    }
    else
      buffer.append(atts.getLocalName(i));

    buffer.append("=\"");
    appendEscapedAttrValue(atts.getValue(i));
    buffer.append("\"");
  }

  // Close the declaration.
  buffer.append(">");
}

/**
 * Appends an attribute value to the buffer in a form that is safe inside a
 * double-quoted XML attribute: markup characters become entity references,
 * tab/newline/return become character references (so they survive
 * attribute-value normalization), and characters that XML 1.0 does not
 * allow -- other control characters, unpaired surrogates, U+FFFE and
 * U+FFFF -- are dropped. A null value appends nothing.
 */
private void appendEscapedAttrValue(String value)
{
  if (value == null)
    return;

  final int len = value.length();
  for (int i = 0; i < len; i++)
  {
    char c = value.charAt(i);
    switch (c)
    {
      case '&':  buffer.append("&amp;");  break;
      case '<':  buffer.append("&lt;");   break;
      case '"':  buffer.append("&quot;"); break;
      case '\t': buffer.append("&#9;");   break;
      case '\n': buffer.append("&#10;");  break;
      case '\r': buffer.append("&#13;");  break;
      default:
        if (c < '\u0020' || c == '\uFFFE' || c == '\uFFFF') {
          // Not a legal XML character: drop it.
        }
        else if (c >= '\uD800' && c <= '\uDBFF') {
          // High surrogate: keep it only together with its low partner.
          if (i + 1 < len &&
              value.charAt(i + 1) >= '\uDC00' &&
              value.charAt(i + 1) <= '\uDFFF')
          {
            buffer.append(c);
            buffer.append(value.charAt(i + 1));
            i++;
          }
        }
        else if (c >= '\uDC00' && c <= '\uDFFF') {
          // Low surrogate with no preceding high surrogate: drop it.
        }
        else
          buffer.append(c);
    }
  }
}
/**
 * SAX callback: the end of a "record" element completes the current chunk;
 * any other element end is serialized into the buffer as a closing tag.
 *
 * @param uri       namespace URI of the element
 * @param localName local name of the element
 * @param qName     qualified name of the element
 */
public void endElement(String uri, String localName, String qName)
  throws SAXException
{
  // A finished record gets handed over rather than serialized.
  if (localName.equals("record")) {
    endChunk();
    return;
  }

  buffer.append("</");

  // Name preference: qualified name, then prefix:local via the URI map,
  // then the bare local name.
  if (qName == null && uri == null)
    buffer.append(localName);
  else if (qName == null) {
    String mappedPrefix = (String)prefixUriToName.get(uri);
    assert mappedPrefix != null : "invalid URI??";
    buffer.append(mappedPrefix).append(':').append(localName);
  }
  else
    buffer.append(qName);

  buffer.append(">");
}
/**
 * SAX callback: appends element text to the record buffer after cleaning
 * it up. In order: (1) text that looks like UTF-8 bytes is decoded;
 * (2) text containing non-ASCII characters is run through the Normalizer;
 * (3) '&amp;' and '&lt;' are replaced by entity references, and control
 * characters (other than tab, newline, return), surrogates, U+FFFE and
 * U+FFFF are removed.
 *
 * Whether step (3) is needed is decided from the text as it stood before
 * normalization.
 *
 * @param ch     array holding the characters
 * @param start  index of the first character to use
 * @param length number of characters to use
 */
public void characters(char[] ch, int start, int length)
  throws SAXException
{
  char[] text = ch;
  int from = start;
  int count = length;

  // Step 1: undo probable UTF-8 that arrived as one char per byte.
  String decoded = convertFromUTF8(text, from, count);
  if (decoded != null) {
    text = decoded.toCharArray();
    from = 0;
    count = text.length;
  }

  // One pass to find out which of the later steps are required.
  boolean sawNonAscii = false;
  int nSpecial = 0;
  for (int i = from, end = from + count; i < end; i++)
  {
    char c = text[i];
    if (c > 0x7f)
      sawNonAscii = true;
    if (c == '&' || c == '<' || isUnwantedXmlChar(c))
      ++nSpecial;
  }

  // Step 2: Unicode normalization, adopted only if it changed anything.
  if (sawNonAscii)
  {
    String before = new String(text, from, count);
    String after = Normalizer.normalize(before);
    if (!before.equals(after)) {
      text = after.toCharArray();
      from = 0;
      count = text.length;
    }
  }

  // Step 3: escape markup characters and drop the unwanted ones.
  if (nSpecial > 0)
  {
    final char[] ampEntity = { '&', 'a', 'm', 'p', ';' };
    final char[] ltEntity = { '&', 'l', 't', ';' };

    char[] out = new char[count + (nSpecial * 5)];
    int outLen = 0;
    for (int i = from, end = from + count; i < end; i++)
    {
      char c = text[i];
      if (c == '&') {
        for (int k = 0; k < ampEntity.length; k++)
          out[outLen++] = ampEntity[k];
      }
      else if (c == '<') {
        for (int k = 0; k < ltEntity.length; k++)
          out[outLen++] = ltEntity[k];
      }
      else if (!isUnwantedXmlChar(c))
        out[outLen++] = c;
    }
    text = out;
    from = 0;
    count = outLen;
  }

  buffer.append(text, from, count);
}

/**
 * Tells whether a character is one that characters() removes from text:
 * a control character other than tab, newline or carriage return, any
 * surrogate, or U+FFFE / U+FFFF.
 */
private boolean isUnwantedXmlChar(char c)
{
  if (c < '\u0020')
    return c != '\t' && c != '\n' && c != '\r';
  if (c >= '\uD800' && c <= '\uDFFF')
    return true;
  return c >= '\uFFFE';
}
/**
 * SAX callback: records a namespace prefix declaration, both in the
 * parallel name/URI lists and in the URI-to-name lookup map.
 *
 * @param prefix the prefix being declared
 * @param uri    the namespace URI it maps to
 */
public void startPrefixMapping(String prefix, String uri)
  throws SAXException
{
  prefixUriToName.put(uri, prefix);
  prefixUris.add(uri);
  prefixNames.add(prefix);
}
/**
 * SAX callback: no action is taken; recorded prefix mappings stay in place
 * until startDocument() clears them.
 */
public void endPrefixMapping(String prefix) throws SAXException
{
  // Nothing to do.
}
/** SAX callback: ignorable whitespace is not copied into the record. */
public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException
{
  // Nothing to do.
}
/** SAX callback: processing instructions are not copied into the record. */
public void processingInstruction(String target, String data) throws SAXException
{
  // Nothing to do.
}
/** SAX callback: the locator is not retained. */
public void setDocumentLocator(Locator locator)
{
  // Nothing to do.
}
/** SAX callback: skipped entities leave no trace in the record. */
public void skippedEntity(String name) throws SAXException
{
  // Nothing to do.
}
/**
 * Look for probable UTF-8 encoding, i.e. text in which every character
 * holds a single byte value and those values form UTF-8 multi-byte
 * sequences. If found, convert it to Unicode; if not, return null.
 *
 * Only the characters in [start, start+length) are examined. A lead byte
 * whose trail bytes would fall outside that range is not treated as the
 * start of a sequence, even if the array happens to hold suitable values
 * beyond the range.
 *
 * @param chars   Array of characters to convert
 * @param start   Where to start in the array
 * @param length  How many characters to examine
 * @return        New Unicode string, or null if no UTF-8 characters
 *                found.
 */
public String convertFromUTF8(char[] chars, int start, int length)
{
  // All look-ahead must stay inside the caller's range, not merely inside
  // the array: SAX buffers commonly hold unrelated data past the range.
  final int end = start + length;

  // Scan the string, looking for likely UTF8.
  boolean foundUTF = false;
  for (int i = start; i < end; i++)
  {
    char c = chars[i];

    // If somehow we already have 2-byte chars, this probably isn't
    // a UTF8 string.
    //
    if ((c & 0xFF00) != 0)
      return null;

    // Skip the ASCII chars
    if (c <= 0x7F)
      continue;

    // Look for a two-byte sequence
    if (c >= 0xC0 &&
        c <= 0xDF &&
        i + 1 < end &&
        chars[i + 1] >= 0x80 &&
        chars[i + 1] <= 0xBF)
    {
      foundUTF = true;
      i++;
    }

    // Look for a three-byte sequence
    else if (c >= 0xE0 &&
             c <= 0xEF &&
             i + 2 < end &&
             chars[i + 1] >= 0x80 &&
             chars[i + 1] <= 0xBF &&
             chars[i + 2] >= 0x80 &&
             chars[i + 2] <= 0xBF)
    {
      foundUTF = true;
      i += 2;
    }

    // Look for a four-byte sequence
    else if (c >= 0xF0 &&
             c <= 0xF7 &&
             i + 3 < end &&
             chars[i + 1] >= 0x80 &&
             chars[i + 1] <= 0xBF &&
             chars[i + 2] >= 0x80 &&
             chars[i + 2] <= 0xBF &&
             chars[i + 3] >= 0x80 &&
             chars[i + 3] <= 0xBF)
    {
      foundUTF = true;
      i += 3;
    }

    // Trailing bytes without leading bytes are illegal, and thus
    // likely this string isn't UTF8 encoded.
    //
    else if (c >= 0x80 && c <= 0xBF)
      return null;

    // Certain other bytes are also illegal.
    else if (c >= 0xF8 && c <= 0xFF)
      return null;

    // Anything else (a lead byte without valid trail bytes) is left alone
    // and does not by itself rule the string in or out.
  }

  // No UTF8 chars found? Nothing to do.
  if (!foundUTF)
    return null;

  // Okay, convert the UTF8 value to Unicode: turn each character back into
  // the byte it stands for, then decode those bytes as UTF-8.
  try {
    String value = new String(chars, start, length);
    byte[] bytes = value.getBytes("ISO-8859-1");
    return new String(bytes, "UTF-8");
  }
  catch (UnsupportedEncodingException e) {
    return null;
  }
} // convertFromUTF8()
} // class RecordHandler
} // class MARCIndexSource