package org.cdlib.xtf.textIndexer; /** * Copyright (c) 2004, Regents of the University of California * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the University of California nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ //import java.io.IOException; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.util.zip.GZIPInputStream; // import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; import org.cdlib.xtf.textEngine.IndexUtil; import org.cdlib.xtf.util.Path; import org.cdlib.xtf.util.Trace; //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// /** * This class parses TextIndexer configuration XML files. <br><br> * * The TextIndexer uses a configuration file that describes one or more index * names. Each index description identifies the source text and Lucene database * directories associated with the index, and the chunk size and overlap for * the index. <br><br> * * The format of the configuration file is as follows: * * <code><blockquote dir=ltr style="MARGIN-RIGHT: 0px"> * <b><?xml version="1.0" encoding="utf-8"?></b><br> * <b><textIndexer-config></b><br><br> * * <blockquote dir=ltr style="MARGIN-RIGHT: 0px"> * <b><index name="</b><font color=#0000ff><i>IndexName</i></font><b>"> </b> * <blockquote dir=ltr style="MARGIN-RIGHT: 0px"> * <b><db path="</b><font color=#0000ff><i>LuceneIndexPath</i></font><b>"/> </b> <br> * <b><src path="</b><font color=#0000ff><i>XMLSourcePath</i></font><b>"/> </b> <br> * <b><chunk size="</b><font color=#0000ff><i>ChunkSize</i></font><b>" * overlap="</b><font color=#0000ff><i>ChunkOverlap</i></font><b>"/> </b> <br> * <b><skip files= "</b><font color=#0000ff><i>*.xxx*, *.yyy, ... </i></font><b>"/> </b><br> * <b><inputfilter path="</b><font color=#0000ff><i>XSLPreFilterFile</i></font><b>"/></b><br><br> * </blockquote> * <b></index> </b><br><br> * * </blockquote> * * <b></textIndexer-config></b><br> * </blockquote> * </blockquote> * </code> * * The arguments should appear at most once for each index specified. If * multiple instances of the arguments are specified for an index, the * last one is used. <br><br> * * A simple example of a TextIndexer config file might look as follows: * <br><br> * * <code> * <blockquote dir=ltr style="MARGIN-RIGHT: 0px"><b> * <?xml version="1.0" encoding="utf-8"?> <br> * <textIndexer-config> * * <blockquote dir=ltr style="MARGIN-RIGHT: 0px"> * <index name="AllText"> * <blockquote dir=ltr style="MARGIN-RIGHT: 0px"> * <db path="./IndexDBs"/> <br> * <src path="./SourceText"/> <br> * <chunk size="100" overlap="50"/> <br> * <skip files="*.mets*, *AuthMech*"/> <br> * <inputfilter path="./BasicFilter.xsl"/> * </blockquote> * </index> <br><br> * * </blockquote> * * </textIndexer-config> * </blockquote></b> * </code> * * @.notes * * This class is derived from the SAX {@link org.xml.sax.helpers.DefaultHandler} class so that * its {@link XMLConfigParser#startElement(String,String,String,Attributes) startElement()} * and {@link XMLConfigParser#endElement(String,String,String) endElement()} * methods can be called internally from the Java {@link javax.xml.parsers.SAXParser} * class. <br><br> * * To use this class, simply instantiate a copy, and then call its * {@link XMLConfigParser#configure(IndexerConfig) configure()} method.<br><br> */ public class XMLConfigParser extends DefaultHandler { private boolean isConfigFile = false; private boolean indexNameFound = false; private boolean inNamedIndexBlock = false; private IndexerConfig configInfo; //////////////////////////////////////////////////////////////////////////// /** * This method parses a config file and stores the resulting parameters in * a config info structure. <br><br> * * To read indexing configuration info, create an instance of this class and * call this method with the path/name of the config file to read. <br><br> * * @param cfgInfo Upon entry, a config structure with the path/name of the * config file to read in the * {@link IndexerConfig#cfgFilePath cfgFilePath} field. <br><br> * * Upon return, the same config structure with * parameter values from the config file stored in their * respective fields. <br><br> * * @throws Exception Any internal exceptions generated while parsing the * configuration file. <br><br> * * @.notes * The format of the XML file is explained in greater detail in the description * for the {@link XMLConfigParser} class. <br><br> * */ public int configure(IndexerConfig cfgInfo) throws Exception //, ParserConfigurationException, SAXException, IOException { // Start out having not confirmed that this is a config file, or // that we've found the specified index name. // isConfigFile = false; indexNameFound = false; inNamedIndexBlock = false; try { // Make sure we can access the file. if (!new File(cfgInfo.cfgFilePath).canRead()) { Trace.error( "Error: unable to read textIndexer config file \"" + cfgInfo.cfgFilePath + "\""); return -1; } // Create a reference to the passed config info class that // all the methods can access. // configInfo = cfgInfo; // Instantiate a new SAX parser instance. SAXParser xmlParser = IndexUtil.createSAXParser(); // Call the XML parser to process the config file, using this object // as the tag handler. Make sure to convert file to a proper URI, // since on Windows weird stuff happens with "C:\blah" // xmlParser.parse(new File(cfgInfo.cfgFilePath).toURI().toString(), this); } // try catch (Throwable t) { // Log what happened. Trace.error( "*** Caught an XML Parser Exception: " + t.getClass() + "\n" + " With message: " + t.getMessage()); throw (t instanceof Exception) ? (Exception)t : new RuntimeException(t); } // If we failed to read the config file if (!(isConfigFile && indexNameFound)) return -1; // Make sure all the required items were specified. if (cfgInfo.indexInfo.indexPath == null || cfgInfo.indexInfo.indexPath.equals("")) { Trace.error("Error: Index configuration file failed to specify 'db' element"); return -1; } if (cfgInfo.indexInfo.sourcePath == null || cfgInfo.indexInfo.sourcePath.equals("")) { Trace.error( "Error: Index configuration file failed to specify 'sourcePath' element"); return -1; } if (cfgInfo.indexInfo.docSelectorPath == null || cfgInfo.indexInfo.docSelectorPath.equals("")) { Trace.error( "Error: Index configuration file failed to specify 'docSelectorPath' element"); return -1; } return 0; } // public configure() //////////////////////////////////////////////////////////////////////////// /** Methed called when the start tag is encountered in the config file. <br><br> * * This class is derived from the SAX {@link org.xml.sax.helpers.DefaultHandler} * class so that the parser can call this method each time a start tag is * encountered in the XML config file.<br><br> * * @param uri The current namespace URI in use. * * @param localName The local name (i.e., without prefix) of the current * element, or the empty string if namespace processing is * disabled. * * @param qName The qualified name (i.e., with prefix) for the current * element, or the empty string if qualified names are * disabled. * * @param atts The specified or defaulted arguments for the current * element. These consist of any <code>xxx = "yyy"</code> * style arguments for the element within the < and >. * <br><br> * * @throws SAXException Any internal exceptions generated due to * syntax problems in the element. <br><br> * * @.notes * For an explanation of the config file format, see the main description * for the {@link XMLConfigParser} class. <br><br> */ public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { // If we encountered a config ID tag, flag that we're actually in a // config file. // if (qName.equalsIgnoreCase("textIndexer-config")) { isConfigFile = true; return; } // If we're not in a config file, ignore any tags that might // happenstantially look like config tags. // if (!isConfigFile) return; // If we encountered an index configuration tag... if (qName.equalsIgnoreCase("index")) { // Get the index name for this configuration block. String xmlIdxName = atts.getValue("name").trim(); // If the name is missing, don't do any more work. if (xmlIdxName == null || xmlIdxName.length() == 0) return; // Get a more convenient reference to the index name that we're // looking for. // String idxName = configInfo.indexInfo.indexName; // If the block in the config file matches the specified name, // flag that we're in the right block, and should record any // config items that may follow. // if (xmlIdxName.equalsIgnoreCase(idxName)) { indexNameFound = true; inNamedIndexBlock = true; return; } // We're not in the right block, so flag and ignore it. inNamedIndexBlock = false; return; } // if( qName.equalsIgnoreCase("index") ) // For all other configuration tags, if we are not in the right // config block, ignore them. // if (!inNamedIndexBlock) return; // If the current tag is an index database Path... if (qName.equalsIgnoreCase("db")) { // Save it away as the database root directory. configInfo.indexInfo.indexPath = Path.normalizePath(atts.getValue("path")); // Validate the 'rotation' attribute if present String val = atts.getValue("rotate"); if (val != null) { if ("yes".equals(val) || "true".equals(val) || "1".equals(val)) configInfo.indexInfo.rotate = true; else if ("no".equals(val) || "false".equals(val) || "0".equals(val)) configInfo.indexInfo.rotate = false; else { Trace.error( "Unrecognized value for 'rotation' attribute of " + "config option: '" + qName + "'"); System.exit(1); } } return; } // If the current tag is an index source text Path... if (qName.equalsIgnoreCase("src")) { // Save it away as the root directory from which to get the // source XML text files. // configInfo.indexInfo.sourcePath = Path.normalizePath(atts.getValue("path")); // Validate the 'scan' attribute if present String val = atts.getValue("scan"); if (val != null) { if ("all".equals(val)) configInfo.indexInfo.scanAllDirs = true; else if ("pruned".equals(val)) configInfo.indexInfo.scanAllDirs = false; else { Trace.error( "Unrecognized value for 'scan' attribute of " + "config option: '" + qName + "'"); System.exit(1); } } // Validate the 'clone' attribute if present val = atts.getValue("clone"); if (val != null) { if ("yes".equals(val) || "1".equals(val) || "true".equals(val)) { configInfo.indexInfo.cloneData = true; String osName = System.getProperty("os.name"); if (osName.indexOf("Windows") >= 0) { Trace.warning( "Warning: data cloning probably will not work due to " + "limitations of Windows filesystem support."); } } else if ("no".equals(val) || "0".equals(val) || "false".equals(val)) configInfo.indexInfo.cloneData = false; else { Trace.error( "Unrecognized value for 'clone' attribute of " + "config option: '" + qName + "'"); System.exit(1); } } return; } // If the current tag is a docSelector stylesheet path... if (qName.equalsIgnoreCase("docSelector")) { // Save it away configInfo.indexInfo.docSelectorPath = Path.normalizePath( atts.getValue("path")); return; } // If the current tag is the chunk size info... if (qName.equalsIgnoreCase("chunk")) { // Get the size (in words) of the chunk to use. String value = atts.getValue("size"); // If the chunk size was not specified, or 'document' was // specified as the chunk size... // if (value == null || value.length() == 0) { // Set the chunk size to be the default, and the overlap too'. configInfo.indexInfo.setChunkSize(IndexInfo.defaultChunkSize); configInfo.indexInfo.setChunkOvlp(IndexInfo.defaultChunkOvlp); return; } // Otherwise, set the chunk size to be the larger of the default // chunk size and the specified chunk size. // configInfo.indexInfo.setChunkSize(Math.max(IndexInfo.minChunkSize, Integer.parseInt(value))); // Get the overlap (in words) of chunks for this index. value = atts.getValue("overlap"); // If the chunk overlap was not specified, set the overlap to // be half the selected chunk size. // if (value == null || value.length() == 0) { configInfo.indexInfo.setChunkOvlp( configInfo.indexInfo.chunkAtt[IndexInfo.chunkSize]); return; } // Otherwise set the chunk overlap the value specified. configInfo.indexInfo.setChunkOvlp(Integer.parseInt(value)); return; } // if( qName.equalsIgnoreCase( "chunk") == 0 ) // If the current tag tells us to do stop-word removal... if (qName.equalsIgnoreCase("stopwords")) { // Was the value specified in-line? String list = atts.getValue("list"); String path = atts.getValue("path"); if (list != null && list.length() > 0) configInfo.indexInfo.stopWords = atts.getValue("list"); // Was a path specified? else if (path != null && path.length() > 0) { path = Path.normalizeFileName(atts.getValue("path")); File file = new File(new File(configInfo.xtfHomePath), path); try { InputStream stream = new FileInputStream(file); if (path.endsWith(".gz")) stream = new GZIPInputStream(stream); Reader reader = new InputStreamReader(stream); char[] buf = new char[(int)file.length()]; int length = reader.read(buf); configInfo.indexInfo.stopWords = new String(buf, 0, length); } catch (IOException e) { Trace.error("Error reading stop-words file \"" + path + "\": " + e); System.exit(1); } } // If no value was specified, use the default list of stop words. else configInfo.indexInfo.stopWords = IndexInfo.defaultStopWords; return; } // If the current tag tells us to map plural words... if (qName.equalsIgnoreCase("pluralmap")) { // Save the path. configInfo.indexInfo.pluralMapPath = Path.normalizePath( atts.getValue("path")); return; } // If the current tag tells us to map accented chars... if (qName.equalsIgnoreCase("accentmap")) { // Save the path. configInfo.indexInfo.accentMapPath = Path.normalizePath( atts.getValue("path")); return; } // If the current tag tells us to create a spellcheck dictionary... if (qName.equalsIgnoreCase("spellcheck")) { // Validate the attribute. String val = atts.getValue("createDict"); if (val == null) val = atts.getValue("createdict"); if ("yes".equals(val) || "true".equals(val)) configInfo.indexInfo.createSpellcheckDict = true; else if ("no".equals(val) || "false".equals(val)) configInfo.indexInfo.createSpellcheckDict = false; else { Trace.error( "Unrecognized value for 'createDict' attribute of " + "config option: '" + qName + "'"); System.exit(1); } return; } // If the current tag tells us to strip whitespace... if (qName.equalsIgnoreCase("whitespace")) { // Validate the attribute. String val = atts.getValue("strip"); if ("yes".equals(val) || "true".equals(val)) configInfo.indexInfo.stripWhitespace = true; else if ("no".equals(val) || "false".equals(val)) configInfo.indexInfo.stripWhitespace = false; else { Trace.error( "Unrecognized value for 'strip' attribute of " + "config option: '" + qName + "'"); System.exit(1); } return; } // If the current tag points to validation specs... if (qName.equalsIgnoreCase("validation")) { // Save it away configInfo.indexInfo.validationPath = Path.normalizePath( atts.getValue("path")); return; } Trace.error("Unknown config option: '" + qName + "'"); System.exit(1); } // public startElement() //////////////////////////////////////////////////////////////////////////// /** Methed called when the end tag is encountered in the config file. <br><br> * * This class is derived from the SAX {@link org.xml.sax.helpers.DefaultHandler} * class so that the parser can call this method each time an end tag * is encountered in the XML config file.<br><br> * * @param uri The current namespace URI in use. * * @param localName The local name (i.e., without prefix) of the current * element, or the empty string if namespace processing is * disabled. * * @param qName The qualified name (i.e., with prefix) for the current * element, or the empty string if qualified names are * disabled. * * @throws SAXException If any internal exceptions generated due to * syntax problems in the element. <br><br> * * @.notes * For an explanation of the config file format, see the main description * for the {@link XMLConfigParser} class. <br><br> */ public void endElement(String uri, String localName, String qName) throws SAXException// called at element end { // If we got the "end of index" tag, flag that the specified index // name is no longer found. This will effectively stop the processing // of config file elements. // // (Note: If another start tag for the specified index name is found, // the indexNameFound tag will get set back to true, and reading // of config info will resume.) // if (qName.equalsIgnoreCase("index")) inNamedIndexBlock = false; } // public endElement() }