NinesStatementHandler.java example

/** 
 *  Copyright 2007 Applied Research in Patacriticism and the University of Virginia
 * 
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 **/
package org.nines;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.nines.RDFIndexerConfig.Mode;
import org.openrdf.model.Statement;
import org.openrdf.model.Value;
import org.openrdf.model.impl.BNodeImpl;
import org.openrdf.model.impl.LiteralImpl;
import org.openrdf.model.impl.URIImpl;
import org.openrdf.rio.RDFHandler;
import org.openrdf.rio.RDFHandlerException;

final class NinesStatementHandler implements RDFHandler {
    // Class-wide log4j logger.
    private final static Logger log = Logger.getLogger(NinesStatementHandler.class.getName());

    // Every document started so far, keyed by subject URI; each value maps a field name to its list of values.
    private HashMap<String, HashMap<String, ArrayList<String>>> documents;
    // ID of the blank node most recently seen as the object of a dc:date statement (set in handleDate).
    private String dateBNodeId;
    // Field map of the document currently being populated.
    private HashMap<String, ArrayList<String>> doc;
    // True once title_sort has been written for the current document; reset when a new document starts.
    private Boolean title_sort_added = false;
    // File named in error reports and collected links.
    // NOTE(review): never assigned in the visible part of this class - presumably set further down; confirm.
    private File file;
    // Indexer configuration (mode, archive name, corrected-text map, link collection flag, ...).
    private RDFIndexerConfig config;
    // Sink for indexing errors.
    private ErrorReport errorReport;
    // URI of the document currently being populated ("" before the first document).
    private String documentURI;
    // Length of the longest text value stored so far; -1 until a text value is stored.
    private long largestTextField = -1;
    // Receives URL-like field values when config.collectLinks is set.
    private LinkCollector linkCollector;
    // True when config.correctedTextMap has an entry for the current document URI.
    private boolean hasCorrectedText = false;

    // Placeholder year value; also treated by parseYears as "no parseable year".
    private static String uncertain = "Uncertain";

    /**
     * Creates a handler that has not yet collected any documents.
     *
     * @param errorReport   sink for indexing errors
     * @param linkCollector receives URL-like field values when link collection is enabled
     * @param config        indexer configuration
     */
    public NinesStatementHandler(ErrorReport errorReport, LinkCollector linkCollector, RDFIndexerConfig config) {
        // Configure commons-logging: SimpleLog backend, timestamps on, httpclient limited to "error".
        System.setProperty("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.SimpleLog");
        System.setProperty("org.apache.commons.logging.simplelog.showdatetime", "true");
        System.setProperty("org.apache.commons.logging.simplelog.log.org.apache.commons.httpclient", "error");

        // Collaborators supplied by the caller.
        this.config = config;
        this.errorReport = errorReport;
        this.linkCollector = linkCollector;

        // Start with an empty current document and an empty document table.
        this.documentURI = "";
        this.doc = new HashMap<String, ArrayList<String>>();
        this.documents = new HashMap<String, HashMap<String, ArrayList<String>>>();
    }

    /**
     * Receives one RDF statement and folds it into the document currently being built.
     * <p>
     * An rdf:type statement on a URI subject starts a new document. Predicates in the
     * nines namespace, and collex predicates that are not on the supported list, are
     * reported through addError and dropped. Everything else is offered to the field
     * handlers in a fixed order; the first one that returns true ends processing.
     *
     * @param statement the statement delivered by the RDF parser
     * @throws RDFHandlerException declared by the RDFHandler interface
     */
    public void handleStatement(Statement statement) throws RDFHandlerException {
        final String collexNamespace = "http://www.collex.org/schema#";

        String subject = statement.getSubject().stringValue();
        String predicate = statement.getPredicate().stringValue();
        String object = statement.getObject().stringValue();

        // A blank object is not worth indexing, with one exception: page-level RDF may
        // legitimately carry an empty collex:text, so that case is let through
        // (handleText stores it for pages archives).
        boolean blankObject = ( object == null || object.length() == 0 );
        if ( blankObject ) {
            boolean blankPageText = this.config.isPagesArchive()
                && "http://www.collex.org/schema#text".equals(predicate);
            if ( !blankPageText ) {
                return;
            }
        }

        // An rdf:type on a URI subject marks the beginning of a new document.
        boolean startsDocument = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type".equals(predicate)
            && statement.getSubject() instanceof URIImpl;
        if (startsDocument) {
            if (documents.get(subject) != null) {
                errorReport.addError(new IndexerError(this.file.toString(), subject, "Duplicate URI"));
                log.info("*** Duplicate: " + subject);
            }
            doc = new HashMap<String, ArrayList<String>>();
            addField(doc, "uri", subject);
            documents.put(subject, doc);
            title_sort_added = false;
            documentURI = subject;
            this.hasCorrectedText = ( this.config.correctedTextMap.containsKey(this.documentURI));
            log.info("Parsing RDF for document: " + subject);
            errorReport.flush();
        }

        // nines:* attributes are no longer supported at all.
        if (predicate.startsWith("http://www.nines.org/schema#")) {
            addError( "NINES is no longer a valid attribute: "+ predicate);
            return;
        }

        // collex:* attributes must be on the supported list.
        // NOTE(review): "person" and "geospacial" are not listed, so handlePerson and
        // handleGeospacial never see their predicates - confirm this is intended.
        if (predicate.startsWith(collexNamespace)) {
            String attribute = predicate.substring(collexNamespace.length());
            String[] supported = {
                "archive", "freeculture", "source_xml", "source_html", "source_sgml", "federation",
                "ocr", "genre", "thumbnail", "text", "fulltext", "image",
                "pages", "pagenum", "pageof", "discipline", "typewright"
            };
            boolean known = false;
            for (String candidate : supported) {
                if (attribute.equals(candidate)) {
                    known = true;
                    break;
                }
            }
            if (!known) {
                addError("Collex does not support this property: " + predicate );
                return;
            }
        }

        // Offer the statement to each field handler in turn; || short-circuits on the
        // first handler that reports the statement as handled.
        if (handleFederation(predicate, object)
            || handleOcr(predicate, object)
            || handlePages(predicate, object)
            || handlePageNum(predicate, object)
            || handlePageOf(predicate, object)
            || handleTypewright(predicate, object)
            || handleFullText(predicate, object)
            || handleCollexSourceXml(predicate, object)
            || handleCollexSourceHtml(predicate, object)
            || handleCollexSourceSgml(predicate, object)
            || handleArchive(predicate, object)
            || handleFreeCulture(predicate, object)
            || handleTitle(predicate, object)
            || handleAlternative(predicate, object)
            || handleGenre(predicate, object)
            || handleDate(subject, predicate, statement.getObject())
            || handleDateLabel(subject, predicate, object)
            || handleSource(predicate, object)
            || handleThumbnail(predicate, object)
            || handleImage(predicate, object)
            || handleURL(predicate, object)
            || handleText(predicate, object)
            || handleRole(predicate, object)
            || handlePerson(predicate, object)
            || handleFormat(predicate, object)
            || handleLanguage(predicate, object)
            || handleGeospacial(predicate, object)
            || handleProvenance(predicate, object)
            || handleDiscipline(predicate, object)
            || handleSubject(predicate, object)
            || handleType(predicate, object)
            || handleHasPart(predicate, object)
            || handleIsPartOf(predicate, object)) {
            return;
        }
    }

    /**
     * Handles collex:federation. A recognised federation name is stored in the
     * {@code federation} field; any other value is reported as an error.
     *
     * @return true when the predicate is collex:federation (even if the value was rejected)
     */
    private boolean handleFederation(String predicate, String object) {
        if (!"http://www.collex.org/schema#federation".equals(predicate)) {
            return false;
        }

        // Exact, case-sensitive match against the known federation names.
        String[] knownFederations = { "NINES", "18thConnect", "MESA", "ModNets", "SiRO", "estc", "GLA" };
        boolean recognised = false;
        for (String federation : knownFederations) {
            if (object.equals(federation)) {
                recognised = true;
                break;
            }
        }

        if (recognised) {
            addField(doc, "federation", object);
        } else {
            addError("Unknown federation: " + object);
        }
        return true;
    }

    /**
     * Handles collex:ocr. Only a (case-insensitive) "true" is recorded, as
     * {@code is_ocr = "T"}; an absent field implies false.
     *
     * @return true only when the predicate is collex:ocr AND the value is "true";
     *         any other value leaves the statement reported as unhandled
     */
    private boolean handleOcr(String predicate, String object) {
        boolean ocrIsTrue = "http://www.collex.org/schema#ocr".equals(predicate)
            && "true".equalsIgnoreCase(object);
        if (ocrIsTrue) {
            addField(doc, "is_ocr", "T");
        }
        return ocrIsTrue;
    }

    /**
     * Handles collex:typewright. Only a (case-insensitive) "true" is recorded, as
     * {@code typewright = "T"}; an absent field implies false.
     *
     * @return true only when the predicate is collex:typewright AND the value is "true";
     *         any other value leaves the statement reported as unhandled
     */
    private boolean handleTypewright(String predicate, String object) {
        boolean typewrightIsTrue = "http://www.collex.org/schema#typewright".equals(predicate)
            && "true".equalsIgnoreCase(object);
        if (typewrightIsTrue) {
            addField(doc, "typewright", "T");
        }
        return typewrightIsTrue;
    }

    /**
     * Stores the value of a collex:person statement in the {@code person} field.
     *
     * @return true when the predicate is collex:person
     */
    private boolean handlePerson(String predicate, String object) {
        if (!"http://www.collex.org/schema#person".equals(predicate)) {
            return false;
        }
        addField(doc, "person", object);
        return true;
    }

    /**
     * Stores the value of a dc:format statement in the {@code format} field.
     *
     * @return true when the predicate is dc:format
     */
    private boolean handleFormat(String predicate, String object) {
        if (!"http://purl.org/dc/elements/1.1/format".equals(predicate)) {
            return false;
        }
        addField(doc, "format", object);
        return true;
    }

    /**
     * Stores the value of a dc:language statement in the {@code language} field.
     *
     * @return true when the predicate is dc:language
     */
    private boolean handleLanguage(String predicate, String object) {
        if (!"http://purl.org/dc/elements/1.1/language".equals(predicate)) {
            return false;
        }
        addField(doc, "language", object);
        return true;
    }

    /**
     * Stores the value of a collex:geospacial statement in the {@code geospacial} field.
     *
     * @return true when the predicate is collex:geospacial
     */
    private boolean handleGeospacial(String predicate, String object) {
        if (!"http://www.collex.org/schema#geospacial".equals(predicate)) {
            return false;
        }
        addField(doc, "geospacial", object);
        return true;
    }

    /**
     * Stores the value of a collex:source_xml statement in the {@code source_xml} field.
     *
     * @return true when the predicate is collex:source_xml
     */
    private boolean handleCollexSourceXml(String predicate, String object) {
        if (!"http://www.collex.org/schema#source_xml".equals(predicate)) {
            return false;
        }
        addField(doc, "source_xml", object);
        return true;
    }

    /**
     * Stores the value of a collex:source_html statement in the {@code source_html} field.
     *
     * @return true when the predicate is collex:source_html
     */
    private boolean handleCollexSourceHtml(String predicate, String object) {
        if (!"http://www.collex.org/schema#source_html".equals(predicate)) {
            return false;
        }
        addField(doc, "source_html", object);
        return true;
    }

    /**
     * Stores the value of a collex:source_sgml statement in the {@code source_sgml} field.
     *
     * @return true when the predicate is collex:source_sgml
     */
    private boolean handleCollexSourceSgml(String predicate, String object) {
        if (!"http://www.collex.org/schema#source_sgml".equals(predicate)) {
            return false;
        }
        addField(doc, "source_sgml", object);
        return true;
    }

    /**
     * Stores the value of a collex:archive statement in the {@code archive} field.
     *
     * @return true when the predicate is collex:archive
     */
    private boolean handleArchive(String predicate, String object) {
        if (!"http://www.collex.org/schema#archive".equals(predicate)) {
            return false;
        }
        addField(doc, "archive", object);
        return true;
    }

    /**
     * Handles collex:freeculture. A (case-insensitive) "true"/"false" replaces the
     * {@code freeculture} field with "T"/"F"; any other value is ignored.
     *
     * @return true when the predicate is collex:freeculture
     */
    private boolean handleFreeCulture(String predicate, String object) {
        if (!"http://www.collex.org/schema#freeculture".equals(predicate)) {
            return false;
        }

        String flag = null;
        if ("false".equalsIgnoreCase(object)) {
            flag = "F"; // "F"alse
        } else if ("true".equalsIgnoreCase(object)) {
            flag = "T"; // "T"rue
        }

        if (flag != null) {
            // replace = true: a later statement overwrites an earlier one
            addFieldEntry(doc, "freeculture", flag, true);
        }
        return true;
    }
    
    /**
     * Handles collex:pages. A (case-insensitive) "true"/"false" replaces the
     * {@code has_pages} field with "T"/"F"; any other value is ignored.
     *
     * @return true when the predicate is collex:pages
     */
    private boolean handlePages(String predicate, String object) {
        if (!"http://www.collex.org/schema#pages".equals(predicate)) {
            return false;
        }

        String flag = null;
        if ("false".equalsIgnoreCase(object)) {
            flag = "F"; // "F"alse
        } else if ("true".equalsIgnoreCase(object)) {
            flag = "T"; // "T"rue
        }

        if (flag != null) {
            // replace = true: a later statement overwrites an earlier one
            addFieldEntry(doc, "has_pages", flag, true);
        }
        return true;
    }
    
    /**
     * Stores the value of a collex:pageof statement in the {@code page_of} field.
     *
     * @return true when the predicate is collex:pageof
     */
    private boolean handlePageOf(String predicate, String object) {
        if (!"http://www.collex.org/schema#pageof".equals(predicate)) {
            return false;
        }
        addField(doc, "page_of", object);
        return true;
    }
    
    /**
     * Stores the value of a collex:pagenum statement in the {@code page_num} field.
     *
     * @return true when the predicate is collex:pagenum
     */
    private boolean handlePageNum(String predicate, String object) {
        if (!"http://www.collex.org/schema#pagenum".equals(predicate)) {
            return false;
        }
        addField(doc, "page_num", object);
        return true;
    }

    /**
     * Handles collex:fulltext. When corrected text exists for the current document the
     * {@code has_full_text} field is always "T"; otherwise only an explicit
     * (case-insensitive) "false" is recorded, as "F". No field set implies true.
     *
     * @return true when the predicate is collex:fulltext
     */
    private boolean handleFullText(String predicate, String object) {
        if (!"http://www.collex.org/schema#fulltext".equals(predicate)) {
            return false;
        }

        if ( this.hasCorrectedText ) {
            addField(doc, "has_full_text", "T");
        } else if ("false".equalsIgnoreCase(object)) {
            addField(doc, "has_full_text", "F"); // "F"alse
        }
        return true;
    }

    /**
     * Handles dc:title. Every title is appended to the {@code title} field; only the
     * first title seen for the current document is also stored as {@code title_sort}.
     *
     * @return true when the predicate is dc:title
     */
    private boolean handleTitle(String predicate, String object) {
        if (!"http://purl.org/dc/elements/1.1/title".equals(predicate)) {
            return false;
        }

        addField(doc, "title", object);

        boolean sortKeyMissing = !title_sort_added;
        if (sortKeyMissing) {
            addField(doc, "title_sort", object);
            title_sort_added = true;
        }
        return true;
    }

    /**
     * Stores the value of a dc:provenance statement in the {@code provenance} field.
     *
     * @return true when the predicate is dc:provenance
     */
    private boolean handleProvenance(String predicate, String object) {
        if (!"http://purl.org/dc/elements/1.1/provenance".equals(predicate)) {
            return false;
        }
        addField(doc, "provenance", object);
        return true;
    }

    /**
     * Stores the value of a dc:type statement in the {@code doc_type} field.
     *
     * @return true when the predicate is dc:type
     */
    private boolean handleType(String predicate, String object) {
        if (!"http://purl.org/dc/elements/1.1/type".equals(predicate)) {
            return false;
        }
        addField(doc, "doc_type", object);
        return true;
    }

    /**
     * Stores the value of a collex:discipline statement in the {@code discipline} field.
     *
     * @return true when the predicate is collex:discipline
     */
    private boolean handleDiscipline(String predicate, String object) {
        if (!"http://www.collex.org/schema#discipline".equals(predicate)) {
            return false;
        }
        addField(doc, "discipline", object);
        return true;
    }

    /**
     * Stores the value of a dc:subject statement in the {@code subject} field.
     *
     * @return true when the predicate is dc:subject
     */
    private boolean handleSubject(String predicate, String object) {
        if (!"http://purl.org/dc/elements/1.1/subject".equals(predicate)) {
            return false;
        }
        addField(doc, "subject", object);
        return true;
    }

    /**
     * Stores the value of a dcterms:alternative statement in the {@code alternative} field.
     *
     * @return true when the predicate is dcterms:alternative
     */
    private boolean handleAlternative(String predicate, String object) {
        if (!"http://purl.org/dc/terms/alternative".equals(predicate)) {
            return false;
        }
        addField(doc, "alternative", object);
        return true;
    }

    /**
     * Handles collex:genre. The deprecated genres "Primary" and "Secondary" are
     * silently dropped for backward compatibility; every other value is stored in
     * the {@code genre} field.
     *
     * @return true when the predicate is collex:genre
     */
    private boolean handleGenre(String predicate, String object) {
        if (!"http://www.collex.org/schema#genre".equals(predicate)) {
            return false;
        }

        boolean deprecated = "Primary".equals(object) || "Secondary".equals(object);
        if (!deprecated) {
            addField(doc, "genre", object);
        }
        return true;
    }

    /**
     * Handles dc:date.
     * <p>
     * A literal value is stored (trimmed) as {@code date_label}, parsed into years, and
     * the years plus their derived decade/quarter/half/century fields are added. A blank
     * node value is only remembered by ID so that handleDateLabel can pick up its
     * rdfs:label / rdf:value statements. Any other kind of value is reported as an
     * invalid date instead of failing with a ClassCastException.
     *
     * @param subject   statement subject (unused here)
     * @param predicate statement predicate
     * @param value     statement object, with its RDF node type intact
     * @return false when the predicate is not dc:date, and also when a literal date
     *         yields no years (an error is reported in that case); true otherwise
     */
    private boolean handleDate(String subject, String predicate, Value value) {
        if (!"http://purl.org/dc/elements/1.1/date".equals(predicate)) {
            return false;
        }

        String object = value.stringValue().trim();

        if (value instanceof LiteralImpl) {
            // add label
            addField(doc, "date_label", object);

            ArrayList<String> years = parseYears(object);
            if (years.isEmpty()) {
                addError("Invalid date format: " + object);
                return false;
            }

            // add the years
            for (String year : years) {
                addFieldIfUnique(doc, "year", year);
            }

            // and any fields that are derived from the years
            addDerivedDateFields( years );
        } else if (value instanceof BNodeImpl) {
            // structured date: the details arrive as statements about this blank node
            dateBNodeId = ((BNodeImpl) value).getID();
        } else {
            // neither a literal nor a blank node (e.g. a URI): report it rather than
            // letting a blind cast abort the whole parse
            addError("Invalid date format: " + object);
        }

        return true;
    }

    /**
     * Handles statements whose subject is the blank node recorded by handleDate.
     * An rdfs:label becomes a {@code date_label}; an rdf:value is parsed into years and
     * the years plus their derived fields are added.
     *
     * @return true when a label or a parseable value was stored; false when the subject
     *         is not the recorded date node, the predicate is neither label nor value,
     *         or the value yields no years (an error is reported in that last case)
     */
    private boolean handleDateLabel(String subject, String predicate, String object) {
        // Only statements hanging off the remembered date blank node are of interest.
        if (!subject.equals(dateBNodeId)) {
            return false;
        }

        if ("http://www.w3.org/2000/01/rdf-schema#label".equals(predicate)) {
            addField(doc, "date_label", object);
            return true;
        }

        if (!"http://www.w3.org/1999/02/22-rdf-syntax-ns#value".equals(predicate)) {
            return false;
        }

        ArrayList<String> parsedYears = parseYears(object);
        if (parsedYears.isEmpty()) {
            addError("Invalid date format: " + object);
            return false;
        }

        // add the years
        for (String parsedYear : parsedYears) {
            addFieldIfUnique(doc, "year", parsedYear);
        }

        // and any fields that are derived from the years
        addDerivedDateFields( parsedYears );
        return true;
    }

    /**
     * Stores the value of a dc:source statement in the {@code source} field.
     *
     * @return true when the predicate is dc:source
     */
    private boolean handleSource(String predicate, String object) {
        if (!"http://purl.org/dc/elements/1.1/source".equals(predicate)) {
            return false;
        }
        addField(doc, "source", object);
        return true;
    }

    /**
     * Stores the value of a collex:thumbnail statement in the {@code thumbnail} field.
     *
     * @return true when the predicate is collex:thumbnail
     */
    private boolean handleThumbnail(String predicate, String object) {
        if (!"http://www.collex.org/schema#thumbnail".equals(predicate)) {
            return false;
        }
        addField(doc, "thumbnail", object);
        return true;
    }

    /**
     * Stores the value of a collex:image statement in the {@code image} field.
     *
     * @return true when the predicate is collex:image
     */
    private boolean handleImage(String predicate, String object) {
        if (!"http://www.collex.org/schema#image".equals(predicate)) {
            return false;
        }
        addField(doc, "image", object);
        return true;
    }

    /**
     * Stores the value of an rdfs:seeAlso statement in the {@code url} field.
     *
     * @return true when the predicate is rdfs:seeAlso
     */
    private boolean handleURL(String predicate, String object) {
        if (!"http://www.w3.org/2000/01/rdf-schema#seeAlso".equals(predicate)) {
            return false;
        }
        addField(doc, "url", object);
        return true;
    }

    /**
     * Stores the value of a dcterms:hasPart statement in the {@code hasPart} field.
     *
     * @return true when the predicate is dcterms:hasPart
     */
    private boolean handleHasPart(String predicate, String object) {
        if (!"http://purl.org/dc/terms/hasPart".equals(predicate)) {
            return false;
        }
        addField(doc, "hasPart", object);
        return true;
    }

    /**
     * Stores the value of a dcterms:isPartOf statement in the {@code isPartOf} field.
     *
     * @return true when the predicate is dcterms:isPartOf
     */
    private boolean handleIsPartOf(String predicate, String object) {
        if (!"http://purl.org/dc/terms/isPartOf".equals(predicate)) {
            return false;
        }
        addField(doc, "isPartOf", object);
        return true;
    }

    /**
     * Handles collex:text.
     * <p>
     * The text comes from one of three places:
     * <ul>
     * <li>the corrected-text file, when one is registered for the current document;</li>
     * <li>the full-text file, when the object is a single http:// URL (the URL itself is
     *     also stored as {@code text_url});</li>
     * <li>otherwise the object itself.</li>
     * </ul>
     * Files are only read in INDEX mode; in other modes file-backed text is left empty.
     * The resulting text is stored in the {@code text} field when it is non-empty, or
     * unconditionally for pages archives (where blank page text is valid).
     *
     * @param predicate statement predicate
     * @param object    statement object; may be null or empty for page-level text
     * @return true when the predicate is collex:text
     */
    private boolean handleText(String predicate, String object) {

        if (!"http://www.collex.org/schema#text".equals(predicate)) {
            return false;
        }

        // handleStatement deliberately lets a null/blank object through for page-level
        // text, so a null must not blow up here.
        String text = (object == null) ? "" : object;
        boolean externalText = false;

        if ( this.hasCorrectedText ) {
            // only in index mode do we attempt to grab
            // corrected text from the corrected text folder
            if (config.mode == Mode.INDEX) {
                externalText = true;
                text = getCorrectedText();
            } else {
                text = "";
            }
        } else {
            // Objects with external content have a single http url as the content.
            String trimmed = text.trim();
            if (trimmed.startsWith("http://") && trimmed.indexOf(" ") == -1) {
                addFieldEntry(doc, "text_url", text, false);

                // only in index mode do we attempt to grab
                // full text from the full text folder
                if (config.mode == Mode.INDEX) {
                    externalText = true;
                    // look the file up by the trimmed URL: the URL test above ignored
                    // surrounding whitespace, so the file name must ignore it as well
                    text = getFullText( trimmed );
                } else {
                    text = "";
                }
            }
        }

        if ( text.length() > 0 || this.config.isPagesArchive() ) {
            this.largestTextField = Math.max(this.largestTextField, text.length());
            // The last argument is the "clean" flag: inline text (externalText == false)
            // is cleaned on the way in, text read from a file goes in untouched.
            addFieldEntry(doc, "text", text, false, !externalText);
        }

        return true;
    }
    
    /**
     * Find the full path to the full text root based on
     * the path to the original rdf sources
     * @return
     */
    private String findFullTextRoot() {
        String path = this.config.sourceDir.toString();

        // Locate the "rdf" path segment; everything before it is the shared base directory.
        int pos = path.indexOf("/rdf/");
        if (pos == -1 && path.endsWith("/rdf")) {
            // source dir is the rdf directory itself, with no trailing separator
            pos = path.length() - "/rdf".length();
        }

        // No rdf segment at all: fall back to the source dir itself instead of failing
        // with StringIndexOutOfBoundsException on substring(0, -1). getFullText reports
        // a missing directory if the resulting path does not exist.
        String base = (pos == -1) ? path : path.substring(0, pos);

        // <base>/fulltext/<safe archive name>/
        return base + "/fulltext/" + RDFIndexerConfig.safeArchive(this.config.archiveName) + "/";
    }
    
    /**
     * Get the corrected text for the current document
     * @return
     */
    private String getCorrectedText() {
        // The corrected-text map gives the file name for the current document URI.
        String correctedName = this.config.correctedTextMap.get(this.documentURI);
        File correctedFile = new File( this.config.correctedTextDir, correctedName);
        if (!correctedFile.exists()) {
            this.errorReport.addError(new IndexerError("", this.documentURI, "Missing corrected text file " + correctedFile.toString()));
            return "";
        }

        // Read the whole file as UTF-8; any read failure is reported and yields "".
        String contents = "";
        FileInputStream stream = null;
        try {
            stream = new FileInputStream(correctedFile);
            contents = IOUtils.toString( stream, "UTF-8");
        } catch (IOException e) {
            errorReport.addError(new IndexerError(correctedFile.toString(), this.documentURI, "Unable to read corrected text" + ": "
                + e.toString()));
        } finally {
            IOUtils.closeQuietly(stream);
        }
        return contents;
    }
    
    /**
     * Read the full text for <code>uri</code> from the fulltext area of the solr sources.
     * If any errors are encountered, log them and return an empty string
     * 
     * @param uri
     * @return A string containing the full text - or an empty string if errors occur.
     */
    private String getFullText(String uri) {

        // The archive's full-text directory must exist.
        String fullTextRoot = findFullTextRoot() ;
        File rootDir = new File( fullTextRoot );
        if (!rootDir.exists()) {
            this.errorReport
                .addError(new IndexerError("", uri, "Missing full text source directory " + rootDir.toString()));
            return "";
        }

        // Turn the URL into a file name by spelling out the characters / : ? = &
        String fileName = uri
            .replace("/", "SL")
            .replace(":", "CL")
            .replace("?", "QU")
            .replace("=", "EQ")
            .replace("&", "AMP");
        File textFile = new File(fullTextRoot + fileName + ".txt");
        if (!textFile.exists()) {
            this.errorReport.addError(new IndexerError("", uri, "Missing full text file " + textFile.toString()));
            return "";
        }

        // Read the whole file as UTF-8; any read failure is reported and yields "".
        String contents = "";
        FileInputStream stream = null;
        try {
            stream = new FileInputStream(textFile);
            contents = IOUtils.toString( stream, "UTF-8");
        } catch (IOException e) {
            errorReport.addError(new IndexerError(textFile.toString(), uri, "Unable to read full text" + ": "
                + e.toString()));
        } finally {
            IOUtils.closeQuietly(stream);
        }
        return contents;
    }

    /**
     * Handles any predicate in the LOC relators namespace. The part of the predicate
     * after the namespace is the role code: the object is stored in
     * {@code role_<code>} and the field name itself is recorded in {@code role}.
     *
     * @return true when the predicate is in the relators namespace
     */
    private boolean handleRole(String predicate, String object) {
        final String relators = "http://www.loc.gov/loc.terms/relators/";
        if (!predicate.startsWith(relators)) {
            return false;
        }

        String roleField = "role_" + predicate.substring(relators.length());
        addField(doc, roleField, object);
        addField(doc, "role", roleField);
        return true;
    }

    /**
     * Extracts year strings from a date value.
     * <p>
     * The value is split on whitespace (after closing up whitespace around commas).
     * A token containing ',' is handed to parseYearRange, a token containing 'u' to
     * parseYearWild, and any other token contributes its first four characters.
     * The whole value is rejected - an empty list is returned - when it is "unknown"
     * or the uncertain marker, or when any token starts with a letter, contains both
     * ',' and 'u', or is a plain token shorter than four characters.
     *
     * @param value the raw date text; must not be null
     * @return the years found, or an empty list when the value is rejected
     */
    public static ArrayList<String> parseYears(String value) {
        ArrayList<String> years = new ArrayList<String>();

        String trimmed = value.trim();
        if ("unknown".equalsIgnoreCase(trimmed) || uncertain.equalsIgnoreCase(trimmed)) {
            return years;
        }

        // deal with embedded whitespace in ranges
        String normalized = value.replace( ", ", "," ).replace( " ,", "," );

        StringTokenizer tokens = new StringTokenizer(normalized);
        while (tokens.hasMoreTokens()) {
            String token = tokens.nextToken();
            boolean isRange = token.indexOf(',') != -1;
            boolean isWild = token.indexOf('u') != -1;

            // Reject the whole value on: a leading letter (e.g "Aug"), a range that
            // also contains wildcards, or a plain token of fewer than 4 characters.
            boolean rejected = Character.isLetter( token.charAt( 0 ) )
                || ( isRange && isWild )
                || ( !isRange && !isWild && token.length() < 4 );
            if (rejected) {
                years.clear( );
                return years;
            }

            if (isRange) {
                parseYearRange( years, token );
            } else if (isWild) {
                parseYearWild( years, token );
            } else {
                years.add( token.substring( 0, 4 ) );
            }
        }
        return years;
    }

    /**
     * Adds the decade, quarter_century, half_century and century fields for every
     * year that consists of exactly four digits; other entries are skipped.
     *
     * @param years year strings, as produced by parseYears
     */
    private void addDerivedDateFields( final ArrayList<String> years ) {

        Pattern fourDigits = Pattern.compile( "\\d{4}" );
        for( String year : years ) {
            // only process years that are in the correct format...
            if( !fourDigits.matcher( year ).matches( ) ) {
                continue;
            }
            addFieldIfUnique( doc, "decade", makeDecade( year ) );
            addFieldIfUnique( doc, "quarter_century", makeQuarterCentury( year ) );
            addFieldIfUnique( doc, "half_century", makeHalfCentury( year ) );
            addFieldIfUnique( doc, "century", makeCentury( year ) );
        }
    }

    /**
     * Rounds a year down to its decade, e.g. "1857" gives "1850".
     *
     * @param year a year string of at least three characters
     * @return the first three characters of {@code year} followed by "0"
     */
    public static String makeDecade( final String year ) {
        String prefix = year.substring( 0, 3 );
        return new StringBuilder( prefix ).append( '0' ).toString( );
    }

    /**
     * Rounds a year down to its quarter century, e.g. "1857" gives "1850" and
     * "1824" gives "1800".
     *
     * @param year a year string of at least four characters whose third and fourth
     *             characters parse as an integer
     * @return the first two characters of {@code year} followed by "00", "25", "50" or "75"
     */
    public static String makeQuarterCentury( final String year ) {
        int yearInCentury = Integer.parseInt( year.substring( 2, 4 ) );

        // pick the highest quarter boundary that does not exceed the year
        int[] boundaries = { 75, 50, 25 };
        String quarter = "00";
        for( int boundary : boundaries ) {
            if( yearInCentury >= boundary ) {
                quarter = String.valueOf( boundary );
                break;
            }
        }
        return year.substring( 0, 2 ) + quarter;
    }

    /**
     * Rounds a year down to its half century, e.g. "1857" gives "1850" and
     * "1849" gives "1800".
     *
     * @param year a year string of at least four characters whose third and fourth
     *             characters parse as an integer
     * @return the first two characters of {@code year} followed by "00" or "50"
     */
    public static String makeHalfCentury( final String year ) {
        int yearInCentury = Integer.parseInt( year.substring( 2, 4 ) );
        String half;
        if( yearInCentury >= 50 ) {
            half = "50";
        } else {
            half = "00";
        }
        return year.substring( 0, 2 ) + half;
    }

    /**
     * Rounds a year down to its century, e.g. "1857" gives "1800".
     *
     * @param year a year string of at least two characters
     * @return the first two characters of {@code year} followed by "00"
     */
    public static String makeCentury( final String year ) {
        String centuryDigits = year.substring( 0, 2 );
        return centuryDigits.concat( "00" );
    }

    /**
     * Appends a cleaned value to a field. When link collection is enabled, a value
     * that looks like a single http:// URL is also handed to the link collector,
     * except for the {@code uri} field.
     *
     * @param map   field map of the target document
     * @param name  field name; the call is a no-op when null
     * @param value field value; the call is a no-op when null
     */
    public void addField(HashMap<String, ArrayList<String>> map, String name, String value) {

        // skip null fields
        if (name == null || value == null) {
            return;
        }

        // if the field is a url, hand it to the link collector
        if (config.collectLinks) {
            String candidate = value.trim();
            boolean looksLikeUrl = candidate.startsWith("http://") && candidate.indexOf(" ") == -1;
            if (looksLikeUrl && !"uri".equals(name)) {
                linkCollector.addLink(documentURI, this.file.toString(), value);
            }
        }

        addFieldEntry(map, name, value, false);
    }

    /**
     * Appends a cleaned value to a field unless the field already holds that exact
     * (uncleaned) value.
     *
     * @param map   field map of the target document
     * @param name  field name; the call is a no-op when null
     * @param value field value; the call is a no-op when null
     */
    public void addFieldIfUnique(HashMap<String, ArrayList<String>> map, String name, String value) {

        // skip null fields
        if (name == null || value == null) {
            return;
        }

        ArrayList<String> existing = map.get( name );
        boolean alreadyPresent = ( existing != null ) && existing.contains( value );
        if( !alreadyPresent ) {
            addFieldEntry(map, name, value, false);
        }
    }

    /**
     * Add a CLEANED field entry. The entry will be normalized, with escape sequences
     * and invalid utf-8 chars stripped
     * @param map
     * @param name
     * @param value
     * @param replace
     */
    private void addFieldEntry(HashMap<String, ArrayList<String>> map, String name, String value, Boolean replace) {
        // Delegate to the full overload with cleaning switched on.
        final boolean clean = true;
        addFieldEntry(map, name, value, replace, clean);
    }

    /**
     * Add a new field entry and optionally clean the data
     * @param map
     * @param name
     * @param value
     * @param replace
     * @param clean
     */
    private void addFieldEntry(HashMap<String, ArrayList<String>> map, String name, String value, boolean replace, boolean clean) {

        // Optionally clean the value: strip escape sequences, normalize whitespace,
        // then strip unknown UTF-8 - in that order.
        String data = value;
        if ( clean ) {
            data = TextUtils.stripEscapeSequences(data, this.errorReport, this.file, this.documentURI);
            data = TextUtils.normalizeWhitespace(data);
            data = TextUtils.stripUnknownUTF8(data, this.errorReport, this.file, this.documentURI);
        }

        // Start a fresh value list when replacing or when the field is new;
        // otherwise append to the list that is already there.
        ArrayList<String> values;
        if (replace || !map.containsKey(name)) {
            values = new ArrayList<String>();
            map.put(name, values);
        } else {
            values = map.get(name);
        }
        values.add(data);
    }


    /**
     * Returns the first value stored for a field.
     *
     * @param object field map of a document
     * @param field  field name
     * @return the first value, or "" when the field is absent or has no values
     */
    private String getFirstField(HashMap<String, ArrayList<String>> object, String field) {
        ArrayList<String> values = object.get(field);
        if (values == null || values.isEmpty( )) {
            return "";
        }
        return values.get(0);
    }

    /**
     * Returns the last value stored for a field.
     *
     * @param object field map of a document
     * @param field  field name
     * @return the last value, or "" when the field is absent or has no values
     */
    private String getLastField(HashMap<String, ArrayList<String>> object, String field) {
        ArrayList<String> values = object.get(field);
        if (values == null || values.isEmpty( )) {
            return "";
        }
        int lastIndex = values.size( ) - 1;
        return values.get( lastIndex );
    }

    /**
     * Returns the collected documents, keyed by URI.
     *
     * <p>When {@code isPageData} is {@code true} the documents are returned untouched. Otherwise
     * every document is first completed, in place, with derived fields:</p>
     * <ul>
     *   <li>{@code author_sort}: the first value of the first non-empty role field, tried in the
     *       order AUT, ART, EDT, PUB, TRN, CRE, ETR, EGR; nothing is added when all are empty;</li>
     *   <li>{@code year_sort} and {@code year_sort_asc}: the first {@code year} value, and
     *       {@code year_sort_desc}: the last {@code year} value; when there is no {@code year},
     *       the "uncertain" marker is stored in {@code year} and in all three sort fields;</li>
     *   <li>{@code has_full_text}: "T" if a {@code text} field exists, else "F" (only when not already set);</li>
     *   <li>{@code is_ocr}: "F" when not already set;</li>
     *   <li>{@code freeculture}: "T" when not already set.</li>
     * </ul>
     *
     * <p>NOTE(review): {@code author_sort} and the {@code year_sort*} fields are added without
     * checking whether they already exist, so a second call with {@code isPageData == false}
     * looks like it would append them again. Confirm against {@code addField} and the callers.</p>
     *
     * @param isPageData {@code true} to skip the derived-field pass
     * @return the internal document map (not a copy)
     */
    public HashMap<String, HashMap<String, ArrayList<String>>> getDocuments( boolean isPageData ) {
        if ( !isPageData ) {
            // Roles that may supply author_sort, highest priority first. This is done here, at the
            // end, so that the order in which the role fields were encountered does not matter.
            final String[] sortRoles = {
                "role_AUT", "role_ART", "role_EDT", "role_PUB",
                "role_TRN", "role_CRE", "role_ETR", "role_EGR"
            };

            for (HashMap<String, ArrayList<String>> fields : documents.values()) {
                // author_sort: first role, in priority order, that has a non-empty value
                for (String role : sortRoles) {
                    final String name = getFirstField(fields, role);
                    if (name.length() > 0) {
                        addField(fields, "author_sort", name);
                        break;
                    }
                }

                // year_sort fields
                final String firstYear = getFirstField(fields, "year");
                if (firstYear.isEmpty()) {
                    // no usable year at all: mark everything as uncertain
                    addField( fields, "year", uncertain );
                    addField( fields, "year_sort", uncertain );
                    addField( fields, "year_sort_asc", uncertain );
                    addField( fields, "year_sort_desc", uncertain );
                } else {
                    final String lastYear = getLastField(fields, "year");
                    addField(fields, "year_sort", firstYear);
                    addField(fields, "year_sort_asc", firstYear);
                    addField(fields, "year_sort_desc", lastYear);
                }

                // full-text indicator: derived from the presence of a text field unless already set
                if (fields.get("has_full_text") == null) {
                    final boolean hasText = fields.get("text") != null;
                    addField(fields, "has_full_text", hasText ? "T" : "F");
                }

                // if we weren't told differently, then it is not an ocr object
                if (fields.get("is_ocr") == null) {
                    addField(fields, "is_ocr", "F");
                }

                // if we weren't told differently, then it is freeculture
                if (fields.get("freeculture") == null) {
                    addField(fields, "freeculture", "T");
                }
            }
        }
        return documents;
    }

    /**
     * Expands a year containing the wildcard character {@code 'u'} into every concrete year it
     * can stand for and appends those years to {@code years}; for example {@code 184u} yields
     * {@code 1840} through {@code 1849} and {@code 18uu} yields {@code 1800} through {@code 1899}.
     *
     * <p>Only the first four characters of {@code date} are examined. Expansion starts at the
     * first {@code 'u'} found there; everything from that position to the end of the examined
     * part is replaced by a zero-padded counter, whatever characters it originally held. If no
     * {@code 'u'} is found, {@code years} is left unchanged.</p>
     *
     * @param years list that receives the expanded years, in ascending order
     * @param date  the date string to expand; must not be {@code null}
     */
    private static void parseYearWild( List<String> years, final String date ) {

        // only the leading (at most) four characters are treated as the year
        final int yearLength = Math.min( date.length(), 4 );

        final int wildPos = date.indexOf( 'u' );
        if ( wildPos < 0 || wildPos >= yearLength ) {
            // no wildcard inside the year part: nothing to expand
            return;
        }

        final String prefix = date.substring( 0, wildPos );
        final int padSize = yearLength - wildPos;
        final String formatStr = "%0" + padSize + "d";

        // number of combinations for padSize wildcard digits (10^padSize, at most 10000)
        int combinations = 1;
        for ( int k = 0; k < padSize; k++ ) {
            combinations *= 10;
        }

        for ( int j = 0; j < combinations; j++ ) {
            // Locale.ROOT keeps the digits ASCII; the locale-less String.format uses the JVM
            // default locale, which may substitute that locale's own digit characters.
            years.add( prefix + String.format( java.util.Locale.ROOT, formatStr, j ) );
        }
    }

    /**
     * Expands a comma-separated year range ({@code start,finish}) and appends every year in it
     * to {@code years}.
     *
     * <p>Only the first four characters of each side are used. If {@code range} does not split
     * into exactly two parts, or either part is shorter than four characters, {@code years} is
     * cleared instead, discarding anything that was already in the list.</p>
     *
     * <p>The four-character prefixes are parsed with {@code Integer.parseInt} by
     * {@code enumerateYears}, so a non-numeric prefix results in a
     * {@code NumberFormatException}.</p>
     *
     * @param years list that receives the expanded years, or is cleared on malformed input
     * @param range the range to expand
     */
    private static void parseYearRange( List<String> years, final String range ) {

        final String[] parts = range.split( "," );
        if ( parts.length != 2 ) {
            // not exactly one start and one finish
            years.clear( );
            return;
        }

        final String from = parts[ 0 ];
        final String to = parts[ 1 ];
        if ( from.length() < 4 || to.length() < 4 ) {
            // too short to contain a four-digit year
            years.clear( );
            return;
        }

        years.addAll( enumerateYears( from.substring( 0, 4 ), to.substring( 0, 4 ) ) );
    }

    /**
     * Lists every year from {@code startYear} to {@code endYear}, inclusive.
     *
     * <p>The first element is always {@code startYear} exactly as passed in; the following years
     * are rendered from their integer value. When {@code endYear} is not greater than
     * {@code startYear} the result contains {@code startYear} only.</p>
     *
     * @param startYear first year of the range, as a decimal string
     * @param endYear   last year of the range, as a decimal string
     * @return a new list holding the years in ascending order
     * @throws NumberFormatException if either argument is not a parsable integer
     */
    private static ArrayList<String> enumerateYears(String startYear, String endYear) {
        final int first = Integer.parseInt(startYear);
        final int last = Integer.parseInt(endYear);

        final ArrayList<String> result = new ArrayList<String>();
        // the start year is kept verbatim rather than re-rendered from its parsed value
        result.add(startYear);

        if (last > first) {
            for (int year = first + 1; year <= last; year++) {
                result.add(Integer.toString(year));
            }
        }

        return result;
    }
    
    /**
     * Records an indexing error in the error report, tagged with the current file and the
     * current document URI.
     *
     * @param message description of the problem
     */
    private void addError( final String message ) {
        final ErrorReport report = this.errorReport;
        final IndexerError error = new IndexerError(this.file.toString(), this.documentURI, message);
        report.addError(error);
    }

    /**
     * Sets the file this handler associates with its work; the file's string form is what
     * reported errors are tagged with.
     *
     * @param file the file to associate with this handler
     */
    public void setFile(final File file) {
        this.file = file;
    }
    
    /**
     * Returns the current value of the largest-text-field counter.
     *
     * @return the stored size; the field is initialised to {@code -1}
     */
    public long getLargestTextSize() {
        return largestTextField;
    }

    /**
     * {@link RDFHandler} callback; this handler takes no action here.
     */
    public void endRDF() throws RDFHandlerException {
        // intentionally empty
    }

    /**
     * {@link RDFHandler} callback; the argument is ignored by this handler.
     *
     * @param arg0 unused
     */
    public void handleComment(String arg0) throws RDFHandlerException {
        // intentionally empty
    }

    /**
     * {@link RDFHandler} callback; both arguments are ignored by this handler.
     *
     * @param arg0 unused
     * @param arg1 unused
     */
    public void handleNamespace(String arg0, String arg1) throws RDFHandlerException {
        // intentionally empty
    }

    /**
     * {@link RDFHandler} callback; this handler takes no action here.
     */
    public void startRDF() throws RDFHandlerException {
        // intentionally empty
    }
}