/** * Copyright 2011 Applied Research in Patacriticism and the University of Virginia * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. **/ package org.nines; import org.apache.commons.lang.StringUtils; import java.io.File; import java.util.*; /** * Configuration for the RDFFileIndexer * * @author nicklaiacona */ public class RDFIndexerConfig { // mode of operation public enum Mode { NONE, // No mode.. do nothing. TEST, // Testing mode: ignore all ext text, dont post to solr SPIDER, // retrieve full text from external source - no post to solr CLEAN_RAW, // cleanup the raw sipdered text and move to fulltext CLEAN_FULL, // cleanup the fulltext INDEX, // populate solr with rdf data Text will be pulled from the RDF or fulltext RESOLVE, // examine existing archive and resolve any references (isPartOf, hasPart) COMPARE // compare the new arcive with the main index }; // general properties public String logRoot = "."; public File sourceDir; public String archiveName; public String solrBaseURL = "http://localhost:8983/solr"; public Mode mode = Mode.NONE; public String defaultEncoding = "UTF-8"; public String customCleanClass = ""; // corrected text map: URI -> filename public Map<String,String> correctedTextMap = new HashMap<String,String>(); public File correctedTextDir = null; // indexing properties public boolean collectLinks = true; public boolean deleteAll = false; public long maxUploadSize = 10000000; // 10m of characters // comparison properties public String ignoreFields = ""; public String includeFields = "*"; public int pageSize = 500; public boolean isPagesArchive() { return (this.archiveName.indexOf("pages_") == 0); } public final boolean isTestMode() { return this.mode.equals(Mode.TEST); } // all of the solr instance fields. Text is the last field private static final ArrayList<String> ALL_FIELDS = new ArrayList<String>( Arrays.asList( "uri", "archive", "date_label", "genre", "source", "image", "thumbnail", "title", "alternative", "url", "role_ART", "role_AUT", "role_BRD", "role_CNG", "role_CND", "role_DRT", "role_IVR", "role_IVE", "role_OWN", "role_FMO", "role_PRF", "role_PRO", "role_PRN", "role_EDT", "role_PBL", "role_TRL", "role_EGR", "role_ETR", "role_CRE", "freeculture", "is_ocr", "federation", "has_full_text", "source_xml", "typewright", "publisher", "agent", "agent_facet", "author", "editor", "text_url", "year", "type", "date_created", "date_updated", "title_sort", "author_sort", "year_sort", "source_html", "hasPart", "isPartOf", "source_sgml", "person", "format", "language", "geospacial", "text" )); private static final ArrayList<String> ALL_PAGE_FIELDS = new ArrayList<String>( Arrays.asList( "uri", "archive", "date_created", "date_updated", "page_num", "page_of", "text" )); /** * Gets the path and partial name of the logfile. The partial name * just includes the cleaned arhive name. To this must be appended * the log type and extension (ex: _progress.log) * * @return Full path and base name of logfile */ public final String getLogfileBaseName(String subFolder) { String name = this.archiveName.replaceAll("/", "_").replaceAll(":", "_").replaceAll(" ", "_"); String logFileRelativePath = this.logRoot + "/"; if (!subFolder.equals("")) logFileRelativePath = logFileRelativePath + subFolder + "/"; return logFileRelativePath + name; } /** * Look at the compare config and generate a field list * suitable for submission to Solr: * @return List in the form: field1+field2+... */ public final String getFieldList() { ArrayList<String> fields = ALL_FIELDS; if ( isPagesArchive() ) { fields = ALL_PAGE_FIELDS; } // if the ignored list has anything assume all fields and skip requested if (ignoreFields.trim().length() > 0) { List<String> ignored = new ArrayList<String>(Arrays.asList(ignoreFields.split(","))); List<String> fl = new ArrayList<String>( fields ); for (String ignore : ignored) { fl.remove(ignore); } return StringUtils.join( fl.iterator(), "+" ); } // all fields? if (includeFields.equals("*")) { return "*"; } // just some List<String> included = new ArrayList<String>(Arrays.asList(includeFields.split(","))); if (included.contains("uri") == false) { included.add("uri"); } return StringUtils.join(included.iterator(), "+"); } /** * Generate a clean core name from an archive */ public final String coreName( ) { return( coreName( archiveName ) ); } public final String coreName( final String archive ) { if (archive.indexOf("pages_") == 0) { return safeArchive( archive ); } return "archive_" + safeArchive( archive ); } public static final String safeArchive( String archive ) { archive = archive.replaceAll(":", "_"); archive = archive.replaceAll(" ", "_"); archive = archive.replaceAll(",", "_"); return( archive ); } }