/**
* Copyright 2011 Applied Research in Patacriticism and the University of Virginia
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
**/
package org.nines;
import org.apache.commons.lang.StringUtils;
import java.io.File;
import java.util.*;
/**
* Configuration for the RDFFileIndexer
*
* @author nicklaiacona
*/
public class RDFIndexerConfig {

    /** Mode of operation for the indexer. */
    public enum Mode {
        NONE,       // No mode; do nothing.
        TEST,       // Testing mode: ignore all ext text, don't post to solr
        SPIDER,     // retrieve full text from external source - no post to solr
        CLEAN_RAW,  // cleanup the raw spidered text and move to fulltext
        CLEAN_FULL, // cleanup the fulltext
        INDEX,      // populate solr with rdf data. Text will be pulled from the RDF or fulltext
        RESOLVE,    // examine existing archive and resolve any references (isPartOf, hasPart)
        COMPARE     // compare the new archive with the main index
    }

    /** Prefix that marks an archive name as a pages archive. */
    private static final String PAGES_PREFIX = "pages_";

    /** Separator placed between field names in the list handed to Solr. */
    private static final String FIELD_SEPARATOR = "+";

    // general properties
    public String logRoot = ".";
    public File sourceDir;
    public String archiveName;
    public String solrBaseURL = "http://localhost:8983/solr";
    public Mode mode = Mode.NONE;
    public String defaultEncoding = "UTF-8";
    public String customCleanClass = "";

    // corrected text map: URI -> filename
    public Map<String, String> correctedTextMap = new HashMap<String, String>();
    public File correctedTextDir = null;

    // indexing properties
    public boolean collectLinks = true;
    public boolean deleteAll = false;
    public long maxUploadSize = 10000000; // 10m of characters

    // comparison properties (comma separated field names)
    public String ignoreFields = "";
    public String includeFields = "*";
    public int pageSize = 500;

    // all of the solr instance fields. Text is the last field.
    // Wrapped as unmodifiable so the shared catalogue cannot be altered by accident.
    private static final List<String> ALL_FIELDS = Collections.unmodifiableList(Arrays.asList(
        "uri", "archive",
        "date_label", "genre", "source", "image", "thumbnail", "title", "alternative", "url", "role_ART", "role_AUT",
        "role_BRD", "role_CNG", "role_CND", "role_DRT", "role_IVR", "role_IVE", "role_OWN", "role_FMO", "role_PRF", "role_PRO", "role_PRN",
        "role_EDT", "role_PBL", "role_TRL", "role_EGR", "role_ETR", "role_CRE", "freeculture", "is_ocr", "federation",
        "has_full_text", "source_xml", "typewright", "publisher", "agent", "agent_facet", "author", "editor",
        "text_url", "year", "type", "date_created", "date_updated", "title_sort", "author_sort",
        "year_sort", "source_html",
        "hasPart", "isPartOf",
        "source_sgml", "person", "format", "language", "geospacial", "text"));

    // the fields used instead of ALL_FIELDS when the archive is a pages archive
    private static final List<String> ALL_PAGE_FIELDS = Collections.unmodifiableList(Arrays.asList(
        "uri", "archive",
        "date_created", "date_updated", "page_num", "page_of", "text"));

    /**
     * Tells whether the configured archive is a pages archive, i.e. whether
     * its name starts with "pages_".
     *
     * @return true if {@link #archiveName} starts with "pages_"; false
     *         otherwise, including when no archive name has been set
     */
    public boolean isPagesArchive() {
        return hasPagesPrefix(this.archiveName);
    }

    /**
     * @return true if the configured mode is {@link Mode#TEST}
     */
    public final boolean isTestMode() {
        // identity comparison is exact for enums and tolerates a null mode
        return this.mode == Mode.TEST;
    }

    /**
     * Gets the path and partial name of the logfile. The partial name
     * just includes the cleaned archive name ('/', ':' and ' ' replaced
     * by '_'). To this must be appended the log type and extension
     * (ex: _progress.log)
     *
     * @param subFolder folder below the log root to place the file in;
     *                  null or empty means directly in the log root
     * @return Full path and base name of logfile
     */
    public final String getLogfileBaseName(String subFolder) {
        final String cleanName = this.archiveName.replace('/', '_').replace(':', '_').replace(' ', '_');
        final StringBuilder path = new StringBuilder(this.logRoot).append('/');
        if (subFolder != null && subFolder.length() > 0) {
            path.append(subFolder).append('/');
        }
        return path.append(cleanName).toString();
    }

    /**
     * Look at the compare config and generate a field list
     * suitable for submission to Solr.
     * <ul>
     * <li>If {@link #ignoreFields} is non-blank, the result is every known
     *     field (page fields for a pages archive) minus the ignored ones;
     *     {@link #includeFields} is not consulted.</li>
     * <li>Otherwise, if {@link #includeFields} is "*", the result is "*".</li>
     * <li>Otherwise the result is the included fields, with "uri" appended
     *     when it is not already among them.</li>
     * </ul>
     * Whitespace around the comma separated names is ignored and empty
     * names are dropped.
     *
     * @return List in the form: field1+field2+...
     */
    public final String getFieldList() {
        // if the ignored list has anything assume all fields and skip requested
        if (this.ignoreFields != null && this.ignoreFields.trim().length() > 0) {
            final List<String> remaining =
                new ArrayList<String>(isPagesArchive() ? ALL_PAGE_FIELDS : ALL_FIELDS);
            remaining.removeAll(splitFieldNames(this.ignoreFields));
            return joinFields(remaining);
        }

        // all fields?
        if (this.includeFields != null && this.includeFields.trim().equals("*")) {
            return "*";
        }

        // just some; uri is always part of the result
        final List<String> included = splitFieldNames(this.includeFields);
        if (!included.contains("uri")) {
            included.add("uri");
        }
        return joinFields(included);
    }

    /**
     * Generate a clean core name from the configured archive
     *
     * @return the core name for {@link #archiveName}
     */
    public final String coreName() {
        return coreName(this.archiveName);
    }

    /**
     * Generate a clean core name from an archive. Names starting with
     * "pages_" are only cleaned; all others are cleaned and prefixed
     * with "archive_".
     *
     * @param archive the archive name; must not be null
     * @return the core name for the archive
     */
    public final String coreName(final String archive) {
        final String cleaned = safeArchive(archive);
        return hasPagesPrefix(archive) ? cleaned : "archive_" + cleaned;
    }

    /**
     * Replaces every ':', ' ' and ',' in an archive name with '_'.
     *
     * @param archive the archive name; must not be null
     * @return the cleaned name
     */
    public static final String safeArchive(String archive) {
        return archive.replace(':', '_').replace(' ', '_').replace(',', '_');
    }

    /** True when the name is non-null and starts with the pages prefix. */
    private static boolean hasPagesPrefix(final String name) {
        return name != null && name.startsWith(PAGES_PREFIX);
    }

    /**
     * Splits a comma separated list of field names into a new, mutable list,
     * trimming each name and dropping empty ones. Null yields an empty list.
     */
    private static List<String> splitFieldNames(final String csv) {
        final List<String> names = new ArrayList<String>();
        if (csv == null) {
            return names;
        }
        for (String piece : csv.split(",")) {
            final String name = piece.trim();
            if (name.length() > 0) {
                names.add(name);
            }
        }
        return names;
    }

    /** Joins the field names with '+', the separator used in the Solr field list. */
    private static String joinFields(final List<String> names) {
        final StringBuilder joined = new StringBuilder();
        for (String name : names) {
            if (joined.length() > 0) {
                joined.append(FIELD_SEPARATOR);
            }
            joined.append(name);
        }
        return joined.toString();
    }
}