/*
This software was produced for the U. S. Government
under Contract No. W15P7T-11-C-F600, and is
subject to the Rights in Noncommercial Computer Software
and Noncommercial Computer Software Documentation
Clause 252.227-7014 (JUN 1995)
Copyright 2013 The MITRE Corporation. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package org.opensextant.matching;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;
import org.apache.solr.client.solrj.response.SolrResponseBase;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class DataLoader {
private static ModifiableSolrParams loadParams = new ModifiableSolrParams();
private static String requestHandler = "/update";
/** Log object. */
private static final Logger LOGGER = LoggerFactory.getLogger(DataLoader.class);
/** Some common params. */
static {
loadParams.set("update.contentType", "text/csv");
loadParams.set("skipLines", "1");
loadParams.set("optimize", "true");
loadParams.set("separator", "\t");
loadParams.set("header", "false");
loadParams.set("trim", "on");
loadParams.set("overwrite", "false");
loadParams.set("debug", "true");
}
private DataLoader() {
}
public static void main(String[] args) throws Exception {
if (args.length < 3 || args.length > 4) {
usage();
}
String scheme = args[0];
String inputForm = args[1];
String csvFilePath = args[2];
String solrhome = "";
if (args.length == 4) {
solrhome = args[3];
}
// get a SolrServer with the proper core
SolrClient solrServer = getSolrServer(scheme, solrhome);
// convert indexed content to flat list
// currently creates a temp file, could stream?
if ("index".equalsIgnoreCase(inputForm)) {
csvFilePath = flatten(csvFilePath);
}
try {
// set the fieldnames param for the selected schema
final ModifiableSolrParams params = new ModifiableSolrParams(loadParams);
if ("gazetteer".equalsIgnoreCase(scheme)) {
params.set("fieldnames", MatcherFactory.getGazetteerFieldNamesLoader());
} else {
params.set("fieldnames", MatcherFactory.getVocabFieldNames());
}
// build the update request
final ContentStreamUpdateRequest updateRequest = new ContentStreamUpdateRequest(requestHandler);
updateRequest.setParams(params);
ContentStream inStream = new ContentStreamBase.FileStream(new File(csvFilePath));
// add the input file as a stream to the request
updateRequest.addContentStream(inStream);
// make the call
SolrResponseBase response = null;
try {
response = updateRequest.process(solrServer);
// see what happened
printResponse(response);
} catch (Exception e) {
LOGGER.error("Exception in submitting Solr request " + e);
}
} finally {
// cleanup
solrServer.close();
}
}
private static void usage() {
String tmp = "DataLoader <scheme> <inputformat> <inputfilepath> <solrhome> where\n";
tmp = tmp + " <scheme> = gazetteer | vocabulary\n";
tmp = tmp + " <inputformat> = csv | index\n";
tmp = tmp + " <inputfilepath> = file to be loaded\n";
tmp = tmp + " <solrhome> = path to solr home (optional)\n";
LOGGER.info(tmp);
}
private static String flatten(String currentPath) {
File topDir = new File(currentPath).getParentFile();
File input = new File(currentPath);
Map<File, String> index = new HashMap<File, String>();
// read the index file into the index Map
// loop over the lines of the index file
LineIterator indexIter = null;
try {
indexIter = FileUtils.lineIterator(input, "UTF-8");
} catch (IOException e) {
LOGGER.error("Couldnt read from " + input.getName() + ":", e);
return null;
}
if (indexIter != null) {
while (indexIter.hasNext()) {
// get next line
String line = indexIter.next();
String[] pieces = line.split(":");
File subFile = new File(topDir, pieces[0]);
String tmpVal = pieces[1];
if (pieces.length >= 3) {
tmpVal = tmpVal + ":" + pieces[2];
}
index.put(subFile, tmpVal);
}
}
File tmp = null;
try {
tmp = File.createTempFile("vocab", "txt");
} catch (IOException e) {
LOGGER.error("Could not create temp file when flattening vocab:", e);
return null;
}
// loop over the files mentioned in the index and write to temp file
int indexID = 0;
for (File in : index.keySet()) {
String[] catAndTax = index.get(in).split(":");
String cat = catAndTax[0];
String tax = "";
if (catAndTax.length > 1) {
tax = catAndTax[1];
} else {
tax = "NONE";
}
// loop over the lines of the subfiles file
// write the new flat contents to the temp file
LineIterator contentIter = null;
try {
contentIter = FileUtils.lineIterator(in, "UTF-8");
} catch (IOException e) {
LOGGER.error("Couldnt read from " + in.getName(), e);
return null;
}
if (contentIter != null) {
while (contentIter.hasNext()) {
// get next line
String line = contentIter.next();
// concat the pieces
String out = indexID + "\t" + line + "\t" + cat + "\t" + tax + "\n";
// write all pieces to temp
try {
FileUtils.writeStringToFile(tmp, out, "UTF-8", true);
} catch (IOException e) {
LOGGER.error("Could not write to temp file when flattening vocab:", e);
}
indexID++;
}
}
}
LOGGER.info("Flattened " + indexID + " vocabulary entries to temp file");
// return temp file path
return tmp.getAbsolutePath();
}
private static void printResponse(SolrResponseBase response) {
LOGGER.info(response.toString());
}
private static SolrClient getSolrServer(String scheme, String solrhome) {
MatcherFactory.config(solrhome);
MatcherFactory.start();
SolrClient svr = null;
if ("gazetteer".equalsIgnoreCase(scheme)) {
svr = MatcherFactory.getSolrServerGeo();
} else {
svr = MatcherFactory.getSolrServerVocab();
}
return svr;
}
}