/*
* This software was produced for the U. S. Government
* under Basic Contract No. W15P7T-13-C-A802, and is
* subject to the Rights in Noncommercial Computer Software
* and Noncommercial Computer Software Documentation
* Clause 252.227-7014 (FEB 2012)
*
* Copyright (C) 2016 The MITRE Corporation.
* Copyright (C) 2016 OpenSextant.org
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.opensextant.mapreduce;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.opensextant.data.TextInput;
import org.opensextant.extraction.TextMatch;
import org.opensextant.util.TextUtils;
import net.sf.json.JSONObject;
import java.io.IOException;
/**
 * Common configuration and JSON helpers shared by the mappers.
 * <p>
 * Input records are keyed by {@link BytesWritable} with a {@link Text} value;
 * output keys and values are both {@link Text}.
 */
public abstract class AbstractMapper extends Mapper<BytesWritable, Text, Text, Text> {

    /** Counter made available to subclasses; nothing in this class updates it. */
    protected long counter = 0;

    /**
     * Configures logging from the job configuration.
     *
     * @param c the mapper context whose configuration is handed to
     *          {@code LoggingUtilities.configureLogging}
     * @throws IOException declared for overriders; not thrown directly by this method
     */
    @Override
    public void setup(Context c) throws IOException {
        LoggingUtilities.configureLogging(c.getConfiguration());
    }

    /**
     * A common method for converting a Text object into an Xponents TextInput tuple.
     * The assumptions for this demonstration method are:
     * <ul>
     * <li>input is JSON data and can be parsed as such</li>
     * <li>JSON data contains a top level "text" field, which will be used for extraction.</li>
     * <li>record ID is the result of key.toString(), or if key is null, then the
     * top level JSON "id" field is used</li>
     * </ul>
     * Records that cannot be turned into an input yield {@code null} rather than an
     * exception: a null record, a record without "text", or a record that has neither
     * a key nor an "id" field. Malformed JSON is not handled here; whatever the JSON
     * parser throws propagates to the caller.
     * <p>
     * Caller can optionally set Language ID of text on the returned object.
     *
     * @param key record ID, optionally null.
     * @param textRecord a JSON formatted object.
     * @return TextInput pair, or {@code null} if the record is unusable as described above.
     */
    protected static TextInput prepareInput(final Object key, final Text textRecord) {
        if (textRecord == null) {
            return null;
        }

        JSONObject obj = JSONObject.fromObject(textRecord.toString());
        if (!obj.containsKey("text")) {
            return null;
        }

        final String textId;
        if (key != null) {
            textId = key.toString();
        } else if (obj.containsKey("id")) {
            textId = obj.getString("id");
        } else {
            // No usable identifier: treat like a record with no "text" instead of
            // letting getString("id") raise on the missing field.
            return null;
        }

        return new TextInput(textId, obj.getString("text"));
    }

    /**
     * Given an Xponents match, produce a common JSON output.
     * The resulting object carries three fields:
     * <ul>
     * <li>"type" - the value of {@code tm.getType()}</li>
     * <li>"value" - the matched text after {@code TextUtils.squeeze_whitespace}</li>
     * <li>"offset" - the value of {@code tm.start}</li>
     * </ul>
     *
     * @param tm the match to serialize; must not be null.
     * @return a new JSON object describing the match.
     */
    protected static JSONObject prepareOutput(final TextMatch tm) {
        JSONObject j = new JSONObject();
        j.put("type", tm.getType());
        j.put("value", TextUtils.squeeze_whitespace(tm.getText()));
        j.put("offset", tm.start);
        return j;
    }
}