/**
 * Copyright 2012-2015 The MITRE Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 ** **************************************************
 * NOTICE
 *
 * This software was produced for the U. S. Government under Contract No.
 * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
 * Software and Noncommercial Computer Software Documentation Clause
 * 252.227-7014 (JUN 1995)
 *
 * (c) 2012-2013 The MITRE Corporation. All Rights Reserved.
 * *************************************************
 */
package org.opensextant.output;

import java.net.URI;
import java.net.URISyntaxException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.FilenameUtils;
import org.opensextant.ConfigException;
import org.opensextant.giscore.events.Feature;
import org.opensextant.giscore.events.Schema;
import org.opensextant.giscore.events.SimpleField;
import org.opensextant.giscore.geometry.Point;
import org.opensextant.processing.ResultsUtility;
import org.opensextant.extraction.TextMatch;
import org.opensextant.extraction.ExtractionResult;
import org.opensextant.data.Geocoding;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Maps a geocoding and its text match onto a GISCore {@link Feature} row.
 *
 * <p>
 * The model keeps an ordered list of field keys ({@link #field_order}), builds
 * a {@link Schema} from them, and only writes a column to a row when the field
 * is both registered in {@link #field_set} and present in the schema (see
 * {@link #canAdd(SimpleField)}).
 *
 * <p>
 * Note for subclass authors: the constructor invokes the overridable
 * {@link #buildSchema(String)} when {@code buildSchema} is true, so an override
 * runs before the subclass constructor body has executed.
 */
public class GISDataModel {

    /**
     * Pattern used to render confidence values. {@link DecimalFormat} is not
     * thread-safe and this instance is shared by every model, so all access
     * goes through {@link #formatConfidence(double)}, which synchronizes on it.
     */
    private static final DecimalFormat confFmt = new DecimalFormat("0.000");

    protected final Logger log = LoggerFactory.getLogger(getClass());

    /** When false, the "start" and "end" fields are left out of the schema and rows. */
    protected boolean includeOffsets;

    /** When false, the "lat" and "lon" fields and the point geometry are left out. */
    protected boolean includeCoordinate;

    /** Schema describing the output columns; null until {@link #buildSchema(String)} has run. */
    protected Schema schema = null;

    /** Field keys in the order they are put into the schema. */
    protected List<String> field_order = new ArrayList<String>();

    /** Field keys accepted by {@link #canAdd(SimpleField)}; filled by {@link #buildSchema(String)}. */
    public Set<String> field_set = new HashSet<String>();

    /**
     * Creates a model with the default fields and builds its schema immediately.
     *
     * @param jobName
     *            name given to the schema
     * @param includeOffsets
     *            true to output match start/end offsets
     * @param includeCoordinate
     *            true to output latitude/longitude and point geometry
     */
    public GISDataModel(String jobName, boolean includeOffsets, boolean includeCoordinate) {
        this(jobName, includeOffsets, includeCoordinate, true);
    }

    /**
     * Creates a model, optionally registering the default fields and building
     * the schema right away.
     *
     * @param jobName
     *            name given to the schema
     * @param includeOffsets
     *            true to output match start/end offsets
     * @param includeCoordinate
     *            true to output latitude/longitude and point geometry
     * @param buildSchema
     *            true to add the default fields and build the schema now; false
     *            leaves the field list empty and the schema null
     * @throws RuntimeException
     *             wrapping the {@link ConfigException} if the schema cannot be built
     */
    public GISDataModel(String jobName, boolean includeOffsets, boolean includeCoordinate, boolean buildSchema) {
        super();
        this.includeOffsets = includeOffsets;
        this.includeCoordinate = includeCoordinate;

        if (buildSchema) {
            defaultFields();
            try {
                this.schema = buildSchema(jobName);
            } catch (ConfigException e) {
                // Could not successfully construct the schema... fail hard.
                throw new RuntimeException(e);
            }
        }
    }

    /**
     * Adds country, province, feature class/code and place name columns; when
     * coordinates are enabled and the geocoding has one, also sets the row's
     * point geometry and the lat/lon columns.
     *
     * @param row
     *            row of data
     * @param g
     *            geocoding
     */
    protected void addPlaceData(Feature row, Geocoding g) {
        addColumn(row, OpenSextantSchema.ISO_COUNTRY, g.getCountryCode());
        addColumn(row, OpenSextantSchema.PROVINCE, g.getAdmin1());
        addColumn(row, OpenSextantSchema.FEATURE_CLASS, g.getFeatureClass());
        addColumn(row, OpenSextantSchema.FEATURE_CODE, g.getFeatureCode());
        addColumn(row, OpenSextantSchema.PLACE_NAME, g.getPlaceName());

        if (includeCoordinate && g.hasCoordinate()) {
            // Set the geometry to be a point built from the geocoding's latitude/longitude.
            row.setGeometry(new Point(g.getLatitude(), g.getLongitude()));
            addLatLon(row, g);
        }
    }

    /**
     * Adds the precision column.
     *
     * @param row
     *            row of data
     * @param g
     *            geocoding
     */
    protected void addPrecision(Feature row, Geocoding g) {
        addColumn(row, OpenSextantSchema.PRECISION, g.getPrecision());
    }

    /**
     * Adds the confidence column, formatted by {@link #formatConfidence(double)}.
     *
     * @param row
     *            row of data
     * @param conf
     *            confidence
     */
    protected void addConfidence(Feature row, double conf) {
        addColumn(row, OpenSextantSchema.CONFIDENCE, formatConfidence(conf));
    }

    /**
     * Adds the start and end offset columns of the match.
     *
     * @param row
     *            data
     * @param m
     *            match metadata
     */
    protected void addOffsets(Feature row, TextMatch m) {
        addColumn(row, OpenSextantSchema.START_OFFSET, m.start);
        addColumn(row, OpenSextantSchema.END_OFFSET, m.end);
    }

    /**
     * Adds the latitude and longitude columns to the given data row.
     *
     * @param row
     *            data
     * @param g
     *            geocoding
     */
    protected void addLatLon(Feature row, Geocoding g) {
        addColumn(row, OpenSextantSchema.LAT, g.getLatitude());
        addColumn(row, OpenSextantSchema.LON, g.getLongitude());
    }

    /**
     * If the caller has additional data to attach to records, allow them to add
     * fields to schema at runtime and map their data to keys on GeocodingResult.
     *
     * Similarly, you could have Geocoding row-level attributes unique to the
     * geocoding whereas attrs on GeocodingResult are global for all geocodings
     * in that result set.
     *
     * Each map key is resolved with {@code OpenSextantSchema.getField}; a null
     * map is ignored.
     *
     * @param row
     *            the row
     * @param rowAttributes
     *            field key to value; may be null
     * @throws ConfigException
     *             if a key cannot be resolved to a schema field
     */
    protected void addAdditionalAttributes(Feature row, Map<String, Object> rowAttributes)
            throws ConfigException {
        if (rowAttributes == null) {
            return;
        }

        for (Map.Entry<String, Object> attr : rowAttributes.entrySet()) {
            log.debug("FIELD={} = {}", attr.getKey(), attr.getValue());
            addColumn(row, OpenSextantSchema.getField(attr.getKey()), attr.getValue());
        }
    }

    /**
     * Adds the file name (base name of the original), the original file path
     * and, when it differs from the original, the text-version path. Logs an
     * error and adds nothing when no original path is given.
     *
     * @param row
     *            data
     * @param recordFile
     *            original file
     * @param recordTextFile
     *            text version of original
     */
    protected void addFilePaths(Feature row, String recordFile, String recordTextFile) {
        // TODO: HPATH goes here.

        if (recordFile == null) {
            log.error("No File path given");
            return;
        }

        addColumn(row, OpenSextantSchema.FILENAME, FilenameUtils.getBaseName(recordFile));
        addColumn(row, OpenSextantSchema.FILEPATH, recordFile);

        // Only add the text path when a text version exists and it is a
        // different file than the original (i.e. the original was converted).
        if (recordTextFile != null && !recordFile.equals(recordTextFile)) {
            addColumn(row, OpenSextantSchema.TEXTPATH, recordTextFile);
        }
    }

    /**
     * Adds the context column from the match.
     *
     * @param row
     *            the row
     * @param g
     *            the text match
     */
    protected void addContext(Feature row, TextMatch g) {
        addColumn(row, OpenSextantSchema.CONTEXT, g.getContext());
    }

    /**
     * Adds the matched text column.
     *
     * @param row
     *            the row
     * @param g
     *            the text match
     */
    protected void addMatchText(Feature row, TextMatch g) {
        addColumn(row, OpenSextantSchema.MATCH_TEXT, g.getText());
    }

    /**
     * Allows caller to add a method or pattern id of sorts to denote how match
     * was derived.
     *
     * @param row
     *            the row
     * @param method
     *            the method
     */
    protected void addMatchMethod(Feature row, String method) {
        addColumn(row, OpenSextantSchema.MATCH_METHOD, method);
    }

    /**
     * Adds the match method column using the match's type.
     *
     * @param row
     *            the row
     * @param match
     *            the match
     */
    protected void addMatchMethod(Feature row, TextMatch match) {
        addColumn(row, OpenSextantSchema.MATCH_METHOD, match.getType());
    }

    /**
     * Builds a GISCore feature list (rows) for a given geocoding and text match;
     * enriches the feature with record-level attributes (columns). If the
     * provided result has .input set, then context for this match will be
     * pulled from it when the match has none. Context is not pulled at match
     * time, as it is not used by most processing -- it tends to be more of an
     * output/formatting issue. And only matches that pass any filters are
     * enriched with context and other metadata.
     *
     * Confidence is not populated by this method; callers or subclasses that
     * have a confidence value can use {@link #addConfidence(Feature, double)}.
     *
     * @param id
     *            value stored in the schema OID column
     * @param g
     *            the geocoding
     * @param m
     *            the text match
     * @param rowAttributes
     *            additional field key/value pairs; may be null
     * @param res
     *            the extraction result the match came from; when null, no
     *            context lookup or file paths are added
     * @return a list holding the single row built
     * @throws ConfigException
     *             schema configuration error, including a model whose schema
     *             has not been built
     */
    public List<Feature> buildRows(int id, Geocoding g, TextMatch m, Map<String, Object> rowAttributes,
            ExtractionResult res) throws ConfigException {

        if (schema == null) {
            // Fail with the declared exception rather than an opaque NullPointerException.
            throw new ConfigException("Schema is not built; cannot build rows");
        }

        Feature row = new Feature();

        // Administrative settings:
        row.setName(g.getPlaceName());
        row.setSchema(schema.getId());
        row.putData(OpenSextantSchema.SCHEMA_OID, id);

        if (includeOffsets) {
            addOffsets(row, m);
        }

        addPlaceData(row, g);
        addPrecision(row, g);

        // Lazily derive the context from the input buffer if the match has none yet.
        if (m.getContext() == null && res != null && res.input != null && res.input.buffer != null) {
            int len = res.input.buffer.length();
            ResultsUtility.setContextFor(res.input.buffer, m, len);
        }

        addContext(row, m);
        addMatchText(row, m);
        addMatchMethod(row, g.getMethod());

        addAdditionalAttributes(row, rowAttributes);

        if (res != null && res.recordFile != null) {
            addFilePaths(row, res.recordFile, res.recordTextFile);
        }

        // This is a list for M x N times
        List<Feature> features = new ArrayList<Feature>();
        features.add(row);
        return features;
    }

    /**
     * Convenience method for managing how confidence number is reported in
     * output. Uses the pattern "0.000".
     *
     * @param conf
     *            the confidence value
     * @return the formatted confidence
     */
    protected String formatConfidence(double conf) {
        // DecimalFormat is not safe for concurrent use; the formatter is static and shared.
        synchronized (confFmt) {
            return confFmt.format(conf);
        }
    }

    /**
     * Gets the schema.
     *
     * @return the schema, or null if it has not been built
     */
    public Schema getSchema() {
        return this.schema;
    }

    /**
     * Create a schema instance with the fields properly typed and ordered.
     * Returns the existing schema unchanged if one was already built.
     *
     * The OID field is always added first. Offset fields ("start", "end") and
     * coordinate fields ("lat", "lon") are skipped when the corresponding
     * include flag is off. Afterwards every key in {@link #field_order} is
     * registered in {@link #field_set}.
     *
     * @param jobName
     *            the job name, used as the schema name
     * @return the schema
     * @throws ConfigException
     *             schema configuration error
     */
    protected Schema buildSchema(String jobName) throws ConfigException {

        if (this.schema != null) {
            return this.schema;
        }

        // Constant, well-formed URN: URI.create avoids a checked exception that
        // would otherwise have to be swallowed, and still fails loudly if the
        // literal is ever edited into something invalid.
        URI uri = URI.create("urn:OpenSextant");
        this.schema = new Schema(uri);

        // Add ID field to the schema
        this.schema.put(OpenSextantSchema.SCHEMA_OID);
        this.schema.setName(jobName);

        for (String field : field_order) {
            boolean isOffsetField = field.equals("start") || field.equals("end");
            if (isOffsetField && !this.includeOffsets) {
                continue;
            }

            boolean isCoordinateField = field.equals("lat") || field.equals("lon");
            if (isCoordinateField && !this.includeCoordinate) {
                continue;
            }

            this.schema.put(getField(field));
        }

        // Skipped fields are registered too; canAdd() still rejects them
        // because they are absent from the schema itself.
        this.field_set.addAll(field_order);

        return this.schema;
    }

    /**
     * Looks up a field definition by key via {@code OpenSextantSchema.getField}.
     *
     * @param field
     *            the field key
     * @return the field
     * @throws ConfigException
     *             the config exception
     */
    protected SimpleField getField(String field) throws ConfigException {
        return OpenSextantSchema.getField(field);
    }

    /**
     * Tells whether a column for the given field may be written: the field must
     * be non-null, registered in {@link #field_set} and present in the schema.
     * Always false when no schema has been built.
     *
     * @param f
     *            the field
     * @return true if the field can be added to a row
     */
    protected boolean canAdd(SimpleField f) {
        if (f == null || schema == null) {
            return false;
        }
        String name = f.getName();
        return field_set.contains(name) && schema.get(name) != null;
    }

    /**
     * Add a column of data to output; Field is validated ; value is not added
     * if null.
     *
     * @param row
     *            the row
     * @param f
     *            the field
     * @param d
     *            the value
     */
    protected void addColumn(Feature row, SimpleField f, Object d) {
        if (d != null && canAdd(f)) {
            row.putData(f, d);
        }
    }

    /**
     * Add a column of data to output; Field is validated.
     *
     * @param row
     *            the row
     * @param f
     *            the field
     * @param d
     *            the value
     */
    protected void addColumn(Feature row, SimpleField f, int d) {
        if (canAdd(f)) {
            row.putData(f, d);
        }
    }

    /**
     * Add a column of data to output; Field is validated.
     *
     * @param row
     *            the row
     * @param f
     *            the field
     * @param d
     *            the value
     */
    protected void addColumn(Feature row, SimpleField f, double d) {
        if (canAdd(f)) {
            row.putData(f, d);
        }
    }

    /**
     * Add a field key to the field order; Caller must also be responsible for
     * ensuring field is valid and exists in Schema. This only appends to
     * {@link #field_order}; it does not modify a schema that was already built.
     *
     * @param fld
     *            field name
     * @throws ConfigException
     *             if the field is not defined
     */
    public void addField(String fld) throws ConfigException {
        if (getField(fld) == null) {
            throw new ConfigException("Field is not defined in Schema");
        }
        field_order.add(fld);
    }

    /**
     * Removes a field key from the field order. This only changes
     * {@link #field_order}; it does not modify a schema that was already built.
     *
     * @param fld
     *            field name
     * @throws ConfigException
     *             if the field is not defined
     */
    public void removeField(String fld) throws ConfigException {
        if (getField(fld) == null) {
            throw new ConfigException("Field is not defined in Schema; Cannot remove non-existing field");
        }
        field_order.remove(fld);
    }

    /**
     * Registers the default field keys, in output order. The ID field is not
     * listed here; {@link #buildSchema(String)} always adds it.
     */
    protected final void defaultFields() {
        // Matching data
        field_order.add("placename");

        // Geographic
        field_order.add("province");
        field_order.add("iso_cc");
        field_order.add("lat");
        field_order.add("lon");

        // Textual context.
        field_order.add("matchtext");
        field_order.add("context");

        // File mechanics
        field_order.add("filename");
        field_order.add("filepath");
        field_order.add("textpath");

        // Match/feature metadata
        field_order.add("method");
        field_order.add("feat_class");
        field_order.add("feat_code");
        field_order.add("confidence");
        field_order.add("precision");
        field_order.add("start");
        field_order.add("end");
    }
}