package com.bericotech.clavin.index; import static com.bericotech.clavin.index.IndexField.*; import static java.util.concurrent.TimeUnit.MILLISECONDS; import com.bericotech.clavin.gazetteer.BasicGeoName; import com.bericotech.clavin.gazetteer.CountryCode; import com.bericotech.clavin.gazetteer.FeatureClass; import com.bericotech.clavin.gazetteer.FeatureCode; import com.bericotech.clavin.gazetteer.GeoName; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Comparator; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.io.FileUtils; import org.apache.commons.lang.StringUtils; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.IntField; import org.apache.lucene.document.LongField; import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /*##################################################################### * * CLAVIN (Cartographic Location And Vicinity INdexer) * --------------------------------------------------- * * Copyright (C) 2012-2013 Berico Technologies * http://clavin.bericotechnologies.com * * ==================================================================== * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. * * ==================================================================== * * IndexDirectoryBuilder.java * *###################################################################*/ /** * Builds a Lucene index of geographic entries based on * the GeoNames gazetteer. * * This program is run one-time before CLAVIN can be used. * */ public class IndexDirectoryBuilder { private final static Logger LOG = LoggerFactory.getLogger(IndexDirectoryBuilder.class); private static final String HELP_OPTION = "help"; private static final String FULL_ANCESTRY_OPTION = "with-full-ancestry"; private static final String GAZETTEER_FILES_OPTION = "gazetteer-files"; private static final String INDEX_PATH_OPTION = "index-path"; private static final String REPLACE_INDEX_OPTION = "replace-index"; private static final String ALTERNATE_NAMES_OPTION = "alt-names-file"; private static final String[] DEFAULT_GAZETTEER_FILES = new String[] { "./allCountries.txt", "./src/main/resources/SupplementaryGazetteer.txt" }; private static final String DEFAULT_INDEX_DIRECTORY = "./IndexDirectory"; private final Map<String, GeoName> adminMap; private final Map<String, Set<GeoName>> unresolvedMap; private final Map<Integer, AlternateName> alternateNameMap; private final boolean fullAncestry; private IndexWriter indexWriter; private int indexCount; private IndexDirectoryBuilder(final boolean fullAncestryIn) { adminMap = new TreeMap<String, GeoName>(); unresolvedMap = new TreeMap<String, Set<GeoName>>(); alternateNameMap = new HashMap<Integer, AlternateName>(); this.fullAncestry = fullAncestryIn; } public void buildIndex(final File indexDir, final List<File> gazetteerFiles, final File altNamesFile) throws IOException { LOG.info("Indexing... please wait."); indexCount = 0; // Create a new index file on disk, allowing Lucene to choose // the best FSDirectory implementation given the environment. FSDirectory index = FSDirectory.open(indexDir); // indexing by lower-casing & tokenizing on whitespace Analyzer indexAnalyzer = new WhitespaceLowerCaseAnalyzer(); // create the object that will actually build the Lucene index indexWriter = new IndexWriter(index, new IndexWriterConfig(Version.LUCENE_4_9, indexAnalyzer)); // let's see how long this takes... Date start = new Date(); // if we were given an alternate names file, process it if (altNamesFile != null) { loadAlternateNames(altNamesFile); } // load GeoNames gazetteer into Lucene index String line; int count = 0; for (File gazetteer : gazetteerFiles) { LOG.info("Processing Gazetteer: {}", gazetteer.getAbsolutePath()); BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(gazetteer), "UTF-8")); while ((line = reader.readLine()) != null) { try { count += 1; // print progress update to console if (count % 100000 == 0 ) { LOG.info("rowcount: " + count); } GeoName geoName = BasicGeoName.parseFromGeoNamesRecord(line); resolveAncestry(geoName); } catch (IOException e) { LOG.info("Skipping... Error on line: {}", line); } catch (RuntimeException re) { LOG.info("Skipping... Error on line: {}", line); } } reader.close(); } // that wasn't so long, was it? Date stop = new Date(); LOG.info("Unresolved GeoNames (Pre-resolution)"); logUnresolved(); resolveUnresolved(); LOG.info("Unresolved GeoNames (Post-resolution)"); logUnresolved(); LOG.info("Indexing unresolved GeoNames."); for (Set<GeoName> geos : unresolvedMap.values()) { for (GeoName nm : geos) { indexGeoName(nm); } } LOG.info("[DONE]"); LOG.info("{} geonames added to index. ({} records)", indexWriter.maxDoc(), indexCount); LOG.info("Merging indices... please wait."); indexWriter.close(); index.close(); LOG.info("[DONE]"); DateFormat df = new SimpleDateFormat("HH:mm:ss"); long elapsed_MILLIS = stop.getTime() - start.getTime(); LOG.info("Process started: " + df.format(start) + ", ended: " + df.format(stop) + "; elapsed time: " + MILLISECONDS.toSeconds(elapsed_MILLIS) + " seconds."); } private static final int ALT_NAMES_ID_FIELD = 1; private static final int ALT_NAMES_LANG_FIELD = 2; private static final int ALT_NAMES_NAME_FIELD = 3; private static final int ALT_NAMES_PREFERRED_FIELD = 4; private static final int ALT_NAMES_SHORT_FIELD = 5; private static final String ALT_NAMES_TRUE = "1"; private static final String ISO2_ENGLISH = "en"; private static final String ISO3_ENGLISH = "eng"; private void loadAlternateNames(final File altNamesFile) throws IOException { LOG.info("Reading alternate names file: {}", altNamesFile.getAbsolutePath()); // parse all lines of the alternate names database and store only the 'en' names // marked as preferred or short names for each location // // Column format (see http://download.geonames.org/export/dump/) // ------------------------------------------------------ // alternateNameId : the id of this alternate name, int // geonameid : geonameId referring to id in table 'geoname', int // isolanguage : iso 639 language code 2- or 3-characters; 4-characters 'post' for postal // codes and 'iata','icao' and faac for airport codes, fr_1793 for French // Revolution names, abbr for abbreviation, link for a website, varchar(7) // alternate name : alternate name or name variant, varchar(200) // isPreferredName : '1', if this alternate name is an official/preferred name // isShortName : '1', if this is a short name like 'California' for 'State of California' // isColloquial : '1', if this alternate name is a colloquial or slang term // isHistoric : '1', if this alternate name is historic and was used in the past BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(altNamesFile), "UTF-8")); String line; int lineNum = 0; while ((line = reader.readLine()) != null) { lineNum++; AlternateName name = new AlternateName(line); if (name.isEnglish() && name.isPrefOrShort()) { alternateNameMap.put(name.geonameId, name.bestName(alternateNameMap.get(name.geonameId))); } } reader.close(); LOG.info("Processed {} alternate names. Found {} names.", lineNum, alternateNameMap.size()); } private void resolveAncestry(final GeoName geoname) throws IOException { // set this GeoName's parent if it is known String parentKey = geoname.getParentAncestryKey(); if (parentKey != null) { // if we cannot successfully set the parent, add to the unresolved map, // waiting for a parent to be set if (!geoname.setParent(adminMap.get(parentKey)) || !geoname.isAncestryResolved()) { Set<GeoName> unresolved = unresolvedMap.get(parentKey); if (unresolved == null) { unresolved = new HashSet<GeoName>(); unresolvedMap.put(parentKey, unresolved); } unresolved.add(geoname); } } // if this geoname is fully resolved, add it to the index if (geoname.isAncestryResolved()) { indexGeoName(geoname); } // if this is an administrative division, configure the parent of any waiting // GeoNames and notify all 2nd level and further descendants their tree has been // updated String myKey = geoname.getAncestryKey(); if (myKey != null) { GeoName conflict = adminMap.get(myKey); if (conflict != null) { LOG.error(String.format("Resolved duplicate admin key [%s] for GeoNames (%d %s:%s %s) and (%d %s:%s %s)", myKey, conflict.getGeonameID(), conflict.getFeatureClass(), conflict.getFeatureCode(), conflict.getName(), geoname.getGeonameID(), geoname.getFeatureClass(), geoname.getFeatureCode(), geoname.getName())); } adminMap.put(myKey, geoname); checkDescendantsResolved(geoname, true); } } private void checkDescendantsResolved(final GeoName geoname, final boolean setParent) throws IOException { String key = geoname.getAncestryKey(); if (key != null) { Set<GeoName> descendants = unresolvedMap.get(key); if (descendants != null) { // use an iterator so we can remove elements Iterator<GeoName> iter = descendants.iterator(); while (iter.hasNext()) { GeoName desc = iter.next(); if (setParent) { if (!desc.setParent(geoname)) { LOG.error("Error setting parent [{}] of GeoName [{}].", geoname, desc); } } if (desc.isAncestryResolved()) { checkDescendantsResolved(desc, false); indexGeoName(desc); iter.remove(); } } if (descendants.isEmpty()) { unresolvedMap.remove(key); } } } } private void resolveUnresolved() throws IOException { // sort keys in ascending order by level of specificity and name Set<String> keys = new TreeSet<String>(new Comparator<String>() { @Override public int compare(final String strA, final String strB) { int specA = strA.split("\\.").length; int specB = strB.split("\\.").length; return specA != specB ? specA - specB : strA.compareTo(strB); } }); keys.addAll(unresolvedMap.keySet()); // iterate over keys, attempting to resolve less specific keys first; if // they are resolved, this may result in more specific keys being resolved // as well for (String key : keys) { String subKey = key; GeoName parent = null; int lastDot; while (parent == null && (lastDot = subKey.lastIndexOf(".")) > 0) { subKey = key.substring(0, lastDot); parent = adminMap.get(subKey); } if (parent != null) { Set<GeoName> unresolved = unresolvedMap.get(key); if (unresolved == null) { // resolving a higher-level key also resolved this key; do nothing break; } Iterator<GeoName> iter = unresolved.iterator(); // use iterator so we can remove while (iter.hasNext()) { GeoName geoName = iter.next(); // first check to see if a previous loop resolved all parents if (geoName.isAncestryResolved()) { indexGeoName(geoName); iter.remove(); } else if (geoName.setParent(parent)) { if (geoName.isAncestryResolved()) { // ancestry has been resolved, remove from the unresolved collection indexGeoName(geoName); iter.remove(); } else { LOG.error("GeoName [{}] should be fully resolved. (parent: {})", geoName, parent); } } else { LOG.error("Unable to set parent of {} to {}", geoName, parent); } } if (unresolved.isEmpty()) { unresolvedMap.remove(key); } } else { LOG.error("Unable to resolve parent for GeoName key: {}", key); } } } /** * Builds a set of Lucene documents for the provided GeoName, indexing * each using all available names and storing the entire ancestry path * for each GeoName in the index. See {@link IndexField} for descriptions * of the fields indexed for each document. * * @param geoName the GeoName to index * @throws IOException if an error occurs while indexing */ private void indexGeoName(final GeoName geoName) throws IOException { indexCount++; // find all unique names for this GeoName String nm = geoName.getName(); String asciiNm = geoName.getAsciiName(); Set<String> names = new HashSet<String>(); names.add(nm); names.add(asciiNm); names.addAll(geoName.getAlternateNames()); // if this is a top-level administrative division, add its primary and alternate country codes // if they are not already found in the name or alternate names if (geoName.isTopLevelAdminDivision()) { if (geoName.getPrimaryCountryCode() != null) { names.add(geoName.getPrimaryCountryCode().name()); } for (CountryCode cc : geoName.getAlternateCountryCodes()) { names.add(cc.name()); } } AlternateName preferredName = alternateNameMap.get(geoName.getGeonameID()); // ensure preferred name is found in alternate names if (preferredName != null) { names.add(preferredName.name); } names.remove(null); names.remove(""); // reuse a single Document and field instances Document doc = new Document(); doc.add(new StoredField(GEONAME.key(), fullAncestry ? geoName.getGazetteerRecordWithAncestry() : geoName.getGazetteerRecord())); doc.add(new IntField(GEONAME_ID.key(), geoName.getGeonameID(), Field.Store.YES)); // if the alternate names file was loaded and we found a preferred name for this GeoName, store it if (preferredName != null) { doc.add(new StoredField(PREFERRED_NAME.key(), preferredName.name)); } // index the direct parent ID in the PARENT_ID field GeoName parent = geoName.getParent(); if (parent != null) { doc.add(new IntField(PARENT_ID.key(), parent.getGeonameID(), Field.Store.YES)); } // index all ancestor IDs in the ANCESTOR_IDS field; this is a secondary field // so it can be used to restrict searches and PARENT_ID can be used for ancestor // resolution while (parent != null) { doc.add(new IntField(ANCESTOR_IDS.key(), parent.getGeonameID(), Field.Store.YES)); parent = parent.getParent(); } doc.add(new LongField(POPULATION.key(), geoName.getPopulation(), Field.Store.YES)); // set up sort field based on population and geographic feature type if (geoName.getFeatureClass().equals(FeatureClass.P) || geoName.getFeatureCode().name().startsWith("PCL")) { if (geoName.getGeonameID() != 2643741) // todo: temporary hack until GeoNames.org fixes the population for City of London // boost cities and countries when sorting results by population doc.add(new LongField(SORT_POP.key(), geoName.getPopulation() * 11, Field.Store.YES)); } else { // don't boost anything else, because people rarely talk about other stuff // (e.g., Washington State's population is more than 10x that of Washington, DC // but Washington, DC is mentioned far more frequently than Washington State) doc.add(new LongField(SORT_POP.key(), geoName.getPopulation(), Field.Store.YES)); } doc.add(new IntField(HISTORICAL.key(), IndexField.getBooleanIndexValue(geoName.getFeatureCode().isHistorical()), Field.Store.NO)); doc.add(new StringField(FEATURE_CODE.key(), geoName.getFeatureCode().name(), Field.Store.NO)); // create a unique Document for each name of this GeoName TextField nameField = new TextField(INDEX_NAME.key(), "", Field.Store.YES); doc.add(nameField); for (String name : names) { nameField.setStringValue(name); indexWriter.addDocument(doc); } } private void logUnresolved() { int unresolvedGeoCount = 0; Map<String, Integer> unresolvedCodeMap = new TreeMap<String, Integer>(); Map<String, Integer> missingCodeMap = new TreeMap<String, Integer>(); for (Map.Entry<String, Set<GeoName>> entry : unresolvedMap.entrySet()) { LOG.trace("{}: {} unresolved GeoNames", entry.getKey(), entry.getValue().size()); unresolvedGeoCount += entry.getValue().size(); FeatureCode code; switch (entry.getKey().split("\\.").length) { case 1: code = FeatureCode.PCL; break; case 2: code = FeatureCode.ADM1; break; case 3: code = FeatureCode.ADM2; break; case 4: code = FeatureCode.ADM3; break; case 5: code = FeatureCode.ADM4; break; default: LOG.error("Unexpected ancestry key: {}", entry.getKey()); code = FeatureCode.NULL; break; } if (missingCodeMap.containsKey(code.name())) { missingCodeMap.put(code.name(), missingCodeMap.get(code.name())+1); } else { missingCodeMap.put(code.name(), 1); } for (GeoName geo : entry.getValue()) { String featKey = String.format("%s:%s", geo.getFeatureClass(), geo.getFeatureCode()); if (unresolvedCodeMap.containsKey(featKey)) { unresolvedCodeMap.put(featKey, unresolvedCodeMap.get(featKey)+1); } else { unresolvedCodeMap.put(featKey, 1); } } } LOG.info("Found {} administrative divisions.", adminMap.size()); LOG.info("Found {} missing administrative keys.", unresolvedMap.size()); for (String code : missingCodeMap.keySet()) { LOG.info("{}: {}", code, missingCodeMap.get(code)); } LOG.info("{} total unresolved GeoNames", unresolvedGeoCount); for (String key : unresolvedCodeMap.keySet()) { LOG.trace("{}: {}", key, unresolvedCodeMap.get(key)); } } /** * Turns a GeoNames gazetteer file into a Lucene index, and adds * some supplementary gazetteer records at the end. * * @param args not used * @throws IOException */ public static void main(String[] args) throws IOException { Options options = getOptions(); CommandLine cmd = null; CommandLineParser parser = new GnuParser(); try { cmd = parser.parse(options, args); } catch (ParseException pe) { LOG.error(pe.getMessage()); printHelp(options); System.exit(-1); } if (cmd.hasOption(HELP_OPTION)) { printHelp(options); System.exit(0); } String indexPath = cmd.getOptionValue(INDEX_PATH_OPTION, DEFAULT_INDEX_DIRECTORY); String[] gazetteerPaths = cmd.getOptionValues(GAZETTEER_FILES_OPTION); if (gazetteerPaths == null || gazetteerPaths.length == 0) { gazetteerPaths = DEFAULT_GAZETTEER_FILES; } boolean replaceIndex = cmd.hasOption(REPLACE_INDEX_OPTION); boolean fullAncestry = cmd.hasOption(FULL_ANCESTRY_OPTION); File idir = new File(indexPath); // if the index directory exists, delete it if we are replacing, otherwise // exit gracefully if (idir.exists() ) { if (replaceIndex) { LOG.info("Replacing index: {}", idir.getAbsolutePath()); FileUtils.deleteDirectory(idir); } else { LOG.info("{} exists. Remove the directory and try again.", idir.getAbsolutePath()); System.exit(-1); } } List<File> gazetteerFiles = new ArrayList<File>(); for (String gp : gazetteerPaths) { File gf = new File(gp); if (gf.isFile() && gf.canRead()) { gazetteerFiles.add(gf); } else { LOG.info("Unable to read Gazetteer file: {}", gf.getAbsolutePath()); } } if (gazetteerFiles.isEmpty()) { LOG.error("No Gazetteer files found."); System.exit(-1); } String altNamesPath = cmd.getOptionValue(ALTERNATE_NAMES_OPTION); File altNamesFile = altNamesPath != null ? new File(altNamesPath) : null; if (altNamesFile != null && !(altNamesFile.isFile() && altNamesFile.canRead())) { LOG.error("Unable to read alternate names file: {}", altNamesPath); System.exit(-1); } new IndexDirectoryBuilder(fullAncestry).buildIndex(idir, gazetteerFiles, altNamesFile); } private static Options getOptions() { Options options = new Options(); options.addOption(OptionBuilder .withLongOpt(HELP_OPTION) .withDescription("Print help") .create('?')); options.addOption(OptionBuilder .withLongOpt(FULL_ANCESTRY_OPTION) .withDescription("Store the gazetteer records for the full ancestry tree of each element." + " This will increase performance at the expense of a larger index.") .create()); options.addOption(OptionBuilder .withLongOpt(GAZETTEER_FILES_OPTION) .withDescription(String.format("The ':'-separated list of input Gazetteer files to parse. Default: %s", StringUtils.join(DEFAULT_GAZETTEER_FILES, ':'))) .hasArgs() .withValueSeparator(':') .create('i')); options.addOption(OptionBuilder .withLongOpt(ALTERNATE_NAMES_OPTION) .withDescription("When provided, the path to the GeoNames.org alternate names file for resolution of common and " + "short names for each location. If not provided, the default name for each location will be used.") .hasArg() .create()); options.addOption(OptionBuilder .withLongOpt(INDEX_PATH_OPTION) .withDescription(String.format("The path to the output index directory. Default: %s", DEFAULT_INDEX_DIRECTORY)) .hasArg() .create('o')); options.addOption(OptionBuilder .withLongOpt(REPLACE_INDEX_OPTION) .withDescription("Replace an existing index if it exists. If this option is not specified," + "index processing will fail if an index already exists at the specified location.") .create('r')); return options; } private static void printHelp(Options options) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("run", options, true); } private static class AlternateName implements Comparable<AlternateName> { private final int geonameId; private final String name; private final String lang; private final boolean preferredName; private final boolean shortName; public AlternateName(final String line) { String[] fields = line.split("\t"); geonameId = Integer.parseInt(fields[ALT_NAMES_ID_FIELD]); lang = fields[ALT_NAMES_LANG_FIELD]; name = fields[ALT_NAMES_NAME_FIELD]; preferredName = fields.length > ALT_NAMES_PREFERRED_FIELD && ALT_NAMES_TRUE.equals(fields[ALT_NAMES_PREFERRED_FIELD].trim()); shortName = fields.length > ALT_NAMES_SHORT_FIELD && ALT_NAMES_TRUE.equals(fields[ALT_NAMES_SHORT_FIELD].trim()); } public boolean isEnglish() { return ISO2_ENGLISH.equalsIgnoreCase(lang) || ISO3_ENGLISH.equalsIgnoreCase(lang); } public boolean isPrefOrShort() { return preferredName || shortName; } @Override public int compareTo(final AlternateName other) { int comp = geonameId - other.geonameId; comp = comp == 0 ? Boolean.compare(preferredName, other.preferredName) : comp; comp = comp == 0 ? Boolean.compare(shortName, other.shortName) : comp; comp = comp == 0 ? name.compareTo(other.name) : comp; return comp; } /** * Get the "best" alternate name for the target GeoName. The best name * is selected in the following order: * * 1. non-null * 2. preferred AND short * 3. preferred only * 4. short only * 5. this * * Note that if the preferred and short name flags are identical, this method * returns the object on which it was called. * * @param other the object to compare to * @return the "best" AlternateName determined by the criteria listed above */ public AlternateName bestName(final AlternateName other) { if (other == null) { return this; } // if one name is preferred and the other is not, use the preferred name int comp = Boolean.compare(preferredName, other.preferredName); // if preferred is the same, use a short name over a non-short name comp = comp != 0 ? Boolean.compare(shortName, other.shortName) : comp; // if all things are still equal, use this return comp >= 0 ? this : other; } } }