/******************************************************************************* * Copyright (c) 2014 Open Door Logistics (www.opendoorlogistics.com) * All rights reserved. This program and the accompanying materials * are made available under the terms of the GNU Lesser Public License v2.1 * which accompanies this distribution, and is available at * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html ******************************************************************************/ package com.opendoorlogistics.components.geocode.postcodes.builder; import java.io.ByteArrayOutputStream; import java.io.DataOutputStream; import java.io.File; import java.io.IOException; import java.math.BigDecimal; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Scanner; import java.util.TreeSet; import org.mapdb.DB; import org.mapdb.DBMaker; import com.opendoorlogistics.components.geocode.postcodes.impl.CountryConfigs; import com.opendoorlogistics.components.geocode.postcodes.impl.PCConstants; import com.opendoorlogistics.components.geocode.postcodes.impl.PCRecord; import com.opendoorlogistics.components.geocode.postcodes.impl.PCRecord.StrField; import com.opendoorlogistics.components.geocode.postcodes.impl.PCSerialiser; import com.opendoorlogistics.core.utils.KeyboardInput; import com.opendoorlogistics.core.utils.strings.Strings; final public class GDFFileBuilder { private boolean log=true; private static class CountryRec{ private CountryRec(String countryCode) { this.countryProcessor = CountryConfigs.getProcessor(countryCode); this.countryCode = countryCode; } final CountryConfigs.CountryProcessor countryProcessor; final String countryCode; DB db = null; String indexFilename; String tmpIndexFilename; HashMap<String, Integer> strToInt= new HashMap<>(); Map<Integer, String> intToStr; List<Map<String, byte[]>> pcMaps = new ArrayList<>(); void addStrings(PCRecord rec){ for (PCRecord.StrField fld : PCRecord.StrField.values()) { String s = rec.getField(fld); if(s!=null){ addString(s); } } } private void addString(String s) { Integer id = strToInt.get(s); if (id == null) { id = strToInt.size(); strToInt.put(s, id); intToStr.put(id, s); } } } private void finishProcessing(CountryRec rec, boolean compact) { // create final file CountryRec finalRec = new CountryRec(rec.countryCode); createEmptyDb(new File(rec.indexFilename),finalRec); // copy strings into new file for(Map.Entry<Integer, String> entry : rec.intToStr.entrySet()){ finalRec.intToStr.put(entry.getKey(), entry.getValue()); } // copy and merge into new file assert rec.pcMaps.size() == finalRec.pcMaps.size(); assert rec.pcMaps.size() == finalRec.countryProcessor.nbLevels(); for(int i =0 ; i < rec.pcMaps.size() ;i++){ for(Map.Entry<String,byte[]> entry : rec.pcMaps.get(i).entrySet()){ // merge all entries List<PCRecord> list = PCSerialiser.multiDeserialise(entry.getValue(),rec.intToStr); PCRecord merged = PCRecord.merge(list); merged.setField(StrField.POSTAL_CODE, entry.getKey().toUpperCase()); // convert merged to bytes ByteArrayOutputStream bytes = new ByteArrayOutputStream(); DataOutputStream dos = new DataOutputStream(bytes); PCSerialiser.serialize(merged, rec.strToInt, dos); byte[] byteArray = bytes.toByteArray(); // save bytes finalRec.pcMaps.get(i).put(entry.getKey(), byteArray); } } // close and delete temporary if(rec.db!=null){ rec.db.close(); } new File(rec.tmpIndexFilename).delete(); new File(rec.tmpIndexFilename + ".p").delete(); finalRec.db.commit(); finalRec.db.close(); // compact and close the final file if(compact){ // try sleeping for a bit as we're getting file lock issues try { Thread.sleep(250); } catch (InterruptedException e) { e.printStackTrace(); } log("Compacting " + rec.indexFilename); finalRec.db = DBMaker.newFileDB(new File(rec.indexFilename)).make(); finalRec.db.compact(); finalRec.db.commit(); finalRec.db.close(); } } /** * Build the binary lookup file(s) from the geonames postcode files * * @param geonamesFile * @param outputdirectory */ public void buildFromGeonamesFile(String geonamesFile,String[] ignoreCountries,boolean doCompact, String outputdirectory) { // get ignore set TreeSet<String> ignoreSet = new TreeSet<>(); if(ignoreCountries!=null){ for(String country : ignoreCountries){ country = Strings.std(country); ignoreSet.add(country); } } // assume file sorted by country HashSet<String> processedCountries = new HashSet<>(); String currentCountry = null; Scanner scanner = null; CountryRec countryRec = null; ByteArrayOutputStream baos = new ByteArrayOutputStream(); try { int lineNb = 0; int skipped = 0; scanner = new Scanner(new File(geonamesFile), "UTF-8"); while (scanner.hasNextLine()) { String line = scanner.nextLine(); // split, including trailing and empty tabs.... String[] split = line.split("\t", -1); if (split.length < 11) { throw new RuntimeException("Invalid geonames record on line " + lineNb + " : " + line); } // get the country and check for new country String cleanedCountry = Strings.std(split[0]); if (cleanedCountry.equals(currentCountry) == false) { // close old file if (countryRec != null) { finishProcessing(countryRec,doCompact); } // check if we should skip this country, if not then init the record countryRec=null; if(ignoreSet.contains(cleanedCountry)==false){ countryRec = initCountry(cleanedCountry, processedCountries, outputdirectory); } currentCountry = cleanedCountry; } // skip current country if flagged if(countryRec==null){ lineNb++; continue; } // create the pc object PCRecord pcRecord = createPCRecObject(split, lineNb); if(pcRecord!=null){ pcRecord = countryRec.countryProcessor.processRecord(pcRecord); if(pcRecord==null){ log("Country specific processing rejected line " + lineNb + " : " + Strings.toCommas(split)); } } if(pcRecord==null){ skipped++; } // save the strings if (pcRecord != null) { countryRec.addStrings(pcRecord); } // save to the map file for the different levels if (pcRecord != null) { List<String> levels = countryRec.countryProcessor.splitByLevels(pcRecord.getField(StrField.POSTAL_CODE),false); if(levels==null){ log("Found problem row at line " + lineNb + " : " + Strings.toCommas(split)); skipped++; }else{ for(int i =0 ; i < levels.size() ; i++){ // save to the map and also save the string as an int saveToPCMap(levels.get(i), pcRecord, countryRec.strToInt,baos, countryRec.pcMaps.get(i)); countryRec.addString(levels.get(i)); } } } // go to next line lineNb++; if(lineNb%25000==0){ log("Processed " + lineNb + " lines in file " + geonamesFile); } } log(lineNb, skipped); } catch (Throwable e) { throw new RuntimeException(e); } finally { if (countryRec!=null && countryRec.db != null) { finishProcessing(countryRec,doCompact); } if (scanner != null) { scanner.close(); } } log("Finished building GDF files"); } private void saveToPCMap(String pc, PCRecord pcRecord, Map<String, Integer> strToInt, ByteArrayOutputStream bytes ,Map<String, byte[]>outMap) throws IOException { // always standardise the key pc = Strings.std(pc); // get any pre-existing record and save it to the output stream bytes.reset(); byte[] current = outMap.get(pc); if (current != null) { bytes.write(current); } // serialise the current postcode and add to the bytes stream DataOutputStream dos = new DataOutputStream(bytes); PCSerialiser.serialize(pcRecord, strToInt, dos); // save the final bytes stream byte[] byteArray = bytes.toByteArray(); outMap.put(pc, byteArray); } private CountryRec initCountry(String newCountry, HashSet<String> processedCountries, String outputdirectory) { // update the current name and that we've started processing this country if (processedCountries.contains(newCountry)) { throw new RuntimeException("Input file is not ordered by country"); } processedCountries.add(newCountry); // create new db and other objects CountryRec ret = new CountryRec(newCountry); ret.indexFilename = outputdirectory + File.separator + newCountry + "." + PCConstants.DBFILE_EXTENSION; ret.tmpIndexFilename = ret.indexFilename + PCConstants.TEMP_FILE_EXTENSION; createEmptyDb(new File(ret.tmpIndexFilename), ret); log("Started processing " + newCountry + ", country has " + ret.pcMaps.size() + " postcode levels defined."); return ret; } private PCRecord createPCRecObject(String[] split, int lineNb) { PCRecord pc = new PCRecord(); for (PCRecord.StrField fld : PCRecord.StrField.values()) { pc.setField(fld, split[fld.ordinal()]); } int indx = PCRecord.StrField.values().length; if (split[indx].length() > 0 && split[indx + 1].length() > 0) { pc.setLatitude(new BigDecimal(split[indx++])); pc.setLongitude(new BigDecimal(split[indx++])); } else { // skip this record as has no geocode log("Found row with no geocodes, line " + lineNb + " : " + Strings.toCommas(split)); return null; } pc.setAccuracy((short) -1); if (indx < split.length) { String accuracy = split[indx]; if (accuracy.length() > 0) { pc.setAccuracy(Short.parseShort(accuracy)); } } return pc; } private void createEmptyDb(File outFile,CountryRec rec) { rec.db= DBMaker.newFileDB(outFile).closeOnJvmShutdown().transactionDisable().cacheSize(1000000).make(); // save the version rec.db.createAtomicString(PCConstants.DBNAME_VERSION, PCConstants.pcgeocode_file_version.toString()); // save the country code rec.db.createAtomicString(PCConstants.DBNAME_COUNTRYCODE, rec.countryCode); // create the string map, ensuring we always have empty string as this is used when merging rec.intToStr = rec.db.getHashMap(PCConstants.DBNAME_INT2ST); rec.addString(""); // create a pc map for each level for(int i = 0 ; i < rec.countryProcessor.nbLevels() ; i++){ Map<String,byte[]> map = rec.db.getHashMap(PCConstants.DBNAME_PCS+ Integer.toString(i)); rec.pcMaps.add(map); } } private void log(int lineNb, int skipped) { log("Processed " + lineNb + " rows" + ", skipped " + skipped + " rows"); } private void log(String s) { if(log){ System.out.println(s); } } // private void wipeDirectory(String directory) { // if (KeyboardInput.yesNoPrompt("Delete all files from directory \"" + directory + "\" (recommended)?", true)) { // File file = new File(directory); // if (!file.isDirectory()) { // throw new RuntimeException("Not a valid directory:" + directory); // } // for (File child : file.listFiles()) { // if (child.isDirectory() == false) { // child.delete(); // } // } // } // } public boolean isLog() { return log; } public void setLog(boolean log) { this.log = log; } }