/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.stanbol.entityhub.indexing.geonames; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; import org.apache.stanbol.entityhub.core.model.InMemoryValueFactory; import org.apache.stanbol.entityhub.indexing.core.EntityProcessor; import org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig; import org.apache.stanbol.entityhub.servicesapi.model.Reference; import org.apache.stanbol.entityhub.servicesapi.model.Representation; import org.apache.stanbol.entityhub.servicesapi.model.ValueFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class HierarchyProcessor implements EntityProcessor { public static final String PARAM_ADMIN1 = "admin1"; public static final String PARAM_ADMIN2 = "admin2"; public static final String PARAM_HIERARCHY = "hierarchy"; public static final String PARAM_COUNTRY_INFO = "country-info"; public static final String DEFAULT_ADMIN1_FILE = "admin1CodesASCII.txt"; public static final String DEFAULT_ADMIN2_FILE = "admin2Codes.txt"; public static final String DEFAULT_HIERARCHY_FILE = "hierarchy.zip"; public static final String DEFAULT_COUNTRY_INFO_FILE = "countryInfo.txt"; private final Logger log = LoggerFactory.getLogger(HierarchyProcessor.class); private File countryInfoFile; private List<File> adminCodesFiles; private File hierarchyFile; private final Map<String, Integer> adminCode2featureId = new HashMap<String, Integer>(); private final Map<Integer,Collection<Integer>> parentFeature = new HashMap<Integer, Collection<Integer>>(); private final Map<Integer,Collection<Integer>> adminParentFeature = new HashMap<Integer, Collection<Integer>>(); private final Map<String, Integer> countryCode2featureId = new HashMap<String, Integer>(); private IndexingConfig indexingConfig; private final ValueFactory vf = InMemoryValueFactory.getInstance(); public static final int COUNTRY_ID_INDEX = 17; @Override public void setConfiguration(Map<String,Object> config) { indexingConfig = (IndexingConfig)config.get(IndexingConfig.KEY_INDEXING_CONFIG); adminCodesFiles = Arrays.asList( getConfiguredFile(config, PARAM_ADMIN1, DEFAULT_ADMIN1_FILE), getConfiguredFile(config, PARAM_ADMIN2, DEFAULT_ADMIN2_FILE)); hierarchyFile = getConfiguredFile(config, PARAM_HIERARCHY, DEFAULT_HIERARCHY_FILE); countryInfoFile = getConfiguredFile(config, PARAM_COUNTRY_INFO, DEFAULT_COUNTRY_INFO_FILE); } /** * @param value */ private File getConfiguredFile(Map<String,Object> config, String param, String defaultValue) { Object value = config.get(param); if(value == null){ //if not set use the default value = GeonamesConstants.DEFAULT_SOURCE_FOLDER_NAME + defaultValue; log.info("No Geonames.org Admin1 code file configured use the default: {}",value); } File file = indexingConfig.getSourceFile(value.toString()); return file; } @Override public boolean needsInitialisation() { return true; } @Override public void initialise() { for(File af : adminCodesFiles){ if(!af.isFile()){ throw new IllegalArgumentException("The configured AdminCodes file "+ af +"does not exist. Change the configureation " + "or copy the file to this location!"); } } if(!hierarchyFile.isFile()){ throw new IllegalArgumentException("The configured hierarchy data file "+ hierarchyFile +"does not exist. Change the configureation " + "or copy the file to this location!"); } if(!countryInfoFile.isFile()){ throw new IllegalArgumentException("The configured hierarchy data file "+ countryInfoFile +"does not exist. Change the configureation " + "or copy the file to this location!"); } try { readAdminCodes(); } catch (IOException e) { throw new IllegalStateException("Unable to read geonames.org administration codes",e); } try { readHierarchy(); } catch (IOException e) { throw new IllegalStateException("Unable to read geonames.org hierarchy codes",e); } } /** * There are two sources of hierarchy in the geonames.org dumps. <p> * First the Admin Region Codes stored in the main table in combination with * the CountryInfo and the AdminRegion infos for the first two levels. This * uses the ISO country code and the additional number for linking the * Regions. Second the Hierarchy table providing parentID, childId, [type] * information. This uses featureIDs for linking. <p> * This Method reads the first data source into memory. For the country * related information it calls {@link #readCountryInfos()}. * @throws IOException */ private void readAdminCodes() throws IOException{ long start = System.currentTimeMillis(); //first read adminCodes based on the countryInfos int lineCount = readCountryInfos(); for(File adminCodeFile : adminCodesFiles){ BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(adminCodeFile), Charset.forName("utf-8"))); String line; while((line = reader.readLine()) != null){ if(line.indexOf('#')!=0 && line.length()>0){ //# is used as comment lineCount++; //no tokenizer this time ... need only first and last column! String code = line.substring(0, line.indexOf('\t')); Integer geonamesId = Integer.valueOf(line.substring(line.lastIndexOf('\t')+1)); adminCode2featureId.put(code, geonamesId); } } reader.close(); reader = null; } log.info("read "+lineCount+" AdminCodes in "+(System.currentTimeMillis()-start)+"ms"); } private int readCountryInfos() throws IOException{ BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(countryInfoFile), Charset.forName("utf-8"))); String line; int lineCount = 0; while((line = reader.readLine()) != null){ if(line.indexOf('#')!=0 && line.length()>0){ //# is used as comment LineTokenizer t = new LineTokenizer(line); String code = null; Integer geonamesId = null; int i=1; for(;t.hasNext();i++){ String actToken = t.next(); if(i==1){ code = actToken; } if(i == HierarchyProcessor.COUNTRY_ID_INDEX && actToken != null){ geonamesId = Integer.valueOf(actToken); break; } } if(i == HierarchyProcessor.COUNTRY_ID_INDEX && code != null && geonamesId != null){ adminCode2featureId.put(code,geonamesId); countryCode2featureId.put(code,geonamesId); lineCount++; } else { log.warn("Unable to parse countryInfo from Line "+line); } } } reader.close(); reader = null; return lineCount; } /** * There are two sources of hierarchy in the geonames.org dumps. <p> * First the Admin Region Codes stored in the main table in combination with * the CountryInfo and the AdminRegion infos for the first two levels. This * uses the ISO country code and the additional number for linking the * Regions. Second the Hierarchy table providing parentID, childId, [type] * information. This uses featureIDs for linking. <p> * This Method processes the second datasource and stores the child -> * parents mappings in memory. Administrative hierarchies are stored in a * different map. Note also that also for Administrative regions there are * some cases where a child has more than one parent. * @throws IOException */ private void readHierarchy() throws IOException{ BufferedReader reader; if(hierarchyFile.getName().endsWith(".zip")){ ZipFile hierarchyArchive; try { hierarchyArchive = new ZipFile(hierarchyFile); } catch (IOException e) { //in the init we check if this is a file, exists and we can read ... // .. so throw a runtime exception here! throw new IllegalArgumentException("Unable to access geonames.org DB Dump hirarchy File",e); } Enumeration<? extends ZipEntry> e = hierarchyArchive.entries(); ZipEntry entry = null; while(e.hasMoreElements()){ ZipEntry cur = e.nextElement(); if(!cur.isDirectory() && cur.getName().equalsIgnoreCase("hierarchy.txt")){ entry = cur; break; } } if(entry ==null){ throw new IllegalStateException("Archive with alternate Names does not contain the \"alternateNames.txt\" file!"); } else { log.info("read hierarchy data fromArchive Entry "+entry.getName()); reader = new BufferedReader(new InputStreamReader(hierarchyArchive.getInputStream(entry), Charset.forName("utf-8"))); } } else { reader = new BufferedReader(new InputStreamReader(new FileInputStream(hierarchyFile), Charset.forName("utf-8"))); } String line; int lineCount=0; long start = System.currentTimeMillis(); while((line = reader.readLine()) != null){ lineCount++; LineTokenizer t = new LineTokenizer(line); Integer parent = Integer.valueOf(t.next()); Integer child = Integer.valueOf(t.next()); String type; if(t.hasNext()){ type = t.next(); } else { type = null; } if("ADM".equals(type)){ Collection<Integer> parents = adminParentFeature.get(child); if(parents == null){ parents = new ArrayList<Integer>(1); //there are only some exceptions with multiple parents adminParentFeature.put(child, parents); } parents.add(parent); } else { Collection<Integer> parents = parentFeature.get(child); if(parents == null){ parents = new ArrayList<Integer>(3); parentFeature.put(child, parents); } parents.add(parent); } } log.info(String.format("read %d hierarchy relations in %dms",lineCount,System.currentTimeMillis()-start)); } @Override public void close() { // TODO Auto-generated method stub } @Override public Representation process(Representation source) { Integer id = source.getFirst(GeonamesPropertyEnum.idx_id.toString(), Integer.class); if(id == null){ log.warn("The <{}> field MUST contain the integer ID!",GeonamesPropertyEnum.idx_id); return source; } //now add the parents based on the codes parsed from the main data addParents(source, id, new String[]{ source.getFirst(GeonamesPropertyEnum.idx_CC.toString(),String.class), source.getFirst(GeonamesPropertyEnum.idx_ADM1.toString(),String.class), source.getFirst(GeonamesPropertyEnum.idx_ADM2.toString(),String.class), source.getFirst(GeonamesPropertyEnum.idx_ADM3.toString(),String.class), source.getFirst(GeonamesPropertyEnum.idx_ADM4.toString(),String.class) }); return source; } /** * Recursive method the finds all parents and adds the childs of the current * node (not all childs, but only those of the current tree) * @param id the id of the lower level * @param parents the set used to add all the parents/child mappings */ private void getParents(Integer id, Map<Integer,Collection<Integer>> parents){ Collection<Integer> current = parentFeature.get(id); if(current != null){ for(Integer parent : current){ Collection<Integer> childs = parents.get(parent); if(childs == null){ childs = new HashSet<Integer>(); parents.put(parent, childs); } if(childs.add(id)){ getParents(parent, parents); } } } current = adminParentFeature.get(id); if(current != null){ for(Integer parent : current){ Collection<Integer> childs = parents.get(parent); if(childs == null){ childs = new HashSet<Integer>(); parents.put(parent, childs); } if(childs.add(id)){ getParents(parent, parents); } } } } private void addParents(Representation doc,Integer id,String[] adminCodes){ Integer[] adminIds = new Integer[5]; //now process the admin Codes (including the country at index 0) for(int i=0;i<adminCodes.length;i++){ if(adminCodes[i] != null && !adminCodes[i].equals("00")){ //00 is used to indicate not known adminIds[i] =adminCode2featureId.get(adminCodes[i]); //might also add null! } } //now get the direct parents Map<Integer,Collection<Integer>> parents = new HashMap<Integer, Collection<Integer>>(); getParents(id,parents); //add all parents (NOW done by the field mappings configuration) //doc.add(GeonamesPropertyEnum.gn_parentFeature.toString(), getFeatureReferences(parents.keySet())); //get admin hierarchy Set<Integer> parentLevel; //add country if(adminIds[0] != null){ doc.add(GeonamesPropertyEnum.gn_parentCountry.toString(), vf.createReference( new StringBuilder(GeonamesConstants.GEONAMES_RESOURCE_NS).append(adminIds[0]).append('/').toString())); parentLevel = Collections.singleton(adminIds[0]); } else { parentLevel = Collections.emptySet(); } //add the admin codes for the 4 levels parentLevel = addAdminLevel(doc, GeonamesPropertyEnum.gn_parentADM1, parents, parentLevel, adminIds[1]); parentLevel = addAdminLevel(doc, GeonamesPropertyEnum.gn_parentADM2, parents, parentLevel, adminIds[2]); parentLevel = addAdminLevel(doc, GeonamesPropertyEnum.gn_parentADM3, parents, parentLevel, adminIds[3]); parentLevel = addAdminLevel(doc, GeonamesPropertyEnum.gn_parentADM4, parents, parentLevel, adminIds[4]); } /** * This Method combines the information of <ul> * <li> the adminIds originating form the information in the main feature table of geonames * <li> hierarchy information originating from the hierarchy table. * </ul> * and combines them to the full admin regions hierarchy.<br> * This code would be much simpler if one would trust one of the two data source. * However first tests have shown, that both structures contain some errors! * @param doc The doc to add the data * @param property the property used for the level * @param parents the parent->child mappings for the current geonames feature * @param parentLevel the regions of the parent level (should be only one, but sometimes there are more). * This data are based on the hierarchy table. * @param adminId the region as stored in the geonames main table (only available for level 1 and 2) * @return the regions of this level (should be only one, but sometimes there are more) */ private Set<Integer> addAdminLevel(Representation doc,GeonamesPropertyEnum property, Map<Integer,Collection<Integer>> parents,Set<Integer> parentLevel, Integer adminId){ Set<Integer> currentLevel = new HashSet<Integer>(); //first add the admin1 originating from the admin info file if(adminId!=null){ currentLevel.add(adminId); } for(Integer parent : parentLevel){ //second add the admin1 via the childs of the country Collection<Integer> tmp = parents.get(parent); if(tmp != null){ currentLevel.addAll(tmp); } } if(!currentLevel.isEmpty()){ //now add all the adm1 we found doc.add(property.toString(), getFeatureReferences(currentLevel)); if(currentLevel.size()>1){ //write warning if there are multiple ids log.warn("Multiple {} for ID {} (ids: {})",new Object[]{ property.name(),doc.getId(),currentLevel.toString()}); } } return currentLevel; } private Collection<Reference> getFeatureReferences(Collection<Integer> ids){ List<Reference> refs = new ArrayList<Reference>(ids.size()); for(Integer id : ids){ if(id != null){ refs.add(vf.createReference( new StringBuilder(GeonamesConstants.GEONAMES_RESOURCE_NS) .append(id).append('/').toString())); } } return refs; } }