/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.entityhub.indexing.geonames;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import org.apache.stanbol.entityhub.core.model.InMemoryValueFactory;
import org.apache.stanbol.entityhub.indexing.core.EntityProcessor;
import org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig;
import org.apache.stanbol.entityhub.indexing.geonames.AlternateLabelProcessor.FeatureName.NameType;
import org.apache.stanbol.entityhub.servicesapi.model.Representation;
import org.apache.stanbol.entityhub.servicesapi.model.Text;
import org.apache.stanbol.entityhub.servicesapi.model.ValueFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * {@link EntityProcessor} that adds the labels of the geonames.org
 * "alternate names" dump to the processed {@link Representation}s.
 * <p>
 * {@link #initialise()} reads the whole alternate names file into an in-memory
 * map (geoname ID -&gt; list of {@link FeatureName}s). {@link #process(Representation)}
 * then looks up the names of the processed feature by the integer ID stored in
 * the {@link GeonamesPropertyEnum#idx_id} field and adds them as alternate
 * names, official names, short names, colloquial names, postal codes and
 * wikipedia links.
 */
public class AlternateLabelProcessor implements EntityProcessor {

    /** Configuration key used to set the location of the alternate names file. */
    private static final String PARAM_ALTERNATE_LABELS = "alt-labels";
    /** File name (within the default source folder) used if nothing is configured. */
    private static final String ALTERNATE_LABELS_FILE = "alternateNames.zip";
    /** Name of the entry searched (ignoring case) within a zipped alternate names file. */
    private static final String ARCHIVE_ENTRY_NAME = "alternatenames.txt";
    /** Charset used to decode the alternate names file. */
    private static final Charset UTF8 = Charset.forName("utf-8");

    private static final Logger log = LoggerFactory.getLogger(AlternateLabelProcessor.class);

    private final ValueFactory vf = InMemoryValueFactory.getInstance();

    /**
     * Names for features. This includes also postal codes, abbreviations,
     * airport codes and so on.
     * <p>
     * Two instances are equal if they have the same label ID.
     * @author westei
     *
     */
    public static final class FeatureName {
        /**
         * The kind of a label, derived from the language column of the
         * alternate names file (see {@link FeatureName#parseType(String)}).
         */
        enum NameType {
            naturalLanguage,
            postal,
            link,
            abbreviation,
            airportCode,
            unknown
        }

        /** Value of a flag column that marks the flag as set. */
        private static final String TRUE = "1";

        private final NameType type;
        private final int labelID;
        private final Integer geonameID;
        private final String name;
        private final String lang;
        private final boolean preferred;
        private final boolean shortName;
        private final boolean colloquial;
        private final boolean historic;

        /**
         * Parses a single line of the alternate names file. The tokens
         * provided by the {@link LineTokenizer} are consumed in this order:
         * label ID, geoname ID, language (or label type) code, name and the
         * four flags preferred, short, colloquial and historic.
         * @param line the line to parse
         * @throws NumberFormatException if the label ID or the geoname ID is
         * not an integer
         * @throws IllegalStateException if the line does not provide a name
         */
        protected FeatureName(String line){
            LineTokenizer t = new LineTokenizer(line);
            labelID = Integer.parseInt(t.next()); //first element is the labelID
            geonameID = Integer.parseInt(t.next());
            String language = t.next();
            //only two or three letter codes are accepted as language of the label
            boolean validLangCode = language != null
                    && (language.length() == 2 || language.length() == 3);
            lang = validLangCode ? language : null;
            type = parseType(language);
            name = t.next();
            if(name == null){
                throw new IllegalStateException(" Unable to parse name from line:" + line);
            }
            //NOTE: the order of the following calls is the column order of the flags
            preferred = isSet(t.next());
            shortName = isSet(t.next());
            colloquial = isSet(t.next());
            historic = isSet(t.next());
        }

        /**
         * Determines the type of a label based on the value of the language
         * column.
         * @param language the value of the language column (may be <code>null</code>)
         * @return {@link NameType#naturalLanguage} for missing values and
         * values with up to three chars; the special type for the codes
         * "post", "link", "abbr", "iata", "icao" and "faac";
         * {@link NameType#unknown} for everything else
         */
        private static NameType parseType(String language){
            if(language == null || language.length() <= 3){
                return NameType.naturalLanguage;
            }
            if("post".equals(language)){
                return NameType.postal;
            }
            if("link".equals(language)){
                return NameType.link;
            }
            if("abbr".equals(language)){
                return NameType.abbreviation;
            }
            if("iata".equals(language) || "icao".equals(language) || "faac".equals(language)){
                return NameType.airportCode;
            }
            // any other code longer than three chars,
            // e.g. fr_1793 for French Revolution names
            return NameType.unknown;
        }

        /**
         * Checks if the parsed value of a flag column marks the flag as set.
         * @param value the value of the column (may be <code>null</code>)
         * @return <code>true</code> only if the value equals "1"
         */
        private static boolean isSet(String value){
            return TRUE.equals(value);
        }

        public final Integer getGeonameID() {
            return geonameID;
        }
        public final String getName() {
            return name;
        }
        /**
         * @return the two or three letter language code or <code>null</code>
         * if the language column did not contain such a code
         */
        public final String getLang() {
            return lang;
        }
        public final boolean isPreferred() {
            return preferred;
        }
        public final boolean isShortName() {
            return shortName;
        }
        public final boolean isColloquial() {
            return colloquial;
        }
        public final boolean isHistoric() {
            return historic;
        }
        public final boolean isNaturalLanguageLabel(){
            return type == NameType.naturalLanguage;
        }
        public final NameType getLabelType(){
            return type;
        }
        @Override
        public final boolean equals(Object obj) {
            return obj instanceof FeatureName && ((FeatureName)obj).labelID == labelID;
        }
        @Override
        public final int hashCode() {
            return labelID;
        }
        @Override
        public final String toString(){
            return name+(lang!=null?('@'+lang):"");
        }
    }

    /**
     * The names read by {@link #initialise()} with the geoname ID as key.
     * Preferred names are located at the start of the lists.
     */
    private final Map<Integer,List<FeatureName>> featureNames = new HashMap<Integer,List<FeatureName>>();
    private File alternateNamesFile;
    private IndexingConfig indexingConfig;

    /**
     * Reads the {@link IndexingConfig} and the location of the alternate names
     * file from the parsed configuration. If no value is present for the
     * "alt-labels" key the default file within the geonames source folder
     * is used.
     * @param config the configuration
     */
    @Override
    public void setConfiguration(Map<String,Object> config) {
        indexingConfig = (IndexingConfig)config.get(IndexingConfig.KEY_INDEXING_CONFIG);
        Object value = config.get(PARAM_ALTERNATE_LABELS);
        if(value == null){ //if not set use the default
            value = GeonamesConstants.DEFAULT_SOURCE_FOLDER_NAME + ALTERNATE_LABELS_FILE;
            log.info("No Geonames.org alternate label source set use the default: {}",value);
        }
        alternateNamesFile = indexingConfig.getSourceFile(value.toString());
    }

    @Override
    public boolean needsInitialisation() {
        return true;
    }

    /**
     * Reads all names of the configured alternate names file into memory.
     * Files ending with ".zip" are treated as archives that need to contain
     * an "alternatenames.txt" entry; all other files are read directly.
     * The file (and the archive) are closed before this method returns,
     * regardless of whether reading succeeded.
     * @throws IllegalArgumentException if the configured file does not exist
     * or can not be opened
     * @throws IllegalStateException if the archive does not contain the
     * expected entry or if reading the data fails
     */
    @Override
    public void initialise() {
        if(!alternateNamesFile.isFile()){
            throw new IllegalArgumentException("The configured geonames.org alternate label file "
                + alternateNamesFile + " does not exist. Please change the configuration or copy the "
                + "file to that location.");
        }
        ZipFile archive = null;
        BufferedReader reader = null;
        try {
            if(alternateNamesFile.getName().endsWith(".zip")){
                archive = openArchive();
                reader = openArchiveEntry(archive);
            } else {
                reader = openPlainFile();
            }
            readFeatureNames(reader);
        } finally {
            //all data are kept in memory, so the file handles must not stay open
            closeSources(reader, archive);
        }
    }

    /**
     * Opens the configured alternate names file as zip archive.
     */
    private ZipFile openArchive() {
        try {
            return new ZipFile(alternateNamesFile);
        } catch (IOException e) {
            //in the init we check if this is a file, exists and we can read ...
            // .. so throw a runtime exception here!
            throw new IllegalArgumentException("Unable to access geonames.org alternate names archive "
                + alternateNamesFile, e);
        }
    }

    /**
     * Searches the parsed archive for the alternate names entry and opens a
     * reader for it.
     */
    private BufferedReader openArchiveEntry(ZipFile archive) {
        ZipEntry entry = null;
        Enumeration<? extends ZipEntry> entries = archive.entries();
        while(entry == null && entries.hasMoreElements()){
            ZipEntry candidate = entries.nextElement();
            if(!candidate.isDirectory() && candidate.getName().equalsIgnoreCase(ARCHIVE_ENTRY_NAME)){
                entry = candidate;
            }
        }
        if(entry == null){
            throw new IllegalStateException("Archive with alternate Names does not contain the \"alternateNames.txt\" file!");
        }
        log.info("read alternate names from Archive Entry "+entry.getName());
        try {
            return new BufferedReader(new InputStreamReader(archive.getInputStream(entry), UTF8));
        } catch (IOException ex) {
            throw new IllegalArgumentException("Unable to read Entry '" + entry.getName()
                + "' from alternate names file "+alternateNamesFile,ex);
        }
    }

    /**
     * Opens a reader for an alternate names file that is not an archive.
     */
    private BufferedReader openPlainFile() {
        try {
            return new BufferedReader(new InputStreamReader(new FileInputStream(alternateNamesFile), UTF8));
        } catch (FileNotFoundException e) {
            throw new IllegalArgumentException("Unable to read alternate names from file "
                + alternateNamesFile,e);
        }
    }

    /**
     * Parses all lines provided by the reader and adds the names to
     * {@link #featureNames}. Lines that can not be parsed are logged and
     * skipped. Statistics about the read names are logged at the end.
     */
    private void readFeatureNames(BufferedReader reader) {
        int lineCount = 0;
        //int[1] used as mutable counter
        EnumMap<NameType, int[]> labelTypeCounts = new EnumMap<NameType, int[]>(NameType.class);
        for(NameType nameType : NameType.values()){
            labelTypeCounts.put(nameType, new int[]{0});
        }
        long start = System.currentTimeMillis();
        try {
            String line;
            while((line = reader.readLine()) != null){
                FeatureName name;
                try {
                    name = new FeatureName(line);
                } catch (RuntimeException e) {
                    log.warn("Unable to parse Featurname for line: "+line,e);
                    continue;
                }
                List<FeatureName> names = featureNames.get(name.geonameID);
                if(names == null){
                    names = new ArrayList<FeatureName>();
                    featureNames.put(name.geonameID, names);
                }
                if(name.isPreferred()){
                    names.add(0, name); //preferred names first
                } else {
                    names.add(name);
                }
                lineCount++;
                labelTypeCounts.get(name.getLabelType())[0]++; //increase the count for this type!
                if(log.isDebugEnabled() && lineCount%10000==0){
                    log.debug("processed "+lineCount+" labels");
                }
            }
        } catch (IOException e) {
            throw new IllegalStateException("Unable to read data from alternate label file "
                +alternateNamesFile,e);
        }
        log.info("read "+lineCount+" alternate Names for "+featureNames.size()+" Features in "+(System.currentTimeMillis()-start)+"ms");
        for(Entry<NameType, int[]> count : labelTypeCounts.entrySet()){
            log.info(" "+count.getKey().toString()+": "+count.getValue()[0]);
        }
    }

    /**
     * Closes the reader and the archive (if present). Failures are only
     * logged, so that they can not hide an exception thrown while reading.
     */
    private void closeSources(BufferedReader reader, ZipFile archive) {
        if(reader != null){
            try {
                reader.close();
            } catch (IOException e) {
                log.warn("Unable to close reader for alternate names file "+alternateNamesFile,e);
            }
        }
        if(archive != null){
            try {
                archive.close();
            } catch (IOException e) {
                log.warn("Unable to close alternate names archive "+alternateNamesFile,e);
            }
        }
    }

    /**
     * Releases the names kept in memory and the reference to the alternate
     * names file.
     */
    @Override
    public void close() {
        featureNames.clear();
        alternateNamesFile = null;
    }

    /**
     * Adds the names read for the feature with the ID of the parsed
     * representation. Natural language labels are added as official
     * (preferred) or alternate names and, depending on their flags,
     * additionally as short and/or colloquial names. Postal codes and links
     * containing "wikipedia.org" are added to their own fields; labels of all
     * other types are ignored. The names of a feature are removed from the
     * in-memory map, so they are only available for the first call with a
     * given ID.
     * @param source the representation to process
     * @return the parsed representation (unchanged if it has no integer ID
     * or if no names are known for the ID)
     */
    @Override
    public Representation process(Representation source) {
        Integer id = source.getFirst(GeonamesPropertyEnum.idx_id.toString(), Integer.class);
        if(id == null){
            log.warn("The <{}> field MUST contain the integer ID!",GeonamesPropertyEnum.idx_id);
            return source;
        }
        List<FeatureName> alternateNames = featureNames.remove(id); //use remove, because we do not need it a 2nd time!
        if(alternateNames == null){
            return source;
        }
        List<Text> altList = new ArrayList<Text>(alternateNames.size());
        List<Text> officialList = new ArrayList<Text>(alternateNames.size());
        List<String> postalCodes = new ArrayList<String>();
        List<URL> wikipediaLinks = new ArrayList<URL>();
        List<Text> shortNames = new ArrayList<Text>();
        List<Text> colloquialNames = new ArrayList<Text>();
        for(FeatureName name : alternateNames){
            if(name.isNaturalLanguageLabel()){
                Text act = vf.createText(name.getName(),name.getLang());
                if(name.isPreferred()){
                    officialList.add(act);
                } else {
                    altList.add(act);
                }
                if(name.isShortName()){
                    shortNames.add(act);
                }
                if(name.isColloquial()){
                    colloquialNames.add(act);
                }
            } else if(name.getLabelType() == NameType.postal){
                postalCodes.add(name.getName());
            } else if(name.getLabelType() == NameType.link){
                if(name.getName().contains("wikipedia.org")){
                    try {
                        wikipediaLinks.add(new URL(name.getName()));
                    } catch (MalformedURLException e) {
                        //invalid links are data errors: log them and skip the label
                        log.warn("Unable to parse URL for link label "+name.getName());
                    }
                }
            }
        }
        addIfNotEmpty(source, GeonamesPropertyEnum.gn_alternateName.toString(), altList);
        addIfNotEmpty(source, GeonamesPropertyEnum.gn_officialName.toString(), officialList);
        addIfNotEmpty(source, GeonamesPropertyEnum.gn_postalCode.toString(), postalCodes);
        addIfNotEmpty(source, GeonamesPropertyEnum.gn_wikipediaArticle.toString(), wikipediaLinks);
        addIfNotEmpty(source, GeonamesPropertyEnum.gn_shortName.toString(), shortNames);
        addIfNotEmpty(source, GeonamesPropertyEnum.gn_colloquialName.toString(), colloquialNames);
        return source;
    }

    /**
     * Adds the values to the field of the representation, but only if at
     * least one value is present.
     */
    private static void addIfNotEmpty(Representation source, String field, List<?> values) {
        if(!values.isEmpty()){
            source.add(field, values);
        }
    }
}