/*******************************************************************************
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package de.tudarmstadt.ukp.lmf.transform.wikipedia;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Map;
import java.util.TreeMap;
/**
* Implementation of the {@link ILanguage} interface. This class should
* not be instanciated by yourself. Use the static methods to find registered
* instances - either by language code or name.
* @author Christian M. Meyer
*
*/
public class WikipediaLanguageMapper implements ILanguage {
protected String code;
protected String name;
protected String iso639_3;
protected String iso639_2b;
protected String iso639_2t;
protected String iso639_1;
protected WikipediaLanguageMapper(final String code, final String name,
final String iso639_3, final String iso639_2b,
final String iso639_2t, final String iso639_1) {
this.code = code;
this.name = name;
this.iso639_3 = iso639_3;
this.iso639_2b = iso639_2b;
this.iso639_2t = iso639_2t;
this.iso639_1 = iso639_1;
}
public String getCode() {
return code;
}
public String getName() {
return name;
}
public String getISO639_3() {
return iso639_3;
}
public String getISO639_2B() {
return iso639_2b;
}
public String getISO639_2T() {
return iso639_2t;
}
public String getISO639_1() {
return iso639_1;
}
public int compareTo(final ILanguage other) {
return (equals(other) ? 0 : code.compareTo(other.getCode()));
}
@Override
public boolean equals(final Object other) {
if (other == null || !(other instanceof ILanguage)) {
return false;
}
else {
return code.equals(((ILanguage) other).getCode());
}
}
@Override
public int hashCode() {
return code.hashCode();
}
@Override
public String toString() {
return name;
}
// -- Static interface --
/** The English language. */
public static final ILanguage ENGLISH = get("eng");
/** The German language. */
public static final ILanguage GERMAN = get("deu");
/** The Russian language. */
public static final ILanguage RUSSIAN = get("rus");
private static boolean initialized;
private static Map<String, ILanguage> languageIndex;
private static Map<String, String> additionalCodeIndex;
private static Map<String, String> additionalNameIndex;
// Avoid two threads interleaving!
private static synchronized void initialize() {
if (initialized) {
return;
}
languageIndex = new TreeMap<String, ILanguage>();
additionalCodeIndex = new TreeMap<String, String>();
additionalNameIndex = new TreeMap<String, String>();
try {
InputStream is = WikipediaLanguageMapper.class.getClassLoader().getResource("language_codes.txt").openStream();
//InputStream is = new FileInputStream(new File("language_codes.txt"));
InputStreamReader isr =new InputStreamReader(is,"UTF-8");
BufferedReader reader = new BufferedReader(isr);
try {
String line;
while ((line = reader.readLine()) != null) {
// Extract the fields.
String[] fields = new String[8];
int i;
int idx = 0;
do {
i = line.indexOf('\t');
if (i >= 0) {
fields[idx++] = line.substring(0, i);
line = line.substring(i + 1);
}
} while (i >= 0);
if (idx < 0) {
continue;
}
fields[idx] = line;
// Save the main language entry.
ILanguage language = new WikipediaLanguageMapper(fields[0], fields[1],
fields[2], fields[3], fields[4], fields[5]);
languageIndex.put(language.getCode(), language);
additionalCodeIndex.put(language.getCode(), language.getCode());
additionalNameIndex.put(language.getName().toLowerCase(), language.getCode());
// Save additional language codes.
String additionalCodes = fields[6];
if (!additionalCodes.isEmpty()) {
do {
i = additionalCodes.indexOf(';');
if (i >= 0) {
String addCode = additionalCodes.substring(0, i);
additionalCodeIndex.put(addCode, language.getCode());
additionalCodes = additionalCodes.substring(i + 1);
}
} while (i >= 0);
additionalCodeIndex.put(additionalCodes, language.getCode());
}
// Save additional language names.
String additionalNames = fields[7];
if (!additionalNames.isEmpty()) {
do {
i = additionalNames.indexOf(';');
if (i >= 0) {
String addName = additionalNames.substring(0, i);
additionalNameIndex.put(addName.toLowerCase(), language.getCode());
additionalNames = additionalNames.substring(i + 1);
}
} while (i >= 0);
additionalNameIndex.put(additionalNames.toLowerCase(), language.getCode());
}
}
} finally {
reader.close();
}
initialized = true;
} catch (IOException e) {
throw new RuntimeException("Unable to load language code index", e);
}
}
/** Returns the language with the given internal code. Note that the
* internal codes roughly correspond to ISO 639-3 code, but also model
* some extensions to this. Use {@link #findByCode(String)} if you are
* unsure about your code. If no language could be found,
* <code>null</code> is returned. */
public static ILanguage get(final String code) {
initialize();
return (code == null ? null : languageIndex.get(code));
}
/** Find the language with the given code. The method checks both for the
* internal language codes and for any ISO 639 code. If no language
* could be found, <code>null</code> is returned. */
public static ILanguage findByCode(final String code) {
initialize();
return get(additionalCodeIndex.get(code));
}
/** Find the language with the given name. The method checks both for the
* canonical English name as well as alternative names in other languages
* or spelling errors found in Wikipedia. If no language could be found,
* <code>null</code> is returned. */
public static ILanguage findByName(final String name) {
if (name == null) {
return null;
}
initialize();
return get(additionalNameIndex.get(name.trim().toLowerCase()));
}
/** Tests if the specified languages are equal. The method returns
* <code>true</code> if both languages are <code>null</code>, but
* <code>false</code> if only one of them is <code>null</code>. */
public static boolean equals(final ILanguage language1,
final ILanguage language2) {
if (language1 == language2) {
return true;
}
else
if (language1 == null || language2 == null) {
return false;
}
else {
return language1.equals(language2);
}
}
}