/* LanguageTool, a natural language style checker
 * Copyright (C) 2005 Daniel Naber (http://www.danielnaber.de)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 */
package org.languagetool;

import org.languagetool.databroker.ResourceDataBroker;
import org.languagetool.language.Contributor;
import org.languagetool.language.Demo;
import org.languagetool.rules.Rule;
import org.languagetool.rules.patterns.Unifier;
import org.languagetool.synthesis.Synthesizer;
import org.languagetool.tagging.Tagger;
import org.languagetool.tagging.disambiguation.Disambiguator;
import org.languagetool.tagging.disambiguation.xx.DemoDisambiguator;
import org.languagetool.tagging.xx.DemoTagger;
import org.languagetool.tokenizers.SentenceTokenizer;
import org.languagetool.tokenizers.Tokenizer;
import org.languagetool.tokenizers.WordTokenizer;
import org.languagetool.tools.MultiKeyProperties;
import org.languagetool.tools.StringTools;

import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Constructor;
import java.net.URL;
import java.util.*;

/**
 * Base class for any supported language (English, German, etc). Language classes
 * are detected at runtime by searching the classpath for files named
 * {@code META-INF/org/languagetool/language-module.properties}. Those file(s)
 * need to contain a key {@code languageClasses} which specifies the fully qualified
 * class name(s), e.g. {@code org.languagetool.language.English}. Use commas to specify
 * more than one class.
 */
public abstract class Language {

  // NOTE: the declaration order of the static fields in this class is significant.
  // LANGUAGES is computed by getLanguages(), which reads DEMO, and REAL_LANGUAGES /
  // BUILTIN_LANGUAGES are derived from LANGUAGES. Do not reorder them.

  public static final Language DEMO = new Demo();

  private static final String PROPERTIES_PATH = "META-INF/org/languagetool/language-module.properties";
  private static final String PROPERTIES_KEY = "languageClasses";

  private static List<Language> externalLanguages = new ArrayList<Language>();

  /**
   * All languages supported by LanguageTool. This includes at least a "demo" language
   * for testing.
   */
  public static Language[] LANGUAGES = getLanguages();

  /**
   * Scans the classpath for all {@link #PROPERTIES_PATH} files and instantiates every
   * language class listed under {@link #PROPERTIES_KEY}. Each class name is instantiated
   * at most once; the {@link #DEMO} language is always appended as the last element.
   *
   * @return the detected languages followed by the demo language
   * @throws RuntimeException if a properties file cannot be read, lacks the expected key,
   *   or names a class that cannot be loaded or instantiated
   */
  private static Language[] getLanguages() {
    final List<Language> languages = new ArrayList<Language>();
    final Set<String> languageClassNames = new HashSet<String>();
    try {
      final Enumeration<URL> propertyFiles = Language.class.getClassLoader().getResources(PROPERTIES_PATH);
      while (propertyFiles.hasMoreElements()) {
        final URL url = propertyFiles.nextElement();
        final InputStream inputStream = url.openStream();
        try {
          // We want to be able to read properties file with duplicate key, as produced by
          // Maven when merging files:
          final MultiKeyProperties props = new MultiKeyProperties(inputStream);
          final List<String> classNamesStr = props.getProperty(PROPERTIES_KEY);
          if (classNamesStr == null) {
            throw new RuntimeException("Key '" + PROPERTIES_KEY + "' not found in " + url);
          }
          for (String classNames : classNamesStr) {
            final String[] classNamesSplit = classNames.split("\\s*,\\s*");
            for (String className : classNamesSplit) {
              if (languageClassNames.contains(className)) {
                // avoid duplicates - this way we are robust against problems with the maven assembly
                // plugin which aggregates files more than once (in case the deployment descriptor
                // contains both <format>zip</format> and <format>dir</format>):
                continue;
              }
              languages.add(createLanguageObjects(url, className));
              languageClassNames.add(className);
            }
          }
        } finally {
          inputStream.close();
        }
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    languages.add(DEMO);
    return languages.toArray(new Language[languages.size()]);
  }

  /**
   * Instantiates the language class with the given name using its public no-argument constructor.
   *
   * @param url the properties file that listed the class, used for error messages only
   * @param className fully qualified name of a {@code Language} subclass
   * @return a new instance of the given class
   * @throws RuntimeException if the class cannot be found or cannot be instantiated
   */
  private static Language createLanguageObjects(URL url, String className) {
    try {
      final Class<?> aClass = Class.forName(className);
      final Constructor<?> constructor = aClass.getConstructor();
      return (Language) constructor.newInstance();
    } catch (ClassNotFoundException e) {
      throw new RuntimeException("Class '" + className + "' specified in " + url + " could not be found in classpath", e);
    } catch (Exception e) {
      throw new RuntimeException("Object for class '" + className + "' specified in " + url + " could not be created", e);
    }
  }

  /**
   * All languages supported by LanguageTool, but without the demo language.
   * Note that this array is computed once at class initialization and is not
   * updated by {@link #reInit(List)}.
   */
  public static final Language[] REAL_LANGUAGES = new Language[LANGUAGES.length-1];
  static {
    int i = 0;
    for (final Language lang : LANGUAGES) {
      if (!lang.getShortName().equals(Demo.SHORT_NAME)) {
        REAL_LANGUAGES[i] = lang;
        i++;
      }
    }
  }

  // Snapshot of the languages found on the classpath, used as the base for reInit():
  private static final Language[] BUILTIN_LANGUAGES = LANGUAGES;

  private static final Disambiguator DEMO_DISAMBIGUATOR = new DemoDisambiguator();
  private static final Tagger DEMO_TAGGER = new DemoTagger();
  private static final SentenceTokenizer SENTENCE_TOKENIZER = new SentenceTokenizer();
  private static final WordTokenizer WORD_TOKENIZER = new WordTokenizer();
  private static final Unifier MATCH_UNIFIER = new Unifier();

  // -------------------------------------------------------------------------

  /**
   * Get this language's two character code, e.g. <code>en</code> for English.
   * The country variant (e.g. "US"), if any, is not returned.
   * @return language code
   */
  public abstract String getShortName();

  /**
   * Get this language's name in English, e.g. <code>English</code> or
   * <code>German (Germany)</code>.
   * @return language name
   */
  public abstract String getName();

  /**
   * Get this language's country variants, e.g. <code>US</code> (as in <code>en-US</code>) or
   * <code>PL</code> (as in <code>pl-PL</code>).
   * @return String[] - array of country variants for the language.
   */
  public abstract String[] getCountryVariants();

  /**
   * Get the name(s) of the maintainer(s) for this language or <code>null</code>.
   */
  public abstract Contributor[] getMaintainers();

  /**
   * Get the rules classes that should run for texts in this language.
   * @since 1.4
   */
  public abstract List<Class<? extends Rule>> getRelevantRules();

  // -------------------------------------------------------------------------

  /**
   * Get this language's Java locale, not considering the country code.
   */
  public Locale getLocale() {
    return new Locale(getShortName());
  }

  /**
   * Get this language's Java locale, considering language code and country code (if any).
   * If there is more than one country variant, the first one is used.
   * @since 2.1
   */
  public Locale getLocaleWithCountry() {
    final String[] countryVariants = getCountryVariants();
    if (countryVariants.length > 0) {
      return new Locale(getShortName(), countryVariants[0]);
    } else {
      return getLocale();
    }
  }

  /**
   * Get the location of the rule file(s). The first entry is always the rule file of
   * the base language; a variant-specific rule file is appended if it exists.
   */
  public List<String> getRuleFileName() {
    final List<String> ruleFiles = new ArrayList<String>();
    final ResourceDataBroker dataBroker = JLanguageTool.getDataBroker();
    ruleFiles.add(dataBroker.getRulesDir()
            + "/" + getShortName() + "/" + JLanguageTool.PATTERN_FILE);
    final String shortNameWithVariant = getShortNameWithVariant();
    if (shortNameWithVariant.length() > 2) {
      final String fileName = getShortName() + "/"
              + shortNameWithVariant
              + "/" + JLanguageTool.PATTERN_FILE;
      if (dataBroker.ruleFileExists(fileName)) {
        ruleFiles.add(dataBroker.getRulesDir() + "/" + fileName);
      }
    }
    return ruleFiles;
  }

  /**
   * Languages that have country variants need to overwrite this to select their most common variant.
   * @return default country variant or <code>null</code>
   * @since 1.8
   */
  public Language getDefaultVariant() {
    return null;
  }

  /**
   * Get this language's part-of-speech disambiguator implementation.
   */
  public Disambiguator getDisambiguator() {
    return DEMO_DISAMBIGUATOR;
  }

  /**
   * Get this language's part-of-speech tagger implementation.
   */
  public Tagger getTagger() {
    return DEMO_TAGGER;
  }

  /**
   * Get this language's sentence tokenizer implementation.
   */
  public SentenceTokenizer getSentenceTokenizer() {
    return SENTENCE_TOKENIZER;
  }

  /**
   * Get this language's word tokenizer implementation.
   */
  public Tokenizer getWordTokenizer() {
    return WORD_TOKENIZER;
  }

  /**
   * Get this language's part-of-speech synthesizer implementation or <code>null</code>.
   */
  public Synthesizer getSynthesizer() {
    return null;
  }

  /**
   * Get this language's feature unifier.
   * @return Feature unifier for analyzed tokens.
   */
  public Unifier getUnifier() {
    return MATCH_UNIFIER;
  }

  /**
   * Get this language's feature unifier used for disambiguation.
   * Note: it might be different from the normal rule unifier.
   * @return Feature unifier for analyzed tokens.
   */
  public Unifier getDisambiguationUnifier() {
    return MATCH_UNIFIER;
  }

  /**
   * Get the name of the language translated to the current locale,
   * if available. Otherwise, get the untranslated name.
   * The bundle is first queried with the short name including the variant
   * (e.g. <code>en-US</code>), then with the plain short name (e.g. <code>en</code>).
   */
  public final String getTranslatedName(final ResourceBundle messages) {
    try {
      return messages.getString(getShortNameWithVariant());
    } catch (final MissingResourceException e) {
      try {
        return messages.getString(getShortName());
      } catch (final MissingResourceException e1) {
        return getName();
      }
    }
  }

  /**
   * Get the short name of the language with a country variant, if it is
   * a single-variant language. For generic language classes, get only a two- or
   * three-character code.
   * @since 1.8
   */
  public final String getShortNameWithVariant() {
    String name = getShortName();
    final String[] countryVariants = getCountryVariants();
    if (countryVariants.length == 1
            && !name.contains("-x-")) {   // e.g. "de-DE-x-simple-language"
      name += "-" + countryVariants[0];
    }
    return name;
  }

  /**
   * Start symbols used by {@link org.languagetool.rules.GenericUnpairedBracketsRule}.
   * Note that the array must be of equal length as {@link #getUnpairedRuleEndSymbols()} and the sequence of
   * starting symbols must match exactly the sequence of ending symbols.
   */
  public String[] getUnpairedRuleStartSymbols() {
    return new String[]{ "[", "(", "{", "\"", "'" };
  }

  /**
   * End symbols used by {@link org.languagetool.rules.GenericUnpairedBracketsRule}.
   * @see #getUnpairedRuleStartSymbols()
   */
  public String[] getUnpairedRuleEndSymbols() {
    return new String[]{ "]", ")", "}", "\"", "'" };
  }

  // -------------------------------------------------------------------------

  /**
   * Re-inits the built-in languages and adds the specified ones.
   * After this call {@link #LANGUAGES} contains the built-in languages followed by
   * the given languages. The given list is stored by reference and returned by
   * {@link #getExternalLanguages()}.
   */
  public static void reInit(final List<Language> languages) {
    LANGUAGES = new Language[BUILTIN_LANGUAGES.length + languages.size()];
    int i = BUILTIN_LANGUAGES.length;
    System.arraycopy(BUILTIN_LANGUAGES, 0,
        LANGUAGES, 0, BUILTIN_LANGUAGES.length);
    for (final Language lang : languages) {
      LANGUAGES[i++] = lang;
    }
    externalLanguages = languages;
  }

  /**
   * Return languages that are not built-in but have been added manually.
   */
  public static List<Language> getExternalLanguages() {
    return externalLanguages;
  }

  /**
   * Return all languages supported by LanguageTool.
   * @return A list of all languages, including external ones and country variants (e.g. en-US)
   */
  public static List<Language> getAllLanguages() {
    final List<Language> langList = new ArrayList<Language>();
    Collections.addAll(langList, LANGUAGES);
    // reInit() already copies the external languages into LANGUAGES, so adding them
    // unconditionally would list each of them twice. Only add those not yet present:
    for (final Language externalLanguage : externalLanguages) {
      if (!langList.contains(externalLanguage)) {
        langList.add(externalLanguage);
      }
    }
    return langList;
  }

  /**
   * Get the Language object for the given language name.
   *
   * @param languageName e.g. <code>English</code> or <code>German</code> (case is significant)
   * @return a Language object or <code>null</code>
   */
  public static Language getLanguageForName(final String languageName) {
    for (Language element : Language.LANGUAGES) {
      if (languageName.equals(element.getName())) {
        return element;
      }
    }
    return null;
  }

  /**
   * Get the Language object for the given short language name.
   *
   * @param langCode e.g. <code>en</code> or <code>es-US</code>
   * @return a Language object
   * @throws IllegalArgumentException if the language is not supported or if the language code is invalid
   */
  public static Language getLanguageForShortName(final String langCode) {
    final Language language = getLanguageForShortNameOrNull(langCode);
    if (language == null) {
      throw new IllegalArgumentException("'" + langCode + "' is not a language code known to LanguageTool. Supported languages: "
              + Arrays.toString(REAL_LANGUAGES));
    }
    return language;
  }

  /**
   * Return whether a language with the given language code is supported. Which languages
   * are supported depends on the classpath when the {@code Language} object is initialized.
   *
   * @param langCode e.g. <code>en</code> or <code>es-US</code>
   * @return true if the language is supported, false if the code is well-formed but unknown
   * @throws IllegalArgumentException if the language code is invalid, i.e. it contains a
   *   hyphen but does not consist of exactly two hyphen-separated parts (codes containing
   *   <code>-x-</code> are exempt from this check)
   * @since 2.1
   */
  public static boolean isLanguageSupported(final String langCode) {
    return getLanguageForShortNameOrNull(langCode) != null;
  }

  /**
   * Looks up a language in {@link #LANGUAGES} by its code. Three forms are handled:
   * codes containing <code>-x-</code> must equal the short name exactly; codes of the
   * form <code>xx-YY</code> must match the short name and the single country variant;
   * all other codes must equal the short name.
   *
   * @param langCode the language code, validated via {@code StringTools.assureSet}
   * @return the first matching language or <code>null</code> if there is none
   * @throws IllegalArgumentException if the code contains a hyphen (but not <code>-x-</code>)
   *   and does not consist of exactly two parts
   */
  private static Language getLanguageForShortNameOrNull(final String langCode) {
    StringTools.assureSet(langCode, "langCode");
    if (langCode.contains("-x-")) {
      // e.g. "de-DE-x-simple-language"
      for (Language element : Language.LANGUAGES) {
        if (element.getShortName().equals(langCode)) {
          return element;
        }
      }
      return null;
    }
    if (langCode.contains("-")) {
      final String[] parts = langCode.split("-");
      if (parts.length != 2) {
        throw new IllegalArgumentException("'" + langCode + "' isn't a valid language code");
      }
      for (Language element : Language.LANGUAGES) {
        if (parts[0].equals(element.getShortName())
                && element.getCountryVariants().length == 1
                && parts[1].equals(element.getCountryVariants()[0])) {
          return element;
        }
      }
      return null;
    }
    for (Language element : Language.LANGUAGES) {
      if (langCode.equals(element.getShortName())) {
        return element;
      }
    }
    return null;
  }

  /**
   * Get the best match for a locale, using American English as the final fallback if nothing
   * else fits. The returned language will be a country variant language (e.g. British English, not just English)
   * if available.
   * @since 1.8
   * @throws RuntimeException if no language was found and American English as a fallback is not available
   */
  public static Language getLanguageForLocale(final Locale locale) {
    // 1. exact match of language and country:
    final Language language = getLanguageForLanguageNameAndCountry(locale);
    if (language != null) {
      return language;
    }
    // 2. match of the language only:
    final Language firstFallbackLanguage = getLanguageForLanguageNameOnly(locale);
    if (firstFallbackLanguage != null) {
      return firstFallbackLanguage;
    }
    // 3. American English as the last resort:
    for (Language aLanguage : REAL_LANGUAGES) {
      if (aLanguage.getShortNameWithVariant().equals("en-US")) {
        return aLanguage;
      }
    }
    throw new RuntimeException("No appropriate language found, not even en-US. Supported languages: " + Arrays.toString(REAL_LANGUAGES));
  }

  /**
   * Returns the first real language whose short name equals the locale's language and
   * whose country variants contain the locale's country, or <code>null</code>.
   */
  private static Language getLanguageForLanguageNameAndCountry(Locale locale) {
    for (Language language : Language.REAL_LANGUAGES) {
      if (language.getShortName().equals(locale.getLanguage())) {
        final List<String> countryVariants = Arrays.asList(language.getCountryVariants());
        if (countryVariants.contains(locale.getCountry())) {
          return language;
        }
      }
    }
    return null;
  }

  /**
   * Returns a real language matching only the locale's language, ignoring its country,
   * or <code>null</code> if there is none.
   */
  private static Language getLanguageForLanguageNameOnly(Locale locale) {
    // use default variant if available:
    for (Language language : Language.REAL_LANGUAGES) {
      if (language.getShortName().equals(locale.getLanguage()) && language.hasVariant()) {
        final Language defaultVariant = language.getDefaultVariant();
        if (defaultVariant != null) {
          return defaultVariant;
        }
      }
    }
    // use the first match otherwise (which should be the only match):
    for (Language language : Language.REAL_LANGUAGES) {
      if (language.getShortName().equals(locale.getLanguage()) && !language.hasVariant()) {
        return language;
      }
    }
    return null;
  }

  /**
   * Returns the English name of this language, see {@link #getName()}.
   */
  @Override
  public final String toString() {
    return getName();
  }

  /**
   * Get sorted info about all maintainers (without country variants) to be used in the About dialog.
   * @since 0.9.9
   * @param messages {@link ResourceBundle} language bundle to translate the info
   * @return A list of maintainers, sorted by name of language.
   */
  public static String getAllMaintainers(final ResourceBundle messages) {
    final StringBuilder maintainersInfo = new StringBuilder();
    final List<String> toSort = new ArrayList<String>();
    for (final Language lang : Language.REAL_LANGUAGES) {
      if (!lang.isVariant()) {
        if (lang.getMaintainers() != null) {
          final List<String> names = new ArrayList<String>();
          for (Contributor contributor : lang.getMaintainers()) {
            names.add(contributor.getName());
          }
          toSort.add(messages.getString(lang.getShortName()) +
              ": " + listToStringWithLineBreaks(names));
        }
      }
    }
    Collections.sort(toSort);
    for (final String lElem : toSort) {
      maintainersInfo.append(lElem);
      maintainersInfo.append('\n');
    }
    return maintainersInfo.toString();
  }

  /**
   * Whether this is a country variant of another language, i.e. whether it doesn't
   * directly extend {@link Language}, but a subclass of {@link Language}.
   * This is determined by checking whether the class of any other language in
   * {@link #LANGUAGES} is a supertype of this object's class.
   * @since 1.8
   */
  public final boolean isVariant() {
    for (Language language : LANGUAGES) {
      // don't compare a language with itself:
      final boolean skip = language.getShortNameWithVariant().equals(getShortNameWithVariant());
      if (!skip && language.getClass().isAssignableFrom(getClass())) {
        return true;
      }
    }
    return false;
  }

  /**
   * Whether this class has at least one subclass that implements variants of this language.
   * @since 1.8
   */
  public final boolean hasVariant() {
    for (Language language : LANGUAGES) {
      // don't compare a language with itself:
      final boolean skip = language.getShortNameWithVariant().equals(getShortNameWithVariant());
      if (!skip && getClass().isAssignableFrom(language.getClass())) {
        return true;
      }
    }
    return false;
  }

  /**
   * Whether this language has been added manually; this base implementation
   * always returns <code>false</code>.
   */
  public boolean isExternal() {
    return false;
  }

  /**
   * Return true if this is the same language as the given one, considering country
   * variants only if set for both languages. For example: en = en, en = en-GB, en-GB = en-GB,
   * but en-US != en-GB
   * @since 1.8
   */
  public boolean equalsConsiderVariantsIfSpecified(Language otherLanguage) {
    if (!getShortName().equals(otherLanguage.getShortName())) {
      return false;
    }
    final boolean thisHasVariant = hasCountryVariant();
    final boolean otherHasVariant = otherLanguage.hasCountryVariant();
    if (thisHasVariant && otherHasVariant) {
      return getShortNameWithVariant().equals(otherLanguage.getShortNameWithVariant());
    }
    // at least one side has no specific variant, the language code alone decides:
    return true;
  }

  /**
   * Whether this language has exactly one country variant that is not the
   * placeholder value <code>ANY</code>.
   */
  private boolean hasCountryVariant() {
    final String[] countryVariants = getCountryVariants();
    return countryVariants.length == 1 && !countryVariants[0].equals("ANY");
  }

  /**
   * Joins the given strings with commas. After the fourth element and then after every
   * third one, the separator also contains a line break so that long lists are wrapped.
   */
  private static String listToStringWithLineBreaks(final Collection<String> l) {
    final StringBuilder sb = new StringBuilder();
    int i = 0;
    for (final Iterator<String> iter = l.iterator(); iter.hasNext();) {
      final String str = iter.next();
      sb.append(str);
      if (iter.hasNext()) {
        if (i > 0 && i % 3 == 0) {
          sb.append(",\n ");
        } else {
          sb.append(", ");
        }
      }
      i++;
    }
    return sb.toString();
  }

}