/*
* CATMA Computer Aided Text Markup and Analysis
*
* Copyright (C) 2009-2013 University Of Hamburg
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package de.catma.document.source;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
/**
* Metadata concerning indexing of a {@link SourceDocument}.
*
* @author marco.petris@web.de
*
*/
public class IndexInfoSet {

    /**
     * ISO 639 language codes that {@link #isRightToLeftWriting()} treats as
     * right-to-left: Hebrew (legacy code "iw" and current code "he") and
     * Arabic (two-letter "ar" and three-letter "ara"). Both Hebrew codes are
     * listed because {@link Locale#getLanguage()} may report either one,
     * depending on the Java runtime.
     */
    private static final String[] RIGHT_TO_LEFT_LANGUAGES = {"iw", "he", "ar", "ara"};

    private List<String> unseparableCharacterSequences;
    private List<Character> userDefinedSeparatingCharacters;
    private Locale locale;

    /**
     * Creates an index info set from the given settings. The lists are stored
     * as passed in (they are not copied), so they have to be modifiable if the
     * <code>add...</code>/<code>remove...</code> methods are going to be used.
     *
     * @param unseparableCharacterSequences a list of character sequences that should be treated as single tokens
     * although a tokenizer would probably detect them as individual tokens.
     * @param userDefinedSeparatingCharacters a list of characters that separate tokens (in addition to the characters
     * that usually separate tokens like whitespace characters)
     * @param locale the main locale of the text to be indexed
     */
    public IndexInfoSet(List<String> unseparableCharacterSequences,
            List<Character> userDefinedSeparatingCharacters,
            Locale locale) {
        this.unseparableCharacterSequences = unseparableCharacterSequences;
        this.userDefinedSeparatingCharacters = userDefinedSeparatingCharacters;
        this.locale = locale;
    }

    /**
     * Constructor with default locale, no special unseparable character sequences
     * and no user defined separating characters.
     */
    public IndexInfoSet() {
        // the locale stays null here, getLocale() resolves it to the default locale
        this.unseparableCharacterSequences = new ArrayList<String>();
        this.userDefinedSeparatingCharacters = new ArrayList<Character>();
    }

    /**
     * @return the locale specified or {@link Locale#getDefault()}.
     */
    public Locale getLocale() {
        return (locale == null) ? Locale.getDefault() : locale;
    }

    /**
     * @param locale the main locale of the text to be indexed
     */
    public void setLocale(Locale locale) {
        this.locale = locale;
    }

    /**
     * @return a new {@link LanguageItem} for the locale of this set, using the
     * same default locale fallback as {@link #getLocale()}
     */
    public LanguageItem getLanguage() {
        // go through getLocale() so that an unset locale is never handed on as null
        return new LanguageItem(getLocale());
    }

    /**
     * Sets the locale of this set to the locale of the given language.
     *
     * @param language the language that provides the main locale of the text
     * to be indexed (null is not allowed)
     */
    public void setLanguage(LanguageItem language) {
        this.locale = language.getLocale();
    }

    /**
     * @return a (possibly empty) list of unseparable character sequences,
     * does not return <code>null</code>
     */
    public List<String> getUnseparableCharacterSequences() {
        if (unseparableCharacterSequences == null) {
            return Collections.<String>emptyList();
        }
        return unseparableCharacterSequences;
    }

    /**
     * @return a (possibly empty) list of user defined separating characters,
     * does not return <code>null</code>
     */
    public List<Character> getUserDefinedSeparatingCharacters() {
        if (userDefinedSeparatingCharacters == null) {
            return Collections.<Character>emptyList();
        }
        return userDefinedSeparatingCharacters;
    }

    /**
     * @param character a user defined separating character (null is not allowed)
     */
    public void addUserDefinedSeparatingCharacter(Character character) {
        if (userDefinedSeparatingCharacters == null) {
            userDefinedSeparatingCharacters = new ArrayList<Character>();
        }
        userDefinedSeparatingCharacters.add(character);
    }

    /**
     * @param ucs an unseparable character sequence (null is not allowed)
     */
    public void addUnseparableCharacterSequence(String ucs) {
        if (unseparableCharacterSequences == null) {
            unseparableCharacterSequences = new ArrayList<String>();
        }
        unseparableCharacterSequences.add(ucs);
    }

    /**
     * Removes the given character from the user defined separating characters,
     * does nothing if there is no list of such characters.
     *
     * @param character null is not allowed
     */
    public void removeUserDefinedSeparatingCharacter(Character character) {
        if (userDefinedSeparatingCharacters != null) {
            // the argument is a Character object, so this is remove(Object), not remove(int index)
            userDefinedSeparatingCharacters.remove(character);
        }
    }

    /**
     * Removes the given sequence from the unseparable character sequences,
     * does nothing if there is no list of such sequences.
     *
     * @param ucs null is not allowed
     */
    public void removeUnseparableCharacterSequence(String ucs) {
        if (unseparableCharacterSequences != null) {
            unseparableCharacterSequences.remove(ucs);
        }
    }

    //TODO: we should use Character.getDirectionality to support mixed content
    /**
     * Decides on the writing direction by looking only at the language code of
     * {@link #getLocale()}, the actual text content is not inspected.
     *
     * @return <code>true</code> if the language of the locale is Hebrew or Arabic,
     * else <code>false</code>
     */
    public boolean isRightToLeftWriting() {
        // Locale.ROOT keeps the lower casing independent of the platform default locale
        String lang = getLocale().getLanguage().toLowerCase(Locale.ROOT);
        for (String rightToLeftLanguage : RIGHT_TO_LEFT_LANGUAGES) {
            if (rightToLeftLanguage.equals(lang)) {
                return true;
            }
        }
        return false;
    }
}