/*
Copyright (C) 2003-2004 Pierrick Brihaye
pierrick.brihaye@wanadoo.fr
Original Perl code :
Portions (c) 2002 QAMUS LLC (www.qamus.org),
(c) 2002 Trustees of the University of Pennsylvania
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the
Free Software Foundation, Inc.
59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
or connect to:
http://www.fsf.org/copyleft/gpl.html
*/
package marmot.thirdparty.aramorph;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;
/**
* A java port of Buckwalter Arabic Morphological Analyzer Version 1.0. Original
* Perl distribution available from : <a href=
* "http://www.ldc.upenn.edu/Catalog/CatalogEntry.jsp?catalogId=LDC2002L49">LDC
* Catalog</a>
*
* @author Pierrick Brihaye, 2003
*/
public class AraMorph {
/**
* The dictionary handler. TODO : use more generic interface.
*/
private static DictionaryHandler dict = null;
/**
* The solutions handler. TODO : use more generic interface.
*/
/** Constructs an Arabic morphological analyzer that will output nothing. */
public AraMorph() {
dict = new DictionaryHandler();
}
/**
 * Returns the Buckwalter transliteration of a single character.
 *
 * @param c
 *            The character to transliterate
 * @return The Buckwalter symbol for <code>c</code>, or <code>c</code> itself
 *         when no symbol is defined for it
 */
public char romanizeChar(char c) {
    // U+0621 .. U+063A in code point order : HAMZA, ALEF WITH MADDA ABOVE,
    // ALEF WITH HAMZA ABOVE, WAW WITH HAMZA ABOVE, ALEF WITH HAMZA BELOW,
    // YEH WITH HAMZA ABOVE, ALEF, BEH, TEH MARBUTA, TEH, THEH, JEEM, HAH,
    // KHAH, DAL, THAL, REH, ZAIN, SEEN, SHEEN, SAD, DAD, TAH, ZAH, AIN, GHAIN
    final String fromHamza = "'|>&<}AbptvjHxd*rzs$SDTZEg";
    // U+0640 .. U+0652 in code point order : TATWEEL, FEH, QAF, KAF, LAM,
    // MEEM, NOON, HEH, WAW, ALEF MAKSURA, YEH, FATHATAN, DAMMATAN, KASRATAN,
    // FATHA, DAMMA, KASRA, SHADDA, SUKUN
    final String fromTatweel = "_fqklmnhwYyFNKaui~o";

    if (c >= '\u0621' && c <= '\u063A') {
        return fromHamza.charAt(c - '\u0621');
    }
    if (c >= '\u0640' && c <= '\u0652') {
        return fromTatweel.charAt(c - '\u0640');
    }

    // Isolated code points, position by position : SUPERSCRIPT ALEF,
    // ALEF WASLA, PEH, TCHEH, VEH, GAF, JEH (no more in Buckwalter system),
    // ARABIC COMMA, ARABIC SEMICOLON, ARABIC QUESTION MARK
    final String isolatedArabic = "\u0670\u0671\u067E\u0686\u06A4\u06AF\u0698\u060C\u061B\u061F";
    final String isolatedSymbols = "`{PJVGR,;?";
    int position = isolatedArabic.indexOf(c);

    // Everything else is returned unchanged. That includes the letters that are
    // not in the Buckwalter system : TTEH (U+0679), DDAL (U+0688),
    // KEHEH (U+06A9), RREH (U+0691), NOON GHUNNA (U+06BA),
    // HEH DOACHASHMEE (U+06BE), HEH GOAL (U+06C1), YEH BARREE (U+06D2)
    return position < 0 ? c : isolatedSymbols.charAt(position);
}
/**
 * Returns a word in the Buckwalter transliteration system from a word in
 * arabic. Vowels and diacritics are <strong>discarded</strong>, and so are
 * tatweel, superscript alef, alef wasla and every character that
 * {@link #romanizeChar(char)} returns unchanged.
 *
 * @param word
 *            The word in arabic
 * @return The romanized word
 */
public String romanizeWord(String word) {
    StringBuilder romanized = new StringBuilder(word.length());
    for (int index = 0; index < word.length(); index++) {
        char original = word.charAt(index);
        char symbol = romanizeChar(original);
        if (symbol == original) {
            // No transliteration exists for this character : drop it silently
            // (a library method should not write to the error stream)
            continue;
        }
        switch (symbol) {
        // Not significant for morphological analysis (ARABIC TATWEEL)
        case '_':
        // Not suitable for morphological analysis : remove all
        // vowels/diacritics, i.e. undo the job !
        case 'F':
        case 'N':
        case 'K':
        case 'a':
        case 'u':
        case 'i':
        case '~':
        case 'o':
        // TODO : how to handle ARABIC LETTER SUPERSCRIPT ALEF and ARABIC LETTER
        // ALEF WASLA ?
        // Strip them for now.
        case '`':
        case '{':
            // Delete character
            continue;
        default:
            romanized.append(symbol);
        }
    }
    return romanized.toString();
}
/** Matches the tokens that are made only of the arabic characters listed in the expression. */
private static final Pattern ARABIC_WORD_PATTERN = Pattern.compile("([\u067E\u0686\u0698\u06AF\u0621-\u063A\u0641-\u0652])+");

/**
 * Analyzes a token.
 *
 * @param token
 *            The token to be analyzed
 * @return <code>null</code> when the token does not consist entirely of the
 *         accepted arabic characters; otherwise the set of solutions found
 *         for the romanized token, which is empty when there are none
 */
public Set<Solution> analyzeToken(String token) {
    if (ARABIC_WORD_PATTERN.matcher(token).matches()) {
        // Only the token as written is looked up. The fallback on alternative
        // spellings (hamza, alif maqSuura and taa' marbuuTa variants) is
        // currently disabled.
        return feedWordSolutions(romanizeWord(token));
    }
    return null;
}
/**
 * Splits a word in prefix + stem + suffix combinations. The prefix is at most
 * 4 characters long, the suffix at most 6, and the stem is never empty.
 *
 * @param translitered
 *            The word. It is assumed that {@link #romanizeWord(String)
 *            romanizeWord} has been called before
 * @return The set of combinations
 */
private Set<SegmentedWord> segmentWord(String translitered) {
    final int wordLength = translitered.length();
    Set<SegmentedWord> combinations = new HashSet<SegmentedWord>();
    // TODO : why 4 ? The info could certainly be grabbed from dictionaries...
    for (int prefixLength = 0; prefixLength <= 4 && prefixLength <= wordLength; prefixLength++) {
        String prefix = translitered.substring(0, prefixLength);
        int remaining = wordLength - prefixLength;
        // TODO : why 6 ? The info could certainly be grabbed from dictionaries...
        // The suffix grows one character at a time and must leave at least
        // one character for the stem.
        for (int suffixLength = 0; suffixLength <= 6 && suffixLength < remaining; suffixLength++) {
            int stemEnd = wordLength - suffixLength;
            String stem = translitered.substring(prefixLength, stemEnd);
            String suffix = translitered.substring(stemEnd, wordLength);
            combinations.add(new SegmentedWord(prefix, stem, suffix));
        }
    }
    return combinations;
}
/**
 * Builds the set of solutions for the given word : every segmentation of the
 * word is looked up in the dictionary and each prefix/stem/suffix entry triple
 * that passes the three compatibility checks is kept.
 *
 * @param translitered
 *            The word. It is assumed that {@link #romanizeWord(String)
 *            romanizeWord} has been called before
 * @return The solutions for this word; empty, never <code>null</code>, when
 *         there are none
 */
private Set<Solution> feedWordSolutions(String translitered) {
    Set<Solution> found = new HashSet<Solution>();
    // Brute force algorithm over all the segmentations
    for (SegmentedWord candidate : segmentWord(translitered)) {
        // A segmentation is dropped as soon as one of its parts is unknown
        Collection<DictionaryEntry> prefixEntries = dict.getPrefixIterator(candidate.getPrefix());
        if (prefixEntries == null) {
            continue;
        }
        Collection<DictionaryEntry> stemEntries = dict.getStemIterator(candidate.getStem());
        if (stemEntries == null) {
            continue;
        }
        Collection<DictionaryEntry> suffixEntries = dict.getSuffixIterator(candidate.getSuffix());
        if (suffixEntries == null) {
            continue;
        }
        for (DictionaryEntry prefixEntry : prefixEntries) {
            for (DictionaryEntry stemEntry : stemEntries) {
                // Prefix/Stem compatibility
                if (!dict.hasAB(prefixEntry.getMorphology(), stemEntry.getMorphology())) {
                    continue;
                }
                for (DictionaryEntry suffixEntry : suffixEntries) {
                    // Prefix/Suffix compatibility first, then Stem/Suffix
                    boolean compatible = dict.hasAC(prefixEntry.getMorphology(), suffixEntry.getMorphology())
                            && dict.hasBC(stemEntry.getMorphology(), suffixEntry.getMorphology());
                    if (compatible) {
                        // All tests passed : it is a solution
                        found.add(new Solution(prefixEntry, stemEntry, suffixEntry));
                    }
                }
            }
        }
    }
    return found;
}
/*
 * NOTE : the method below is commented out and is not compiled.
 *
 * Feed an internal list of alternative spellings for the given word
 *
 * translitered : the word. It is assumed that romanizeWord has been called
 * before
 * returns : the set of alternative spellings for this word, or null when
 * there are none
 */
// private Set<String> feedAlternativeSpellings(String translitered) {
// // No need to reprocess
// HashSet<String> wordAlternativeSpellings = new HashSet<String>();
// String temp = translitered;
// String temp2;
// // final 'alif maqSuura + hamza-on-the-line
// if (temp.matches(".*" + "Y'$")) { // Y_w'_Y'
// // -> yaa' + hamza-on-the-line
// temp = temp.replaceAll("Y", "y"); // y_w'_y'
//
// wordAlternativeSpellings.add(temp); // y_w'_y' -- pushed
// // medial waaw + hamza-on-the-line -> hamza-on-waaw
// temp2 = temp.replaceFirst("w'", "&"); // y_&__y'
// if (!temp.equals(temp2)) {
// temp = temp2; // y_&__y'
// wordAlternativeSpellings.add(temp); // y_&__y' -- pushed
// }
// temp = translitered; // Y_w'_Y'
// // -> yaa' + hamza-on-the-line
// temp = temp.replaceAll("Y", "y"); // y_w'_y'
// // final yaa' + hamza-on-the-line -> hamza-on-yaa'
// temp = temp.replaceFirst("y'$", "}"); // y_w'_}
//
// wordAlternativeSpellings.add(temp); // y_w'_} -- pushed
// // medial waaw + hamza-on-the-line -> hamza-on-waaw
// temp2 = temp.replaceFirst("w'", "&"); // y_&__}
// if (!temp.equals(temp2)) {
// temp = temp2; // y_&__}
// wordAlternativeSpellings.add(temp); // y_&__} -- pushed
// }
// }
// // final yaa' + hamza-on-the-line
// else if (temp.matches(".*" + "y'$")) { // Y_w'_y'
// // 'alif maqSuura -> yaa'
// temp2 = temp.replaceAll("Y", "y"); // y_w'_y'
// if (!temp.equals(temp2)) {
// temp = temp2; // y_w'_y'
// wordAlternativeSpellings.add(temp); // y_w'_y' -- pushed
// }
// // medial waaw + hamza-on-the-line -> hamza-on-waaw
// temp2 = temp.replaceFirst("w'", "&"); // y_&__y'
// if (!temp.equals(temp2)) {
// temp = temp2; // y_&__y'
// wordAlternativeSpellings.add(temp); // y_&__y' -- pushed
// }
// temp = translitered; // Y_w'_y'
// // 'alif maqSuura -> yaa'
// temp = temp.replaceAll("Y", "y"); // y_w'_y'
// // final yaa' + hamza-on-the-line -> 'alif maqSuura
// temp = temp.replaceFirst("y'$", "}"); // y_w'_}
// wordAlternativeSpellings.add(temp); // y_w'_} -- pushed
// // medial waaw + hamza-on-the-line -> hamza-on-waaw
// temp2 = temp.replaceFirst("w'", "&"); // y_&__}
// if (!temp.equals(temp2)) {
// temp = temp2; // y_&__}
// wordAlternativeSpellings.add(temp); // y_&__} -- pushed
// }
// }
// // final yaa'
// else if (temp.matches(".*" + "y$")) { // Y_w'_y
// // 'alif maqSuura -> yaa'
// temp = temp.replaceAll("Y", "y"); // y_w'_y
// // medial waaw + hamza-on-the-line -> hamza-on-waaw
// temp2 = temp.replaceFirst("w'", "&"); // y_&__y
// if (!temp.equals(temp2)) {
// temp = temp2; // y_&__y
// wordAlternativeSpellings.add(temp); // y_&__y -- pushed
// }
// temp = translitered; // Y_w'_y
// // 'alif maqSuura -> yaa'
// temp = temp.replaceAll("Y", "y"); // y_w'_y
// // final yaa' -> 'alif maqSuura
// temp = temp.replaceAll("y$", "Y"); // y_w'_Y
// wordAlternativeSpellings.add(temp); // y_w'_Y -- pushed
// // medial waaw + hamza-on-the-line -> hamza-on-waaw
// temp2 = temp.replaceFirst("w'", "&"); // y_&__Y
// if (!temp.equals(temp2)) {
// temp = temp2; // y_&__Y
// wordAlternativeSpellings.add(temp); // y_&__Y -- pushed
// }
// }
// // final haa'
// else if (temp.matches(".*" + "h$")) { // Y_w'_h
// // 'alif maqSuura -> yaa'
// temp2 = temp.replaceAll("Y", "y"); // y_w'_h
// if (!temp.equals(temp2)) {
// temp = temp2; // y_w'_h
// wordAlternativeSpellings.add(temp); // y_w'_h -- pushed
// }
// // medial waaw + hamza-on-the-line -> hamza-on-waaw
// temp2 = temp.replaceFirst("w'", "&"); // y_&__h
// if (!temp.equals(temp2)) {
// temp = temp2; // y_&__h
// wordAlternativeSpellings.add(temp); // y_&__h -- pushed
// }
// // final haa' -> taa' marbuuTa
// temp = temp.replaceFirst("h$", "p"); // y_w'_p
// wordAlternativeSpellings.add(temp); // y_w'_p -- pushed
// }
// // final taa' marbuuTa
// else if (temp.matches(".*" + "p$")) { // Y_w'_p
// // 'alif maqSuura -> yaa'
// temp2 = temp.replaceAll("Y", "y"); // y_w'_p
// if (!temp.equals(temp2)) {
// temp = temp2; // y_w'_p
// wordAlternativeSpellings.add(temp); // y_w'_p -- pushed
// }
// // medial waaw + hamza-on-the-line -> hamza-on-waaw
// temp2 = temp.replaceFirst("w'", "&"); // y_&__p
// if (!temp.equals(temp2)) {
// temp = temp2; // y_&__p
// wordAlternativeSpellings.add(temp); // y_&__p -- pushed
// }
// // final taa' marbuuTa -> haa'
// temp = temp.replaceFirst("p$", "h"); // y_w'_h
// wordAlternativeSpellings.add(temp); // //y_w'_h -- pushed
// }
// // Substitutions before matching
// else {
// // final 'alif maqSuura -> yaa'
// temp2 = temp.replaceFirst("Y$", "y"); // Y_w'_y
// if (!temp.equals(temp2)) {
// temp = temp2; // Y_w'_y
// // 'alif maqSuura -> yaa'
// temp = temp.replaceAll("Y", "y"); // y_w'_y
// wordAlternativeSpellings.add(temp); // y_w'_y -- pushed
// // medial waaw + hamza-on-the-line -> hamza-on-waaw
// temp2 = temp.replaceFirst("w'", "&"); // y_&__y
// if (!temp.equals(temp2)) {
// temp = temp2; // y_&__y
// wordAlternativeSpellings.add(temp); // y_&__y -- pushed
// }
// } else {
// // 'alif maqSuura -> yaa'
// temp2 = temp.replaceAll("Y", "y"); // y_w'__
// if (!temp.equals(temp2)) {
// temp = temp2; // y_w'__
// wordAlternativeSpellings.add(temp); // y_w'__ -- pushed
// // medial waaw + hamza-on-the-line -> hamza-on-waaw
// temp2 = temp.replaceFirst("w'", "&"); // y_&___
// if (!temp.equals(temp2)) {
// temp = temp2; // y_&___
// wordAlternativeSpellings.add(temp); // y_&___ -- pushed
// }
// } else {
// // medial waaw + hamza-on-the-line -> hamza-on-waaw
// temp2 = temp.replaceFirst("w'", "&"); // y_&___
// if (!temp.equals(temp2)) {
// temp = temp2; // y_&___
// wordAlternativeSpellings.add(temp); // y_&___ -- pushed
// } else {
// } // nothing
// }
// }
// }
//
// if (wordAlternativeSpellings.isEmpty()) {
// return null;
// }
//
// return wordAlternativeSpellings;
// }
// Inner class
private class SegmentedWord {
private String prefix;
private String stem;
private String suffix;
protected SegmentedWord(String prefix, String stem, String suffix) {
this.prefix = prefix;
this.stem = stem;
this.suffix = suffix;
}
protected String getPrefix() {
return this.prefix;
}
protected String getStem() {
return this.stem;
}
protected String getSuffix() {
return this.suffix;
}
}
}