// AraMorph.java example
//
// Explorer
// cistern-master
// - hmmla
//   - src
//     - hmmla
// - marmot
//   - src
/*
Copyright (C) 2003-2004 Pierrick Brihaye
pierrick.brihaye@wanadoo.fr
 
Original Perl code :
Portions (c) 2002 QAMUS LLC (www.qamus.org), 
(c) 2002 Trustees of the University of Pennsylvania 
 
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the
Free Software Foundation, Inc.
59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
or connect to:
http://www.fsf.org/copyleft/gpl.html
 */

package marmot.thirdparty.aramorph;

import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * A java port of Buckwalter Arabic Morphological Analyzer Version 1.0. Original
 * Perl distribution available from : <a href=
 * "http://www.ldc.upenn.edu/Catalog/CatalogEntry.jsp?catalogId=LDC2002L49">LDC
 * Catalog</a>
 * 
 * @author Pierrick Brihaye, 2003
 */
public class AraMorph {

	/**
	 * The dictionary handler. The field is static, so a single handler is
	 * shared by all instances ; it is (re)assigned every time the constructor
	 * runs. TODO : use more generic interface.
	 */
	private static DictionaryHandler dict = null;

	/*
	 * The solutions handler. TODO : use more generic interface.
	 * NOTE : no field is declared for it at this point ; this comment is kept
	 * only as a reminder.
	 */

	/**
	 * Constructs an Arabic morphological analyzer. A new
	 * {@link DictionaryHandler} is created and stored in the static
	 * <code>dict</code> field, replacing whatever handler was stored there
	 * before.
	 */
	public AraMorph() {
		AraMorph.dict = new DictionaryHandler();
	}

	/**
	 * Returns the Buckwalter transliteration of a single arabic character.
	 * Characters that have no entry in the lookup table are returned
	 * unchanged.
	 * 
	 * @param c
	 *            The character to romanize
	 * @return The Buckwalter symbol for <code>c</code>, or <code>c</code>
	 *         itself when it is not part of the table
	 */
	public char romanizeChar(char c) {

		// Two parallel tables : the character found at a given position of
		// "arabic" is written as the character found at the same position of
		// "buckwalter". The names in the comments are the Unicode names of
		// the characters (ARABIC LETTER ... / ARABIC ...), in table order.
		//
		// Not in Buckwalter system, hence absent from the tables :
		// U+0679 TTEH, U+0688 DDAL, U+06A9 KEHEH, U+0691 RREH,
		// U+06BA NOON GHUNNA, U+06BE HEH DOACHASHMEE, U+06C1 HEH GOAL,
		// U+06D2 YEH BARREE
		final String arabic = ""
				// HAMZA, ALEF WITH MADDA ABOVE, ALEF WITH HAMZA ABOVE,
				// WAW WITH HAMZA ABOVE, ALEF WITH HAMZA BELOW,
				// YEH WITH HAMZA ABOVE
				+ "\u0621\u0622\u0623\u0624\u0625\u0626"
				// ALEF, BEH, TEH MARBUTA, TEH, THEH, JEEM, HAH, KHAH, DAL, THAL
				+ "\u0627\u0628\u0629\u062A\u062B\u062C\u062D\u062E\u062F\u0630"
				// REH, ZAIN, SEEN, SHEEN, SAD, DAD, TAH, ZAH, AIN, GHAIN
				+ "\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063A"
				// TATWEEL, FEH, QAF, KAF, LAM, MEEM, NOON, HEH, WAW,
				// ALEF MAKSURA, YEH
				+ "\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064A"
				// FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SHADDA,
				// SUKUN
				+ "\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652"
				// SUPERSCRIPT ALEF, ALEF WASLA
				+ "\u0670\u0671"
				// PEH, TCHEH, VEH, GAF, JEH (no more in Buckwalter system)
				+ "\u067E\u0686\u06A4\u06AF\u0698"
				// COMMA, SEMICOLON, QUESTION MARK
				+ "\u060C\u061B\u061F";

		final String buckwalter = ""
				+ "'|>&<}"
				+ "AbptvjHxd*"
				+ "rzs$SDTZEg"
				+ "_fqklmnhwYy"
				+ "FNKaui~o"
				+ "`{"
				+ "PJVGR"
				+ ",;?";

		int position = arabic.indexOf(c);
		if (position < 0) {
			// Unknown to the table : hand the character back untouched
			return c;
		}
		return buckwalter.charAt(position);
	}

	/**
	 * Returns a word in the Buckwalter transliteration system from a word in
	 * arabic. Each character goes through {@link #romanizeChar(char)} ; the
	 * tatweel, the vowels/diacritics, the superscript alef and the alef wasla
	 * are <strong>discarded</strong> from the result.
	 * <p>
	 * A character that {@link #romanizeChar(char)} returns unchanged is
	 * discarded as well, and the whole input word is printed on
	 * <code>System.err</code> each time this happens.
	 * 
	 * @param word
	 *            The word in arabic
	 * @return The romanized word
	 */
	public String romanizeWord(String word) {
		// Buckwalter symbols that never reach the output :
		// '_' : ARABIC TATWEEL, not significant for morphological analysis
		// F N K a u i ~ o : vowels/diacritics, not suitable for
		// morphological analysis
		// '`' and '{' : TODO : how to handle ARABIC LETTER SUPERSCRIPT ALEF
		// and ARABIC LETTER ALEF WASLA ? Strip them for now.
		// (the backslash is listed as well, as it always has been)
		final String stripped = "_FNKaui~o`\\{";

		StringBuilder romanized = new StringBuilder(word.length());

		for (char original : word.toCharArray()) {
			char mapped = romanizeChar(original);

			if (mapped == original) {
				// No transliteration for this character : report the word
				// and drop the character.
				// NOTE(review) : this looks like leftover debugging output -
				// confirm before removing.
				System.err.println(word);
			} else if (stripped.indexOf(mapped) < 0) {
				romanized.append(mapped);
			}
		}

		return romanized.toString();
	}

	/**
	 * Pattern a token has to match in its entirety before it is analyzed : one
	 * or more characters taken from U+0621..U+063A, U+0641..U+0652, U+067E,
	 * U+0686, U+0698 and U+06AF. U+0640 (the tatweel) is not part of the set.
	 */
	static private final Pattern arabic_word_pattern_ = Pattern.compile("([\u067E\u0686\u0698\u06AF\u0621-\u063A\u0641-\u0652])+");
	
	
	/**
	 * Analyzes a token : the token is romanized with
	 * {@link #romanizeWord(String)} and the solutions of the romanized form
	 * are looked up in the dictionaries.
	 * 
	 * @param token
	 *            The token to be analyzed, in arabic
	 * @return <code>null</code> when the token is not entirely made of the
	 *         characters accepted by the arabic word pattern ; otherwise the
	 *         set of solutions found for the token, which may be empty
	 */
	public Set<Solution> analyzeToken(String token) {
		if (!arabic_word_pattern_.matcher(token).matches()) {
			return null;
		}

		// NOTE : a fallback on alternative spellings used to follow the
		// lookup : each spelling given by feedAlternativeSpellings(translitered)
		// was passed to feedWordSolutions, the results were merged, and null
		// was returned when nothing was found. It is disabled, as is
		// feedAlternativeSpellings itself (kept commented out in this file).
		return feedWordSolutions(romanizeWord(token));
	}

	/**
	 * Splits a word in prefix + stem + suffix combinations. The prefix is at
	 * most 4 characters long, the suffix at most 6, and the stem is never
	 * empty.
	 * 
	 * @return The set of combinations
	 * @param translitered
	 *            The word. It is assumed that {@link #romanizeWord(String word)
	 *            romanizeWord} has been called before
	 */
	private Set<SegmentedWord> segmentWord(String translitered) {
		Set<SegmentedWord> combinations = new HashSet<SegmentedWord>();
		int length = translitered.length();

		// TODO : why 4 ? The info could certainly be grabbed from
		// dictionaries...
		int longestPrefix = Math.min(4, length);

		for (int prefixEnd = 0; prefixEnd <= longestPrefix; prefixEnd++) {
			String prefix = translitered.substring(0, prefixEnd);

			// TODO : why 6 ? The info could certainly be grabbed from
			// dictionaries...
			// At least one character is left to the stem, hence the "- 1".
			int longestSuffix = Math.min(6, length - prefixEnd - 1);

			for (int suffixLength = 0; suffixLength <= longestSuffix; suffixLength++) {
				int stemEnd = length - suffixLength;
				String stem = translitered.substring(prefixEnd, stemEnd);
				String suffix = translitered.substring(stemEnd);
				combinations.add(new SegmentedWord(prefix, stem, suffix));
			}
		}

		return combinations;
	}

	/**
	 * Builds the set of solutions for the given word. Every segmentation of
	 * the word is tried ; a prefix/stem/suffix triple of dictionary entries is
	 * a solution when the three pairwise compatibility checks succeed.
	 * 
	 * @param translitered
	 *            The word. It is assumed that {@link #romanizeWord(String word)
	 *            romanizeWord} has been called before
	 * @return The solutions found for this word ; the set is empty when there
	 *         are none
	 */
	private Set<Solution> feedWordSolutions(String translitered) {
		Set<Solution> found = new HashSet<Solution>();

		// Brute force algorithm : go through all the segmentations
		for (SegmentedWord candidate : segmentWord(translitered)) {

			// A segmentation is skipped as soon as one of its three parts
			// has no entry in the dictionaries
			Collection<DictionaryEntry> prefixEntries = dict
					.getPrefixIterator(candidate.getPrefix());
			if (prefixEntries == null) {
				continue;
			}

			Collection<DictionaryEntry> stemEntries = dict
					.getStemIterator(candidate.getStem());
			if (stemEntries == null) {
				continue;
			}

			Collection<DictionaryEntry> suffixEntries = dict
					.getSuffixIterator(candidate.getSuffix());
			if (suffixEntries == null) {
				continue;
			}

			for (DictionaryEntry prefixEntry : prefixEntries) {
				for (DictionaryEntry stemEntry : stemEntries) {

					// Prefix/Stem compatibility
					if (!dict.hasAB(prefixEntry.getMorphology(),
							stemEntry.getMorphology())) {
						continue;
					}

					for (DictionaryEntry suffixEntry : suffixEntries) {
						// Prefix/Suffix compatibility first, then
						// Stem/Suffix compatibility
						boolean compatible = dict.hasAC(
								prefixEntry.getMorphology(),
								suffixEntry.getMorphology())
								&& dict.hasBC(stemEntry.getMorphology(),
										suffixEntry.getMorphology());

						if (compatible) {
							// All tests passed : it is a solution
							found.add(new Solution(prefixEntry, stemEntry,
									suffixEntry));
						}
					}
				}
			}
		}

		return found;
	}

	/**
	 * Feed an internal list of alternative spellings for the given word
	 * 
	 * @param translitered
	 *            The word. It is assumed that {@link #romanizeWord(String word)
	 *            romanizeWord} has been called before
	 * @return Whether or not there are alternative spellings for this word
	 */
//	private Set<String> feedAlternativeSpellings(String translitered) {
//		// No need to reprocess
//		HashSet<String> wordAlternativeSpellings = new HashSet<String>();
//		String temp = translitered;
//		String temp2;
//		// final 'alif maqSuura + hamza-on-the-line
//		if (temp.matches(".*" + "Y'$")) { // Y_w'_Y'
//			// -> yaa' + hamza-on-the-line
//			temp = temp.replaceAll("Y", "y"); // y_w'_y'
//
//			wordAlternativeSpellings.add(temp); // y_w'_y' -- pushed
//			// medial waaw + hamza-on-the-line -> hamza-on-waaw
//			temp2 = temp.replaceFirst("w'", "&"); // y_&__y'
//			if (!temp.equals(temp2)) {
//				temp = temp2; // y_&__y'
//				wordAlternativeSpellings.add(temp); // y_&__y' -- pushed
//			}
//			temp = translitered; // Y_w'_Y'
//			// -> yaa' + hamza-on-the-line
//			temp = temp.replaceAll("Y", "y"); // y_w'_y'
//			// final yaa' + hamza-on-the-line -> hamza-on-yaa'
//			temp = temp.replaceFirst("y'$", "}"); // y_w'_}
//
//			wordAlternativeSpellings.add(temp); // y_w'_} -- pushed
//			// medial waaw + hamza-on-the-line -> hamza-on-waaw
//			temp2 = temp.replaceFirst("w'", "&"); // y_&__}
//			if (!temp.equals(temp2)) {
//				temp = temp2; // y_&__}
//				wordAlternativeSpellings.add(temp); // y_&__} -- pushed
//			}
//		}
//		// final yaa' + hamza-on-the-line
//		else if (temp.matches(".*" + "y'$")) { // Y_w'_y'
//			// 'alif maqSuura -> yaa'
//			temp2 = temp.replaceAll("Y", "y"); // y_w'_y'
//			if (!temp.equals(temp2)) {
//				temp = temp2; // y_w'_y'
//				wordAlternativeSpellings.add(temp); // y_w'_y' -- pushed
//			}
//			// medial waaw + hamza-on-the-line -> hamza-on-waaw
//			temp2 = temp.replaceFirst("w'", "&"); // y_&__y'
//			if (!temp.equals(temp2)) {
//				temp = temp2; // y_&__y'
//				wordAlternativeSpellings.add(temp); // y_&__y' -- pushed
//			}
//			temp = translitered; // Y_w'_y'
//			// 'alif maqSuura -> yaa'
//			temp = temp.replaceAll("Y", "y"); // y_w'_y'
//			// final yaa' + hamza-on-the-line -> 'alif maqSuura
//			temp = temp.replaceFirst("y'$", "}"); // y_w'_}
//			wordAlternativeSpellings.add(temp); // y_w'_} -- pushed
//			// medial waaw + hamza-on-the-line -> hamza-on-waaw
//			temp2 = temp.replaceFirst("w'", "&"); // y_&__}
//			if (!temp.equals(temp2)) {
//				temp = temp2; // y_&__}
//				wordAlternativeSpellings.add(temp); // y_&__} -- pushed
//			}
//		}
//		// final yaa'
//		else if (temp.matches(".*" + "y$")) { // Y_w'_y
//			// 'alif maqSuura -> yaa'
//			temp = temp.replaceAll("Y", "y"); // y_w'_y
//			// medial waaw + hamza-on-the-line -> hamza-on-waaw
//			temp2 = temp.replaceFirst("w'", "&"); // y_&__y
//			if (!temp.equals(temp2)) {
//				temp = temp2; // y_&__y
//				wordAlternativeSpellings.add(temp); // y_&__y -- pushed
//			}
//			temp = translitered; // Y_w'_y
//			// 'alif maqSuura -> yaa'
//			temp = temp.replaceAll("Y", "y"); // y_w'_y
//			// final yaa' -> 'alif maqSuura
//			temp = temp.replaceAll("y$", "Y"); // y_w'_Y
//			wordAlternativeSpellings.add(temp); // y_w'_Y -- pushed
//			// medial waaw + hamza-on-the-line -> hamza-on-waaw
//			temp2 = temp.replaceFirst("w'", "&"); // y_&__Y
//			if (!temp.equals(temp2)) {
//				temp = temp2; // y_&__Y
//				wordAlternativeSpellings.add(temp); // y_&__Y -- pushed
//			}
//		}
//		// final haa'
//		else if (temp.matches(".*" + "h$")) { // Y_w'_h
//			// 'alif maqSuura -> yaa'
//			temp2 = temp.replaceAll("Y", "y"); // y_w'_h
//			if (!temp.equals(temp2)) {
//				temp = temp2; // y_w'_h
//				wordAlternativeSpellings.add(temp); // y_w'_h -- pushed
//			}
//			// medial waaw + hamza-on-the-line -> hamza-on-waaw
//			temp2 = temp.replaceFirst("w'", "&"); // y_&__h
//			if (!temp.equals(temp2)) {
//				temp = temp2; // y_&__h
//				wordAlternativeSpellings.add(temp); // y_&__h -- pushed
//			}
//			// final haa' -> taa' marbuuTa
//			temp = temp.replaceFirst("h$", "p"); // y_w'_p
//			wordAlternativeSpellings.add(temp); // y_w'_p -- pushed
//		}
//		// final taa' marbuuTa
//		else if (temp.matches(".*" + "p$")) { // Y_w'_p
//			// 'alif maqSuura -> yaa'
//			temp2 = temp.replaceAll("Y", "y"); // y_w'_p
//			if (!temp.equals(temp2)) {
//				temp = temp2; // y_w'_p
//				wordAlternativeSpellings.add(temp); // y_w'_p -- pushed
//			}
//			// medial waaw + hamza-on-the-line -> hamza-on-waaw
//			temp2 = temp.replaceFirst("w'", "&"); // y_&__p
//			if (!temp.equals(temp2)) {
//				temp = temp2; // y_&__p
//				wordAlternativeSpellings.add(temp); // y_&__p -- pushed
//			}
//			// final taa' marbuuTa -> haa'
//			temp = temp.replaceFirst("p$", "h"); // y_w'_h
//			wordAlternativeSpellings.add(temp); // //y_w'_h -- pushed
//		}
//		// Substitutions before matching
//		else {
//			// final 'alif maqSuura -> yaa'
//			temp2 = temp.replaceFirst("Y$", "y"); // Y_w'_y
//			if (!temp.equals(temp2)) {
//				temp = temp2; // Y_w'_y
//				// 'alif maqSuura -> yaa'
//				temp = temp.replaceAll("Y", "y"); // y_w'_y
//				wordAlternativeSpellings.add(temp); // y_w'_y -- pushed
//				// medial waaw + hamza-on-the-line -> hamza-on-waaw
//				temp2 = temp.replaceFirst("w'", "&"); // y_&__y
//				if (!temp.equals(temp2)) {
//					temp = temp2; // y_&__y
//					wordAlternativeSpellings.add(temp); // y_&__y -- pushed
//				}
//			} else {
//				// 'alif maqSuura -> yaa'
//				temp2 = temp.replaceAll("Y", "y"); // y_w'__
//				if (!temp.equals(temp2)) {
//					temp = temp2; // y_w'__
//					wordAlternativeSpellings.add(temp); // y_w'__ -- pushed
//					// medial waaw + hamza-on-the-line -> hamza-on-waaw
//					temp2 = temp.replaceFirst("w'", "&"); // y_&___
//					if (!temp.equals(temp2)) {
//						temp = temp2; // y_&___
//						wordAlternativeSpellings.add(temp); // y_&___ -- pushed
//					}
//				} else {
//					// medial waaw + hamza-on-the-line -> hamza-on-waaw
//					temp2 = temp.replaceFirst("w'", "&"); // y_&___
//					if (!temp.equals(temp2)) {
//						temp = temp2; // y_&___
//						wordAlternativeSpellings.add(temp); // y_&___ -- pushed
//					} else {
//					} // nothing
//				}
//			}
//		}
//
//		if (wordAlternativeSpellings.isEmpty()) {
//			return null;
//		}
//
//		return wordAlternativeSpellings;
//	}

	// Inner class
	private class SegmentedWord {

		private String prefix;
		private String stem;
		private String suffix;

		protected SegmentedWord(String prefix, String stem, String suffix) {
			this.prefix = prefix;
			this.stem = stem;
			this.suffix = suffix;
		}

		protected String getPrefix() {
			return this.prefix;
		}

		protected String getStem() {
			return this.stem;
		}

		protected String getSuffix() {
			return this.suffix;
		}
	}
	
}