package jhazm; import jhazm.tokenizer.WordTokenizer; import java.io.IOException; import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Paths; import java.util.*; /** * * @author Mojtaba Khallash */ public class Lemmatizer { // // Fields // public static Lemmatizer instance; private HashMap verbs; private HashSet<String> words; // // Constructors // public Lemmatizer() throws IOException { this("resources/data/words.dat", "resources/data/verbs.dat", true); } public Lemmatizer(boolean joinedVerbParts) throws IOException { this("resources/data/words.dat", "resources/data/verbs.dat", joinedVerbParts); } public Lemmatizer(String wordsFile, String verbsFile) throws IOException { this(wordsFile, verbsFile, true); } public Lemmatizer(String wordsFile, String verbsFile, boolean joinedVerbParts) throws IOException { this.words = new HashSet<>(); for (String line : Files.readAllLines(Paths.get(wordsFile), Charset.forName("UTF8"))) this.words.add(line.trim()); WordTokenizer tokenizer = new WordTokenizer(verbsFile); List<String> pureVerbs = Files.readAllLines(Paths.get(verbsFile), Charset.forName("UTF8")); this.verbs = new HashMap(); this.verbs.put("است", "#است"); for (String verb : pureVerbs) { for (String tense : conjugations(verb)) { if (!this.verbs.containsKey(tense)) this.verbs.put(tense, verb); } } if (joinedVerbParts) { for (String verb : pureVerbs) { String bon = verb.split("#")[0]; for (String afterVerb : tokenizer.getAfterVerbs()) { this.verbs.put(bon + "ه " + afterVerb, verb); this.verbs.put("ن" + bon + "ه " + afterVerb, verb); } for (String beforeVerb : tokenizer.getBeforeVerbs()) { this.verbs.put(beforeVerb + " " + bon, verb); } } } } // // API // public static Lemmatizer i() throws IOException { if (instance != null) return instance; instance = new Lemmatizer(); return instance; } public String lemmatize(String word) { return lemmatize(word, ""); } public String lemmatize(String word, String pos) { if (pos.length() == 0 && this.words.contains(word)) return word; if ((pos.length() == 0 || pos.equals("V")) && this.verbs.containsKey(word)) return this.verbs.get(word).toString(); if (pos.startsWith("AJ") && word.charAt(word.length() - 1) == 'ی') return word; if (pos.equals("PRO")) return word; if (this.words.contains(word)) return word; String stem = new Stemmer().stem(word); if (this.words.contains(stem)) return stem; return word; } public List<String> conjugations(String verb) { String[] endsList = new String[] { "م", "ی", "", "یم", "ید", "ند" }; List<String> ends = new ArrayList<>(Arrays.asList(endsList)); if (verb.equals("#هست")) { List<String> conjugate1 = new ArrayList<>(); List<String> conjugate2 = new ArrayList<>(); for (String end : ends) { conjugate1.add("هست" + end); conjugate2.add("نیست" + end); } conjugate1.addAll(conjugate2); return conjugate1; } HashSet<String> conjugates = new HashSet<>(); String[] parts = verb.split("#"); String past = parts[0]; String present = parts[1]; for (String end : ends) { String conj = past + end; String nconj; // pastSimples conj = getRefinement(conj); conjugates.add(conj); nconj = getRefinement(getNot(conj)); conjugates.add(nconj); conj = "می‌" + conj; // pastImperfects conj = getRefinement(conj); conjugates.add(conj); nconj = getRefinement(getNot(conj)); conjugates.add(nconj); } endsList = new String[] { "ه‌ام", "ه‌ای", "ه", "ه‌ایم", "ه‌اید", "ه‌اند" }; ends = new ArrayList<>(Arrays.asList(endsList)); // pastNarratives for (String end : ends) { String conj = past + end; conjugates.add(getRefinement(conj)); conjugates.add(getRefinement(getNot(conj))); } conjugates.add(getRefinement("ب" + present)); conjugates.add(getRefinement("ن" + present)); if (present.endsWith("ا") || Arrays.asList(new String[] { "آ", "گو" }).contains(present)) present = present + "ی"; endsList = new String[] { "م", "ی", "د", "یم", "ید", "ند" }; ends = new ArrayList<>(Arrays.asList(endsList)); List<String> presentSimples = new ArrayList<>(); for (String end : ends) { String conj = present + end; presentSimples.add(conj); conjugates.add(getRefinement(conj)); conjugates.add(getRefinement(getNot(conj))); } for (String item : presentSimples) { String conj; // presentImperfects conj = "می‌" + item; conjugates.add(getRefinement(conj)); conjugates.add(getRefinement(getNot(conj))); // presentSubjunctives conj = "ب" + item; conjugates.add(getRefinement(conj)); // presentNotSubjunctives conj = "ن" + item; conjugates.add(getRefinement(conj)); } return new ArrayList(conjugates); } // // Helper Methods // private String getRefinement(String text) { return text.replace("بآ", "بیا").replace("نآ", "نیا"); } private String getNot(String text) { return "ن" + text; } }