Lemmatizer.java example

Explorer

JHazm-master
- JHazm
  - src
    - main
      - java
        - jhazm
            DependencyParser.java
            Lemmatizer.java
            Normalizer.java
            POSTagger.java
            Stemmer.java
          - model
              Doc.java
              Document.java
              Verb.java
          - reader
              BijankhanReader.java
              HamshahriReader.java
              PersicaReader.java
              PeykareReader.java
              VerbValencyReader.java
          - terminal
              Action.java
              Runner.java
          - tokenizer
              SentenceTokenizer.java
              WordTokenizer.java
          - utility
              MakeTrans.java
              RegexPattern.java
    - test
      - java
        - jhazm
          - test
              DependencyParserTest.java
              LemmatizerTest.java
              NormalizerTests.java
              POSTaggerTest.java
              StemmerTests.java
            - reader
                BijankhanReaderTest.java
                HamshahriReaderTest.java
                PersicaReaderTest.java
                PeykareReaderTest.java
                VerbValencyReaderTest.java
            - tokenizer
                SentenceTokenizerTests.java
                WordTokenizerTest.java

package jhazm;

import jhazm.tokenizer.WordTokenizer;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;

/**
 * Reduces Persian word forms to a lemma using a word list and a verb list.
 *
 * <p>On construction the word list is loaded into a set, and every verb entry
 * (written as {@code past#present}) is expanded through
 * {@link #conjugations(String)} so that each conjugated form maps back to the
 * entry it was generated from.
 *
 * <p>The shared instance handed out by {@link #i()} is created lazily without
 * any synchronization.
 *
 * @author Mojtaba Khallash
 */
public class Lemmatizer {
    //
    // Fields
    //

    /** Word list used by the constructors that take no file arguments. */
    private static final String DEFAULT_WORDS_FILE = "resources/data/words.dat";

    /** Verb list used by the constructors that take no file arguments. */
    private static final String DEFAULT_VERBS_FILE = "resources/data/verbs.dat";

    /** Encoding of both data files. */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    /** Personal endings attached to the past stem (third person singular is bare). */
    private static final String[] PAST_ENDS = { "م", "ی", "", "یم", "ید", "ند" };

    /** Endings of the past narrative (perfect) forms. */
    private static final String[] NARRATIVE_ENDS =
            { "ه‌ام", "ه‌ای", "ه", "ه‌ایم", "ه‌اید", "ه‌اند" };

    /** Personal endings attached to the present stem. */
    private static final String[] PRESENT_ENDS = { "م", "ی", "د", "یم", "ید", "ند" };

    /** Shared instance, created on the first call to {@link #i()}. */
    public static Lemmatizer instance;

    /** Conjugated form (or joined verb phrase) mapped to its verb entry. */
    private final Map<String, String> verbs;

    /** Known words; a word found here is its own lemma. */
    private final Set<String> words;


    //
    // Constructors
    //

    /**
     * Loads the default word and verb lists and registers joined verb parts.
     *
     * @throws IOException if a data file cannot be read
     */
    public Lemmatizer() throws IOException {
        this(DEFAULT_WORDS_FILE, DEFAULT_VERBS_FILE, true);
    }

    /**
     * Loads the default word and verb lists.
     *
     * @param joinedVerbParts whether multi-token verb phrases are registered too
     * @throws IOException if a data file cannot be read
     */
    public Lemmatizer(boolean joinedVerbParts) throws IOException {
        this(DEFAULT_WORDS_FILE, DEFAULT_VERBS_FILE, joinedVerbParts);
    }

    /**
     * Loads the given word and verb lists and registers joined verb parts.
     *
     * @param wordsFile path of the word list, one word per line
     * @param verbsFile path of the verb list, one {@code past#present} entry per line
     * @throws IOException if a data file cannot be read
     */
    public Lemmatizer(String wordsFile, String verbsFile) throws IOException {
        this(wordsFile, verbsFile, true);
    }

    /**
     * Loads the given word and verb lists.
     *
     * @param wordsFile       path of the word list, one word per line
     * @param verbsFile       path of the verb list, one {@code past#present} entry per line
     * @param joinedVerbParts when {@code true}, phrases built from each past stem and
     *                        the tokenizer's before/after verb parts are registered as well
     * @throws IOException              if a data file cannot be read
     * @throws IllegalArgumentException if a non-blank verb line has no present stem
     */
    public Lemmatizer(String wordsFile, String verbsFile, boolean joinedVerbParts)
            throws IOException {
        this.words = new HashSet<>();
        for (String line : Files.readAllLines(Paths.get(wordsFile), UTF_8)) {
            String word = line.trim();
            // A blank line is not a word; keeping "" in the set would make an
            // empty stem (should the stemmer ever return one) look like a lemma.
            if (!word.isEmpty()) {
                this.words.add(word);
            }
        }

        List<String> pureVerbs = new ArrayList<>();
        for (String line : Files.readAllLines(Paths.get(verbsFile), UTF_8)) {
            String verb = line.trim();
            // Blank lines used to abort construction inside conjugations().
            if (!verb.isEmpty()) {
                pureVerbs.add(verb);
            }
        }

        this.verbs = new HashMap<>();
        this.verbs.put("است", "#است");
        for (String verb : pureVerbs) {
            for (String tense : conjugations(verb)) {
                // The first verb that produces a form keeps it.
                if (!this.verbs.containsKey(tense)) {
                    this.verbs.put(tense, verb);
                }
            }
        }

        if (joinedVerbParts) {
            // Only needed for the phrase table, so it is not built otherwise.
            WordTokenizer tokenizer = new WordTokenizer(verbsFile);

            for (String verb : pureVerbs) {
                String bon = verb.split("#")[0];
                for (String afterVerb : tokenizer.getAfterVerbs()) {
                    this.verbs.put(bon + "ه " + afterVerb, verb);
                    this.verbs.put("ن" + bon + "ه " + afterVerb, verb);
                }
                for (String beforeVerb : tokenizer.getBeforeVerbs()) {
                    this.verbs.put(beforeVerb + " " + bon, verb);
                }
            }
        }
    }


    //
    // API
    //

    /**
     * Returns the shared instance, creating it with the default data files on
     * first use. The check-then-create sequence is not synchronized.
     *
     * @return the shared lemmatizer
     * @throws IOException if the default data files cannot be read
     */
    public static Lemmatizer i() throws IOException {
        if (instance == null) {
            instance = new Lemmatizer();
        }
        return instance;
    }

    /**
     * Lemmatizes a word for which no part-of-speech tag is known.
     *
     * @param word the surface form
     * @return the lemma, or {@code word} itself when nothing better is known
     */
    public String lemmatize(String word) {
        return lemmatize(word, "");
    }

    /**
     * Lemmatizes a word, optionally guided by its part-of-speech tag.
     *
     * <p>Checks run in this order: known word (untagged only), known verb form
     * (untagged or tag {@code V}), adjective tag with a final {@code ی},
     * pronoun tag, known word, known stem. The first match wins.
     *
     * @param word the surface form
     * @param pos  the part-of-speech tag; empty or {@code null} means "unknown"
     * @return the lemma, or {@code word} itself when no rule applies
     */
    public String lemmatize(String word, String pos) {
        String tag = (pos == null) ? "" : pos;

        // Nothing to analyse; also keeps the suffix checks below trivially safe.
        if (word.isEmpty()) {
            return word;
        }

        boolean untagged = tag.isEmpty();

        if (untagged && this.words.contains(word)) {
            return word;
        }

        if (untagged || tag.equals("V")) {
            String verbLemma = this.verbs.get(word);
            if (verbLemma != null) {
                return verbLemma;
            }
        }

        if (tag.startsWith("AJ") && word.endsWith("ی")) {
            return word;
        }

        if (tag.equals("PRO")) {
            return word;
        }

        if (this.words.contains(word)) {
            return word;
        }

        String stem = new Stemmer().stem(word);
        if (this.words.contains(stem)) {
            return stem;
        }

        return word;
    }

    /**
     * Generates the conjugated forms of a verb entry.
     *
     * <p>The entry {@code #هست} is special-cased and yields its affirmative
     * forms followed by the negative {@code نیست} forms. Every other entry is
     * split on {@code #} into a past and a present stem, from which past
     * simple, past imperfect, past narrative, imperative, present simple,
     * present imperfect and subjunctive forms are built, each with its
     * {@code ن}-prefixed negation where applicable.
     *
     * @param verb a verb entry of the form {@code past#present}
     * @return the generated forms without duplicates; apart from the
     *         {@code #هست} case the order is that of a {@link HashSet}
     * @throws IllegalArgumentException if {@code verb} has no present stem
     */
    public List<String> conjugations(String verb) {
        if (verb.equals("#هست")) {
            List<String> forms = new ArrayList<>();
            for (String end : PAST_ENDS) {
                forms.add("هست" + end);
            }
            for (String end : PAST_ENDS) {
                forms.add("نیست" + end);
            }
            return forms;
        }

        String[] parts = verb.split("#");
        if (parts.length < 2) {
            throw new IllegalArgumentException(
                    "Verb entry must look like past#present, got: '" + verb + "'");
        }
        String past = parts[0];
        String present = parts[1];

        // Default-capacity HashSet and unchanged insertion order: the order of
        // the returned list stays exactly what it has always been.
        Set<String> conjugates = new HashSet<>();

        for (String end : PAST_ENDS) {
            // pastSimples
            String simple = getRefinement(past + end);
            conjugates.add(simple);
            conjugates.add(getRefinement(getNot(simple)));

            // pastImperfects (the prefix ends with a zero-width non-joiner)
            String imperfect = getRefinement("می‌" + simple);
            conjugates.add(imperfect);
            conjugates.add(getRefinement(getNot(imperfect)));
        }

        // pastNarratives
        for (String end : NARRATIVE_ENDS) {
            String narrative = past + end;
            conjugates.add(getRefinement(narrative));
            conjugates.add(getRefinement(getNot(narrative)));
        }

        // imperatives, built from the unmodified present stem
        conjugates.add(getRefinement("ب" + present));
        conjugates.add(getRefinement("ن" + present));

        // These stems take a linking "ی" before the personal endings.
        if (present.endsWith("ا") || present.equals("آ") || present.equals("گو")) {
            present = present + "ی";
        }

        List<String> presentSimples = new ArrayList<>();
        for (String end : PRESENT_ENDS) {
            String simple = present + end;
            presentSimples.add(simple);

            conjugates.add(getRefinement(simple));
            conjugates.add(getRefinement(getNot(simple)));
        }

        for (String simple : presentSimples) {
            // presentImperfects
            String imperfect = "می‌" + simple;
            conjugates.add(getRefinement(imperfect));
            conjugates.add(getRefinement(getNot(imperfect)));

            // presentSubjunctives
            conjugates.add(getRefinement("ب" + simple));

            // presentNotSubjunctives
            conjugates.add(getRefinement("ن" + simple));
        }

        return new ArrayList<>(conjugates);
    }


    //
    // Helper Methods
    //

    /** Rewrites a prefix directly followed by "آ" ("بآ" to "بیا", "نآ" to "نیا"). */
    private static String getRefinement(String text) {
        return text.replace("بآ", "بیا").replace("نآ", "نیا");
    }

    /** Builds the negated form by prepending "ن". */
    private static String getNot(String text) {
        return "ن" + text;
    }
}