/*
* Copyright (c) 2015 University of Illinois Board of Trustees, All rights reserved.
* Developed at GSLIS/ the iSchool, by Dr. Jana Diesner, Amirhossein Aleyasen,
* Chieh-Li Chin, Shubhanshu Mishra, Kiumars Soltani, and Liang Tao.
*
* This program is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free Software
* Foundation; either version 2 of the License, or any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, see <http://www.gnu.org/licenses>.
*
*/
package context.core.task.stemming;
import context.app.AppConfig;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Sentence;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.WordLemmaTag;
import edu.stanford.nlp.ling.WordTag;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.process.Morphology;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
/**
 * Lemmatizes tokenized sentences by first part-of-speech tagging them with a
 * language-specific Stanford {@link MaxentTagger} and then passing every
 * (word, tag) pair through {@link Morphology}.
 *
 * <p>Tagger models are loaded lazily, one per language code, and kept in a
 * static cache for the lifetime of the class. The cache is a plain
 * {@link HashMap} and no synchronization is performed here, so callers that
 * use this class from several threads must coordinate access themselves.
 *
 * @author Aale
 */
public class LemmaTagger {

    /** Location of the tagger model files, relative to the application's user directory. */
    private static final String MODEL_DIR = "/data/pos/models/";

    /** Language code -> path of the serialized tagger model for that language. */
    private static final Map<String, String> TAGGER_MODEL_PATHS = new HashMap<String, String>();

    /** Language code -> tagger instance; filled on first use by {@link #getTagger(String)}. */
    private static final Map<String, MaxentTagger> TAGGER_CACHE = new HashMap<String, MaxentTagger>();

    /** Shared morphological analyzer used to derive a lemma from a word and its POS tag. */
    static Morphology morphology = new Morphology();

    static {
        final String modelRoot = AppConfig.getUserDirLoc() + MODEL_DIR;
        TAGGER_MODEL_PATHS.put("en", modelRoot + "wsj-0-18-left3words-distsim.tagger");
        TAGGER_MODEL_PATHS.put("ar", modelRoot + "arabic-fast.tagger");
        TAGGER_MODEL_PATHS.put("ch", modelRoot + "chinese-distsim.tagger");
        TAGGER_MODEL_PATHS.put("fr", modelRoot + "french.tagger");
        TAGGER_MODEL_PATHS.put("de", modelRoot + "german-fast.tagger");
    }

    /**
     * Small manual smoke test: lemmatizes a fixed English sentence and prints
     * the result, first as a list and then one {@code word<TAB>lemma} pair per line.
     *
     * @param args unused
     * @throws ClassNotFoundException declared for backward compatibility of the signature
     * @throws IOException declared for backward compatibility of the signature
     */
    public static void main(String[] args) throws ClassNotFoundException, IOException {
        List<CoreLabel> sent = Sentence.toCoreLabelList("These", "are", "some", "questions");
        final List<TaggedWord> lemmatized = lemmatize(sent, "en");
        System.out.println("Lemmatize::");
        System.out.println(lemmatized);
        for (TaggedWord token : lemmatized) {
            System.out.println(token.word() + "\t" + token.tag());
        }
    }

    /**
     * POS-tags the given sentence and replaces each token's tag with its lemma.
     *
     * <p>Note that in the returned list {@link TaggedWord#tag()} holds the
     * <em>lemma</em> of the word, not its part-of-speech tag; the POS tag is
     * only used internally as input to the morphological analysis.
     *
     * <p>NOTE(review): the same {@link Morphology} instance is applied for every
     * language; it presumably implements English rules only, so lemmas for the
     * other supported languages should be verified.
     *
     * @param sent the tokens of one sentence, in order
     * @param language language code selecting the tagger model
     *        (one of {@code en}, {@code ar}, {@code ch}, {@code fr}, {@code de})
     * @return the tagged tokens produced by the tagger, with each tag overwritten by the lemma
     * @throws IllegalArgumentException if no tagger model is registered for {@code language}
     */
    public static List<TaggedWord> lemmatize(List<CoreLabel> sent, String language) {
        MaxentTagger tagger = getTagger(language);
        List<TaggedWord> taggedSent = tagger.tagSentence(sent);
        for (TaggedWord token : taggedSent) {
            String word = token.word();
            String pos = token.tag();
            String lemma = morphology.lemmatize(new WordTag(word, pos)).lemma();
            // The tag slot is deliberately reused to carry the lemma back to the caller.
            token.setTag(lemma);
        }
        return taggedSent;
    }

    /**
     * Returns the tagger for the given language, loading and caching it on first request.
     *
     * @param language language code to look up
     * @return the cached or newly loaded tagger
     * @throws IllegalArgumentException if no tagger model is registered for {@code language}
     */
    private static MaxentTagger getTagger(String language) {
        MaxentTagger tagger = TAGGER_CACHE.get(language);
        if (tagger == null) {
            String modelPath = TAGGER_MODEL_PATHS.get(language);
            if (modelPath == null) {
                // Fail with a clear message instead of handing a null path to MaxentTagger.
                throw new IllegalArgumentException("Unsupported language '" + language
                        + "'; supported languages: " + TAGGER_MODEL_PATHS.keySet());
            }
            tagger = new MaxentTagger(modelPath);
            TAGGER_CACHE.put(language, tagger);
        }
        return tagger;
    }
}