package jhazm.reader;
import edu.stanford.nlp.ling.TaggedWord;
import jhazm.tokenizer.WordTokenizer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
 * Static helpers for Peykare-corpus tagged sentences: mapping a token's tag
 * list to a coarse part-of-speech tag and joining multi-word verbs into a
 * single token.
 *
 * Created by Mojtaba on 30/10/2015.
 */
public class PeykareReader {
    /**
     * Coarse POS tags of Peykare corpus.
     */
    private static final List<String> COARSE_POS_TAGS = Collections.unmodifiableList(Arrays.asList(
            "N",     // Noun
            "V",     // Verb
            "AJ",    // Adjective
            "ADV",   // Adverb
            "PRO",   // Pronoun
            "DET",   // Determiner
            "P",     // Preposition
            "POSTP", // Postposition
            "NUM",   // Number
            "CONJ",  // Conjunction
            "PUNC",  // Punctuation
            "RES",   // Residual
            "CL",    // Classifier
            "INT"    // Interjection
    ));

    /** Tokenizer supplying the verb lookup collections; {@code null} if its construction failed. */
    private static final WordTokenizer tokenizer;

    /** The exception raised while constructing {@link #tokenizer}, or {@code null} on success. */
    private static final IOException tokenizerFailure;

    static {
        WordTokenizer created = null;
        IOException failure = null;
        try {
            created = new WordTokenizer();
        } catch (IOException e) {
            // Keep the class usable (coarsePOS does not need the tokenizer), but
            // remember the cause so joinVerbParts can report it instead of an NPE.
            failure = e;
        }
        tokenizer = created;
        tokenizerFailure = failure;
    }

    /**
     * Picks the coarse POS tag for a token from its list of tags.
     *
     * The result is the first element of {@code tags} that is one of the coarse
     * Peykare tags, or {@code "N"} when none matches. If {@code tags} contains
     * {@code "EZ"}, an {@code "e"} suffix is appended to the result.
     *
     * @param tags the tags attached to a token; may be {@code null}
     * @return the coarse tag, optionally suffixed with {@code "e"};
     *         {@code "N"} when {@code tags} is {@code null}
     */
    public static String coarsePOS(List<String> tags) {
        if (tags == null) {
            return "N";
        }
        String result = "N";
        for (String tag : tags) {
            if (COARSE_POS_TAGS.contains(tag)) {
                result = tag;
                break;
            }
        }
        if (tags.contains("EZ")) {
            result += "e";
        }
        return result;
    }

    /**
     * Join verb parts like Dadedgan corpus.
     * Input:
     * دیده/ADJ_INO
     * شد/V_PA
     * Output:
     * دیده شد/V_PA
     *
     * The sentence is scanned from its last token to its first. A token is merged
     * into the token that follows it when its word is in the tokenizer's
     * before-verbs collection, or when the following word is in the after-verbs
     * collection and its own word is in the verbs collection. The merged token
     * keeps the tag of the following token.
     *
     * Neither the given list nor its {@code TaggedWord} elements are modified;
     * merged tokens are returned as new {@code TaggedWord} instances, while
     * unmerged tokens are returned as the same instances that were passed in.
     *
     * @param sentence the tagged tokens of one sentence, in reading order
     * @return a new list with multi-word verbs merged, in reading order
     * @throws IllegalStateException if the tokenizer could not be initialized
     */
    public static List<TaggedWord> joinVerbParts(List<TaggedWord> sentence) {
        WordTokenizer wordTokenizer = requireTokenizer();

        // Work on a private reversed copy so the caller's list keeps its order.
        List<TaggedWord> reversed = new ArrayList<>(sentence);
        Collections.reverse(reversed);

        List<TaggedWord> result = new ArrayList<>(reversed.size());
        for (TaggedWord current : reversed) {
            if (!result.isEmpty()) {
                int lastIndex = result.size() - 1;
                // The token that comes right after `current` in reading order
                // (possibly already a merged verb).
                TaggedWord following = result.get(lastIndex);
                boolean joinsWithFollowing =
                        wordTokenizer.getBeforeVerbs().contains(current.word())
                        || (wordTokenizer.getAfterVerbs().contains(following.word())
                            && wordTokenizer.getVerbs().contains(current.word()));
                if (joinsWithFollowing) {
                    // Replace rather than mutate, so the caller's token is left intact.
                    result.set(lastIndex,
                            new TaggedWord(current.word() + " " + following.word(), following.tag()));
                    continue;
                }
            }
            // Either a regular token, or the sentence-final token, which has
            // nothing after it to merge with and is therefore kept as-is.
            result.add(current);
        }

        Collections.reverse(result);
        return result;
    }

    /**
     * Returns the shared tokenizer, or fails with the original construction error.
     */
    private static WordTokenizer requireTokenizer() {
        if (tokenizer == null) {
            throw new IllegalStateException(
                    "WordTokenizer could not be initialized", tokenizerFailure);
        }
        return tokenizer;
    }
}