/**
* KOSHIK is an NLP framework for large scale processing using Hadoop.
* Copyright © 2014 Peter Exner
*
* This file is part of KOSHIK.
*
* KOSHIK is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* KOSHIK is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with KOSHIK. If not, see <http://www.gnu.org/licenses/>.
*/

package se.lth.cs.koshik.analysis.stagger;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.util.ArrayList;

import se.lth.cs.koshik.analysis.ContentProcessor;
import se.lth.cs.koshik.input.conll.CoNLLFeature;
import se.lth.cs.koshik.model.Document;
import se.lth.cs.koshik.model.text.Sentence;

import se.su.ling.stagger.SwedishTokenizer;
import se.su.ling.stagger.TaggedToken;
import se.su.ling.stagger.Tagger;
import se.su.ling.stagger.Token;
import se.su.ling.stagger.Tokenizer;

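/**
 * ContentProcessor that runs the Stagger tokenizer, part-of-speech tagger and
 * lemmatizer for Swedish over a document, adding sentence and token
 * annotations with CoNLL-style features.
 */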
public class StaggerProcessor implements ContentProcessor {
    private Tagger tagger;

    public StaggerProcessor() {
        try {
            // Load the serialized Swedish Stagger model and enable its extended lexicon.
            ObjectInputStream modelReader = new ObjectInputStream(
                    new FileInputStream("./model.zip/model/sv/swedish.bin"));
            tagger = (Tagger) modelReader.readObject();
            modelReader.close();
            tagger.setExtendLexicon(true);
        } catch (Exception e) {
            // Fail fast: the processor cannot run without a loaded model.
            throw new RuntimeException("Failed to load Stagger model", e);
        }
    }
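
    /**
     * Tokenizes the document content into sentences, tags each sentence with
     * Stagger, and adds Sentence and Token annotations carrying CoNLL ID, FORM,
     * LEMMA, PPOS and PFEAT features together with character offsets.
     */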
    @Override
    public void process(Document document) throws Exception {
        // Read the document content as UTF-8 and hand it to the Swedish tokenizer.
        BufferedReader reader = new BufferedReader(new InputStreamReader(
                new ByteArrayInputStream(document.getContent().getBytes("UTF-8"))));
        Tokenizer tokenizer = new SwedishTokenizer(reader);

        ArrayList<Token> sentence;
        int sentIdx = 0;
        while ((sentence = tokenizer.readSentence()) != null) {
            // Wrap each token with an identifier of the form "sentenceIndex:offset".
            TaggedToken[] sent = new TaggedToken[sentence.size()];
            for (int j = 0; j < sentence.size(); j++) {
                Token tok = sentence.get(j);
                String id = sentIdx + ":" + tok.offset;
                sent[j] = new TaggedToken(tok, id);
            }

            // Run the tagger over the sentence.
            TaggedToken[] taggedSent = tagger.tagSentence(sent, true, false);

            // Create a sentence annotation spanning the first to the last token.
            if (taggedSent.length > 0) {
                int lastIndex = taggedSent.length - 1;
                int begin = taggedSent[0].token.offset;
                int end = taggedSent[lastIndex].token.offset + taggedSent[lastIndex].token.value.length();
                Sentence koshikSentence = new Sentence(document);
                koshikSentence.setBegin(begin);
                koshikSentence.setEnd(end);
            }
            // Create a token annotation for every tagged token, using 1-based
            // CoNLL token IDs within the sentence.
            int tokenIdx = 1;
            for (TaggedToken token : taggedSent) {
                String lemma = token.lf;
                String form = token.token.value;
                String posTag = tagger.getTaggedData().getPosTagSet().getTagName(token.posTag);
                int begin = token.token.offset;
                int end = begin + form.length();

                se.lth.cs.koshik.model.text.Token koshikToken = new se.lth.cs.koshik.model.text.Token(document);
                koshikToken.setFeature(CoNLLFeature.ID, String.valueOf(tokenIdx));
                koshikToken.setFeature(CoNLLFeature.FORM, form);
                koshikToken.setFeature(CoNLLFeature.LEMMA, lemma);

                // Stagger tag names are "|"-separated: the first part is the
                // part-of-speech tag, the remainder the morphological features.
                String[] posTagParts = posTag.split("\\|");
                String predictedPosTag = posTagParts[0];
                String predictedFeats;
                if (posTagParts.length > 1) {
                    predictedFeats = posTag.substring(predictedPosTag.length() + 1);
                } else {
                    // No features present; fall back to the bare tag.
                    predictedFeats = predictedPosTag;
                }
                koshikToken.setFeature(CoNLLFeature.PPOS, predictedPosTag);
                koshikToken.setFeature(CoNLLFeature.PFEAT, predictedFeats);
                koshikToken.setBegin(begin);
                koshikToken.setEnd(end);
                tokenIdx++;
            }
            sentIdx++;
        }

        // Close the tokenizer and its underlying reader.
        tokenizer.yyclose();
    }
}