/**
* Copyright (C) 2012 cogroo <cogroo@cogroo.org>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.cogroo.tools.checker;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import opennlp.tools.util.Span;
import org.apache.log4j.Logger;
import org.cogroo.entities.Chunk;
import org.cogroo.entities.SyntacticChunk;
import org.cogroo.entities.impl.ChunkCogroo;
import org.cogroo.entities.impl.ChunkTag;
import org.cogroo.entities.impl.MorphologicalTag;
import org.cogroo.entities.impl.SyntacticTag;
import org.cogroo.entities.impl.TokenCogroo;
import org.cogroo.interpreters.FlorestaTagInterpreter;
import org.cogroo.interpreters.TagInterpreter;
import org.cogroo.text.Sentence;
import org.cogroo.text.Token;
import org.cogroo.tools.checker.rules.dictionary.CogrooTagDictionary;
import org.cogroo.tools.checker.rules.dictionary.TagDictionary;
import org.cogroo.tools.checker.rules.model.TagMask.SyntacticFunction;
/**
 * Adapts a text-model {@link Sentence} (org.cogroo.text) into the typed
 * entity model ({@link org.cogroo.entities.Sentence}) consumed by the
 * rule checkers: tokens, chunks and syntactic chunks are all converted.
 */
public class SentenceAdapter {
// Dictionary used for lemma (primitive) lookup and POS-tag generalization.
private TagDictionary td;
// Interprets corpus tag strings (Floresta tag set) into typed tag objects.
private TagInterpreter ti = new FlorestaTagInterpreter();
private static final Logger LOGGER = Logger.getLogger(SentenceAdapter.class);
// Converts text-model chunks into typed chunks.
private ChunkerConverter chunkerConverter;
// Converts text-model syntactic chunks into typed syntactic chunks.
private SyntacticChunkConverter syntacticChunkerConverter;
/**
 * Creates an adapter backed by the given tag dictionary.
 *
 * @param td dictionary used to resolve primitives and generalize POS tags
 */
public SentenceAdapter(TagDictionary td) {
this.td = td;
this.chunkerConverter = new ChunkerConverter(ti);
this.syntacticChunkerConverter = new SyntacticChunkConverter(ti);
}
/**
 * Converts a text-model {@link Sentence} into the typed entity model.
 * Copies span/offset information, converts every token (morphological tag,
 * primitive and generalized tags), then delegates chunk and syntactic-chunk
 * conversion to the two converters.
 *
 * @param sentence the text-model sentence to convert
 * @param document the full document text the sentence belongs to
 * @return a typed sentence with tokens, chunks and syntactic chunks set
 */
public org.cogroo.entities.Sentence asTypedSentence(Sentence sentence, String document) {
  org.cogroo.entities.Sentence typedSentence = new org.cogroo.entities.Sentence();
  typedSentence.setTextSentence(sentence);
  typedSentence.setDocumentText(document);
  typedSentence.setOffset(sentence.getStart());
  typedSentence.setSpan(new Span(sentence.getStart(), sentence.getEnd()));

  List<org.cogroo.entities.Token> typedTokenList =
      new ArrayList<org.cogroo.entities.Token>();
  for (Token token : sentence.getTokens()) {
    org.cogroo.entities.Token typedToken =
        new TokenCogroo(new Span(token.getStart(), token.getEnd()));
    typedToken.setLexeme(token.getLexeme());
    typedToken.setMorphologicalTag(createMorphologicalTag(token));
    setPrimitiveAndGeneralize(typedToken, td);
    typedTokenList.add(typedToken);
  }
  typedSentence.setTokens(Collections.unmodifiableList(typedTokenList));

  chunkerConverter.convertChunks(sentence, typedSentence);
  syntacticChunkerConverter.convertChunks(sentence, typedSentence);

  // Single debug guard; the original nested a second, identical
  // isDebugEnabled() check inside this one to no effect.
  if (LOGGER.isDebugEnabled()) {
    LOGGER.debug("Typed sentence: ");
    LOGGER.debug(buildDebugTrace(typedSentence));
  }
  return typedSentence;
}

/** Builds a human-readable trace of tokens, syntactic chunks and chunks. */
private static String buildDebugTrace(org.cogroo.entities.Sentence typedSentence) {
  StringBuilder trace = new StringBuilder();
  trace.append("Show tree [").append(typedSentence.getSentence()).append("]: \n");
  for (org.cogroo.entities.Token token : typedSentence.getTokens()) {
    trace.append("\t[")
        .append(token.getSyntacticTag()).append("][")
        .append(token.getChunkTag()).append("] (ck: ")
        .append(token.getChunk().getMorphologicalTag()).append(") ")
        .append(token).append(" --> {")
        .append(token.getPrimitive()).append("}_")
        .append(token.getMorphologicalTag())
        .append("\n");
  }
  trace.append("Syntactic Elements:\n");
  for (SyntacticChunk schunk : typedSentence.getSyntacticChunks()) {
    trace.append("\t").append(schunk).append("\n");
  }
  trace.append("Chunks:\n");
  for (Chunk chunk : typedSentence.getChunks()) {
    trace.append("\t").append(chunk).append("\n");
  }
  trace.append("\n\nAs syntactic tree: ").append(typedSentence.getSyntaxTree()).append("\n");
  return trace.toString();
}
/**
 * Builds a {@link MorphologicalTag} from a token's POS tag and features.
 * A features value of "-" means "no features", in which case only the bare
 * POS tag is parsed; otherwise the tag is "POS=features".
 */
private MorphologicalTag createMorphologicalTag(Token token) {
  final String features = token.getFeatures();
  final String rawTag = "-".equals(features)
      ? token.getPOSTag()
      : token.getPOSTag() + "=" + features;
  return ti.parseMorphologicalTag(rawTag);
}
/**
 * Generalizes the token's POS tag against the dictionary entries for its
 * lexeme, then resolves its primitive (lemma): an exact-lexeme lookup
 * first, a lowercase retry next, and finally the surface form itself when
 * the dictionary knows no lemma. The token is mutated in place.
 *
 * @param tok token whose tag and primitive are updated
 * @param dict dictionary used for tag and lemma lookup
 */
public static void setPrimitiveAndGeneralize(
    org.cogroo.entities.Token tok, CogrooTagDictionary dict) {
  // Generalize first, since the lemma lookups below use the (possibly
  // updated) morphological tag.
  Merger.generalizePOSTags(tok.getMorphologicalTag(),
      dict.getTags(tok.getLexeme(), false));

  // Exact lexeme lookup, then a lowercase retry.
  String[] lemmas = dict.getPrimitive(tok.getLexeme(),
      tok.getMorphologicalTag(), true);
  if (lemmas == null) {
    lemmas = dict.getPrimitive(tok.getLexeme().toLowerCase(),
        tok.getMorphologicalTag(), true);
  }

  if (lemmas != null) {
    tok.setPrimitive(lemmas);
  } else {
    // Nothing found: fall back to the surface form as its own lemma.
    if (LOGGER.isDebugEnabled()) {
      LOGGER.debug("Missing lemma for: " + tok);
    }
    tok.setPrimitive(new String[] { tok.getLexeme() });
  }
}
/**
 * Converts text-model syntactic chunks into typed {@link SyntacticChunk}s
 * and back-links each resulting chunk to the tokens it covers.
 */
private static class SyntacticChunkConverter {
// Interprets corpus syntactic tag strings into typed syntactic tags.
private final TagInterpreter corpusTagInterpreter;
public SyntacticChunkConverter(TagInterpreter corpusTagInterpreter) {
this.corpusTagInterpreter = corpusTagInterpreter;
}
/**
 * Populates {@code typedSentence} with typed syntactic chunks derived from
 * {@code sentence}. Tokens not covered by any syntactic chunk are wrapped
 * in a NONE-tagged singleton chunk; if the sentence carries no syntactic
 * chunks at all, per-token "O" chunks are created instead.
 * NOTE: must run after ChunkerConverter, since it reads token.getChunk().
 */
public void convertChunks(Sentence sentence,
org.cogroo.entities.Sentence typedSentence) {
if(sentence.getSyntacticChunks() == null) {
createFakeChunks(typedSentence);
return;
}
// Index of the first token not yet covered by a converted chunk.
int lastToken = 0;
List<SyntacticChunk> typedSyntacticChunks = new ArrayList<SyntacticChunk>();
List<org.cogroo.entities.Token> typedTokens = typedSentence.getTokens();
for (org.cogroo.text.SyntacticChunk syntacticChunk : sentence
.getSyntacticChunks()) {
int start = syntacticChunk.getStart();
int end = syntacticChunk.getEnd();
// Fill the gap before this chunk with NONE syntactic chunks.
for (int i = lastToken; i < start; i++) {
typedSyntacticChunks
.add(createNoneSyntacticChunk(typedTokens.get(i)));
}
lastToken = end;
List<Chunk> typedChunks = new ArrayList<Chunk>();
// search for the chunk...
// Collect the distinct consecutive typed chunks covered by this span
// (adjacent tokens often share the same Chunk object).
for (int i = start; i < end; i++) {
Chunk tc = typedTokens.get(i).getChunk();
if (typedChunks.size() == 0
|| !typedChunks.get(typedChunks.size() - 1).equals(tc))
typedChunks.add(tc);
}
SyntacticChunk typedSyntacticChunk = new SyntacticChunk(typedChunks);
typedSyntacticChunk.setSyntacticTag(corpusTagInterpreter
.parseSyntacticTag(syntacticChunk.getTag()));
// Back-link every covered token to its syntactic chunk.
for (int i = start; i < end; i++) {
typedTokens.get(i).setSyntacticChunk(typedSyntacticChunk);
}
typedSyntacticChunks.add(typedSyntacticChunk);
}
// leftovers
// Tokens after the last syntactic chunk also get NONE chunks.
for (int i = lastToken; i < typedTokens.size(); i++) {
typedSyntacticChunks.add(createNoneSyntacticChunk(typedTokens.get(i)));
}
if(LOGGER.isDebugEnabled()) {
StringBuilder sb = new StringBuilder();
sb.append("Typed syntatic chunks:\n");
for (SyntacticChunk chunk : typedSyntacticChunks) {
sb.append("  ");
for (org.cogroo.entities.Token token : chunk.getTokens()) {
sb.append(token.getLexeme()).append(" ");
}
sb.append("\n   MT: ").append(chunk.getMorphologicalTag()).append("\n");
}
LOGGER.debug(sb.toString());
}
typedSentence.setSyntacticChunks(Collections
.unmodifiableList(typedSyntacticChunks));
}
/**
 * Used when the sentence has no syntactic chunk annotation: gives every
 * token its own syntactic chunk tagged "O" (outside).
 */
private void createFakeChunks(org.cogroo.entities.Sentence typedSentence) {
List<SyntacticChunk> sc = new ArrayList<SyntacticChunk>();
for (org.cogroo.entities.Token token : typedSentence.getTokens()) {
SyntacticChunk chunk = new SyntacticChunk(Collections.singletonList(token.getChunk()));
chunk.setSyntacticTag(corpusTagInterpreter.parseSyntacticTag("O"));
token.setSyntacticChunk(chunk);
sc.add(chunk);
}
typedSentence.setSyntacticChunks(sc);
}
/**
 * Wraps a single uncovered token in a syntactic chunk whose function is
 * {@link SyntacticFunction#NONE}, and back-links the token to it.
 */
private SyntacticChunk createNoneSyntacticChunk(
org.cogroo.entities.Token token) {
SyntacticChunk noneTypedSyntacticChunk = new SyntacticChunk(
Collections.singletonList(token.getChunk()));
SyntacticTag st = new SyntacticTag();
st.setSyntacticFunction(SyntacticFunction.NONE);
noneTypedSyntacticChunk.setSyntacticTag(st);
token.setSyntacticChunk(noneTypedSyntacticChunk);
return noneTypedSyntacticChunk;
}
}
/**
 * Converts text-model chunks into typed {@link Chunk}s, sets each token's
 * chunk tag, and back-links every token to the chunk that contains it.
 */
private static class ChunkerConverter {
// Interprets corpus chunk tag strings into typed chunk tags.
private final TagInterpreter corpusTagInterpreter;
public ChunkerConverter(TagInterpreter corpusTagInterpreter) {
this.corpusTagInterpreter = corpusTagInterpreter;
}
/**
 * Populates {@code typedSentence} with typed chunks derived from
 * {@code sentence}. Each chunk's morphological tag is taken from its head
 * token; tokens left without a chunk get a singleton fallback chunk. If
 * the sentence has no chunk annotation, per-token "O" chunks are created.
 */
public void convertChunks(Sentence sentence, org.cogroo.entities.Sentence typedSentence) {
if(sentence.getChunks() == null) {
createFakeChunks(typedSentence);
return;
}
// First pass: copy each token's chunk tag into the typed token.
List<org.cogroo.entities.Token> typedTokens = typedSentence.getTokens();
for (int i = 0; i < sentence.getTokens().size(); i++) {
Token textToken = sentence.getTokens().get(i);
org.cogroo.entities.Token typedToken = typedTokens.get(i);
ChunkTag tag = corpusTagInterpreter.parseChunkTag(textToken.getChunkTag());
typedToken.setChunkTag(tag);
}
List<Chunk> chunks = new ArrayList<Chunk>(sentence.getChunks().size());
int head;
for (org.cogroo.text.Chunk textChunk : sentence.getChunks()) {
// Prefer the annotated head; fall back to the chunk's first token.
if(textChunk.getHeadIndex() != -1) {
head = textChunk.getHeadIndex();
} else {
head = textChunk.getStart();
}
// try changing the chunkTag
// Retry the head token's tag with a "*" suffix (head marker in the
// corpus tag format); keep the original tag if that fails to parse.
ChunkTag ctag = corpusTagInterpreter.parseChunkTag(sentence.getTokens().get(head).getChunkTag() + "*");
if(ctag != null) {
typedTokens.get(head).setChunkTag(ctag);
}
// The chunk inherits a copy of the head token's morphological tag.
MorphologicalTag tag = typedTokens.get(head).getMorphologicalTag().clone();
List<org.cogroo.entities.Token> tokens = new ArrayList<org.cogroo.entities.Token>();
for (int i = textChunk.getStart(); i < textChunk.getEnd(); i++) {
tokens.add(typedTokens.get(i));
}
Chunk typedChunk = new ChunkCogroo(tokens, textChunk.getStart());
typedChunk.setType(textChunk.getTag());
// Back-link every covered token to its chunk.
for (org.cogroo.entities.Token token : tokens) {
token.setChunk(typedChunk);
}
typedChunk.setMorphologicalTag(tag);
chunks.add(typedChunk);
}
// Tokens outside every chunk get a singleton fallback chunk.
// NOTE(review): the fallback uses 0 as the first-token index regardless
// of the token's actual position (contrast index++ in createFakeChunks)
// — confirm this is intentional.
for (org.cogroo.entities.Token token : typedTokens) {
if(token.getChunk() == null) {
Chunk c = new ChunkCogroo(Collections.singletonList(token), 0);
c.setMorphologicalTag(token.getMorphologicalTag().clone());
token.setChunk(c);
}
}
if(LOGGER.isDebugEnabled()) {
StringBuilder sb = new StringBuilder();
sb.append("Typed chunks:\n");
for (Chunk chunk : chunks) {
sb.append("  ");
for (org.cogroo.entities.Token t : chunk.getTokens()) {
sb.append(t.getLexeme()).append(" ");
}
sb.append("\n -- MT: " + chunk.getMorphologicalTag() + "\n");
}
LOGGER.debug(sb.toString());
}
typedSentence.setChunks(chunks);
}
/**
 * Used when the sentence has no chunk annotation: gives every token its
 * own chunk tagged "O" (outside), carrying the token's morphological tag.
 */
private void createFakeChunks(org.cogroo.entities.Sentence typedSentence) {
int index = 0;
List<Chunk> cl = new ArrayList<Chunk>();
for (org.cogroo.entities.Token token : typedSentence.getTokens()) {
token.setChunkTag(corpusTagInterpreter.parseChunkTag("O"));
ChunkCogroo chunk = new ChunkCogroo(Collections.singletonList(token), index++);
chunk.setMorphologicalTag(token.getMorphologicalTag().clone());
token.setChunk(chunk);
cl.add(chunk);
}
typedSentence.setChunks(cl);
}
}
}