package edu.tufts.component.tokenizer;
import org.meandre.annotations.Component;
import org.meandre.annotations.ComponentInput;
import org.meandre.annotations.ComponentOutput;
import org.meandre.annotations.Component.FiringPolicy;
import org.meandre.annotations.Component.Licenses;
import org.meandre.annotations.Component.Mode;
import org.meandre.components.abstracts.AbstractExecutableComponent;
import org.meandre.core.ComponentContext;
import org.meandre.core.ComponentContextProperties;
import org.seasr.datatypes.BasicDataTypesTools;
import org.seasr.meandre.components.tools.Names;
import org.seasr.meandre.support.parsers.DataTypeParser;
import opennlp.tools.lang.english.Tokenizer;
/*
*
* VUETokenizer.java
*
* Created on Aug 3, 2009
*
* Copyright 2003-2009 Tufts University Licensed under the
* Educational Community License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.osedu.org/licenses/ECL-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an "AS IS"
* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
/**
 * Meandre component that breaks incoming text into word tokens.
 *
 * <p>All strings received on the text input port are joined with single
 * spaces and then split on runs of non-word characters (the regex class
 * {@code \W}, i.e. anything other than ASCII letters, digits and
 * underscore). The resulting tokens are pushed on the tokens output port.
 */
@Component(
        name = "VUETokenizer",
        creator = "Anoop Kumar",
        baseURL = "meandre://vue.tufts.edu/tokenizer",
        firingPolicy = FiringPolicy.all,
        mode = Mode.compute,
        rights = Licenses.UofINCSA,
        tags = "semantic, tools, text, opennlp, tokenizer",
        description = "This component breaks a document into tokens v1.0.1 "
)
public class VUETokenizer extends AbstractExecutableComponent {

    //------------------------------ INPUTS ------------------------------------------------------

    @ComponentInput(
            name = Names.PORT_TEXT,
            description = "The text to be tokenized"
    )
    protected static final String IN_TEXT = Names.PORT_TEXT;

    //------------------------------ OUTPUTS -----------------------------------------------------

    @ComponentOutput(
            name = Names.PORT_TOKENS,
            description = "The sequence of tokens"
    )
    protected static final String OUT_TOKENS = Names.PORT_TOKENS;

    /**
     * Nothing to set up: the component keeps no state between executions.
     *
     * @param cc the component context properties (unused)
     * @throws Exception never thrown by this implementation
     */
    @Override
    public void initializeCallBack(ComponentContextProperties cc)
            throws Exception {
    }

    /**
     * Reads the text input, tokenizes it, logs the token count to the
     * output console and pushes the tokens to the output port.
     *
     * @param cc the execution context used to read the input and push the output
     * @throws Exception if reading the input or pushing the output fails
     */
    @Override
    public void executeCallBack(ComponentContext cc) throws Exception {
        String[] inputs = DataTypeParser.parseAsString(cc.getDataComponentFromInput(IN_TEXT));

        // Join all input strings with a space so that words at the boundary
        // of two inputs are not glued together.
        StringBuilder sb = new StringBuilder();
        for (String text : inputs) {
            sb.append(text).append(" ");
        }

        String[] ta = tokenize(sb.toString());
        cc.getOutputConsole().println("[INFO]Tokenized: " + ta.length);
        cc.pushDataComponentToOutput(OUT_TOKENS, BasicDataTypesTools.stringToStrings(ta));
    }

    /**
     * Nothing to release: the component holds no resources.
     *
     * @param arg0 the component context properties (unused)
     * @throws Exception never thrown by this implementation
     */
    @Override
    public void disposeCallBack(ComponentContextProperties arg0)
            throws Exception {
    }

    /**
     * Splits {@code text} into non-empty tokens on runs of non-word characters.
     *
     * <p>Splitting on a single {@code \W} would yield an empty token between
     * every pair of adjacent delimiters (for example the ", " in
     * "Hello, world"); splitting on {@code \W+} avoids that.
     *
     * @param text the text to tokenize
     * @return the tokens in order of appearance; empty if {@code text}
     *         contains no word characters
     */
    private static String[] tokenize(String text) {
        String[] parts = text.split("\\W+");

        // String.split already discards trailing empty strings, so the only
        // empty element that can remain is a leading one (text that starts
        // with a delimiter, or text that is empty).
        int start = (parts.length > 0 && parts[0].length() == 0) ? 1 : 0;
        if (start == 0) {
            return parts;
        }
        return java.util.Arrays.copyOfRange(parts, start, parts.length);
    }
}