/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package di.uniba.it.tri.tokenizer;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

/**
 * Tokenizer for Twitter-style text. Tokens are produced by
 * {@code Twokenize.tokenize} and then normalised: tokens starting with
 * {@code http://} or {@code https://} are replaced by {@code _URL_}, tokens
 * starting with {@code @} are replaced by {@code _USR_}, and every other token
 * is lower-cased.
 *
 * @author pierpaolo
 */
public class TriTwitterTokenizer implements TriTokenizer {

    /** Placeholder emitted in place of any token that starts with an http(s) scheme. */
    private static final String URL_PLACEHOLDER = "_URL_";

    /** Placeholder emitted in place of any token that starts with {@code @}. */
    private static final String USER_PLACEHOLDER = "_USR_";

    /**
     * Reads the whole content of {@code reader} and tokenizes it.
     * <p>
     * Lines are read until end-of-stream and re-joined with a single
     * {@code '\n'} after each line. The reader is always closed, even when
     * reading fails.
     *
     * @param reader source of the text to tokenize; closed by this method
     * @return the normalised tokens, as returned by {@link #getTokens(String)}
     * @throws IOException if reading from {@code reader} fails
     */
    @Override
    public List<String> getTokens(Reader reader) throws IOException {
        StringBuilder sb = new StringBuilder();
        // try-with-resources guarantees the reader is closed on every path.
        try (BufferedReader br = new BufferedReader(reader)) {
            String line;
            // Loop on readLine() == null rather than ready(): ready() only says
            // whether a read would block, not whether the stream has ended, so
            // it can truncate input or let a null line through as "null".
            while ((line = br.readLine()) != null) {
                sb.append(line);
                sb.append("\n");
            }
        }
        return getTokens(sb.toString());
    }

    /**
     * Tokenizes {@code text} and normalises the resulting tokens.
     *
     * @param text the text to tokenize
     * @return a new list with one entry per token returned by
     *         {@code Twokenize.tokenize}: {@code _URL_} for http(s) links,
     *         {@code _USR_} for tokens starting with {@code @}, otherwise the
     *         lower-cased token
     * @throws IOException declared by the {@code TriTokenizer} contract
     */
    @Override
    public List<String> getTokens(String text) throws IOException {
        List<String> tokens = Twokenize.tokenize(text);
        List<String> newTokens = new ArrayList<>(tokens.size());
        for (String s : tokens) {
            if (s.startsWith("http://") || s.startsWith("https://")) {
                newTokens.add(URL_PLACEHOLDER);
            } else if (s.startsWith("@")) {
                newTokens.add(USER_PLACEHOLDER);
            } else {
                // Locale.ROOT keeps lower-casing independent of the JVM default
                // locale (e.g. Turkish would map "I" to dotless "ı").
                newTokens.add(s.toLowerCase(java.util.Locale.ROOT));
            }
        }
        return newTokens;
    }
}