package org.societies.orchestration.cpa.test.util; import com.google.gson.Gson; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.societies.activity.model.Activity; import org.societies.api.activity.IActivity; import org.societies.orchestration.cpa.impl.comparison.util.LanguageUtil; import java.io.*; import java.util.ArrayList; import java.util.List; import java.util.zip.GZIPInputStream; /** * Created with IntelliJ IDEA. * User: epic * Date: 08/07/13 * Time: 14:40 */ public class Tweet2011Extractor { protected static Logger LOG = LoggerFactory.getLogger(Tweet2011Extractor.class); public static ArrayList<Status> readJsonGzFile(String file){ LOG.info("reading tweets from file: \""+file+"\""); ArrayList<Status> ret = new ArrayList<Status>(); GZIPInputStream gzipInputStream = null; try { gzipInputStream = new GZIPInputStream(new FileInputStream(new File(file))); } catch (IOException e) { LOG.error("could not open file: ",e); } if(gzipInputStream!=null){ BufferedReader reader = new BufferedReader(new InputStreamReader(gzipInputStream)); String strLine; int count = 0; try { while ((strLine = reader.readLine()) != null) { ret.add(splitLine(strLine)); count++; } } catch (IOException e) { LOG.error("Error reading gzip file: ",e); } LOG.info("read "+count+" lines form gzip file"); } return ret; } public static List<IActivity> convertToActivities(ArrayList<Status> statuses){ ArrayList<IActivity> ret = new ArrayList<IActivity>(); Activity a = null; ArrayList<String> targets = null; for(Status s : statuses){ targets = findTargets(s); for(String target : targets){ a = new Activity(); a.setActor(s.getScreenName()); a.setObject(s.getText()); a.setTarget(target); //a.setPublished(); TODO: convert json date format to epoch long string. ret.add(a); } } return ret; } public static ArrayList<String> findTargets(Status s){ String workString = s.getText(); ArrayList<String> ret = new ArrayList<String>(); String[] words = s.getText().split(" "); for(String word : words){ if(word.charAt(0) == '@'){ ret.add(word.substring(word.indexOf("@")+1)); } } /* while(workString.contains("@")){ ret.add(workString.substring(workString.indexOf("@")+1).split(" ")[0]); workString = workString.substring(workString.indexOf("@")+1,workString.indexOf(" ")); workString = workString.trim(); }*/ return ret; } public static List<IActivity> actsFromGzJson(String filename){ return convertToActivities(readJsonGzFile(filename)); } /** * Should return {id,create_at,screen_name,text} * @param json * @return */ private static Status splitLine(String json){ Gson gson = new Gson(); Status ret = gson.fromJson(json,Status.class); return ret; } public static void main(String args[]){ //Status status= splitLine("{\"id\":33246488821903360,\"screenName\":\"TaylorBormann\",\"createdAt\":\"11:32 AM - 3 Feb 11\",\"text\":\"@ltrain_ oh there is I promise haha\"}"); Status status = readJsonGzFile(args[0]).get(0); LOG.info("status "+status.getCreatedAt()+" - "+status.getScreenName()+ " : "+status.getText()); } public static boolean isEnglish(String text){ return LanguageUtil.isEnglish(text); } }