/** * */ package com.maalaang.omtwitter.tools; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.util.Properties; import org.apache.log4j.Level; import org.apache.log4j.Logger; import com.maalaang.omtwitter.io.LogSystemStream; /** * @author Sangwon Park * */ public class ExtractSampleCorpus { private Properties prop = null; private Logger logger = null; /** * @param args */ public static void main(String[] args) { try { Properties prop = new Properties(); prop.load(new InputStreamReader(new FileInputStream(args[0]), "UTF-8")); LogSystemStream.redirectErrToLog(Level.ERROR); ExtractSampleCorpus con = new ExtractSampleCorpus(prop); con.run(Integer.parseInt(prop.getProperty("sample.window")), Integer.parseInt(prop.getProperty("sample.number"))); } catch (Exception e) { e.printStackTrace(); } } public ExtractSampleCorpus(Properties prop) { this.prop = prop; this.logger = Logger.getLogger(getClass()); } public void run(int sampleWindow, int sampleNumber) throws Exception { String writeFile = prop.getProperty("corpus.file.out") + "." + sampleNumber + "." + sampleWindow; BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(prop.getProperty("corpus.file.in")), "UTF-8")); BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(writeFile), "UTF-8")); String line = null; int i = 0; long cnt = 0; logger.info("extract " + sampleNumber + " tweets in every " + sampleWindow + " tweets - " +prop.getProperty("corpus.file.in")); while ((line = br.readLine()) != null) { if (i == sampleWindow) { i = 0; } if (i++ < sampleNumber) { bw.write(line); bw.write('\n'); cnt++; } } logger.info(cnt + " tweets were extracted - " + writeFile); bw.close(); br.close(); } }