package edu.harvard.wcfia.yoshikoder; import java.awt.Desktop; import java.awt.event.ActionEvent; import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.logging.Logger; import javax.swing.JFileChooser; import javax.swing.JOptionPane; import javax.swing.filechooser.FileFilter; import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import edu.harvard.wcfia.yoshikoder.dictionary.Node; import edu.harvard.wcfia.yoshikoder.dictionary.YKDictionary; import edu.harvard.wcfia.yoshikoder.document.DocumentList; import edu.harvard.wcfia.yoshikoder.document.DocumentListImpl; import edu.harvard.wcfia.yoshikoder.document.YKDocument; import edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenList; import edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenizationCache; import edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenizationException; import edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenizationService; import edu.harvard.wcfia.yoshikoder.reporting.EntryFrequencyMap; import edu.harvard.wcfia.yoshikoder.reporting.UnifiedDocumentFrequencyReport; import edu.harvard.wcfia.yoshikoder.reporting.WordFrequencyMap; import edu.harvard.wcfia.yoshikoder.reporting.YKReport; import edu.harvard.wcfia.yoshikoder.ui.dialog.YKReportDialog; import edu.harvard.wcfia.yoshikoder.util.DialogUtil; import edu.harvard.wcfia.yoshikoder.util.DialogWorker; import edu.harvard.wcfia.yoshikoder.util.FileUtil; import edu.harvard.wcfia.yoshikoder.util.TaskWorker; public class UnifiedWordFrequencyReportAction extends YoshikoderAction { private static final Logger log = Logger.getLogger("edu.harvard.wcfia.yoshikoder.UnifiedFrequencyReportAction"); protected JFileChooser chooser = new JFileChooser(); FileFilter csvutf8 = new FileFilter() { @Override public String getDescription() { return "CSV (UTF-8 encoded)"; } @Override public boolean accept(File f) { return f.isDirectory(); } }; FileFilter excel = new FileFilter() { @Override public String getDescription() { return "MS Excel"; } @Override public boolean accept(File f) { return f.isDirectory(); } }; public UnifiedWordFrequencyReportAction(Yoshikoder yk) { super(yk, UnifiedWordFrequencyReportAction.class.getName()); chooser = new JFileChooser(); chooser.removeChoosableFileFilter(chooser.getAcceptAllFileFilter()); chooser.addChoosableFileFilter(csvutf8); // chooser.addChoosableFileFilter(excel); // NOTE: Excel is never used! chooser.setFileFilter(csvutf8); } // first pass to get vocab protected List<String> getVocab(List<YKDocument> docs) throws IOException, TokenizationException { Set<String> vocab = new HashSet<String>(); TokenizationCache tcache = yoshikoder.getTokenizationCache(); for (YKDocument doc : docs) { TokenList tl = tcache.getTokenList(doc); if (tl == null){ tl = TokenizationService.getTokenizationService().tokenize(doc); tcache.putTokenList(doc, tl); } WordFrequencyMap map = new WordFrequencyMap(tl); vocab.addAll(map.getVocabularyList()); } List<String> list = new ArrayList<String>(); list.addAll(vocab); return list; } // second pass to push out counts protected void pushOutCountsCSVUtf8(List<YKDocument> docs, Writer writer) throws IOException, TokenizationException { List<String> vocab = getVocab(docs); Collections.sort(vocab); // alphabetical // write header for (String word : vocab) writer.write("," + FileUtil.escapeForCsv(word)); writer.write(",Total\n"); TokenizationCache tcache = yoshikoder.getTokenizationCache(); for (YKDocument doc : docs) { TokenList tl = tcache.getTokenList(doc); if (tl == null){ tl = TokenizationService.getTokenizationService().tokenize(doc); tcache.putTokenList(doc, tl); } WordFrequencyMap map = new WordFrequencyMap(tl); writer.write( FileUtil.escapeForCsv(doc.getTitle()) ); for (String vocabWord: vocab) { Integer count = map.getWordCount(vocabWord); if (count == null) writer.write(",0"); else writer.write("," + count.toString()); } writer.write("," + map.getTotal() + "\n"); } // something else should close the file } protected void writeCsvUTF8(List<YKDocument> documents, File file) throws Exception { final List<YKDocument> docs = documents; final BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), Charset.forName("UTF8"))); tworker = new TaskWorker(yoshikoder){ protected void doWork() throws Exception { pushOutCountsCSVUtf8(docs, writer); } protected void onError() { try { writer.close(); } catch (Exception ex){ log.info("could not close the CSV file"); } if (e instanceof TokenizationException){ DialogUtil.yelp(yoshikoder, "Tokenization Error", e); } else if (e instanceof IOException){ DialogUtil.yelp(yoshikoder, "Input/Ouput Error", e); } else { DialogUtil.yelp(yoshikoder, "Error", e); } } @Override protected void onSuccess() { try { writer.close(); } catch (Exception ex){ log.info("could not close the CSV file"); } // dont ask because it won't work cross platform /* int resp = JOptionPane.showConfirmDialog(yoshikoder, "Open report file?", "Open report", JOptionPane.YES_NO_OPTION); if (resp == JOptionPane.YES_OPTION){ try { Desktop.getDesktop().open(outputFile); } catch (Exception ex){ ex.printStackTrace(); } } */ } }; tworker.start(); } // second pass to push out counts protected void pushOutCountsExcel(List<YKDocument> docs, FileOutputStream stream) throws IOException, TokenizationException { List<String> vocab = getVocab(docs); Collections.sort(vocab); // alphabetical HSSFWorkbook wb = new HSSFWorkbook(); HSSFRow row; HSSFCell cell; HSSFSheet sheet = wb.createSheet("Word frequencies"); // header row = sheet.createRow((short)0); for (int ii = 0; ii < vocab.size(); ii++) { cell = row.createCell((short)(ii+1)); cell.setEncoding(HSSFCell.ENCODING_UTF_16); cell.setCellValue(FileUtil.escapeForCsv(vocab.get(ii))); } cell = row.createCell((short)(vocab.size()+1)); cell.setEncoding(HSSFCell.ENCODING_UTF_16); cell.setCellValue("Total"); int rowNumber = 1; TokenizationCache tcache = yoshikoder.getTokenizationCache(); for (YKDocument doc : docs) { TokenList tl = tcache.getTokenList(doc); if (tl == null){ tl = TokenizationService.getTokenizationService().tokenize(doc); tcache.putTokenList(doc, tl); } WordFrequencyMap map = new WordFrequencyMap(tl); row = sheet.createRow((short)rowNumber); cell = row.createCell((short)0); cell.setEncoding(HSSFCell.ENCODING_UTF_16); cell.setCellValue(doc.getTitle()); for (int ii = 0; ii < vocab.size(); ii++) { cell = row.createCell((short)(ii+1)); Integer count = map.getWordCount(vocab.get(ii)); if (count == null) cell.setCellValue((double)0); else cell.setCellValue((double)count.doubleValue()); } cell = row.createCell((short)(vocab.size()+1)); cell.setCellValue((double)map.getTotal()); rowNumber++; } wb.write(stream); // something else should close the file } protected void writeExcel(List<YKDocument> documents, File file) throws Exception { final List<YKDocument> docs = documents; final File outputFile = file; final FileOutputStream stream = new FileOutputStream(outputFile); tworker = new TaskWorker(yoshikoder){ protected void doWork() throws Exception { pushOutCountsExcel(docs, stream); } protected void onError() { try { stream.close(); } catch (Exception ex){ log.info("could not close the file stream"); ex.printStackTrace(); } if (e instanceof TokenizationException){ DialogUtil.yelp(yoshikoder, "Tokenization Error", e); } else if (e instanceof IOException){ DialogUtil.yelp(yoshikoder, "Input/Ouput Error", e); } else { DialogUtil.yelp(yoshikoder, "Error", e); } } @Override protected void onSuccess() { try { stream.close(); } catch (Exception ex){ ex.printStackTrace(); log.info("could not close the file stream"); } int resp = JOptionPane.showConfirmDialog(yoshikoder, "Open report file?", "Open report", JOptionPane.YES_NO_OPTION); if (resp == JOptionPane.YES_OPTION){ try { Desktop.getDesktop().open(outputFile); } catch (Exception ex){ ex.printStackTrace(); } } } }; tworker.start(); } public void actionPerformed(ActionEvent e) { if (yoshikoder.getProject().getDocumentList().size() > 1){ File file; try { int resp = chooser.showSaveDialog(yoshikoder); if (resp != JFileChooser.APPROVE_OPTION) return; file = chooser.getSelectedFile(); if (chooser.getFileFilter().equals(excel)){ if (!file.getName().toLowerCase().endsWith(".xls")) file = new File(file.getParent(), file.getName() + ".xls"); YKDocument[] docsa = yoshikoder.getSelectedDocuments(); List<YKDocument> docs = new ArrayList<YKDocument>(docsa.length); for (int ii = 0; ii < docsa.length; ii++) docs.add(docsa[ii]); writeExcel(docs, file); } else if (chooser.getFileFilter().equals(csvutf8)){ if (!file.getName().toLowerCase().endsWith("-utf8.csv")) file = new File(file.getParent(), file.getName() + "-utf8.csv"); YKDocument[] docsa = yoshikoder.getSelectedDocuments(); List<YKDocument> docs = new ArrayList<YKDocument>(docsa.length); for (int ii = 0; ii < docsa.length; ii++) docs.add(docsa[ii]); writeCsvUTF8(docs, file); } } catch (Exception ex){ DialogUtil.yelp(yoshikoder, ex.getMessage(), ex); return; } } } }