package ch.akuhn.hapax.corpus;

import java.io.File;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;

import ch.akuhn.util.Bag;
import ch.akuhn.util.PrintOn;

/**
 * A bag of string terms together with their occurrence counts.
 *
 * <p>Instances can be filled from a file, a string or a stream (each scanned
 * with a {@link CamelCaseScanner} that reports to this object as its
 * {@link ScannerClient}), from an existing collection of strings, or by
 * merging other {@code Terms}.
 *
 * <p>The transforming methods ({@link #stem()}, {@link #toLowerCase()},
 * {@link #intern()}) never modify the receiver; each returns a new
 * {@code Terms} built from the receiver's counts.
 */
public class Terms extends Bag<String> implements ScannerClient {

    /** Creates an empty bag of terms. */
    public Terms() {
        // do nothing
    }

    /**
     * Creates a bag from the given file by running a {@link CamelCaseScanner}
     * over it with this object registered as the scanner's client.
     *
     * @param file the file to scan
     */
    public Terms(File file) {
        new CamelCaseScanner().client(this).onFile(file).run();
    }

    /**
     * Creates a bag from the given text by running a {@link CamelCaseScanner}
     * over it with this object registered as the scanner's client.
     *
     * @param text the text to scan
     */
    public Terms(String text) {
        new CamelCaseScanner().client(this).onString(text).run();
    }

    /**
     * Creates a bag containing all of the given strings.
     *
     * @param strings the strings to add
     */
    public Terms(Collection<String> strings) {
        this.addAll(strings); // #addAll handles "instance of bag" special case
    }

    /**
     * Creates a bag from the given stream by running a
     * {@link CamelCaseScanner} over it with this object registered as the
     * scanner's client. The stream is not closed by this constructor itself.
     *
     * @param stream the stream to scan
     */
    public Terms(InputStream stream) {
        new CamelCaseScanner().client(this).onStream(stream).run();
    }

    /**
     * Creates a bag holding the contents of all given bags.
     *
     * @param union the bags to merge; each is added in turn via {@code addAll}
     */
    public Terms(Terms... union) {
        for (Terms each : union) {
            addAll(each);
        }
    }

    /**
     * Returns a new bag in which every term is replaced by its stem as
     * computed by a {@link PorterStemmer}. Each stem is added with the count
     * of the term it was derived from.
     *
     * @return a new {@code Terms}; the receiver is left unchanged
     */
    public Terms stem() {
        Stemmer stemmer = new PorterStemmer();
        Terms terms = new Terms();
        for (Count<String> each : this.counts()) {
            terms.add(stemmer.stem(each.element), each.count);
        }
        return terms;
    }

    /**
     * Returns a new bag in which every term is converted to lower case. Each
     * lower-cased term is added with the count of the term it was derived
     * from.
     *
     * @return a new {@code Terms}; the receiver is left unchanged
     */
    public Terms toLowerCase() {
        Terms terms = new Terms();
        for (Count<String> each : this.counts()) {
            // Locale.ROOT keeps the result independent of the JVM's default
            // locale (with a Turkish default locale, "I" would otherwise map
            // to a dotless "ı").
            terms.add(each.element.toLowerCase(java.util.Locale.ROOT), each.count);
        }
        return terms;
    }

    /**
     * Scanner callback: adds the given term to this bag.
     *
     * @param term the term to add; stored as its {@code toString()} value
     */
    // NOTE(review): @Override is left disabled as in the original, presumably
    // for source compatibility with compilers that reject it on interface
    // methods; confirm before enabling.
    //@Override
    public void yield(CharSequence term) {
        this.add(term.toString());
    }

    /**
     * Returns a new bag in which every term is replaced by its canonical
     * instance as returned by {@link String#intern()}, keeping the counts.
     *
     * @return a new {@code Terms}; the receiver is left unchanged
     */
    public Terms intern() {
        Terms terms = new Terms();
        for (Count<String> each : this.counts()) {
            terms.add(each.element.intern(), each.count);
        }
        return terms;
    }

    /**
     * Writes this bag to the given target in a compact textual form.
     *
     * <p>The entries of {@code sortedCounts()} are written in order. A count
     * is printed only when it differs from the count of the previous entry,
     * followed by the terms having that count; everything is separated via
     * {@code PrintOn.space()} and the output ends with {@code PrintOn.cr()}.
     * NOTE(review): grouping equal counts into one run relies on
     * {@code sortedCounts()} keeping equal counts adjacent; confirm in
     * {@code Bag}.
     *
     * @param app the target to write to
     */
    public void storeOn(Appendable app) {
        PrintOn out = new PrintOn(app);
        int count = -1; // no entry printed yet, so the first count is always written
        for (Count<String> each : sortedCounts()) {
            if (each.count != count) {
                count = each.count;
                out.print(count).space();
            }
            out.append(each.element).space();
        }
        out.cr();
    }

    /**
     * Reads terms from the given scanner, expecting an integer count followed
     * by the terms that occur that many times, repeated until a token that is
     * neither expected count nor term is reached or the input ends.
     *
     * <p>Note that a term which itself parses as an integer cannot be told
     * apart from a count in this format and will be read as a count.
     *
     * @param scanner the scanner to read from; it is not closed
     */
    public void readFrom(java.util.Scanner scanner) {
        while (scanner.hasNextInt()) {
            int count = scanner.nextInt();
            // hasNext() must be checked as well: at the end of the input
            // hasNextInt() is false, and calling next() on an exhausted
            // scanner throws NoSuchElementException.
            while (scanner.hasNext() && !scanner.hasNextInt()) {
                add(scanner.next(), count);
            }
        }
    }

    /**
     * Returns the first {@code num} entries of {@code sortedCounts()}, or all
     * of them if there are fewer.
     * NOTE(review): presumably these are the most frequent terms; the order is
     * defined by {@code Bag.sortedCounts()}, confirm there.
     *
     * @param num the maximum number of entries to return; values of zero or
     *            less yield an empty list
     * @return a new, modifiable list of at most {@code num} entries
     */
    public List<Count<String>> top(int num) {
        List<Count<String>> top = new ArrayList<Count<String>>();
        Iterator<Count<String>> counts = this.sortedCounts().iterator();
        for (int n = 0; n < num && counts.hasNext(); n++) {
            top.add(counts.next());
        }
        return top;
    }
}