// Copyright 2015 Thomas Müller // This file is part of MarMoT, which is licensed under GPLv3. package lemming.lemma; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import marmot.core.Sequence; import marmot.core.Token; import marmot.morph.Word; import marmot.morph.io.SentenceReader; import marmot.util.Mutable; public class LemmaInstance { private double count_; private String form_; private String lemma_; private String ptag_; private String mtag_; @Override public String toString() { String ptag = "_"; if (ptag_ != null) ptag = ptag_; String mtag = "_"; if (mtag_ != null) mtag = mtag_; return String.format("%s\t%s\t%s\t%s", form_, ptag, mtag, lemma_); } public LemmaInstance(String form, String lemma, String tag, String mtag) { count_ = 1; form_ = form; lemma_ = lemma; ptag_ = tag; mtag_ = mtag; } public String getForm() { return form_; } public String getLemma() { return lemma_; } public String getFormPadded() { return "123" + form_ + "456"; } public String getLemmaPadded() { return lemma_; } public String getPosTag() { return ptag_; } public void setCount(double count) { count_ = count; } public double getCount() { return count_; } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((form_ == null) ? 0 : form_.hashCode()); result = prime * result + ((lemma_ == null) ? 0 : lemma_.hashCode()); result = prime * result + ((mtag_ == null) ? 0 : mtag_.hashCode()); result = prime * result + ((ptag_ == null) ? 0 : ptag_.hashCode()); return result; } @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; LemmaInstance other = (LemmaInstance) obj; if (form_ == null) { if (other.form_ != null) return false; } else if (!form_.equals(other.form_)) return false; if (lemma_ == null) { if (other.lemma_ != null) return false; } else if (!lemma_.equals(other.lemma_)) return false; if (mtag_ == null) { if (other.mtag_ != null) return false; } else if (!mtag_.equals(other.mtag_)) return false; if (ptag_ == null) { if (other.ptag_ != null) return false; } else if (!ptag_.equals(other.ptag_)) return false; return true; } public static List<LemmaInstance> getInstances(Iterable<Sequence> reader) { return getInstances(reader, -1); } public static List<LemmaInstance> getInstances(Iterable<Sequence> reader, int limit) { return getInstances(reader, limit, true, true); } public static List<LemmaInstance> getInstances(Iterable<Sequence> reader, boolean use_ptag, boolean use_mtag) { return getInstances(reader, -1, use_ptag, use_mtag); } public static List<LemmaInstance> getInstances(Iterable<Sequence> reader, int limit, boolean use_postag, boolean use_mtag) { Map<LemmaInstance, Mutable<Integer>> map = new HashMap<>(); int number = 0; for (Sequence sentence : reader) { for (Token token : sentence) { number ++; LemmaInstance instance = LemmaInstance.getInstance((Word) token, use_postag, use_mtag); Mutable<Integer> mi = map.get(instance); if (mi == null) { mi = new Mutable<Integer>(0); map.put(instance, mi); } mi.set(mi.get() + 1); } if (limit >= 0 && number > limit) break; } List<LemmaInstance> instances = new LinkedList<LemmaInstance>(); for (Map.Entry<LemmaInstance, Mutable<Integer>> entry : map.entrySet()) { LemmaInstance instance = entry.getKey(); double count = entry.getValue().get(); instance.setCount(count); instances.add(instance); } return instances; } public static List<LemmaInstance> getInstances(String file) { return getInstances(new SentenceReader(file)); } public String getMorphTag() { return mtag_; } public static LemmaInstance getInstance(Word word, boolean use_postag, boolean use_mtag) { String form = word.getWordForm(); if (form == null) { throw new RuntimeException("Form is null. Did you specify a form-index?"); } form = form.toLowerCase(); String lemma = word.getLemma(); if (lemma != null) lemma = lemma.toLowerCase(); return new LemmaInstance(form, lemma, (use_postag)? word.getPosTag() : null, (use_mtag) ? word.getMorphTag() : null); } public static LemmaInstance getInstance(Word word) { return getInstance(word, true, true); } public void setPosTag(String string) { ptag_ = string; } public void setMorphTag(String string) { mtag_ = string; } }