package uk.ac.shef.dcs.jate.feature;

import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

/**
 * A feature store that contains information of term distributions over a corpus. It contains following information:
 * <br>- total number of occurrences of all terms found in the corpus, which is the sum of occurrences of each term
 * <br>- number of occurrences of each term found in the corpus
 * <br>- number of occurrences of each term in each document
 *
 * @author <a href="mailto:z.zhang@dcs.shef.ac.uk">Ziqi Zhang</a>
 */
public class FrequencyTermBased extends AbstractFeature {
    //term and its total freq in corpus
    private Map<String, Integer> term2TTF = new ConcurrentHashMap<>();
    //term and its freq in each document, stored as a map (document id -> freq)
    private Map<String, Map<Integer, Integer>> term2FID = new ConcurrentHashMap<>();

    //cached sum of all values in term2TTF; 0 means "not computed yet". Guarded by 'this'.
    private int corpusTotal = 0;
    private int totalDocs = 0;

    protected FrequencyTermBased() {
    }

    /**
     * @return the live (not copied) map from term to its total frequency in the corpus
     */
    public Map<String, Integer> getMapTerm2TTF() {
        return term2TTF;
    }

    /**
     * Get the total number of occurrences of all terms in the corpus, i.e. the sum of the
     * total frequency of every term.
     * <p>
     * The sum is computed lazily and cached. The cache is reset by {@link #increment(String, int)};
     * changes made directly through {@link #getMapTerm2TTF()} after a non-zero total has been
     * cached are not detected.
     *
     * @return sum of the frequencies of all terms
     */
    public synchronized int getCorpusTotal() {
        if (corpusTotal == 0) {
            int sum = 0;
            for (int ttf : term2TTF.values()) {
                sum += ttf;
            }
            corpusTotal = sum;
        }
        return corpusTotal;
    }

    /**
     * @return the total number of documents, as last set by {@link #setTotalDocs(int)}
     */
    public int getTotalDocs() {
        return totalDocs;
    }

    /**
     * @param totalDocs total number of documents
     */
    protected void setTotalDocs(int totalDocs) {
        this.totalDocs = totalDocs;
    }

    /**
     * Get the total number of occurrences of a term in the corpus.
     *
     * @param term term string
     * @return the recorded frequency of the term, or 0 if the term has not been recorded
     */
    public int getTTF(String term) {
        return term2TTF.getOrDefault(term, 0);
    }

    /**
     * Get the normalised frequency of a term in the corpus, which is the number of occurrences of that term as a fraction
     * of the total number of occurrences of all terms in the corpus. 1 is added to the denominator, so the
     * result is defined (0) even when nothing has been recorded.
     *
     * @param term term string
     * @return the frequency of the term divided by (corpus total + 1)
     */
    public double getTTFNorm(String term) {
        return (double) getTTF(term) / ((double) getCorpusTotal() + 1);
    }

    /**
     * increment the number of occurrences of term by i
     *
     * @param term term string
     * @param i    frequency count
     */
    protected void increment(String term, int i) {
        //atomic read-modify-write; a separate get followed by put can lose concurrent updates
        term2TTF.merge(term, i, Integer::sum);
        //the cached total no longer matches the map, force a recount on the next read
        invalidateCorpusTotal();
    }

    /**
     * Record the frequency of a term in a document. Any value previously recorded for the same
     * term and document is replaced (not added to).
     *
     * @param term        term string
     * @param luceneDocId id of the document
     * @param freq        frequency of the term in that document
     */
    protected void incrementTermFrequencyInDocument(String term, int luceneDocId, int freq) {
        //compute() runs atomically per key, so two threads recording the same term cannot
        //each create an inner map and overwrite the other's
        term2FID.compute(term, (key, docFreqs) -> {
            Map<Integer, Integer> target = (docFreqs == null) ? new HashMap<>() : docFreqs;
            target.put(luceneDocId, freq);
            return target;
        });
    }

    /**
     * Get the per-document frequencies recorded for a term.
     *
     * @param term term string
     * @return the live map from document id to frequency, or null if nothing has been recorded for the term
     */
    public Map<Integer, Integer> getTermFrequencyInDocument(String term) {
        return term2FID.get(term);
    }

    //resets the cached corpus total under the same monitor that getCorpusTotal() uses
    private synchronized void invalidateCorpusTotal() {
        corpusTotal = 0;
    }
}