/*
 * Concept profile generation tool suite
 * Copyright (C) 2015 Biosemantics Group, Erasmus University Medical Center,
 *  Rotterdam, The Netherlands
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>
 */

package org.erasmusmc.applications.conceptprofileevaluator;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;

import org.erasmusmc.groundhog.ConceptStatistic;
import org.erasmusmc.groundhog.Groundhog;
import org.erasmusmc.math.vector.VectorCursor;
import org.erasmusmc.ontology.ConceptVector;
import org.erasmusmc.ontology.ConceptVectorRecord;

/**
 * Per-concept frequency statistics gathered over a set of concept vectors.
 *
 * <p>For every concept that has a nonzero entry in at least one vector, a
 * {@link ConceptStatistic} is kept whose {@code termFrequency} is the sum of
 * that concept's values and whose {@code docFrequency} counts the nonzero
 * entries seen for it (i.e. the number of vectors containing the concept,
 * assuming a cursor visits each dimension at most once per vector).
 * {@link #allConceptOccurrences} is the running total over all concepts.
 *
 * <p>The statistics can be written to and read back from a file using Java
 * object serialization.
 */
public class SubGroundhogStatistics {

  /** Upper bound on the number of distinct concepts the map is pre-sized for. */
  private static final int MAX_EXPECTED_CONCEPTS = 10000;

  /** Rough estimate of the number of distinct concepts contributed per vector. */
  private static final int EXPECTED_CONCEPTS_PER_VECTOR = 50;

  /** Statistics keyed by concept ID; {@code null} on an instance made by the no-arg constructor. */
  public Map<Integer, ConceptStatistic> conceptStatistics;

  /**
   * IDs of the documents the statistics were extracted from. Only set by the
   * Groundhog-based constructor and by {@link #loadFromFile(String)};
   * otherwise {@code null}.
   */
  public Collection<Integer> documentIDs;

  /** Sum of all accumulated concept values (truncated to {@code int} on every addition). */
  public int allConceptOccurrences;

  /**
   * Builds the statistics for a subset of the documents stored in a Groundhog.
   *
   * <p>Note: this constructor calls the overridable method
   * {@link #extractStatisticsFromSubCollexion(Groundhog)}; a subclass
   * overriding it will see its own fields uninitialised at that point.
   *
   * @param sourceGroundhog the store the concept vector records are fetched from
   * @param docIDs          IDs of the documents to include; kept by reference
   */
  public SubGroundhogStatistics(Groundhog sourceGroundhog, Collection<Integer> docIDs) {
    conceptStatistics = new HashMap<Integer, ConceptStatistic>();
    documentIDs = docIDs;
    extractStatisticsFromSubCollexion(sourceGroundhog);
  }

  /**
   * Creates an empty instance whose {@link #conceptStatistics} and
   * {@link #documentIDs} are {@code null}; used by {@link #loadFromFile(String)}.
   */
  public SubGroundhogStatistics() {
  }

  /**
   * Builds the statistics directly from a collection of concept vectors.
   * {@code null} elements are skipped. {@link #documentIDs} is left
   * {@code null}.
   *
   * @param vectors the vectors to aggregate
   */
  public SubGroundhogStatistics(Collection<ConceptVector> vectors) {
    // Clamp the vector count BEFORE multiplying so that very large collections
    // cannot overflow int and produce a negative HashMap capacity.
    int cappedVectorCount =
        Math.min(vectors.size(), MAX_EXPECTED_CONCEPTS / EXPECTED_CONCEPTS_PER_VECTOR);
    int expectedConcepts = cappedVectorCount * EXPECTED_CONCEPTS_PER_VECTOR;
    // Over-allocate by 50% to leave head-room below the map's load factor.
    conceptStatistics =
        new HashMap<Integer, ConceptStatistic>(Math.round(1.5f * expectedConcepts));
    extractStatisticsFromSet(vectors);
  }

  /**
   * Serializes {@link #conceptStatistics}, {@link #documentIDs} and
   * {@link #allConceptOccurrences} (in that order) to the given file.
   *
   * <p>Best effort: any failure is printed to standard error and otherwise
   * ignored, so the file may be missing or incomplete afterwards.
   *
   * @param filename path of the file to (over)write
   */
  public void saveToFile(String filename) {
    try {
      FileOutputStream file = new FileOutputStream(filename);
      try {
        ObjectOutputStream objectOutputStream = new ObjectOutputStream(file);
        objectOutputStream.writeObject(conceptStatistics);
        objectOutputStream.writeObject(documentIDs);
        objectOutputStream.writeInt(allConceptOccurrences);
        // Push buffered object data down to the file before it is closed.
        objectOutputStream.flush();
      } finally {
        // Release the file handle even when serialization fails half-way.
        file.close();
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Reads statistics previously written by {@link #saveToFile(String)}.
   *
   * <p>Best effort: any failure is printed to standard error and the result
   * is returned as far as it was populated; fields not yet read keep their
   * defaults ({@code null} / {@code 0}).
   *
   * <p>NOTE(review): this uses Java native deserialization, which must only
   * be applied to files from a trusted source.
   *
   * @param filename path of the file to read
   * @return the loaded statistics; never {@code null}
   */
  // The casts cannot be checked at runtime because of erasure; they mirror
  // the types written by saveToFile.
  @SuppressWarnings("unchecked")
  public static SubGroundhogStatistics loadFromFile(String filename) {
    SubGroundhogStatistics result = new SubGroundhogStatistics();
    try {
      FileInputStream file = new FileInputStream(filename);
      try {
        ObjectInputStream objectInputStream = new ObjectInputStream(file);
        result.conceptStatistics =
            (Map<Integer, ConceptStatistic>) objectInputStream.readObject();
        result.documentIDs = (Collection<Integer>) objectInputStream.readObject();
        result.allConceptOccurrences = objectInputStream.readInt();
      } finally {
        // Release the file handle even when deserialization fails half-way.
        file.close();
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
    return result;
  }

  /**
   * Accumulates the nonzero entries of every non-{@code null} vector into
   * {@link #conceptStatistics} and {@link #allConceptOccurrences}.
   *
   * @param vectors the vectors to aggregate
   */
  private void extractStatisticsFromSet(Collection<ConceptVector> vectors) {
    for (ConceptVector vector : vectors) {
      if (vector != null) {
        accumulate(vector.getNonzeroCursor());
      }
    }
  }

  /**
   * Fetches the records for {@link #documentIDs} from the given Groundhog and
   * accumulates the nonzero entries of every non-{@code null} record's
   * concept vector.
   *
   * @param sourceGroundhog the store the records are fetched from
   */
  protected void extractStatisticsFromSubCollexion(Groundhog sourceGroundhog) {
    Map<Integer, ConceptVectorRecord> records = sourceGroundhog.getSubMap(documentIDs);
    for (ConceptVectorRecord record : records.values()) {
      if (record != null) {
        VectorCursor<Integer> cursor = record.getConceptVector().getNonzeroCursor();
        accumulate(cursor);
      }
    }
  }

  /**
   * Walks the cursor to its end, adding each entry's value to the matching
   * concept's term frequency and to the overall total, and incrementing that
   * concept's document frequency. A statistic is created on first sight of a
   * concept.
   *
   * @param cursor cursor over the nonzero entries of one concept vector
   */
  private void accumulate(VectorCursor<Integer> cursor) {
    while (cursor.isValid()) {
      int conceptID = cursor.dimension();
      ConceptStatistic statistic = conceptStatistics.get(conceptID);
      if (statistic == null) {
        statistic = new ConceptStatistic();
        conceptStatistics.put(conceptID, statistic);
      }
      // Compound assignment is intentional: it narrows the cursor value to
      // the type of the target field on every addition.
      statistic.termFrequency += cursor.get();
      allConceptOccurrences += cursor.get();
      statistic.docFrequency++;
      cursor.next();
    }
  }
}