/*
 * Copyright 2004-2010 Information & Software Engineering Group (188/1)
 *                     Institute of Software Technology and Interactive Systems
 *                     Vienna University of Technology, Austria
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.ifs.tuwien.ac.at/dm/somtoolbox/license.html
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package at.tuwien.ifs.somtoolbox.database;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.Map;
import java.util.Vector;
import java.util.logging.Logger;

import at.ec3.DoubleMatrix;
import at.ec3.IntMatrix;

import at.tuwien.ifs.somtoolbox.util.StdErrProgressWriter;
import at.tuwien.ifs.somtoolbox.util.VectorTools;

/**
 * Reads data from a TeSeTool generated (Lucene-based) index and writes it to a database.
 *
 * @author Rudolf Mayer
 * @version $Id: Index2DatabaseImporter.java 3589 2010-05-21 10:42:01Z mayer $
 */
public class Index2DatabaseImporter {
    /** Column names used when inserting a row into the term (template vector) table. */
    public static final String[] templateFields = new String[] { "number", "label", "documentFrequency",
            "collectionTermFrequency", "minimumTermFrequency", "maximumTermFrequency", "meanTermFrequency", "comment" };

    /** Column names used when inserting a row into the document table. */
    public static final String[] documentTableFields = new String[] { "number", "label" };

    /** Column names of the document-term table. */
    public static final String[] documentTermTableFields = new String[] { "documentNumber", "termNumber",
            "rawTermFrequency", "weight", "weightNormalised" };

    /**
     * Columns written when only a pre-computed weight is available, i.e. by
     * {@link #writeInputVector(DoubleMatrix, HashMap)}; kept in step with the three values supplied there.
     */
    private static final String[] weightOnlyDocumentTermFields = new String[] { "documentNumber", "termNumber",
            "weight" };

    private static final Logger logger = Logger.getLogger("at.tuwien.ifs.somtoolbox");

    MySQLConnector dbConnector;

    /** Insert statement for the document-term table; only non-null while a tf-based input vector is written. */
    private PreparedStatement documentTermPreparedStatement;

    /**
     * Creates the database connector with the given settings and lets it set up its tables.
     *
     * @param databaseServerAddress passed through to the {@link MySQLConnector}
     * @param databaseName passed through to the {@link MySQLConnector}
     * @param user passed through to the {@link MySQLConnector}
     * @param password passed through to the {@link MySQLConnector}
     * @param databaseTableNamePrefix passed through to the {@link MySQLConnector}
     * @throws SQLException if the connector reports a database error
     * @throws IOException if the connector reports an I/O error
     */
    public Index2DatabaseImporter(String databaseServerAddress, String databaseName, String user, String password,
            String databaseTableNamePrefix) throws SQLException, IOException {
        dbConnector = new MySQLConnector(databaseServerAddress, databaseName, user, password, databaseTableNamePrefix);
        dbConnector.setupTables();
    }

    /**
     * Writes one row per selected term into the term table. Selected terms that have no entry in <code>allTerms</code>
     * are skipped; the stored <code>number</code> is the running count of rows written so far, not the position in
     * <code>selectedTerms</code>.
     *
     * @param tfMatrix term-frequency matrix; the per-term statistics are read from its columns
     * @param selectedTerms the terms to write
     * @param allTerms maps a column index of <code>tfMatrix</code> to the term label
     * @throws SQLException if an insert fails; the failure is logged and then propagated to the caller
     */
    public void writeTemplateVector(IntMatrix tfMatrix, Vector<String> selectedTerms, HashMap<Integer, Object> allTerms)
            throws SQLException {
        Map<Object, Integer> reversedTermsMap = VectorTools.reverseHashMap(allTerms);

        StdErrProgressWriter progress = new StdErrProgressWriter(selectedTerms.size(), "Writing template vector ", 10);
        int index = 0;
        for (int i = 0; i < selectedTerms.size(); i++) {
            progress.progress();

            // The reverse lookup doubles as the membership test, which avoids a linear scan over all map values
            // for every selected term.
            Integer column = reversedTermsMap.get(selectedTerms.get(i));
            if (column == null) {
                continue;
            }
            int termInMatrix = column.intValue();

            String label = (String) allTerms.get(column);
            Integer df = Integer.valueOf(tfMatrix.getColumnCardinality(termInMatrix));
            Integer tf = Integer.valueOf(tfMatrix.getSumColumnValue(termInMatrix));
            Integer min = Integer.valueOf(tfMatrix.getminColumnValue(termInMatrix));
            Integer max = Integer.valueOf(tfMatrix.getmaxColumnValue(termInMatrix));
            // Guard against 0/0, which would yield NaN for a term that occurs in no document.
            Double mean = Double.valueOf(df.intValue() == 0 ? 0.0 : tf.doubleValue() / df.doubleValue());

            Object[] values = new Object[] { Integer.valueOf(index), label, df, tf, min, max, mean, "" };
            try {
                dbConnector.doInsert(dbConnector.getTermTableName(), templateFields, values);
                index++;
            } catch (SQLException e) {
                logger.severe("Error in communicating with the database for element " + index + ": '" + e.getMessage()
                        + "'. Aborting.");
                System.out.println("label: " + label);
                System.out.println("index: " + i);
                System.err.println(decodeLabelForDiagnostics(label));
                // Propagate instead of terminating the JVM: the method declares SQLException, so the caller decides.
                throw e;
            }
        }
    }

    /** URL-decodes a label for error output only; falls back to the raw label if it cannot be decoded. */
    private static String decodeLabelForDiagnostics(String label) {
        if (label == null) {
            return null;
        }
        try {
            return URLDecoder.decode(label, "UTF8");
        } catch (UnsupportedEncodingException e) {
            return label;
        } catch (IllegalArgumentException e) {
            // malformed %-escape in the label
            return label;
        }
    }

    /** Prepares the insert statement for the document-term table, releasing any statement left from an earlier run. */
    private void initDocumentTermPreparedStatement() throws SQLException {
        closeDocumentTermPreparedStatement();
        StringBuilder sql = new StringBuilder(160);
        sql.append("INSERT into ").append(dbConnector.getDocumentTermsTableName()).append(
                " (documentNumber, termNumber, rawTermFrequency, weight, weightNormalised)");
        sql.append(" VALUES (?,?,?,?,?);");
        this.documentTermPreparedStatement = dbConnector.getPreparedStatement(sql.toString());
    }

    /** Closes the document-term insert statement if one is open; a failure to close is logged, not thrown. */
    private void closeDocumentTermPreparedStatement() {
        if (documentTermPreparedStatement == null) {
            return;
        }
        try {
            documentTermPreparedStatement.close();
        } catch (SQLException e) {
            logger.warning("Could not close the document-term insert statement: " + e.getMessage());
        } finally {
            documentTermPreparedStatement = null;
        }
    }

    /** Binds the five values to the prepared document-term insert statement and executes it. */
    private void executeDocumentTermInsert(int documentNumber, int termNumber, int rawTermFrequency, double weight,
            double weightNormalised) throws SQLException {
        documentTermPreparedStatement.setInt(1, documentNumber);
        documentTermPreparedStatement.setInt(2, termNumber);
        documentTermPreparedStatement.setInt(3, rawTermFrequency);
        documentTermPreparedStatement.setDouble(4, weight);
        documentTermPreparedStatement.setDouble(5, weightNormalised);
        documentTermPreparedStatement.execute();
    }

    /** Inserts the row for one document (its number and label) into the document table. */
    private void insertDocument(int documentIndex, HashMap<Integer, Object> labelMap) throws SQLException {
        Integer number = Integer.valueOf(documentIndex);
        Object label = labelMap.get(number);
        if (label == null) {
            throw new IllegalArgumentException("No label found for document " + documentIndex);
        }
        dbConnector.doInsert(dbConnector.getDocumentTableName(), documentTableFields, new Object[] { number,
                label.toString() });
    }

    /** Returns the column of the term-frequency matrix that holds the given term. */
    private static int matrixColumnOf(Map<Object, Integer> reversedTerms, String term) {
        Integer column = reversedTerms.get(term);
        if (column == null) {
            throw new IllegalArgumentException("Selected term '" + term + "' is not contained in the term map");
        }
        return column.intValue();
    }

    /**
     * Writes the vector from a term-frequency matrix. For every row of <code>tfMatrix</code> a document row is
     * inserted, followed by one document-term row per selected term holding the raw term frequency, the weight
     * <code>tf * ln(N / df)</code> (with <code>N = labelMap.size()</code>) and the weight after
     * {@link VectorTools#normaliseByLength(double[])}. Terms with a term frequency of zero get a weight of zero.
     * <p>
     * If <code>selectedTerms</code> and <code>allTerms</code> differ in size, each selected term is mapped to its
     * matrix column through <code>allTerms</code>; otherwise term <code>j</code> is taken to be column <code>j</code>.
     * </p>
     *
     * @param tfMatrix term-frequency matrix, documents in rows and terms in columns
     * @param selectedTerms the terms to write, in output order
     * @param allTerms maps a column index of <code>tfMatrix</code> to the term label
     * @param labelMap maps a row index of <code>tfMatrix</code> to the document label
     * @throws SQLException if a database operation fails
     * @throws IllegalArgumentException if a selected term or a document label cannot be found
     */
    public void writeInputVector(IntMatrix tfMatrix, Vector<String> selectedTerms, HashMap<Integer, Object> allTerms,
            HashMap<Integer, Object> labelMap) throws SQLException {
        System.out.println("Writing input vector, calculating on the fly");

        final int termCount = selectedTerms.size();
        final int documentCount = labelMap.size();
        final boolean reduced = termCount != allTerms.size();

        // Both arrays are indexed by the position of the term in selectedTerms. Indexing the document frequencies
        // by matrix column instead would overflow the array as soon as a selected term sits in a column
        // >= termCount.
        int[] matrixColumns = new int[termCount];
        int[] documentFrequencies = new int[termCount];

        StdErrProgressWriter progressDf = new StdErrProgressWriter(termCount, "Calculating df values ", 10);
        Map<Object, Integer> reversedTerms = null;
        if (reduced) {
            System.out.println("\n\nreduced, calcucalting hashmap\n");
            reversedTerms = VectorTools.reverseHashMap(allTerms);
        }
        for (int j = 0; j < termCount; j++) {
            progressDf.progress();
            // Resolve the column once per term rather than once per document and term.
            matrixColumns[j] = reduced ? matrixColumnOf(reversedTerms, selectedTerms.get(j)) : j;
            documentFrequencies[j] = tfMatrix.getColumnCardinality(matrixColumns[j]);
        }

        initDocumentTermPreparedStatement();
        try {
            StdErrProgressWriter progress = new StdErrProgressWriter(tfMatrix.rows(), "Writing input vector ", 5);
            for (int i = 0; i < tfMatrix.rows(); i++) {
                insertDocument(i, labelMap);

                int[] tfs = new int[termCount];
                double[] weights = new double[termCount];
                for (int j = 0; j < termCount; j++) {
                    tfs[j] = tfMatrix.get(i, matrixColumns[j]);
                    // Skipping tf == 0 keeps the weight at 0 and avoids 0 * log(N / 0) = NaN for unused terms.
                    if (tfs[j] > 0) {
                        double fraq = (double) documentCount / (double) documentFrequencies[j];
                        weights[j] = tfs[j] * Math.log(fraq);
                    }
                }
                double[] weightsNormalised = VectorTools.normaliseByLength(weights);

                for (int termIndex = 0; termIndex < termCount; termIndex++) {
                    executeDocumentTermInsert(i, termIndex, tfs[termIndex], weights[termIndex],
                            weightsNormalised[termIndex]);
                }
                progress.progress();
            }
        } finally {
            // Release the statement even if an insert failed.
            closeDocumentTermPreparedStatement();
        }
    }

    /**
     * Writes the input vectors from an already calculated tfxidf matrix. For every row a document row is inserted,
     * followed by one document-term row per matrix column holding the document number, the term number and the
     * matrix value as <code>weight</code>.
     *
     * @param inputVectorMatrix matrix of pre-computed weights, documents in rows and terms in columns
     * @param labelMap maps a row index of <code>inputVectorMatrix</code> to the document label
     * @throws SQLException if a database operation fails
     * @throws IllegalArgumentException if a document label cannot be found
     */
    public void writeInputVector(DoubleMatrix inputVectorMatrix, HashMap<Integer, Object> labelMap)
            throws SQLException {
        StdErrProgressWriter progress = new StdErrProgressWriter(inputVectorMatrix.rows(), "Writing input vector ");
        for (int i = 0; i < inputVectorMatrix.rows(); i++) {
            progress.progress();
            insertDocument(i, labelMap);

            double[] row = inputVectorMatrix.getRow(i);
            Integer documentNumber = Integer.valueOf(i);
            for (int termIndex = 0; termIndex < row.length; termIndex++) {
                // Three values need three column names; the full five-column list did not match the values.
                // NOTE(review): assumes rawTermFrequency and weightNormalised are nullable or have defaults -
                // confirm against the table definition in MySQLConnector.
                dbConnector.doInsert(dbConnector.getDocumentTermsTableName(), weightOnlyDocumentTermFields,
                        new Object[] { documentNumber, Integer.valueOf(termIndex), Double.valueOf(row[termIndex]) });
            }
        }
    }
}