/******************************************************************************* * Copyright 2013 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package de.tudarmstadt.ukp.csniper.resbuild; import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; import java.io.File; import java.sql.Connection; import java.sql.DriverManager; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.sql.Statement; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.uima.UIMAException; import org.apache.uima.analysis_engine.AnalysisEngine; import org.apache.uima.fit.factory.JCasFactory; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import de.tudarmstadt.ukp.csniper.resbuild.stuff.DummySentenceSplitter; import de.tudarmstadt.ukp.csniper.webapp.evaluation.model.Corpus; import de.tudarmstadt.ukp.csniper.webapp.search.CorpusService; import de.tudarmstadt.ukp.csniper.webapp.search.SearchEngine; import de.tudarmstadt.ukp.csniper.webapp.search.cqp.ContextUnit; import de.tudarmstadt.ukp.csniper.webapp.search.cqp.CqpEngine; import de.tudarmstadt.ukp.csniper.webapp.search.cqp.CqpQuery; import 
de.tudarmstadt.ukp.dkpro.core.api.syntax.type.PennTree;
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordParser;
import de.tudarmstadt.ukp.dkpro.core.stanfordnlp.StanfordSegmenter;

/**
 * One-off maintenance tool: selects all cached parses whose {@code pennTree} column is
 * {@code 'ERROR'} or empty, retrieves the covered text of each item via CQP, re-parses it with
 * the Stanford parser and writes the resulting Penn tree back to the database.
 * <p>
 * A failure on a single item (no text found in the corpus, parser error) is logged and counted,
 * then processing continues with the next item. Database errors abort the run.
 */
public class EvaluationItemFixer2
{
    private static final Log log = LogFactory.getLog(EvaluationItemFixer2.class);

    // NOTE(review): connection settings and credentials are hard-coded for a local setup;
    // consider externalizing them before running this anywhere else.
    private static final String HOST = "jdbc:mysql://localhost/";
    private static final String DATABASE = "csniper";
    private static final String USER = "root";
    private static final String PASSWORD = "gugaguga";

    // Windows developer setup; a Linux setup previously used "/opt/imscwb/cqp" and
    // "/srv/csniper" for these two values.
    private static final String CQP_EXECUTABLE = "D:\\ukp\\cwb-3.4.3\\bin\\cqp.exe";
    private static final String REPOSITORY = "D:\\ukp\\data\\csniper";

    // Both statements use the same spelling of the table name; the original mixed
    // "cachedparse" and "CachedParse", which cannot both work on a case-sensitive MySQL setup.
    // NOTE(review): assumes the table is really named "CachedParse" - confirm against schema.
    private static final String SELECT_QUERY = "SELECT collectionId, documentId, beginOffset, "
            + "endOffset FROM CachedParse WHERE pennTree = 'ERROR' OR pennTree = ''";
    private static final String UPDATE_QUERY = "UPDATE CachedParse SET pennTree = ? "
            + "WHERE collectionId = ? AND documentId = ? AND beginOffset = ? AND endOffset = ?";

    private static Connection connection;

    /**
     * CQP engine wired to the local repository and executable. The corpus service is a stub:
     * only {@link CorpusService#getRepositoryPath()} returns a real value, the remaining
     * methods return {@code null}.
     */
    private static final CqpEngine engine = new CqpEngine()
    {
        private static final long serialVersionUID = 1L;

        {
            setCorpusService(new CorpusService()
            {
                @Override
                public List<SearchEngine> listEngines(String aCorpusId)
                {
                    return null;
                }

                @Override
                public List<String> listCorpora()
                {
                    return null;
                }

                @Override
                public File getRepositoryPath()
                {
                    return new File(REPOSITORY);
                }

                @Override
                public Corpus getCorpus(String aCorpusId)
                {
                    return null;
                }
            });
            setCqpExecutable(new File(CQP_EXECUTABLE));
        }
    };

    /**
     * Runs the fixer: connects to the database, re-parses every broken cached parse and
     * finally logs how many items were updated and how many failed.
     *
     * @param args
     *            ignored.
     */
    public static void main(String[] args)
    {
        connect(HOST, DATABASE, USER, PASSWORD);

        int updated = 0;
        int failed = 0;

        PreparedStatement select = null;
        PreparedStatement update = null;
        ResultSet rs = null;
        try {
            select = connection.prepareStatement(SELECT_QUERY);
            update = connection.prepareStatement(UPDATE_QUERY);

            JCas jcas = JCasFactory.createJCas();
            AnalysisEngine sentences = createEngine(DummySentenceSplitter.class);
            AnalysisEngine tokenizer = createEngine(StanfordSegmenter.class,
                    StanfordSegmenter.PARAM_CREATE_SENTENCES, false,
                    StanfordSegmenter.PARAM_CREATE_TOKENS, true);
            AnalysisEngine parser = createEngine(StanfordParser.class,
                    StanfordParser.PARAM_WRITE_CONSTITUENT, true,
                    StanfordParser.PARAM_WRITE_PENN_TREE, true,
                    StanfordParser.PARAM_LANGUAGE, "en",
                    StanfordParser.PARAM_VARIANT, "factored");

            log.info("Running query [" + SELECT_QUERY + "].");
            rs = select.executeQuery();

            while (rs.next()) {
                String collectionId = rs.getString("collectionId");
                String documentId = rs.getString("documentId");
                int beginOffset = rs.getInt("beginOffset");
                int endOffset = rs.getInt("endOffset");
                String item = collectionId + "/" + documentId + " [" + beginOffset + "-"
                        + endOffset + "]";

                try {
                    String text = retrieveCoveredText(collectionId, documentId, beginOffset,
                            endOffset);

                    jcas.setDocumentText(text);
                    jcas.setDocumentLanguage("en");
                    sentences.process(jcas);
                    tokenizer.process(jcas);
                    parser.process(jcas);

                    String tree = firstPennTree(jcas);
                    if (tree == null) {
                        log.warn("Parser produced no tree for " + item + ": " + text);
                        failed++;
                        continue;
                    }

                    update.setString(1, tree);
                    update.setString(2, collectionId);
                    update.setString(3, documentId);
                    update.setInt(4, beginOffset);
                    update.setInt(5, endOffset);
                    update.executeUpdate();
                    updated++;
                    log.info("Updating " + text + " with tree " + tree);
                }
                catch (IllegalArgumentException e) {
                    // thrown by retrieveCoveredText when CQP finds nothing for the item
                    log.error("Could not retrieve the covered text for " + item, e);
                    failed++;
                }
                catch (UIMAException e) {
                    log.error("Could not parse " + item, e);
                    failed++;
                }
                finally {
                    // always clear the CAS, otherwise the next item cannot set its text
                    jcas.reset();
                }
            }
        }
        catch (SQLException e) {
            log.error("Exception while selecting: " + e.getMessage(), e);
        }
        catch (UIMAException e) {
            log.error("Exception while setting up the analysis engines: " + e.getMessage(), e);
        }
        finally {
            closeQuietly(rs);
            closeQuietly(select);
            closeQuietly(update);
            closeQuietly(connection);
        }

        log.info("Updated [" + updated + "] cached parses, [" + failed + "] items failed.");
    }

    /**
     * Returns the whitespace-normalized Penn tree of the first {@link PennTree} annotation in
     * the given CAS.
     *
     * @param aJCas
     *            a processed CAS.
     * @return the normalized tree, or {@code null} if the CAS contains no {@link PennTree}.
     */
    private static String firstPennTree(JCas aJCas)
    {
        for (PennTree tree : JCasUtil.select(aJCas, PennTree.class)) {
            return StringUtils.normalizeSpace(tree.getPennTree());
        }
        return null;
    }

    /**
     * Opens the shared database connection.
     *
     * @param aHost
     *            JDBC URL prefix including the trailing slash, e.g. {@code jdbc:mysql://host/}.
     * @param aDatabase
     *            database name appended to the host.
     * @param aUser
     *            database user.
     * @param aPassword
     *            database password.
     * @throws RuntimeException
     *             if the driver cannot be loaded or the connection cannot be established.
     */
    private static void connect(String aHost, String aDatabase, String aUser, String aPassword)
    {
        String url = aHost + aDatabase;
        try {
            Class.forName("com.mysql.jdbc.Driver");
            // pass the credentials separately instead of splicing them unescaped into the URL
            connection = DriverManager.getConnection(url, aUser, aPassword);
        }
        catch (ClassNotFoundException e) {
            throw new RuntimeException("Failed to load the specified database driver.", e);
        }
        catch (SQLException e) {
            throw new RuntimeException(
                    "There was an unrecoverable error while connecting to the database.", e);
        }
    }

    /**
     * Closes a {@link Connection}, {@link Statement} or {@link ResultSet}, logging instead of
     * propagating any exception. {@code null} and objects of other types are ignored.
     * <p>
     * Using a workaround for java &lt; 1.7.
     *
     * @param aAutoCloseable
     *            the resource to close, may be {@code null}.
     */
    private static void closeQuietly(Object aAutoCloseable)
    {
        try {
            if (aAutoCloseable instanceof Connection) {
                ((Connection) aAutoCloseable).close();
            }
            else if (aAutoCloseable instanceof Statement) {
                ((Statement) aAutoCloseable).close();
            }
            else if (aAutoCloseable instanceof ResultSet) {
                ((ResultSet) aAutoCloseable).close();
            }
        }
        catch (Exception e) {
            log.error("There was an unrecoverable error while closing [" + aAutoCloseable
                    + "].", e);
        }
    }

    /**
     * Looks up the text covered by the given offsets in the given document via CQP.
     *
     * @param aCollectionId
     *            the collection to query.
     * @param aDocumentId
     *            the value matched against {@code match.text_id}.
     * @param aBeginOffset
     *            the value matched against the {@code begin} attribute of the first token.
     * @param aEndOffset
     *            the value matched against the {@code end} attribute of the last token.
     * @return the covered text of the first match.
     * @throws IllegalArgumentException
     *             if the query yields no result.
     */
    private static String retrieveCoveredText(final String aCollectionId, String aDocumentId,
            int aBeginOffset, int aEndOffset)
    {
        CqpQuery query = new CqpQuery(engine, "", aCollectionId);
        try {
            query.setContext(0, 0, ContextUnit.CHARACTER);

            String queryString = "[begin=\"" + aBeginOffset + "\"] []* [end=\"" + aEndOffset
                    + "\"] :: match.text_id=\"" + aDocumentId + "\"";
            log.trace(queryString);
            query.runQuery(queryString);

            if (query.size() > 1) {
                log.warn("More than 1 entry found: " + query);
            }
            if (query.size() == 0) {
                throw new IllegalArgumentException("Nothing found for: " + queryString);
            }

            return query.cat(1).get(0).getCoveredText();
        }
        finally {
            // close the query even if running it or reading the result fails
            query.close();
        }
    }
}