package edu.uncc.cs.watsonsim.index;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream;
import edu.stanford.nlp.util.Triple;
import edu.uncc.cs.watsonsim.Configuration;
import edu.uncc.cs.watsonsim.Database;
import edu.uncc.cs.watsonsim.Passage;
/**
 * Index a database of plain-text sources using pluggable modules.
 *
 * @author Phani Rahul
 * @author Modifications: Jonathan Shuman
 * @author Later rewrite by Sean Gallagher
 */
public class Reindex {
/**
* the input file which has to be indexed. This is a database made from TRECtext's
*/
private final Database db;
final List<Segment> indexers;
private final Configuration conf = new Configuration();
public Reindex() throws IOException {
db = new Database(conf);
indexers = Arrays.asList(
//new Lucene(Paths.get(conf.getConfOrDie("lucene_index")))
//new Indri(conf.getConfOrDie("indri_index")),
new Bigrams()
//new Edges(db)
);
}
/**
* Index collected datasources using Lucene and Indri
*/
public static void main(String[] args) throws SQLException, IOException {
new Reindex().run();
}
public void run() {
try {
//indexAll("SELECT title, text, reference FROM sources;");
if (db.backend().startsWith("SQLite")) {
// I highly recommend SQLite. The indexing will run much faster
// because it has a more efficient query plan for this
indexAll("SELECT "
+ "title, "
+ "group_concat(text, ' ') as text,"
+ "min(reference) as reference "
+ "FROM sources "
+ "GROUP BY title;");
} else {
indexAll("SELECT "
+ "title, "
+ "string_agg(text, ' ') as text,"
+ "min(reference) as reference "
+ "FROM sources "
+ "GROUP BY title;");
}
} catch (IllegalArgumentException | SQLException e) {
e.printStackTrace();
} finally {
// Even if the process is interrupted, save the indices!
indexers.forEach(i -> {
try {
i.close();
} catch (Exception e) {
e.printStackTrace();
}
});
db.commit();
}
// SemanticVectors Post-processing
/*try {
BuildIndex.main(new String[]{
"-luceneindexpath", conf.getConfOrDie("lucene_index"),
"-docidfield", "docno",
"-docindexing", "incremental",
"-contentsfields", "text"});
} catch (IllegalArgumentException | IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}*/
System.out.println("Done indexing.");
}
private void indexAll(String query) throws SQLException {
PreparedStatement statements = db.prep(query);
statements.setFetchSize(10000);
ResultSet rs = statements.executeQuery();
AtomicInteger c = new AtomicInteger();
Stream.generate(() -> {
List<Triple<String,String,String>> block = new ArrayList<>(300);
try {
synchronized(rs) {
while (block.size() < 300 && !rs.isAfterLast() && rs.next()) {
// The usual case, another result
block.add(Triple.makeTriple(
rs.getString(1), rs.getString(2), rs.getString(3)));
}
}
} catch (SQLException e) {
// Sometimes the resultset closes while we use it.
// What can we do about it?
e.printStackTrace();
}
return block;
}).parallel().flatMap((block) -> {
if (!block.isEmpty()) {
for (Triple<String,String,String> row : block) {
Passage pass = new Passage(
"none", row.first, row.second, row.third);
for (Segment i : indexers) {
i.accept(pass);
}
}
int count = c.addAndGet(block.size());
System.out.println("Indexed " + count);
}
// It's looking for the first non-empty stream
if (block.isEmpty()) return Stream.of("done");
else return Stream.empty();
}
).findFirst();
}
}