package experiments.collective.entdoccentric;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import experiments.collective.entdoccentric.StandardQueryDataObject.EntityObject;
import experiments.collective.entdoccentric.query.QueryGenerator;
import experiments.collective.entdoccentric.query.QuerySettings;
public class CollectiveTestApproach {
public static final String entIndexDirectory = "/home/quh/Arbeitsfläche/Entpackung/Arbeitsfläche/Code_Data/LuceneCorpora/Lucene 4.1/NoStemmingKnowledgeBaseCalbCSmallBackup/";
public static final String docIndexDirectory = "/home/quh/Arbeitsfläche/Entpackung/Arbeitsfläche/Code_Data/LuceneCorpora/Lucene 4.1/NoStemmingCalbCSmall/";
private boolean fuzzy;
private boolean withDescription;
private IndexSearcher entISearcher;
private IndexReader entIReader;
private Query entQuery;
private IndexSearcher docISearcher;
private IndexReader docIReader;
private Query docQuery;
private ScoreDoc[] docScore;
private Set<Bucket> rdyBuckets;
private int qryId;
public CollectiveTestApproach(boolean fuzzy, boolean standardSeacher,
boolean withDescription) {
File indexDir = new File(entIndexDirectory);
File indexDir1 = new File(docIndexDirectory);
this.fuzzy = fuzzy;
this.withDescription = withDescription;
try {
Directory dir = FSDirectory.open(indexDir);
Directory dir1 = FSDirectory.open(indexDir1);
entISearcher = new IndexSearcher(DirectoryReader.open(dir));
entIReader = DirectoryReader.open(dir);
docISearcher = new IndexSearcher(DirectoryReader.open(dir1));
docIReader = DirectoryReader.open(dir1);
if (!standardSeacher) {
entISearcher.setSimilarity(new BM25Similarity());
}
} catch (IOException e) {
e.printStackTrace();
}
}
public void search(StandardQueryDataObject object, int queryNumber) {
this.qryId = queryNumber;
List<Document> docDocList = new LinkedList<Document>();
Set<Bucket> buckets = new HashSet<Bucket>();
List<EntityObject> objects = object.getEnts();
for (int k = 0; k < objects.size(); ++k) {
EntityObject obj = objects.get(k);
// ENTITY CENTRIC CANDIDATES
QuerySettings settings = new QuerySettings();
settings.setDocumentcentric(false);
settings.setDescriptionFuzzy(fuzzy);
settings.setQuery("ltr");
settings.setUseDescription(withDescription);
entQuery = QueryGenerator.getInstance().createQuery(obj, settings);
try {
TopDocs top = entISearcher.search(entQuery, 10);
ScoreDoc[] docs = top.scoreDocs;
LinkedList<String> container = new LinkedList<String>();
for (int i = 0; i < docs.length; i++) {
container.add(entIReader.document(top.scoreDocs[i].doc)
.get("ID"));
}
Bucket buck = new Bucket();
buck.setContainer(container);
buck.setObjectPosition(k);
buckets.add(buck);
} catch (IOException e) {
e.printStackTrace();
}
// Document Centric Algorithm to get a set of relevant documents!
settings = new QuerySettings();
settings.setDocumentcentric(true);
settings.setDescriptionFuzzy(fuzzy);
settings.setQuery("std");
settings.setUseDescription(withDescription);
docQuery = QueryGenerator.getInstance().createQuery(obj, settings);
try {
TopDocs top = docISearcher.search(docQuery, 101);
docScore = top.scoreDocs;
/**
* Achtung! Damit der Test stimmt muss das Dokument, aus welchem
* das Query entity stammt entfernt werden. Im Folgenden eher
* als Hack realisiert. Muss noch besser implementiert werden.
*/
long objectId = Long.valueOf(object.getDocId());
for (int i = 0; i < docScore.length; i++) {
long scoreDoc = Long.valueOf(docIReader.document(
docScore[i].doc).get("id"));
// System.out.println(scoreDoc);
if (objectId != scoreDoc) {
Document doc = docIReader.document(docScore[i].doc);
docDocList.add(doc);
}
}
} catch (IOException e) {
e.printStackTrace();
}
}
this.rdyBuckets = new HashSet<Bucket>();
while (buckets.size() > 0) {
Document currentBestDoc = checkCurrentBestDoc(docDocList, buckets);
List<Bucket> rdy = extractRdyBuckets(currentBestDoc, buckets);
rdyBuckets.addAll(rdy);
for (Bucket rdyb : rdy) {
if (buckets.contains(rdyb)) {
buckets.remove(rdyb);
}
}
}
}
private List<Bucket> extractRdyBuckets(Document doc, Set<Bucket> buckets) {
List<Bucket> rdy = new LinkedList<Bucket>();
String concepts = doc.get("concept");
String[] arr = createConceptArray(concepts);
for (int i = 0; i < arr.length; i++) {
for (Bucket bucket : buckets) {
boolean add = false;
for (String s : bucket.getContainer()) {
if (s.equalsIgnoreCase(arr[i])) {
bucket.setEntity(arr[i]);
rdy.add(bucket);
add = true;
break;
}
}
if (add) {
break;
}
}
}
return rdy;
}
private Document checkCurrentBestDoc(List<Document> docs, Set<Bucket> buck) {
int max = 0;
Document maxdoc = null;
for (Document doc : docs) {
int tempnr = 0;
String concepts = doc.get("concept");
String[] arr = createConceptArray(concepts);
for (Bucket bucket : buck) {
for (int i = 0; i < arr.length; i++) {
boolean add = false;
for (String s : bucket.getContainer()) {
if (s.equalsIgnoreCase(arr[i])) {
tempnr++;
add = true;
break;
}
}
if (add) {
break;
}
}
}
if (tempnr >= max) {
max = tempnr;
maxdoc = doc;
}
}
System.out.println("Max wert: " + max + "Bucketanzahl: " + buck.size());
return maxdoc;
}
public void configureResultObject(List<TrecEvalResultObject> result,
List<EntityObject> object) {
// System.out.println(rdyBuckets.size());
for (int i = 0; i < rdyBuckets.size(); i++) {
String[] resultStrings = new String[6];
resultStrings[0] = String.valueOf(qryId + i);
resultStrings[1] = "Q0";
resultStrings[2] = String.valueOf(searchEntity(i));
resultStrings[3] = String.valueOf(i + 1);
resultStrings[4] = String.valueOf(1);
resultStrings[5] = "STANDARD";
result.get(i).setResult(resultStrings);
LinkedList<String> str = object.get(i).getResultLinks();
HashMap<Integer, String> hashm = new HashMap<Integer, String>();
for (int j = 0; j < str.size(); j++) {
hashm.put(str.get(j).hashCode(), str.get(j));
}
String[][] optimalResultStrings = new String[hashm.size()][4];
int amountIt = 0;
for (Integer key : hashm.keySet()) {
optimalResultStrings[amountIt][0] = String.valueOf(qryId + i);
optimalResultStrings[amountIt][1] = "Q0";
optimalResultStrings[amountIt][2] = hashm.get(key);
optimalResultStrings[amountIt][3] = "1";
System.out.println(hashm.get(key));
amountIt++;
}
System.out.println("----------------------------------------------"
+ optimalResultStrings.length);
result.get(i).setOptimalResult(optimalResultStrings);
}
}
private String searchEntity(int i) {
for (Bucket b : rdyBuckets) {
if (b.getObjectPosition() == i) {
return b.getEntity();
}
}
return null;
}
private String[] createConceptArray(String str) {
List<String> lst = new LinkedList<String>();
str = str.trim();
String[] arr = str.split(" ");
for (int i = 0; i < arr.length; i++) {
if (!arr[i].equalsIgnoreCase("") && analyseConcept(arr[i])) {
// if(randomNr % 5 == 0) {
lst.add(generateID(arr[i].toUpperCase()));
// }
}
}
String[] result = new String[lst.size()];
lst.toArray(result);
return result;
}
private boolean analyseConcept(String str) {
String[] arr = str.split(":");
// System.out.println(arr.length);
if (arr.length < 3) {
return false;
}
if (arr[2] == null || arr[2].equalsIgnoreCase("")) {
return false;
}
return true;
}
private String generateID(String line) {
String[] splitter = line.split(":");
String link = "";
if (splitter[1].equalsIgnoreCase("uniprot")
&& !splitter[2].equalsIgnoreCase("") && splitter[2] != null) {
link = "UN_" + splitter[2];
} else if (splitter[1].equalsIgnoreCase("entrezgene")
&& !splitter[2].equalsIgnoreCase("") && splitter[2] != null) {
link = "NC_" + splitter[2];
} else if (splitter[1].equalsIgnoreCase("umls")
&& !splitter[2].equalsIgnoreCase("") && splitter[2] != null) {
link = "LI_" + splitter[2];
} else if (splitter[1].equalsIgnoreCase("ncbi")
&& !splitter[2].equalsIgnoreCase("") && splitter[2] != null) {
link = "NC_" + splitter[2];
} else if (splitter[1].equalsIgnoreCase("disease")
&& !splitter[2].equalsIgnoreCase("") && splitter[2] != null) {
link = "LI_" + splitter[2];
}
return link;
}
public class Bucket {
private LinkedList<String> container;
private int objectPosition;
private String entity;
public LinkedList<String> getContainer() {
return container;
}
public void setContainer(LinkedList<String> container) {
this.container = container;
}
public int getObjectPosition() {
return objectPosition;
}
public void setObjectPosition(int objectPosition) {
this.objectPosition = objectPosition;
}
public String getEntity() {
return entity;
}
public void setEntity(String entity) {
this.entity = entity;
}
public boolean equals(Object obj) {
if (!(obj instanceof Bucket))
return false;
return this.objectPosition == ((Bucket) obj).objectPosition;
}
public int hashCode() {
return objectPosition;
}
}
}