package edu.cmu.geolocator.resource.gazindexing.CollaborativeIndex; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.HashSet; import org.apache.lucene.document.Document; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import edu.cmu.geolocator.GlobalParam; import edu.cmu.geolocator.io.GetReader; import edu.cmu.geolocator.model.CandidateAndFeature; import edu.cmu.geolocator.model.LocEntityAnnotation; import edu.cmu.geolocator.resource.ResourceFactory; import edu.cmu.geolocator.resource.gazindexing.Index; public class CollaborativeIndex implements Index { private IndexSearcher stringSearcher, infoSearcher; private HashSet<String> ids; private BooleanQuery q; private ArrayList<Document> returnDocs; private String stringIndexName, infoIndexName, stringLoad, infoLoad; private static CollaborativeIndex ci; public static CollaborativeIndex getInstance() { if (ci == null) ci = new CollaborativeIndex().config(GlobalParam.getGazIndex() + "/StringIndex", GlobalParam.getGazIndex() + "/InfoIndex", "mmap", "mmap").open(); return ci; } public CollaborativeIndex config(String stringIndexName, String infoIndexName, String stringLoad, String infoLoad) { this.stringIndexName = stringIndexName; this.infoIndexName = infoIndexName; this.stringLoad = stringLoad; this.infoLoad = infoLoad; return this; } /* * public boolean inIndexStrict(String phrase) { if (phrase == null || phrase.length() == 0) throw * new NullPointerException(); TermQuery query = new TermQuery(new Term("LOWERED_ORIGIN", * phrase.toLowerCase())); //If it's an abbreviation, then return true; ignore case. * * if (ResourceFactory.getCountryCode2CountryMap().isCountryAbbreviation(phrase)) return true; * TopDocs res = null; try { res = stringSearcher.search(query, 1); } catch (IOException e) { // * TODO Auto-generated catch block e.printStackTrace(); } if (res == null) return false; else * return res.totalHits > 0 ? true : false; } */ /** * Check if the original string is in index. If not, check the non-space version. However, we have * to add some heuristics. */ @Override public boolean inIndex(String phrase) { if (phrase == null || phrase.length() == 0) throw new NullPointerException(); if (phrase.startsWith("#")) phrase = phrase.substring(1); if (ResourceFactory.getCountryCode2CountryMap().isInMap(phrase.trim().toLowerCase())) return true; phrase = phrase.toLowerCase().replace(" ", ""); TermQuery query = new TermQuery(new Term("LOWERED-NO-WS", phrase)); TopDocs res = null; try { res = stringSearcher.search(query, 1); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } if (res == null) return false; return res.totalHits > 0 ? true : false; } @Override public ArrayList<Document> getDocumentsByPhrase(String phrase) { if (phrase == null || phrase.length() == 0) throw new NullPointerException(); if (phrase.startsWith("#")) phrase = phrase.substring(1); TermQuery query = new TermQuery( new Term("LOWERED-NO-WS", phrase.toLowerCase().replace(" ", ""))); TopDocs res = null; try { res = stringSearcher.search(query, 2500); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } String abIds = null; if (ResourceFactory.getCountryCode2CountryMap().isInMap(phrase.toLowerCase())) abIds = ResourceFactory.getCountryCode2CountryMap().getValue(phrase.toLowerCase()).getId(); // System.out.println(res.totalHits); if (res == null && abIds == null) return null; if (res.totalHits == 0 && abIds == null) return null; ids = new HashSet<String>(); if (res != null) try { for (ScoreDoc doc : res.scoreDocs) { ids.add(stringSearcher.doc(doc.doc).get("ID")); } } catch (Exception e) { e.printStackTrace(); } if (abIds != null) ids.add(abIds); // System.out.println(ids); // System.out.println("total number of String ids are:" + ids.size()); q = new BooleanQuery(); for (String id : ids) { q.add(new TermQuery(new Term("ID", id)), Occur.SHOULD); } // use a term filter instead of a query filter. try { TopDocs docs = infoSearcher.search(q, 2500); // System.out.println("total hits in info is:" + docs.totalHits); returnDocs = new ArrayList<Document>(docs.totalHits); for (ScoreDoc d : docs.scoreDocs) { returnDocs.add(infoSearcher.doc(d.doc)); } return returnDocs; } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return null; } /* * public ArrayList<Document> getDocumentsByPhraseStrict(String phrase) { if (phrase == null || * phrase.length() == 0) throw new NullPointerException(); TermQuery query = new TermQuery(new * Term("LOWERED_ORIGIN", phrase.toLowerCase())); TopDocs res = null; try { res = * stringSearcher.search(query, 200); } catch (IOException e) { // TODO Auto-generated catch block * e.printStackTrace(); } System.out.println(res.totalHits); * * if (res == null) return null; if (res.totalHits == 0) return null; ids = new * HashSet<String>(res.totalHits); try { for (ScoreDoc doc : res.scoreDocs) { * ids.add(stringSearcher.doc(doc.doc).get("ID")); } } catch (Exception e) { e.printStackTrace(); * } if (ResourceFactory.getCountryCode2CountryMap().isCountryAbbreviation(phrase)) * ids.add(ResourceFactory.getCountryCode2CountryMap().getValue(phrase).getId()); // * System.out.println("total number of String ids are:" + ids.size()); q = new BooleanQuery(); * * for (String id : ids) { q.add(new TermQuery(new Term("ID", id)), Occur.SHOULD); } // use a term * filter instead of a query filter. try { TopDocs docs = infoSearcher.search(q, 200); // * System.out.println("total hits in info is:" + docs.totalHits); returnDocs = new * ArrayList<Document>(docs.totalHits); for (ScoreDoc d : docs.scoreDocs) { * returnDocs.add(infoSearcher.doc(d.doc)); } return returnDocs; } catch (IOException e) { // TODO * Auto-generated catch block e.printStackTrace(); } return null; } */ public CollaborativeIndex open() { try { stringSearcher = GetReader.getIndexSearcher(stringIndexName, stringLoad); // for setting the max clause count for search query. BooleanQuery.setMaxClauseCount(2500); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } try { infoSearcher = GetReader.getIndexSearcher(infoIndexName, infoLoad); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } return this; } @Override public void close() { // TODO Auto-generated method stub } public String[] getAlternateNames(String id) { HashSet<String> names = null; Query query = new TermQuery(new Term("ID", id)); try { TopDocs topDocs = infoSearcher.search(query, 2500); names = new HashSet<String>(topDocs.totalHits); for (ScoreDoc doc : topDocs.scoreDocs) { names.add(stringSearcher.doc(doc.doc).get("LOWERED_ORIGIN")); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return names.toArray(new String[names.size()]); } @Override public Document getDocumentsById(String id) { if (id == null || id.length() == 0) throw new NullPointerException(); TermQuery query = new TermQuery(new Term("ID", id)); TopDocs res = null; try { res = infoSearcher.search(query, 1); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } if (res == null) return null; if (res.totalHits == 0) return null; try { return infoSearcher.doc(res.scoreDocs[0].doc); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return null; } public static void main(String argv[]) throws IOException { GlobalParam.setGazIndex("/users/indri/GazIndex"); // GlobalParam.setGeoNames("GeoNames"); CollaborativeIndex ci = ResourceFactory.getClbIndex(); System.out.println(ci.getDocumentsByPhrase("newport").size()); for (Document doc : ci.getDocumentsByPhrase("San Francisco")) System.out.println(doc.get(InfoFields.name) + " " + doc.get(InfoFields.countryCode) + " " + doc.get(InfoFields.adm1Code) + "\t" + doc.get(InfoFields.adm2Code) + "\t" + doc.get(InfoFields.alternativeNamesCount) + "\t" +doc.get(InfoFields.population)+"\t" +doc.get(InfoFields.id) + "\t" + doc.get(InfoFields.latitude) + "\t" + doc.get(InfoFields.longitude)); if (true) return; boolean mode = true; // string // mode = false; // id /** * search string //id */ BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); String s = null; while ((s = br.readLine()) != null) { if (mode) { if (ResourceFactory.getClbIndex().inIndex(s)) for (Document d : ResourceFactory.getClbIndex().getDocumentsByPhrase(s)) System.out.println(d); else System.out.println("null."); } else { Document doc = ci.getDocumentsById(s); System.out.println(doc); CandidateAndFeature gc = new CandidateAndFeature(s, doc, new LocEntityAnnotation(0, 0, s, null)); System.out.println(gc.getId() + " " + gc.getAsciiName() + " " + gc.getCountryCode()); } } } }