/*
 *  Copyright (C) 2000 - 2011 TagServlet Ltd
 *
 *  This file is part of Open BlueDragon (OpenBD) CFML Server Engine.
 *
 *  OpenBD is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, version 3.
 *
 *  OpenBD is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenBD.  If not, see http://www.gnu.org/licenses/
 *
 *  Additional permission under GNU GPL version 3 section 7
 *
 *  If you modify this Program, or any covered work, by linking or combining
 *  it with any of the JARS listed in the README.txt (or a modified version of
 *  that library), containing parts covered by the terms of that JAR, the
 *  licensors of this Program grant you additional permission to convey the
 *  resulting work.
 *  README.txt @ http://www.openbluedragon.org/license/README.txt
 *
 *  http://www.openbluedragon.org/
 *
 *  $Id: PathFunction.java 1638 2011-07-31 16:08:50Z alan $
 */
package com.bluedragon.search.index.path;

import java.io.File;
import java.util.Iterator;
import java.util.Set;

import com.bluedragon.search.DocumentWrap;
import com.bluedragon.search.collection.Collection;
import com.bluedragon.search.collection.CollectionFactory;
import com.bluedragon.search.index.DocumentWriter;
import com.bluedragon.search.index.crawl.CrawlFactory;
import com.bluedragon.search.index.custom.CustomFunction;
import com.naryx.tagfusion.cfm.engine.cfArgStructData;
import com.naryx.tagfusion.cfm.engine.cfArrayData;
import com.naryx.tagfusion.cfm.engine.cfData;
import com.naryx.tagfusion.cfm.engine.cfNumberData;
import com.naryx.tagfusion.cfm.engine.cfQueryResultData;
import com.naryx.tagfusion.cfm.engine.cfSession;
import com.naryx.tagfusion.cfm.engine.cfStringData;
import com.naryx.tagfusion.cfm.engine.cfStructData;
import com.naryx.tagfusion.cfm.engine.cfmRunTimeException;

public class PathFunction extends CustomFunction {
	private static final long serialVersionUID = 1L;

	public PathFunction(){
		String[] p = new String[]{ "collection", "key", "title", "summary", "author", "category", "categorytree", "urlpath", "custommap", "extensions", "recurse", "query" };
		min = 2;
		max = p.length;
		setNamedParams( p );
	}

	public String[] getParamInfo(){
		return new String[]{
			"the name of the collection",
			"the full path of the directory to index. If a document already exists, it will be removed and replaced with this one. If 'query' is present, this is the column where the path is found",
			"title for the document. If 'query' is present, this is a column name",
			"summary for the document. If 'query' is present, this is a column name. This column is not indexed, merely stored as a reference",
			"author for the document. If 'query' is present, this is a column name",
			"one or more categories, as a comma-separated list. If 'query' is present, this is a column name",
			"the categorytree for this particular document. If 'query' is present, this is a column name",
			"the urlpath of this document. If 'query' is present, this is a column name",
			"a structure of custom attributes that will be added to the document and indexed. The key of the structure element will be the field name and the value will be indexed. You can specify as many custom attributes as required. Each one is stored in the index as well. If 'query' is present, the values are used as column names",
			"a list of extensions to include in this crawl. Defaults to '.cfm, .cfml, .htm, .html, .dbm, .dbml'. '.*'/'*.*' handles all files",
			"a flag to determine whether or not the path is recursed for sub-directories",
			"the query representing all the rows to add to this index."
		};
	}

	public java.util.Map getInfo(){
		return makeInfo(
				"search",
				"Inserts/Updates a path into the collection. The key is the path to a directory; every file inside it will be crawled and indexed. Each field in the document can be searched against. "
				+ "If a query is presented then the fields represent columns into the query. If the column does not exist then an exception is thrown. "
				+ "The index can still be searched while an update is happening, however the new documents will not be available in the search until this operation has completed. "
				+ "Note that all fields are treated as strings and will be indexed accordingly.",
				ReturnType.STRUCTURE );
	}

	public cfData execute( cfSession _session, cfArgStructData argStruct ) throws cfmRunTimeException {

		// Validate the collection
		String col = getNamedStringParam( argStruct, "collection", null );
		if ( col == null )
			throwException( _session, "missing the collection parameter" );

		Collection collection = CollectionFactory.getCollection( col );
		if ( collection == null )
			throwException( _session, "the collection (" + col + ") was not found" );

		// Determine whether a query object has been passed in
		cfData tmpdata = getNamedParam( argStruct, "query", null );
		if ( tmpdata != null && tmpdata.getDataType() != cfData.CFQUERYRESULTDATA ){
			throwException( _session, "the query parameter was not a proper type" );
		}
		cfQueryResultData query = (cfQueryResultData)tmpdata;

		// Validate the body
		checkParam( _session, argStruct, "key", query, true );
		checkParam( _session, argStruct, "title", query, false );
		checkParam( _session, argStruct, "summary", query, false );
		checkParam( _session, argStruct, "author", query, false );
		checkParam( _session, argStruct, "category", query, false );
		checkParam( _session, argStruct, "categorytree", query, false );
		checkParam( _session, argStruct, "urlpath", query, false );

		// Validate the custom map
		tmpdata = getNamedParam( argStruct, "custommap", null );
		if ( tmpdata != null && tmpdata.getDataType() != cfData.CFSTRUCTDATA ){
			throwException( _session, "the custommap parameter was not a proper structure/map" );
		}
		cfStructData custommap = (cfStructData)tmpdata;

		if ( custommap != null && query != null ){
			Iterator<String> it = custommap.keySet().iterator();
			while ( it.hasNext() ){
				String k = it.next();
				String v = custommap.getData( k ).getString();
				if ( !isColumn( query, v ) )
					throwException( _session, "custommap key: " + k + ", is mapped to: " + v + ", but this column was not found in the query" );
			}
		}

		// We now have all of our required parameters
		if ( query != null )
			return indexPathQuery( collection, query, _session, custommap, argStruct );
		else
			return indexPath( collection, _session, custommap, argStruct );
	}


	/**
	 * Indexes all the files found under a single path
	 *
	 * @param collection
	 * @param _session
	 * @param custommap
	 * @param argStruct
	 * @return
	 * @throws cfmRunTimeException
	 */
	private cfData indexPath( Collection collection, cfSession _session, cfStructData custommap, cfArgStructData argStruct ) throws cfmRunTimeException {
		String filename = getNamedStringParam( argStruct, "key", "" );
		File originalfile = new File( filename );
		if ( !originalfile.isDirectory() )
			throwException( _session, "Path not found: " + originalfile.toString() );
		CrawlFactory crawlfactory = new CrawlFactory( collection.bStoreBody() );

		int totalFiles = 0;
		cfArrayData badFiles = cfArrayData.createArray( 1 );

		String urlpath = getNamedStringParam( argStruct, "urlpath", null );
		String exts = getNamedStringParam( argStruct, "extensions", null );
		boolean bRcurse = getNamedBooleanParam( argStruct, "recurse", false );

		try{
			String originalFileSt = originalfile.getCanonicalPath();
			DocumentWriter docWriter = collection.getDocumentWriter();

			Set<String> filesToCrawlSet = crawlfactory.getFilesToCrawl( originalfile, exts, bRcurse );
			Iterator<String> fit = filesToCrawlSet.iterator();
			while ( fit.hasNext() ){
				String fileToCrawl = fit.next();

				DocumentWrap doc = crawlfactory.crawlFile( null, new File( fileToCrawl ) );
				if ( doc == null ){
					badFiles.addElement( new cfStringData( fileToCrawl ) );
					continue;
				}

				// Set the URL
				if ( urlpath != null ){
					doc.setURL( urlpath + fileToCrawl.replace( '\\', '/' ).substring( originalFileSt.length() + 1 ) );
				}

				// Set the optional fields
				doc.setName( getNamedStringParam( argStruct, "title", null ) );
				doc.setSummary( getNamedStringParam( argStruct, "summary", null ) );
				doc.setAuthor( getNamedStringParam( argStruct, "author", null ) );
				doc.setCategoryTree( getNamedStringParam( argStruct, "categorytree", null ) );
				doc.setCategories( getNamedStringParam( argStruct, "category", "" ).split( "," ) );

				// Set the custom attributes
				if ( custommap != null ){
					Iterator<String> it = custommap.keySet().iterator();
					while ( it.hasNext() ){
						String k = it.next();
						String v = custommap.getData( k ).getString();
						doc.setAttribute( k, v );
					}
				}

				// Add the document to the index
				docWriter.add( doc );
				totalFiles++;
			}

			docWriter.commit();
		}catch( Exception e ){
			throwException( _session, e.getMessage() );
		}finally{
			crawlfactory.close();
		}

		cfStructData sd = new cfStructData();
		sd.setData( "inserted", new cfNumberData( totalFiles ) );
		sd.setData( "invalid", new cfNumberData( badFiles.size() ) );
		sd.setData( "badkeys", badFiles );
		return sd;
	}


	private cfData indexPathQuery( Collection collection, cfQueryResultData query, cfSession _session, cfStructData custommap, cfArgStructData argStruct ) throws cfmRunTimeException {
		cfStructData sd = new cfStructData();
		int totalDocs = 0;
		cfArrayData badFiles = cfArrayData.createArray( 1 );
		DocumentWriter docWriter = null;

		String exts = getNamedStringParam( argStruct, "extensions", null );
		boolean bRcurse = getNamedBooleanParam( argStruct, "recurse", false );

		CrawlFactory crawlfactory = new CrawlFactory( collection.bStoreBody() );

		try{
			docWriter = collection.getDocumentWriter();

			query.reset();
			while ( query.nextRow() ){
				String filename = getQueryParam( argStruct, query, "key" );

				File originalfile = new File( filename );
				if ( !originalfile.isDirectory() ){
					badFiles.addElement( new cfStringData( filename ) );
					continue;
				}

				String originalFileSt = originalfile.getCanonicalPath();
				String urlpath = getQueryParam( argStruct, query, "urlpath" );

				Set<String> filesToCrawlSet = crawlfactory.getFilesToCrawl( originalfile, exts, bRcurse );
				Iterator<String> fit = filesToCrawlSet.iterator();
				while ( fit.hasNext() ){
					String fileToCrawl = fit.next();

					DocumentWrap doc = crawlfactory.crawlFile( null, new File( fileToCrawl ) );
					if ( doc == null ){
						badFiles.addElement( new cfStringData( fileToCrawl ) );
						continue;
					}

					// Set the URL
					if ( urlpath != null ){
						doc.setURL( urlpath + fileToCrawl.replace( '\\', '/' ).substring( originalFileSt.length() + 1 ) );
					}

					// Set the optional fields
					doc.setName( getQueryParam( argStruct, query, "title" ) );
					doc.setSummary( getQueryParam( argStruct, query, "summary" ) );
query, "author" ) ); doc.setCategoryTree( getQueryParam(argStruct, query, "categorytree" ) ); // Set the custom attributes if ( custommap != null ){ Iterator<String> it = custommap.keySet().iterator(); while ( it.hasNext() ){ String k = it.next(); String v = query.getData( custommap.getData(k).getString() ).getString(); doc.setAttribute(k, v); } } String category = getQueryParam(argStruct, query, "category" ); if ( category != null ) doc.setCategories( category.split(",") ); docWriter.add( doc ); totalDocs++; } } } catch (Exception e) { throwException(_session, e.getMessage()); }finally{ try { if ( docWriter != null ) docWriter.commit(); } catch (Exception e) { throwException(_session, e.getMessage()); } crawlfactory.close(); } // Set the status sd.setData("inserted", new cfNumberData(totalDocs) ); sd.setData("invalid", new cfNumberData(badFiles.size()) ); sd.setData("badfiles", badFiles ); return sd; } }