/**
* OLAT - Online Learning and Training<br>
* http://www.olat.org
* <p>
* Licensed under the Apache License, Version 2.0 (the "License"); <br>
* you may not use this file except in compliance with the License.<br>
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing,<br>
* software distributed under the License is distributed on an "AS IS" BASIS, <br>
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. <br>
* See the License for the specific language governing permissions and <br>
* limitations under the License.
* <p>
* Copyright (c) since 2004 at Multimedia- & E-Learning Services (MELS),<br>
* University of Zurich, Switzerland.
* <hr>
* <a href="http://www.openolat.org">
* OpenOLAT - Online Learning and Training</a><br>
* This file has been modified by the OpenOLAT community. Changes are licensed
* under the Apache 2.0 license as the original file.
*/
package org.olat.search.service.indexer.repository.course;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.document.Document;
import org.olat.core.logging.OLog;
import org.olat.core.logging.Tracing;
import org.olat.core.util.FileUtils;
import org.olat.core.util.vfs.VFSContainer;
import org.olat.core.util.vfs.VFSItem;
import org.olat.core.util.vfs.VFSLeaf;
import org.olat.course.ICourse;
import org.olat.course.nodes.CourseNode;
import org.olat.course.nodes.sp.SPEditController;
import org.olat.search.service.SearchResourceContext;
import org.olat.search.service.document.CourseNodeDocument;
import org.olat.search.service.indexer.LeafIndexer;
import org.olat.search.service.indexer.OlatFullIndexer;
/**
* Indexer for SP (SinglePage) course-node.
* @author Christian Guretzki
*/
public class SPCourseNodeIndexer extends LeafIndexer implements CourseNodeIndexer {
    private static final OLog log = Tracing.createLoggerFor(SPCourseNodeIndexer.class);

    // Must correspond with LocalString_xx.properties
    // Do not use '_' because we want to search for a certain document type and Lucene has problems with '_'
    public final static String TYPE = "type.course.node.sp";

    private final static String SUPPORTED_TYPE_NAME = "org.olat.course.nodes.SPCourseNode";

    // When true only the configured file is indexed and linked sub-pages are not followed
    private final static boolean indexOnlyChosenFile = false;

    // Highest sub-page level (the configured page is level 0) whose links are still followed
    private static final int MAX_SUB_PAGE_LEVEL = 5;

    // Matches href="..." attributes whose target does not start with http://, https://,
    // javascript:, mailto:, tel:, '/', ':', '#' or '..'; group 1 is the link target
    private static final Pattern HREF_PATTERN = Pattern.compile("href=\\\"(?!http:\\/\\/|https:\\/\\/|javascript:|mailto:|tel:|\\/|:|#|\\.\\.)([^\\\"]*)\\\"", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE);

    // Lower-case file suffixes of files that are scanned for links to sub-pages.
    // Kept as a set so that only exact suffixes match (an empty suffix or a
    // fragment like "ml" must not be treated as HTML).
    private static final Set<String> HTML_SUFFIXES = new HashSet<String>();
    static {
        HTML_SUFFIXES.add("html");
        HTML_SUFFIXES.add("htm");
        HTML_SUFFIXES.add("xhtml");
        HTML_SUFFIXES.add("xml");
    }

    /**
     * Index a single page course node: a document for the node itself, the
     * configured file and (unless disabled) the pages linked from that file.
     * <p>
     * If no file is configured, or the configured file (or its folder) can not
     * be resolved to a leaf (resp. container), only the node document is added.
     *
     * @param courseResourceContext search context of the course
     * @param course the course the node belongs to
     * @param courseNode the single page course node
     * @param indexWriter receives the created documents
     */
    @Override
    public void doIndex(SearchResourceContext courseResourceContext, ICourse course, CourseNode courseNode, OlatFullIndexer indexWriter) throws IOException, InterruptedException {
        if (log.isDebug()) log.debug("Index SinglePage...");
        SearchResourceContext courseNodeResourceContext = createSearchResourceContext(courseResourceContext, courseNode, TYPE);
        Document nodeDocument = CourseNodeDocument.createDocument(courseNodeResourceContext, courseNode);
        indexWriter.addDocument(nodeDocument);

        // The root of the configured single page. Depends on the configuration
        // whether to follow relative links or not. When relative links are
        // followed, the root is the course folder root, if not, it is the folder
        // where the configured file is in
        VFSContainer rootContainer;
        // The filename of the configured file relative to the rootContainer
        String chosenFile;

        // Read the course node configuration
        VFSContainer courseFolderContainer = course.getCourseEnvironment().getCourseFolderContainer();
        boolean allowRelativeLinks = courseNode.getModuleConfiguration().getBooleanSafe(SPEditController.CONFIG_KEY_ALLOW_RELATIVE_LINKS);
        String fileName = (String) courseNode.getModuleConfiguration().get(SPEditController.CONFIG_KEY_FILE);
        if (fileName == null || fileName.length() == 0) {
            // Nothing configured: charAt(0) below would fail, so stop here
            if (log.isDebug()) log.debug("No file configured in SP => Nothing indexed.");
            return;
        }

        // *** IF YOU CHANGE THIS LOGIC, do also change it in SinglePageController! ***
        if (allowRelativeLinks) {
            // Case 1: relative links are allowed. The root is the root of the
            // course, the file name is relative to the root
            rootContainer = courseFolderContainer;
            chosenFile = fileName;
        } else {
            // Case 2: relative links are NOT allowed. We have to calculate the
            // new root and remove the relative path to the course folder from
            // the file.
            String startURI = (fileName.charAt(0) == '/') ? fileName.substring(1) : fileName;
            int sla = startURI.lastIndexOf('/');
            if (sla != -1) {
                // Some subfolder path is detected, create base container from it
                String root = startURI.substring(0, sla);
                startURI = startURI.substring(sla + 1);
                // Create new root folder from the relative folder path
                VFSItem rootItem = courseFolderContainer.resolve(root);
                if (!(rootItem instanceof VFSContainer)) {
                    // Folder is missing or is not a folder: nothing to index
                    if (log.isDebug()) log.debug("Can not find folder of chosen file in SP, folder=" + root + " => Nothing indexed.");
                    return;
                }
                VFSContainer newroot = (VFSContainer) rootItem;
                newroot.setParentContainer(null);
                rootContainer = newroot;
            } else {
                // No subpath detected, just use course base container
                rootContainer = courseFolderContainer;
            }
            chosenFile = startURI;
        }

        // instanceof covers both "not found" (null) and "is a folder"
        VFSItem chosenItem = rootContainer.resolve(chosenFile);
        if (chosenItem instanceof VFSLeaf) {
            VFSLeaf leaf = (VFSLeaf) chosenItem;
            String filePath = getPathFor(leaf);
            // Use inherited method from LeafIndexer for the actual indexing of the content
            SearchResourceContext fileContext = new SearchResourceContext(courseNodeResourceContext);
            doIndexVFSLeafByMySelf(fileContext, leaf, indexWriter, filePath);
            if (!indexOnlyChosenFile) {
                if (log.isDebug()) log.debug("Index sub pages in SP.");
                Set<String> alreadyIndexFileNames = new HashSet<String>();
                alreadyIndexFileNames.add(chosenFile);
                // Check if page has links to subpages and index those as well
                indexSubPages(courseNodeResourceContext, rootContainer, indexWriter, leaf, alreadyIndexFileNames, 0, filePath);
            } else if (log.isDebug()) {
                log.debug("Index only chosen file in SP.");
            }
        } else if (log.isDebug()) {
            log.debug("Can not found choosen file in SP => Nothing indexed.");
        }
    }

    /**
     * @return the class name of the course node type handled by this indexer
     */
    @Override
    public String getSupportedTypeName() {
        return SUPPORTED_TYPE_NAME;
    }

    /**
     * Index the pages linked from the given leaf and recursively the pages
     * linked from those, as long as the sub-page level does not exceed
     * {@link #MAX_SUB_PAGE_LEVEL}.
     *
     * @param courseNodeResourceContext search context of the course node
     * @param rootContainer container the links are resolved against
     * @param indexWriter receives the created documents
     * @param leaf the page whose links are followed
     * @param alreadyIndexFileNames links that have been indexed already; indexed links are added
     * @param subPageLevel level of the given leaf, 0 for the configured page
     * @param rootFilePath path that is prepended to every link found in the leaf (if not empty)
     */
    private void indexSubPages(SearchResourceContext courseNodeResourceContext, VFSContainer rootContainer,
            OlatFullIndexer indexWriter, VFSLeaf leaf, Set<String> alreadyIndexFileNames, int subPageLevel,
            String rootFilePath) throws IOException, InterruptedException {
        // check deepness of recursion
        if (subPageLevel > MAX_SUB_PAGE_LEVEL) {
            if (log.isDebug())
                log.debug("Reach to many sub-page levels. Go not further with indexing sub-pages last leaf=" + leaf.getName());
            return;
        }
        int nextSubPageLevel = subPageLevel + 1;

        List<String> links = getLinkListFrom(leaf);
        for (String rawLink : links) {
            if (log.isDebug())
                log.debug("link=" + rawLink);
            String link = rawLink;
            if ((rootFilePath != null) && !rootFilePath.equals("")) {
                if (rootFilePath.endsWith("/")) {
                    link = rootFilePath + rawLink;
                } else {
                    link = rootFilePath + "/" + rawLink;
                }
            }
            if (alreadyIndexFileNames.contains(link)) {
                if (log.isDebug())
                    log.debug("sub-page already indexed, link=" + link);
                continue;
            }
            VFSItem item = rootContainer.resolve(link);
            if (item instanceof VFSLeaf) {
                VFSLeaf subPageLeaf = (VFSLeaf) item;
                if (log.isDebug())
                    log.debug("subPageLeaf=" + subPageLeaf);
                String filePath = getPathFor(subPageLeaf);
                doIndexVFSLeafByMySelf(courseNodeResourceContext, subPageLeaf, indexWriter, filePath);
                // Remember the link before descending so that cyclic links are not followed again
                alreadyIndexFileNames.add(link);
                indexSubPages(courseNodeResourceContext, rootContainer, indexWriter, subPageLeaf, alreadyIndexFileNames, nextSubPageLevel, filePath);
            } else {
                if (log.isDebug())
                    log.debug("Could not found sub-page for link=" + link);
            }
        }
    }

    /**
     * Read the given leaf and collect the links to sub-pages it contains.
     *
     * @param leaf the file to scan
     * @return the link targets found; empty if the suffix of the leaf is not one of {@link #HTML_SUFFIXES}
     */
    private List<String> getLinkListFrom(VFSLeaf leaf) {
        List<String> linkList = new ArrayList<String>();
        // only dive into file if it is a html file (exact suffix match)
        String suffix = getSuffix(leaf.getName());
        if (HTML_SUFFIXES.contains(suffix)) {
            // NOTE(review): the stream is not closed here; presumably FileUtils.load closes it - confirm
            BufferedInputStream bis = new BufferedInputStream(leaf.getInputStream());
            String inputString = FileUtils.load(bis, "utf-8");
            if (log.isDebug()) log.debug(inputString);
            extractSubpageLinks(inputString, linkList);
        }
        return linkList;
    }

    /**
     * Extract links to subpages from given page content. Only href attributes
     * accepted by {@link #HREF_PATTERN} are taken into account.
     *
     * @param pageContent HTML content; <code>null</code> is treated as a page without links
     * @param linkList found links are added to this list
     */
    public static void extractSubpageLinks(String pageContent, List<String> linkList) {
        if (pageContent == null) {
            return;
        }
        Matcher m = HREF_PATTERN.matcher(pageContent);
        while (m.find()) {
            // group 1 is the link target, e.g. 'seite2.html'
            linkList.add(m.group(1));
        }
    }

    /**
     * @param fileName a file name
     * @return the lower-cased part after the last dot, or an empty string if
     *         the name has no dot or ends with a dot
     */
    private static String getSuffix(String fileName) {
        int dotpos = fileName.lastIndexOf('.');
        if (dotpos < 0 || dotpos == fileName.length() - 1) {
            return "";
        }
        return fileName.substring(dotpos + 1).toLowerCase();
    }
}