/*
* LinksExtractor.java - extract [[in- and [[out-links]] from the wikipedia articles
* via regular expressions
* Copyright (c) 2005, 2006 Andrew Krizhanovsky /aka at mail.iias.spb.su/
* Distributed under GNU Public License.
*/
package wikipedia.sql.maintenance;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import wikipedia.sql.*;
import wikipedia.util.*;
// See docs at http://regex.info/java.html
// http://java.sun.com/j2se/1.4.2/docs/api/java/util/regex/package-summary.html
// http://java.sun.com/j2se/1.5.0/docs/api/java/util/regex/package-summary.html
//
/**
 * Extracts the targets of wiki-style links ({@code [[target]]} and
 * {@code [[target|label]]}) from article text using regular expressions.
 */
public class LinksOutExtractorText {
    /*
    public static Encodings encodings;
    public LinksOutExtractorText() {
    encodings = new Encodings();
    }
    */

    /**
     * Piped link: {@code [[target|label]]}; group 1 captures the text before
     * the first vertical bar, e.g. {@code [[artificial consciousness|machines]]}.
     */
    private static final Pattern PIPED_LINK =
            Pattern.compile("\\[\\[([^\\]\\|]+)\\|[^\\]]+\\]\\]");

    /**
     * Simple link: {@code [[target]]}, e.g. {@code [[mind]]} or {@code [[brain]]};
     * group 1 captures the whole content between the brackets.
     */
    private static final Pattern SIMPLE_LINK =
            Pattern.compile("\\[\\[([^\\]\\|]+)\\]\\]");

    /**
     * Collects the distinct values of capture group 1 for every match of
     * {@code str_pattern} in {@code text}, appended after the entries of
     * {@code append_result}.
     *
     * @param text          text to search; {@code null} is treated as containing no matches
     * @param str_pattern   regular expression with at least one capture group
     * @param append_result previously collected values to keep at the front of the
     *                      result, or {@code null}; its entries are copied as-is
     *                      (duplicates among them are not removed)
     * @return {@code append_result} itself (possibly {@code null}) when nothing
     *         matches; otherwise a new array holding the entries of
     *         {@code append_result} followed by each new, not yet present match
     *         in order of first occurrence
     * @throws PatternSyntaxException if {@code str_pattern} is not a valid regular expression
     */
    public String[] getLinksViaPattern(String text, String str_pattern, String[] append_result)
    {
        return collectMatches(text, Pattern.compile(str_pattern), append_result);
    }

    /**
     * Extracts link targets from the text.
     * <p>
     * Piped links ({@code [[target|label]]}) are collected first, then simple
     * links ({@code [[target]]}); a target found by both patterns, or several
     * times, is reported once.
     *
     * @param text text to search; {@code null} is treated as containing no links
     * @return the distinct link targets, or {@code null} when the text contains no links
     */
    public String[] getLinks(String text)
    {
        String[] piped = collectMatches(text, PIPED_LINK, null);
        return collectMatches(text, SIMPLE_LINK, piped);
    }

    /** Single-pass worker shared by the public methods; see {@link #getLinksViaPattern}. */
    private static String[] collectMatches(String text, Pattern pattern, String[] append_result)
    {
        if (text == null) {
            return append_result;
        }
        Matcher matcher = pattern.matcher(text);
        if (!matcher.find()) {
            // nothing matched: hand back the caller's array untouched
            return append_result;
        }

        List<String> links = new ArrayList<String>();
        Set<String> seen = new HashSet<String>();
        if (append_result != null) {
            // previous results are kept verbatim; they only seed the duplicate filter
            for (String previous : append_result) {
                links.add(previous);
                seen.add(previous);
            }
        }

        do {
            String match = matcher.group(1);
            // Set.add returns false when the value is already present
            if (seen.add(match)) {
                links.add(match);
            }
        } while (matcher.find());

        return links.toArray(new String[links.size()]);
    }
}