/*******************************************************************************
 * Copyright 2012 University of Southern California
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * 	http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * This code was developed by the Information Integration Group as part
 * of the Karma project at the Information Sciences Institute of the
 * University of Southern California.  For more information, publications,
 * and related projects, please see: http://www.isi.edu/integration
 ******************************************************************************/
package edu.isi.karma.cleaning;

import java.util.Vector;

/**
 * Finds, for (original, target) string pairs, the segments of the original
 * string that can produce the next part of the target string, and expands an
 * {@link ANode} with one child per combination of such segments.
 */
public class MultipleStringAlign {

	/**
	 * Creates an aligner. The argument is currently not used by this class.
	 *
	 * @param strs ignored
	 */
	public MultipleStringAlign(Vector<String[]> strs) {
	}

	/**
	 * Generates all possible next states for {@code a} and attaches the valid
	 * ones as its children.
	 *
	 * <p>For every example {@code a.exps.get(i)} the candidates are computed
	 * with {@link #findNext(String, String, int)} using {@code example[0]} as
	 * the original string, {@code example[1]} as the target string and
	 * {@code a.orgPos.get(i)} as the position. One candidate is then picked
	 * per example in every possible way, and each such combination becomes an
	 * {@code ANode} that is added to {@code a} when {@code isvalid()} holds.
	 * If there are no examples, or some example has no candidate, no child is
	 * added.
	 *
	 * @param a the node to expand
	 */
	public void generateChildren(ANode a) {
		Vector<Vector<int[]>> multiSeqMapping = new Vector<Vector<int[]>>();
		Vector<Long> indexes = new Vector<Long>();
		for (int i = 0; i < a.exps.size(); i++) {
			String[] example = a.exps.get(i);
			// NOTE(review): the value read from a.orgPos is passed to findNext
			// as the position inside the target string and is stored below as
			// the child's tarPos -- confirm against ANode that this is intended.
			int pos = a.orgPos.get(i);
			Vector<int[]> mappings = findNext(example[0], example[1], pos);
			// each entry: {tarpos, orgstartpos, orgendpos}
			Vector<int[]> nmapping = new Vector<int[]>();
			for (int[] elem : mappings) {
				nmapping.add(new int[] { pos, elem[0], elem[1] });
			}
			indexes.add((long) nmapping.size());
			multiSeqMapping.add(nmapping);
		}

		// generate combinations: one candidate index per example
		Vector<Vector<Integer>> configs = new Vector<Vector<Integer>>();
		getCrossIndex(indexes, configs);

		// use each combination to generate an ANode
		for (Vector<Integer> poses : configs) {
			Vector<Integer> orgPos = new Vector<Integer>();
			Vector<Integer> tarPos = new Vector<Integer>();
			Vector<Integer> length = new Vector<Integer>();
			for (int j = 0; j < poses.size(); j++) {
				int[] e = multiSeqMapping.get(j).get(poses.get(j));
				orgPos.add(e[1]);
				tarPos.add(e[0]);
				length.add(e[2] - e[1]);
			}
			ANode aNode = new ANode(orgPos, tarPos, length, a.exps);
			if (aNode.isvalid()) {
				a.addChild(aNode);
			}
		}
	}

	/**
	 * Iteratively enumerates every combination of indices, one index per
	 * dimension, and appends each combination to {@code configs}.
	 *
	 * <p>{@code indexs.get(d)} is the number of choices in dimension
	 * {@code d}; a combination holds a value in {@code [0, indexs.get(d))} at
	 * position {@code d}. Combinations are produced with the last dimension
	 * varying fastest. When {@code indexs} is empty or any dimension has no
	 * choices, there is nothing to combine and {@code configs} is left
	 * untouched.
	 *
	 * @param indexs  number of choices per dimension
	 * @param configs output collection receiving the combinations
	 */
	public void getCrossIndex(Vector<Long> indexs, Vector<Vector<Integer>> configs) {
		int k = indexs.size();
		if (k == 0) {
			return;
		}
		// A dimension without choices makes the whole cross product empty.
		// Without this guard the carry logic below would compare against -1
		// and either emit out-of-range indices or never terminate.
		for (long size : indexs) {
			if (size <= 0) {
				return;
			}
		}

		int[] com = new int[k];
		while (true) {
			Vector<Integer> res = new Vector<Integer>(k);
			for (int digit : com) {
				res.add(digit);
			}
			configs.add(res);

			// advance like an odometer: reset exhausted trailing digits and
			// carry into the first digit that can still grow
			int t = k - 1;
			while (t >= 0 && com[t] == indexs.get(t) - 1) {
				com[t] = 0;
				t--;
			}
			if (t < 0) {
				return; // every digit was at its maximum: enumeration complete
			}
			com[t]++;
		}
	}

	/**
	 * Finds the candidate segments for the part of {@code tar} that starts at
	 * {@code pos}.
	 *
	 * <p>The result is a list of two-element arrays:
	 * <ul>
	 * <li>{@code {-1, 0}} when {@code tar} is empty;</li>
	 * <li>nothing when {@code pos >= tar.length()};</li>
	 * <li>a single {@code {-1, n}} when the character at {@code pos} does not
	 * occur in {@code org}, where {@code n} is the length of the run of
	 * target characters starting at {@code pos} that do not occur in
	 * {@code org};</li>
	 * <li>otherwise {@code {orgStartPos, orgEndPos}} pairs (end exclusive)
	 * locating in {@code org} prefixes of {@code tar.substring(pos)}.</li>
	 * </ul>
	 *
	 * @param org the original string
	 * @param tar the target string
	 * @param pos start position inside {@code tar}
	 * @return the candidate segments, possibly empty
	 */
	public Vector<int[]> findNext(String org, String tar, int pos) {
		Vector<int[]> segs = new Vector<int[]>();
		if (tar.length() == 0) {
			segs.add(new int[] { -1, 0 });
			return segs;
		}
		if (pos >= tar.length()) {
			return segs;
		}

		// identify the const string: a run of characters absent from org
		if (org.indexOf(tar.charAt(pos)) == -1) {
			int cnt = pos + 1;
			while (cnt < tar.length() && org.indexOf(tar.charAt(cnt)) == -1) {
				cnt++;
			}
			segs.add(new int[] { -1, cnt - pos });
			return segs;
		}

		for (int i = pos; i < tar.length(); i++) {
			String tvec = tar.substring(pos, i + 1);
			Vector<Integer> mappings = findOccurrences(org, tvec);

			if (mappings.isEmpty()) {
				// a longer prefix cannot match either
				break;
			}
			if (mappings.size() > 1) {
				// ambiguous match: every occurrence is a candidate segment
				for (int start : mappings) {
					segs.add(new int[] { start, start + tvec.length() });
				}
				continue;
			}

			// exactly one occurrence
			int start = mappings.get(0);
			if (i >= tar.length() - 1) {
				segs.add(new int[] { start, start + tvec.length() });
				continue;
			}
			// create a segment here only when it cannot be extended by one
			// more token, or when that next token is ambiguous in org
			char nextToken = tar.charAt(i + 1);
			int p = org.indexOf(tvec + nextToken, 0);
			if (p == -1 || countOccurrences(org, nextToken) > 1) {
				segs.add(new int[] { start, start + tvec.length() });
			}
		}
		return segs;
	}

	/** Returns the start index of every (possibly overlapping) occurrence of {@code part} in {@code text}. */
	private static Vector<Integer> findOccurrences(String text, String part) {
		Vector<Integer> starts = new Vector<Integer>();
		for (int r = text.indexOf(part); r != -1; r = text.indexOf(part, r + 1)) {
			starts.add(r);
		}
		return starts;
	}

	/** Counts how many times {@code token} occurs in {@code text}. */
	private static int countOccurrences(String text, char token) {
		int count = 0;
		for (int r = text.indexOf(token); r != -1; r = text.indexOf(token, r + 1)) {
			count++;
		}
		return count;
	}

	/**
	 * Small demo: prints, for every start position in the target string, the
	 * substrings of the original string that {@link #findNext} proposes.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		Vector<String[]> strs = new Vector<String[]>();
		MultipleStringAlign msa = new MultipleStringAlign(strs);
		String org = "hello world";
		String tar = "world";
		for (int i = 0; i < tar.length(); i++) {
			System.out.println("starting pos: " + i);
			Vector<int[]> result = msa.findNext(org, tar, i);
			for (int[] seg : result) {
				if (seg[0] == -1) {
					// constant segment: seg[1] is a length in tar, not an
					// end offset in org
					System.out.println("" + tar.substring(i, i + seg[1]));
				} else {
					System.out.println("" + org.substring(seg[0], seg[1]));
				}
			}
		}
	}
}