001 /*
002 * CrochemoreLandauZivUkelsonGlobalAlignment.java
003 *
004 * Copyright 2003 Sergio Anibal de Carvalho Junior
005 *
006 * This file is part of NeoBio.
007 *
008 * NeoBio is free software; you can redistribute it and/or modify it under the terms of
009 * the GNU General Public License as published by the Free Software Foundation; either
010 * version 2 of the License, or (at your option) any later version.
011 *
012 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
013 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
014 * PURPOSE. See the GNU General Public License for more details.
015 *
016 * You should have received a copy of the GNU General Public License along with NeoBio;
017 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
018 * Boston, MA 02111-1307, USA.
019 *
020 * Proper attribution of the author as the source of the software would be appreciated.
021 *
022 * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
023 * Department of Computer Science http://www.dcs.kcl.ac.uk
024 * King's College London, UK http://www.kcl.ac.uk
025 *
026 * Please visit http://neobio.sourceforge.net
027 *
028 * This project was supervised by Professor Maxime Crochemore.
029 *
030 */
031
032 package neobio.alignment;
033
034 /**
035 * This class implements the <B>global</B> pairwise sequence alignment algorithm (with
036 * linear gap penalty function) due to Maxime Crochemore, Gad Landau and Michal
037 * Ziv-Ukelson (2002).
038 *
039 * <P>This implementation derives from the paper of M.Crochemore, G.Landau and
040 * M.Ziv-Ukelson, <I>A Sub-quadratic Sequence Alignment Algorithm for Unrestricted Scoring
041 * Matrices</I> (available here as
042 * <A HREF="doc-files/Crochemore_Landau_Ziv-Ukelson_algorithm.pdf">PDF</A> or
043 * <A HREF="doc-files/Crochemore_Landau_Ziv-Ukelson_algorithm.pdf">Postscript</A>).</P>
044 *
045 * <P>For a general description of the algorithm, please refer to the specification of the
046 * abstract {@linkplain CrochemoreLandauZivUkelson} superclass.</P>
047 *
048 * <P>This class consist mainly of methods that:</P>
049 *
050 * <LU>
051 * <LI>create and compute all information of a block (see {@link #createBlock createBlock}
052 * and its variants);
053 * <LI>compute the output border of a block (see {@link #computeOutputBorder
054 * computeOutputBorder};
055 * <LI>locate the score of a high scoring global alignment in the block table (see {@link
056 * #locateScore locateScore};
057 * <LI>build an optimal global alignment from the information stored in the block table
058 * (see {@link #buildOptimalAlignment buildOptimalAlignment}.
059 * </LU>
060 *
061 * @see CrochemoreLandauZivUkelson
062 * @see CrochemoreLandauZivUkelsonLocalAlignment
063 * @author Sergio A. de Carvalho Jr.
064 */
065 public class CrochemoreLandauZivUkelsonGlobalAlignment extends CrochemoreLandauZivUkelson
066 {
067 /**
068 * Creates and computes all information of an alignment block. Its main job is to
069 * compute the DIST column for the block. It then request the
070 * <CODE>computeOutputBorder</CODE> method to compute the block's output border.
071 *
072 * @param factor1 factor of the first sequence
073 * @param factor2 factor of the second sequence
074 * @param row row index of the block in the block table
075 * @param col column index of the block in the block table
076 * @return the computed block
077 * @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible
078 * with the sequences being aligned
079 */
080 protected AlignmentBlock createBlock (Factor factor1, Factor factor2, int row,
081 int col) throws IncompatibleScoringSchemeException
082 {
083 AlignmentBlock block, left_prefix, diag_prefix, top_prefix;
084 int size, lr, lc, score_ins, score_sub, score_del, ins, del, sub, max;
085
086 lr = factor1.length();
087 lc = factor2.length();
088 size = lr + lc + 1;
089
090 block = new AlignmentBlock (factor1, factor2, size);
091
092 // set up pointers to prefixes
093 left_prefix = getLeftPrefix (block);
094 diag_prefix = getDiagonalPrefix (block);
095 top_prefix = getTopPrefix (block);
096
097 // compute scores
098 score_ins = scoreInsertion (factor2.getNewChar());
099 score_sub = scoreSubstitution (factor1.getNewChar(), factor2.getNewChar());
100 score_del = scoreDeletion (factor1.getNewChar());
101
102 // compute dist column and direction
103 for (int i = 0; i < size; i++)
104 {
105 // compute optimal path to
106 // input border's ith position
107
108 ins = sub = del = Integer.MIN_VALUE;
109
110 if (i < size - 1)
111 ins = left_prefix.dist_column[i] + score_ins;
112
113 if ((i > 0) && (i < size - 1))
114 sub = diag_prefix.dist_column[i - 1] + score_sub;
115
116 if (i > 0)
117 del = top_prefix.dist_column[i - 1] + score_del;
118
119 block.dist_column[i] = max = max (ins, sub, del);
120
121 // record the direction to of the optimal
122 // path to input border's ith position
123 if (max == ins)
124 block.direction[i] = LEFT_DIRECTION;
125 else if (max == sub)
126 block.direction[i] = DIAGONAL_DIRECTION;
127 else
128 block.direction[i] = TOP_DIRECTION;
129 }
130
131 computeOutputBorder (block, row, col, size, lc, lr);
132
133 return block;
134 }
135
136 /**
137 * Creates the root block. This is a special case of the <CODE>createBlock</CODE>
138 * method. No information is actually computed.
139 *
140 * @param factor1 factor of the first sequence
141 * @param factor2 factor of the second sequence
142 * @return the root block
143 */
144 protected AlignmentBlock createRootBlock (Factor factor1, Factor factor2)
145 {
146 return new AlignmentBlock (factor1, factor2);
147 }
148
149 /**
150 * Creates and computes all information of an alignment block of the first row of the
151 * block table. This is a special case of the <CODE>createBlock</CODE> method.
152 *
153 * @param factor1 factor of the first sequence
154 * @param factor2 factor of the second sequence
155 * @param col column index of the block in the block table
156 * @return the computed block
157 * @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible
158 * with the sequences being aligned
159 * @see #createBlock createBlock
160 */
161 protected AlignmentBlock createFirstRowBlock (Factor factor1, Factor factor2, int col)
162 throws IncompatibleScoringSchemeException
163 {
164 AlignmentBlock block, left_prefix;
165 int size, lr, lc, score_ins;
166
167 lr = 0; // factor1.length();
168 lc = factor2.length();
169 size = lr + lc + 1;
170
171 block = new AlignmentBlock (factor1, factor2, size);
172
173 // set up pointer to left prefix
174 left_prefix = getLeftPrefix (block);
175
176 // compute insertion's score
177 score_ins = scoreInsertion (factor2.getNewChar());
178
179 // compute dist column and direction
180 for (int i = 0; i < lc; i++)
181 {
182 block.dist_column[i] = left_prefix.dist_column[i] + score_ins;
183 block.direction[i] = LEFT_DIRECTION;
184 }
185
186 // last position
187 block.dist_column[lc] = 0;
188 block.direction[lc] = STOP_DIRECTION;
189
190 computeOutputBorder (block, 0, col, size, lc, lr);
191
192 return block;
193 }
194
195 /**
196 * Creates and computes all information of an alignment block of the first column of
197 * the block table. This is a special case of the <CODE>createBlock</CODE> method.
198 *
199 * @param factor1 factor of the first sequence
200 * @param factor2 factor of the second sequence
201 * @param row row index of the block in the block table
202 * @return the computed block
203 * @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible
204 * with the sequences being aligned
205 * @see #createBlock createBlock
206 */
207 protected AlignmentBlock createFirstColumnBlock (Factor factor1, Factor factor2,
208 int row) throws IncompatibleScoringSchemeException
209 {
210 AlignmentBlock block, top_prefix;
211 int size, lr, lc, score_del;
212
213 lr = factor1.length();
214 lc = 0; // factor2.length();
215 size = lr + lc + 1;
216
217 block = new AlignmentBlock (factor1, factor2, size);
218
219 // set up pointer to top prefix
220 top_prefix = getTopPrefix (block);
221
222 // compute deletion's score
223 score_del = scoreDeletion (factor1.getNewChar());
224
225 // first position
226 block.dist_column[0] = 0;
227 block.direction[0] = STOP_DIRECTION;
228
229 // compute dist column and direction
230 for (int i = 1; i < size; i++)
231 {
232 block.dist_column[i] = top_prefix.dist_column[i - 1] + score_del;
233 block.direction[i] = TOP_DIRECTION;
234 }
235
236 computeOutputBorder (block, row, 0, size, lc, lr);
237
238 return block;
239 }
240
241 /**
242 * Computes the output border of a block. This is performed in five steps:
243 *
244 * <LU>
245 * <LI>Retrieve the block's input border;
246 * <LI>Retrieve the block's complete DIST matrix;
247 * <LI>Create an interface to the {@linkplain OutMatrix OUT} matrix from the input
248 * border and DIST matrix;
249 * <LI>Use {@linkplain Smawk SMAWK} to compute all column maxima of the OUT matrix
250 * (SMAWK finds the index of the row that contains the maximum value of a column);
251 * <LI>Assemble the output border by extracting the maximum values of each column of
252 * the OUT matrix using the information obtained in the previous step.
253 * </LU>
254 *
255 * @param block the block for which the output border is to be computed
256 * @param row row index of the block in the block table
257 * @param col column index of the block in the block table
258 * @param dim dimension of the output border
259 * @param lc number of columns of the block
260 * @param lr number of row of the block
261 */
262 protected void computeOutputBorder (AlignmentBlock block, int row, int col, int dim,
263 int lc, int lr)
264 {
265 int[] input = assembleInputBorder (dim, row, col, lr);
266
267 int[][] dist = assembleDistMatrix (block, dim, row, col, lc);
268
269 // update the interface to the OUT matrix
270 out_matrix.setData (dist, input, dim, lc);
271
272 // compute source_path using Smawk
273 smawk.computeColumnMaxima (out_matrix, block.source_path);
274
275 // update output border
276 for (int i = 0; i < dim; i++)
277 block.output_border[i] = out_matrix.valueAt(block.source_path[i], i);
278 }
279
280 /**
281 * Builds an optimal global alignment between the loaded sequences after the block
282 * table has been computed. This method traces a path back in the block table, from
283 * the last block to the first.
284 *
285 * @return an optimal global alignment
286 * @throws IncompatibleScoringSchemeException If the scoring scheme is not compatible
287 * with the loaded sequences.
288 * @see CrochemoreLandauZivUkelson#traverseBlock
289 */
290 protected PairwiseAlignment buildOptimalAlignment ()
291 throws IncompatibleScoringSchemeException
292 {
293 AlignmentBlock block, ancestor;
294 StringBuffer gapped_seq1, tag_line, gapped_seq2;
295 int source, dest, ancestor_source;
296 int row, col;
297
298 gapped_seq1 = new StringBuffer();
299 tag_line = new StringBuffer();
300 gapped_seq2 = new StringBuffer();
301
302 // start at the last row, last column of block table
303 row = num_rows - 1; col = num_cols - 1;
304 block = block_table[row][col];
305 dest = block.factor2.length();
306
307 while (row > 0 || col > 0)
308 {
309 block = block_table[row][col];
310 source = block.source_path[dest];
311 ancestor = block.ancestor[dest];
312
313 ancestor_source = source;
314 if (dest > block.factor2.length())
315 ancestor_source -= (block.factor1.length() - ancestor.factor1.length());
316
317 traverseBlock (ancestor, ancestor_source, gapped_seq1, tag_line, gapped_seq2);
318
319 if (row == 0)
320 {
321 col = col - 1;
322 dest = block_table[row][col].factor2.length();
323 }
324 else if (col == 0)
325 {
326 row = row - 1;
327 dest = 0;
328 }
329 else
330 {
331 if (source < block.factor1.length())
332 {
333 col = col - 1;
334 dest = block_table[row][col].factor2.length() + source;
335 }
336 else if (source == block.factor1.length())
337 {
338 row = row - 1; col = col - 1;
339 dest = block_table[row][col].factor2.length();
340 }
341 else
342 {
343 row = row - 1;
344 dest = source - block.factor1.length();
345 }
346 }
347 }
348
349 return new PairwiseAlignment (gapped_seq1.toString(), tag_line.toString(),
350 gapped_seq2.toString(), locateScore());
351 }
352
353 /**
354 * Locate the score of the highest scoring global alignment in the block table. This
355 * value is found in the output border of the last block (last row, last column).
356 *
357 * @return the score of the highest scoring global alignment
358 */
359 protected int locateScore ()
360 {
361 AlignmentBlock last_block = block_table[num_rows - 1][num_cols - 1];
362
363 return last_block.output_border[last_block.factor2.length()];
364 }
365 }