compareFiles.cpp
Go to the documentation of this file.
1 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *****
2  ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *****
3 
4  trimAl v1.4: a tool for automated alignment trimming in large-scale
5  phylogenetics analyses.
6 
7  2009-2015 Capella-Gutierrez S. and Gabaldon, T.
8  [scapella, tgabaldon]@crg.es
9 
10  This file is part of trimAl.
11 
12  trimAl is free software: you can redistribute it and/or modify
13  it under the terms of the GNU General Public License as published by
14  the Free Software Foundation, the last available version.
15 
16  trimAl is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19  GNU General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with trimAl. If not, see <http://www.gnu.org/licenses/>.
23 
24 ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *****
25 ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
26 
27 #include "compareFiles.h"
28 #include "alignment.h"
29 
30 #define LONG 80
31 
32 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
33 /* This method compares a set of alignment in order to select the most
34  * consistent one respect of the other ones. To compute the consistency
35  * values we use the proportion of residue pairs per column in the aligs
36  * to compare */
37 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
38 int compareFiles::algorithm(alignment **vectAlignments, char **fileNames, float *columnsValue, int numAlignments, bool verbosity) {
39 
40  int *numResiduesAlig, *correspNames, *columnSeqMatrix, *columnSeqMatrixAux;
41  int i, j, k, l, m, numSeqs, pairRes, hits, alig = 0;
42  float max = 0, value = 0, **vectHits;
43  bool appearErrors = false;
44  string *names;
45 
46  /* ***** ***** ***** ***** ***** ***** ***** ***** */
47  /* Get some parameters from the alignment that has
48  * been selected */
49  numSeqs = vectAlignments[0] -> getNumSpecies();
50  /* ***** ***** ***** ***** ***** ***** ***** ***** */
51 
52  /* ***** ***** ***** ***** ***** ***** ***** ***** */
53  /* Allocate dinamic local memory */
54  names = new string[numSeqs];
55  correspNames = new int[numSeqs];
56  numResiduesAlig = new int[numAlignments];
57  columnSeqMatrix = new int[numSeqs];
58  vectHits = new float*[numAlignments];
59  columnSeqMatrixAux = new int[numSeqs];
60  /* ***** ***** ***** ***** ***** ***** ***** ***** */
61 
62  /* ***** ***** ***** ***** ***** ***** ***** ***** */
63  /* Check that all of alignment has the same number of
64  * sequence as well as there exists a correspondence
65  * between the names for each pars of aligs. */
66  for(i = 1; i < numAlignments; i++) {
67  /* ***** ***** ***** ***** ***** ***** ***** ***** */
68  if(numSeqs != vectAlignments[i] -> getNumSpecies()) {
69  cerr << endl << "ERROR: The files to compare do not have "
70  << "the same number of sequences" << endl << endl;
71  appearErrors = true;
72  break;
73  }
74  /* ***** ***** ***** ***** ***** ***** ***** ***** */
75 
76  /* ***** ***** ***** ***** ***** ***** ***** ***** */
77  vectAlignments[i] -> getSequences(names);
78  if(!vectAlignments[0] -> getSeqNameOrder(names, correspNames)) {
79  cerr << endl << "ERROR: The files to compare do not"
80  << " have the sequence names" << endl << endl;
81  appearErrors = true;
82  break;
83  }
84  /* ***** ***** ***** ***** ***** ***** ***** ***** */
85  }
86 
87  /* ***** ***** ***** ***** ***** ***** ***** ***** */
88  /* Changes the order in sequences number matrix
89  * according to the order in the selected alignment */
90  for(i = 1; ((i < numAlignments) && (!appearErrors)); i++) {
91  vectAlignments[i] -> getSequences(names);
92  vectAlignments[0] -> getSeqNameOrder(names, correspNames);
93  vectAlignments[i] -> setSeqMatrixOrder(correspNames);
94  }
95  /* ***** ***** ***** ***** ***** ***** ***** ***** */
96 
97  /* ***** ***** ***** ***** ***** ***** ***** ***** */
98  /* Get back the residues number for each alignment */
99  for(i = 0; ((i < numAlignments) && (!appearErrors)); i++)
100  numResiduesAlig[i] = vectAlignments[i] -> getNumAminos();
101  /* ***** ***** ***** ***** ***** ***** ***** ***** */
102 
103  /* ***** ***** ***** ***** ***** ***** ***** ***** */
104  /* Start the comparison among the alignments */
105  for(i = 0; ((i < numAlignments) && (!appearErrors)); i++, value = 0) {
106 
107  /* ***** ***** ***** ***** ***** ***** ***** ***** */
108  /* If it's necessary, we print some information */
109  if(verbosity)
110  cout << endl;
111  /* ***** ***** ***** ***** ***** ***** ***** ***** */
112 
113  /* ***** ***** ***** ***** ***** ***** ***** ***** */
114  /* Initialize the hits vector for each alignment */
115  vectHits[i] = new float[numResiduesAlig[i]];
116  utils::initlVect(vectHits[i], numResiduesAlig[i], 0);
117  /* ***** ***** ***** ***** ***** ***** ***** ***** */
118 
119  /* ***** ***** ***** ***** ***** ***** ***** ***** */
120  for(j = 0, pairRes = 0, hits = 0; j < numResiduesAlig[i]; j++, pairRes = 0, hits = 0) {
121 
122  /* ***** ***** ***** ***** ***** ***** ***** ***** */
123  /* Get back each column from the current selected
124  * alignment */
125  vectAlignments[i] -> getColumnSeqMatrix(j, columnSeqMatrix);
126  /* ***** ***** ***** ***** ***** ***** ***** ***** */
127 
128  /* ***** ***** ***** ***** ***** ***** ***** ***** */
129  /* For each position from the previous recovered
130  * columns, we carry out the analysis to detect the
131  * same residues pair in the rest of the alignmetns */
132  for(k = 0; k < numSeqs; k++) {
133 
134  /* ***** ***** ***** ***** ***** ***** ***** ***** */
135  /* If there is a valid residue, we go ahead with
136  * the analysis */
137  if(columnSeqMatrix[k] != 0) {
138  /* ***** ***** ***** ***** ***** ***** ***** ***** */
139  for(l = 0; l < i; l++) {
140  /* Recover the residue pairs from the others aligs */
141  vectAlignments[l] -> getColumnSeqMatrix(columnSeqMatrix[k], k, columnSeqMatrixAux);
142  /* ***** ***** ***** ***** ***** ***** ***** ***** */
143  /* and count the similar residue pairs */
144  for(m = k + 1; m < numSeqs; m++)
145  if(columnSeqMatrix[m] != 0) {
146  if(columnSeqMatrix[m] == columnSeqMatrixAux[m])
147  hits++;
148  pairRes++;
149  }
150  }
151  /* ***** ***** ***** ***** ***** ***** ***** ***** */
152 
153  /* ***** ***** ***** ***** ***** ***** ***** ***** */
154  for(l = i + 1; l < numAlignments; l++) {
155  /* Recover the residue pairs from the others aligs */
156  vectAlignments[l] -> getColumnSeqMatrix(columnSeqMatrix[k], k, columnSeqMatrixAux);
157  /* ***** ***** ***** ***** ***** ***** ***** ***** */
158  /* and count the similar residue pairs */
159  for(m = k + 1; m < numSeqs; m++)
160  if(columnSeqMatrix[m] != 0) {
161  if(columnSeqMatrix[m] == columnSeqMatrixAux[m])
162  hits++;
163  pairRes++;
164  }
165  }
166  /* ***** ***** ***** ***** ***** ***** ***** ***** */
167  }
168  }
169  /* ***** ***** ***** ***** ***** ***** ***** ***** */
170 
171  /* ***** ***** ***** ***** ***** ***** ***** ***** */
172  /* For each column, compute the hits proportion for
173  * every residue pair against the rest of alignments */
174  if(pairRes != 0) {
175  vectHits[i][j] += ((1.0 * hits)/pairRes);
176  value += vectHits[i][j];
177  }
178  /* ***** ***** ***** ***** ***** ***** ***** ***** */
179  }
180 
181  /* ***** ***** ***** ***** ***** ***** ***** ***** */
182  /* The method can offer some information about the
183  * comparison progression */
184  if(verbosity)
185  cout << "File:\t\t" << fileNames[i] << endl << "Values:\t\tSequences: " << numSeqs
186  << "\tResidues: " << numResiduesAlig[i] << "\tPond. Hits: " << setw(8)
187  << value << "\t%Consistency: " << value/numResiduesAlig[i] << endl;
188  /* ***** ***** ***** ***** ***** ***** ***** ***** */
189 
190  /* ***** ***** ***** ***** ***** ***** ***** ***** */
191  /* Keep the alignment with higher consistency value */
192  if((value/numResiduesAlig[i]) > max) {
193  alig = i;
194  max = value/numResiduesAlig[i];
195  }
196  /* ***** ***** ***** ***** ***** ***** ***** ***** */
197  }
198 
199  /* ***** ***** ***** ***** ***** ***** ***** ***** */
200  /* Prints the alignment that have been selected */
201  if((verbosity) && (!appearErrors)) {
202  cout << "\t\t\t\t\t--------------" << endl;
203  cout << endl << "File Selected:\t" << fileNames[alig] << endl << "Value:\t\t" << max << endl << endl;
204  }
205  /* ***** ***** ***** ***** ***** ***** ***** ***** */
206 
207  /* ***** ***** ***** ***** ***** ***** ***** ***** */
208  /* The method returns a vector with the consistency
209  * value for each column in the selected alignment */
210  if((columnsValue != NULL) && (!appearErrors)) {
211  utils::initlVect(columnsValue, numResiduesAlig[alig], -1);
212  for(i = 0; i < numResiduesAlig[alig]; i++)
213  columnsValue[i] = vectHits[alig][i];
214  }
215  /* ***** ***** ***** ***** ***** ***** ***** ***** */
216 
217  /* ***** ***** ***** ***** ***** ***** ***** ***** */
218  /* Deallocate memmory */
219  for(i = 0; ((i < numAlignments) && (!appearErrors)); i++)
220  delete [] vectHits[i];
221  delete [] vectHits;
222 
223  delete [] names;
224  delete [] correspNames;
225  delete [] numResiduesAlig;
226  delete [] columnSeqMatrix;
227  delete [] columnSeqMatrixAux;
228  /* ***** ***** ***** ***** ***** ***** ***** ***** */
229 
230  /* ***** ***** ***** ***** ***** ***** ***** ***** */
231  /* Return the selected alignment index or an error
232  * flag otherwise */
233  if(appearErrors) return -1;
234  else return alig;
235  /* ***** ***** ***** ***** ***** ***** ***** ***** */
236 }
237 
238 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
239 /* This method returns the consistency value vector for a given alignment
240  * against a set of alignments with the same sequences */
241 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
242 bool compareFiles::forceComparison(alignment **vectAlignments, int numAlignments, alignment *selected, float *columnsValue) {
243 
244  int *correspNames, *columnSeqMatrix, *columnSeqMatrixAux;
245  int i, j, k, ll, numResidues, numSeqs, pairRes, hit;
246  bool appearErrors = false;
247  string *names;
248 
249  /* ***** ***** ***** ***** ***** ***** ***** ***** */
250  /* Get some parameters from the alignment that has
251  * been selected */
252  numResidues = selected -> getNumAminos();
253  numSeqs = selected -> getNumSpecies();
254  /* ***** ***** ***** ***** ***** ***** ***** ***** */
255 
256  /* ***** ***** ***** ***** ***** ***** ***** ***** */
257  /* Initialize the vector where we are going to store
258  * the proportion of hits for each column in the
259  * selected alignment */
260  utils::initlVect(columnsValue, numResidues, 0);
261  /* ***** ***** ***** ***** ***** ***** ***** ***** */
262 
263  /* ***** ***** ***** ***** ***** ***** ***** ***** */
264  /* Allocate dinamic local memory */
265  names = new string[numSeqs];
266  correspNames = new int[numSeqs];
267  columnSeqMatrix = new int[numSeqs];
268  columnSeqMatrixAux = new int[numSeqs];
269  /* ***** ***** ***** ***** ***** ***** ***** ***** */
270 
271  /* ***** ***** ***** ***** ***** ***** ***** ***** */
272  /* Check that all of alignment has the same number of
273  * sequence as well as there exists a correspondence
274  * between the names for each pars of aligs. */
275  for(i = 0; i < numAlignments; i++) {
276  /* ***** ***** ***** ***** ***** ***** ***** ***** */
277  if(numSeqs != vectAlignments[i] -> getNumSpecies()) {
278  cerr << endl << "ERROR: The files to compare do not have "
279  << "the same number of sequences" << endl << endl;
280  appearErrors = true;
281  break;
282  }
283  /* ***** ***** ***** ***** ***** ***** ***** ***** */
284 
285  /* ***** ***** ***** ***** ***** ***** ***** ***** */
286  vectAlignments[i] -> getSequences(names);
287  if(!selected -> getSeqNameOrder(names, correspNames)) {
288  cerr << endl << "ERROR: The files to compare do not"
289  << " have the sequence names" << endl << endl;
290  appearErrors = true;
291  break;
292  }
293  /* ***** ***** ***** ***** ***** ***** ***** ***** */
294  }
295  /* ***** ***** ***** ***** ***** ***** ***** ***** */
296 
297  /* ***** ***** ***** ***** ***** ***** ***** ***** */
298  /* Changes the order in sequences number matrix
299  * according to the order in the selected alignment */
300  for(i = 0; i < numAlignments; i++) {
301  vectAlignments[i] -> getSequences(names);
302  selected -> getSeqNameOrder(names, correspNames);
303  vectAlignments[i] -> setSeqMatrixOrder(correspNames);
304  }
305  /* ***** ***** ***** ***** ***** ***** ***** ***** */
306 
307  /* ***** ***** ***** ***** ***** ***** ***** ***** */
308  /* Do the same analysis for each column */
309  for(i = 0, pairRes = 0, hit = 0; ((i < numResidues) && (!appearErrors)); i++, pairRes = 0, hit = 0) {
310 
311  /* ***** ***** ***** ***** ***** ***** ***** ***** */
312  /* We get back the sequence position for each residue
313  * from every column in the selected alignment */
314  utils::initlVect(columnSeqMatrix, numSeqs, 0);
315  selected -> getColumnSeqMatrix(i, columnSeqMatrix);
316  /* ***** ***** ***** ***** ***** ***** ***** ***** */
317 
318  /* ***** ***** ***** ***** ***** ***** ***** ***** */
319  /* For each residue pairs, we look for it in the rest
320  * of alignments */
321  for(j = 0; j < numSeqs; j++) {
322 
323  /* ***** ***** ***** ***** ***** ***** ***** ***** */
324  /* If there is a residue, not a gap, we carry out the
325  * assesment */
326  if(columnSeqMatrix[j] != 0) {
327  for(k = 0; k < numAlignments; k++) {
328 
329  /* ***** ***** ***** ***** ***** ***** ***** ***** */
330  /* We look for the same residue in the same row in
331  * the rest of alignments */
332  utils::initlVect(columnSeqMatrixAux, numSeqs, 0);
333  vectAlignments[k] -> getColumnSeqMatrix(columnSeqMatrix[j], j, columnSeqMatrixAux);
334  /* ***** ***** ***** ***** ***** ***** ***** ***** */
335 
336  /* ***** ***** ***** ***** ***** ***** ***** ***** */
337  /* We count when we get the same residue pairs in the
338  * rest of alignments */
339  for(ll = j + 1; ll < numSeqs; ll++)
340  if(columnSeqMatrix[ll] != 0) {
341  if(columnSeqMatrix[ll] == columnSeqMatrixAux[ll])
342  hit++;
343  pairRes++;
344  }
345  /* ***** ***** ***** ***** ***** ***** ***** ***** */
346  }
347  /* ***** ***** ***** ***** ***** ***** ***** ***** */
348  }
349  /* ***** ***** ***** ***** ***** ***** ***** ***** */
350  }
351  /* ***** ***** ***** ***** ***** ***** ***** ***** */
352  /* Store the hits proportion for each column */
353  if(pairRes != 0) columnsValue[i] += ((1.0 * hit)/pairRes);
354  /* ***** ***** ***** ***** ***** ***** ***** ***** */
355  }
356  /* ***** ***** ***** ***** ***** ***** ***** ***** */
357 
358  /* ***** ***** ***** ***** ***** ***** ***** ***** */
359  /* Deallocate dinamic memory */
360  delete [] names;
361  delete [] correspNames;
362  delete [] columnSeqMatrix;
363  delete [] columnSeqMatrixAux;
364  /* ***** ***** ***** ***** ***** ***** ***** ***** */
365 
366  /* ***** ***** ***** ***** ***** ***** ***** ***** */
367  /* If it was fine, return true. Otherwise, return
368  * false */
369  if(appearErrors) return false;
370  else return true;
371  /* ***** ***** ***** ***** ***** ***** ***** ***** */
372 }
373 
374 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
375 /* This method applies a specific windows size to a selected alignment */
376 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
377 bool compareFiles::applyWindow(int columns, int halfWindow, float *columnsValue) {
378 
379  int i, j, window;
380  float *vectAux;
381 
382  /* ***** ***** ***** ***** ***** ***** ***** ***** */
383  /* If windows size is greater than 1/4 of alignment
384  *length, trimAl rejects this windows size */
385  if(halfWindow > columns/4) return false;
386  else window = (2 * halfWindow + 1);
387  /* ***** ***** ***** ***** ***** ***** ***** ***** */
388 
389  /* ***** ***** ***** ***** ***** ***** ***** ***** */
390  /* Allocate local memory. Copy the array values to
391  * auxiliar memory. */
392  vectAux = new float[columns];
393  utils::copyVect(columnsValue, vectAux, columns);
394  /* ***** ***** ***** ***** ***** ***** ***** ***** */
395 
396  /* ***** ***** ***** ***** ***** ***** ***** ***** */
397  /* For each column from the selected alignment,
398  * compute the average for its consistency values */
399  for(i = 0; i < columns; i++) {
400  /* ***** ***** ***** ***** ***** ***** ***** ***** */
401  /* This average is computed from halfWindow positions
402  * before to halfWindow positions after */
403  for(j = i - halfWindow, columnsValue[i] = 0; j <= i + halfWindow; j++) {
404  if(j < 0) columnsValue[i] += vectAux[-j];
405  else if(j >= columns)
406  columnsValue[i] += vectAux[((2 * columns - j) - 2)];
407  else columnsValue[i] += vectAux[j];
408  }
409  /* ***** ***** ***** ***** ***** ***** ***** ***** */
410 
411  /* ***** ***** ***** ***** ***** ***** ***** ***** */
412  /* Finally, the column value is divided by the window
413  * size in order to compute the average score. */
414  columnsValue[i] /= window;
415  }
416  /* ***** ***** ***** ***** ***** ***** ***** ***** */
417 
418  /* ***** ***** ***** ***** ***** ***** ***** ***** */
419  /* Deallocate dinamic memory */
420  delete [] vectAux;
421  /* ***** ***** ***** ***** ***** ***** ***** ***** */
422 
423  /* ***** ***** ***** ***** ***** ***** ***** ***** */
424  /* If everything is OK, return true */
425  return true;
426  /* ***** ***** ***** ***** ***** ***** ***** ***** */
427 }
428 
429 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
430 /* Print the consistency value for each column from the selected alignment */
431 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
432 void compareFiles::printStatisticsFileColumns(int numAminos, float *compareVect) {
433 
434  /* ***** ***** ***** ***** ***** ***** ***** ***** */
435  /* Prepare the header information */
436  cout << "| Residue\tConsistency |" << endl;
437  cout << "| Number \t Value |" << endl;
438  cout << "+---------------------------+" << endl;
439  cout.precision(10);
440  /* ***** ***** ***** ***** ***** ***** ***** ***** */
441 
442  /* ***** ***** ***** ***** ***** ***** ***** ***** */
443  /* Print the consistency values for each column from
444  * the selected alignment */
445  for(int i = 0; i < numAminos; i++)
446  cout << " " << setw(5) << i + 1 << "\t"
447  << "\t" << compareVect[i] << endl;
448  /* ***** ***** ***** ***** ***** ***** ***** ***** */
449 }
450 
451 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
452 /* Print the consistency values accumulative distribution for the selected
453  * alignment */
454 /* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
455 void compareFiles::printStatisticsFileAcl(int numAminos, float *compareVect) {
456 
457  float refer, *vectAux;
458  int i, num;
459 
460  /* ***** ***** ***** ***** ***** ***** ***** ***** */
461  /* Allocate dinamic memory to copy the input vector
462  * and sort it */
463  vectAux = new float[numAminos];
464  utils::copyVect(compareVect, vectAux, numAminos);
465  utils::quicksort(vectAux, 0, numAminos-1);
466  /* ***** ***** ***** ***** ***** ***** ***** ***** */
467 
468  /* ***** ***** ***** ***** ***** ***** ***** ***** */
469  /* Set the output precision and print the header */
470  cout << "| Number of\t \t|\t Cumulative \t% "
471  << "Cumulative\t| Consistency |" << endl;
472  cout << "| Residues \t% Length\t|\tNumberResid.\t "
473  << "Length \t| Value |" << endl;
474  cout << "+-------------------------------+------------"
475  << "---------------------------+-----------------+"
476  << endl;
477  cout.precision(10);
478  /* ***** ***** ***** ***** ***** ***** ***** ***** */
479 
480  /* ***** ***** ***** ***** ***** ***** ***** ***** */
481  /* Fix the initial values to count how many columns
482  * has the same consistency value */
483  refer = vectAux[0];
484  num = 1;
485  /* ***** ***** ***** ***** ***** ***** ***** ***** */
486 
487  /* ***** ***** ***** ***** ***** ***** ***** ***** */
488  /* Print the accumulative distribution */
489  for(i = 1; i < numAminos; i++) {
490  /* ***** ***** ***** ***** ***** ***** ***** ***** */
491  /* When the method detects a new consistency value
492  * print the previous value as well as its frequency
493  * and starts to count how many columns are for this
494  * new value */
495  if(refer != vectAux[i]) {
496  cout << " " << num << "\t\t" << setw(10) << ((float) num/numAminos * 100.0)
497  << "\t\t" << i << "\t\t" << setw(10) << ((float) i/numAminos * 100.0)
498  << "\t" << setw(15) << refer << endl;
499  refer = vectAux[i];
500  num = 1;
501  }
502  else num++;
503  /* ***** ***** ***** ***** ***** ***** ***** ***** */
504  }
505 
506  /* ***** ***** ***** ***** ***** ***** ***** ***** */
507  /* Print the last consistency value as well as its
508  * frequency */
509  cout << " " << num << "\t\t" << setw(10) << ((float) num/numAminos * 100.0)
510  << "\t\t" << i << "\t\t" << setw(10) << ((float) i/numAminos * 100.0)
511  << "\t" << setw(15) << refer << endl;
512  /* ***** ***** ***** ***** ***** ***** ***** ***** */
513 
514  /* ***** ***** ***** ***** ***** ***** ***** ***** */
515  /* Deallocate dinamic memory */
516  delete [] vectAux;
517  /* ***** ***** ***** ***** ***** ***** ***** ***** */
518 }