similarityMatrix.cpp
Go to the documentation of this file.
1 /* *****************************************************************************
2 
3  trimAl v2.0: a tool for automated alignment trimming in large-scale
4  phylogenetics analyses.
5 
6  readAl v2.0: a tool for automated alignment conversion among different
7  formats.
8 
9  2009-2019
10  Fernandez-Rodriguez V. (victor.fernandez@bsc.es)
11  Capella-Gutierrez S. (salvador.capella@bsc.es)
12  Gabaldon, T. (tgabaldon@crg.es)
13 
14  This file is part of trimAl/readAl.
15 
16  trimAl/readAl are free software: you can redistribute it and/or modify
17  it under the terms of the GNU General Public License as published by
18  the Free Software Foundation, the last available version.
19 
20  trimAl/readAl are distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23  GNU General Public License for more details.
24 
25  You should have received a copy of the GNU General Public License
26  along with trimAl/readAl. If not, see <http://www.gnu.org/licenses/>.
27 
28 ***************************************************************************** */
29 
30 #ifndef SIMMatrix
31 #define SIMMatrix
32 
33 #define NUMAMINOS 20
34 #define TAMABC 28
35 #define LINE_LENGTH 256
36 #define REFER 65
37 
38 #include "defines.h"
39 
40 #endif
41 
43 #include "Statistics/similarityMatrix.h"
44 #include "reportsystem.h"
45 #include "values.h"
46 #include "utils.h"
47 
48 namespace statistics {
49 
51  // Create a timer that will report times upon its destruction
52  // which means the end of the current scope.
53  StartTiming("similarityMatrix::similarityMatrix() ");
54  numPositions = 0;
55  vhash = nullptr;
56  simMat = nullptr;
57  distMat = nullptr;
58  }
59 
61  // Create a timer that will report times upon its destruction
62  // which means the end of the current scope.
63  StartTiming("void similarityMatrix::memoryAllocation(int nPos) ");
64  int i, j;
65 
66  // Initialize square table dimension to store the distances
67  // and to store the similarity matrix.
69  numPositions = nPos;
70 
71  // Reserve memory for all structures
72  vhash = new int[TAMABC];
73 
74  simMat = new float *[nPos];
75  distMat = new float *[nPos];
76 
77  for (i = 0; i < nPos; i++) {
78  simMat[i] = new float[nPos];
79  distMat[i] = new float[nPos];
80 
81  for (j = 0; j < nPos; j++) {
82  distMat[i][j] = 0.0;
83  simMat[i][j] = 0.0;
84  }
85  }
86  }
87 
89  // Create a timer that will report times upon its destruction
90  // which means the end of the current scope.
91  StartTiming("similarityMatrix::~similarityMatrix() ");
92 
94 
95  }
96 
98  // Create a timer that will report times upon its destruction
99  // which means the end of the current scope.
100  StartTiming("void similarityMatrix::memoryDeletion() ");
101  int i;
102 
103  for (i = 0; i < numPositions; i++) {
104  delete[] simMat[i];
105  delete[] distMat[i];
106  }
107 
108  delete[] distMat;
109  delete[] simMat;
110  delete[] vhash;
111 
112  numPositions = 0;
113  distMat = nullptr;
114  simMat = nullptr;
115  vhash = nullptr;
116  }
117 
118  bool similarityMatrix::loadSimMatrix(char *filename) {
119  // Create a timer that will report times upon its destruction
120  // which means the end of the current scope.
121  StartTiming("bool similarityMatrix::loadSimMatrix(char *filename) ");
122  char aux[LINE_LENGTH + 1],
123  first[LINE_LENGTH],
124  listSym[LINE_LENGTH + 1];
125  int i, j, k;
126  float sum;
127  bool firstColumn = true;
128  std::ifstream file;
129 
130  // We try to open the file, if we can't open the file
131  // we return false.
132  file.open(filename);
133  if (file.fail())
134  return false;
135 
136  // Read the first line of the file and, depending on the
137  // line length (free of spaces an tabulators), we allocate
138  // memory for the object structures
139  file.getline(aux, LINE_LENGTH);
140  utils::removeSpaces(aux, listSym);
141  memoryAllocation(strlen(listSym));
142 
144  // We create the hashing vector
145  for (i = 0; i < numPositions; i++) {
146  listSym[i] = (char) toupper((int) listSym[i]);
147 
148  if ((listSym[i] >= 'A') && (listSym[i] <= 'Z')) {
149  if ((vhash[listSym[i] - 'A']) != -1) {
151  return false;
152  }
153  vhash[listSym[i] - 'A'] = i;
154 
155  } else {
157  return false;
158  }
159  }
160 
161  for (i = 0; i < numPositions; i++) {
162  // Read the first symbol of the line
163  j = 0;
164  file >> first;
165 
166  // If the format includes the first aminoacid in the line
167  if (firstColumn) {
168  first[0] = (char) toupper((int) first[0]);
169 
170  // Format checking. The first token must not be a valid number
171  if (((first[0] >= '0' && first[0] <= '9') ||
172  (first[0] == '-' && (first[1] >= '0' && first[1] <= '9')))
173  && i > 0) {
175  return false;
176  }
177 
178  // If in the token is a character, there is "first column"
179  // in the format of the alignment
180  if ((first[0] >= 'A') && (first[0] <= 'Z')) {
181  firstColumn = true;
182 
183  if ((vhash[first[0] - 'A']) == -1) {
185  return false;
186  }
187  }
188 
189  // If we have read a number there is no "first column"
190  // in the format of the alignment
191  else if ((first[0] >= '0' && first[0] <= '9') || (first[0] == '-' && (first[1] >= '0' && first[1] <= '9'))) {
192  firstColumn = false;
193  j = 1;
194 
195  simMat[i][0] = atof(first);
196  first[0] = listSym[i];
197  }
198 
199  } else {
200  j = 1;
201 
202  // Do some checkings
203  if ((first[0] >= 'A') && (first[0] <= 'Z') && (i > 0)) {
205  return false;
206  }
207 
208  simMat[i][0] = atof(first);
209  first[0] = listSym[i];
210  }
211 
212  // Read the corresponding number row
213  for (; j < numPositions; j++)
214  file >> simMat[vhash[first[0] - 'A']][j];
215  }
216 
217  // Calculate the average between two simmetric positions
218  // respect to the diagonal of the matrix (between [i][j] and [j][i]
219  // If the input is a non-symmetric matrix, the output will be a
220  // symmetric matrix
221 
222  for (i = 0; i < numPositions; i++) {
223  for (j = i + 1; j < numPositions; j++) {
224  if (simMat[i][j] != simMat[j][i]) {
225  float value = (simMat[i][j] + simMat[j][i]) / 2.0;
226  simMat[i][j] = value;
227  simMat[j][i] = value;
228  }
229  }
230  }
231 
232  // Calculate the distances between aminoacids
233  // based on Euclidean distance
234 
235  for (j = 0; j < numPositions; j++) {
236  for (i = 0; i < numPositions; i++) {
237  if ((i != j) && (distMat[i][j] == 0.0)) {
238  for (k = 0, sum = 0; k < numPositions; k++)
239  sum += ((simMat[k][j] - simMat[k][i]) * (simMat[k][j] - simMat[k][i]));
240  sum = (float) sqrt(sum);
241  distMat[i][j] = sum;
242  distMat[j][i] = sum;
243  }
244  }
245  }
246 
247  file.close();
248  return true;
249  }
250 
252  // Create a timer that will report times upon its destruction
253  // which means the end of the current scope.
254  StartTiming("void similarityMatrix::defaultAASimMatrix(void) ");
255 
256  int i, j, k;
257  float sum;
258 
260  for (i = 0; i < TAMABC; i++)
261  vhash[i] = -1;
262 
263  // We create the hashing vector
264  for (i = 0; i < numPositions; i++)
265  vhash[listAASym[i] - 'A'] = i;
266 
267  for (i = 0; i < numPositions; i++)
268  for (j = 0; j < numPositions; j++)
269  simMat[i][j] = defaultAAMatrix[i][j];
270 
271  // Calculate the distances between aminoacids
272  // based on Euclidean distance
273  for (j = 0; j < numPositions; j++) {
274  for (i = 0; i < numPositions; i++) {
275  if ((i != j) && (distMat[i][j] == 0.0)) {
276  for (k = 0, sum = 0; k < numPositions; k++)
277  sum += ((simMat[k][j] - simMat[k][i]) * (simMat[k][j] - simMat[k][i]));
278  sum = (float) sqrt(sum);
279  distMat[i][j] = sum;
280  distMat[j][i] = sum;
281  }
282  }
283  }
284  }
285 
287  // Create a timer that will report times upon its destruction
288  // which means the end of the current scope.
289  StartTiming("void similarityMatrix::defaultNTSimMatrix(void) ");
290  int i, j, k;
291  float sum;
292 
294  for (i = 0; i < TAMABC; i++)
295  vhash[i] = -1;
296 
297  // We create the hashing vector
298  for (i = 0; i < numPositions; i++)
299  vhash[listNTSym[i] - 'A'] = i;
300 
301  for (i = 0; i < numPositions; i++)
302  for (j = 0; j < numPositions; j++)
303  simMat[i][j] = defaultNTMatrix[i][j];
304 
305  // Calculate the distances between aminoacids
306  // based on Euclidean distance
307  for (j = 0; j < numPositions; j++) {
308  for (i = 0; i < numPositions; i++) {
309  if ((i != j) && (distMat[i][j] == 0.0)) {
310  for (k = 0, sum = 0; k < numPositions; k++)
311  sum += ((simMat[k][j] - simMat[k][i]) * (simMat[k][j] - simMat[k][i]));
312  sum = (float) sqrt(sum);
313  distMat[i][j] = sum;
314  distMat[j][i] = sum;
315  }
316  }
317  }
318  }
319 
321  // Create a timer that will report times upon its destruction
322  // which means the end of the current scope.
323  StartTiming("void similarityMatrix::defaultNTDegeneratedSimMatrix(void) ");
324  int i, j, k;
325  float sum;
326 
328  for (i = 0; i < TAMABC; i++)
329  vhash[i] = -1;
330 
331  // We create the hashing vector
332  for (i = 0; i < numPositions; i++)
333  vhash[listNTDegenerateSym[i] - 'A'] = i;
334 
335  for (i = 0; i < numPositions; i++)
336  for (j = 0; j < numPositions; j++)
337  simMat[i][j] = defaultNTDegeneratedMatrix[i][j];
338 
339  // Calculate the distances between nucleotides based on Euclidean distance
340  for (j = 0; j < numPositions; j++) {
341  for (i = 0; i < numPositions; i++) {
342  if ((i != j) && (distMat[i][j] == 0.0)) {
343  for (k = 0, sum = 0; k < numPositions; k++)
344  sum += ((simMat[k][j] - simMat[k][i]) * (simMat[k][j] - simMat[k][i]));
345  sum = (float) sqrt(sum);
346  distMat[i][j] = sum;
347  distMat[j][i] = sum;
348  }
349  }
350  }
351  }
352 
354  int datatype) {
355  int i, j, k;
356  float sum;
357 
358  // Allocate memory depending on the input datatype
359  switch (datatype) {
360  case SequenceTypes::AA:
362  break;
363  case SequenceTypes::DNA:
364  case SequenceTypes::RNA:
366  break;
370  break;
371  }
372 
373  for (i = 0; i < TAMABC; i++)
374  vhash[i] = -1;
375 
376  // We create the hashing vector taking into account the input datatype
377  for (i = 0; i < numPositions; i++) {
378  switch (datatype) {
379  case SequenceTypes::AA:
380  vhash[listAASym[i] - 'A'] = i;
381  break;
382  case SequenceTypes::DNA:
383  case SequenceTypes::RNA:
384  vhash[listNTSym[i] - 'A'] = i;
385  break;
388  vhash[listNTDegenerateSym[i] - 'A'] = i;
389  break;
390  }
391  }
392 
393  // Working similarity matrix is set depending on the pre loaded matrices
394  for (i = 0; i < numPositions; i++) {
395  for (j = 0; j < numPositions; j++) {
396  switch (matrix_code) {
397  case 1:
398  simMat[i][j] = alternative_1_NTDegeneratedMatrix[i][j];
399  break;
400  }
401  }
402  }
403 
404  // Calculate the distances between residues based on Euclidean distance
405  for (j = 0; j < numPositions; j++) {
406  for (i = 0; i < numPositions; i++) {
407  if ((i != j) && (distMat[i][j] == 0.0)) {
408  for (k = 0, sum = 0; k < numPositions; k++)
409  sum += ((simMat[k][j] - simMat[k][i]) * (simMat[k][j] - simMat[k][i]));
410  sum = (float) sqrt(sum);
411  distMat[i][j] = sum;
412  distMat[j][i] = sum;
413  }
414  }
415  }
416  }
417 
419  // Create a timer that will report times upon its destruction
420  // which means the end of the current scope.
421  StartTiming("void similarityMatrix::printMatrix() ");
422 
423  for (int i = 0; i < numPositions; i++) {
424  for (int j = 0; j < numPositions; j++)
425  std::cerr << std::setw(8) << std::setprecision(4) << std::right << simMat[i][j];
426  std::cerr << std::endl;
427  }
428  }
429 
430  float similarityMatrix::getDistance(char a, char b) {
431  // Create a timer that will report times upon its destruction
432  // which means the end of the current scope.
433  StartTiming("float similarityMatrix::getDistance(char a, char b) ");
434  int numa = -1, numb = -1;
435 
436  // Search the first character position
437  if ((a >= 'A') && (a <= 'Z')) numa = vhash[a - 'A'];
438  else {
439  debug.report(ErrorCode::IncorrectSymbol, new std::string[1]{std::string(1, a)});
440  return -1;
441  }
442 
443  // Search the second character position
444  if ((b >= 'A') && (b <= 'Z')) numb = vhash[b - 'A'];
445  else {
446  debug.report(ErrorCode::IncorrectSymbol, new std::string[1]{std::string(1, b)});
447  return -1;
448  }
449 
450  // We check if the two character positions are valid positions
451  if (numa == -1) {
452  debug.report(ErrorCode::UndefinedSymbol, new std::string[1]{std::string(1, a)});
453  return -1;
454  }
455 
456  if (numb == -1) {
457  debug.report(ErrorCode::UndefinedSymbol, new std::string[1]{std::string(1, b)});
458  return -1;
459  }
460 
461  // Return the distance value between a and b
462  return distMat[numa][numb];
463  }
464 }
1 << 1 = 2
Definition: defines.h:80
void defaultNTDegeneratedSimMatrix()
Method to load the default DEG NT similarity matrix.
1 << 2 = 4
Definition: defines.h:84
SequenceTypes
Definition: defines.h:72
void printMatrix()
Method to print the loaded matrix.
void defaultAASimMatrix()
Method to load the default AA similarity matrix.
void memoryDeletion()
Method to deallocate memory allocated on the similarityMatrix::memoryAllocation method. It makes use of the numPositions to effectively remove the memory.
void alternativeSimilarityMatrices(int matrix_code, int datatype)
Method to load alternative similarity matrices also included on the suite. Currently, only one type of alternative matrix is available: matrix_code: 1 datatype SequenceTypes::AA.
#define StartTiming(name)
void removeSpaces(char *in, char *out)
Removing spaces method.
Definition: utils.cpp:144
float getDistance(char a, char b)
Method to get the similarity distance between two residues, A and B Characters provided must be both ...
bool loadSimMatrix(char *filename)
Method to load a custom matrix.
Class that contains information of similarity matrices. These are used to calculate the similarity be...
void initlVect(int *vector, int tam, int valor)
Vector initialization.
Definition: utils.cpp:40
ErrorCode
Definition: reportsystem.h:56
void defaultNTSimMatrix()
Method to load the default NT similarity matrix.
#define LINE_LENGTH
Namespace containing all classes related to statistics handling.
Definition: Similarity.h:44
reporting::reportManager debug
void report(ErrorCode message, std::string *vars=nullptr)
Method to report an Error. It will be displayed if Level is equal or higher to VerboseLevel::ERROR...
void memoryAllocation(int nPos)
Method to allocate memory for the similiarity matrix.
#define TAMABC
Utilities class. This class contains shared methods to be used in multiple parts of the code...
Definition: utils.h:50
1 << 3 = 8
Definition: defines.h:88
1 << 4 = 16
Definition: defines.h:92