readAlMain.cpp
Go to the documentation of this file.
1 /* *****************************************************************************
2 
3  trimAl v2.0: a tool for automated alignment trimming in large-scale
4  phylogenetics analyses.
5 
6  readAl v2.0: a tool for automated alignment conversion among different
7  formats.
8 
9  2009-2019
10  Fernandez-Rodriguez V. (victor.fernandez@bsc.es)
11  Capella-Gutierrez S. (salvador.capella@bsc.es)
12  Gabaldon, T. (tgabaldon@crg.es)
13 
14  This file is part of trimAl/readAl.
15 
16  trimAl/readAl are free software: you can redistribute it and/or modify
17  it under the terms of the GNU General Public License as published by
18  the Free Software Foundation, the last available version.
19 
20  trimAl/readAl are distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23  GNU General Public License for more details.
24 
25  You should have received a copy of the GNU General Public License
26  along with trimAl/readAl. If not, see <http://www.gnu.org/licenses/>.
27 
28 ***************************************************************************** */
29 
30 #include "FormatHandling/FormatManager.h"
31 #include "Alignment/Alignment.h"
32 #include "defines.h"
33 #include "values.h"
34 
35 #include <iostream>
36 #include <cstring>
37 #include <iomanip>
38 
39 int parseArguments(int argc, char *argv[],
40  FormatHandling::FormatManager* machine,
41  std::vector<std::string>* inFiles,
42  std::vector<std::string>* outFormats,
43  std::string* outPattern)
44 {
45  if (argc == 1) return 1;
46  for(int i = 1; i < argc; i++ )
47  {
48  if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help"))
49  return -1;
50  // Check if current argument is the '-in' argument.
51  if (!strcmp(argv[i], "-in"))
52  {
53  // Check if this is the last argument.
54  if (i >= argc -1)
55  {
56  std::cerr << "ERROR: At least one file should be passed after the '-in' argument\n";
57  return 1;
58  }
59 
60  // Check if the next argument is a parameter or file argument.
61  else if (argv[i + 1][0] == '-')
62  {
63  std::cerr << "ERROR: At least one file should be passed after the '-in' argument and you passed argument " << argv[i + 1] << "\n";
64  return 1;
65  }
66 
67  // Add every file that is not a parameter
68  else while(++i != argc)
69  {
70  if (argv[i][0] == '-')
71  {
72  i--;
73  break;
74  }
75  else
76  {
77  inFiles->emplace_back(argv[i]);
78  }
79  }
80  }
81 
82  // Check if current argument is the '-out' argument.
83  else if (!strcmp(argv[i], "-out"))
84  {
85  // Check if this is the last argument.
86  if (i >= argc -1)
87  {
88  std::cerr << "A file pattern should be passed after the '-out' argument\n";
89  return 1;
90  }
91  else
92  {
93  if (argv[i + 1][0] == '-')
94  {
95  std::cerr << "A file pattern should be passed after the '-out' argument and you passed argument " << argv[i + 1] << "\n";
96  return 1;
97  }
98  *outPattern = argv[++i];
99  continue;
100  }
101  }
102 
103  // Check if current argument is the '-formats' argument.
104  else if (!strcmp(argv[i], "-formats"))
105  {
106  if (i >= argc -1)
107  {
108  std::cerr << "A format should be passed after the '-formats' argument\n";
109  return 1;
110  }
111  else while(++i != argc)
112  {
113  if (argv[i][0] == '-')
114  {
115  i--;
116  break;
117  }
118  else
119  {
120  outFormats->emplace_back(argv[i]);
121  }
122  }
123  }
124 
125  else if (!strcmp(argv[i], "-reverse"))
126  {
127  machine->reverse = true;
128  }
129  else if (!strcmp(argv[i], "-keepHeaders"))
130  {
131  machine->keepHeader = true;
132  }
133 
134  //Compatibility with legacy options:
135 
136  else if (!strcmp(argv[i], "-html"))
137  outFormats->emplace_back("html");
138 
139  else if (!strcmp(argv[i], "-nbrf"))
140  outFormats->emplace_back("nbrf");
141 
142  else if (!strcmp(argv[i], "-mega"))
143  outFormats->emplace_back("mega");
144 
145  else if (!strcmp(argv[i], "-nexus"))
146  outFormats->emplace_back("nexus");
147 
148  else if (!strcmp(argv[i], "-clustal"))
149  outFormats->emplace_back("clustal");
150 
151  else if (!strcmp(argv[i], "-fasta") || !strcmp(argv[i], "-onlyseqs"))
152  outFormats->emplace_back("fasta");
153 
154  else if (!strcmp(argv[i], "-fasta_m10"))
155  {
156  outFormats->emplace_back("fasta");
157  }
158 
159  else if (!strcmp(argv[i], "-phylip"))
160  outFormats->emplace_back("phylip40");
161 
162  else if (!strcmp(argv[i], "-phylip_m10"))
163  {
164  outFormats->emplace_back("phylip40_m10");
165  }
166 
167  else if (!strcmp(argv[i], "-phylip_paml"))
168  outFormats->emplace_back("phylippaml");
169 
170  else if (!strcmp(argv[i], "-phylip_paml_m10"))
171  {
172  outFormats->emplace_back("phylippaml_m10");
173  }
174 
175  else if (!strcmp(argv[i], "-phylip3.2"))
176  outFormats->emplace_back("phylip32");
177 
178  else if (!strcmp(argv[i], "-phylip3.2_m10"))
179  {
180  outFormats->emplace_back("phylip32_m10");
181  }
182  else if (!strcmp(argv[i], "-format"))
183  {
184  machine->format = true;
185  }
186  else if (!strcmp(argv[i], "-type"))
187  {
188  machine->type = true;
189  }
190  else if (!strcmp(argv[i], "-info"))
191  {
192  machine->info = true;
193  }
194 
195  // If a command is not recognized, give an error.
196  else
197  {
198  std::cerr << argv[i] << " not recognized or repeated.\n";
199  return 1;
200  }
201  }
202 
203 #if debug
204  std::cout << "Input Files:\n";
205  for (std::string ifile : *inFiles)
206  {
207  std::cout << "-> Input file: " << ifile << "\n";
208  }
209 
210  std::cout << "Out Formats:\n";
211  for (std::string oformat : *outFormats)
212  {
213  std::cout << "-> Output format: " << oformat << "\n";
214  }
215 
216  std::cout << "Under the pattern\n-> " << *outPattern << "\n";
217 #endif
218 
219  return 0;
220 }
221 
222 int checkArguments(FormatHandling::FormatManager* machine, std::vector<std::string>* inFiles, std::vector<std::string>* outFormats, std::string* outPattern)
223 {
224  int returnValue = 0;
225  if (inFiles->size() == 0)
226  {
227  std::cerr << "ERROR: At least one input file must be provided\n";
228  returnValue = 1;
229  }
230  if (*outPattern == "")
231  {
232  if (inFiles->size() == 1 && outFormats->size() == 1 && !(machine->format || machine->info || machine->type))
233  machine->hasOutputFile = false;
234  else if (outFormats->size() != 0)
235  {
236  std::cerr << "ERROR: Terminal output option not compatible with information printing (-info | -format | -type)\n"
237  << "Provide an output format or disable information printing.\n";
238  returnValue = 1;
239  }
240  }
241  else if (outFormats->size() == 0)
242  {
243  std::cerr << "ERROR: At least one output format must be provided\n";
244  returnValue = 1;
245  }
246  return returnValue;
247 }
248 
249 void menu()
250 {
251 
252  std::cout << "\n"
253  << "readAl v" << VERSION << ".rev" << REVISION << " build[" << BUILD
254  << "]. " << AUTHORS << "\n\n"
255 
256  << "readAl webpage: http://trimal.cgenomics.org\n\n"
257 
258  << "This program is free software: you can redistribute it and/or modify "
259  << "\n"
260  << "it under the terms of the GNU General Public License as published by "
261  << "\n"
262  << "the Free Software Foundation, the last available version.\n"
263  << "\n"
264 
265  << "BASIC USAGE\n\n"
266  << "\treadalMS -in <inputfiles> -out <pattern> -format [formats] [options].\n\n"
267 
268  << "\t-h Show this information.\n"
269 // << "\t--version Show readAl version.\n\n"
270 
271  << "\t-in <inputfiles> Input files in several formats. Separated by spaces.\n"
272  << "\t Available formats are: " << FormatHandling::FormatManager().getInputFormatsAvailable() << "\n"
273  << "\t-out <pattern> Output file name pattern (default STDOUT).\n"
274  << "\t It will replace optional the tags [in] -> Original filename without extension.\n"
275  << "\t [format] -> Output's format name\n"
276  << "\t [extension] -> Output's extension\n"
277  << "\n"
278 
279  << "\t-formats Formats you want the output to be converted to.\n"
280  << "\t Available formats are: " << FormatHandling::FormatManager().getOutputFormatsAvailable() << "\n"
281  << "\t Being the HTML format not a format itself, but a colored report of the alignment files.\n\n"
282  << "\t-format Print information about input file format "
283  << "and if sequences are aligned or not.\n"
284 
285  << "\t-type Print information about biological "
286  << "sequences datatype (e.g. nucleotides:dna, nucleotides:rna, aminoacids, etc)"
287  << "\n"
288 
289  << "\t-info Print information about sequences number, "
290  << "average sequence length, max & min sequence length"
291  << "\n"
292 
293  << "\t-reverse Output the reverse of sequences in "
294  << "input file.\n\n"
295 
296  << "\t-keepHeaders Keeps the headers of the original format if it had any\n\n"
297 
298 
299  << "LEGACY OPTIONS\nTake in mind that this arguments may be discontinued any time."<< "\n\n"
300 
301  << "\t-onlyseqs Generate output with only residues from "
302  << "input file\n\n"
303 
304  << "\t-html Output residues colored according their "
305  << "physicochemical properties. HTML file.\n\n"
306 
307 
308  << "\t-nbrf Output file in NBRF/PIR format\n"
309  << "\t-mega Output file in MEGA format\n"
310 
311  << "\t-nexus Output file in NEXUS format\n"
312  << "\t-clustal Output file in CLUSTAL format\n"
313  << "\n"
314 
315  << "\t-fasta Output file in FASTA format\n"
316  << "\t-fasta_m10 Output file in FASTA format. Sequences "
317  << "name up to 10 characters.\n\n"
318 
319  << "\t-phylip Output file in PHYLIP/PHYLIP4 format"
320  << "\n"
321  << "\t-phylip_m10 Output file in PHYLIP/PHYLIP4 format. "
322  << "Sequences name up to 10 characters.\n"
323  << "\t-phylip_paml Output file in PHYLIP format compatible "
324  << "with PAML\n"
325  << "\t-phylip_paml_m10 Output file in PHYLIP format compatible "
326  << "with PAML. Sequences name up to 10 characters.\n"
327  << "\t-phylip3.2 Output file in PHYLIP3.2 format\n"
328  << "\t-phylip3.2_m10 Output file in PHYLIP3.2 format. Sequences"
329  << " name up to 10 characters.\n\n"
330  << "If you specify any m10 format, this will result in all formats having the sequences names shortened as this has the same effect as '-shortNames' argument\n\n"
331 
332 
333  << "EXAMPLES OF USE\n\n"
334 
335  << "\treadalMS -in ./dataset/AA1.fas -out ./dataset/[in].output.[extension] -formats clustal\n"
336  << "\t -> Will produce ./dataset/AA1.output.clw\n\n"
337 
338  << "\treadalMS -in ./dataset/example1.clw -out ./dataset/[in].[format].[extension] -formats fasta phylip32 phylip40\n"
339  << "\t -> Will produce ./dataset/example1.FASTA.fasta ./dataset/example1.PHYLIP32.phy ./dataset/example1.PHYLIP40.phy\n\n"
340 
341  << "\treadalMS -in ./dataset/example1.clw -out ./dataset/[in]/[format].[extension] -formats fasta phylip32 phylip40\n"
342  << "\t -> Will produce ./dataset/example1/FASTA.fasta ./dataset/example1/PHYLIP32.phy ./dataset/example1/PHYLIP40.phy\n"
343  << "\t ONLY if ./dataset/example1/ already exists.\n\n"
344 
345  << "\treadalMS -in ./dataset/AA1.fas ./dataset/AA2.fas -out ./dataset/[in].output.[extension] -formats clustal pir\n"
346  << "\t -> Will produce ./dataset/AA1.output.clw ./dataset/AA2.output.clw ./dataset/AA1.output.pir ./dataset/AA2.output.pir\n\n"
347 
348  << "\treadalMS -in ./dataset/AA1.fas -format -type -info\n"
349  << "\t -> Will produce terminal output giving information about AA1.fas alignment file\n\n"
350 
351  << "\treadalMS -in ./dataset/AA1.fas ./dataset/AA2.fas -out ./dataset/[in].output.[extension] -formats html\n"
352  << "\t -> Will produce ./dataset/AA1.output.html ./dataset/AA2.output.html\n"
353  << "\t Those files are not indeed reformats of the original alignments, but an HTML colored report of the alignment file.\n"
354 
355  << "\n";
356 }
357 
358 
// Entry point of readAl.
//
// Flow: parse the command line, validate the option combination, then either
//  (a) print per-file information (-format / -type / -info), optionally saving
//      each alignment in the requested formats, or
//  (b) convert all input files in one call when only formats / -reverse are given.
//
// Exit status as written: 0 after showing the menu, the non-zero value returned
// by parseArguments / checkArguments otherwise; falling off the end of main
// yields 0, including after the "An option has to be chosen" error.
//
// NOTE(review): this listing has dropped several lines inside this function
// (embedded listing numbers 361, 395, 397, 401 and 405 are absent). The gaps
// are flagged below; the code is left exactly as it appears.
359 int main(int argc, char *argv[])
360 {
// NOTE(review): listing line 361 is missing. `MachineState`, used throughout
// this function, is not declared in the visible text - presumably it is a
// FormatHandling::FormatManager declared on that line; confirm against the file.
362 
363  std::vector<std::string> outFormats = std::vector<std::string>();
364  std::vector<std::string> inFiles = std::vector<std::string>();
365  std::string outPattern;
366 
// parseArguments returns 1 for "no arguments" and for invalid arguments:
// in both cases the help menu is printed and the process exits with 0.
367  int result = parseArguments(argc, argv, &MachineState, &inFiles, &outFormats, &outPattern);
368  if (result == 1)
369  {
370  menu();
371  return 0;
372  }
// NOTE(review): parseArguments returns -1 for -h / --help, so a help request
// takes this branch and exits with -1 WITHOUT calling menu(), although the
// menu text documents "-h Show this information." - looks unintended; confirm.
373  else if (result != 0) return result;
374 
// Reject inconsistent option combinations before touching any file.
375  result = checkArguments(&MachineState, &inFiles, &outFormats, &outPattern);
376  if (result != 0) return result;
377 
// Information mode: each input file is loaded and reported individually.
378  if(MachineState.format || MachineState.info || MachineState.type) {
379  for (const std::string &str : inFiles)
380  {
// The returned pointer is released with `delete` at the end of each iteration.
381  Alignment* alignment = MachineState.loadAlignment(str);
// A null alignment is skipped without printing anything here.
382  if (alignment != nullptr)
383  {
384 
385  std::cout << "## Alignment File:\t" << str << "\n";
386 
387  if (MachineState.format)
388  /* Inform about if sequences are aligned or not */
389  std::cout << "## Input file format\t" << MachineState.getFileFormatName(str) << "\n"
390  << "## Input file aligned\t" << (alignment->isAligned ? "YES":"NO")
391  << "\n";
392 
393  if(MachineState.type) {
394  /* Inform about biological datatype */
// NOTE(review): the conditions guarding the dna, dna_degenerate_codes,
// rna_degenerate_codes and amino-acids_degenerate_codes outputs (listing lines
// 395, 397, 401, 405) are missing from this listing; only the RNA and AA
// comparisons against alignment->getAlignmentType() are visible.
396  std::cout << "## Input file datatype\tnucleotides:dna\n";
398  std::cout << "## Input file datatype\tnucleotides:dna_degenerate_codes\n";
399  else if (alignment->getAlignmentType() == SequenceTypes::RNA)
400  std::cout << "## Input file datatype\tnucleotides:rna\n";
402  std::cout << "## Input file datatype\tnucleotides:rna_degenerate_codes\n";
403  else if (alignment->getAlignmentType() == SequenceTypes::AA)
404  std::cout << "## Input file datatype\tamino-acids\n";
406  std::cout << "## Input file datatype\tamino-acids_degenerate_codes\n";
407  else
408  std::cout << "## Input file datatype\tunknown\n";
409  }
410 
411  if(MachineState.info)
412  alignment->printAlignmentInfo(std::cout);
413 
414  std::cout << "\n";
415 
// Information printing can be combined with a conversion of the same alignment.
416  if (!outFormats.empty())
417  MachineState.saveAlignment(outPattern, outFormats, *alignment);
418 
419  }
// Runs for null pointers too; deleting a null pointer has no effect.
420  delete alignment;
421  }
422  }
// Conversion-only mode: loading and saving of all files is delegated in one call.
423  else if (!outFormats.empty() || MachineState.reverse)
424  MachineState.loadAndSaveMultipleAlignments(inFiles, outPattern, outFormats);
425  else
// The error is reported on std::cerr only; main then falls off the end, so the exit status is 0.
426  std::cerr << "ERROR: An option has to be chosen\n";
427 }
1 << 1 = 2
Definition: defines.h:80
Class to handle Format Handlers . It serves as a proxy to the handlers, so the code outside the Form...
Definition: FormatManager.h:64
bool format
Tag to know if the machine should output the format information about the alignment.
#define AUTHORS
Definition: defines.h:36
1 << 2 = 4
Definition: defines.h:84
#define REVISION
Definition: defines.h:35
SequenceTypes
Definition: defines.h:72
void printAlignmentInfo(std::ostream &output)
Print information about sequences number, average sequence length, maximum and minimum sequences leng...
Definition: Alignment.cpp:868
int checkArguments(FormatHandling::FormatManager *machine, std::vector< std::string > *inFiles, std::vector< std::string > *outFormats, std::string *outPattern)
Definition: readAlMain.cpp:222
Alignment * loadAlignment(const std::string &inFile)
Function that loads an alignment given a file path. It automatically detects the format of the file...
void loadAndSaveMultipleAlignments(const std::vector< std::string > &inFiles, const std::string &outPattern, const std::vector< std::string > &outFormats)
Function that takes multiple files, loads them and saves in a cumulus of formats, using an outPattern...
Namespace that encapsulates all logic referent to Load, Save and Recognize Multiple Sequence Alignm...
Definition: FormatManager.h:52
bool reverse
Tag to know if sequences should be reversed before saving them.
Class containing an alignment. This class stores the alignment sequences with their names...
Definition: Alignment.h:49
int main(int argc, char *argv[])
Definition: trimAlMain.cpp:34
bool keepHeader
Tag to know if the machine should keep original headers.
int parseArguments(int argc, char *argv[], FormatHandling::FormatManager *machine, std::vector< std::string > *inFiles, std::vector< std::string > *outFormats, std::string *outPattern)
Definition: readAlMain.cpp:39
std::string getOutputFormatsAvailable()
Function to obtain all format names available by this object that can save an alignment.
bool info
Tag to know if the machine should output the information of the alignment.
std::string getFileFormatName(const std::string &inFile)
Function to obtain the format name of a given file.
bool hasOutputFile
Tag to know if the machine has an output file or it has to output to console.
Definition: FormatManager.h:99
int getAlignmentType() const
Alignment type getter. See SequenceTypes.
Definition: Alignment.cpp:469
bool isAligned
Flag that indicates if all sequences on the alignment have the same length (Including gaps)...
Definition: Alignment.h:76
#define VERSION
Definition: defines.h:34
1 << 3 = 8
Definition: defines.h:88
1 << 4 = 16
Definition: defines.h:92
void menu(void)
Definition: main.cpp:1798
std::string getInputFormatsAvailable()
Function to obtain all format names available by this object that can load an alignment.
bool type
Tag to know if the machine should output the type of the alignment.
bool saveAlignment(const std::string &outPattern, const std::vector< std::string > &outFormats, Alignment &alignment)
Function to save an alignment to a file. It searches among the available_states one that can write th...
#define BUILD
Definition: defines.h:33