-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
364 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,176 @@ | ||
|
||
/* argument parsing ------------------------------------------------------------------------------------------------- */ | ||
|
||
#include <cstdarg> | ||
#include <cstdio> | ||
#include <cstdlib> | ||
#include <cstring> | ||
#include <cctype> | ||
|
||
#include "argparse_fasta_digest.hpp" | ||
|
||
// some crazy shit for stringifying preprocessor directives | ||
#define STRIFY(x) #x | ||
#define TO_STR(x) STRIFY(x) | ||
|
||
namespace argparse { | ||
const char usage[] = | ||
"usage: " PROGNAME " [-h] " | ||
"[-o OUTPUT] " | ||
"[-r REVERSE_COMPLEMENT] " | ||
"[-m MATCH] " | ||
"-e MOTIFS " | ||
"[FASTA]\n"; | ||
|
||
const char help_msg[] = | ||
"Read a FASTA file, a comma separated list of nucleotide binding motifs and report, for each sequence, 1-based coordinates of any motifs (nucleotide strings, 4-16 letters) found in the sequence\n" | ||
"\n" | ||
"optional arguments:\n" | ||
" -h, --help show this help message and exit\n" | ||
" -o OUTPUT write the resulting JSON file to a file named OUTPUT (default=stdout)\n" | ||
" -r OPERATION consider reverse complements (default=" TO_STR( DEFAULT_RC ) ")\n" | ||
" complement: for each motif, also consider its reverse complement \n" | ||
" nocomplement: only consider the motifs themselves\n" | ||
" -m MATCH how to match the motif (default=" TO_STR( DEFAULT_MATCH ) ")\n" | ||
" exact: match the motif exactly\n" | ||
" oneoff: allow any one motif mis-match\n" | ||
" transition: allow any one motif mis-match that is a transition (A:G, C:T)\n" | ||
" -e MOTIF_LIST specify delimited nucleotide sequences to use as motifs\n" | ||
" not case sensitve (aaaccc is the same as AAACCC)\n" | ||
" any contigous set of ACGTs define a motif but it must be 4-12 bases in length\n" | ||
" FASTA read FASTA sequences from this file (default=stdin)\n"; | ||
|
||
inline | ||
void help() | ||
{ | ||
fprintf( stderr, "%s\n%s", usage, help_msg ); | ||
exit( 1 ); | ||
} | ||
|
||
inline | ||
void ERROR( const char * msg, ... ) | ||
{ | ||
va_list args; | ||
fprintf( stderr, "%s" PROGNAME ": error: ", usage ); | ||
va_start( args, msg ); | ||
vfprintf( stderr, msg, args ); | ||
va_end( args ); | ||
fprintf( stderr, "\n" ); | ||
exit( 1 ); | ||
} | ||
|
||
const char * next_arg (int& i, const int argc, const char * argv[]) { | ||
i++; | ||
if (i == argc) | ||
ERROR ("ran out of command line arguments"); | ||
|
||
return argv[i]; | ||
|
||
} | ||
|
||
args_t::args_t( int argc, const char * argv[] ) : | ||
input( stdin ), | ||
output( stdout ), | ||
rc ( DEFAULT_RC ), | ||
match (DEFAULT_MATCH), | ||
motif_list (NULL) | ||
{ | ||
// skip arg[0], it's just the program name | ||
for (int i = 1; i < argc; ++i ) { | ||
const char * arg = argv[i]; | ||
|
||
if ( arg[0] == '-' && arg[1] == '-' ) { | ||
if ( !strcmp( &arg[2], "help" ) ) help(); | ||
else | ||
ERROR( "unknown argument: %s", arg ); | ||
} | ||
else if ( arg[0] == '-' ) { | ||
if ( !strcmp( &arg[1], "h" ) ) help(); | ||
else if ( arg[1] == 'o' ) parse_output( next_arg (i, argc, argv) ); | ||
else if ( arg[1] == 'r') parse_rc_option ( next_arg (i, argc, argv) ); | ||
else if ( arg[1] == 'm') parse_match_mode( next_arg (i, argc, argv) ); | ||
else if ( arg[1] == 'e') parse_motif( next_arg (i, argc, argv) ); | ||
else | ||
ERROR( "unknown argument: %s", arg ); | ||
} | ||
else | ||
if (i == argc-1) { | ||
parse_input (arg); | ||
} else { | ||
ERROR( "unknown argument: %s", arg ); | ||
} | ||
} | ||
|
||
if (motif_list == NULL) { | ||
ERROR ("Required argument MOTIF_LIST was not provided"); | ||
} | ||
} | ||
|
||
args_t::~args_t() { | ||
if ( output && output != stdout ) | ||
fclose( output ); | ||
|
||
if ( input && input != stdin) | ||
fclose ( input ); | ||
|
||
if ( motif_list ) | ||
free ( motif_list ); | ||
} | ||
|
||
|
||
void args_t::parse_output( const char * str ) { | ||
if ( str && strcmp( str, "-" ) ) | ||
output = fopen( str, "wb" ); | ||
else | ||
output = stdout; | ||
|
||
if ( output == NULL ) | ||
ERROR( "failed to open the OUTPUT file %s", str ); | ||
} | ||
|
||
void args_t::parse_input ( const char * str ) { | ||
if ( str && strcmp( str, "-" ) ) | ||
input = fopen( str, "rb" ); | ||
else | ||
input = stdin; | ||
|
||
if ( input == NULL ) | ||
ERROR( "failed to open the INPUT file %s", str ); | ||
} | ||
|
||
|
||
|
||
void args_t::parse_rc_option( const char * str ) | ||
{ | ||
if (!strcmp (str, "complement")) { | ||
rc = complement; | ||
} else if (!strcmp (str, "nocomplement")) { | ||
rc = nocomplement; | ||
} else { | ||
ERROR( "invalid file operation: %s", str ); | ||
} | ||
} | ||
|
||
void args_t::parse_match_mode ( const char * str ) | ||
{ | ||
if (!strcmp (str, "exact")) { | ||
match = exact; | ||
} else if (!strcmp (str, "oneoff")) { | ||
match = oneoff; | ||
} else if (!strcmp (str, "transitions")) { | ||
match = transition; | ||
} else { | ||
ERROR( "invalid match mode: %s", str ); | ||
} | ||
} | ||
|
||
void args_t::parse_motif ( const char * str ) { | ||
const int L = strlen (str); | ||
if (L >= 4) { | ||
motif_list = (char*)malloc (L + 1); | ||
strcpy (motif_list, str); | ||
} else { | ||
ERROR( "motif list must be at least 4 characters long: %s", str ); | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
|
||
#ifndef ARGPARSE_H | ||
#define ARGPARSE_H | ||
#include <cstdarg> | ||
|
||
|
||
// argument defaults | ||
|
||
|
||
#define PROGNAME "fasta_digest" | ||
|
||
#define DEFAULT_RC complement | ||
#define DEFAULT_MATCH exact | ||
|
||
|
||
namespace argparse { | ||
void ERROR( const char * msg, ... ); | ||
|
||
enum rc_option { | ||
complement, | ||
nocomplement | ||
}; | ||
|
||
enum match_mode { | ||
exact, | ||
oneoff, | ||
transition | ||
}; | ||
|
||
|
||
class args_t { | ||
public: | ||
|
||
FILE * input, | ||
* output; | ||
|
||
char * motif_list; | ||
|
||
rc_option rc; | ||
match_mode match; | ||
|
||
|
||
args_t( int, const char ** ); | ||
~args_t(); | ||
|
||
private: | ||
void parse_input ( const char * ); | ||
void parse_enzyme_lis ( const char * ); | ||
void parse_output ( const char * ); | ||
void parse_rc_option ( const char * ); | ||
void parse_match_mode ( const char * ); | ||
void parse_motif ( const char * ); | ||
}; | ||
} | ||
|
||
#endif // ARGPARSE_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
|
||
#include "tn93_shared.h" | ||
#include "argparse_fasta_digest.hpp" | ||
#include <map> | ||
#include <list> | ||
#include <string> | ||
|
||
using namespace std; | ||
using namespace argparse; | ||
|
||
Vector* patterns[13] = {NULL}; | ||
|
||
// bit masks for | ||
const unsigned long masks [16] = { | ||
0b00000000000000000000000000000011, | ||
0b00000000000000000000000000001100, | ||
0b00000000000000000000000000110000, | ||
0b00000000000000000000000011000000, | ||
0b00000000000000000000001100000000, | ||
0b00000000000000000000110000000000, | ||
0b00000000000000000011000000000000, | ||
0b00000000000000001100000000000000, | ||
0b00000000000000110000000000000000, | ||
0b00000000000011000000000000000000, | ||
0b00000000001100000000000000000000, | ||
0b00000000110000000000000000000000, | ||
0b00000011000000000000000000000000, | ||
0b00001100000000000000000000000000, | ||
0b00110000000000000000000000000000, | ||
0b11000000000000000000000000000000 | ||
}; | ||
|
||
int allowed_chars [255]; | ||
|
||
unsigned patternToCode (const StringBuffer& pattern) { | ||
|
||
unsigned code = 0, | ||
L = pattern.length(); | ||
|
||
for (int i = 0; i < L; i++) { | ||
code += allowed_chars[pattern.getChar (i)] << (2*i); | ||
} | ||
|
||
cout << pattern.getString() << "=" << code << endl; | ||
return code; | ||
} | ||
|
||
//--------------------------------------------------------------- | ||
|
||
int main(int argc, const char *argv[]) { | ||
args_t args = args_t(argc, argv); | ||
|
||
initAlphabets(false, NULL, true); | ||
|
||
|
||
for (int i = 0; i < 256; i++) { | ||
allowed_chars[i] = -1; | ||
} | ||
|
||
allowed_chars['A'] = 0; | ||
allowed_chars['C'] = 1; | ||
allowed_chars['G'] = 2; | ||
allowed_chars['T'] = 3; | ||
|
||
char complement_chars[255] = {0}; | ||
complement_chars['A'] = 'T'; | ||
complement_chars['T'] = 'A'; | ||
complement_chars['C'] = 'G'; | ||
complement_chars['G'] = 'C'; | ||
|
||
|
||
auto process_motif = [&] (StringBuffer& pattern) -> void { | ||
int L = pattern.length(); | ||
if (L >= 4 && L <= 16) { | ||
cout << pattern.getString () << endl; | ||
if (!patterns[L]) { | ||
patterns[L] = new Vector; | ||
} | ||
patterns[L]->appendValue(patternToCode (pattern)); | ||
|
||
if (args.rc == complement) { | ||
StringBuffer rc; | ||
for (int i = L-1; i>=0; i--) { | ||
rc.appendChar (complement_chars[pattern.getChar (i)]); | ||
} | ||
patterns[L]->appendValue(patternToCode (rc)); | ||
} | ||
|
||
pattern.resetString (); | ||
} else { | ||
throw (std::string ("Motif '") + std::string (pattern.getString()) + "' is not 4-16 nucleotides in length"); | ||
} | ||
}; | ||
|
||
try { | ||
// parse the patterns | ||
StringBuffer pattern; | ||
char automatonState = 0; | ||
long firstSequenceLength = strlen (args.motif_list); | ||
|
||
for (unsigned i = 0; i < firstSequenceLength; i++) { | ||
char current_char = toupper (args.motif_list[i]); | ||
if (allowed_chars [current_char] >= 0) { | ||
automatonState = 1; | ||
pattern.appendChar (current_char); | ||
} else { | ||
if (automatonState == 1) { | ||
process_motif (pattern); | ||
} | ||
automatonState = 0; | ||
} | ||
} | ||
|
||
if (automatonState == 1) { | ||
process_motif (pattern); | ||
} | ||
|
||
return 1; | ||
automatonState = 0; | ||
while (long state = readFASTA(args.input, automatonState, names, sequences, nameLengths, | ||
seqLengths, firstSequenceLength, true)) { // read sequences one by one | ||
|
||
firstSequenceLength = 0L; | ||
} | ||
|
||
} catch (const std::string err) { | ||
cerr << "ERROR: " << err << endl; | ||
return 1; | ||
} | ||
|
||
return 0; | ||
} |