Skip to content

Commit

Permalink
Adding fasta_digest files
Browse files Browse the repository at this point in the history
  • Loading branch information
spond committed May 6, 2024
1 parent a4037d9 commit 8c54a5c
Show file tree
Hide file tree
Showing 3 changed files with 364 additions and 0 deletions.
176 changes: 176 additions & 0 deletions src/argparse_fasta_digest.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@

/* argument parsing ------------------------------------------------------------------------------------------------- */

#include <cstdarg>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cctype>

#include "argparse_fasta_digest.hpp"

// some crazy shit for stringifying preprocessor directives
#define STRIFY(x) #x
#define TO_STR(x) STRIFY(x)

namespace argparse {
const char usage[] =
"usage: " PROGNAME " [-h] "
"[-o OUTPUT] "
"[-r REVERSE_COMPLEMENT] "
"[-m MATCH] "
"-e MOTIFS "
"[FASTA]\n";

const char help_msg[] =
"Read a FASTA file, a comma separated list of nucleotide binding motifs and report, for each sequence, 1-based coordinates of any motifs (nucleotide strings, 4-16 letters) found in the sequence\n"
"\n"
"optional arguments:\n"
" -h, --help show this help message and exit\n"
" -o OUTPUT write the resulting JSON file to a file named OUTPUT (default=stdout)\n"
" -r OPERATION consider reverse complements (default=" TO_STR( DEFAULT_RC ) ")\n"
" complement: for each motif, also consider its reverse complement \n"
" nocomplement: only consider the motifs themselves\n"
" -m MATCH how to match the motif (default=" TO_STR( DEFAULT_MATCH ) ")\n"
" exact: match the motif exactly\n"
" oneoff: allow any one motif mis-match\n"
" transition: allow any one motif mis-match that is a transition (A:G, C:T)\n"
" -e MOTIF_LIST specify delimited nucleotide sequences to use as motifs\n"
" not case sensitve (aaaccc is the same as AAACCC)\n"
" any contigous set of ACGTs define a motif but it must be 4-12 bases in length\n"
" FASTA read FASTA sequences from this file (default=stdin)\n";

inline
void help()
{
fprintf( stderr, "%s\n%s", usage, help_msg );
exit( 1 );
}

inline
void ERROR( const char * msg, ... )
{
va_list args;
fprintf( stderr, "%s" PROGNAME ": error: ", usage );
va_start( args, msg );
vfprintf( stderr, msg, args );
va_end( args );
fprintf( stderr, "\n" );
exit( 1 );
}

const char * next_arg (int& i, const int argc, const char * argv[]) {
i++;
if (i == argc)
ERROR ("ran out of command line arguments");

return argv[i];

}

args_t::args_t( int argc, const char * argv[] ) :
input( stdin ),
output( stdout ),
rc ( DEFAULT_RC ),
match (DEFAULT_MATCH),
motif_list (NULL)
{
// skip arg[0], it's just the program name
for (int i = 1; i < argc; ++i ) {
const char * arg = argv[i];

if ( arg[0] == '-' && arg[1] == '-' ) {
if ( !strcmp( &arg[2], "help" ) ) help();
else
ERROR( "unknown argument: %s", arg );
}
else if ( arg[0] == '-' ) {
if ( !strcmp( &arg[1], "h" ) ) help();
else if ( arg[1] == 'o' ) parse_output( next_arg (i, argc, argv) );
else if ( arg[1] == 'r') parse_rc_option ( next_arg (i, argc, argv) );
else if ( arg[1] == 'm') parse_match_mode( next_arg (i, argc, argv) );
else if ( arg[1] == 'e') parse_motif( next_arg (i, argc, argv) );
else
ERROR( "unknown argument: %s", arg );
}
else
if (i == argc-1) {
parse_input (arg);
} else {
ERROR( "unknown argument: %s", arg );
}
}

if (motif_list == NULL) {
ERROR ("Required argument MOTIF_LIST was not provided");
}
}

args_t::~args_t() {
if ( output && output != stdout )
fclose( output );

if ( input && input != stdin)
fclose ( input );

if ( motif_list )
free ( motif_list );
}


void args_t::parse_output( const char * str ) {
if ( str && strcmp( str, "-" ) )
output = fopen( str, "wb" );
else
output = stdout;

if ( output == NULL )
ERROR( "failed to open the OUTPUT file %s", str );
}

void args_t::parse_input ( const char * str ) {
if ( str && strcmp( str, "-" ) )
input = fopen( str, "rb" );
else
input = stdin;

if ( input == NULL )
ERROR( "failed to open the INPUT file %s", str );
}



void args_t::parse_rc_option( const char * str )
{
if (!strcmp (str, "complement")) {
rc = complement;
} else if (!strcmp (str, "nocomplement")) {
rc = nocomplement;
} else {
ERROR( "invalid file operation: %s", str );
}
}

void args_t::parse_match_mode ( const char * str )
{
if (!strcmp (str, "exact")) {
match = exact;
} else if (!strcmp (str, "oneoff")) {
match = oneoff;
} else if (!strcmp (str, "transitions")) {
match = transition;
} else {
ERROR( "invalid match mode: %s", str );
}
}

void args_t::parse_motif ( const char * str ) {
const int L = strlen (str);
if (L >= 4) {
motif_list = (char*)malloc (L + 1);
strcpy (motif_list, str);
} else {
ERROR( "motif list must be at least 4 characters long: %s", str );
}
}
}
56 changes: 56 additions & 0 deletions src/argparse_fasta_digest.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@

#ifndef ARGPARSE_H
#define ARGPARSE_H
#include <cstdarg>


// argument defaults


#define PROGNAME "fasta_digest"

#define DEFAULT_RC complement
#define DEFAULT_MATCH exact


namespace argparse {
void ERROR( const char * msg, ... );

enum rc_option {
complement,
nocomplement
};

enum match_mode {
exact,
oneoff,
transition
};


class args_t {
public:

FILE * input,
* output;

char * motif_list;

rc_option rc;
match_mode match;


args_t( int, const char ** );
~args_t();

private:
void parse_input ( const char * );
void parse_enzyme_lis ( const char * );
void parse_output ( const char * );
void parse_rc_option ( const char * );
void parse_match_mode ( const char * );
void parse_motif ( const char * );
};
}

#endif // ARGPARSE_H
132 changes: 132 additions & 0 deletions src/fasta_digest.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@

#include "tn93_shared.h"
#include "argparse_fasta_digest.hpp"
#include <map>
#include <list>
#include <string>

using namespace std;
using namespace argparse;

Vector* patterns[13] = {NULL};

// bit masks for
const unsigned long masks [16] = {
0b00000000000000000000000000000011,
0b00000000000000000000000000001100,
0b00000000000000000000000000110000,
0b00000000000000000000000011000000,
0b00000000000000000000001100000000,
0b00000000000000000000110000000000,
0b00000000000000000011000000000000,
0b00000000000000001100000000000000,
0b00000000000000110000000000000000,
0b00000000000011000000000000000000,
0b00000000001100000000000000000000,
0b00000000110000000000000000000000,
0b00000011000000000000000000000000,
0b00001100000000000000000000000000,
0b00110000000000000000000000000000,
0b11000000000000000000000000000000
};

int allowed_chars [255];

unsigned patternToCode (const StringBuffer& pattern) {

unsigned code = 0,
L = pattern.length();

for (int i = 0; i < L; i++) {
code += allowed_chars[pattern.getChar (i)] << (2*i);
}

cout << pattern.getString() << "=" << code << endl;
return code;
}

//---------------------------------------------------------------

int main(int argc, const char *argv[]) {
args_t args = args_t(argc, argv);

initAlphabets(false, NULL, true);


for (int i = 0; i < 256; i++) {
allowed_chars[i] = -1;
}

allowed_chars['A'] = 0;
allowed_chars['C'] = 1;
allowed_chars['G'] = 2;
allowed_chars['T'] = 3;

char complement_chars[255] = {0};
complement_chars['A'] = 'T';
complement_chars['T'] = 'A';
complement_chars['C'] = 'G';
complement_chars['G'] = 'C';


auto process_motif = [&] (StringBuffer& pattern) -> void {
int L = pattern.length();
if (L >= 4 && L <= 16) {
cout << pattern.getString () << endl;
if (!patterns[L]) {
patterns[L] = new Vector;
}
patterns[L]->appendValue(patternToCode (pattern));

if (args.rc == complement) {
StringBuffer rc;
for (int i = L-1; i>=0; i--) {
rc.appendChar (complement_chars[pattern.getChar (i)]);
}
patterns[L]->appendValue(patternToCode (rc));
}

pattern.resetString ();
} else {
throw (std::string ("Motif '") + std::string (pattern.getString()) + "' is not 4-16 nucleotides in length");
}
};

try {
// parse the patterns
StringBuffer pattern;
char automatonState = 0;
long firstSequenceLength = strlen (args.motif_list);

for (unsigned i = 0; i < firstSequenceLength; i++) {
char current_char = toupper (args.motif_list[i]);
if (allowed_chars [current_char] >= 0) {
automatonState = 1;
pattern.appendChar (current_char);
} else {
if (automatonState == 1) {
process_motif (pattern);
}
automatonState = 0;
}
}

if (automatonState == 1) {
process_motif (pattern);
}

return 1;
automatonState = 0;
while (long state = readFASTA(args.input, automatonState, names, sequences, nameLengths,
seqLengths, firstSequenceLength, true)) { // read sequences one by one

firstSequenceLength = 0L;
}

} catch (const std::string err) {
cerr << "ERROR: " << err << endl;
return 1;
}

return 0;
}

0 comments on commit 8c54a5c

Please sign in to comment.