Adding fasta_digest files

veg · May 6, 2024 · 8c54a5c · 8c54a5c
1 parent a4037d9
commit 8c54a5c
Show file tree

Hide file tree

Showing 3 changed files with 364 additions and 0 deletions.
diff --git a/src/argparse_fasta_digest.cpp b/src/argparse_fasta_digest.cpp
@@ -0,0 +1,176 @@
+
+/* argument parsing ------------------------------------------------------------------------------------------------- */
+
+#include <cstdarg>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cctype>
+
+#include "argparse_fasta_digest.hpp"
+
+// some crazy shit for stringifying preprocessor directives
+#define STRIFY(x) #x
+#define TO_STR(x) STRIFY(x)
+
+namespace argparse {
+    const char usage[] =
+    "usage: " PROGNAME " [-h] "
+    "[-o OUTPUT] "
+    "[-r REVERSE_COMPLEMENT] "
+    "[-m MATCH] "
+    "-e MOTIFS "
+    "[FASTA]\n";
+
+    const char help_msg[] =
+    "Read a FASTA file, a comma separated list of nucleotide binding motifs and report, for each sequence, 1-based coordinates of any motifs (nucleotide strings, 4-16 letters) found in the sequence\n"
+    "\n"
+    "optional arguments:\n"
+    "  -h, --help               show this help message and exit\n"
+    "  -o OUTPUT                write the resulting JSON file to a file named OUTPUT (default=stdout)\n"
+    "  -r OPERATION             consider reverse complements (default=" TO_STR( DEFAULT_RC ) ")\n"
+    "                           complement: for each motif, also consider its reverse complement  \n"
+    "                           nocomplement: only consider the motifs themselves\n"
+    "  -m MATCH                 how to match the motif (default=" TO_STR( DEFAULT_MATCH ) ")\n"
+    "                           exact: match the motif exactly\n"
+    "                           oneoff: allow any one motif mis-match\n"
+    "                           transition: allow any one motif mis-match that is a transition (A:G, C:T)\n"
+    "  -e MOTIF_LIST            specify delimited nucleotide sequences to use as motifs\n"
+    "                           not case sensitve (aaaccc is the same as AAACCC)\n"
+    "                           any contigous set of ACGTs define a motif but it must be 4-12 bases in length\n"
+    "  FASTA                    read FASTA sequences from this file (default=stdin)\n";
+
+    inline
+    void help()
+    {
+        fprintf( stderr, "%s\n%s", usage, help_msg );
+        exit( 1 );
+    }
+
+    inline
+    void ERROR( const char * msg, ... )
+    {
+        va_list args;
+        fprintf( stderr, "%s" PROGNAME ": error: ", usage );
+        va_start( args, msg );
+        vfprintf( stderr, msg, args );
+        va_end( args );
+        fprintf( stderr, "\n" );
+        exit( 1 );
+    }
+
+    const char * next_arg (int& i, const int argc, const char * argv[]) {
+        i++;
+        if (i == argc)
+            ERROR ("ran out of command line arguments");
+
+        return argv[i];
+
+    }
+
+    args_t::args_t( int argc, const char * argv[] ) :
+    input( stdin ),
+    output( stdout ),
+    rc ( DEFAULT_RC ),
+    match (DEFAULT_MATCH),
+    motif_list (NULL)
+    {
+        // skip arg[0], it's just the program name
+        for (int i = 1; i < argc; ++i ) {
+            const char * arg = argv[i];
+
+            if ( arg[0] == '-' && arg[1] == '-' ) {
+                if ( !strcmp( &arg[2], "help" ) ) help();
+                else
+                    ERROR( "unknown argument: %s", arg );
+            }
+            else if ( arg[0] == '-' ) {
+                if ( !strcmp( &arg[1], "h" ) ) help();
+                else if (  arg[1] == 'o' ) parse_output( next_arg (i, argc, argv) );
+                else if (  arg[1] == 'r')  parse_rc_option ( next_arg (i, argc, argv) );
+                else if (  arg[1] == 'm')  parse_match_mode( next_arg (i, argc, argv) );
+                else if (  arg[1] == 'e')  parse_motif( next_arg (i, argc, argv) );
+                else
+                    ERROR( "unknown argument: %s", arg );
+            }
+            else
+                if (i == argc-1) {
+                    parse_input (arg);
+                } else {
+                    ERROR( "unknown argument: %s", arg );
+                }
+        }
+
+        if (motif_list == NULL) {
+            ERROR ("Required argument MOTIF_LIST was not provided");
+        }
+    }
+
+    args_t::~args_t() {
+        if ( output && output != stdout )
+            fclose( output );
+
+        if ( input && input != stdin)
+            fclose ( input );
+
+        if ( motif_list )
+            free ( motif_list );
+    }
+
+
+    void args_t::parse_output( const char * str ) {
+        if ( str && strcmp( str, "-" ) )
+            output = fopen( str, "wb" );
+        else
+            output = stdout;
+
+        if ( output == NULL )
+            ERROR( "failed to open the OUTPUT file %s", str );
+    }
+
+    void args_t::parse_input ( const char * str ) {
+        if ( str && strcmp( str, "-" ) )
+            input = fopen( str, "rb" );
+        else
+            input = stdin;
+
+        if ( input == NULL )
+            ERROR( "failed to open the INPUT file %s", str );
+    }
+
+
+
+    void args_t::parse_rc_option( const char * str )
+    {
+        if (!strcmp (str, "complement")) {
+            rc = complement;
+        } else if (!strcmp (str, "nocomplement")) {
+            rc = nocomplement;
+        } else  {
+            ERROR( "invalid file operation: %s", str );
+        }
+    }
+
+    void args_t::parse_match_mode ( const char * str )
+    {
+        if (!strcmp (str, "exact")) {
+            match = exact;
+        } else if (!strcmp (str, "oneoff")) {
+            match = oneoff;
+        } else if (!strcmp (str, "transitions")) {
+            match = transition;
+        } else  {
+            ERROR( "invalid match mode: %s", str );
+        }
+    }
+
+    void args_t::parse_motif ( const char * str  ) {
+        const int L = strlen (str);
+        if (L >= 4) {
+            motif_list = (char*)malloc (L + 1);
+            strcpy (motif_list, str);
+        } else {
+            ERROR( "motif list must be at least 4 characters long: %s", str );
+        }
+    }
+}
diff --git a/src/argparse_fasta_digest.hpp b/src/argparse_fasta_digest.hpp
@@ -0,0 +1,56 @@
+
+#ifndef ARGPARSE_H
+#define ARGPARSE_H
+#include <cstdarg>
+
+
+// argument defaults
+
+
+#define PROGNAME                         "fasta_digest"
+
+#define        DEFAULT_RC                complement
+#define        DEFAULT_MATCH             exact
+
+
+namespace argparse {
+    void ERROR( const char * msg, ... );
+
+    enum rc_option {
+        complement,
+        nocomplement
+    };
+
+    enum match_mode {
+        exact,
+        oneoff,
+        transition
+    };
+
+
+    class args_t {
+    public:
+
+        FILE * input,
+             * output;
+
+        char * motif_list;
+
+        rc_option           rc;
+        match_mode          match;
+
+
+        args_t( int, const char ** );
+        ~args_t();
+
+    private:
+        void parse_input              ( const char * );
+        void parse_enzyme_lis         ( const char * );
+        void parse_output             ( const char * );
+        void parse_rc_option          ( const char * );
+        void parse_match_mode         ( const char * );
+        void parse_motif              ( const char * );
+   };
+}
+
+#endif // ARGPARSE_H
diff --git a/src/fasta_digest.cpp b/src/fasta_digest.cpp
@@ -0,0 +1,132 @@
+
+#include "tn93_shared.h"
+#include "argparse_fasta_digest.hpp"
+#include <map>
+#include <list>
+#include <string>
+
+using namespace std;
+using namespace argparse;
+
+Vector* patterns[13] = {NULL};
+
+// bit masks for
+const unsigned long masks [16] = {
+    0b00000000000000000000000000000011,
+    0b00000000000000000000000000001100,
+    0b00000000000000000000000000110000,
+    0b00000000000000000000000011000000,
+    0b00000000000000000000001100000000,
+    0b00000000000000000000110000000000,
+    0b00000000000000000011000000000000,
+    0b00000000000000001100000000000000,
+    0b00000000000000110000000000000000,
+    0b00000000000011000000000000000000,
+    0b00000000001100000000000000000000,
+    0b00000000110000000000000000000000,
+    0b00000011000000000000000000000000,
+    0b00001100000000000000000000000000,
+    0b00110000000000000000000000000000,
+    0b11000000000000000000000000000000
+};
+
+int  allowed_chars [255];
+
+unsigned patternToCode (const StringBuffer& pattern) {
+
+    unsigned code = 0,
+             L = pattern.length();
+
+    for (int i = 0; i < L; i++) {
+        code += allowed_chars[pattern.getChar (i)] << (2*i);
+    }
+
+    cout << pattern.getString() << "=" << code << endl;
+    return code;
+}
+
+//---------------------------------------------------------------
+
+int main(int argc, const char *argv[]) {
+    args_t args = args_t(argc, argv);
+
+    initAlphabets(false, NULL, true);
+
+
+    for (int i = 0; i < 256; i++) {
+        allowed_chars[i] = -1;
+    }
+
+    allowed_chars['A'] = 0;
+    allowed_chars['C'] = 1;
+    allowed_chars['G'] = 2;
+    allowed_chars['T'] = 3;
+
+    char complement_chars[255] = {0};
+    complement_chars['A'] = 'T';
+    complement_chars['T'] = 'A';
+    complement_chars['C'] = 'G';
+    complement_chars['G'] = 'C';
+
+
+    auto process_motif = [&] (StringBuffer& pattern) -> void {
+        int L = pattern.length();
+        if (L >= 4 && L <= 16) {
+            cout << pattern.getString () << endl;
+            if (!patterns[L]) {
+                patterns[L] = new Vector;
+            }
+            patterns[L]->appendValue(patternToCode (pattern));
+
+            if (args.rc == complement) {
+                StringBuffer rc;
+                for (int i = L-1; i>=0; i--) {
+                    rc.appendChar (complement_chars[pattern.getChar (i)]);
+                }
+                patterns[L]->appendValue(patternToCode (rc));
+            }
+
+            pattern.resetString ();
+        } else {
+            throw (std::string ("Motif '") + std::string (pattern.getString()) + "' is not 4-16 nucleotides in length");
+        }
+    };
+
+    try {
+        // parse the patterns
+        StringBuffer pattern;
+        char automatonState = 0;
+        long firstSequenceLength = strlen (args.motif_list);
+
+        for (unsigned i = 0; i < firstSequenceLength; i++) {
+            char current_char = toupper (args.motif_list[i]);
+            if (allowed_chars [current_char] >= 0) {
+                automatonState = 1;
+                pattern.appendChar (current_char);
+            } else {
+                if (automatonState == 1) {
+                    process_motif (pattern);
+                }
+                automatonState = 0;
+            }
+        }
+
+        if (automatonState == 1) {
+            process_motif (pattern);
+        }
+
+        return 1;
+        automatonState = 0;
+        while (long state = readFASTA(args.input, automatonState, names, sequences, nameLengths,
+                                      seqLengths, firstSequenceLength, true)) { // read sequences one by one
+
+            firstSequenceLength = 0L;
+        }
+
+    } catch (const std::string err) {
+        cerr << "ERROR: " << err << endl;
+        return 1;
+    }
+
+    return 0;
+}